# Source code for sentiment_classifier.nlp.models.model

""" Module containing the root Model class that every new model \
    must inherit from.

The Model class has the following attributes:

 - model: the ML model, so far built using Keras
 - tokenizer: responsible for mapping words into indices

The Model class implements the following methods:

 - build_model: builds the model
 - train: trains the model
 - save: saves the model weights & tokenizer
 - predict: predicts on sentences
 - _make_training_data: a private method that creates the train/test \
    matrices from a Reader object
"""
import os
from config import logger
from abc import abstractmethod, ABC
import tensorflow as tf


class Model(ABC):
    """Abstract base class every new sentiment model must inherit from.

    Attributes:
        name (str): the concrete subclass name, used to namespace
            persisted artifacts.
        tokenizer: maps words to integer indices; assigned by subclasses.
        model: the underlying ML model; assigned by subclasses.
    """

    def __init__(self):
        # The subclass name doubles as the filename prefix in save()/load().
        self.name = self.__class__.__name__
        self.tokenizer = None
        self.model = None
        logger.info("Initializing {0}".format(self.name))

    def _make_training_data(self, reader):
        """Fit the tokenizer and build the train/test matrices.

        Args:
            reader (nlp.reader.Reader): a Reader instance holding the
                data to train the model on.

        Returns:
            tuple: (x_train, x_test, y_train, y_test) arrays.
        """
        train_reviews = reader.train_data["review"]
        test_reviews = reader.test_data["review"]
        # Fit on the training reviews only, then encode both splits.
        self.tokenizer.fit(train_reviews)
        x_train = self.tokenizer.transform(train_reviews)
        x_test = self.tokenizer.transform(test_reviews)
        y_train = reader.train_data["label"].values
        y_test = reader.test_data["label"].values
        return x_train, x_test, y_train, y_test
[docs] def save(self, filepath): """Save the model weights and tokenizer Args: filepath (str): Path where to store the model. """ os.makedirs(filepath, exist_ok=True) model_filepath = os.path.join( filepath, "{0}_model.pkl".format(self.name) ) tokenizer_filepath = os.path.join( filepath, "{0}_tokenizer.pkl".format(self.name) ) self.model.save(model_filepath) self.tokenizer.save(tokenizer_filepath)
[docs] def load(self, filepath): """ Load the model weights and tokenizer Args: filepath (str): Path where to load the model. """ model_filepath = os.path.join( filepath, "{0}_model.pkl".format(self.name) ) tokenizer_filepath = os.path.join( filepath, "{0}_tokenizer.pkl".format(self.name) ) self.model = tf.keras.models.load_model(model_filepath) self.tokenizer = self.tokenizer.load(tokenizer_filepath)
[docs] @abstractmethod def build_model(self, input_shape): """ Method for building the model. Args: input_shape (int): Size of the input Returns: model (keras.Models): a keras model, to be compiled and trained """ pass
[docs] @abstractmethod def train(self, reader, filepath): """ Method for training the model. Must be implemented by the subclasses. Args: reader (nlp.reader.Reader): a Reader instance that contains \ the data to train the model on. filepath (str): path to where the model will be stored Returns: None """ pass
[docs] def predict(self, texts, preprocessing_function): """ Predict on a sentence Args: texts (np.ndarray): the texts to predict on preprocessing_function: a preprocessing function, \ from nlp.preprocessing module. Returns: cleaned_texts(list): the cleaned texts """ if not (self.tokenizer and self.model): raise Exception("Model not trained") if isinstance(texts[0], str): cleaned_texts = [preprocessing_function(s) for s in texts] elif isinstance(texts[0], list): cleaned_texts = [preprocessing_function(s[0]) for s in texts] else: raise Exception("Wrong input kind for texts") cleaned_and_tokenized_texts = self.tokenizer.transform(cleaned_texts) predictions = self.model.predict(cleaned_and_tokenized_texts) return predictions