Source code for sentiment_classifier.nlp.reader

"""
We are using the IMDB Large Movie Reviews dataset from Stanford AI.

It provides 50,000 reviews on movies, splitted half-half in train/test \
    and labelled as positive or negative.

We provide an abstract class Reader that we can subclass for each dataset.

We do this to standardise the dataset loading, and make it easy to use \
    multiple datasets in the rest of the code with a common interface.

The IMDBReader class implements all the code needed to load the IMDB dataset.
"""
import os
import pandas as pd
from glob import glob
import io
from abc import ABC, abstractmethod


[docs]class Reader(ABC): def __init__(self, path): self.train_data = None self.test_data = None self.path = path
[docs] @abstractmethod def load_dataset(self, path, limit=None, preprocessing_function=None): pass
[docs]class IMDBReader(Reader): def __init__(self, path): super(IMDBReader, self).__init__(path) def _read_folder(self, path, label, limit, preprocessing_function): """ Read the data from the IMDB dataset folder. The data can come from train/test and pos/neg folders. It is also \ possible to add a limit to avoid reading all the files \ (useful whendebugging). We can also add a preprocessing_function \ from the nlp.preprocessing module. Args: path (str): path to the folder label (int): label of the folder (1/0 for pos/neg) limit (int): maximum number of files to load preprocessing_function: preprocessing function, from \ nlp.preprocessing module Returns: list: list of texts, as str. """ texts = [] files = glob(os.path.join(path, "*.txt")) if limit: files = files[:limit] for i, file in enumerate(files): with io.open(file, "r", encoding="utf8") as f: text = f.read() if preprocessing_function: text = preprocessing_function(text) texts.append((text, label)) return texts def _concat_and_shuffle_dataset(self, pos, neg): """Concatenate pos and neg examples and shuffle them. Args: pos (list): List of positive examples neg (list): List of negative examples Returns: pd.DataFrame: Merged dataframe, with the texts and their labels """ concat_df = pd.concat([pd.DataFrame(pos), pd.DataFrame(neg)]) concat_df = concat_df.sample(frac=1).reset_index(drop=True) concat_df.columns = ['review', 'label'] return concat_df
[docs] def load_dataset(self, limit=None, preprocessing_function=None): """ Load the IMDB dataset. This function can also: - preprocess using a custom function - set a maximum number of files to load Args: limit (int, optional): Defaults to None. \ Max number of files to load. preprocessing_function (optional): Defaults to None. \ Function for preprocessing the texts. \ No preprocessing by default. """ train_pos = self._read_folder( path=os.path.join(self.path, "train", "pos"), label=1, limit=limit, preprocessing_function=preprocessing_function ) train_neg = self._read_folder( path=os.path.join(self.path, "train", "neg"), label=0, limit=limit, preprocessing_function=preprocessing_function ) test_pos = self._read_folder( path=os.path.join(self.path, "test", "pos"), label=1, limit=limit, preprocessing_function=preprocessing_function ) test_neg = self._read_folder( path=os.path.join(self.path, "test", "neg"), label=0, limit=limit, preprocessing_function=preprocessing_function ) train_data = self._concat_and_shuffle_dataset(train_pos, train_neg) test_data = self._concat_and_shuffle_dataset(test_pos, test_neg) self.train_data = train_data self.test_data = test_data