"""
This module abstracts the tokenizer object, so that we can use \
tokenizers from different libraries and provide the same \
interface. Hence, we won't need \
to change the rest of the code when changing tokenizers.
So far we only have one tokenizer, based on keras.preprocessing.text.Tokenizer.
"""
import pickle
from config import logger
from abc import ABC, abstractmethod
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
class BaseTokenizer(ABC):
    def __init__(self):
        self.tokenizer = None
    @abstractmethod
    def fit(self, train_data):
        """ Fit the tokenizer on the training data.

        Args:
            train_data (list): List of texts to fit the tokenizer on.
        """
        pass
    def save(self, filename):
        """ Persist the tokenizer to disk.

        Args:
            filename (str): Path to save the tokenizer to.
        """
        with open(filename, "wb") as f:
            pickle.dump(self.tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
    def load(self, filepath):
        """ Load the tokenizer from disk.

        Args:
            filepath (str): Path to load the tokenizer from.

        Returns:
            self (BaseTokenizer): the tokenizer itself, with the loaded data.
        """
        with open(filepath, "rb") as f:
            self.tokenizer = pickle.load(f)
        return self
class KerasTokenizer(BaseTokenizer):
    def __init__(self, pad_max_len, lower=False, filters="\t\n"):
        super().__init__()
        self.tokenizer = Tokenizer(lower=lower, filters=filters)
        self.pad_max_len = pad_max_len
    def fit(self, train_data):
        self.tokenizer.fit_on_texts(train_data)
        logger.info("Tokenizer vocabulary size: {0}."
                    .format(len(self.tokenizer.word_index)))
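
    # Note: pad_sequences is imported and pad_max_len is stored above, but
    # neither is used in the code shown, which suggests a transform step
    # follows in the full module. The method below is a hypothetical sketch
    # of that step, not the project's actual API: it maps texts to integer
    # sequences with Tokenizer.texts_to_sequences and pads them to
    # pad_max_len with pad_sequences.
    def transform(self, texts):
        """ Hypothetical sketch: turn texts into padded integer sequences. """
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.pad_max_len)

    # Example usage of the sketch above (names are illustrative):
    #   tok = KerasTokenizer(pad_max_len=100)
    #   tok.fit(train_texts)
    #   X = tok.transform(train_texts)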