An attempt at converting a character-based LSTM into a word-based one.
# coding: utf-8
# In[43]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
# In[69]:
# load ascii text and convert to lowercase
filename = "alice.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()
# Note: Tokenizer already lowercases by default (lower=True), so the
# explicit lower() above is redundant but harmless.
# Tokenize the text (word sequence => int)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])
sequenced_text = tokenizer.texts_to_sequences([raw_text])
sequenced_alice_text = sequenced_text[0]
# Create a mapping to invert the tokenization (int => word)
inverse_tokenizer = {index: word for (word, index) in tokenizer.word_index.items()}
num_elements_in_sequence = len(sequenced_alice_text)
num_words = len(tokenizer.word_index)
print("Number of elements : ", num_elements_in_sequence)
print("Number of different words : ", num_words)
# In[70]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 10
# Using a set to avoid adding duplicate sequences
sequences = set()
for i in range(0, num_elements_in_sequence - seq_length):
    # store seq_length input words plus the word that follows them
    sequences.add(tuple(sequenced_alice_text[i:i + seq_length + 1]))
# TODO : find better names for dataX / dataY
# dataX = the word sequences from the corpus (model input)
# dataY = the word that follows each sequence (prediction target)
dataX = [list(sequence[:-1]) for sequence in sequences]
dataY = [sequence[-1] for sequence in sequences]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)
# In[71]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the word indices (1..num_words) into (0, 1]
X = X / float(num_words)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)
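# Shape check (not in the original gist): X is (n_patterns, seq_length, 1) and
# y is (n_patterns, num_words + 1); the +1 appears because to_categorical
# counts class 0 even though word indices start at 1.
print(X.shape, y.shape)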
# In[72]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
# Dropout randomly zeroes a fraction of activations during training to reduce overfitting
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
# Fully connected output layer: one softmax unit per possible word
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
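# Quick look at the layer stack and parameter counts (not in the original gist).
model.summary()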
# In[73]:
# define the checkpoint: save the weights whenever the training loss improves
filepath = "tokenizer-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# In[ ]:
from keras.callbacks import Callback

class GenerateAtEndOfEpoch(Callback):
    """Generate a 100-word sample at the end of every epoch."""
    def on_epoch_end(self, epoch, logs=None):
        # Seed the generator with a random training pattern (copied with
        # list() so the append below does not mutate the dataX entry)
        start = numpy.random.randint(0, len(dataX))
        pattern = list(dataX[start])
        # Generate a sentence
        out = ""
        for i in range(100):
            # Reshape the pattern into a numpy array of shape (1, time steps, 1)
            x = numpy.reshape(pattern, (1, len(pattern), 1))
            # Normalization, matching the training data
            x = x / float(num_words)
            # Getting the next word (argmax of the softmax output)
            prediction = self.model.predict(x, verbose=0)
            index = numpy.argmax(prediction)
            # Add it to the pattern
            pattern.append(index)
            # Remove the first word to keep the window length constant
            pattern = pattern[1:]
            # TODO : find a better way to keep track of punctuation
            # Append the word to the result string (index 0 never maps to a
            # real word, since word_index starts at 1, so fall back to "")
            out = out + " " + inverse_tokenizer.get(index, "")
        print(out)
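# Optional variation (a sketch, not part of the original gist): argmax always
# picks the single most likely word, which tends to produce repetitive loops.
# Sampling from the predicted distribution is a common alternative:
def sample_index(prediction_row):
    # prediction_row is the softmax probability vector over word indices
    return numpy.random.choice(len(prediction_row), p=prediction_row)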
# In[ ]:
callbacks_list = [checkpoint, GenerateAtEndOfEpoch()]
model.fit(X, y, epochs=60, batch_size=64, callbacks=callbacks_list)
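# After training, the best checkpoint can be restored before generating more
# text (hypothetical filename; substitute the best file written by the
# checkpoint above):
# model.load_weights("tokenizer-weights-improvement-59-1.2345.hdf5")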
# In[ ]:
# Peek at some of the training targets
print(dataY[1:100])