generation.py (created October 14, 2017)
An attempt at converting a character-based LSTM into a word-based one.
# coding: utf-8
# In[43]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
# In[69]:
# load ascii text and convert to lowercase
filename = "alice.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()
# Note: the Tokenizer lowercases text by default (lower=True), so the explicit lower() above is redundant but harmless
# Tokenize the text (word sequence => int sequence)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])
sequenced_text = tokenizer.texts_to_sequences([raw_text])
sequenced_alice_text = sequenced_text[0]
# Create a tool to invert the tokenization
inverse_tokenizer = {}
for (word, word_index) in tokenizer.word_index.items():
    inverse_tokenizer[word_index] = word
num_elements_in_sequence = len(sequenced_alice_text)
num_words = len(tokenizer.word_index)
print("Number of elements : ", num_elements_in_sequence)
print("Number of different words : ", num_words)
# In[70]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 10
# Using a set to avoid adding duplicate sequences
sequences = set()
for i in range(0, num_elements_in_sequence - seq_length, 1):
    # Store the input window together with the word that follows it,
    # so duplicates are dropped as whole (input, target) pairs
    sequences.add(tuple(sequenced_alice_text[i:i + seq_length + 1]))
# TODO : find better names
# dataX = the word-sequence inputs taken from the corpus
# dataY = the word that follows each input sequence (the prediction target)
dataX = [list(sequence[:seq_length]) for sequence in sequences]
dataY = [sequence[seq_length] for sequence in sequences]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)
# In[71]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(num_words)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)
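# Illustrative shape check: X should be (n_patterns, seq_length, 1); y is one-hot,
# with its width taken from the largest target id (word ids start at 1, so a
# column for the unused index 0 is included as well).
print("X shape:", X.shape)
print("y shape:", y.shape)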
# In[72]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
# Dropout randomly zeroes a fraction of the layer's outputs during training to reduce overfitting
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
# Fully connected output layer: a softmax over all possible word indices
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
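# Illustrative: print the layer stack to confirm the two stacked LSTMs feed a
# softmax sized to the number of possible word indices.
model.summary()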
# In[73]:
# define the checkpoint
filepath="tokenizer-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# In[ ]:
from keras.callbacks import Callback
class GenerateAtEndOfEpoch(Callback):
    def on_epoch_end(self, epoch, logs=None):
        # Pick a random seed sequence from the training patterns
        start = numpy.random.randint(0, len(dataX) - 1)
        # Copy it so the generation loop does not mutate the training data
        pattern = list(dataX[start])
        # Generate a sentence
        out = ""
        for i in range(100):
            # Reshape the pattern into a numpy array
            x = numpy.reshape(pattern, (1, len(pattern), 1))
            # Normalization
            x = x / float(num_words)
            # Getting the next word
            prediction = self.model.predict(x, verbose=0)
            index = numpy.argmax(prediction)
            # Add it to the pattern
            pattern.append(index)
            # Remove the first word so the window keeps a constant length
            pattern = pattern[1:len(pattern)]
            # TODO : find a better way to keep track of punctuation
            # Append the word to the result string
            # (index 0 is never a target, so fall back to an empty string just in case)
            out = out + " " + inverse_tokenizer.get(index, "")
        print(out)
# In[ ]:
callbacks_list = [checkpoint, GenerateAtEndOfEpoch()]
model.fit(X, y, epochs=60, batch_size=64, callbacks=callbacks_list)
# In[ ]:
print(dataY[1:100])
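# Illustrative: the same targets decoded back into words via inverse_tokenizer.
print([inverse_tokenizer[w] for w in dataY[1:100]])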