Created November 14, 2017 21:11
LSTM text-gen implementation adapted from Keras Example
Script to generate text from Chopra's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
Note: The corpus (chopra.txt) is the concatenation of "How To Know God" and
"The Seven Spiritual Laws of Success".
This script was adapted from the LSTM text-generation example found in the Keras
examples.
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, CuDNNLSTM
from keras.optimizers import RMSprop
import numpy as np
import random
import sys
# Load the corpus and lower-case it, which keeps the character vocabulary
# small. The context manager guarantees the file handle is closed as soon as
# the text has been read (the bare open(...).read() left it dangling).
with open('chopra.txt', encoding="utf8") as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# The vocabulary is every distinct character in the corpus. Sorting makes the
# character <-> index mappings identical from run to run.
chars = sorted(set(text))
print('total chars:', len(chars))

# Lookup tables used to one-hot encode characters and to decode predictions.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
# Slice the corpus into overlapping windows of `maxlen` characters, advancing
# `step` characters between consecutive windows. Every window is one training
# input; the character immediately after it is the target to predict.
maxlen = 80
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))
# One-hot encode the training data as boolean arrays:
#   x[i, t, c] is True when position t of sentence i holds character index c
#   y[i, c]    is True when the character following sentence i has index c
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where referencing it raises
# AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    # One target per sentence: the character that comes right after it.
    y[i, char_indices[next_chars[i]]] = 1
# Build the model: a single cuDNN-backed LSTM layer followed by a softmax
# classifier over the character vocabulary.
print('Build model...')
model = Sequential()
# Each input sample is a sentence of `maxlen` time steps, and every time step
# is a one-hot vector of length len(chars), i.e. the number of distinct
# characters present in the corpus.
model.add(CuDNNLSTM(256, input_shape=(maxlen, len(chars))))
# Output head: project the 256 LSTM units onto one score per character and
# normalise with softmax. Without it the network emits a 256-wide vector that
# neither matches the shape of `y` (len(chars) columns) nor forms the
# probability distribution that categorical cross-entropy and sample() expect.
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    The probabilities are re-weighted by ``temperature`` before drawing:
    their logarithms are divided by ``temperature``, exponentiated and
    renormalised so they sum to one. Values below 1.0 sharpen the
    distribution, values above 1.0 flatten it.

    Args:
        preds: array-like of probabilities, one entry per candidate index.
        temperature: divisor applied to the log-probabilities.

    Returns:
        The index selected by a single multinomial draw.
    """
    # float64 keeps the renormalised probabilities precise enough for
    # np.random.multinomial's check that they do not sum to more than one.
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # A single trial yields a one-hot row; argmax recovers the drawn index.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)
# Alternate between training and sampling: after every epoch, generate text at
# several diversities so progress can be followed while the model trains.
for iteration in range(1, 50):
    print('-' * 50)
    print('Iteration', iteration)
    # Only fit for one epoch each time so that we can generate text after
    # every epoch. (This call was missing, so the model was never trained;
    # batch_size=128 is this script's choice, not a requirement.)
    model.fit(x, y, batch_size=128, epochs=1)

    # Seed the output prediction with a random maxlen-character window taken
    # from the corpus; the same seed is reused for every diversity below.
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        # Echo the seed, then stream each sampled character as it is
        # produced; previously the generated text was built but never shown.
        sys.stdout.write(generated)

        # Take 400 characters as output
        for i in range(400):
            # Vectorise the current input sentence (one-hot per time step)
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
            # Use the vectorised sentence to predict the next-character
            # distribution, then sample from it at the current diversity.
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            # Append to the generated text, then slide the seeding sentence
            # along by one character so its length stays at maxlen.
            generated += next_char
            sentence = sentence[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        # Terminate the streamed line before the next diversity header.
        print()
