generation.py (created October 14, 2017)
An attempt at converting a character-based LSTM into a word-based one.
# coding: utf-8
# In[43]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
# In[69]:
# load ascii text and convert to lowercase
filename = "alice.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()
# Note: the Tokenizer lowercases text by default (lower=True), so the explicit lower() above is redundant but harmless
# Tokenize the text (word sequence => int sequence)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])
sequenced_text = tokenizer.texts_to_sequences([raw_text])
sequenced_alice_text = sequenced_text[0]
# Create a tool to invert the tokenization
inverse_tokenizer = {}
for (word, word_index) in tokenizer.word_index.items():
    inverse_tokenizer[word_index] = word
num_elements_in_sequence = len(sequenced_alice_text)
num_words = len(tokenizer.word_index)
print("Number of elements : ", num_elements_in_sequence)
print("Number of different words : ", num_words)
# In[70]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 10
# Using a set to avoid adding duplicate sequences
sequences = set()
for i in range(0, num_elements_in_sequence - seq_length, 1):
    # Store the input window together with the word that follows it,
    # so duplicates are dropped as whole (input, target) pairs
    sequences.add(tuple(sequenced_alice_text[i:i + seq_length + 1]))
# TODO : find better names
# dataX = the word-sequence inputs taken from the corpus
# dataY = the word that follows each input sequence (the prediction target)
dataX = [list(sequence[:seq_length]) for sequence in sequences]
dataY = [sequence[seq_length] for sequence in sequences]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)
# In[71]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(num_words)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)
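# Illustrative shape check: X should be (n_patterns, seq_length, 1); y is one-hot,
# with its width taken from the largest target id (word ids start at 1, so a
# column for the unused index 0 is included as well).
print("X shape:", X.shape)
print("y shape:", y.shape)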
# In[72]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
# Dropout randomly zeroes a fraction of the layer's outputs during training to reduce overfitting
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
# Fully connected output layer: a softmax over all possible word indices
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
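# Illustrative: print the layer stack to confirm the two stacked LSTMs feed a
# softmax sized to the number of possible word indices.
model.summary()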
# In[73]:
# define the checkpoint
filepath="tokenizer-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# In[ ]:
from keras.callbacks import Callback
class GenerateAtEndOfEpoch(Callback):
    def on_epoch_end(self, epoch, logs=None):
        # Pick a random seed sequence from the training patterns
        start = numpy.random.randint(0, len(dataX) - 1)
        # Copy it so the generation loop does not mutate the training data
        pattern = list(dataX[start])
        # Generate a sentence
        out = ""
        for i in range(100):
            # Reshape the pattern into a numpy array
            x = numpy.reshape(pattern, (1, len(pattern), 1))
            # Normalization
            x = x / float(num_words)
            # Getting the next word
            prediction = self.model.predict(x, verbose=0)
            index = numpy.argmax(prediction)
            # Add it to the pattern
            pattern.append(index)
            # Remove the first word so the window keeps a constant length
            pattern = pattern[1:len(pattern)]
            # TODO : find a better way to keep track of punctuation
            # Append the word to the result string
            # (index 0 is never a target, so fall back to an empty string just in case)
            out = out + " " + inverse_tokenizer.get(index, "")
        print(out)
# In[ ]:
callbacks_list = [checkpoint, GenerateAtEndOfEpoch()]
model.fit(X, y, epochs=60, batch_size=64, callbacks=callbacks_list)
# In[ ]:
print(dataY[1:100])
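# Illustrative: the same targets decoded back into words via inverse_tokenizer.
print([inverse_tokenizer[w] for w in dataY[1:100]])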