prakhar21/nlm.py

## nlm.py
#!/usr/bin/env python

"""
@uthor: Prakhar Mishra
date: Dec, 12 2017
"""
# importing packages
from numpy import array
from keras.preprocessing.text import Tokenizer
from numpy import array
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

# source text
data = """My name is prakhar mishra . prakhar mishra writes blog on medium ."""

# Learn the word -> id mapping from the text, then encode that same text
# as a flat list of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# Example output noted by the author:
# [8, 5, 7, 1, 2, 1, 2, 9, 3, 6, 4]

# Vocabulary size: one more than the number of distinct words.  The ids in
# the example above start at 1, so the extra slot keeps the largest id a
# valid class index for the one-hot encoding below.
vocab_size = 1 + len(tokenizer.word_index)

# Pair every word id with the id that follows it: [current, next].
sequences = [
    [current, following]
    for current, following in zip(encoded, encoded[1:])
]
# Example output noted by the author:
# [[8, 5], [5, 7], [7, 1], [1, 2], [2, 1], [1, 2], [2, 9], [9, 3], [3, 6], [6, 4]]

# Split the pairs into the network input (X, first column) and the
# target (y, second column).
sequences = array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

# One-hot encode the targets, one class per vocabulary id.
y = to_categorical(y, num_classes=vocab_size)

# NN define
embedding_size = 10


def model(vocab_size, embedding_dim=None, lstm_units=50):
    """Build, compile and summarize the next-word prediction network.

    The network stacks an Embedding layer (one word id per sample), an LSTM
    layer and a softmax Dense layer with one output per vocabulary id. It is
    compiled with categorical cross-entropy loss, the Adam optimizer and an
    accuracy metric, and its summary is printed before it is returned.

    Args:
        vocab_size: number of word ids; used both as the Embedding input
            dimension and as the width of the softmax output.
        embedding_dim: width of the word-embedding vectors. When None (the
            default) the module-level ``embedding_size`` is read at call
            time, which is what this function always did before the
            parameter existed.
        lstm_units: number of units in the LSTM layer (previously
            hard-coded to 50).

    Returns:
        The compiled Sequential model.
    """
    if embedding_dim is None:
        embedding_dim = embedding_size

    # Named ``network`` so the local does not shadow this function's name.
    network = Sequential()
    # input_length = 1 (one word at a time)
    network.add(Embedding(vocab_size, embedding_dim, input_length=1))
    network.add(LSTM(lstm_units))
    network.add(Dense(vocab_size, activation="softmax"))
    network.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    network.summary()
    return network

# NOTE: this rebinds ``model`` from the builder function to the network it
# returns, so the builder is no longer reachable under that name afterwards.
model = model(vocab_size)

# training starts
model.fit(X, y, epochs=500, verbose=2)

# testing the model: ask the network which word id follows the seed word
seed = "writes"
encoded = tokenizer.texts_to_sequences([seed])[0]
encoded = array(encoded)
# ``predict_classes`` is gone from newer Keras releases; taking the argmax
# of the softmax output yields the same class ids and works on old and new
# versions alike.
y_pred = model.predict(encoded, verbose=0).argmax(axis=-1)

# Reverse lookup: print every word whose id was predicted.  A membership
# test is used because ``index == y_pred`` raises an ambiguous-truth-value
# error as soon as ``y_pred`` holds more than one element.
for word, index in tokenizer.word_index.items():
    if index in y_pred:
        # Parenthesized form prints the same text on Python 2 and Python 3.
        print(word)
	#!/usr/bin/env python

	"""
	@uthor: Prakhar Mishra
	date: Dec, 12 2017
	"""
	# importing packages
	from numpy import array
	from keras.preprocessing.text import Tokenizer
	from numpy import array
	from keras.utils import to_categorical
	from keras.models import Sequential
	from keras.layers import Dense, LSTM, Embedding

	# source text
	data = """My name is prakhar mishra . prakhar mishra writes blog on medium ."""

	# Learn the word -> id mapping from the text, then encode that same text
	# as a flat list of word ids.
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts([data])
	encoded = tokenizer.texts_to_sequences([data])[0]
	# Example output noted by the author:
	# [8, 5, 7, 1, 2, 1, 2, 9, 3, 6, 4]

	# Vocabulary size: one more than the number of distinct words.  The ids in
	# the example above start at 1, so the extra slot keeps the largest id a
	# valid class index for the one-hot encoding below.
	vocab_size = len(tokenizer.word_index) + 1

	# Pair every word id with the id that follows it: [current, next].
	# (The loop body had lost its indentation; it is restored here.)
	sequences = list()
	for i in range(1, len(encoded)):
		sequence = encoded[i-1:i+1]
		sequences.append(sequence)
	# Example output noted by the author:
	# [[8, 5], [5, 7], [7, 1], [1, 2], [2, 1], [1, 2], [2, 9], [9, 3], [3, 6], [6, 4]]

	# Split the pairs into the network input (X, first column) and the
	# target (y, second column).
	sequences = array(sequences)
	X, y = sequences[:, 0], sequences[:, 1]

	# One-hot encode the targets, one class per vocabulary id.
	y = to_categorical(y, num_classes=vocab_size)

	# NN define
	embedding_size = 10

	def model(vocab_size, embedding_dim=None, lstm_units=50):
		"""Build, compile and summarize the next-word prediction network.

		The network stacks an Embedding layer (one word id per sample), an
		LSTM layer and a softmax Dense layer with one output per vocabulary
		id. It is compiled with categorical cross-entropy loss, the Adam
		optimizer and an accuracy metric, and its summary is printed before
		it is returned.

		Args:
			vocab_size: number of word ids; used both as the Embedding input
				dimension and as the width of the softmax output.
			embedding_dim: width of the word-embedding vectors. When None
				(the default) the ``embedding_size`` variable defined just
				above is read at call time, as the function always did.
			lstm_units: number of units in the LSTM layer (previously
				hard-coded to 50).

		Returns:
			The compiled Sequential model.
		"""
		if embedding_dim is None:
			embedding_dim = embedding_size

		# Named ``network`` so the local does not shadow this function's name.
		network = Sequential()
		# input_length = 1 (one word at a time)
		network.add(Embedding(vocab_size, embedding_dim, input_length=1))
		network.add(LSTM(lstm_units))
		network.add(Dense(vocab_size, activation="softmax"))
		network.compile(
			loss="categorical_crossentropy",
			optimizer="adam",
			metrics=["accuracy"],
		)
		network.summary()
		return network

	# NOTE: this rebinds ``model`` from the builder function to the network it
	# returns, so the builder is no longer reachable under that name afterwards.
	model = model(vocab_size)

	# training starts
	model.fit(X, y, epochs=500, verbose=2)

	# testing the model: ask the network which word id follows the seed word
	seed = "writes"
	encoded = tokenizer.texts_to_sequences([seed])[0]
	encoded = array(encoded)
	# ``predict_classes`` is gone from newer Keras releases; taking the argmax
	# of the softmax output yields the same class ids and works on old and new
	# versions alike.
	y_pred = model.predict(encoded, verbose=0).argmax(axis=-1)

	# Reverse lookup: print every word whose id was predicted.  The loop and
	# ``if`` bodies had lost their indentation and are restored here.  A
	# membership test is used because ``index == y_pred`` raises an
	# ambiguous-truth-value error once ``y_pred`` holds more than one element.
	for word, index in tokenizer.word_index.items():
		if index in y_pred:
			# Parenthesized form prints the same text on Python 2 and Python 3.
			print(word)