Created
June 12, 2018 06:36
-
-
Save chatrapathik/8e3d267319075976b31b954d102f3594 to your computer and use it in GitHub Desktop.
Character-level sequence-to-sequence model using SimpleRNN
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from keras.models import Sequential, Model | |
from keras.layers import Dense, SimpleRNN, Input, LSTM | |
# Training corpus. Written as adjacent string literals so the single long
# sentence stays one runtime value while remaining readable in source.
input_text = (
    'India is a vast South Asian country with diverse terrain – from '
    'Himalayan peaks to Indian Ocean coastline – and history reaching '
    'back 5 millennia. In the north, Mughal Empire landmarks include '
    'Delhi’s Red Fort complex and massive Jama Masjid mosque, plus '
    'Agra’s iconic Taj Mahal mausoleum. Pilgrims bathe in the Ganges in '
    'Varanasi, and Rishikesh is a yoga centre and base for Himalayan '
    'trekking.'
)

# Length of the zero-filled vector built for every character; after one-hot
# encoding it becomes the second axis of the model input.
UNITS = 100

# Size of each one-hot vector. Placeholder only: it is assigned its real
# value at the bottom of the file, once the vocabulary has been collected.
VOCABS = None
def char_to_int(chars):
    """Map each character in *chars* to a 1-based integer code.

    Codes start at 1, leaving 0 free; the encoders in this file use 0 for
    positions that hold no character. If a character occurs more than once
    in *chars*, its last position wins.
    """
    return {char: code for code, char in enumerate(chars, start=1)}
def int_to_char(chars):
    """Map 1-based integer codes back to the characters in *chars*.

    This is the inverse of ``char_to_int`` for a sequence without
    duplicates: the first character gets code 1, the second code 2, etc.
    """
    return dict(enumerate(chars, start=1))
def get_unique_chars(text):
    """Return the distinct characters of *text* as a sorted list."""
    return sorted(set(text))
def get_train_and_traget_data(char_to_int, text, units=None):
    """Encode *text* as per-character samples and next-character targets.

    Spaces only separate words and never become samples. Every other
    character yields one float vector of length *units* whose first element
    is the character's integer code and whose remaining elements are 0.

    Args:
        char_to_int: mapping from character to integer code. A character
            missing from the mapping raises ``KeyError``.
        text: the string to encode.
        units: length of each sample vector. Defaults to the module-level
            ``UNITS`` when omitted.

    Returns:
        ``(training_data, target_data)``: a list of NumPy vectors and a list
        of the same length holding, for each sample, the code of the
        character that follows it (across word boundaries). The last sample
        has no successor, so its target is 0. Both lists are empty when
        *text* contains no non-space characters.
    """
    width = UNITS if units is None else units

    training_data = []
    for word in text.split(' '):
        for char in word:
            sample = np.zeros(width)
            sample[0] = char_to_int[char]
            training_data.append(sample)

    # Without samples there is nothing to predict; returning a lone 0 target
    # here would leave the two lists with different lengths.
    if not training_data:
        return [], []

    # The target of sample k is the code stored in sample k + 1.
    target_data = [sample[0] for sample in training_data[1:]]
    target_data.append(0)
    return training_data, target_data
def _one_hot(codes, size):
    """One-hot encode an array of integer codes along a new last axis.

    Codes below 1 mean "no character" and produce an all-zero vector.
    """
    # Truncate towards zero, exactly as int() does for a single value.
    codes = np.asarray(codes).astype(int)
    encoded = np.zeros(codes.shape + (size,))
    present = codes >= 1
    # For every position that holds a real code, switch on the matching slot.
    encoded[present, codes[present]] = 1
    return encoded


def get_one_hot_vectors(training_data, target_data, vocabs=None):
    """One-hot encode the samples and targets for the network.

    Args:
        training_data: sequence of equal-length vectors of integer codes.
        target_data: sequence of integer codes, one per sample.
        vocabs: length of each one-hot vector. Defaults to the module-level
            ``VOCABS`` when omitted. A code greater than or equal to this
            value raises ``IndexError``.

    Returns:
        ``(training_data, target_data)`` as float NumPy arrays with one
        extra trailing axis of length *vocabs*. Codes below 1 are encoded
        as all zeros.
    """
    size = VOCABS if vocabs is None else vocabs
    return _one_hot(training_data, size), _one_hot(target_data, size)
def create_model(data):
    """Build the recurrent network for next-character prediction.

    Args:
        data: the encoded training array; its second and third dimensions
            are used as the input shape of the recurrent layer.

    Returns:
        An uncompiled ``Sequential`` model: a 256-unit ``SimpleRNN`` followed
        by a ``Dense`` layer with ``VOCABS`` outputs.
    """
    model = Sequential()
    model.add(SimpleRNN(256, input_shape=(data.shape[1], data.shape[2])))
    # softmax, not sigmoid: the targets are one-hot vectors and the model is
    # trained with categorical cross-entropy, so the outputs should form a
    # single probability distribution over the vocabulary.
    model.add(Dense(VOCABS, activation='softmax'))
    return model
def compile_model(model):
    """Compile *model* with the Adam optimizer and categorical cross-entropy.

    The model is configured in place and also returned, so the call can be
    chained.
    """
    settings = {
        'optimizer': 'adam',
        'loss': 'categorical_crossentropy',
    }
    model.compile(**settings)
    return model
def train_model(model, trainx, testx, *, epochs=100, batch_size=32):
    """Fit *model* and return it.

    Args:
        model: a compiled model exposing ``fit``.
        trainx: the encoded input samples.
        testx: the encoded targets for those samples (despite the name, this
            is not a held-out test set; it is passed to ``fit`` as ``y``).
        epochs: number of passes over the data.
        batch_size: number of samples per gradient update.

    Returns:
        The same model instance, after fitting.
    """
    model.fit(trainx, testx, epochs=epochs, batch_size=batch_size)
    return model
def test_model(model, chars_to_int):
    """Generate text by repeatedly predicting the next character.

    Starting from the seed ``'India'``, the whole text generated so far is
    re-encoded on each step and the prediction made for its final character
    is appended. Twenty steps are performed.

    Args:
        model: a trained model exposing ``predict``.
        chars_to_int: mapping from character to integer code, as used for
            training.

    Returns:
        The seed followed by the generated characters. A step whose
        predicted code has no character (for example code 0) appends
        nothing.
    """
    in_text = 'India'
    n_chars = 20

    # Invert the mapping once instead of scanning it on every step.
    code_to_char = {code: char for char, code in chars_to_int.items()}

    for _ in range(n_chars):
        # encode the characters as integers
        encoded, targets = get_train_and_traget_data(chars_to_int, in_text)
        # one hot encode
        encoded, targets = get_one_hot_vectors(encoded, targets)
        # predict: one row of class scores per character of in_text
        scores = model.predict(encoded, verbose=0)
        # The last row belongs to the final character, i.e. the one whose
        # successor we want; its highest-scoring class is the next code.
        predicted_code = int(np.argmax(scores[-1]))
        # reverse map integer to character and append to input
        in_text += code_to_char.get(predicted_code, '')
    return in_text
# Collect the vocabulary. VOCABS has one slot more than there are characters
# because character codes start at 1 and code 0 means "no character".
unique_chars = get_unique_chars(input_text)
chars_to_int = char_to_int(unique_chars)
VOCABS = 1 + len(unique_chars)

# Turn the corpus into integer codes, then into one-hot arrays.
training_data, target_data = get_one_hot_vectors(
    *get_train_and_traget_data(chars_to_int, input_text)
)

# Build, compile and fit the network.
model = train_model(
    compile_model(create_model(training_data)),
    training_data,
    target_data,
)

# Print a short sample generated by the trained model.
print(test_model(model, chars_to_int))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment