@maxpagels
Last active July 25, 2017 11:31
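# Keras seq2seq example: an LSTM encoder-decoder trained on text pairs.
# The script expects a semicolon-delimited CSV (path passed as the first CLI
# argument) with 'data' and 'label' columns; both columns are tokenised,
# padded to MAX_SEQUENCE_LENGTH, and the model learns to map data to label.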
import csv
import sys
import numpy
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, TimeDistributed, RepeatVector
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
numpy.random.seed(42)
MAX_SEQUENCE_LENGTH = 3
NUM_WORDS = 450
EMBEDDING_SIZE = 100
NUM_LSTM_LAYERS = 5
HIDDEN_SIZE = 32
NUM_EPOCHS = 200
def seq2sentence(seq, index2word):
    # Turn a padded sequence of word indices back into a space-separated string.
    sentence = ''
    for idx in seq:
        sentence += index2word[idx] + ' '
    return sentence.strip()

def one_hot_seq2sentence(seq, index2word):
    # Turn a sequence of one-hot (or softmax) vectors back into a string by
    # taking the argmax at each timestep.
    sentence = ''
    for one_hot in seq:
        sentence += index2word[numpy.argmax(one_hot)] + ' '
    return sentence.strip()
# Read the training pairs from a semicolon-delimited CSV with 'data' and 'label' columns.
with open(sys.argv[1], 'r') as csvfile:
    dataset = csv.DictReader(csvfile, delimiter=';')
    texts = []
    X = []
    Y = []
    for row in dataset:
        texts.append(row['data'])
        texts.append(row['label'])
        X.append(row['data'])
        Y.append(row['label'])
tokenizer = Tokenizer(num_words=NUM_WORDS, lower=False, split=" ")
tokenizer.fit_on_texts(texts)
X_sequences = tokenizer.texts_to_sequences(X)
Y_sequences = tokenizer.texts_to_sequences(Y)
X_train = pad_sequences(X_sequences, maxlen=MAX_SEQUENCE_LENGTH)
Y_sequences = pad_sequences(Y_sequences, maxlen=MAX_SEQUENCE_LENGTH)
Y_train = []
for seq in Y_sequences:
    Y_train.append(to_categorical(seq, num_classes=NUM_WORDS + 1))
Y_train = numpy.array(Y_train)
print(X_train.shape)
print(Y_train.shape)
# Reverse lookup from index to word, so predictions can be decoded back to text.
word2index = tokenizer.word_index
index2word = {y: x for x, y in word2index.items()}
index2word[0] = ''  # index 0 is the padding token
# --- Model ---
model = Sequential()
# --- Encoder ---
model.add(Embedding(NUM_WORDS + 1, EMBEDDING_SIZE, input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, name='emb'))
model.add(LSTM(HIDDEN_SIZE))
model.add(RepeatVector(MAX_SEQUENCE_LENGTH))
# --- Decoder ---
for _ in range(NUM_LSTM_LAYERS):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(NUM_WORDS + 1)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# --- Add callbacks & start training ---
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=10, embeddings_layer_names=['emb'], embeddings_metadata=None)
filepath="keras-seq2seq-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint_callback = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, period=1)
model.fit(X_train, Y_train, validation_split=0.2, epochs=NUM_EPOCHS, batch_size=32, callbacks=[tensorboard_callback, checkpoint_callback])
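# --- Usage sketch (not part of the original gist) ---
# A minimal example of decoding predictions with the helpers defined above,
# assuming training has finished; it reuses X_train and index2word from earlier.
predictions = model.predict(X_train[:5])
for input_seq, predicted_seq in zip(X_train[:5], predictions):
    print(seq2sentence(input_seq, index2word), '->', one_hot_seq2sentence(predicted_seq, index2word))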