import csv
import sys

import numpy
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, TimeDistributed, RepeatVector
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
numpy.random.seed(42)

MAX_SEQUENCE_LENGTH = 3  # input and output length, in tokens
NUM_WORDS = 450          # vocabulary size kept by the tokenizer
EMBEDDING_SIZE = 100     # dimensionality of the learned word embeddings
NUM_LSTM_LAYERS = 5      # stacked LSTM layers in the decoder
HIDDEN_SIZE = 32         # LSTM hidden state size
NUM_EPOCHS = 200
def seq2sentence(seq, index2word):
    """Turn a sequence of word indices back into a space-separated sentence."""
    sentence = ''
    for idx in seq:
        sentence += index2word[idx] + ' '
    return sentence.strip()


def one_hot_seq2sentence(seq, index2word):
    """Turn a sequence of one-hot (or softmax) vectors into a sentence via argmax."""
    sentence = ''
    for one_hot in seq:
        sentence += index2word[numpy.argmax(one_hot)] + ' '
    return sentence.strip()
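
# Example with a hypothetical toy vocabulary (not part of the original data):
# with index2word = {0: '', 1: 'hello', 2: 'world'},
# seq2sentence([1, 2], index2word) returns 'hello world'.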

with open(sys.argv[1], 'r') as csvfile:
    dataset = csv.DictReader(csvfile, delimiter=';')
    texts = []
    X = []
    Y = []
    # DictReader is lazy, so consume the rows while the file is still open.
    for row in dataset:
        texts.append(row['data'])
        texts.append(row['label'])
        X.append(row['data'])
        Y.append(row['label'])

tokenizer = Tokenizer(num_words=NUM_WORDS, lower=False, split=" ")
tokenizer.fit_on_texts(texts)

X_sequences = tokenizer.texts_to_sequences(X)
Y_sequences = tokenizer.texts_to_sequences(Y)

X_train = pad_sequences(X_sequences, maxlen=MAX_SEQUENCE_LENGTH)
Y_sequences = pad_sequences(Y_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the targets; index 0 is reserved for padding, hence NUM_WORDS + 1 classes.
Y_train = []
for seq in Y_sequences:
    Y_train.append(to_categorical(seq, num_classes=NUM_WORDS + 1))
Y_train = numpy.array(Y_train)

print(X_train.shape)
print(Y_train.shape)

word2index = tokenizer.word_index
index2word = {y: x for x, y in word2index.items()}
index2word[0] = ''  # decode the padding index to an empty string
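
# Optional sanity check (not in the original gist): decode the first padded
# input back to text to confirm the tokenizer/index mapping round-trips.
if len(X_train) > 0:
    print('First training input decoded:', seq2sentence(X_train[0], index2word))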

# --- Model ---
model = Sequential()

# --- Encoder ---
model.add(Embedding(NUM_WORDS + 1, EMBEDDING_SIZE, input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, name='emb'))
model.add(LSTM(HIDDEN_SIZE))
model.add(RepeatVector(MAX_SEQUENCE_LENGTH))

# --- Decoder ---
for _ in range(NUM_LSTM_LAYERS):
    model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(NUM_WORDS + 1)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# --- Add callbacks & start training ---
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False,
                                   embeddings_freq=10, embeddings_layer_names=['emb'], embeddings_metadata=None)
filepath = "keras-seq2seq-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint_callback = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, period=1)
model.fit(X_train, Y_train, validation_split=0.2, epochs=NUM_EPOCHS, batch_size=32,
          callbacks=[tensorboard_callback, checkpoint_callback])
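
# Quick qualitative check after training (not in the original gist): decode the
# model's prediction for the first training example with the helpers defined
# above. To evaluate a saved checkpoint instead, call model.load_weights()
# with one of the .hdf5 files written by ModelCheckpoint.
predictions = model.predict(X_train[:1])
print('input: ', seq2sentence(X_train[0], index2word))
print('output:', one_hot_seq2sentence(predictions[0], index2word))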