Created
February 27, 2019 09:27
-
-
Save conradludgate/ee267b9e2293c35064c8012edaad69e7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports for word2vec | |
from gensim.test.utils import common_texts, get_tmpfile | |
from gensim.models import Word2Vec, KeyedVectors | |
# imports for tensorflow | |
import tensorflow as tf | |
# Numpy | |
import numpy as np | |
from time import time, strftime | |
# Imports for word tokenize | |
from nltk import download, word_tokenize | |
class Stopwatch:
    """Accumulates named timestamps ("laps") and reports per-lap durations."""

    def __init__(self, name):
        # Start mark, labelled with the overall run's name; laps append here.
        self.times = [(time(), name)]

    def lap(self, name):
        """Record a lap under `name` and print its duration since the previous mark."""
        self.times.append((time(), name))
        print("{0}: {1}s".format(name, self.times[-1][0] - self.times[-2][0]))

    def __str__(self):
        """Return one "name: <seconds>s" line per lap, then a final
        total-elapsed line labelled with the name given at construction."""
        lines = []
        for i in range(len(self.times) - 1):
            t, name = self.times[i + 1]
            lines.append("{0}: {1}s".format(name, t - self.times[i][0]))
        start, start_name = self.times[0]
        end, _ = self.times[-1]  # only the final timestamp is needed for the total
        lines.append("{0}: {1}s".format(start_name, end - start))
        return "\n".join(lines)
# Time the whole run, starting now.
sw = Stopwatch("Training example")
# Fetch NLTK's 'punkt' tokenizer models, required by word_tokenize below.
download('punkt')
sw.lap("Download punkt")
# Load the trained word2vec model
# (pre-trained vectors saved at 'data/word2vec-data.bin'; used by transform_into).
w2v = KeyedVectors.load('data/word2vec-data.bin')
sw.lap("Load w2v data")
# Convert text into vectors | |
def transform_into(text, seq_len=100, dim=300):
    """Convert `text` into a fixed-length sequence of word vectors.

    Each token is looked up in the module-level word2vec model `w2v`;
    out-of-vocabulary tokens become zero vectors. The result is padded
    with zero vectors (or truncated) to exactly `seq_len` entries.

    Args:
        text: raw input string, tokenized with nltk's word_tokenize.
        seq_len: fixed output length in tokens.
        dim: width of the padding/OOV zero vectors — must match the
            vector size of the loaded w2v model (300 here).

    Returns:
        A list of `seq_len` numpy vectors.
    """
    data = [w2v.get_vector(token) if token in w2v.vocab else np.zeros(dim)
            for token in word_tokenize(text)]
    # Pad to seq_len with zero vectors, then truncate in case len(data) > seq_len.
    return (data + [np.zeros(dim)] * (seq_len - len(data)))[:seq_len]
def lstm_model(batch_size=None, stateful=True, seq_len=50):
    """Build and compile a binary sentiment classifier over word-vector sequences.

    Architecture: (seq_len, 300) input -> LSTM(64) -> Dense(32, sigmoid)
    -> Dense(1, sigmoid), compiled with Adam(lr=0.01) and binary
    cross-entropy loss / accuracy.

    Args:
        batch_size: optional fixed batch size for the Input layer.
        stateful: currently unused — kept for interface compatibility.
        seq_len: number of word-vector timesteps per example.

    Returns:
        A compiled tf.keras.Model.
    """
    inputs = tf.keras.layers.Input(shape=(seq_len, 300), batch_size=batch_size, dtype=tf.float32)
    lstm = tf.keras.layers.LSTM(64)(inputs)
    # input_shape args removed from the Dense layers below: in the functional
    # API the input shape is inferred from the incoming tensor, so passing
    # input_shape is ignored and only misleads readers.
    hidden = tf.keras.layers.Dense(32, activation="sigmoid")(lstm)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    model = tf.keras.Model(inputs=[inputs], outputs=[output])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr=0.01),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.binary_accuracy])
    return model
# def make_more_data(words, threshold = 0.5): | |
def training_generator(seq_len=50, batch_size=1024, split=900):
    """Yield (source, target) batches for training, forever.

    Loads the labelled IMDB sentences once, then repeatedly yields
    batches sampled with replacement from the first `split` examples;
    the remaining examples are reserved for test_generator.

    Args:
        seq_len: word-vector sequence length passed to transform_into.
        batch_size: number of examples per yielded batch.
        split: index of the train/validation boundary in the dataset.

    Yields:
        (source, target): stacked arrays of shape (batch_size, seq_len, 300)
        and (batch_size, 1) respectively.
    """
    source = []
    target = []
    with tf.gfile.GFile("data/sentiment labelled sentences/imdb_labelled.txt", 'r') as f:
        for txt in f:
            # Each line is "<sentence>\t<label>\n"; skip blank lines so a
            # trailing newline in the file can't crash the label parse.
            if not txt.strip():
                continue
            # txt[:-3] drops the trailing "\t<label>\n"; txt[-2] is the
            # single 0/1 label character.
            source.append(transform_into(txt[:-3], seq_len))
            target.append([int(txt[-2])])
    while True:
        offsets = np.random.randint(0, split, batch_size)
        # Our model uses binary crossentropy loss, but Keras requires labels
        # to have the same rank as the input logits, hence the (batch, 1)
        # target shape built from the single-element label lists above.
        yield (
            np.stack([source[idx] for idx in offsets]),
            np.stack([target[idx] for idx in offsets]),
        )
def test_generator(seq_len=50, batch_size=1024, split=900):
    """Yield (source, target) batches for validation, forever.

    (Docstring fixed: the original said "for training" — this generator
    samples the held-out tail of the dataset, i.e. examples at index
    `split` and beyond, complementing training_generator.)

    Args:
        seq_len: word-vector sequence length passed to transform_into.
        batch_size: number of examples per yielded batch.
        split: index of the train/validation boundary in the dataset.

    Yields:
        (source, target): stacked arrays of shape (batch_size, seq_len, 300)
        and (batch_size, 1) respectively.
    """
    source = []
    target = []
    with tf.gfile.GFile("data/sentiment labelled sentences/imdb_labelled.txt", 'r') as f:
        for txt in f:
            # Each line is "<sentence>\t<label>\n"; skip blank lines so a
            # trailing newline in the file can't crash the label parse.
            if not txt.strip():
                continue
            # txt[:-3] drops the trailing "\t<label>\n"; txt[-2] is the
            # single 0/1 label character.
            source.append(transform_into(txt[:-3], seq_len))
            target.append([int(txt[-2])])
    while True:
        # Sample only from the held-out examples after the split point.
        offsets = np.random.randint(split, len(source), batch_size)
        # Labels keep rank 2 (batch, 1) to match the model's sigmoid output.
        yield (
            np.stack([source[idx] for idx in offsets]),
            np.stack([target[idx] for idx in offsets]),
        )
# Build, train, and save the sentiment model, timing each phase with `sw`.
model = lstm_model(seq_len=50)
sw.lap("Generate model")
model.fit_generator(
    training_generator(seq_len=50, batch_size=512),
    steps_per_epoch=200,
    epochs=10,
    callbacks = [
        tf.keras.callbacks.TensorBoard(
            # NOTE(review): "dtype=tf.float32.logs" looks like an accidental
            # paste into the log-directory name — confirm the intended path.
            log_dir="dtype=tf.float32.logs",
            batch_size=256,
            write_images=True,
            write_graph=True,
            histogram_freq=5,
            write_grads=True
        ),
        # Stop early if the validation loss stops improving.
        tf.keras.callbacks.EarlyStopping()],
    validation_data=test_generator(seq_len=50, batch_size=10),
    validation_steps=10
)
sw.lap("Fit model")
# Print the per-phase timing summary accumulated above.
print(sw)
# Save with a timestamped filename. NOTE(review): ':' in the filename is
# not valid on Windows filesystems — confirm the target platform.
model.save(strftime("data/%Y-%m-%d_%H:%M:%S_sentiment.h5"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment