# word2vec + LSTM sentiment classifier training script
# Gist by @conradludgate, created 2019-02-27

# Imports for word2vec
from gensim.models import KeyedVectors
# Imports for tensorflow
import tensorflow as tf
# Numpy
import numpy as np
# Timing utilities
from time import time, strftime
# Imports for word tokenize
from nltk import download, word_tokenize
class Stopwatch:
    """Records named laps and reports the time taken by each."""

    def __init__(self, name):
        self.times = [(time(), name)]

    def lap(self, name):
        self.times.append((time(), name))
        print("{0}: {1}s".format(name, self.times[-1][0] - self.times[-2][0]))

    def __str__(self):
        times = []
        for i in range(len(self.times) - 1):
            (t, name) = self.times[i + 1]
            times.append("{0}: {1}s".format(name, t - self.times[i][0]))
        # Total elapsed time, labelled with the stopwatch's own name
        (t1, name1) = self.times[0]
        (t2, _) = self.times[-1]
        times.append("{0}: {1}s".format(name1, t2 - t1))
        return "\n".join(times)

sw = Stopwatch("Training example")
download('punkt')
sw.lap("Download punkt")
# Load the trained word2vec model
w2v = KeyedVectors.load('data/word2vec-data.bin')
sw.lap("Load w2v data")
# Convert text into a fixed-length sequence of word vectors
def transform_into(text, seq_len=100):
    """Tokenise `text` and map each token to its 300-dim word2vec vector,
    padding with zero vectors (also used for out-of-vocabulary tokens) and
    truncating so the result is exactly `seq_len` vectors long."""
    data = [w2v.get_vector(token)
            if token in w2v.vocab else np.zeros(300)
            for token in word_tokenize(text)]
    return (data + [np.zeros(300)] * (seq_len - len(data)))[:seq_len]
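
# Quick sanity check (a sketch; assumes the loaded vectors are 300-dimensional,
# matching the zero-padding above): any input, however long, should stack into
# a (seq_len, 300) array.
# vecs = np.stack(transform_into("A short example sentence.", seq_len=50))
# assert vecs.shape == (50, 300)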
def lstm_model(batch_size=None, seq_len=50):
    """Binary sentiment classifier: an LSTM over word vectors followed by
    two dense layers, ending in a single sigmoid output."""
    inputs = tf.keras.layers.Input(shape=(seq_len, 300), batch_size=batch_size, dtype=tf.float32)
    lstm = tf.keras.layers.LSTM(64)(inputs)
    nn = tf.keras.layers.Dense(32, activation="sigmoid")(lstm)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(nn)
    model = tf.keras.Model(inputs=[inputs], outputs=[output])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr=0.01),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.binary_accuracy])
    return model
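
# A quick shape check (a sketch; nothing assumed beyond the model above):
# lstm_model(seq_len=50).summary()
# Input (None, 50, 300) -> LSTM (None, 64) -> Dense (None, 32) -> Dense (None, 1)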
def training_generator(seq_len=50, batch_size=1024):
    """A generator that yields (source, target) batches for training."""
    source = []
    target = []
    # Each line is "sentence<TAB><label>\n", so txt[-2] is the 0/1 label
    # and txt[:-3] strips the tab, label and newline.
    with tf.gfile.GFile("data/sentiment labelled sentences/imdb_labelled.txt", 'r') as f:
        txt = f.readline()
        while len(txt) > 0:
            source.append(transform_into(txt[:-3], seq_len))
            target.append([int(txt[-2])])
            txt = f.readline()
    while True:
        # Train on the first 900 sentences; the rest are held out for testing.
        offsets = np.random.randint(0, 900, batch_size)
        # Our model uses binary cross-entropy loss, but Keras requires labels
        # to have the same rank as the logits, so each target is wrapped in a
        # one-element list to give shape (batch_size, 1).
        yield (
            np.stack([source[idx] for idx in offsets]),
            np.stack([target[idx] for idx in offsets]),
        )
def test_generator(seq_len=50, batch_size=1024):
    """A generator that yields (source, target) batches for validation."""
    source = []
    target = []
    with tf.gfile.GFile("data/sentiment labelled sentences/imdb_labelled.txt", 'r') as f:
        txt = f.readline()
        while len(txt) > 0:
            source.append(transform_into(txt[:-3], seq_len))
            target.append([int(txt[-2])])
            txt = f.readline()
    while True:
        # Validate on the sentences after the 900-sentence training split.
        offsets = np.random.randint(900, len(source), batch_size)
        yield (
            np.stack([source[idx] for idx in offsets]),
            np.stack([target[idx] for idx in offsets]),
        )
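
# Checking a generator batch by hand (a sketch; shapes follow from the
# definitions above, though building a generator re-reads the whole file):
# xs, ys = next(test_generator(seq_len=50, batch_size=4))
# print(xs.shape, ys.shape)  # expected: (4, 50, 300) (4, 1)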
model = lstm_model(seq_len=50)
sw.lap("Generate model")
model.fit_generator(
    training_generator(seq_len=50, batch_size=512),
    steps_per_epoch=200,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.TensorBoard(
            log_dir="logs",
            batch_size=256,
            write_images=True,
            write_graph=True,
            histogram_freq=5,
            write_grads=True
        ),
        tf.keras.callbacks.EarlyStopping()],
    validation_data=test_generator(seq_len=50, batch_size=10),
    validation_steps=10
)
sw.lap("Fit model")
print(sw)
model.save(strftime("data/%Y-%m-%d_%H:%M:%S_sentiment.h5"))
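
# Reloading the saved model later (a sketch; the timestamped filename below is
# hypothetical - substitute the actual path produced by strftime above):
# loaded = tf.keras.models.load_model("data/2019-02-27_09:27:00_sentiment.h5")
# print(loaded.predict(np.stack([transform_into("What a wonderful film!", seq_len=50)])))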