Created
February 27, 2019 09:27
-
-
Save conradludgate/ee267b9e2293c35064c8012edaad69e7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports for word2vec | |
from gensim.test.utils import common_texts, get_tmpfile | |
from gensim.models import Word2Vec, KeyedVectors | |
# imports for tensorflow | |
import tensorflow as tf | |
# Numpy | |
import numpy as np | |
from time import time, strftime | |
# Imports for word tokenize | |
from nltk import download, word_tokenize | |
class Stopwatch:
    """Accumulates named timestamps ("laps") and reports per-lap durations."""

    def __init__(self, name):
        # Start mark, labelled with the overall run's name; laps append here.
        self.times = [(time(), name)]

    def lap(self, name):
        """Record a lap under `name` and print its duration since the previous mark."""
        self.times.append((time(), name))
        print("{0}: {1}s".format(name, self.times[-1][0] - self.times[-2][0]))

    def __str__(self):
        """Return one "name: <seconds>s" line per lap, then a final
        total-elapsed line labelled with the name given at construction."""
        lines = []
        for i in range(len(self.times) - 1):
            t, name = self.times[i + 1]
            lines.append("{0}: {1}s".format(name, t - self.times[i][0]))
        start, start_name = self.times[0]
        end, _ = self.times[-1]  # only the final timestamp is needed for the total
        lines.append("{0}: {1}s".format(start_name, end - start))
        return "\n".join(lines)
# Time the whole run, starting now.
sw = Stopwatch("Training example")
# Fetch NLTK's 'punkt' tokenizer models, required by word_tokenize below.
download('punkt')
sw.lap("Download punkt")
# Load the trained word2vec model
# (pre-trained vectors saved at 'data/word2vec-data.bin'; used by transform_into).
w2v = KeyedVectors.load('data/word2vec-data.bin')
sw.lap("Load w2v data")
# Convert text into vectors | |
def transform_into(text, seq_len=100, dim=300):
    """Convert `text` into a fixed-length sequence of word vectors.

    Each token is looked up in the module-level word2vec model `w2v`;
    out-of-vocabulary tokens become zero vectors. The result is padded
    with zero vectors (or truncated) to exactly `seq_len` entries.

    Args:
        text: raw input string, tokenized with nltk's word_tokenize.
        seq_len: fixed output length in tokens.
        dim: width of the padding/OOV zero vectors — must match the
            vector size of the loaded w2v model (300 here).

    Returns:
        A list of `seq_len` numpy vectors.
    """
    data = [w2v.get_vector(token) if token in w2v.vocab else np.zeros(dim)
            for token in word_tokenize(text)]
    # Pad to seq_len with zero vectors, then truncate in case len(data) > seq_len.
    return (data + [np.zeros(dim)] * (seq_len - len(data)))[:seq_len]
def lstm_model(batch_size=None, stateful=True, seq_len=50):
    """Build and compile a binary sentiment classifier over word-vector sequences.

    Architecture: (seq_len, 300) input -> LSTM(64) -> Dense(32, sigmoid)
    -> Dense(1, sigmoid), compiled with Adam(lr=0.01) and binary
    cross-entropy loss / accuracy.

    Args:
        batch_size: optional fixed batch size for the Input layer.
        stateful: currently unused — kept for interface compatibility.
        seq_len: number of word-vector timesteps per example.

    Returns:
        A compiled tf.keras.Model.
    """
    inputs = tf.keras.layers.Input(shape=(seq_len, 300), batch_size=batch_size, dtype=tf.float32)
    lstm = tf.keras.layers.LSTM(64)(inputs)
    # input_shape args removed from the Dense layers below: in the functional
    # API the input shape is inferred from the incoming tensor, so passing
    # input_shape is ignored and only misleads readers.
    hidden = tf.keras.layers.Dense(32, activation="sigmoid")(lstm)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    model = tf.keras.Model(inputs=[inputs], outputs=[output])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr=0.01),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.binary_accuracy])
    return model
# def make_more_data(words, threshold = 0.5): | |
def training_generator(seq_len=50, batch_size=1024, split=900):
    """Yield (source, target) batches for training, forever.

    Loads the labelled IMDB sentences once, then repeatedly yields
    batches sampled with replacement from the first `split` examples;
    the remaining examples are reserved for test_generator.

    Args:
        seq_len: word-vector sequence length passed to transform_into.
        batch_size: number of examples per yielded batch.
        split: index of the train/validation boundary in the dataset.

    Yields:
        (source, target): stacked arrays of shape (batch_size, seq_len, 300)
        and (batch_size, 1) respectively.
    """
    source = []
    target = []
    with tf.gfile.GFile("data/sentiment labelled sentences/imdb_labelled.txt", 'r') as f:
        for txt in f:
            # Each line is "<sentence>\t<label>\n"; skip blank lines so a
            # trailing newline in the file can't crash the label parse.
            if not txt.strip():
                continue
            # txt[:-3] drops the trailing "\t<label>\n"; txt[-2] is the
            # single 0/1 label character.
            source.append(transform_into(txt[:-3], seq_len))
            target.append([int(txt[-2])])
    while True:
        offsets = np.random.randint(0, split, batch_size)
        # Our model uses binary crossentropy loss, but Keras requires labels
        # to have the same rank as the input logits, hence the (batch, 1)
        # target shape built from the single-element label lists above.
        yield (
            np.stack([source[idx] for idx in offsets]),
            np.stack([target[idx] for idx in offsets]),
        )
def test_generator(seq_len=50, batch_size=1024, split=900):
    """Yield (source, target) batches for validation, forever.

    (Docstring fixed: the original said "for training" — this generator
    samples the held-out tail of the dataset, i.e. examples at index
    `split` and beyond, complementing training_generator.)

    Args:
        seq_len: word-vector sequence length passed to transform_into.
        batch_size: number of examples per yielded batch.
        split: index of the train/validation boundary in the dataset.

    Yields:
        (source, target): stacked arrays of shape (batch_size, seq_len, 300)
        and (batch_size, 1) respectively.
    """
    source = []
    target = []
    with tf.gfile.GFile("data/sentiment labelled sentences/imdb_labelled.txt", 'r') as f:
        for txt in f:
            # Each line is "<sentence>\t<label>\n"; skip blank lines so a
            # trailing newline in the file can't crash the label parse.
            if not txt.strip():
                continue
            # txt[:-3] drops the trailing "\t<label>\n"; txt[-2] is the
            # single 0/1 label character.
            source.append(transform_into(txt[:-3], seq_len))
            target.append([int(txt[-2])])
    while True:
        # Sample only from the held-out examples after the split point.
        offsets = np.random.randint(split, len(source), batch_size)
        # Labels keep rank 2 (batch, 1) to match the model's sigmoid output.
        yield (
            np.stack([source[idx] for idx in offsets]),
            np.stack([target[idx] for idx in offsets]),
        )
# Build, train, and save the sentiment model, timing each phase with `sw`.
model = lstm_model(seq_len=50)
sw.lap("Generate model")
model.fit_generator(
    training_generator(seq_len=50, batch_size=512),
    steps_per_epoch=200,
    epochs=10,
    callbacks = [
        tf.keras.callbacks.TensorBoard(
            # NOTE(review): "dtype=tf.float32.logs" looks like an accidental
            # paste into the log-directory name — confirm the intended path.
            log_dir="dtype=tf.float32.logs",
            batch_size=256,
            write_images=True,
            write_graph=True,
            histogram_freq=5,
            write_grads=True
        ),
        # Stop early if the validation loss stops improving.
        tf.keras.callbacks.EarlyStopping()],
    validation_data=test_generator(seq_len=50, batch_size=10),
    validation_steps=10
)
sw.lap("Fit model")
# Print the per-phase timing summary accumulated above.
print(sw)
# Save with a timestamped filename. NOTE(review): ':' in the filename is
# not valid on Windows filesystems — confirm the target platform.
model.save(strftime("data/%Y-%m-%d_%H:%M:%S_sentiment.h5"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment