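# A small GloVe implementation in TensorFlow (after Pennington et al., 2014),
# with the graph construction adapted from GradySimon/tensorflow-glove.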
import numpy as np
import tensorflow as tf
from random import shuffle
class Glove:
    """Trains GloVe word embeddings over a small corpus using TensorFlow."""
    def __init__(self, corpus, context_size=2, learning_rate=0.05, batch_size=100, num_epochs=100):
        super(Glove, self).__init__()
        self.corpus = corpus
        self.CONTEXT_SIZE = context_size
        self.word_map = {}
        self.count = 0
        self.learning_rate = learning_rate
        self.summaries = []
        self.batch_size = batch_size
        self.num_epochs = num_epochs
    def word_mapping(self, corpus):
        # assign every distinct word an integer id.
        for sentence in corpus:
            for word in sentence.split():
                if word not in self.word_map:
                    self.word_map[word] = self.count
                    self.count += 1
        print(self.word_map)
    def get_index(self, word):
        return self.word_map[word]
    def build_coo_matrix(self):
        self.word_mapping(self.corpus)
        self.cooccurence_matrix = np.zeros([self.count, self.count])
        for sentence in self.corpus:
            for l_window, r_window, center_word in context_words(sentence, self.CONTEXT_SIZE):
                # harmonic weighting: a context word at distance d from the
                # center contributes 1/d to the co-occurrence count.
                for i, word in enumerate(l_window[::-1]):
                    self.cooccurence_matrix[self.word_map[center_word], self.word_map[word]] += 1.0 / (1 + i)
                for i, word in enumerate(r_window):
                    self.cooccurence_matrix[self.word_map[center_word], self.word_map[word]] += 1.0 / (1 + i)
        self.tf_graph()
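    # GloVe's weighting function f(x) = min(1, (x / x_max)^alpha), here with
    # x_max = 100 and alpha = 0.75 as in the paper; it damps the influence of
    # very frequent co-occurrence counts. Kept as a scalar reference version;
    # the graph below applies the same formula with tensor ops.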
    def f(self, cooc_count):
        return min(1.0, pow(cooc_count / 100.0, 0.75))
    def tf_graph(self):
        # tensorflow api functions referenced from
        # https://github.com/GradySimon/tensorflow-glove/blob/master/tf_glove.py
        # really helpful.
        self.graph = tf.Graph()
        with self.graph.as_default():
            # feed_dict vars
            # these hold the ids of words in word_map.
            # placeholders are made for feeding data in graph iterations;
            # shape=[None] lets the batch size vary between runs.
            self.current_words_main = tf.placeholder(tf.int32, shape=[None])
            self.current_words_context = tf.placeholder(tf.int32, shape=[None])
            # cooccurence matrix counts for each word in main, and its corresponding context.
            self.cooc_counts = tf.placeholder(tf.float32, shape=[None])
            # variables (20-dimensional embeddings, initialised uniformly in [-1, 1]).
            tf_train_w_main = tf.Variable(tf.random_uniform(
                [len(self.word_map), 20], -1.0, 1.0))
            tf_train_w_context = tf.Variable(tf.random_uniform(
                [len(self.word_map), 20], -1.0, 1.0))
            tf_train_b_main = tf.Variable(tf.random_uniform([len(self.word_map)], -1.0, 1.0))
            tf_train_b_context = tf.Variable(tf.random_uniform([len(self.word_map)], -1.0, 1.0))
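            # Note: GloVe learns separate "main" and "context" embedding tables;
            # the paper reports using their sum as the final word vectors.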
            # extract embeddings for the words in the current batch.
            # weights.
            main_embeddings_w = tf.nn.embedding_lookup([tf_train_w_main], self.current_words_main)
            context_embeddings_w = tf.nn.embedding_lookup([tf_train_w_context], self.current_words_context)
            # biases
            main_embeddings_b = tf.nn.embedding_lookup([tf_train_b_main], self.current_words_main)
            context_embeddings_b = tf.nn.embedding_lookup([tf_train_b_context], self.current_words_context)
            ## calculations.
            # GloVe loss per pair: f(X_ij) * (w_i . w_j + b_i + b_j - log X_ij)^2.
            log_counts = tf.log(self.cooc_counts)
            # tensor version of the weighting function f above.
            weights = tf.minimum(1.0, tf.pow(self.cooc_counts / 100.0, 0.75))
            prod_sum = tf.reduce_sum(tf.multiply(main_embeddings_w, context_embeddings_w), axis=1)
            single_losses = weights * tf.square(tf.add_n(
                [prod_sum, main_embeddings_b, context_embeddings_b, tf.negative(log_counts)]))
            total_loss_sum = tf.reduce_sum(single_losses)
            # Using adaptive gradient optimizer. Sources of information
            # - http://sebastianruder.com/optimizing-gradient-descent/
            self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                total_loss_sum)
            #self.summaries.append(tf.merge_all_summaries())
    def train_model(self):
        # mini-batches of (main_id, context_id, count) triples, taken from
        # the nonzero entries of the co-occurrence matrix.
        nonzero = np.transpose(np.nonzero(self.cooccurence_matrix))
        triples = [(i, j, self.cooccurence_matrix[i, j]) for i, j in nonzero]
        # start session.
        with tf.Session(graph=self.graph) as session:
            tf.global_variables_initializer().run()
            for epoch in range(self.num_epochs):
                shuffle(triples)
                batches = [triples[k:k + self.batch_size]
                           for k in range(0, len(triples), self.batch_size)]
                for batch in batches:
                    main_ids, context_ids, counts = zip(*batch)
                    feed_dict = {
                        self.current_words_main: main_ids,
                        self.current_words_context: context_ids,
                        self.cooc_counts: counts,
                    }
                    session.run([self.optimizer], feed_dict=feed_dict)
def context_words(sentence, CONTEXT_SIZE):
    sentence = sentence.split()
    for ind in range(len(sentence)):
        left_index = ind - CONTEXT_SIZE
        right_index = ind + 1 + CONTEXT_SIZE
        yield window_words(left_index, right_index, sentence)
def window_words(left_index, right_index, sentence):
    # recover the center index; the window spans [left_index, right_index).
    ind = (left_index + right_index - 1) // 2
    maxlen = len(sentence)
    words_left = sentence[max(left_index, 0): ind]
    words_right = sentence[ind + 1: min(right_index, maxlen)]
    return words_left, words_right, sentence[ind]
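# Example (for illustration): for the sentence "a b c" with CONTEXT_SIZE=1,
# context_words yields ([], ['b'], 'a'), (['a'], ['c'], 'b'), (['b'], [], 'c').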
corpus = ["This is me writing a corpus.", "This is me writing a corpus.", "This is me writing a corpus."]
glove = Glove(corpus)
glove.build_coo_matrix()
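# A minimal end-to-end run (a sketch, assuming the fixes above): building the
# co-occurrence matrix also builds the graph, so training can follow directly.
glove.train_model()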