@phraniiac
Created February 25, 2017 06:52
import numpy as np
import tensorflow as tf
from random import shuffle


class Glove(object):
    """GloVe word embeddings, trained with TensorFlow."""

    def __init__(self, corpus, context_size=2, learning_rate=0.05, batch_size=100, num_epochs=100):
        super(Glove, self).__init__()
        self.corpus = corpus
        self.CONTEXT_SIZE = context_size
        self.word_map = {}
        self.count = 0
        self.learning_rate = learning_rate
        self.summaries = []
        self.batch_size = batch_size
        self.num_epochs = num_epochs

    def word_mapping(self, corpus):
        # assign an integer id to every distinct word in the corpus.
        for sentence in corpus:
            for word in sentence.split():
                if word not in self.word_map:
                    self.word_map[word] = self.count
                    self.count += 1
        print(self.word_map)

    def get_index(self, word):
        return self.word_map[word]

    def build_coo_matrix(self):
        self.word_mapping(self.corpus)
        self.cooccurence_matrix = np.zeros([self.count, self.count])
        for sentence in self.corpus:
            for l_window, r_window, center_word in context_words(sentence, self.CONTEXT_SIZE):
                # words closer to the center word get a larger 1 / (1 + distance) increment.
                for i, word in enumerate(l_window[::-1]):
                    self.cooccurence_matrix[self.word_map[center_word], self.word_map[word]] += 1.0 / (1 + i)
                for i, word in enumerate(r_window):
                    self.cooccurence_matrix[self.word_map[center_word], self.word_map[word]] += 1.0 / (1 + i)
        self.tf_graph()
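
    # Worked example of the increments above: with CONTEXT_SIZE=2 and the sentence
    # "a b c d e", the center word "c" gets co-occurrence increments of 1.0 for
    # "b" and "d" (distance 1) and 0.5 for "a" and "e" (distance 2).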

    def f(self, cooc_count):
        # GloVe weighting function with x_max = 100 and alpha = 0.75.
        return min(1, pow(cooc_count / 100.0, 0.75))
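
    # For example, f(10) = (10 / 100) ** 0.75 ≈ 0.178, while f(250) is capped at 1.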

    def tf_graph(self):
        # tensorflow api functions referenced from
        # https://github.com/GradySimon/tensorflow-glove/blob/master/tf_glove.py
        # really helpful.
        self.graph = tf.Graph()
        with self.graph.as_default():
            # feed_dict placeholders, filled at every training iteration.
            # these hold the word_map ids of the main and context words in a batch;
            # shape=[None] lets the graph accept any batch size.
            self.current_words_main = tf.placeholder(tf.int32, shape=[None])
            self.current_words_context = tf.placeholder(tf.int32, shape=[None])
            # cooccurrence matrix counts for each main word and its corresponding context word.
            self.cooc_counts = tf.placeholder(tf.float32, shape=[None])

            # variables: word and context embeddings (dimension 20) plus per-word biases.
            tf_train_w_main = tf.Variable(
                tf.random_uniform([len(self.word_map), 20], -1.0, 1.0))
            tf_train_w_context = tf.Variable(
                tf.random_uniform([len(self.word_map), 20], -1.0, 1.0))
            tf_train_b_main = tf.Variable(tf.random_uniform([len(self.word_map)], -1.0, 1.0))
            tf_train_b_context = tf.Variable(tf.random_uniform([len(self.word_map)], -1.0, 1.0))

            # extract embeddings
            # weights.
            main_embeddings_w = tf.nn.embedding_lookup([tf_train_w_main], self.current_words_main)
            context_embeddings_w = tf.nn.embedding_lookup([tf_train_w_context], self.current_words_context)
            print(main_embeddings_w)
            # biases
            main_embeddings_b = tf.nn.embedding_lookup([tf_train_b_main], self.current_words_main)
            print(main_embeddings_b)  # checking
            context_embeddings_b = tf.nn.embedding_lookup([tf_train_b_context], self.current_words_context)

            ## calculations.
            # GloVe loss per (main, context) pair:
            #   (w_i . w~_j + b_i + b~_j - log X_ij) ** 2
            log_counts = tf.log(self.cooc_counts)
            prod_sum = tf.reduce_sum(tf.multiply(main_embeddings_w, context_embeddings_w), axis=1)
            single_losses = tf.square(
                prod_sum + main_embeddings_b + context_embeddings_b - log_counts)
            total_loss_sum = tf.reduce_sum(single_losses)
            # Using adaptive gradient optimizer. Sources of information
            # - http://sebastianruder.com/optimizing-gradient-descent/
            self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                total_loss_sum)
            print("Total Loss sum: " + str(total_loss_sum))
            #self.summaries.append(tf.merge_all_summaries())
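
            # Note: the weighting function f() above is defined but never applied in
            # this graph. A weighted loss, as in the GloVe paper, could look like the
            # following sketch (an assumption, not part of the original gist):
            #   weighting = tf.minimum(1.0, tf.pow(self.cooc_counts / 100.0, 0.75))
            #   single_losses = weighting * tf.square(
            #       prod_sum + main_embeddings_b + context_embeddings_b - log_counts)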

    def train_model(self):
        # mini_batches iterator (batch_iterator is a helper assumed below,
        # not part of the original gist).
        batches = list(self.batch_iterator())
        # start session.
        with tf.Session(graph=self.graph) as session:
            tf.global_variables_initializer().run()
            for epoch in range(self.num_epochs):
                shuffle(batches)
                for batch in batches:
                    feed_dict = {
                        self.current_words_main: batch[0],
                        self.current_words_context: batch[1],
                        self.cooc_counts: batch[2],
                    }
                    session.run([self.optimizer], feed_dict=feed_dict)
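
    def batch_iterator(self):
        # Assumed helper (not in the original gist): yields
        # (main_ids, context_ids, counts) lists of at most batch_size elements,
        # taken from the nonzero entries of the co-occurrence matrix.
        main_ids, context_ids, counts = [], [], []
        rows, cols = np.nonzero(self.cooccurence_matrix)
        for i, j in zip(rows, cols):
            main_ids.append(i)
            context_ids.append(j)
            counts.append(self.cooccurence_matrix[i, j])
            if len(main_ids) == self.batch_size:
                yield (main_ids, context_ids, counts)
                main_ids, context_ids, counts = [], [], []
        if main_ids:
            yield (main_ids, context_ids, counts)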


def context_words(sentence, CONTEXT_SIZE):
    sentence = sentence.split()
    ind = 0
    for word in sentence:
        left_index = ind - CONTEXT_SIZE
        right_index = ind + 1 + CONTEXT_SIZE
        ind += 1
        yield window_words(left_index, right_index, sentence)


def window_words(left_index, right_index, sentence):
    ind = (left_index + right_index - 1) // 2
    maxlen = len(sentence)
    words_left = sentence[max(left_index, 0): ind]
    words_right = sentence[ind + 1: min(right_index, maxlen)]
    return words_left, words_right, sentence[ind]
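
# For example, context_words("a b c d e", 2) yields, for the center word "c",
# (['a', 'b'], ['d', 'e'], 'c'); at the ends of the sentence the windows shrink,
# so the first word "a" yields ([], ['b', 'c'], 'a').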
corpus = ["THis is me writing a corpus.","THis is me writing a corpus.","THis is me writing a corpus."]
glove = Glove(corpus)
glove.build_coo_matrix()
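# Assumed continuation of the example above: build mini-batches from the
# co-occurrence matrix and run the training loop.
glove.train_model()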