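# A small GloVe implementation in TensorFlow (after Pennington et al., 2014),
# with the graph construction adapted from GradySimon/tensorflow-glove.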
import numpy as np
import tensorflow as tf
from random import shuffle
class Glove:
    """Trains GloVe word embeddings over a small corpus using TensorFlow."""
    def __init__(self, corpus, context_size=2, learning_rate=0.05, batch_size=100, num_epochs=100):
        super(Glove, self).__init__()
        self.corpus = corpus
        self.CONTEXT_SIZE = context_size
        self.word_map = {}
        self.count = 0
        self.learning_rate = learning_rate
        self.summaries = []
        self.batch_size = batch_size
        self.num_epochs = num_epochs
    def word_mapping(self, corpus):
        # assign every distinct word an integer id.
        for sentence in corpus:
            for word in sentence.split():
                if word not in self.word_map:
                    self.word_map[word] = self.count
                    self.count += 1
        print(self.word_map)
    def get_index(self, word):
        return self.word_map[word]
    def build_coo_matrix(self):
        self.word_mapping(self.corpus)
        self.cooccurence_matrix = np.zeros([self.count, self.count])
        for sentence in self.corpus:
            for l_window, r_window, center_word in context_words(sentence, self.CONTEXT_SIZE):
                # harmonic weighting: a context word at distance d from the
                # center contributes 1/d to the co-occurrence count.
                for i, word in enumerate(l_window[::-1]):
                    self.cooccurence_matrix[self.word_map[center_word], self.word_map[word]] += 1.0 / (1 + i)
                for i, word in enumerate(r_window):
                    self.cooccurence_matrix[self.word_map[center_word], self.word_map[word]] += 1.0 / (1 + i)
        self.tf_graph()
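    # GloVe's weighting function f(x) = min(1, (x / x_max)^alpha), here with
    # x_max = 100 and alpha = 0.75 as in the paper; it damps the influence of
    # very frequent co-occurrence counts. Kept as a scalar reference version;
    # the graph below applies the same formula with tensor ops.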
    def f(self, cooc_count):
        return min(1.0, pow(cooc_count / 100.0, 0.75))
    def tf_graph(self):
        # tensorflow api functions referenced from
        # https://github.com/GradySimon/tensorflow-glove/blob/master/tf_glove.py
        # really helpful.
        self.graph = tf.Graph()
        with self.graph.as_default():
            # feed_dict vars
            # these hold the ids of words in word_map.
            # placeholders are made for feeding data in graph iterations;
            # shape=[None] lets the batch size vary between runs.
            self.current_words_main = tf.placeholder(tf.int32, shape=[None])
            self.current_words_context = tf.placeholder(tf.int32, shape=[None])
            # cooccurence matrix counts for each word in main, and its corresponding context.
            self.cooc_counts = tf.placeholder(tf.float32, shape=[None])
            # variables (20-dimensional embeddings, initialised uniformly in [-1, 1]).
            tf_train_w_main = tf.Variable(tf.random_uniform(
                [len(self.word_map), 20], -1.0, 1.0))
            tf_train_w_context = tf.Variable(tf.random_uniform(
                [len(self.word_map), 20], -1.0, 1.0))
            tf_train_b_main = tf.Variable(tf.random_uniform([len(self.word_map)], -1.0, 1.0))
            tf_train_b_context = tf.Variable(tf.random_uniform([len(self.word_map)], -1.0, 1.0))
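            # Note: GloVe learns separate "main" and "context" embedding tables;
            # the paper reports using their sum as the final word vectors.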
            # extract embeddings for the words in the current batch.
            # weights.
            main_embeddings_w = tf.nn.embedding_lookup([tf_train_w_main], self.current_words_main)
            context_embeddings_w = tf.nn.embedding_lookup([tf_train_w_context], self.current_words_context)
            # biases
            main_embeddings_b = tf.nn.embedding_lookup([tf_train_b_main], self.current_words_main)
            context_embeddings_b = tf.nn.embedding_lookup([tf_train_b_context], self.current_words_context)
            ## calculations.
            # GloVe loss per pair: f(X_ij) * (w_i . w_j + b_i + b_j - log X_ij)^2.
            log_counts = tf.log(self.cooc_counts)
            # tensor version of the weighting function f above.
            weights = tf.minimum(1.0, tf.pow(self.cooc_counts / 100.0, 0.75))
            prod_sum = tf.reduce_sum(tf.multiply(main_embeddings_w, context_embeddings_w), axis=1)
            single_losses = weights * tf.square(tf.add_n(
                [prod_sum, main_embeddings_b, context_embeddings_b, tf.negative(log_counts)]))
            total_loss_sum = tf.reduce_sum(single_losses)
            # Using adaptive gradient optimizer. Sources of information
            # - http://sebastianruder.com/optimizing-gradient-descent/
            self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                total_loss_sum)
            #self.summaries.append(tf.merge_all_summaries())
    def train_model(self):
        # mini-batches of (main_id, context_id, count) triples, taken from
        # the nonzero entries of the co-occurrence matrix.
        nonzero = np.transpose(np.nonzero(self.cooccurence_matrix))
        triples = [(i, j, self.cooccurence_matrix[i, j]) for i, j in nonzero]
        # start session.
        with tf.Session(graph=self.graph) as session:
            tf.global_variables_initializer().run()
            for epoch in range(self.num_epochs):
                shuffle(triples)
                batches = [triples[k:k + self.batch_size]
                           for k in range(0, len(triples), self.batch_size)]
                for batch in batches:
                    main_ids, context_ids, counts = zip(*batch)
                    feed_dict = {
                        self.current_words_main: main_ids,
                        self.current_words_context: context_ids,
                        self.cooc_counts: counts,
                    }
                    session.run([self.optimizer], feed_dict=feed_dict)
def context_words(sentence, CONTEXT_SIZE):
    sentence = sentence.split()
    for ind in range(len(sentence)):
        left_index = ind - CONTEXT_SIZE
        right_index = ind + 1 + CONTEXT_SIZE
        yield window_words(left_index, right_index, sentence)
def window_words(left_index, right_index, sentence):
    # recover the center index; the window spans [left_index, right_index).
    ind = (left_index + right_index - 1) // 2
    maxlen = len(sentence)
    words_left = sentence[max(left_index, 0): ind]
    words_right = sentence[ind + 1: min(right_index, maxlen)]
    return words_left, words_right, sentence[ind]
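# Example (for illustration): for the sentence "a b c" with CONTEXT_SIZE=1,
# context_words yields ([], ['b'], 'a'), (['a'], ['c'], 'b'), (['b'], [], 'c').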
corpus = ["This is me writing a corpus.", "This is me writing a corpus.", "This is me writing a corpus."]
glove = Glove(corpus)
glove.build_coo_matrix()
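# A minimal end-to-end run (a sketch, assuming the fixes above): building the
# co-occurrence matrix also builds the graph, so training can follow directly.
glove.train_model()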