Skip to content

Instantly share code, notes, and snippets.

@JonathanRaiman
Last active August 29, 2015 14:07
Show Gist options
  • Save JonathanRaiman/0d45d1ab214119cf45eb to your computer and use it in GitHub Desktop.
Save JonathanRaiman/0d45d1ab214119cf45eb to your computer and use it in GitHub Desktop.
# coding: utf-8
#
# @author Jonathan Raiman
# @date 9th October 2014
#
# Messing around with Stanford's GloVe word vectors.
# Download them [here](http://www-nlp.stanford.edu/projects/glove/)
import gzip, numpy as np, io
class GloveModel:
    """Unit-normalized GloVe word vectors plus simple sentence utilities.

    The model is read from a gzip-compressed text file in which every line
    holds a word followed by its space-separated vector components.

    Attributes set by ``load_model``:
        index2word: list mapping row index -> word.
        word2index: dict mapping word -> row index.
        model_matrix: 2-D float32 array, one L2-normalized row per word.
    """

    def __init__(self, path):
        """Remember *path* and immediately load the vectors from it."""
        self.path = path
        self.load_model()

    def load_model(self):
        """Parse the gzip file at ``self.path`` into the lookup tables.

        Raises:
            ValueError: if the file contains no vectors.
        """
        rows = []
        self.index2word = []
        self.word2index = {}
        # Pin the encoding so that parsing does not depend on the locale.
        with gzip.open(self.path, "rt", encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\r\n")
                if not line.strip():
                    # Skip blank lines instead of producing an empty row.
                    continue
                word, _, values = line.partition(" ")
                vec = np.array(
                    [float(v) for v in values.split()], dtype=np.float32
                )
                norm = np.linalg.norm(vec)
                if norm > 0:
                    # An all-zero vector is kept as-is to avoid NaN rows.
                    vec = vec / norm
                # Index by row count (not file line number) so skipped
                # lines cannot misalign words and matrix rows.
                self.word2index[word] = len(self.index2word)
                self.index2word.append(word)
                rows.append(vec)
        if not rows:
            raise ValueError("no word vectors found in %r" % (self.path,))
        self.model_matrix = np.vstack(rows)

    def sentence_to_vec(self, sentence):
        """Return the normalized mean vector of the known words in *sentence*.

        Words are split on whitespace; words missing from the vocabulary are
        ignored. If no word is known, a zero vector is returned.
        """
        indices = [
            self.word2index[word]
            for word in sentence.split()
            if word in self.word2index
        ]
        if not indices:
            return np.zeros(
                self.model_matrix.shape[1], dtype=self.model_matrix.dtype
            )
        mean = self.model_matrix[indices].mean(axis=0)
        norm = np.linalg.norm(mean)
        if norm > 0:
            # Opposite vectors can cancel out; leave the zero mean untouched.
            mean = mean / norm
        return mean

    def sentence_similarity(self, sentence_a, sentence_b):
        """Return the dot product of the two sentence vectors.

        Because sentence vectors are unit length (or zero), this is the
        cosine similarity, and 0.0 when either sentence has no known word.
        """
        return float(
            np.dot(
                self.sentence_to_vec(sentence_a),
                self.sentence_to_vec(sentence_b),
            )
        )

    def most_similar_words(self, sentence, topn=10):
        """Return the *topn* vocabulary words closest to *sentence*.

        Returns:
            A list of ``(word, row_index, score)`` tuples ordered by
            decreasing score, containing at most *topn* entries.
        """
        vector = self.sentence_to_vec(sentence)
        dists = np.dot(self.model_matrix, vector)
        best = np.argsort(dists)[::-1][:max(topn, 0)]
        return [(self.index2word[k], int(k), float(dists[k])) for k in best]
# Demo: runs only when the file is executed as a script, so importing the
# module does not trigger a load from this machine-specific path.
if __name__ == "__main__":
    model = GloveModel("/Users/jonathanraiman/Desktop/glove_words.gz")
    # Vocabulary words nearest to the averaged sentence vector.
    print(model.most_similar_words("take me to a chinese restaurant in 15 minutes"))
    # Similarity between two single-token "sentences".
    print(model.sentence_similarity("15", "100"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment