Sentence embedding method from [Arora et al., ICLR 2017], "A Simple but Tough-to-Beat Baseline for Sentence Embeddings" - https://openreview.net/pdf?id=SyK00v5xx
from __future__ import division
import gensim
import itertools
import numpy as np
from collections import Counter
from sklearn.decomposition import PCA
def gensim_load_vec(path):
    # gensim >= 1.0 exposes this loader on KeyedVectors (older versions used
    # gensim.models.Word2Vec.load_word2vec_format)
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
    shape = w2v_model.syn0.shape  # (vocabulary size, embedding dimension)
    return w2v_model, shape

def map_word_frequency(document):
    return Counter(itertools.chain(*document))

def sentence2vec(tokenised_sentence_list, embedding_size, word_emb_model, a=1e-3):
    """
    Compute the SIF-weighted average of the word vectors in each sentence, then
    remove the projection of the averages onto their first principal component.
    Adapted from https://github.com/peter3125/sentence2vec; compatible with Python 2.7.
    """
    word_counts = map_word_frequency(tokenised_sentence_list)
    sentence_set = []
    for sentence in tokenised_sentence_list:
        vs = np.zeros(embedding_size)
        sentence_length = len(sentence)
        for word in sentence:
            a_value = a / (a + word_counts[word])  # smooth inverse frequency, SIF
            try:
                vs = np.add(vs, np.multiply(a_value, word_emb_model[word]))  # vs += sif * word_vector
            except KeyError:
                pass  # skip out-of-vocabulary words
        vs = np.divide(vs, sentence_length)  # weighted average
        sentence_set.append(vs)
    # common component removal: compute the first principal component of the sentence set
    pca = PCA(n_components=1)
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # first principal component, shape (embedding_size,)

    # resulting sentence vectors: vs = vs - u * (u . vs), i.e. vs - u u^T vs
    sentence_vecs = []
    for vs in sentence_set:
        proj = np.multiply(u, np.dot(u, vs))  # projection of vs onto u
        sentence_vecs.append(np.subtract(vs, proj))
    return sentence_vecs

# Example usage: embed two tweets with 100d GloVe Twitter vectors in word2vec format
w2v_model, glove_shape = gensim_load_vec('../glove.twitter.word2vec.27B.100d.txt')
tweets = ['It was all a dream', 'I used to read Word Up magazine']
tweets = [tweet.split() for tweet in tweets]
embedding_size = glove_shape[1]
sent_emb = sentence2vec(tweets, embedding_size, w2v_model)
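
# Quick usage check (a sketch, not part of the original gist): sentence2vec
# returns one (embedding_size,)-shaped numpy array per sentence; the
# cosine_sim helper below is hypothetical, added only for illustration.
def cosine_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

print('%d sentence vectors of shape %s' % (len(sent_emb), sent_emb[0].shape))
print('cosine similarity between the two tweets: %f' % cosine_sim(sent_emb[0], sent_emb[1]))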