# bwang482/sentence_embedding.py

## sentence_embedding.py
from __future__ import division
import gensim
import itertools
import numpy as np
from collections import Counter
from sklearn.decomposition import PCA


def gensim_load_vec(path):
    """Load word vectors stored in word2vec text format.

    Args:
        path: location of the embedding file (text, i.e. ``binary=False``).

    Returns:
        A ``(model, shape)`` tuple: the loaded gensim object and the shape of
        its embedding matrix. The caller in this file reads ``shape[1]`` as the
        embedding size.
    """
    # Newer gensim releases expose ``load_word2vec_format`` on ``KeyedVectors``;
    # fall back to ``Word2Vec`` for releases that do not have that class.
    loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
    w2v_model = loader.load_word2vec_format(path, binary=False)

    # The shape used to be read from an undefined name (``gensim_emb``), which
    # raised NameError on every call. Prefer ``vectors`` when the loaded object
    # has it, otherwise use the older ``syn0`` attribute.
    embedding_matrix = getattr(w2v_model, 'vectors', None)
    if embedding_matrix is None:
        embedding_matrix = w2v_model.syn0
    return w2v_model, embedding_matrix.shape

def map_word_frequency(document):
    """Count how often each token occurs across all sentences of ``document``.

    ``document`` is an iterable of token iterables; the result is a ``Counter``
    keyed by token.
    """
    every_token = (token for sentence in document for token in sentence)
    return Counter(every_token)

def sentence2vec(tokenised_sentence_list, embedding_size, word_emb_model, a=1e-3):
    """
    Compute smooth-inverse-frequency (SIF) sentence embeddings.

    Each sentence vector is the SIF-weighted sum of its word vectors divided by
    the sentence length (unknown words still count towards the length); the
    projection of every sentence vector onto the first principal component of
    the whole set is then removed.
    Borrowed from https://github.com/peter3125/sentence2vec; now compatible with python 2.7

    Args:
        tokenised_sentence_list: sequence of sentences, each a sequence of
            tokens. It is traversed twice, so it must not be a one-shot iterator.
        embedding_size: dimensionality of the word vectors.
        word_emb_model: object returning a token's vector via ``model[token]``
            and raising ``KeyError`` for unknown tokens (a plain dict qualifies).
        a: SIF smoothing constant; a token's weight is ``a / (a + count)``,
            where ``count`` is its frequency in ``tokenised_sentence_list``.

    Returns:
        A list holding one numpy vector of length ``embedding_size`` per
        sentence. With fewer than two sentences, or when the sentence vectors
        do not vary, the weighted averages are returned unchanged because no
        principal direction exists.
    """
    word_counts = Counter(itertools.chain.from_iterable(tokenised_sentence_list))

    sentence_set = []
    for sentence in tokenised_sentence_list:
        vs = np.zeros(embedding_size)
        for word in sentence:
            try:
                word_vector = word_emb_model[word]
            except KeyError:
                # Out-of-vocabulary token: it contributes nothing to the sum.
                continue
            a_value = a / (a + word_counts[word])  # smooth inverse frequency, SIF
            vs += a_value * np.asarray(word_vector)
        sentence_length = len(sentence)
        if sentence_length:
            # An empty sentence keeps the zero vector instead of becoming NaN.
            vs /= sentence_length  # weighted average
        sentence_set.append(vs)

    sentence_matrix = np.array(sentence_set)
    if len(sentence_set) < 2 or sentence_matrix.size == 0:
        return sentence_set

    # First principal component = first right-singular vector of the centred
    # data. This is the direction sklearn's PCA reports as ``components_[0]``
    # (up to sign, which cancels in the projection below) and, unlike
    # ``PCA(n_components=embedding_size)``, it works with fewer sentences than
    # embedding dimensions.
    centered = sentence_matrix - sentence_matrix.mean(axis=0)
    singular_values, components = np.linalg.svd(centered, full_matrices=False)[1:]

    # If the sentence vectors are (numerically) identical, the leading
    # direction is only rounding noise, so there is nothing meaningful to remove.
    tolerance = (np.finfo(float).eps * max(sentence_matrix.shape)
                 * np.abs(sentence_matrix).max())
    if singular_values[0] <= tolerance:
        return sentence_set

    u = components[0]
    # resulting sentence vectors, vs = vs - u x uT x vs
    return [vs - u * np.dot(u, vs) for vs in sentence_set]


# Example run: embed two sample tweets. The vectors are loaded through
# gensim_load_vec (word2vec text format); the file name suggests 100-d GloVe
# Twitter vectors.
w2v_model, glove_shape = gensim_load_vec('../glove.twitter.word2vec.27B.100d.txt')
tweets = [
    tweet.split()
    for tweet in ['It was all a dream', 'I used to read Word Up magazine']
]
embedding_size = glove_shape[1]  # second entry of the embedding-matrix shape
sent_emb = sentence2vec(tweets, embedding_size, w2v_model)
	from __future__ import division
	import gensim
	import itertools
	import numpy as np
	from collections import Counter
	from sklearn.decomposition import PCA


	def gensim_load_vec(path):
		"""Load word vectors stored in word2vec text format.

		Args:
			path: location of the embedding file (text, i.e. ``binary=False``).

		Returns:
			A ``(model, shape)`` tuple: the loaded gensim object and the shape
			of its embedding matrix.
		"""
		# Newer gensim releases expose ``load_word2vec_format`` on
		# ``KeyedVectors``; fall back to ``Word2Vec`` when that class is absent.
		loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
		w2v_model = loader.load_word2vec_format(path, binary=False)

		# The shape used to be read from an undefined name (``gensim_emb``),
		# which raised NameError. Prefer ``vectors`` when the loaded object has
		# it, otherwise use the older ``syn0`` attribute.
		embedding_matrix = getattr(w2v_model, 'vectors', None)
		if embedding_matrix is None:
			embedding_matrix = w2v_model.syn0
		return w2v_model, embedding_matrix.shape

	def map_word_frequency(document):
		"""Count how often each token occurs across all sentences of ``document``.

		``document`` is an iterable of token iterables; the result is a
		``Counter`` keyed by token.
		"""
		# The function body had lost its indentation, which is a syntax error.
		return Counter(itertools.chain.from_iterable(document))

	def sentence2vec(tokenised_sentence_list, embedding_size, word_emb_model, a=1e-3):
		"""
		Compute smooth-inverse-frequency (SIF) sentence embeddings.

		Each sentence vector is the SIF-weighted sum of its word vectors divided
		by the sentence length (unknown words still count towards the length);
		the projection of every sentence vector onto the first principal
		component of the whole set is then removed.
		Borrowed from https://github.com/peter3125/sentence2vec; now compatible with python 2.7

		Args:
			tokenised_sentence_list: sequence of sentences, each a sequence of
				tokens. It is traversed twice, so it must not be a one-shot
				iterator.
			embedding_size: dimensionality of the word vectors.
			word_emb_model: object returning a token's vector via
				``model[token]`` and raising ``KeyError`` for unknown tokens
				(a plain dict qualifies).
			a: SIF smoothing constant; a token's weight is ``a / (a + count)``,
				where ``count`` is its frequency in ``tokenised_sentence_list``.

		Returns:
			A list holding one numpy vector of length ``embedding_size`` per
			sentence. With fewer than two sentences, or when the sentence
			vectors do not vary, the weighted averages are returned unchanged
			because no principal direction exists.
		"""
		word_counts = Counter(itertools.chain.from_iterable(tokenised_sentence_list))

		sentence_set = []
		for sentence in tokenised_sentence_list:
			vs = np.zeros(embedding_size)
			for word in sentence:
				try:
					word_vector = word_emb_model[word]
				except KeyError:
					# Out-of-vocabulary token: it contributes nothing to the sum.
					continue
				a_value = a / (a + word_counts[word])  # smooth inverse frequency, SIF
				vs += a_value * np.asarray(word_vector)
			sentence_length = len(sentence)
			if sentence_length:
				# An empty sentence keeps the zero vector instead of becoming NaN.
				vs /= sentence_length  # weighted average
			sentence_set.append(vs)

		sentence_matrix = np.array(sentence_set)
		if len(sentence_set) < 2 or sentence_matrix.size == 0:
			return sentence_set

		# First principal component = first right-singular vector of the centred
		# data. This is the direction sklearn's PCA reports as ``components_[0]``
		# (up to sign, which cancels in the projection below) and it works with
		# fewer sentences than embedding dimensions.
		centered = sentence_matrix - sentence_matrix.mean(axis=0)
		singular_values, components = np.linalg.svd(centered, full_matrices=False)[1:]

		# If the sentence vectors are (numerically) identical, the leading
		# direction is only rounding noise, so there is nothing to remove.
		tolerance = (np.finfo(float).eps * max(sentence_matrix.shape)
			* np.abs(sentence_matrix).max())
		if singular_values[0] <= tolerance:
			return sentence_set

		u = components[0]
		# resulting sentence vectors, vs = vs - u x uT x vs
		return [vs - u * np.dot(u, vs) for vs in sentence_set]


	# Example run: embed two sample tweets. The vectors are loaded through
	# gensim_load_vec (word2vec text format); the file name suggests 100-d
	# GloVe Twitter vectors.
	w2v_model, glove_shape = gensim_load_vec('../glove.twitter.word2vec.27B.100d.txt')
	tweets = [
		tweet.split()
		for tweet in ['It was all a dream', 'I used to read Word Up magazine']
	]
	embedding_size = glove_shape[1]  # second entry of the embedding-matrix shape
	sent_emb = sentence2vec(tweets, embedding_size, w2v_model)