Skip to content

Instantly share code, notes, and snippets.

@himangSharatun
Last active May 31, 2018 03:48
Show Gist options
  • Save himangSharatun/2332c94735e2a328dabf0cfad078c75c to your computer and use it in GitHub Desktop.
Save himangSharatun/2332c94735e2a328dabf0cfad078c75c to your computer and use it in GitHub Desktop.
from gensim.models import Word2Vec
import re
def tokenize(sentence):
    """Split *sentence* into lower-case alphabetic tokens.

    Dots are deleted (not turned into separators) before matching, so a
    dotted abbreviation collapses into a single token ("U.S.A." -> "usa").
    Every other non-letter character separates tokens, and runs of fewer
    than two letters are dropped.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of tokens in order of appearance, each made of at least two
        ASCII letters.
    """
    without_dots = sentence.lower().replace(".", "")
    # The text is already lower-cased, so matching a-z alone is sufficient.
    return re.findall(r"[a-z]{2,}", without_dots)
def w2v_average(sentence, w2v_model):
    """Return the mean word vector of the in-vocabulary words of *sentence*.

    The sentence is split with ``tokenize`` (which lower-cases it); tokens
    the model does not know are ignored.

    Args:
        sentence: The text to embed.
        w2v_model: A model whose ``wv`` attribute supports ``word in wv``
            and ``wv[word]`` (e.g. a loaded gensim ``Word2Vec``).

    Returns:
        The element-wise average of the vectors of all known tokens, or
        ``None`` when no token of the sentence is in the vocabulary.
    """
    keyed_vectors = w2v_model.wv
    # Membership is tested on the keyed-vectors object itself rather than
    # on its ``vocab`` attribute: ``vocab`` was removed in gensim 4, while
    # ``in`` works on both gensim 3.x and 4.x.
    known_vectors = [
        keyed_vectors[word]
        for word in tokenize(sentence)
        if word in keyed_vectors
    ]
    if not known_vectors:
        # Nothing to average; callers must handle the missing embedding.
        return None
    return sum(known_vectors) / len(known_vectors)
# Demo: load the model saved as 'word2vec.bin' and print the averaged
# embedding of a sample sentence.
loaded_w2v = Word2Vec.load('word2vec.bin')
sample_sentence = 'VirginAmerica SFO PDX schedule is still MIA '
print(w2v_average(sample_sentence, loaded_w2v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment