Skip to content

Instantly share code, notes, and snippets.

@himangSharatun
Created June 3, 2018 09:59
Show Gist options
  • Save himangSharatun/0539695577ba8ee31c6db8bfe04a6af0 to your computer and use it in GitHub Desktop.
Save himangSharatun/0539695577ba8ee31c6db8bfe04a6af0 to your computer and use it in GitHub Desktop.
from gensim.models import Word2Vec
import re
import pickle
import numpy as np
def tokenize(sentence):
    """Split ``sentence`` into lowercase alphabetic tokens.

    The text is lowercased, every "." is deleted, and each run of two or
    more ASCII letters is returned as one token. Digits, punctuation and
    single letters are discarded.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance (possibly empty).
    """
    # Dots are deleted rather than replaced by a space, so an abbreviation
    # such as "U.S.A" collapses into the single token "usa".
    without_dots = sentence.lower().replace(".", "")
    # The text is already lowercase, so only a-z needs to be matched.
    return re.findall(r"[a-z]{2,}", without_dots)
def w2v_tfidf(sentence, w2v_model, tfidf_model):
    """Return a TF-IDF-weighted average of the word vectors in ``sentence``.

    Each token produced by ``tokenize`` that is known to both models
    contributes its word vector multiplied by the token's TF-IDF weight in
    this sentence. The weighted vectors are summed and divided by the number
    of contributing tokens (not by the sum of the weights). A token that
    occurs several times contributes once per occurrence.

    Args:
        sentence: The raw text to embed.
        w2v_model: Model whose ``wv`` attribute supports ``word in wv`` and
            ``wv[word]`` (e.g. a loaded gensim ``Word2Vec``).
        tfidf_model: Fitted vectorizer exposing ``transform`` and
            ``vocabulary_`` (presumably a scikit-learn ``TfidfVectorizer``).

    Returns:
        The averaged weighted vector, or ``None`` when no token of the
        sentence is present in both vocabularies.
    """
    # Use the vectorizer supplied by the caller instead of relying on a
    # module-level global, so the function works with any fitted model.
    tfidf_row = tfidf_model.transform([sentence]).toarray()[0]
    tfidf_vocab = tfidf_model.vocabulary_
    word_vectors = w2v_model.wv

    weighted_sum = 0
    used = 0
    # tokenize() lowercases on its own, so the sentence is passed as-is.
    for word in tokenize(sentence):
        # Membership is tested on the keyed vectors directly because the
        # ``wv.vocab`` attribute is not available in gensim 4 and later.
        if word in word_vectors and word in tfidf_vocab:
            weighted_sum += word_vectors[word] * tfidf_row[tfidf_vocab[word]]
            used += 1

    if not used:
        return None
    return weighted_sum / used
# Load the trained word2vec model and the fitted TF-IDF vectorizer from disk.
loaded_w2v = Word2Vec.load('word2vec.bin')
# NOTE: unpickling can execute arbitrary code; only load "tfidf.pk" when it
# comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("tfidf.pk", "rb") as tfidf_file:
    loaded_tfidf = pickle.load(tfidf_file)
# Demo: print the TF-IDF-weighted sentence embedding for a sample tweet.
print(w2v_tfidf('VirginAmerica SFO PDX schedule is still MIA ', loaded_w2v, loaded_tfidf))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment