Skip to content

Instantly share code, notes, and snippets.

@ravi07bec
Created October 21, 2020 06:21
Show Gist options
  • Save ravi07bec/2a60f3057f26c671dcbf58d04f00be38 to your computer and use it in GitHub Desktop.
from datetime import datetime

import pandas as pd
from gensim.models import Word2Vec
from fse import IndexedList
# --- Data inputs ---
# Movie metadata plus a 5% random sample of the ratings file.
movies = pd.read_csv('ml-latest/movies.csv')
# Both id columns are cast to strings so they can later be treated as
# opaque word tokens by Word2Vec.
links = (
    pd.read_csv('ml-latest/ratings.csv')
    .sample(frac=0.05)
    .astype({'movieId': 'str', 'userId': 'str'})
)
def combine(kw):
    """Flatten a Series of comma-separated ID strings into one string.

    Each element of *kw* may itself contain several comma-separated
    tokens; all tokens are split out and re-joined into a single flat
    ``'a,b,c'`` string.

    Parameters
    ----------
    kw : pandas.Series of str

    Returns
    -------
    str
        All tokens joined with commas.  The original implementation
        returned a raw list whenever there were two or fewer tokens,
        which broke the downstream ``.str.count(',')`` call; a string
        is now returned unconditionally.  It also rebuilt the list with
        quadratic ``base = base + string`` concatenation, replaced here
        by ``list.extend``.
    """
    tokens = []
    for item in kw.tolist():
        tokens.extend(item.split(","))
    return ",".join(tokens)
# --- Build per-movie "sentences" of user ids and train the models ---
s_test = links
s_test = s_test[['userId', 'movieId']].drop_duplicates()
# For every movie, collect all users who rated it into one
# comma-separated string (transform broadcasts it back onto each row).
s_test['keywords_comb'] = s_test.groupby(['movieId'])['userId'].transform(combine)
print("Start of Word2Vec", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# One row per movie; drop movies rated by 5 or fewer users so every
# "sentence" has enough context words for Word2Vec.
data = s_test[['movieId', 'keywords_comb']].drop_duplicates()
data['word_count'] = data.keywords_comb.str.count(',') + 1
data = data[data['word_count'] > 5]

# Tokenise once; `[x for x in x.split(',')]` was an identity
# comprehension around split(), so the plain split() is equivalent.
data['tokenise'] = data['keywords_comb'].astype('str').apply(lambda x: x.split(','))
a1 = data['tokenise'].tolist()
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4+ renamed it
# to `vector_size` -- confirm the installed version.
model_items = Word2Vec(a1, size=100)

print("Start of Sentence To vec", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# The original recomputed the identical astype/split pipeline here;
# the token lists are the same, so reuse them.
sent = a1
s = IndexedList(sent)
from fse.models import uSIF
model = uSIF(model_items, lang_freq="en")
model.train(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment