Skip to content

Instantly share code, notes, and snippets.

View prateekjoshi565's full-sized avatar
🎯
Focusing

Prateek Joshi prateekjoshi565

🎯
Focusing
View GitHub Profile
for i in range(5):
k = xval.sample(1).index[0]
print("Movie: ", movies_new['movie_name'][k], "\nPredicted genre: ", infer_tags(xval[k])), print("Actual genre: ",movies_new['genre_new'][k], "\n")
@prateekjoshi565
prateekjoshi565 / w2v_rcm_libs.py
Last active July 28, 2019 18:07
word2vec for recommendation
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline
import warnings;
warnings.filterwarnings('ignore')
df = pd.read_excel('Online Retail.xlsx')
df.head()
# remove missing values
df.dropna(inplace=True)
# again check missing values
df.isnull().sum()
# shuffle customer ID's
random.shuffle(customers)
# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]
# split data into train and validation set
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]
# list to capture purchase history of the customers
purchases_train = []
# populate the list with the product codes
for i in tqdm(customers_train):
temp = train_df[train_df["CustomerID"] == i]["StockCode"].tolist()
purchases_train.append(temp)
# list to capture purchase history of the customers
purchases_val = []
# populate the list with the product codes
for i in tqdm(validation_df['CustomerID'].unique()):
temp = validation_df[validation_df["CustomerID"] == i]["StockCode"].tolist()
purchases_val.append(temp)
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
negative = 10, # for negative sampling
alpha=0.03, min_alpha=0.0007,
seed = 14)
model.build_vocab(purchases_train, progress_per=200)
model.train(purchases_train, total_examples = model.corpus_count,
epochs=10, report_delay=1)
# extract all vectors
X = model[model.wv.vocab]
X.shape
import umap
cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
n_components=2, random_state=42).fit_transform(X)
plt.figure(figsize=(10,9))
plt.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3, cmap='Spectral')