# Prateek Joshi prateekjoshi565

## genre_infer_newdata.py
# Five times over: draw one random row label from xval, then print the stored
# movie name, the result of infer_tags() for that entry, and the stored genre.
for _ in range(5):
    k = xval.sample(1).index[0]
    print("Movie: ", movies_new['movie_name'][k],
          "\nPredicted genre: ", infer_tags(xval[k]))
    print("Actual genre: ", movies_new['genre_new'][k], "\n")

## w2v_rcm_libs.py
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

## w2v_rcm_read_data.py
# Load the 'Online Retail.xlsx' workbook into a DataFrame and preview its
# first five rows.
df = pd.read_excel(io='Online Retail.xlsx')
df.head(n=5)

## w2v_rcm_missing_data.py
# Drop, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

# Count the missing values left in each column.
df.isna().sum()

## w2v_rcm_seq.py
# Put the customer IDs into a random order (shuffles the sequence in place).
random.shuffle(customers)

# The first 90% of the shuffled IDs become the training customers.
customers_train = list(customers[:round(0.9 * len(customers))])

# Rows of training customers go to train_df; every other row goes to
# validation_df.
train_df = df.loc[df['CustomerID'].isin(customers_train)]
validation_df = df.loc[~df['CustomerID'].isin(customers_train)]

## w2v_rcm_train.py
# Build purchases_train: one list of StockCode values per training customer,
# in the same order as customers_train.
#
# The rows are grouped once up front instead of re-filtering the whole of
# train_df for every customer, which scanned the frame len(customers_train)
# times. Row order inside each customer's list is preserved by groupby.
train_codes_by_customer = (
    train_df.groupby("CustomerID", sort=False)["StockCode"].agg(list).to_dict()
)

# A customer with no rows in train_df gets an empty history, which is what
# filtering on a non-matching ID produced before.
purchases_train = [
    train_codes_by_customer.get(customer_id, [])
    for customer_id in tqdm(customers_train)
]

## w2v_rcm_val.py
# Build purchases_val: one list of StockCode values per validation customer,
# ordered by each customer's first appearance in validation_df.
#
# The rows are grouped once up front instead of re-filtering the whole of
# validation_df for every customer. Row order inside each customer's list is
# preserved by groupby.
val_codes_by_customer = (
    validation_df.groupby("CustomerID", sort=False)["StockCode"].agg(list).to_dict()
)

# IDs that match no group (e.g. a NaN ID) get an empty history, which is what
# the equality filter produced before.
purchases_val = [
    val_codes_by_customer.get(customer_id, [])
    for customer_id in tqdm(validation_df['CustomerID'].unique())
]

## w2v_rcm_model.py
# Set up the Word2Vec model. Per the gensim documentation, sg=1 selects
# skip-gram and hs=0 with negative=10 selects negative sampling.
# NOTE(review): gensim documents that a fixed seed is only fully reproducible
# with a single worker thread - confirm whether that matters here.
model = Word2Vec(
    window=10,
    sg=1,
    hs=0,
    negative=10,  # for negative sampling
    alpha=0.03,
    min_alpha=0.0007,
    seed=14,
)

# Build the vocabulary from the training purchase histories.
model.build_vocab(purchases_train, progress_per=200)

# Train for 10 epochs over the same histories.
model.train(
    purchases_train,
    total_examples=model.corpus_count,
    epochs=10,
    report_delay=1,
)

## w2v_rcm_extract_vecs.py
# Stack the learned vector of every vocabulary entry into one 2-D array X.
#
# Indexing the model directly (`model[...]`) is deprecated, and both it and
# `model.wv.vocab` were removed in gensim 4.0. Go through `model.wv` and fall
# back to `index_to_key` when `vocab` is no longer available.
try:
    vocab_words = list(model.wv.vocab)  # gensim < 4.0
except AttributeError:
    vocab_words = list(model.wv.index_to_key)  # gensim >= 4.0
X = model.wv[vocab_words]

X.shape

## w2v_rcm_umap.py
import umap

# Reduce the vectors in X to two dimensions with UMAP.
cluster_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(X)

# Scatter-plot the first embedding column against the second.
# NOTE(review): no `c=` values are passed, so `cmap` presumably has no visible
# effect - confirm.
plt.figure(figsize=(10, 9))
plt.scatter(*cluster_embedding.T, s=3, cmap='Spectral')
	# Five times over: draw one random row label from xval, then print the
	# stored movie name, the result of infer_tags() for that entry, and the
	# stored genre. The loop body is nested under the `for` (it previously sat
	# at the same indentation, which is an IndentationError).
	for _ in range(5):
		k = xval.sample(1).index[0]
		print("Movie: ", movies_new['movie_name'][k],
			"\nPredicted genre: ", infer_tags(xval[k]))
		print("Actual genre: ", movies_new['genre_new'][k], "\n")
	import pandas as pd
	import numpy as np
	import random
	from tqdm import tqdm
	from gensim.models import Word2Vec
	import matplotlib.pyplot as plt
	%matplotlib inline

	import warnings;
	warnings.filterwarnings('ignore')
	# Drop, in place, every row that contains at least one missing value.
	df.dropna(axis=0, how='any', inplace=True)

	# Count the missing values left in each column.
	df.isna().sum()
	# Put the customer IDs into a random order (shuffles the sequence in place).
	random.shuffle(customers)

	# The first 90% of the shuffled IDs become the training customers.
	customers_train = list(customers[:round(0.9 * len(customers))])

	# Rows of training customers go to train_df; every other row goes to
	# validation_df.
	train_df = df.loc[df['CustomerID'].isin(customers_train)]
	validation_df = df.loc[~df['CustomerID'].isin(customers_train)]
	# Build purchases_train: one list of StockCode values per training
	# customer, in the same order as customers_train.
	#
	# The original loop body was not nested under its `for` (an
	# IndentationError) and re-filtered all of train_df for every customer.
	# Group the rows once instead; groupby keeps row order inside each group.
	train_codes_by_customer = (
		train_df.groupby("CustomerID", sort=False)["StockCode"].agg(list).to_dict()
	)

	# A customer with no rows in train_df gets an empty history, which is what
	# filtering on a non-matching ID produces.
	purchases_train = [
		train_codes_by_customer.get(customer_id, [])
		for customer_id in tqdm(customers_train)
	]
	# Build purchases_val: one list of StockCode values per validation
	# customer, ordered by each customer's first appearance in validation_df.
	#
	# The original loop body was not nested under its `for` (an
	# IndentationError) and re-filtered all of validation_df for every
	# customer. Group the rows once instead; groupby keeps row order inside
	# each group.
	val_codes_by_customer = (
		validation_df.groupby("CustomerID", sort=False)["StockCode"].agg(list).to_dict()
	)

	# IDs that match no group (e.g. a NaN ID) get an empty history, which is
	# what the equality filter produces.
	purchases_val = [
		val_codes_by_customer.get(customer_id, [])
		for customer_id in tqdm(validation_df['CustomerID'].unique())
	]
	# Set up the Word2Vec model. Per the gensim documentation, sg=1 selects
	# skip-gram and hs=0 with negative=10 selects negative sampling.
	model = Word2Vec(
		window=10,
		sg=1,
		hs=0,
		negative=10,  # for negative sampling
		alpha=0.03,
		min_alpha=0.0007,
		seed=14,
	)

	# Build the vocabulary from the training purchase histories.
	model.build_vocab(purchases_train, progress_per=200)

	# Train for 10 epochs over the same histories.
	model.train(
		purchases_train,
		total_examples=model.corpus_count,
		epochs=10,
		report_delay=1,
	)
	import umap

	# Reduce the vectors in X to two dimensions with UMAP.
	cluster_embedding = umap.UMAP(
		n_neighbors=30,
		min_dist=0.0,
		n_components=2,
		random_state=42,
	).fit_transform(X)

	# Scatter-plot the first embedding column against the second.
	# NOTE(review): no `c=` values are passed, so `cmap` presumably has no
	# visible effect - confirm.
	plt.figure(figsize=(10, 9))
	plt.scatter(*cluster_embedding.T, s=3, cmap='Spectral')