Tan Pengshi Alvin tanpengshi

## random_forest.py
"""
Implementation of an algorithm to train random forest classifiers.

Author: Tan Pengshi Alvin
Adapted from: https://towardsdatascience.com/master-machine-learning-random-forest-from-scratch-with-python-3efdd51b6d7a
"""
import numpy as np
import pandas as pd

from scipy import stats

## decision_tree.py
"""
Implementation of an algorithm to train decision tree classifiers.

Author: Tan Pengshi Alvin
Adapted from: https://towardsdatascience.com/decision-tree-from-scratch-in-python-46e99dfea775
"""
import numpy as np
import pandas as pd
import random

## sampleuser_recommend.csv

          
            
            user_id
            product_id
            predicted_interaction
            category_code
            brand
            price
            price_category

            
              22
              518044530
              1304392
              0.6230029231898839
              computers.notebook
              lenovo
              180.11
              1

            
              29
              518044530
              1307188
              0.5954320043486347
              computers.notebook
              hp
              285.51
              1

            
              16
              518044530
              1306818
              0.6230029231898839
              computers.notebook
              lenovo
              218.77
              1

            
              15
              518044530
              1307004
              0.6230029231898839
              computers.notebook
              lenovo
              290.61
              1

            
              11
              518044530
              1307068
              0.6230029231898839
              computers.notebook
              lenovo
              303.48
              1

            
              20
              518044530
              1307366
              0.6230029231898839
              computers.notebook
              lenovo
              248.62
              1

            
              26
              518044530
              1305998
              0.5954320043486347
              computers.notebook
              hp
              270.02
              1

            
              10
              518044530
              1307151
              0.6230029231898839
              computers.notebook
              lenovo
              329.46
              1

            
              25
              518044530
              1305583
              0.5954320043486347
              computers.notebook
              hp
              295.99
              1

## sampleuser_test.csv

          
            
            user_id
            product_id
            user_score
            user_purchase
            interaction_score
            category_code
            brand
            price
            price_category
            predicted_interaction
            predicted_purchase

            
              7739
              518044530
              1307135
              100
              1
              0.9999999999999999
              computers.notebook
              hp
              320.35
              1
              0.5954320043486347
              1

            
              13096
              518044530
              1307356
              1
              0
              0.025
              computers.notebook
              asus
              373.21
              2
              0.03571788529647339
              0

            
              16729
              518044530
              1306686
              1
              0
              0.025
              computers.notebook
              prestigio
              257.15
              1
              0.10429113969159012
              0

            
              20097
              518044530
              1306185
              2
              0
              0.03484848484848485
              computers.notebook
              acer
              386.08
              2
              0.14809916917365376
              0

## sampleuser_trainval.csv

          
            
            user_id
            product_id
            user_score
            user_purchase
            interaction_score
            category_code
            brand
            price
            price_category

            
              543
              518044530
              1307237
              6
              0
              0.07424242424242423
              computers.notebook
              lenovo
              257.38
              1

            
              2063
              518044530
              1307366
              54
              1
              0.5469696969696969
              computers.notebook
              lenovo
              248.62
              1

            
              9017
              518044530
              1307067
              100
              1
              0.9999999999999999
              computers.notebook
              lenovo
              251.74
              1

            
              113488
              518044530
              1307316
              1
              0
              0.025
              computers.notebook
              lenovo
              248.89
              1

            
              49036
              518044530
              1307188
              100
              1
              0.9999999999999999
              computers.notebook
              hp
              285.51
              1

            
              20103
              518044530
              1307370
              52
              1
              0.5272727272727272
              computers.notebook
              acer
              257.15
              1

## matrix2.py
# Label the raw prediction array: rows are the sorted training user ids,
# columns are the sorted training product ids.
# NOTE(review): assumes content_matrix was computed in that same sorted
# order -- confirm against the code that produces it.
content_matrix = pd.DataFrame(
    content_matrix,
    index=sorted(X_train['user_id'].unique()),
    columns=sorted(X_train['product_id'].unique()),
)

# Go from the wide user x product layout to one row per (user, product) pair.
# stack() + reset_index() yields the generic column names level_0 / level_1 / 0,
# which are renamed to meaningful ones here.
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(
    columns={
        'level_0': 'user_id',
        'level_1': 'product_id',
        0: 'predicted_interaction',
    }
)

# Attach the prediction to each validation row (pandas' default inner join on
# the user/product pair).
X_valid = X_valid.merge(content_df, on=['user_id', 'product_id'])

# A predicted interaction of at least 0.5 is labelled as a purchase (1), else 0.
X_valid['predicted_purchase'] = X_valid['predicted_interaction'].apply(
    lambda score: int(score >= 0.5)
)

## matrix.py
# One row per product with its categorical attributes, sorted by product_id so
# that every matrix built below uses the same product ordering for its labels.
product_cat = X_train[['product_id','price_category','category_code','brand']].drop_duplicates('product_id')
product_cat = product_cat.sort_values(by='product_id')

# Price similarity between products: 1 / (1 + pairwise distance of their
# price_category values), so equal categories score 1 and the score shrinks
# as the categories move apart.
# NOTE(review): euclidean_distances is presumably sklearn.metrics.pairwise's -- confirm.
price_cat_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price_category']).reshape(-1,1))+1)
euclidean_matrix = pd.DataFrame(price_cat_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

# Category similarity: TF-IDF vectors of each product's category_code text,
# compared with cosine similarity.
tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_code']))

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new name and fall back for old installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
else:
    vocabulary_terms = tfidf_vectorizer.get_feature_names()

# The TF-IDF weights are rounded to 3 decimals before the similarity is taken.
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=list(product_cat['product_id']), columns=vocabulary_terms)
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])

## X_train.py
# Build the user x product matrix of user_score values. pivot_table uses its
# default aggregation (mean) for repeated (user, product) pairs; pairs that
# never occur in X_train are filled with 0.
X_train_matrix = X_train.pivot_table(
    index='user_id',
    columns='product_id',
    values='user_score',
).fillna(0)

## groupdf.csv

          
            
            user_id
            product_id
            user_score
            user_purchase
            interaction_score
            category_code
            brand
            price
            price_category

            
              0
              269253210
              1401933
              1
              0
              0.025
              computers.desktop
              acer
              1029.6
              5

            
              1
              295655799
              6400036
              1
              0
              0.025
              computers.components.cpu
              intel
              338.23
              4

            
              2
              337535108
              1307285
              5
              0
              0.064393939
              computers.notebook
              hp
              1407.76
              5

            
              3
              512430246
              1307285
              1
              0
              0.025
              computers.notebook
              hp
              1407.76
              5

            
              4
              512617890
              1307285
              1
              0
              0.025
              computers.notebook
              hp
              1407.76
              5

            
              ...
              ...
              ...
              ...
              ...
              ...
              ...
              ...
              ...
              ...

            
              195668
              557140924
              1306861
              1
              0
              0.025
              computers.notebook
              asus
              933.61
              4

            
              195669
              557155858
              9700243
              1
              0
              0.025
              computers.components.power_supply
              thermaltake
              149.07
              5

            
              195670
              557162041
              6600870
              1
              0
              0.025
              computers.components.memory
              zeppelin
              17.5
              1

## group.py
# Collapse the raw rows to one row per (user, product) pair, summing the score
# and purchase columns. The columns are selected with a list: selecting with a
# bare tuple (['a','b']) was deprecated in pandas 1.0 and raises in pandas 2.0.
group = df.groupby(['user_id','product_id'])[['user_score','user_purchase']].sum().reset_index()

# Cap the summed values: any purchase total above 1 becomes 1, and any score
# above 100 becomes 100. clip() does this without a per-row Python lambda.
group['user_purchase'] = group['user_purchase'].clip(upper=1)
group['user_score'] = group['user_score'].clip(upper=100)

# Rescale the capped score linearly into [0.025, 1]; the lowest observed score
# maps to 0.025 rather than 0.
# NOTE(review): MinMaxScaler is presumably sklearn.preprocessing's -- confirm.
std = MinMaxScaler(feature_range=(0.025, 1))
std.fit(group['user_score'].values.reshape(-1,1))
group['interaction_score'] = std.transform(group['user_score'].values.reshape(-1,1))

# Re-attach the product attributes, taking the first row seen per product_id.
group = group.merge(df[['product_id','category_code','brand','price','price_category']].drop_duplicates('product_id'),on=['product_id'])
	"""
	Implementation of algorithm to train random forest classifiers.

	Author: Tan Pengshi Alvin
	Adapted from: https://towardsdatascience.com/master-machine-learning-random-forest-from-scratch-with-python-3efdd51b6d7a
	"""
	import numpy as np
	import pandas as pd

	from scipy import stats
	"""
	Implementation of algorithm to train decision tree classifiers.

	Author: Tan Pengshi Alvin
	Adapted from: https://towardsdatascience.com/decision-tree-from-scratch-in-python-46e99dfea775
	"""
	import numpy as np
	import pandas as pd
	import random
	user_id	product_id	predicted_interaction	category_code	brand	price	price_category
22	518044530	1304392	0.6230029231898839	computers.notebook	lenovo	180.11	1
29	518044530	1307188	0.5954320043486347	computers.notebook	hp	285.51	1
16	518044530	1306818	0.6230029231898839	computers.notebook	lenovo	218.77	1
15	518044530	1307004	0.6230029231898839	computers.notebook	lenovo	290.61	1
11	518044530	1307068	0.6230029231898839	computers.notebook	lenovo	303.48	1
20	518044530	1307366	0.6230029231898839	computers.notebook	lenovo	248.62	1
26	518044530	1305998	0.5954320043486347	computers.notebook	hp	270.02	1
10	518044530	1307151	0.6230029231898839	computers.notebook	lenovo	329.46	1
25	518044530	1305583	0.5954320043486347	computers.notebook	hp	295.99	1
	user_id	product_id	user_score	user_purchase	interaction_score	category_code	brand	price	price_category	predicted_interaction	predicted_purchase
7739	518044530	1307135	100	1	0.9999999999999999	computers.notebook	hp	320.35	1	0.5954320043486347	1
13096	518044530	1307356	1	0	0.025	computers.notebook	asus	373.21	2	0.03571788529647339	0
16729	518044530	1306686	1	0	0.025	computers.notebook	prestigio	257.15	1	0.10429113969159012	0
20097	518044530	1306185	2	0	0.03484848484848485	computers.notebook	acer	386.08	2	0.14809916917365376	0
	# Label the raw prediction array: rows are the sorted training user ids,
	# columns are the sorted training product ids.
	# NOTE(review): assumes content_matrix was computed in that same sorted
	# order -- confirm against the code that produces it.
	content_matrix = pd.DataFrame(
		content_matrix,
		index=sorted(X_train['user_id'].unique()),
		columns=sorted(X_train['product_id'].unique()),
	)

	# Go from the wide user x product layout to one row per (user, product) pair.
	# stack() + reset_index() yields the generic column names level_0 / level_1 / 0,
	# which are renamed to meaningful ones here.
	content_df = content_matrix.stack().reset_index()
	content_df = content_df.rename(
		columns={
			'level_0': 'user_id',
			'level_1': 'product_id',
			0: 'predicted_interaction',
		}
	)

	# Attach the prediction to each validation row (pandas' default inner join on
	# the user/product pair).
	X_valid = X_valid.merge(content_df, on=['user_id', 'product_id'])

	# A predicted interaction of at least 0.5 is labelled as a purchase (1), else 0.
	X_valid['predicted_purchase'] = X_valid['predicted_interaction'].apply(
		lambda score: int(score >= 0.5)
	)
	# One row per product with its categorical attributes, sorted by product_id so
	# that every matrix built below uses the same product ordering for its labels.
	product_cat = X_train[['product_id','price_category','category_code','brand']].drop_duplicates('product_id')
	product_cat = product_cat.sort_values(by='product_id')

	# Price similarity between products: 1 / (1 + pairwise distance of their
	# price_category values), so equal categories score 1 and the score shrinks
	# as the categories move apart.
	# NOTE(review): euclidean_distances is presumably sklearn.metrics.pairwise's -- confirm.
	price_cat_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price_category']).reshape(-1,1))+1)
	euclidean_matrix = pd.DataFrame(price_cat_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

	# Category similarity: TF-IDF vectors of each product's category_code text,
	# compared with cosine similarity.
	tfidf_vectorizer = TfidfVectorizer()
	doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_code']))

	# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
	# get_feature_names(); prefer the new name and fall back for old installs.
	if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
		vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
	else:
		vocabulary_terms = tfidf_vectorizer.get_feature_names()

	# The TF-IDF weights are rounded to 3 decimals before the similarity is taken.
	dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=list(product_cat['product_id']), columns=vocabulary_terms)
	cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])
	# Build the user x product matrix of user_score values. pivot_table uses its
	# default aggregation (mean) for repeated (user, product) pairs; pairs that
	# never occur in X_train are filled with 0.
	X_train_matrix = X_train.pivot_table(
		index='user_id',
		columns='product_id',
		values='user_score',
	).fillna(0)
	# Collapse the raw rows to one row per (user, product) pair, summing the score
	# and purchase columns. The columns are selected with a list: selecting with a
	# bare tuple (['a','b']) was deprecated in pandas 1.0 and raises in pandas 2.0.
	group = df.groupby(['user_id','product_id'])[['user_score','user_purchase']].sum().reset_index()

	# Cap the summed values: any purchase total above 1 becomes 1, and any score
	# above 100 becomes 100. clip() does this without a per-row Python lambda.
	group['user_purchase'] = group['user_purchase'].clip(upper=1)
	group['user_score'] = group['user_score'].clip(upper=100)

	# Rescale the capped score linearly into [0.025, 1]; the lowest observed score
	# maps to 0.025 rather than 0.
	# NOTE(review): MinMaxScaler is presumably sklearn.preprocessing's -- confirm.
	std = MinMaxScaler(feature_range=(0.025, 1))
	std.fit(group['user_score'].values.reshape(-1,1))
	group['interaction_score'] = std.transform(group['user_score'].values.reshape(-1,1))

	# Re-attach the product attributes, taking the first row seen per product_id.
	group = group.merge(df[['product_id','category_code','brand','price','price_category']].drop_duplicates('product_id'),on=['product_id'])