Skip to content

Instantly share code, notes, and snippets.

View tanpengshi's full-sized avatar

Tan Pengshi Alvin tanpengshi

View GitHub Profile
@tanpengshi
tanpengshi / random_forest.py
Created November 27, 2021 14:39
Random Forest Algorithm from Scratch
"""
Implementation of an algorithm to train random forest classifiers.
Author: Tan Pengshi Alvin
Adapted from: https://towardsdatascience.com/master-machine-learning-random-forest-from-scratch-with-python-3efdd51b6d7a
"""
import numpy as np
import pandas as pd
from scipy import stats
@tanpengshi
tanpengshi / decision_tree.py
Created November 27, 2021 14:25
Decision Tree Algorithm from Scratch
"""
Implementation of an algorithm to train decision tree classifiers.
Author: Tan Pengshi Alvin
Adapted from: https://towardsdatascience.com/decision-tree-from-scratch-in-python-46e99dfea775
"""
import numpy as np
import pandas as pd
import random
user_id product_id predicted_interaction category_code brand price price_category
22 518044530 1304392 0.6230029231898839 computers.notebook lenovo 180.11 1
29 518044530 1307188 0.5954320043486347 computers.notebook hp 285.51 1
16 518044530 1306818 0.6230029231898839 computers.notebook lenovo 218.77 1
15 518044530 1307004 0.6230029231898839 computers.notebook lenovo 290.61 1
11 518044530 1307068 0.6230029231898839 computers.notebook lenovo 303.48 1
20 518044530 1307366 0.6230029231898839 computers.notebook lenovo 248.62 1
26 518044530 1305998 0.5954320043486347 computers.notebook hp 270.02 1
10 518044530 1307151 0.6230029231898839 computers.notebook lenovo 329.46 1
25 518044530 1305583 0.5954320043486347 computers.notebook hp 295.99 1
user_id product_id user_score user_purchase interaction_score category_code brand price price_category predicted_interaction predicted_purchase
7739 518044530 1307135 100 1 0.9999999999999999 computers.notebook hp 320.35 1 0.5954320043486347 1
13096 518044530 1307356 1 0 0.025 computers.notebook asus 373.21 2 0.03571788529647339 0
16729 518044530 1306686 1 0 0.025 computers.notebook prestigio 257.15 1 0.10429113969159012 0
20097 518044530 1306185 2 0 0.03484848484848485 computers.notebook acer 386.08 2 0.14809916917365376 0
user_id product_id user_score user_purchase interaction_score category_code brand price price_category
543 518044530 1307237 6 0 0.07424242424242423 computers.notebook lenovo 257.38 1
2063 518044530 1307366 54 1 0.5469696969696969 computers.notebook lenovo 248.62 1
9017 518044530 1307067 100 1 0.9999999999999999 computers.notebook lenovo 251.74 1
113488 518044530 1307316 1 0 0.025 computers.notebook lenovo 248.89 1
49036 518044530 1307188 100 1 0.9999999999999999 computers.notebook hp 285.51 1
20103 518044530 1307370 52 1 0.5272727272727272 computers.notebook acer 257.15 1
# Label the existing content_matrix values: rows get the sorted unique user ids
# and columns the sorted unique product ids found in X_train.
# NOTE(review): assumes content_matrix was built in this same sorted order and
# shape -- confirm against the code that produces it.
user_labels = sorted(X_train['user_id'].unique())
product_labels = sorted(X_train['product_id'].unique())
content_matrix = pd.DataFrame(
    content_matrix,
    index=user_labels,
    columns=product_labels,
)

# Reshape from wide (user x product) to long form, one row per
# (user_id, product_id) pair. The unnamed index levels come back from
# reset_index() as 'level_0'/'level_1' and the values as column 0, so they are
# renamed to meaningful column names here.
content_df = (
    content_matrix
    .stack()
    .reset_index()
    .rename(
        columns={
            'level_0': 'user_id',
            'level_1': 'product_id',
            0: 'predicted_interaction',
        }
    )
)
# Attach the predicted interaction score to every validation row. merge()
# defaults to an inner join, so only (user_id, product_id) pairs present in
# both X_valid and content_df are kept.
X_valid = X_valid.merge(content_df, on=['user_id', 'product_id'])

# Binarise the score: 1 when predicted_interaction >= 0.5, otherwise 0.
# A vectorised comparison replaces the per-row Python lambda; NaN scores compare
# False and therefore still map to 0, exactly as before.
X_valid['predicted_purchase'] = (X_valid['predicted_interaction'] >= 0.5).astype(int)
# One row per product (first occurrence wins), ordered by product_id so that
# every matrix derived from product_cat shares the same row/column order.
product_cat = (
    X_train[['product_id', 'price_category', 'category_code', 'brand']]
    .drop_duplicates('product_id')
    .sort_values(by='product_id')
)

# Price similarity between products: 1 / (1 + |price_category_i - price_category_j|).
# The column is reshaped to (n_products, 1) because euclidean_distances expects
# a 2-D sample matrix; the +1 keeps the diagonal (distance 0) finite.
price_column = np.array(product_cat['price_category']).reshape(-1, 1)
price_cat_matrix = np.reciprocal(euclidean_distances(price_column) + 1)

# Same values as a product_id x product_id labelled frame.
euclidean_matrix = pd.DataFrame(
    price_cat_matrix,
    index=product_cat['product_id'],
    columns=product_cat['product_id'],
)
# Vectorise each product's category_code string with TF-IDF (one document per
# product, in product_cat row order).
tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_code']))

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on releases that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Dense document-term matrix (weights rounded to 3 decimals), indexed by
# product_id with one column per vocabulary term.
dt_matrix = pd.DataFrame(
    doc_term.toarray().round(3),
    index=list(product_cat['product_id']),
    columns=feature_names,
)

# Pairwise cosine similarity between products' category vectors, labelled
# product_id x product_id.
cos_similar_matrix = pd.DataFrame(
    cosine_similarity(dt_matrix.values),
    columns=product_cat['product_id'],
    index=product_cat['product_id'],
)
# Build the user x product matrix of user_score values. pivot_table aggregates
# repeated (user_id, product_id) pairs with its default aggfunc (mean), and
# pairs with no recorded score are filled with 0.
X_train_matrix = X_train.pivot_table(
    values='user_score',
    index='user_id',
    columns='product_id',
).fillna(0)
user_id product_id user_score user_purchase interaction_score category_code brand price price_category
0 269253210 1401933 1 0 0.025 computers.desktop acer 1029.6 5
1 295655799 6400036 1 0 0.025 computers.components.cpu intel 338.23 4
2 337535108 1307285 5 0 0.064393939 computers.notebook hp 1407.76 5
3 512430246 1307285 1 0 0.025 computers.notebook hp 1407.76 5
4 512617890 1307285 1 0 0.025 computers.notebook hp 1407.76 5
... ... ... ... ... ... ... ... ... ...
195668 557140924 1306861 1 0 0.025 computers.notebook asus 933.61 4
195669 557155858 9700243 1 0 0.025 computers.components.power_supply thermaltake 149.07 5
195670 557162041 6600870 1 0 0.025 computers.components.memory zeppelin 17.5 1
# Total score and purchase count per (user_id, product_id) pair.
# The columns are selected with a list: selecting with a bare tuple
# (['user_score','user_purchase'] without the inner brackets) was deprecated and
# raises in pandas >= 2.0.
group = (
    df.groupby(['user_id', 'product_id'])[['user_score', 'user_purchase']]
    .sum()
    .reset_index()
)

# Cap the aggregated values: user_purchase at 1 (so repeat purchases collapse
# to a single flag) and user_score at 100.
group['user_purchase'] = group['user_purchase'].clip(upper=1)
group['user_score'] = group['user_score'].clip(upper=100)

# Rescale the capped score into [0.025, 1]. The scaler needs a 2-D
# (n_samples, 1) input; its (n, 1) output is flattened before being stored as a
# column. `std` stays fitted on the same data for any later reuse.
std = MinMaxScaler(feature_range=(0.025, 1))
score_column = group['user_score'].values.reshape(-1, 1)
std.fit(score_column)
group['interaction_score'] = std.transform(score_column).ravel()

# Re-attach the product attributes (one row per product_id, first occurrence
# wins) that were dropped by the aggregation.
product_info = df[
    ['product_id', 'category_code', 'brand', 'price', 'price_category']
].drop_duplicates('product_id')
group = group.merge(product_info, on=['product_id'])