Created
June 23, 2019 19:15
-
-
Save himanshk96/21594b9f49a8b3060ff1f00d0a0d8ec5 to your computer and use it in GitHub Desktop.
Recommendation using ALS for implicit data. Code for Medium Blog
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 23 22:20:58 2019
@author: himansh

Implicit-feedback recommendations with ALS on the RetailRocket dataset.
(Cleaned: the pasted version carried ' | |' table artifacts on every line,
which made the file invalid Python.)
"""
# import libraries
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn.preprocessing import MinMaxScaler
import implicit
from datetime import datetime, timedelta
# Data Preprocessing
def create_data(datapath, start_date, end_date):
    """Load the RetailRocket events CSV and keep events inside a date window.

    Parameters
    ----------
    datapath : str
        Path to events.csv. Expected columns include ``timestamp`` (epoch
        milliseconds), ``visitorid``, ``event`` and ``itemid``.
    start_date, end_date : str
        Inclusive window bounds parsed with '%Y-%m-%d' (so '2015-5-3' works).

    Returns
    -------
    pandas.DataFrame
        Rows inside [start_date, end_date], columns
        ['visitorid', 'itemid', 'event'], sorted by event date.
    """
    df = pd.read_csv(datapath)
    # Timestamps are epoch milliseconds. fromtimestamp converts in LOCAL
    # time, matching the original behaviour (NOTE(review): not
    # timezone-independent — confirm this is intended).
    df = df.assign(date=pd.Series(datetime.fromtimestamp(ts / 1000).date()
                                  for ts in df.timestamp))
    # RetailRocket does NOT ship the data sorted by date, so sort explicitly.
    df = df.sort_values(by='date').reset_index(drop=True)
    # Parse the window bounds once instead of inside the boolean expression.
    start = datetime.strptime(start_date, '%Y-%m-%d').date()
    end = datetime.strptime(end_date, '%Y-%m-%d').date()
    df = df[(df.date >= start) & (df.date <= end)]
    return df[['visitorid', 'itemid', 'event']]
# Download the kaggle RetailRocket data and give the events.csv file path
datapath = 'events.csv'
data = create_data(datapath, '2015-5-3', '2015-5-18')

# Map the raw visitor/item ids to contiguous 0..n-1 codes, as required for
# the row/column indices of the sparse matrices below.
data['visitorid'] = data['visitorid'].astype("category")
data['itemid'] = data['itemid'].astype("category")
data['visitor_id'] = data['visitorid'].cat.codes
data['item_id'] = data['itemid'].cat.codes

# BUG FIX: the original used data['event'].cat.codes, which assigns
# alphabetical category codes ('addtocart'->0, 'transaction'->1, 'view'->2).
# That gives add-to-cart events ZERO confidence (they vanish from the
# sparse matrix entirely) and weights a mere view higher than a purchase.
# Use an explicit interaction-strength ordering instead.
# (Assumes RetailRocket's event names; unknown events get weight 0 —
# TODO confirm against the dataset.)
event_weight = {'view': 1, 'addtocart': 2, 'transaction': 3}
data['event'] = data['event'].map(event_weight).fillna(0)

# item-user matrix (items as rows) for fitting; user-item for recommending.
sparse_item_user = sparse.csr_matrix(
    (data['event'].astype(float), (data['item_id'], data['visitor_id'])))
sparse_user_item = sparse.csr_matrix(
    (data['event'].astype(float), (data['visitor_id'], data['item_id'])))

# Building the model
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1,
                                             iterations=20)

# Confidence c = 1 + alpha * r (Hu, Koren & Volinsky 2008); alpha = 40 is
# the value suggested in the paper.
alpha_val = 40
data_conf = (sparse_item_user * alpha_val).astype('double')
# NOTE(review): implicit < 0.5 expects the item-user matrix in fit();
# implicit >= 0.5 expects user-item — confirm against the installed version.
model.fit(data_conf)

### USING THE MODEL
# Get recommendations for one user
user_id = 14
# NOTE(review): in implicit >= 0.5 the second argument must be the single
# user's row, i.e. sparse_user_item[user_id] — confirm library version.
recommended = model.recommend(user_id, sparse_user_item)
print(recommended)

# Get similar items
item_id = 7
n_similar = 3
similar = model.similar_items(item_id, n_similar)
print(similar)
How shall we tune the parameters? E.g. why did you choose these particular values: factors=20, regularization=0.1, iterations=20?
There are default values, and values that usually work "better" in practice; these can generally be chosen somewhat arbitrarily (for some parameters this is because others dominate the algorithm's behaviour and effectively constrain the recommendations). If you have a metric that lets you measure the model's performance, I would advise using something like the true-positive rate, or computing a distance measure — for example, the average distance between recommendations.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How shall we tune the parameters? E.g. why did you choose these parameters?
factors=20, regularization=0.1, iterations=20