# Source: GitHub Gist by @reachsumit
# (reachsumit/b6d4ef3b2bc9f7ec6cb5ef9414d7a734), created September 26, 2022 04:23.
import numpy as np
import pandas as pd
from numpy import bincount, log, log1p
from scipy.sparse import coo_matrix, linalg
class ExplicitCF:
    """Item-to-item similarity from explicit ratings.

    The ratings file is loaded into a sparse ``item x user`` matrix, the
    matrix is re-weighted with BM25, factorized with a truncated SVD, and the
    L2-normalized factor rows are compared by dot product (cosine similarity).

    Attributes:
        df: Ratings as a DataFrame with categorical ``user`` / ``item``
            columns and a ``rating`` column.
        rating_matrix: ``scipy.sparse.coo_matrix`` whose rows are item
            category codes and whose columns are user category codes.
        factors: Row-normalized factors; set by :meth:`init_predict`.
    """

    def __init__(self, path="ml-100k/u.data"):
        """Load a tab-separated ratings file and build the rating matrix.

        Args:
            path: Location of the ratings file. Only the first three
                tab-separated columns are read, as user, item and rating.
                Defaults to the MovieLens-100k ``u.data`` path.
        """
        self.df = pd.read_csv(
            path,
            sep='\t',
            header=None,
            names=['user', 'item', 'rating'],
            usecols=range(3),
        )
        # Drop incomplete rows *before* creating the categoricals: otherwise
        # users/items that only appear in dropped rows keep a category code
        # and show up as empty rows/columns in the rating matrix.
        self.df.dropna(inplace=True)
        self.df['user'] = self.df['user'].astype("category")
        self.df['item'] = self.df['item'].astype("category")

        ratings = self.df['rating'].astype(float).to_numpy()
        item_codes = self.df['item'].cat.codes.to_numpy()
        user_codes = self.df['user'].cat.codes.to_numpy()
        # Rows are items and columns are users; the shape is inferred from
        # the largest code on each axis.
        self.rating_matrix = coo_matrix((ratings, (item_codes, user_codes)))

    def _bm25_weight(self, X, K1=100, B=0.8):
        """Weigh each row of a sparse matrix X by BM25 weighting.

        Args:
            X: Matrix (anything ``coo_matrix`` accepts) with one "document"
                per row and one "term" per column.
            K1: BM25 saturation parameter.
            B: BM25 length-normalization strength.

        Returns:
            A ``coo_matrix`` with the same sparsity pattern as ``X`` and
            BM25-weighted values. The input's values are not modified: the
            weighted values are stored in a freshly computed array.
        """
        X = coo_matrix(X)

        # idf per column ("term"): log(N / (1 + number of stored entries)).
        N = float(X.shape[0])
        idf = log(N) - log1p(bincount(X.col))

        # Length normalization per row ("document"; here an item row),
        # relative to the average row sum.
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length

        # Apply the BM25 formula to every stored entry.
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X

    def factorize(self, k=50):
        """Factorize the BM25-weighted rating matrix with a truncated SVD.

        Args:
            k: Number of singular triplets to compute; must satisfy the
                constraints of ``scipy.sparse.linalg.svds`` (smaller than
                both matrix dimensions). Defaults to 50.

        Returns:
            Tuple ``(item_factor, user_factor)``: the left singular vectors
            with shape ``(n_rows, k)`` and the right singular vectors with
            shape ``(k, n_cols)``. The singular values are discarded.
        """
        weighted = self._bm25_weight(self.rating_matrix)
        item_factor, _, user_factor = linalg.svds(weighted, k)
        return item_factor, user_factor

    def init_predict(self, x_factors):
        """Store L2-normalized factors so similarity is a plain dot product.

        Args:
            x_factors: 2-D array with one factor vector per row.

        All-zero rows are left as zeros. Dividing them by their zero norm
        would produce NaN, which ``np.argpartition`` in :meth:`get_related`
        orders last and therefore treats as the *best* match.
        """
        x_factors = np.asarray(x_factors)
        norms = np.linalg.norm(x_factors, axis=-1)
        safe_norms = np.where(norms == 0, 1.0, norms)
        self.factors = x_factors / safe_norms[:, np.newaxis]

    def get_related(self, x_id, N=5):
        """Print the ``N`` rows of ``self.factors`` most similar to ``x_id``.

        Args:
            x_id: Row index into ``self.factors`` (a matrix row position,
                i.e. a category code when the factors come from
                :meth:`factorize`, not necessarily the raw id in the file).
            N: Number of results to print. Values larger than the number of
                rows are clamped; a non-positive value prints only the header.

        Requires :meth:`init_predict` to have been called first.
        """
        scores = self.factors.dot(self.factors[x_id])
        # Clamp so argpartition never receives an out-of-range kth.
        top_n = min(N, scores.shape[0])

        print("Recommendations:")
        if top_n <= 0:
            # Guard: a slice of [-0:] would select every row.
            return

        best = np.argpartition(scores, -top_n)[-top_n:]
        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
            print(f"item id: {_id}, score: {score}")
# ---------------------------------------------------------------------------
# Demo: load the ratings, factorize them, and list items similar to one item.
# The commented blocks show the output of a sample run.
# ---------------------------------------------------------------------------
cf_object = ExplicitCF()

# First few raw ratings.
print(cf_object.df.head())
#    user  item  rating
# 0   196   242       3
# 1   186   302       3
# 2    22   377       1
# 3   244    51       2
# 4   166   346       1

# Number of distinct users and items.
print(cf_object.df["user"].nunique())  # 943
print(cf_object.df["item"].nunique())  # 1682

# Summary statistics of the rating column.
print(cf_object.df["rating"].describe())
# count    100000.000000
# mean          3.529860
# std           1.125674
# min           1.000000
# 25%           3.000000
# 50%           4.000000
# 75%           4.000000
# max           5.000000
# Name: rating, dtype: float64

# The rating matrix is laid out as items x users.
print(cf_object.rating_matrix.shape)  # (1682, 943)

# Truncated SVD of the BM25-weighted matrix.
item_factor, user_factor = cf_object.factorize()
print(item_factor.shape)  # (1682, 50)
print(user_factor.shape)  # (50, 943)

# Normalize the item factors, then query the neighbours of row 314.
cf_object.init_predict(item_factor)
print(cf_object.factors.shape)  # (1682, 50)
cf_object.get_related(314)
# Recommendations:
# item id: 314, score: 1.0
# item id: 315, score: 0.8940031189407059
# item id: 346, score: 0.8509562164687848
# item id: 271, score: 0.8441764974934266
# item id: 312, score: 0.7475076699852435
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment