# reachsumit/cf_explicit.py

## cf_explicit.py
import numpy as np
import pandas as pd

from numpy import bincount, log, log1p
from scipy.sparse import coo_matrix, linalg

class ExplicitCF:
    """Item-item similarity from explicit ratings using BM25 weighting and truncated SVD.

    The ratings file is loaded into a sparse matrix whose rows are item codes
    and whose columns are user codes.  That matrix is BM25-weighted,
    factorized with a truncated SVD, and the (row-normalized) factors are
    compared with a plain dot product, which then equals cosine similarity.
    """

    def __init__(self, path="ml-100k/u.data"):
        """Load the ratings and build the sparse item x user rating matrix.

        Args:
            path: Tab-separated ratings file without a header row.  Only the
                first three columns are read, as user, item and rating.
        """
        self.df = pd.read_csv(path, sep='\t', header=None,
                              names=['user', 'item', 'rating'], usecols=range(3))
        # Drop incomplete rows *before* creating the categoricals: a user or
        # item that only occurs in a dropped row would otherwise survive as an
        # unused category and leave an empty row/column in the matrix.
        self.df.dropna(inplace=True)
        self.df['user'] = self.df['user'].astype("category")
        self.df['item'] = self.df['item'].astype("category")

        # Row index = item category code, column index = user category code.
        self.rating_matrix = coo_matrix((self.df['rating'].astype(float),
                                         (self.df['item'].cat.codes,
                                          self.df['user'].cat.codes)))

    def _bm25_weight(self, X, K1=100, B=0.8):
        """Return a COO matrix holding the BM25-weighted values of X.

        Rows play the role of documents (items here) and columns the role of
        terms (users here).  Each stored value ``v`` at ``(row, col)`` becomes
        ``v * (K1 + 1) / (K1 * length_norm[row] + v) * idf[col]``.

        Args:
            X: Sparse matrix (anything ``coo_matrix`` accepts).
            K1: Weight of the length normalization in the denominator.
            B: Mix between no length normalization (0) and full
                normalization by relative row sum (1).

        Returns:
            A ``coo_matrix`` with the same sparsity pattern as ``X``.
        """
        X = coo_matrix(X)
        N = float(X.shape[0])
        # idf per column: log(number of rows) - log(1 + stored entries in the column)
        idf = log(N) - log1p(bincount(X.col))
        # length_norm per row: row sum relative to the mean row sum
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length
        # The product below is a fresh array, so the caller's data array is
        # left untouched; only this new matrix object is re-pointed at it.
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X

    def factorize(self, factors=50):
        """Factorize the BM25-weighted rating matrix with a truncated SVD.

        Args:
            factors: Number of singular vectors to compute.  It is passed to
                ``scipy.sparse.linalg.svds`` as ``k`` and must be smaller than
                both matrix dimensions.

        Returns:
            Tuple ``(item_factor, user_factor)``: the left singular vectors
            (one row per matrix row) and the right singular vectors (one
            column per matrix column).  The singular values are discarded.
        """
        weighted = self._bm25_weight(self.rating_matrix)
        item_factor, _, user_factor = linalg.svds(weighted, k=factors)
        return item_factor, user_factor

    def init_predict(self, x_factors):
        """Store row-normalized factors in ``self.factors``.

        After normalization the dot product of two rows is their cosine
        similarity.

        Args:
            x_factors: 2-D array with one factor vector per row.
        """
        norms = np.linalg.norm(x_factors, axis=-1)
        # An all-zero row has norm 0; dividing by 1 instead keeps it at zero
        # rather than turning it (and every score involving it) into NaN.
        safe_norms = np.where(norms == 0, 1.0, norms)
        self.factors = x_factors / safe_norms[:, np.newaxis]

    def get_related(self, x_id, N=5):
        """Print the N rows of ``self.factors`` most similar to row ``x_id``.

        Requires ``init_predict`` to have been called.  The queried row itself
        is part of the ranking.  Printed ids are row positions in
        ``self.factors``.

        Args:
            x_id: Row index of the query item in ``self.factors``.
            N: Maximum number of results; values larger than the number of
                rows are clamped, and ``N <= 0`` prints only the header.
        """
        scores = self.factors.dot(self.factors[x_id])
        print("Recommendations:")
        # Clamp N: argpartition rejects kth beyond the array length, and
        # N == 0 would make the ``[-N:]`` slice select every row.
        top = min(N, len(scores))
        if top <= 0:
            return
        best = np.argpartition(scores, -top)[-top:]
        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
            print(f"item id: {_id}, score: {score}")

# Demo run on the MovieLens 100k ratings; the trailing comments show the
# output of a sample run.
cf_object = ExplicitCF()

# First rows of the raw ratings table.
print(cf_object.df.head())
#  user item  rating
#0  196  242       3
#1  186  302       3
#2   22  377       1
#3  244   51       2
#4  166  346       1

# Number of distinct users and items, then summary statistics of the ratings.
print(cf_object.df["user"].nunique()) # 943
print(cf_object.df["item"].nunique()) # 1682
print(cf_object.df["rating"].describe())
#count    100000.000000
#mean          3.529860
#std           1.125674
#min           1.000000
#25%           3.000000
#50%           4.000000
#75%           4.000000
#max           5.000000
#Name: rating, dtype: float64

# The sparse matrix is laid out as items x users.
print(cf_object.rating_matrix.shape) # (1682, 943)

# Truncated SVD of the weighted matrix.
item_factor, user_factor = cf_object.factorize()
print(item_factor.shape) # (1682, 50)
print(user_factor.shape) # (50, 943)

# Normalize the item factors, then list the rows closest to row 314.
# The printed ids are row positions in the factor matrix (item category
# codes), and the query row itself comes first with score 1.0.
cf_object.init_predict(item_factor)
print(cf_object.factors.shape) # (1682, 50)
cf_object.get_related(314)
#Recommendations:
#item id: 314, score: 1.0
#item id: 315, score: 0.8940031189407059
#item id: 346, score: 0.8509562164687848
#item id: 271, score: 0.8441764974934266
#item id: 312, score: 0.7475076699852435
	import numpy as np
	import pandas as pd

	from numpy import bincount, log, log1p
	from scipy.sparse import coo_matrix, linalg

	class ExplicitCF:
	    """Item-item similarity from explicit ratings using BM25 weighting and truncated SVD.

	    The ratings file is loaded into a sparse matrix whose rows are item codes
	    and whose columns are user codes.  That matrix is BM25-weighted,
	    factorized with a truncated SVD, and the (row-normalized) factors are
	    compared with a plain dot product, which then equals cosine similarity.
	    """

	    def __init__(self, path="ml-100k/u.data"):
	        """Load the ratings and build the sparse item x user rating matrix.

	        Args:
	            path: Tab-separated ratings file without a header row.  Only the
	                first three columns are read, as user, item and rating.
	        """
	        self.df = pd.read_csv(path, sep='\t', header=None,
	                              names=['user', 'item', 'rating'], usecols=range(3))
	        # Drop incomplete rows *before* creating the categoricals: a user or
	        # item that only occurs in a dropped row would otherwise survive as an
	        # unused category and leave an empty row/column in the matrix.
	        self.df.dropna(inplace=True)
	        self.df['user'] = self.df['user'].astype("category")
	        self.df['item'] = self.df['item'].astype("category")

	        # Row index = item category code, column index = user category code.
	        self.rating_matrix = coo_matrix((self.df['rating'].astype(float),
	                                         (self.df['item'].cat.codes,
	                                          self.df['user'].cat.codes)))

	    def _bm25_weight(self, X, K1=100, B=0.8):
	        """Return a COO matrix holding the BM25-weighted values of X.

	        Rows play the role of documents (items here) and columns the role of
	        terms (users here).  Each stored value ``v`` at ``(row, col)`` becomes
	        ``v * (K1 + 1) / (K1 * length_norm[row] + v) * idf[col]``.

	        Args:
	            X: Sparse matrix (anything ``coo_matrix`` accepts).
	            K1: Weight of the length normalization in the denominator.
	            B: Mix between no length normalization (0) and full
	                normalization by relative row sum (1).

	        Returns:
	            A ``coo_matrix`` with the same sparsity pattern as ``X``.
	        """
	        X = coo_matrix(X)
	        N = float(X.shape[0])
	        # idf per column: log(number of rows) - log(1 + stored entries in the column)
	        idf = log(N) - log1p(bincount(X.col))
	        # length_norm per row: row sum relative to the mean row sum
	        row_sums = np.ravel(X.sum(axis=1))
	        average_length = row_sums.mean()
	        length_norm = (1.0 - B) + B * row_sums / average_length
	        # The product below is a fresh array, so the caller's data array is
	        # left untouched; only this new matrix object is re-pointed at it.
	        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
	        return X

	    def factorize(self, factors=50):
	        """Factorize the BM25-weighted rating matrix with a truncated SVD.

	        Args:
	            factors: Number of singular vectors to compute.  It is passed to
	                ``scipy.sparse.linalg.svds`` as ``k`` and must be smaller than
	                both matrix dimensions.

	        Returns:
	            Tuple ``(item_factor, user_factor)``: the left singular vectors
	            (one row per matrix row) and the right singular vectors (one
	            column per matrix column).  The singular values are discarded.
	        """
	        weighted = self._bm25_weight(self.rating_matrix)
	        item_factor, _, user_factor = linalg.svds(weighted, k=factors)
	        return item_factor, user_factor

	    def init_predict(self, x_factors):
	        """Store row-normalized factors in ``self.factors``.

	        After normalization the dot product of two rows is their cosine
	        similarity.

	        Args:
	            x_factors: 2-D array with one factor vector per row.
	        """
	        norms = np.linalg.norm(x_factors, axis=-1)
	        # An all-zero row has norm 0; dividing by 1 instead keeps it at zero
	        # rather than turning it (and every score involving it) into NaN.
	        safe_norms = np.where(norms == 0, 1.0, norms)
	        self.factors = x_factors / safe_norms[:, np.newaxis]

	    def get_related(self, x_id, N=5):
	        """Print the N rows of ``self.factors`` most similar to row ``x_id``.

	        Requires ``init_predict`` to have been called.  The queried row itself
	        is part of the ranking.  Printed ids are row positions in
	        ``self.factors``.

	        Args:
	            x_id: Row index of the query item in ``self.factors``.
	            N: Maximum number of results; values larger than the number of
	                rows are clamped, and ``N <= 0`` prints only the header.
	        """
	        scores = self.factors.dot(self.factors[x_id])
	        print("Recommendations:")
	        # Clamp N: argpartition rejects kth beyond the array length, and
	        # N == 0 would make the ``[-N:]`` slice select every row.
	        top = min(N, len(scores))
	        if top <= 0:
	            return
	        best = np.argpartition(scores, -top)[-top:]
	        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
	            print(f"item id: {_id}, score: {score}")

	# Demo run on the MovieLens 100k ratings; the trailing comments show the
	# output of a sample run.
	cf_object = ExplicitCF()

	# First rows of the raw ratings table.
	print(cf_object.df.head())
	#  user item  rating
	#0  196  242       3
	#1  186  302       3
	#2   22  377       1
	#3  244   51       2
	#4  166  346       1

	# Number of distinct users and items, then summary statistics of the ratings.
	print(cf_object.df["user"].nunique()) # 943
	print(cf_object.df["item"].nunique()) # 1682
	print(cf_object.df["rating"].describe())
	#count    100000.000000
	#mean          3.529860
	#std           1.125674
	#min           1.000000
	#25%           3.000000
	#50%           4.000000
	#75%           4.000000
	#max           5.000000
	#Name: rating, dtype: float64

	# The sparse matrix is laid out as items x users.
	print(cf_object.rating_matrix.shape) # (1682, 943)

	# Truncated SVD of the weighted matrix.
	item_factor, user_factor = cf_object.factorize()
	print(item_factor.shape) # (1682, 50)
	print(user_factor.shape) # (50, 943)

	# Normalize the item factors, then list the rows closest to row 314.
	# The printed ids are row positions in the factor matrix (item category
	# codes), and the query row itself comes first with score 1.0.
	cf_object.init_predict(item_factor)
	print(cf_object.factors.shape) # (1682, 50)
	cf_object.get_related(314)
	#Recommendations:
	#item id: 314, score: 1.0
	#item id: 315, score: 0.8940031189407059
	#item id: 346, score: 0.8509562164687848
	#item id: 271, score: 0.8441764974934266
	#item id: 312, score: 0.7475076699852435