Skip to content

Instantly share code, notes, and snippets.

@gsakkis
Created October 3, 2011 21:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save gsakkis/1260324 to your computer and use it in GitHub Desktop.
Save gsakkis/1260324 to your computer and use it in GitHub Desktop.
Slope One
import sys, csv, time
from collections import defaultdict
from itertools import islice
import orig_slopeone, slopeone, numpy_slopeone
classes = [
orig_slopeone.SlopeOne,
#orig_slopeone.SlopeOneNumpy, # XXX: dog slow
slopeone.SlopeOne,
slopeone.SlopeOne2,
slopeone.SlopeOneSym,
slopeone.SlopeOneSym2,
numpy_slopeone.SlopeOneDense,
numpy_slopeone.SlopeOneSparse,
]
class ItemsStorage(object):
def __init__(self, maxsize=None):
self.items = []
self.item2id = {}
self.maxsize = maxsize
def add_item(self, item):
id = self.item2id.get(item)
if id is None:
n = len(self.items)
if self.maxsize is None or n < self.maxsize:
self.item2id[item] = n
self.items.append(item)
id = n
return id
def id(self, item):
return self.item2id[item]
def item(self, id):
return self.items[id]
def __len__(self):
return len(self.items)
def load_data(max_num_items=None):
users_ratings = defaultdict(dict)
items_storage = ItemsStorage(max_num_items)
reader = csv.reader(open('BX-Book-Ratings.csv'), delimiter=';')
reader.next() # skip header
for user_id, isbn, rating in reader:
item_id = items_storage.add_item(isbn)
rating = int(rating)
if item_id is None or rating == 0:
continue
users_ratings[user_id][item_id] = rating
return len(items_storage), users_ratings
def main(max_num_items, num_users, top=5):
num_items, user_ratings = load_data(max_num_items)
print >> sys.stderr, '%d items / %d users / %d ratings' % (
num_items, len(user_ratings),
sum(len(r) for r in user_ratings.itervalues()))
for cls in classes:
name = '%s.%s' % (cls.__module__, cls.__name__)
print >> sys.stderr, name
start = time.time()
slopone = cls(num_items)
slopone.update(user_ratings.itervalues())
print >> sys.stderr, ' training time: %s' % (time.time() - start)
out = open('%s.out' % name, 'w')
if 0:
slopone.dump(out)
else:
start = time.time()
for itemid2rating in islice(user_ratings.itervalues(), num_users):
print >> out, slopone.recomendations(itemid2rating, top)
print >> sys.stderr, ' prediction time (top %d items for %d users): %s' % (
top, num_users, time.time() - start)
out.close()
if __name__ == '__main__':
main(*map(int, sys.argv[1:]))
from collections import defaultdict
from heapq import nlargest
from itertools import izip
from operator import itemgetter
import numpy as np
from scipy import sparse
class SlopeOneDense(object):
def __init__(self, num_items):
self.num_items = num_items
def update(self, users):
# collect the freqs and dicts in dict-of-dicts
freqdict = defaultdict(lambda: defaultdict(int))
diffdict = defaultdict(lambda: defaultdict(int))
for user in users:
itemid_ratings = sorted(user.iteritems(), key=itemgetter(0))
while itemid_ratings:
i, ri = itemid_ratings.pop()
freqs_i = freqdict[i]
diffs_i = diffdict[i]
for j, rj in itemid_ratings:
freqs_i[j] += 1
diffs_i[j] += ri - rj
self.freqsmat = freqsmat = np.zeros((self.num_items, self.num_items), dtype=int)
self.diffsmat = diffsmat = np.zeros((self.num_items, self.num_items), dtype=float)
while freqdict:
i, freqdict_i = freqdict.popitem()
freqsmat[i].put(freqdict_i.keys(), freqdict_i.values())
diffdict_i = diffdict[i]
diffsmat[i].put(diffdict_i.keys(), diffdict_i.values())
del diffdict[i]
nonzero = freqsmat.nonzero()
# scale the diffs by freq
diffsmat[nonzero] /= freqsmat[nonzero]
# freqsmat is symmetric
freqsmat.T[nonzero] = freqsmat[nonzero]
# diffsmat is antisymmetric
diffsmat.T[nonzero] = -diffsmat[nonzero]
def dump(self, fileobj):
freqsmat, diffsmat = self.freqsmat, self.diffsmat
for i, j in izip(*freqsmat.nonzero()):
diff = diffsmat[i,j]
# convert -0.0 to 0.0
if diff == 0: diff = np.abs(diff)
print >> fileobj, i, j, freqsmat[i,j], diff
def recomendations(self, user, top=None):
# ids and ratings of items already rated by user
user_itemids = user.keys()
# take the diff and freq columns corresponding to the user itemids
user_diffs_mat = self.diffsmat[:,user_itemids]
user_freqs_mat = self.freqsmat[:,user_itemids]
# for each candidate item compute the sum of co-occurences with all user
# rated items
user_freqs_vec = user_freqs_mat.sum(axis=1)
# consider only the items with non-zero total freq excluding the already
# rated ones
candidate_ids = np.setdiff1d(user_freqs_vec.nonzero()[0], user_itemids)
if candidate_ids.size == 0:
return []
# memory-efficient and slightly faster version of:
# predictions_vec = (user_freqs_mat[candidate_ids] *
# (user_ratings_vec + user_diffs_mat[candidate_ids])
# ).sum(axis=1) / user_freqs_vec[candidate_ids]
predictions_mat = user_diffs_mat[candidate_ids]
predictions_mat += user.values()
predictions_mat *= user_freqs_mat[candidate_ids]
predictions_vec = predictions_mat.sum(axis=1)
predictions_vec /= user_freqs_vec[candidate_ids]
# translate from numpy to python-land and sort or find top largest
predictions = izip(predictions_vec, candidate_ids)
if top is not None:
return nlargest(top, predictions)
else:
return sorted(predictions, reverse=True)
class SlopeOneSparse(object):
def __init__(self, num_items):
self.num_items = num_items
def update(self, users):
# collect the freqs and dicts in dict-of-dicts
freqdict = defaultdict(lambda: defaultdict(int))
diffdict = defaultdict(lambda: defaultdict(int))
for user in users:
itemid_ratings = sorted(user.iteritems(), key=itemgetter(0))
while itemid_ratings:
i, ri = itemid_ratings.pop()
freqs_i = freqdict[i]
diffs_i = diffdict[i]
for j, rj in itemid_ratings:
freqs_i[j] += 1
diffs_i[j] += ri - rj
# double the size of nonzero entries to account for the symmetric ones
nnz = 2 * sum(len(v) for v in freqdict.itervalues())
# (i, j) rows for nonzero freqs
ij = np.empty((2, nnz), dtype=int)
# nonzero freqs
freqs = np.empty(nnz, dtype=int)
# diffs for nonzero freqs
diffs = np.empty(nnz, dtype=float)
# populate the lower triangle
start = 0
while freqdict:
i, freqdict_i = freqdict.popitem()
end = start + len(freqdict_i)
ij[0, start:end] = i
ij[1, start:end] = freqdict_i.keys()
freqs[start:end] = freqdict_i.values()
diffs[start:end] = diffdict[i].values()
del diffdict[i]
start = end
# populate the upper triangle
nnz2 = nnz / 2
ij[:, nnz2:] = ij[[1,0],:nnz2]
freqs[nnz2:] = freqs[:nnz2]
# scale the diffs by freq
diffs[:nnz2] /= freqs[:nnz2]
# diffs is antisymmetric
diffs[nnz2:] = -diffs[:nnz2]
shape = (self.num_items, self.num_items)
self.freqsmat = sparse.csr_matrix((freqs, ij), shape=shape)
# negate the diffs because we'll be selecting itemids by rows instead
# of columns
self.diffsmat = sparse.csr_matrix((-diffs, ij), shape=shape)
def dump(self, fileobj):
freqsmat, diffsmat = self.freqsmat, self.diffsmat
for i, j in sorted(izip(*freqsmat.nonzero())):
print >> fileobj, i, j, freqsmat[i,j], float(diffsmat[i,j])
def recomendations(self, user, top=None):
# ids and ratings of items already rated by user
user_itemids = user.keys()
# take the diff and freq rows corresponding to the user itemids
user_diffs_mat = self.diffsmat[user_itemids]
user_freqs_mat = self.freqsmat[user_itemids]
# for each candidate item compute the sum of co-occurences with all user
# rated items
user_freqs_vec = user_freqs_mat.sum(axis=0)
# consider only the items with non-zero total freq excluding the already
# rated ones
candidate_ids = np.setdiff1d(np.asarray(user_freqs_vec.nonzero()[1]).ravel(),
user_itemids)
if candidate_ids.size == 0:
return []
# compute the prediction vector
predictions_mat = np.asarray(user_diffs_mat[:, candidate_ids].todense())
predictions_mat += np.array(user.values()).reshape((len(user), 1))
predictions_mat *= user_freqs_mat[:, candidate_ids].todense()
predictions_vec = predictions_mat.sum(axis=0)
predictions_vec /= np.asarray(user_freqs_vec[:, candidate_ids]).ravel()
# translate from numpy to python-land and sort or find top largest
predictions = izip(predictions_vec, candidate_ids)
if top is not None:
return nlargest(top, predictions)
else:
return sorted(predictions, reverse=True)
import sys
from heapq import nlargest
from operator import itemgetter
from numpy import int32, float32
from scipy.sparse import dok_matrix
class SlopeOne(object):
def __init__(self, num_items):
self.diffs = {}
self.freqs = {}
def recomendations(self, useritems, top=None):
preds, freqs = {}, {}
useritems = dict(useritems)
for item, rating in useritems.iteritems():
for diffitem, diffratings in self.diffs.iteritems():
try:
freq = self.freqs[diffitem][item]
except KeyError:
continue
preds.setdefault(diffitem, 0.0)
freqs.setdefault(diffitem, 0)
preds[diffitem] += freq * (diffratings[item] + rating)
freqs[diffitem] += freq
predlist = [(value / freqs[item], item)
for item, value in preds.iteritems()
if item not in useritems and freqs[item] > 0]
if top is not None:
predlist = nlargest(top, predlist)
else:
predlist.sort(key=lambda x: (-x[0], -x[1]))
return predlist
def update(self, userdata):
for userratings in userdata:
for item1, rating1 in userratings.iteritems():
self.freqs.setdefault(item1, {})
self.diffs.setdefault(item1, {})
for item2, rating2 in userratings.iteritems():
if item2 == item1:
continue
self.freqs[item1].setdefault(item2, 0)
self.diffs[item1].setdefault(item2, 0.0)
self.freqs[item1][item2] += 1
self.diffs[item1][item2] += rating1 - rating2
for item1, ratings in self.diffs.iteritems():
for item2 in ratings:
ratings[item2] /= self.freqs[item1][item2]
def dump(self, fileobj=sys.stdout):
diffs = self.diffs
key = itemgetter(0)
for i, rows in sorted(self.freqs.iteritems(), key=key):
diff_i = diffs[i]
for j, freq in sorted(rows.iteritems(), key=key):
print >> fileobj, i, j, freq, float(diff_i[j])
class SlopeOneNumpy(object):
def __init__(self, num_items):
self.freqs = dok_matrix((num_items, num_items), dtype=int32)
self.diffs = dok_matrix((num_items, num_items), dtype=float32)
def recomendations(self, useritems, top=None):
resultpreds, resultfreqs = {}, {}
useritems = dict(useritems)
for id, rating in useritems.iteritems():
freqrow = self.freqs.getrow(id)
for freq, indice in zip(freqrow.data, freqrow.indices):
resultpreds.setdefault(indice, 0.0)
resultfreqs.setdefault(indice, 0)
resultpreds[indice] += freq * (self.diffs[indice, id] + rating)
resultfreqs[indice] += freq
predlist = [(value / resultfreqs[item], item)
for item, value in resultpreds.iteritems()
if item not in useritems and resultfreqs[item] > 0]
if top is not None:
predlist = nlargest(top, predlist)
else:
predlist.sort(key=lambda x: (-x[0], -x[1]))
return predlist
def update(self, userdata):
for userratings in userdata:
for item1, rating1 in userratings.iteritems():
for item2, rating2 in userratings.iteritems():
self.freqs[item1, item2] += 1
delta = float32(rating1 - rating2)
if delta != 0:
# there is a bug in scipy when assign a 0 value for a already zero entry
self.diffs[item1, item2] += delta
for key, rating in self.diffs.iteritems():
self.diffs[key] = rating/self.freqs[key]
from __future__ import division
import sys
from collections import defaultdict
from heapq import nlargest
from operator import itemgetter
class SlopeOne(object):
def __init__(self, num_items):
self.freqs = defaultdict(lambda: defaultdict(int))
self.diffs = defaultdict(lambda: defaultdict(int))
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
item_ratings = sorted(user.iteritems(), key=itemgetter(0))
while item_ratings:
item1, rating1 = item_ratings.pop()
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2, rating2 in item_ratings:
item1_freqs[item2] += 1
freqs[item2][item1] += 1
diff = rating1 - rating2
if diff:
item1_diffs[item2] += diff
diffs[item2][item1] -= diff
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
def recomendations(self, item2rating, top=None):
freqs, diffs = self.freqs, self.diffs
predfreqs = defaultdict(int)
preds = defaultdict(int)
for olditem in freqs:
if olditem in item2rating:
continue
freqs_olditem = freqs[olditem]
for item in item2rating:
freq = freqs_olditem.get(item)
if freq is not None:
predfreqs[olditem] += freq
diffrating = diffs[olditem][item]
preds[olditem] += freq * (item2rating[item] + diffrating)
results = ((preds[item]/predfreqs[item], item) for item in preds)
if top is not None:
results = nlargest(top, results)
else:
results = sorted(results, reverse=True)
return results
def dump(self, fileobj=sys.stdout):
diffs = self.diffs
key = itemgetter(0)
for i, rows in sorted(self.freqs.iteritems(), key=key):
diff_i = diffs[i]
for j, freq in sorted(rows.iteritems(), key=key):
print >> fileobj, i, j, freq, float(diff_i[j])
class SlopeOne2(SlopeOne):
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
for item1, rating1 in user.iteritems():
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2, rating2 in user.iteritems():
item1_freqs[item2] += 1
item1_diffs[item2] += rating1 - rating2
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
class SlopeOneSym(SlopeOne):
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
item_ratings = sorted(user.iteritems(), key=itemgetter(0))
while item_ratings:
item1, rating1 = item_ratings.pop()
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2, rating2 in item_ratings:
item1_freqs[item2] += 1
item1_diffs[item2] += rating1 - rating2
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
def recomendations(self, item2rating, top=None):
freqs, diffs = self.freqs, self.diffs
predfreqs = defaultdict(int)
preds = defaultdict(int)
for olditem in freqs:
if olditem in item2rating:
continue
freqs_olditem = freqs[olditem]
for item in item2rating:
if olditem > item:
freq = freqs_olditem.get(item)
else:
freq = freqs[item].get(olditem)
if freq is not None:
predfreqs[olditem] += freq
if olditem > item:
diffrating = diffs[olditem][item]
else:
diffrating = -diffs[item][olditem]
preds[olditem] += freq * (item2rating[item] + diffrating)
results = ((preds[item]/predfreqs[item], item) for item in preds)
if top is not None:
results = nlargest(top, results)
else:
results = sorted(results, reverse=True)
return results
class SlopeOneSym2(SlopeOneSym):
def update(self, users):
freqs, diffs = self.freqs, self.diffs
for user in users:
for item1, rating1 in user.iteritems():
item1_freqs = freqs[item1]
item1_diffs = diffs[item1]
for item2 in user:
if item2 >= item1:
continue
item1_freqs[item2] += 1
item1_diffs[item2] += rating1 - user[item2]
for item1, ratings in diffs.iteritems():
item1_freqs = freqs[item1]
for item2 in ratings:
ratings[item2] /= item1_freqs[item2]
import unittest
import orig_slopeone, slopeone, numpy_slopeone
from benchmark import ItemsStorage
class SlopeOneTest(unittest.TestCase):
users = [
dict(b1=10, b2=5, b3=7, b4=0, b5=8),
dict(b1=3, b3=6, b5=9, b7=1),
dict(b2=4, b4=6, b6=8),
dict(b5=2, b6=3, b7=4, b8=5, b9=6),
dict(b8=10, b9=2, b10=8),
dict(b4=3, b5=5, b6=7),
]
recommendations = [
[('b9', 12), ('b8', 11), ('b6', 6.8), ('b7', 5)],
[('b9', 8), ('b8', 7), ('b6', 7), ('b2', 8/3.), ('b4', 0)],
[('b1', 12.5), ('b9', 11), ('b8', 10), ('b3', 9.5), ('b7', 9.0), ('b5', 8.4)],
[('b10', 7.5), ('b3', 3), ('b1', 2), ('b2', -1), ('b4', -1.5)],
[('b7', 4.5), ('b6', 3.5), ('b5', 2.5)],
[('b9', 9.5), ('b8', 8.5), ('b1', 19/3.), ('b3', 16/3.), ('b7', 4), ('b2', 3.5)],
]
def setUp(self):
self.items_storage = items_storage = ItemsStorage()
for user in self.users:
for book_id in user.iterkeys():
items_storage.add_item(book_id)
self.ratings = map(self._itemid2rating, self.users)
def test_slope_one(self):
self._test(slopeone.SlopeOne)
def test_slope_one2(self):
self._test(slopeone.SlopeOne2)
def test_slope_one_sym(self):
self._test(slopeone.SlopeOneSym)
def test_slope_one_sym2(self):
self._test(slopeone.SlopeOneSym2)
def test_slope_one_dense(self):
self._test(numpy_slopeone.SlopeOneDense)
def test_slope_one_sparse(self):
self._test(numpy_slopeone.SlopeOneSparse)
def test_orig_slope_one(self):
self._test(orig_slopeone.SlopeOne)
def test_orig_slope_one_numpy(self):
self._test(orig_slopeone.SlopeOneNumpy)
def _itemid2rating(self, user):
return dict((self.items_storage.id(book_id), rating)
for book_id, rating in user.iteritems())
def _test(self, cls):
slopeone = cls(len(self.items_storage))
slopeone.update(self.ratings)
for user, recommendations in zip(self.users, self.recommendations):
for top in None, 1, 2, 3:
predictions = [
(self.items_storage.item(item_id), rating)
for rating, item_id in
slopeone.recomendations(self._itemid2rating(user), top=top)
]
self.assertEqual(predictions, recommendations[:top])
class SmallSlopeOneTest(SlopeOneTest):
users = [
dict(b1=5, b2=3, b3=2),
dict(b1=3, b2=4),
dict(b2=2, b3=5),
]
recommendations = [
[],
[('b3', 10/3.)],
[('b1', 13/3.)],
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment