Skip to content

Instantly share code, notes, and snippets.

@aclisp
Last active April 22, 2019 07:45
Show Gist options
  • Save aclisp/5b30eb4442c5e9f3099c61290419b9be to your computer and use it in GitHub Desktop.
Save aclisp/5b30eb4442c5e9f3099c61290419b9be to your computer and use it in GitHub Desktop.
Recommendation using Python
def calc_precision_recall(recommend_method, training_data, testing_data, user_colname, item_colname, sim_matrix, top_k):
    """Compute precision and recall of a recommender against held-out data.

    Only users that appear in both ``training_data`` and ``testing_data`` are
    evaluated. For each of them the recommender is called on the training
    data, and a recommended item counts as a hit when it also appears among
    that user's items in the testing data.

    Args:
        recommend_method: Callable invoked as ``recommend_method(
            training_data, user_colname, item_colname, user, sim_matrix,
            top_k)``. It must return an iterable of entries whose first
            element is the recommended item.
        training_data: DataFrame of interactions handed to the recommender.
        testing_data: DataFrame of held-out interactions.
        user_colname: Name of the user column in both frames.
        item_colname: Name of the item column in both frames.
        sim_matrix: Passed through unchanged to ``recommend_method``.
        top_k: Number of recommendations requested per user. It is also the
            per-user precision denominator, even when the recommender
            returns fewer items.

    Returns:
        A two-element list ``[precision, recall]``. A ratio whose denominator
        is zero (no overlapping users, or ``top_k == 0``) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    train_users = set(training_data[user_colname].unique())

    hit = 0
    n_recall = 0
    n_precision = 0
    # Group the test rows once rather than re-filtering the frame per user.
    for user, user_items in testing_data.groupby(user_colname, sort=False)[item_colname]:
        if user not in train_users:
            continue
        rank = recommend_method(training_data, user_colname, item_colname, user, sim_matrix, top_k)
        recommended = set(entry[0] for entry in rank)
        relevant = set(user_items.unique())
        hit += len(recommended & relevant)
        n_recall += len(relevant)
        n_precision += top_k

    precision = hit / float(n_precision) if n_precision else 0.0
    recall = hit / float(n_recall) if n_recall else 0.0
    return [precision, recall]
def calc_coverage(recommend_method, training_data, testing_data, user_colname, item_colname, sim_matrix, top_k):
    """Compute catalogue coverage of a recommender.

    Coverage is the number of distinct items recommended to at least one
    training user, divided by the number of distinct items in the training
    data.

    Args:
        recommend_method: Callable invoked as ``recommend_method(
            training_data, user_colname, item_colname, user, sim_matrix,
            top_k)``. It must return an iterable of ``(item, score)`` pairs.
        training_data: DataFrame of interactions; supplies both the users to
            recommend for and the item catalogue.
        testing_data: Unused; accepted so all metric functions share one
            signature.
        user_colname: Name of the user column.
        item_colname: Name of the item column.
        sim_matrix: Passed through unchanged to ``recommend_method``.
        top_k: Number of recommendations requested per user.

    Returns:
        The coverage ratio as a float, or 0.0 when the training data has no
        items (previously a ZeroDivisionError).
    """
    # The catalogue is simply every distinct item; no per-user scan needed.
    all_items = set(training_data[item_colname].unique())
    if not all_items:
        return 0.0

    recommend_items = set()
    for user in training_data[user_colname].unique():
        rank = recommend_method(training_data, user_colname, item_colname, user, sim_matrix, top_k)
        recommend_items.update(item for item, _score in rank)

    return len(recommend_items) / float(len(all_items))
def calc_popularity(recommend_method, training_data, testing_data, user_colname, item_colname, sim_matrix, top_k):
    """Compute the average log-popularity of the recommended items.

    An item's popularity is the number of training rows for that item with a
    non-null user value. Each recommendation made to each training user
    contributes ``log(1 + popularity)``; the result is the mean over all
    recommendations.

    Args:
        recommend_method: Callable invoked as ``recommend_method(
            training_data, user_colname, item_colname, user, sim_matrix,
            top_k)``. It must return an iterable of ``(item, score)`` pairs.
        training_data: DataFrame of interactions.
        testing_data: Unused; accepted so all metric functions share one
            signature.
        user_colname: Name of the user column.
        item_colname: Name of the item column.
        sim_matrix: Passed through unchanged to ``recommend_method``.
        top_k: Number of recommendations requested per user.

    Returns:
        The mean of ``log(1 + popularity)`` as a float, or 0.0 when no
        recommendation was produced at all (previously a ZeroDivisionError).
    """
    # Build the lookup from the grouped Series directly. Going through
    # itertuples() + getattr() fails for column names that are not valid
    # Python identifiers, because itertuples renames those fields.
    item_popularity = training_data.groupby(item_colname)[user_colname].count().to_dict()

    total = 0.0
    n = 0
    for user in training_data[user_colname].unique():
        rank = recommend_method(training_data, user_colname, item_colname, user, sim_matrix, top_k)
        for item, _score in rank:
            # An item never seen in training has popularity 0 -> log(1) == 0.
            total += math.log(1 + item_popularity.get(item, 0))
            n += 1

    if n == 0:
        return 0.0
    return total / n
def recommend_item_cf(training_data, user_colname, item_colname, target_user, sim_matrix, top_k):
    """Recommend items to a user with item-based collaborative filtering.

    For every item the target user already has, the ``top_k`` most similar
    *other* items are looked up in ``sim_matrix``. Each neighbour the user
    does not already have accumulates the similarity as its score.

    Args:
        training_data: DataFrame of interactions.
        user_colname: Name of the user column.
        item_colname: Name of the item column.
        target_user: The user to recommend for.
        sim_matrix: DataFrame of item-item similarities, indexed by item on
            both axes. Every item of the target user must be a row label.
        top_k: Neighbourhood size per seed item, and also the maximum number
            of recommendations returned.

    Returns:
        A list of at most ``top_k`` ``(item, score)`` tuples, sorted by
        descending score. Empty when the user has no items in
        ``training_data``.
    """
    is_target = training_data[user_colname] == target_user
    user_items = training_data.loc[is_target, item_colname].unique()
    # Set membership is O(1); `in` on the ndarray is a linear scan.
    owned = set(user_items)

    rank = {}
    for item_i in user_items:
        neighbours = (
            sim_matrix.loc[item_i]
            # An item is always maximally similar to itself. Leaving it in
            # would waste one of the top_k neighbour slots, and with
            # top_k == 1 nothing could ever be recommended.
            .drop(labels=item_i, errors='ignore')
            .sort_values(ascending=False)
            .iloc[:top_k]
        )
        for item_j, similarity in neighbours.items():
            if item_j in owned:
                continue
            rank[item_j] = rank.get(item_j, 0) + similarity

    return sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:top_k]
# Demo: build the item-item similarity matrix for the toy data set, then
# request up to 5 item-based CF recommendations for user '001'.
# NOTE(review): this needs `my_data` and `similarity_matrix` to be defined
# before it runs -- confirm the intended execution order.
my_matrix = similarity_matrix(
    training_data=my_data,
    user_colname='user_id',
    item_colname='song',
)
recommend_item_cf(
    training_data=my_data,
    user_colname='user_id',
    item_colname='song',
    target_user='001',
    sim_matrix=my_matrix,
    top_k=5,
)
def recommend_most_popular(training_data, user_colname, item_colname, target_user, sim_matrix, top_k):
    """Recommend the globally most popular items.

    An item's score is the number of training rows for that item with a
    non-null user value. Items are ordered by descending score, with ties
    broken by ascending item value.

    Args:
        training_data: DataFrame of interactions.
        user_colname: Name of the user column (the column that is counted).
        item_colname: Name of the item column (the grouping key).
        target_user: Unused; the same list is returned for every user.
        sim_matrix: Unused; accepted so all recommenders share one signature.
        top_k: Maximum number of items to return.

    Returns:
        A list of ``[item, score]`` lists, at most ``top_k`` long.
    """
    # Count straight into a column named 'score' instead of aggregating and
    # then renaming the result in place.
    popularity = (
        training_data.groupby(item_colname)[user_colname]
        .count()
        .reset_index(name='score')
    )
    ranked = popularity.sort_values(['score', item_colname], ascending=[False, True])
    return ranked.head(top_k).values.tolist()
def recommend_random(training_data, user_colname, item_colname, target_user, sim_matrix, top_k):
    """Recommend distinct items drawn uniformly at random from the catalogue.

    Args:
        training_data: DataFrame of interactions; its distinct item values
            form the catalogue that is sampled.
        user_colname: Unused; accepted so all recommenders share one
            signature.
        item_colname: Name of the item column.
        target_user: Unused.
        sim_matrix: Unused.
        top_k: Number of items requested.

    Returns:
        A list of ``(item, 0)`` tuples without repeated items. It holds
        ``top_k`` entries, or every catalogue item when ``top_k`` exceeds the
        catalogue size (previously a ValueError from ``random.sample``).
    """
    items = list(training_data[item_colname].unique())
    # random.sample refuses to draw more elements than the population holds.
    sample_size = min(top_k, len(items))
    return [(item, 0) for item in random.sample(items, sample_size)]
# Toy data set: one (user_id, song) row per user/song interaction,
# covering five users ('001'-'005') and five songs ('A'-'E').
my_data = pandas.DataFrame(
    data=[
        ('001', 'A'), ('001', 'B'), ('001', 'D'),
        ('002', 'B'), ('002', 'C'), ('002', 'E'),
        ('003', 'C'), ('003', 'D'),
        ('004', 'B'), ('004', 'C'), ('004', 'D'),
        ('005', 'A'), ('005', 'D'),
    ],
    columns=['user_id', 'song'],
)
def similarity_matrix(training_data, user_colname, item_colname):
    """Build an item-item similarity matrix using the Jaccard index.

    The similarity of two items is ``|U_i & U_j| / |U_i | U_j|``, where
    ``U_x`` is the set of distinct users that have a row for item ``x``.
    Items with no user in common get similarity 0.

    Args:
        training_data: DataFrame of interactions.
        user_colname: Name of the user column.
        item_colname: Name of the item column.

    Returns:
        A square float DataFrame whose index and columns are the distinct
        items, in order of first appearance in ``training_data``.
    """
    all_items = training_data[item_colname].unique()
    users_by_item = [
        set(training_data.loc[training_data[item_colname] == item, user_colname].unique())
        for item in all_items
    ]

    n_items = len(all_items)
    # Plain ndarray: NumPy no longer recommends the np.matrix class.
    jaccard = np.zeros((n_items, n_items), dtype=float)
    for i, users_i in enumerate(users_by_item):
        # The Jaccard index is symmetric, so compute each pair once (j >= i)
        # and mirror it. Cells for disjoint pairs keep their initial 0.
        for j in range(i, n_items):
            users_j = users_by_item[j]
            n_shared = len(users_i & users_j)
            if n_shared:
                score = n_shared / float(len(users_i | users_j))
                jaccard[i, j] = score
                jaccard[j, i] = score

    return pandas.DataFrame(jaccard, index=all_items, columns=all_items)
# Demo: compute the item-item similarity matrix for the toy data set.
# The result is not stored; presumably it is meant to be displayed as
# interactive/notebook output -- TODO confirm.
similarity_matrix(
    training_data=my_data,
    user_colname='user_id',
    item_colname='song',
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment