Skip to content

Embed URL


Subversion checkout URL

You can clone with
Download ZIP
friends recommender with explanations
#-*-coding: utf-8 -*-
This module represents the recommender system for recommending
new friends based on 'mutual friends'.
__author__ = 'Marcel Caraciolo <>'
from mrjob.job import MRJob
def combinations(iterable, r):
Implementation of itertools combinations method. Re-implemented here because
of import issues in Amazon Elastic MapReduce. Was just easier to do this than
More info here:
combinations('ABCD', 2) --> AB AC AD BC BD CD
combinations(range(4), 3) --> 012 013 023 123
pool = tuple(iterable)
n = len(pool)
if r > n:
indices = range(r)
yield tuple(pool[i] for i in indices)
while True:
for i in reversed(range(r)):
if indices[i] != i + n - r:
indices[i] += 1
for j in range(i + 1, r):
indices[j] = indices[j - 1] + 1
yield tuple(pool[i] for i in indices)
TOP_N = 10
class FriendsRecommender(MRJob):
def steps(self):
return [, self.reverse_index),, self.count_number_of_friends),, self.top_recommendations)]
def map_input(self, key, line):
Compute the reversed index for each connection
between the source.
Input (source -> {“friend1”, “friend2”, “friend3”}):
Output {[friend1], source, 1, friend1,
{“friend1”, “friend2”, “friend3”}};
{[friend2], source, 1, friend2,
{“friend1”, “friend2”, “friend3”}};
input = line.split(';')
user_id, item_ids = input[0], input[1:]
for i in range(len(item_ids)):
f1 = item_ids[i]
yield f1, (user_id, 1, f1, item_ids)
def reverse_index(self, key, values):
Compute the cartesian product between among all
sources that shares common friends.
Input (friend1 -> (source, 1, friend1,
{“friend1”, “friend2”, “friend3”}};
marcel, jonas,maria,jose,amanda
Output {(jonas, maria), (1, jonas)}, {(maria, jose), (1, maria)},
{(jonas,amanda), (1, jonas)}, {(maria, amanda),(1, maria)},
{(jose, jonas), (1, jonas)}
final_list = []
for value in values:
for item1, item2 in combinations(final_list, 2):
f1, count1, exp1, f1_friends = item1
f2, count2, exp2, f2_friends = item2
if f1 not in f2_friends:
yield (f2, f1), (count1, exp1)
if f2 not in f1_friends:
yield (f1, f2), (count2, exp2)
def count_number_of_friends(self, key, values):
Count the number of mutual friends.
Input {(source, recommended), [(count1,exp),
(count2,exp), ... (count n,exp)]}
{(jonas, maria), [(1,marcel),(1, marcel),(1, marcel)]},
{(jose, jonas), [(1,marcel),(1, marcel)]},
{(maria, amanda), [(1,marcel)]},
{(maria,jose), [(1, marcel)]},
{(jonas,amanda), [(1,marcel), (1,marcel), (1,marcel)]}
Output ({friend1, friend2} numberOfMutualFriends):
(jonas, marcel ) (1, [marcel]]
(fabiola, marcel) (1, [marcel])
(marcel, patricia) (1, [marcel])
(marcel, paula) (1, [marcel])
(carol, marcel) (2, [maria, jonas])
user_id, possible_friend = key
mutual_friends_count = 0
mutual_friends = []
for value in values:
count, exp = value
mutual_friends_count += count
yield (user_id), \
(possible_friend, mutual_friends_count, mutual_friends)
def top_recommendations(self, key, values):
Get the TOP N recommendations for user.
Input ({user_id, [(numberOfMutualFriends, friend, [exp1,exp2]),
(numberOfMutualFriends2, friend,[exp1,exp2,expn])]}):
[{marcel, [(patricia, 1, [jonas]), (paula, 1, [flavia])]},
{carol, [(marcel, 2, [jonas, flavia])]},
{fabiola, [(marcel,1, [maria])]},
{jonas, [(marcel,1, [fabiola])]}]
Output ({user_id, [(numberOfMutualFriends, friend, [exp1, exp2]),
(numberOfMutualFriends2, friend, [exp1, exp2])]}):
Ex: Get the top 3 suggestions.
[{marcel, [(patricia, 1, [jonas]), (paula, 1, [flavia])]},
{carol, [(marcel, 2, [jonas, flavia])]},
{fabiola, [(marcel,1, [maria])]},
{jonas, [(marcel,1, [fabiola])]}]
recommendations = []
for idx, (item, score, exp) in enumerate(values):
recommendations.append((item, score, exp))
yield key, sorted(recommendations, key=lambda k: -k[1])[:TOP_N]
if __name__ == '__main__':
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.