Skip to content

Instantly share code, notes, and snippets.

@marcelcaraciolo
Created October 10, 2012 18:31
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save marcelcaraciolo/3867521 to your computer and use it in GitHub Desktop.
Twitter Friends Recommender
#-*-coding: utf-8 -*-
'''
This module represents the recommender system for recommending
new friends based on 'mutual friends'.
'''
__author__ = 'Marcel Caraciolo <caraciol@gmail.com>'
from mrjob.job import MRJob
def combinations(iterable, r):
"""
Implementation of itertools combinations method. Re-implemented here because
of import issues in Amazon Elastic MapReduce. Was just easier to do this than
bootstrap.
More info here: http://docs.python.org/library/itertools.html#itertools.combinations
Input/Output:
combinations('ABCD', 2) --> AB AC AD BC BD CD
combinations(range(4), 3) --> 012 013 023 123
"""
pool = tuple(iterable)
n = len(pool)
if r > n:
return
indices = range(r)
yield tuple(pool[i] for i in indices)
while True:
for i in reversed(range(r)):
if indices[i] != i + n - r:
break
else:
return
indices[i] += 1
for j in range(i + 1, r):
indices[j] = indices[j - 1] + 1
yield tuple(pool[i] for i in indices)
TOP_N = 10
class FriendsRecommender(MRJob):
def steps(self):
return [self.mr(self.map_input, self.reverse_index),
self.mr(None, self.count_number_of_friends),
self.mr(None, self.top_recommendations)]
def map_input(self, key, line):
'''
Compute the reversed index for each connection
between the source.
Input (source -> {“friend1”, “friend2”, “friend3”}):
marcel,jonas,maria,jose,amanda
Output {[friend1], source, 1,
{“friend1”, “friend2”, “friend3”}};
{[friend2], source, 1,
{“friend1”, “friend2”, “friend3”}};
'''
input = line.split(';')
user_id, item_ids = input[0], input[1:]
for i in range(len(item_ids)):
f1 = item_ids[i]
yield f1, (user_id, 1, item_ids)
def reverse_index(self, key, values):
'''
Compute the cartesian product between among all
sources that shares common friends.
Input (friend1 -> (source, 1,
{“friend1”, “friend2”, “friend3”}};
marcel, jonas,maria,jose,amanda
Output {(jonas, maria), 1}, {(maria, jose), 1},
{(jonas,amanda), 1}, {(maria, amanda),1},
{(jose, jonas), 1}
'''
final_list = []
for value in values:
final_list.append(value)
for item1, item2 in combinations(final_list, 2):
f1, count1, f1_friends = item1
f2, count2, f2_friends = item2
if f1 not in f2_friends:
yield (f2, f1), (count1)
if f2 not in f1_friends:
yield (f1, f2), (count2)
def count_number_of_friends(self, key, values):
'''
Count the number of mutual friends.
Input {(source, recommended), [count1, count2, ... count n]}
{(jonas, maria), [1,1,1]},
{(jose, jonas), [1,1]},
{(maria, amanda), [1]},
{(maria,jose), [1]},
{(jonas,amanda), [1, 1, 1]}
Output ({friend1, friend2} numberOfMutualFriends):
(jonas, marcel ) 1
(fabiola, marcel) 1
(marcel, patricia) 1
(marcel, paula) 1
(carol, marcel) 2
'''
user_id, possible_friend = key
yield (user_id), \
(possible_friend, sum(values))
def top_recommendations(self, key, values):
'''
Get the TOP N recommendations for user.
Input ({user_id, [(numberOfMutualFriends, friend),
(numberOfMutualFriends2, friend)]}):
[{marcel, [(patricia, 1), (paula, 1)]},
{carol, [(marcel, 2)]},
{fabiola, [(marcel,1)]},
{jonas, [(marcel,1)]}]
Output ({user_id, [(numberOfMutualFriends, friend),
(numberOfMutualFriends2, friend)]}):
Ex: Get the top 3 suggestions.
[{marcel, [(patricia, 1), (paula, 1)]},
{carol, [(marcel, 2)]},
{fabiola, [(marcel,1)]},
{jonas, [(marcel,1)]}]
'''
recommendations = []
for idx, (item, score) in enumerate(values):
recommendations.append((item, score))
yield key, sorted(recommendations, key=lambda k: -k[1])[:TOP_N]
if __name__ == '__main__':
FriendsRecommender.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment