Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active December 18, 2015 00:58
Show Gist options
  • Save rjurney/5700143 to your computer and use it in GitHub Desktop.
Save rjurney/5700143 to your computer and use it in GitHub Desktop.
Processing the github data into distances between items...
/* Combine the per-event-type rating relations into a single relation of (follower, repo, rating) rows.
   NOTE(review): the input relations are defined outside this snippet; their schemas are assumed to match. */
all_ratings = UNION watch_ratings, fork_ratings, create_ratings, download_ratings, issues_ratings;
/* Drop rows that are missing either side of the follower/repo relationship */
all_ratings = FILTER all_ratings BY (follower IS NOT NULL) AND (repo IS NOT NULL);
/* If there are multiple events per follower/repo pair, collapse them into a single value.
   NOTE: this keeps the MAXIMUM rating for the pair, not the average. */
all_ratings = FOREACH (GROUP all_ratings BY (follower, repo)) GENERATE FLATTEN(group) AS (follower, repo),
MAX(all_ratings.rating) as rating;
/* Drop the most prolific followers: their rating sets are so large that the computation never finishes.
   size = number of rating rows for the follower (one per repo after the de-duplication above). */
sizes = FOREACH (GROUP all_ratings BY follower) GENERATE FLATTEN(all_ratings), COUNT_STAR(all_ratings) AS size;
/* Keep only followers with fewer than 1000 rated repos */
lt_1k = FILTER sizes BY size < 1000;
/* Project back to plain (repo, follower, rating) rows, dropping the helper size column */
lt_1k = FOREACH lt_1k GENERATE all_ratings::repo as repo,
follower as follower,
rating as rating;
/* Emit all co-ratings per login: every unordered pair of rating rows that share a follower */
front_pairs = FOREACH (GROUP lt_1k BY follower) GENERATE FLATTEN(datafu.pig.bags.UnorderedPairs(lt_1k)) AS (elem1, elem2);
/* Emit each pair in the opposite direction as well. Plain UNION (without ONSCHEMA) matches fields by
   POSITION, not by alias, so the columns themselves must be swapped here; merely relabelling them
   ("elem1 as elem2, elem2 as elem1") leaves the positions unchanged and just duplicates front_pairs. */
back_pairs = FOREACH front_pairs GENERATE elem2 AS elem1, elem1 AS elem2;
pairs = UNION front_pairs, back_pairs;
/* Flatten the two rating rows of each pair into one row of scalar fields */
pairs = FOREACH pairs GENERATE elem1.follower AS follower,
elem1.repo AS repo1,
elem2.repo AS repo2,
elem1.rating AS rating1,
elem2.rating AS rating2;
/* Collect, for every ordered (repo1, repo2), the ratings given by each follower who rated both */
by_repos = GROUP pairs BY (repo1, repo2);
/* Keep only repo pairs with more than 3 co-rating rows.
   NOTE(review): the alias says "gt_2" but the threshold is 3 - confirm which was intended. The threshold
   now counts distinct co-rating followers, since rows are no longer duplicated by the union above. */
gt_2 = FILTER by_repos BY COUNT_STAR(pairs) > 3;
/* Pearson correlation between the two parallel rating bags of each repo pair */
pearson = FOREACH gt_2 GENERATE FLATTEN(group) AS (repo1, repo2), udfs.pearsons(pairs.rating1, pairs.rating2) as similarity;
store pearson into '/tmp/pearson.txt';
import sys
from math import *
# Derived from example in Programming Collective Intelligence
@outputSchema("pearson:double")
def pearsons(ratings_tuples_1, ratings_tuples_2):
    """Return the Pearson correlation between two parallel rating sequences.

    Each argument is an iterable of tuples whose first field is a numeric
    rating (the Pig script passes two bags projected from the same rows).
    The sequences are expected to have the same length and order, so that
    position i in each refers to the same rater.

    Returns a float in [-1.0, 1.0]. Returns 0.0 when there are no ratings
    or when either sequence has no variance, since the correlation is
    undefined in those cases.
    """
    # Convert to plain lists of numbers
    ratings1 = [r[0] for r in ratings_tuples_1]
    ratings2 = [r[0] for r in ratings_tuples_2]

    # Use a float count so the divisions below never truncate when the
    # ratings are integers (Jython / Python 2 integer division).
    n = float(len(ratings1))
    # No ratings in common: nothing to correlate
    if n == 0:
        return 0.0

    # Sums, sums of squares and sum of products
    sum_1 = sum(ratings1)
    sum_2 = sum(ratings2)
    sum_squares_1 = sum(x * x for x in ratings1)
    sum_squares_2 = sum(y * y for y in ratings2)
    sum_product = sum(x * y for x, y in zip(ratings1, ratings2))

    # Calculate the Pearson score
    numerator = sum_product - (sum_1 * sum_2 / n)
    spread_1 = sum_squares_1 - (sum_1 * sum_1 / n)
    spread_2 = sum_squares_2 - (sum_2 * sum_2 / n)
    # Rounding can push a zero spread slightly negative; checking each term
    # separately avoids both a sqrt domain error and two negative terms
    # multiplying into a bogus positive denominator.
    if spread_1 <= 0 or spread_2 <= 0:
        return 0.0
    denominator = sqrt(spread_1 * spread_2)
    if denominator == 0:
        return 0.0

    # Clamp away rounding error so callers always see a valid correlation
    return max(-1.0, min(1.0, numerator / denominator))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment