Last active
December 18, 2015 00:58
-
-
Save rjurney/5700143 to your computer and use it in GitHub Desktop.
Processing the github data into distances between items...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Item-to-item similarity: turn per-follower repo ratings into a Pearson
 * similarity score for every pair of repos that share enough raters.
 *
 * The input relations (watch_ratings, fork_ratings, ...), the DataFu jar and
 * the `udfs` Jython namespace are not defined in this section; they are
 * assumed to be loaded/registered earlier, with each *_ratings relation
 * exposing the (follower, repo, rating) fields used below.
 */

/* Combine all different event types into one global rating relation */
all_ratings = UNION watch_ratings, fork_ratings, create_ratings, download_ratings, issues_ratings;
all_ratings = FILTER all_ratings BY (follower IS NOT NULL) AND (repo IS NOT NULL);

/* If there are multiple events per follower/repo pair, collapse them into a
   single value by keeping the highest rating */
all_ratings = FOREACH (GROUP all_ratings BY (follower, repo)) GENERATE FLATTEN(group) AS (follower, repo),
                                                                       MAX(all_ratings.rating) AS rating;

/* Drop followers with 1000 or more ratings: every follower's ratings are
   expanded into all pairs below, and at that size the computation never
   finishes */
sizes = FOREACH (GROUP all_ratings BY follower) GENERATE FLATTEN(all_ratings), COUNT_STAR(all_ratings) AS size;
lt_1k = FILTER sizes BY size < 1000;
lt_1k = FOREACH lt_1k GENERATE all_ratings::repo AS repo,
                               follower AS follower,
                               rating AS rating;

/* Emit all co-ratings per follower */
front_pairs = FOREACH (GROUP lt_1k BY follower) GENERATE FLATTEN(datafu.pig.bags.UnorderedPairs(lt_1k)) AS (elem1, elem2);

/* Mirror every pair so the result is bi-directional. Plain UNION lines fields
   up by position, not by alias, so the two elements must actually trade
   places here; only re-aliasing them would re-emit front_pairs unchanged. */
back_pairs = FOREACH front_pairs GENERATE elem2 AS elem1, elem1 AS elem2;
pairs = UNION front_pairs, back_pairs;
pairs = FOREACH pairs GENERATE elem1.follower AS follower,
                               elem1.repo AS repo1,
                               elem2.repo AS repo2,
                               elem1.rating AS rating1,
                               elem2.rating AS rating2;

/* Keep only repo pairs with more than 3 co-ratings (the alias says gt_2, but
   the threshold applied is > 3), then score each remaining pair */
by_repos = GROUP pairs BY (repo1, repo2);
gt_2 = FILTER by_repos BY COUNT_STAR(pairs) > 3;
pearson = FOREACH gt_2 GENERATE FLATTEN(group) AS (repo1, repo2), udfs.pearsons(pairs.rating1, pairs.rating2) AS similarity;
STORE pearson INTO '/tmp/pearson.txt';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from math import * | |
# Derived from example in Programming Collective Intelligence | |
try:
    outputSchema
except NameError:
    # Pig's Jython engine injects ``outputSchema`` into the script namespace.
    # Provide a stand-in so this module can also be imported (for example by
    # unit tests) outside of Pig.
    def outputSchema(schema):
        def decorate(func):
            func.outputSchema = schema
            return func
        return decorate


@outputSchema("pearson:double")
def pearsons(ratings_tuples_1, ratings_tuples_2):
    """Return the Pearson correlation coefficient of two rating series.

    Args:
        ratings_tuples_1: sequence of one-field tuples; the first field of
            each tuple is the rating.
        ratings_tuples_2: sequence of one-field tuples of the same length,
            aligned element by element with ``ratings_tuples_1``.

    Returns:
        The correlation as a float. ``0.0`` is returned when the inputs are
        empty or when either series has no variance (the coefficient is
        undefined in both cases).

    Raises:
        ValueError: if the two inputs do not have the same length.
    """
    # Unwrap the single-field tuples into plain lists of numbers.
    ratings1 = [r[0] for r in ratings_tuples_1]
    ratings2 = [r[0] for r in ratings_tuples_2]

    count = len(ratings1)
    if len(ratings2) != count:
        raise ValueError("rating series must have the same length")
    # No ratings in common: avoid dividing by zero below.
    if count == 0:
        return 0.0
    # Force float division; with integer ratings, ``/`` would truncate on
    # Python 2 / Jython.
    n = float(count)

    sum_1 = sum(ratings1)
    sum_2 = sum(ratings2)
    sum_squares_1 = sum(value * value for value in ratings1)
    sum_squares_2 = sum(value * value for value in ratings2)
    sum_product = sum(a * b for a, b in zip(ratings1, ratings2))

    numerator = sum_product - (sum_1 * sum_2 / n)
    variance_product = ((sum_squares_1 - sum_1 * sum_1 / n) *
                        (sum_squares_2 - sum_2 * sum_2 / n))
    # Zero means a constant series; a slightly negative value can only come
    # from floating-point rounding and would make sqrt() raise.
    if variance_product <= 0:
        return 0.0
    return numerator / sqrt(variance_product)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment