Last active
December 18, 2015 03:19
-
-
Save rjurney/5716929 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
register 'udfs.py' using jython as udfs;

/* Get the cosine similarity between all GitHub projects that were rated by
   the same person, in two steps (merged by Pig into one M/R job).
   NOTE: the similarity comes from udfs.cosine, i.e. it is a cosine
   similarity, not a Pearson correlation coefficient. The output path below
   keeps its historical "pearson" name so existing consumers are unaffected. */

-- `pairs` is expected to be defined before this point, with at least the
-- fields repo1, repo2, rating1 and rating2.
by_repos = GROUP pairs BY (repo1, repo2);

-- Keep only repo pairs that have more than 5 co-rating records.
gt_5 = FILTER by_repos BY COUNT_STAR(pairs) > 5;

-- One output row per repo pair: (repo1, repo2, similarity).
cosine_sim = FOREACH gt_5 GENERATE FLATTEN(group) AS (repo1, repo2),
                                   udfs.cosine(pairs.rating1, pairs.rating2) AS similarity;

STORE cosine_sim INTO '/tmp/pearson.txt';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dot product
def dot(a, b):
    """Return the dot product of two numeric sequences as a float.

    Every element is passed through float() before multiplying, so values
    that float() accepts (ints, floats, numeric strings) are all allowed.
    Elements are paired positionally; if the sequences differ in length,
    the unpaired tail of the longer one is ignored.

    Returns 0.0 for empty input.
    """
    # Multiply matching elements of a and b. The previous code multiplied
    # a[i] by an undefined avg(b), which is not a dot product.
    return sum((float(x) * float(y) for x, y in zip(a, b)), 0.0)
# L2 Norm
def norm(a):
    """Return the L2 (Euclidean) norm of a numeric sequence as a float.

    Every element is passed through float() before squaring.
    Returns 0.0 for empty input.
    """
    # Imported locally because the module has no top-level math import.
    from math import sqrt

    total = 0.0
    for value in a:
        x = float(value)
        total += x * x
    return sqrt(total)
# Cosine similarity
@outputSchema("similarity:double")
def cosine(ratings_tuples_1, ratings_tuples_2):
    """Return the cosine similarity of two collections of rating tuples.

    Each argument is an iterable of tuples whose first field is the rating
    (the Pig script passes the projections pairs.rating1 / pairs.rating2).
    The ratings are extracted into two vectors a and b and the result is
    dot(a, b) / (norm(a) * norm(b)).

    Returns 0.0 when the product of the norms is zero (for example when an
    input is empty or all-zero), so the division can never fail.
    """
    # Convert to plain lists of numbers: the rating is field 0 of each tuple.
    a = [r[0] for r in ratings_tuples_1]
    b = [r[0] for r in ratings_tuples_2]

    # Compute the denominator once; it is needed for both the zero check
    # and the final division.
    denominator = float(norm(a) * norm(b))
    if denominator == 0.0:
        return 0.0
    return float(dot(a, b)) / denominator
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment