Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active December 18, 2015 03:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjurney/5716929 to your computer and use it in GitHub Desktop.
Save rjurney/5716929 to your computer and use it in GitHub Desktop.
/* Load the Jython UDFs from udfs.py; they are invoked below under the `udfs` namespace. */
register 'udfs.py' using jython as udfs;
/* Compute a similarity score between all github projects that were rated by the same person,
   in two steps (merged by Pig into one M/R job).
   NOTE(review): the alias and output path below say "pearson", but the UDF actually called is
   udfs.cosine, i.e. cosine similarity rather than Pearson's correlation coefficient. */
-- `pairs` is defined outside this snippet; it must expose repo1, repo2, rating1 and rating2.
by_repos = GROUP pairs BY (repo1, repo2);
-- Keep only the (repo1, repo2) groups that contain more than 5 rows of `pairs`.
gt_5 = FILTER by_repos BY COUNT_STAR(pairs) > 5;
-- For each surviving group, pass the two rating bags to the cosine UDF.
pearson = FOREACH gt_5 GENERATE FLATTEN(group) AS (repo1, repo2),
udfs.cosine(pairs.rating1, pairs.rating2) as similarity;
STORE pearson INTO '/tmp/pearson.txt';
# Dot product
def dot(a, b):
    """Return the dot product of two numeric sequences as a float.

    Elements are paired positionally and each one is converted with
    ``float()`` before multiplying, so numeric strings are accepted.
    Pairing stops at the end of the shorter sequence; callers are expected
    to pass sequences of equal length.

    Returns 0.0 when either sequence is empty.
    """
    # Multiply matching elements; the previous code multiplied by an
    # undefined ``avg(b)`` instead of the corresponding element of ``b``.
    return sum((float(x) * float(y) for x, y in zip(a, b)), 0.0)
# L2 Norm
def norm(a):
    """Return the L2 (Euclidean) norm of the sequence ``a`` as a float.

    Each element is converted with ``float()`` before squaring, so numeric
    strings are accepted. An empty sequence has norm 0.0.
    """
    # Imported locally because nothing else in this module brings ``sqrt``
    # into scope; without it the call below raises NameError.
    from math import sqrt

    return sqrt(sum((float(x) * float(x) for x in a), 0.0))
# Cosine similarity
@outputSchema("similarity:double")
def cosine(ratings_tuples_1, ratings_tuples_2):
    """Return the cosine similarity of two collections of rating tuples.

    Only the first field of every tuple is used as the rating value. The
    result is ``dot(a, b) / (norm(a) * norm(b))``; when that product of
    norms is zero the function returns 0.0 instead of dividing.
    """
    # Unwrap the first field of each tuple into plain lists
    left = [entry[0] for entry in ratings_tuples_1]
    right = [entry[0] for entry in ratings_tuples_2]

    # Product of the two vector lengths, shared by the guard and the division
    magnitude = float(norm(left) * norm(right))
    if magnitude == 0.0:
        return 0.0
    return float(dot(left, right)) / magnitude
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment