rjurney/recommend.pig

## recommend.pig
/* Merge the ratings from every event type into a single relation */
all_ratings = UNION watch_ratings, fork_ratings, create_ratings, download_ratings, issues_ratings;
/* Keep only rows that carry both a follower and a repo */
all_ratings = FILTER all_ratings BY (follower IS NOT NULL) AND (repo IS NOT NULL);
/* Collapse repeated events for one follower/repo pair into a single row holding the highest rating */
ratings_by_pair = GROUP all_ratings BY (follower, repo);
all_ratings = FOREACH ratings_by_pair GENERATE FLATTEN(group) AS (follower, repo),
                                               MAX(all_ratings.rating) AS rating;

/* Drop followers with 1000 or more ratings: with groups that large the computation never finishes */
ratings_by_follower = GROUP all_ratings BY follower;
sizes = FOREACH ratings_by_follower GENERATE FLATTEN(all_ratings), COUNT_STAR(all_ratings) AS size;
lt_1k = FILTER sizes BY size < 1000;
/* Discard the helper size column and go back to plain field names */
lt_1k = FOREACH lt_1k GENERATE all_ratings::repo AS repo, follower AS follower, rating AS rating;

/* Emit all co-ratings per login, in both directions */
/* NOTE(review): assumes UnorderedPairs emits each combination once per follower - confirm against DataFu */
front_pairs = FOREACH (GROUP lt_1k BY follower) GENERATE FLATTEN(datafu.pig.bags.UnorderedPairs(lt_1k)) AS (elem1, elem2);
/* Plain UNION matches fields by position, not by alias, so the reversed copy has to
   swap the fields themselves. Renaming them in place (elem1 AS elem2, elem2 AS elem1)
   leaves the data in the same order and only duplicates front_pairs. */
back_pairs = FOREACH front_pairs GENERATE elem2 AS elem1, elem1 AS elem2;
pairs = UNION front_pairs, back_pairs;
/* Flatten each pair of rating tuples into one row: who rated, which two repos, and both ratings */
pairs = FOREACH pairs GENERATE elem1.follower AS follower,
                               elem1.repo AS repo1,
                               elem2.repo AS repo2,
                               elem1.rating AS rating1,
                               elem2.rating AS rating2;

/* Collect every co-rating row for each (repo1, repo2) combination */
by_repos = GROUP pairs BY (repo1, repo2);
/* Keep only combinations backed by at least 4 co-rating rows */
gt_2 = FILTER by_repos BY COUNT_STAR(pairs) >= 4;
/* Score each remaining combination with the Pearson UDF over its two rating columns */
pearson = FOREACH gt_2 GENERATE FLATTEN(group) AS (repo1, repo2),
                                udfs.pearsons(pairs.rating1, pairs.rating2) AS similarity;

STORE pearson INTO '/tmp/pearson.txt';

## udfs.py
import sys
from math import *

try:
  outputSchema
except NameError:
  # Pig's Jython engine is expected to provide outputSchema when it loads this
  # script. Anywhere else (e.g. a local unit test) fall back to a stand-in that
  # just records the schema string on the function, so the module still imports.
  def outputSchema(schema):
    def decorate(func):
      func.outputSchema = schema
      return func
    return decorate


# Derived from example in Programming Collective Intelligence
@outputSchema("pearson:double")
def pearsons(ratings_tuples_1, ratings_tuples_2):
  """Return the Pearson correlation between two parallel series of ratings.

  Each argument is an iterable of tuples; only the first field of every tuple
  is read as the rating. The two series are paired by position and are
  expected to have the same length (the Pig script passes two columns of the
  same bag).

  Returns a float. 0.0 is returned when there are no ratings at all, or when
  the correlation is undefined because either series has no variance.
  """
  # Convert to plain lists of numbers
  ratings1 = [r[0] for r in ratings_tuples_1]
  ratings2 = [r[0] for r in ratings_tuples_2]

  # Use a float count so every division below is a true division, even with
  # integer ratings on Python 2 / Jython where int / int truncates.
  n = float(len(ratings1))

  # No ratings in common: nothing to correlate (and avoids dividing by zero)
  if n == 0:
    return 0.0

  # Sums, sums of squares and sum of pairwise products
  sum_1 = sum(ratings1)
  sum_2 = sum(ratings2)
  sum_squares_1 = sum(x * x for x in ratings1)
  sum_squares_2 = sum(y * y for y in ratings2)
  sum_product = sum(x * y for x, y in zip(ratings1, ratings2))

  # Calculate the Pearson score
  numerator = sum_product - (sum_1 * sum_2 / n)
  radicand = (sum_squares_1 - sum_1 * sum_1 / n) * (sum_squares_2 - sum_2 * sum_2 / n)

  # A zero variance makes the score undefined; floating-point rounding can also
  # push a zero variance slightly negative, which would make sqrt() raise.
  if radicand <= 0:
    return 0.0
  return numerator / sqrt(radicand)
	/* Merge the ratings from every event type into a single relation */
	all_ratings = UNION watch_ratings, fork_ratings, create_ratings, download_ratings, issues_ratings;
	/* Keep only rows that carry both a follower and a repo */
	all_ratings = FILTER all_ratings BY (follower IS NOT NULL) AND (repo IS NOT NULL);
	/* Collapse repeated events for one follower/repo pair into a single row holding the highest rating */
	ratings_by_pair = GROUP all_ratings BY (follower, repo);
	all_ratings = FOREACH ratings_by_pair GENERATE FLATTEN(group) AS (follower, repo),
	                                               MAX(all_ratings.rating) AS rating;

	/* Drop followers with 1000 or more ratings: with groups that large the computation never finishes */
	ratings_by_follower = GROUP all_ratings BY follower;
	sizes = FOREACH ratings_by_follower GENERATE FLATTEN(all_ratings), COUNT_STAR(all_ratings) AS size;
	lt_1k = FILTER sizes BY size < 1000;
	/* Discard the helper size column and go back to plain field names */
	lt_1k = FOREACH lt_1k GENERATE all_ratings::repo AS repo, follower AS follower, rating AS rating;

	/* Emit all co-ratings per login, in both directions */
	/* NOTE(review): assumes UnorderedPairs emits each combination once per follower - confirm against DataFu */
	front_pairs = FOREACH (GROUP lt_1k BY follower) GENERATE FLATTEN(datafu.pig.bags.UnorderedPairs(lt_1k)) AS (elem1, elem2);
	/* Plain UNION matches fields by position, not by alias, so the reversed copy has to
	   swap the fields themselves. Renaming them in place (elem1 AS elem2, elem2 AS elem1)
	   leaves the data in the same order and only duplicates front_pairs. */
	back_pairs = FOREACH front_pairs GENERATE elem2 AS elem1, elem1 AS elem2;
	pairs = UNION front_pairs, back_pairs;
	/* Flatten each pair of rating tuples into one row: who rated, which two repos, and both ratings */
	pairs = FOREACH pairs GENERATE elem1.follower AS follower,
	                               elem1.repo AS repo1,
	                               elem2.repo AS repo2,
	                               elem1.rating AS rating1,
	                               elem2.rating AS rating2;

	/* Collect every co-rating row for each (repo1, repo2) combination */
	by_repos = GROUP pairs BY (repo1, repo2);
	/* Keep only combinations backed by at least 4 co-rating rows */
	gt_2 = FILTER by_repos BY COUNT_STAR(pairs) >= 4;
	/* Score each remaining combination with the Pearson UDF over its two rating columns */
	pearson = FOREACH gt_2 GENERATE FLATTEN(group) AS (repo1, repo2),
	                                udfs.pearsons(pairs.rating1, pairs.rating2) AS similarity;

	STORE pearson INTO '/tmp/pearson.txt';
	import sys
	from math import *

	try:
	  outputSchema
	except NameError:
	  # Pig's Jython engine is expected to provide outputSchema when it loads this
	  # script. Anywhere else (e.g. a local unit test) fall back to a stand-in that
	  # just records the schema string on the function, so the module still imports.
	  def outputSchema(schema):
	    def decorate(func):
	      func.outputSchema = schema
	      return func
	    return decorate


	# Derived from example in Programming Collective Intelligence
	@outputSchema("pearson:double")
	def pearsons(ratings_tuples_1, ratings_tuples_2):
	  """Return the Pearson correlation between two parallel series of ratings.

	  Each argument is an iterable of tuples; only the first field of every tuple
	  is read as the rating. The two series are paired by position and are
	  expected to have the same length (the Pig script passes two columns of the
	  same bag).

	  Returns a float. 0.0 is returned when there are no ratings at all, or when
	  the correlation is undefined because either series has no variance.
	  """
	  # Convert to plain lists of numbers
	  ratings1 = [r[0] for r in ratings_tuples_1]
	  ratings2 = [r[0] for r in ratings_tuples_2]

	  # Use a float count so every division below is a true division, even with
	  # integer ratings on Python 2 / Jython where int / int truncates.
	  n = float(len(ratings1))

	  # No ratings in common: nothing to correlate (and avoids dividing by zero)
	  if n == 0:
	    return 0.0

	  # Sums, sums of squares and sum of pairwise products
	  sum_1 = sum(ratings1)
	  sum_2 = sum(ratings2)
	  sum_squares_1 = sum(x * x for x in ratings1)
	  sum_squares_2 = sum(y * y for y in ratings2)
	  sum_product = sum(x * y for x, y in zip(ratings1, ratings2))

	  # Calculate the Pearson score
	  numerator = sum_product - (sum_1 * sum_2 / n)
	  radicand = (sum_squares_1 - sum_1 * sum_1 / n) * (sum_squares_2 - sum_2 * sum_2 / n)

	  # A zero variance makes the score undefined; floating-point rounding can also
	  # push a zero variance slightly negative, which would make sqrt() raise.
	  if radicand <= 0:
	    return 0.0
	  return numerator / sqrt(radicand)