Skip to content

Instantly share code, notes, and snippets.

@davidsnyder
Created May 17, 2012 17:28
Show Gist options
  • Save davidsnyder/2720382 to your computer and use it in GitHub Desktop.
Save davidsnyder/2720382 to your computer and use it in GitHub Desktop.
Jaccard Similarity Score for the Netflix graph
-- Each input record at $GRAPH is read as one (user_id, movie_id) edge.
edges = LOAD '$GRAPH' AS (user_id:chararray, movie_id:chararray);
-- Second alias over the same input: the edge list is later matched against
-- itself, and that needs two distinct relation names.
edges_dup = LOAD '$GRAPH' AS (user_id:chararray, movie_id:chararray);

-- outgoing_links: (user_id, user_out)
-- user_out is |A|, the number of edge records loaded for that user.
edges_by_user = GROUP edges BY user_id;
outgoing_links = FOREACH edges_by_user GENERATE
    group        AS user_id,
    COUNT(edges) AS user_out;
-- Match every edge with every edge (from the duplicate alias) that points at
-- the same movie. Each resulting row means "user A and user B both link to
-- this movie"; rows where A and B are the same user are included.
-- Columns: (edges::user_id, edges::movie_id, edges_dup::user_id, edges_dup::movie_id)
co_links = JOIN edges BY movie_id, edges_dup BY movie_id;

-- Keep one column per role: (user_a_id, user_b_id, movie_id)
shared_movie = FOREACH co_links GENERATE
    edges::user_id     AS user_a_id,
    edges_dup::user_id AS user_b_id,
    edges::movie_id    AS movie_id;

-- intersection: (user_a_id, user_b_id, intersection_size)
-- intersection_size is |A intersection B|, the number of shared-movie rows
-- collected for the ordered pair (A, B).
pair_groups = GROUP shared_movie BY (user_a_id, user_b_id);
intersection = FOREACH pair_groups GENERATE
    FLATTEN(group)      AS (user_a_id, user_b_id),
    COUNT(shared_movie) AS intersection_size;
-- Attach |A| and then |B| to every pair by joining twice against outgoing_links.
-- Column positions in sized_ab after both joins:
--   $0 user_a_id   $1 user_b_id   $2 |A intersection B|
--   $3 user_a_id   $4 |A|
--   $5 user_b_id   $6 |B|
sized_a  = JOIN intersection BY user_a_id, outgoing_links BY user_id;
sized_ab = JOIN sized_a BY intersection::user_b_id, outgoing_links BY user_id;

-- jaccard: (user_a_id, user_b_id, jaccard_sim)
-- jaccard_sim = |A int B| / (|A| + |B| - |A int B|); the denominator is |A union B|.
-- The numerator is cast to float so the division is not done in integers.
jaccard = FOREACH sized_ab GENERATE
    $0 AS user_a_id,
    $1 AS user_b_id,
    (float)$2 / ((int)$4 + (int)$6 - (int)$2) AS jaccard_sim;
-- Prune self matches by comparing the ids directly. Filtering on
-- jaccard_sim < 1.0 would also discard pairs of *different* users whose movie
-- sets are identical, which are exactly the strongest matches.
filtered = FILTER jaccard BY user_a_id != user_b_id;

-- No global ORDER is done before grouping: tuple order is not guaranteed to
-- survive a later relational operator, and TOP picks the maximum on its own,
-- so a full sort here would only add an extra job.
grouped = GROUP filtered BY user_a_id; --reduce

-- For each user_a_id keep only the single best-scoring partner.
topped = FOREACH grouped { --map
    -- TOP(n, column, bag): the 1 tuple with the largest value in column 2
    -- (0-based), i.e. jaccard_sim in (user_a_id, user_b_id, jaccard_sim).
    top_jac = TOP(1, 2, filtered);
    GENERATE FLATTEN(top_jac);
};

describe topped;
STORE topped INTO '$OUT';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment