Skip to content

Instantly share code, notes, and snippets.

@davidsnyder
Created February 29, 2012 18:03
Show Gist options
  • Save davidsnyder/1943050 to your computer and use it in GitHub Desktop.
Save davidsnyder/1943050 to your computer and use it in GitHub Desktop.
-- Movie catalogue: one comma-separated row per title. Sample rows:
--   1,2000,Dinosaur Planet,3.74
--   2,2000,Isle of Man TT 2004 Review,3.53
--   3,1990,Character,3.64
--   4,1990,Paula Abdul's Get Up & Dance,2.72
movie_decades = LOAD '$DECADE_TITLES'
    USING PigStorage(',')
    AS (
        movie_id:chararray,
        decade:int,
        title:chararray,
        avg_rating:float
    );

-- Only the id and the decade are needed downstream; drop title and avg_rating.
snipped_decades = FOREACH movie_decades GENERATE
    movie_id,
    decade;

-- Individual ratings: movie_id,customer_id,rating. Sample rows:
--   9983,462930,3
--   9983,1149472,4
--   9983,238407,4
--   9983,616720,4
raw_ratings = LOAD '$TRAINING_SET'
    USING PigStorage(',')
    AS (
        movie_id:chararray,
        customer_id:chararray,
        rating:int
    );

-- Pair every rating with the decade of the movie it rates.
-- Joined row shape, e.g.: (4,1990,4,616720,3)
joined = JOIN
    snipped_decades BY movie_id,
    raw_ratings     BY movie_id;

-- Collapse the duplicated join key and settle on one column order.
-- Row shape, e.g.: (4,616720,1990,3)
pruned = FOREACH joined GENERATE
    raw_ratings::movie_id     AS movie_id,
    raw_ratings::customer_id  AS customer_id,
    raw_ratings::rating       AS rating,
    snipped_decades::decade   AS decade;

-- One group per (customer, decade); each group carries a bag named `pruned`.
-- Row shape, e.g.: ((616720,1990),{(4,616720,1990,3),...})
grouped = GROUP pruned BY (customer_id, decade);
-- Average rating each customer gave to movies of each decade.
-- Output row shape, e.g.: (616720,1990,3.6)
--
-- `grouped` is produced by `GROUP pruned BY (customer_id,decade)`, so the bag
-- inside every group is named `pruned` (GROUP names the bag after the relation
-- it grouped). The aggregates must therefore read `pruned`, not `joined`.
final = FOREACH grouped {
    -- Count and sum the same projected column so that a null rating is
    -- excluded from both the numerator and the denominator.
    rating_count = COUNT(pruned.rating);
    rating_total = SUM(pruned.rating);
    -- Cast before dividing: both operands are longs, and long / long truncates.
    mean_rating  = (float)rating_total / rating_count;
    GENERATE
        FLATTEN(group) AS (customer_id,decade),
        mean_rating    AS avg_rating;
};

-- Write (customer_id, decade, avg_rating) rows to the output location.
STORE final INTO '$OUT';
@alienrobotwizard
Copy link

-- Inputs: the movie catalogue and the per-customer ratings, both comma-separated.

movie_decades = LOAD '$DECADE_TITLES'
    USING PigStorage(',')
    AS (
        movie_id:chararray,
        decade:int,
        title:chararray,
        avg_rating:float
    );

raw_ratings = LOAD '$TRAINING_SET'
    USING PigStorage(',')
    AS (
        movie_id:chararray,
        customer_id:chararray,
        rating:int
    );

-- Keep only the columns the computation uses (movie id and decade).

snipped_decades = FOREACH movie_decades GENERATE
    movie_id,
    decade;

-- Tag each rating with the decade of the movie it belongs to

ratings_x_decades = JOIN
    raw_ratings     BY movie_id,
    snipped_decades BY movie_id;

joined_and_pruned = FOREACH ratings_x_decades GENERATE
    raw_ratings::movie_id    AS movie_id,
    raw_ratings::customer_id AS customer_id,
    raw_ratings::rating      AS rating,
    snipped_decades::decade  AS decade;

-- Average the ratings within every (customer, decade) pair

by_customer_decade = GROUP joined_and_pruned BY (customer_id, decade);

with_avgs = FOREACH by_customer_decade GENERATE
    FLATTEN(group)                AS (customer_id, decade),
    AVG(joined_and_pruned.rating) AS avg_rating;

-- Persist the (customer_id, decade, avg_rating) rows

STORE with_avgs INTO '$OUT';

@alienrobotwizard
Copy link

set default_parallel 50

Try putting that at the top of the script: it sets the default number of reducers Pig uses for the jobs the script launches.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment