Created
February 29, 2012 18:03
-
-
Save davidsnyder/1943050 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Computes, for every (customer, decade) pair, the customer's average rating
-- of movies from that decade.
--
-- Parameters:
--   $DECADE_TITLES  comma-delimited input: movie_id,decade,title,avg_rating
--   $TRAINING_SET   comma-delimited input: movie_id,customer_id,rating
--   $OUT            output location: customer_id,decade,avg_rating

-- Sample $DECADE_TITLES rows:
-- 1,2000,Dinosaur Planet,3.74
-- 2,2000,Isle of Man TT 2004 Review,3.53
-- 3,1990,Character,3.64
-- 4,1990,Paula Abdul's Get Up & Dance,2.72
movie_decades = LOAD '$DECADE_TITLES' USING PigStorage(',') AS (movie_id:chararray,decade:int,title:chararray,avg_rating:float);

-- Only the movie_id -> decade mapping is needed downstream.
snipped_decades = FOREACH movie_decades GENERATE movie_id,decade;

-- Sample $TRAINING_SET rows:
-- 9983,462930,3
-- 9983,1149472,4
-- 9983,238407,4
-- 9983,616720,4
raw_ratings = LOAD '$TRAINING_SET' USING PigStorage(',') AS (movie_id:chararray,customer_id:chararray,rating:int);

-- Join output carries both relations' fields, left relation first:
-- (movie_id, decade, movie_id, customer_id, rating), e.g. (4,1990,4,616720,3)
joined = JOIN snipped_decades BY movie_id,raw_ratings BY movie_id;

-- Drop the duplicated join key: (movie_id, customer_id, rating, decade), e.g. (4,616720,3,1990)
pruned = FOREACH joined GENERATE raw_ratings::movie_id AS movie_id,raw_ratings::customer_id AS customer_id,raw_ratings::rating AS rating,snipped_decades::decade AS decade;

-- ((616720,1990),{(4,616720,3,1990),...})
-- The bag inside each group takes the name of the grouped relation: `pruned`.
grouped = GROUP pruned BY (customer_id,decade);

-- (616720,1990,3.6)
final = FOREACH grouped {
    -- The bag must be referenced as `pruned` (the relation that was grouped), not `joined`.
    -- Counting the projected rating column makes COUNT skip null ratings,
    -- which SUM also ignores, so the two stay consistent.
    rating_count = COUNT(pruned.rating);
    rating_sum = SUM(pruned.rating);
    -- Cast before dividing so the quotient is not truncated by integer division.
    avg = (float)rating_sum / rating_count;
    GENERATE
        FLATTEN(group) AS (customer_id,decade),
        avg AS avg_rating;
};

STORE final INTO '$OUT';
Try adding `set default_parallel 50` at the top of the script.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-- Inputs: comma-delimited movie/decade titles and (movie, customer, rating) records
movie_decades = LOAD '$DECADE_TITLES' USING PigStorage(',') AS (movie_id:chararray, decade:int, title:chararray, avg_rating:float);
raw_ratings = LOAD '$TRAINING_SET' USING PigStorage(',') AS (movie_id:chararray, customer_id:chararray, rating:int);

-- Keep only the movie_id -> decade mapping; title and avg_rating are unused below
snipped_decades = FOREACH movie_decades GENERATE movie_id, decade;

-- Match each rating with its movie's decade
ratings_with_decades = JOIN raw_ratings BY movie_id, snipped_decades BY movie_id;

-- Keep one copy of the join key plus the fields needed for grouping and averaging
joined_and_pruned = FOREACH ratings_with_decades GENERATE
    raw_ratings::movie_id AS movie_id,
    raw_ratings::customer_id AS customer_id,
    raw_ratings::rating AS rating,
    snipped_decades::decade AS decade;

-- One group per (customer, decade) pair
by_customer_decade = GROUP joined_and_pruned BY (customer_id, decade);

-- Average rating within each (customer, decade) group
with_avgs = FOREACH by_customer_decade GENERATE
    FLATTEN(group) AS (customer_id, decade),
    AVG(joined_and_pruned.rating) AS avg_rating;

STORE with_avgs INTO '$OUT';