Created
February 29, 2012 18:03
-
-
Save davidsnyder/1943050 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Computes, for every (customer, decade) pair, the customer's average rating
-- over the movies of that decade, and stores the result.
--
-- Parameters (substituted by Pig before the script runs):
--   $DECADE_TITLES  comma-delimited input: movie_id,decade,title,avg_rating
--   $TRAINING_SET   comma-delimited input: movie_id,customer_id,rating
--   $OUT            output location for (customer_id,decade,avg_rating)

-- Sample $DECADE_TITLES rows:
-- 1,2000,Dinosaur Planet,3.74
-- 2,2000,Isle of Man TT 2004 Review,3.53
-- 3,1990,Character,3.64
-- 4,1990,Paula Abdul's Get Up & Dance,2.72
movie_decades = LOAD '$DECADE_TITLES' USING PigStorage(',') AS (movie_id:chararray,decade:int,title:chararray,avg_rating:float);

-- Only movie_id and decade are needed downstream; drop the other columns
-- before the join.
snipped_decades = FOREACH movie_decades GENERATE movie_id,decade;

-- Sample $TRAINING_SET rows:
-- 9983,462930,3
-- 9983,1149472,4
-- 9983,238407,4
-- 9983,616720,4
raw_ratings = LOAD '$TRAINING_SET' USING PigStorage(',') AS (movie_id:chararray,customer_id:chararray,rating:int);

-- Attach each rating to its movie's decade.
-- Joined tuple: (movie_id,decade,movie_id,customer_id,rating), e.g. (4,1990,4,616720,3)
joined = JOIN snipped_decades BY movie_id,raw_ratings BY movie_id;

-- Drop the duplicated join key and give the fields unqualified names.
-- Pruned tuple: (movie_id,customer_id,rating,decade), e.g. (4,616720,3,1990)
pruned = FOREACH joined GENERATE
    raw_ratings::movie_id     AS movie_id,
    raw_ratings::customer_id  AS customer_id,
    raw_ratings::rating       AS rating,
    snipped_decades::decade   AS decade;

-- One group per (customer_id,decade); the bag of member tuples is named
-- after the grouped relation, i.e. `pruned`.
-- Grouped tuple: ((616720,1990),{(4,616720,3,1990),...})
grouped = GROUP pruned BY (customer_id,decade);

-- Average rating per group, e.g. (616720,1990,3.6).
-- The aggregates must read the `pruned` bag produced by the GROUP above;
-- `joined` is a separate relation and is not a field of `grouped`.
final = FOREACH grouped {
    rating_count = COUNT(pruned);
    rating_sum   = SUM(pruned.rating);
    -- Cast before dividing so the division is not done in integer arithmetic.
    avg          = (float)rating_sum / rating_count;
    GENERATE
        FLATTEN(group) AS (customer_id,decade),
        avg AS avg_rating;
};

STORE final INTO '$OUT';
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Try adding `set default_parallel 50` at the top of the script.