-- Aggregate Pig script: distinct-user tweet counts per station
-- Raw tweet log. Despite the .csv extension, the file is read as
-- tab-delimited text: PigStorage splits every line on '\t'.
-- No schema is declared here, so fields are addressed positionally ($0..$4).
tweets = LOAD 'tweets.csv' USING PigStorage('\t');
/*
 * Positional layout of each input record:
 *   $0  user_id
 *   $1  MRT station
 *   $2  language
 *   $3  source
 *   $4  date
 */
-- Give every positional field a name and cast it to chararray so the
-- later GROUP / FILTER statements can refer to columns by name.
tweetlogs = FOREACH tweets GENERATE
    (chararray) $0 AS user_id,
    (chararray) $1 AS station,
    (chararray) $2 AS lang,
    (chararray) $3 AS source,
    (chararray) $4 AS date;
-- All-time totals: number of distinct users per station.
grouped = GROUP tweetlogs BY station;
grouped_and_counted = FOREACH grouped {
    -- De-duplicate user ids inside each group so a user who tweeted
    -- several times at the same station is counted once.
    unique_users = DISTINCT tweetlogs.user_id;
    GENERATE FLATTEN(group) AS station, COUNT(unique_users) AS user_count;
}
-- Output rows: station <TAB> user_count
STORE grouped_and_counted INTO 'totals/by_station' USING PigStorage('\t');
-- All-time totals: number of distinct users per (station, language) pair.
grouped = GROUP tweetlogs BY (station, lang);
grouped_and_counted = FOREACH grouped {
    -- De-duplicate user ids inside each (station, lang) group.
    unique_users = DISTINCT tweetlogs.user_id;
    -- The group key is a tuple here; FLATTEN expands it into two columns.
    GENERATE FLATTEN(group) AS (station, lang), COUNT(unique_users) AS user_count;
}
-- Output rows: station <TAB> lang <TAB> user_count
STORE grouped_and_counted INTO 'totals/by_station_lang' USING PigStorage('\t');
--
-- Filter by date
--
-- $DATE is a Pig substitution parameter and must be supplied when the
-- script is launched (e.g. -param DATE=...). The comparison is an exact
-- string match against the chararray `date` column.
filtered_tweets = FILTER tweetlogs BY date == '$DATE';
-- NOTE: this re-binds the alias `tweets` (previously the raw LOAD) to the
-- single-day subset; the statements that follow rely on that name.
-- Columns are selected by name; `source` is emitted before `lang` here.
tweets = FOREACH filtered_tweets GENERATE user_id, station, source, lang, date;
-- Daily totals: number of distinct users per station for the rows
-- currently bound to `tweets` (the $DATE-filtered subset).
grouped = GROUP tweets BY station;
grouped_and_counted = FOREACH grouped {
    -- Count each user at most once per station.
    unique_users = DISTINCT tweets.user_id;
    GENERATE FLATTEN(group) AS station, COUNT(unique_users) AS user_count;
}
-- Output rows: station <TAB> user_count, written under a ds=<DATE> directory.
STORE grouped_and_counted INTO 'daily/by_station/ds=$DATE' USING PigStorage('\t');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment