Created
May 15, 2012 14:56
-
-
Save ccattuto/2702390 to your computer and use it in GitHub Desktop.
compute timelines of daily hashtag activity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Compute daily usage timelines for frequently used hashtags.
--
-- Input : JSON tweet records matched by '/twitter/*.json.lzo'.
-- Output: one (hashtag, day, freq) tuple per hashtag and day, stored under
--         'hashtag_timelines'. Only hashtags that occur more than 1000 times
--         over the whole input are kept.

-- Jars providing the JSON loader and the Piggybank UDFs used below.
REGISTER lib/com.googlecode.json-simple-json-simple-1.1.jar;
REGISTER lib/joda-time-1.6.jar;
REGISTER eb.jar;  -- NOTE(review): presumably the Elephant Bird build that ships LzoJsonLoader; confirm
REGISTER piggybank.jar;

-- Short names for the Piggybank UDFs.
DEFINE LOWER org.apache.pig.piggybank.evaluation.string.LOWER;
DEFINE ISOToDay org.apache.pig.piggybank.evaluation.datetime.truncate.ISOToDay;
DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO;

-- Each input record is exposed as a single map field named json.
tweets = LOAD '/twitter/*.json.lzo' USING com.twitter.elephantbird.pig.load.LzoJsonLoader() AS (json: map[]);

-- Per tweet: the hashtag entities (h) and the creation time converted from
-- the 'EEE MMM dd HH:mm:ss Z yyyy' format to an ISO timestamp (t).
tag_bags = FOREACH tweets GENERATE
    json#'entities'#'hashtags' AS h,
    CustomFormatToISO(json#'created_at', 'EEE MMM dd HH:mm:ss Z yyyy') AS t;

-- One row per hashtag entity, each carrying its tweet's timestamp.
tag_entities = FOREACH tag_bags GENERATE FLATTEN(h) AS h, t;

-- Keep only the lower-cased hashtag text so counting is case-insensitive.
tag_texts = FOREACH tag_entities GENERATE LOWER(h#'text') AS h, t;

-- Rows without a hashtag text can never reach the output: the inner JOIN
-- below ignores null keys. Dropping them here keeps them out of the GROUP.
hashtags = FILTER tag_texts BY h IS NOT NULL;

-- Total number of occurrences of each hashtag over the whole input.
ghashtags = GROUP hashtags BY h;
hfreqs = FOREACH ghashtags GENERATE group AS h, COUNT(hashtags) AS freq;

-- Restrict to hashtags used more than 1000 times.
high_freqs = FILTER hfreqs BY freq > 1000;

-- Keep only the occurrences that belong to a high-frequency hashtag.
hashtags2 = JOIN hashtags BY h, high_freqs BY h;

-- Truncate each occurrence's timestamp to its day. Fields are referenced by
-- their disambiguated names instead of by position in the joined tuple.
hashtags3 = FOREACH hashtags2 GENERATE hashtags::h AS h, ISOToDay(hashtags::t) AS day;

-- Count occurrences per (hashtag, day).
ghashtags2 = GROUP hashtags3 BY (h, day);
timelines = FOREACH ghashtags2 GENERATE group.h AS hashtag, group.day AS day, COUNT(hashtags3) AS freq;

STORE timelines INTO 'hashtag_timelines';
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment