Skip to content

Instantly share code, notes, and snippets.

@ccattuto
Created May 15, 2012 14:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ccattuto/2702390 to your computer and use it in GitHub Desktop.
Save ccattuto/2702390 to your computer and use it in GitHub Desktop.
Pig script: compute per-day activity timelines for hashtags that occur more than 1000 times in the tweet corpus.
REGISTER lib/com.googlecode.json-simple-json-simple-1.1.jar;
REGISTER lib/joda-time-1.6.jar;
REGISTER eb.jar;
REGISTER piggybank.jar;
DEFINE LOWER org.apache.pig.piggybank.evaluation.string.LOWER;
DEFINE ISOToDay org.apache.pig.piggybank.evaluation.datetime.truncate.ISOToDay;
DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO;
-- Daily hashtag timelines: for every hashtag used more than 1000 times overall,
-- count how many times it appears on each day and store (hashtag, day, freq).

-- Load the tweet files; each record is exposed as a single untyped map.
raw_tweets = LOAD '/twitter/*.json.lzo'
    USING com.twitter.elephantbird.pig.load.LzoJsonLoader()
    AS (json: map[]);

-- Project the hashtag entities and the creation time converted to ISO format.
tweet_tags = FOREACH raw_tweets GENERATE
    json#'entities'#'hashtags' AS h,
    CustomFormatToISO(json#'created_at', 'EEE MMM dd HH:mm:ss Z yyyy') AS t;

-- Expand the hashtag collection so each entry is paired with the tweet's timestamp
-- (assumes the hashtags value is a bag of maps -- TODO confirm against the loader).
tag_entries = FOREACH tweet_tags GENERATE FLATTEN(h) AS h, t;

-- Normalize the hashtag text to lower case so counts are case-insensitive.
tag_events = FOREACH tag_entries GENERATE LOWER(h#'text') AS h, t;

-- Overall frequency of every hashtag.
events_by_tag = GROUP tag_events BY h;
tag_totals = FOREACH events_by_tag GENERATE
    group AS h,
    COUNT(tag_events) AS freq;

-- Keep only hashtags seen more than 1000 times.
popular_tags = FILTER tag_totals BY freq > 1000;

-- Restrict the individual hashtag events to the popular hashtags.
popular_events = JOIN tag_events BY h, popular_tags BY h;

-- Truncate each event's timestamp to its day.
daily_events = FOREACH popular_events GENERATE
    tag_events::h AS h,
    ISOToDay(tag_events::t) AS day;

-- Count events per (hashtag, day) pair.
events_by_tag_day = GROUP daily_events BY (h, day);
timelines = FOREACH events_by_tag_day GENERATE
    group.h AS hashtag,
    group.day AS day,
    COUNT(daily_events) AS freq;

STORE timelines INTO 'hashtag_timelines';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment