Last active
August 29, 2015 13:59
-
-
Save r/10977884 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Sample ten tweets and convert each created_at timestamp to epoch
-- milliseconds with the Jython UDF dateToMillis (registered below as
-- the `datefunctions` namespace).
set default_parallel 10;

-- The Jython runtime must be registered before a script can be
-- registered `using jython`.
register 'lib/jython.jar';
register 's3://tumo/scripts/date.py' using jython as datefunctions;

-- PigStorage with no argument splits fields on tabs, matching the .tsv input.
tweets = load 's3://tumo/input/twitter*.tsv' using PigStorage as (
    tweet_id:long,
    created_at:chararray,
    user_id:long,
    text:chararray,
    in_reply_to_status_id:long,
    retweet_tweet_id:long,
    retweet_created_at:chararray,
    retweet_user_id:long,
    retweet_text:chararray,
    latitude:double,
    longitude:double
);

-- No ORDER BY precedes the LIMIT, so which ten rows are kept is arbitrary;
-- that is fine for a smoke test of the UDF.
limited_tweets = limit tweets 10;

millisecond_tweets = foreach limited_tweets generate tweet_id, datefunctions.dateToMillis(created_at);

-- Print the resulting schema, then the rows themselves.
describe millisecond_tweets;
dump millisecond_tweets;
---------- date.py (the Jython UDF file registered by the Pig script above) ----------
#!/usr/bin/python
import re, datetime

# Pig injects the `outputSchema` decorator into the interpreter before it
# executes a Jython UDF file. When this module is loaded anywhere else
# (e.g. a plain Python unit test) the name does not exist, so fall back to
# a minimal stand-in that just records the schema string on the function.
try:
    outputSchema
except NameError:
    def outputSchema(schema):
        def decorate(func):
            func.outputSchema = schema
            return func
        return decorate


@outputSchema("dateInMillis:long")
def dateToMillis(dateString):
    """Convert a numeric date string to milliseconds since 1970-01-01 00:00:00.

    The string is split on every non-digit character, the final piece is
    discarded, and the remaining pieces are converted to integers and passed
    positionally to ``datetime.datetime`` (year, month, day[, hour[, minute[,
    second[, microsecond]]]]).

    NOTE(review): dropping the final piece assumes the input ends with a
    non-digit terminator such as ``Z`` (e.g. ``2012-03-04T05:06:07Z``), so the
    dropped piece is an empty string. If the input ends in a digit, its last
    numeric field is silently discarded -- confirm against the real data.

    NOTE(review): the parsed value is a naive datetime compared against a
    naive epoch, i.e. the input is treated as UTC -- confirm.

    Args:
        dateString: date/time text whose numeric fields are separated by
            single non-digit characters.

    Returns:
        Whole milliseconds between the epoch and the parsed date/time
        (sub-millisecond precision is truncated toward negative infinity).

    Raises:
        ValueError: if a field is not an integer (including empty fields
            produced by two adjacent separators) or is out of range.
        TypeError: if fewer than three numeric fields remain.
    """
    fields = [int(piece) for piece in re.split(r'[^\d]', dateString)[:-1]]
    d = datetime.datetime(*fields)
    epoch = datetime.datetime(1970, 1, 1)
    delta = d - epoch
    # Scale whole seconds to milliseconds first, THEN add the sub-second part
    # converted from microseconds. Adding raw microseconds to seconds before
    # multiplying by 1000 inflates the result whenever microseconds != 0.
    whole_seconds = delta.days * 24 * 3600 + delta.seconds
    return whole_seconds * 1000 + delta.microseconds // 1000
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment