Created
April 14, 2013 13:06
-
-
Save cathalgarvey/5382677 to your computer and use it in GitHub Desktop.
A script to concatenate the monthly JSON twitter files given in Twitter's tweets archive, and to add a UTC Unix timestamp to each tweet for easy parsing with other tools.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import time | |
import datetime | |
import os | |
import json | |
# Twitter's "created_at" format, e.g. "Sat Apr 13 18:04:17 +0000 2013".
timestamp_format = '%a %b %d %H:%M:%S %z %Y'

def twitter_timestamp_to_obj(time_string):
    'Returns a timezone-aware datetime object.'
    return datetime.datetime.strptime(time_string, timestamp_format)

def twitter_timestamp_to_unix(time_string):
    '''Returns seconds since the UTC Unix epoch (float) for a Twitter timestamp.

    Fixes two bugs in the original implementation:
    1. The time_string argument was ignored; a hard-coded sample timestamp
       was parsed instead, so every caller got the same value.
    2. time.mktime() was applied to a utctimetuple(), which re-interprets
       the UTC fields as *local* time, skewing the result by the local
       UTC offset.
    '''
    timest = twitter_timestamp_to_obj(time_string)
    # .timestamp() on a timezone-aware datetime yields correct UTC epoch seconds.
    return timest.timestamp()
def import_tweets(tweetfile):
    '''Loads one monthly tweet archive file and returns its tweets, newest first.

    The first line of each file is a JavaScript assignment header rather than
    JSON, so it is discarded; the remaining lines are parsed as one JSON array.
    '''
    json_lines = []
    with open(tweetfile) as tweet_fh:
        for lineno, raw_line in enumerate(tweet_fh):
            if lineno == 0:
                # Skip the non-JSON header line.
                continue
            json_lines.append(raw_line.strip())
    tweets = json.loads('\n'.join(json_lines))
    # Sort oldest-first by timestamp, then reverse so newest tweets come first.
    tweets.sort(key=lambda tw: twitter_timestamp_to_unix(tw["created_at"]))
    return tweets[::-1]
#!/usr/bin/env python3 | |
def aggregate_tweets(tweets_dir):
    '''Runs through the monthly "20XX_YY.js" files twitter provides in archive,
    returns single list containing all tweets.

    Files are visited in sorted (chronological) name order; anything whose
    basename does not look like "2011_05.js" (".js" suffix, 10 characters)
    is skipped.
    '''
    all_tweets = []
    for entry in sorted(os.listdir(tweets_dir)):
        # Ignore files that don't look like "2011_05.js".
        # (listdir yields bare basenames, so checking entry directly is
        # equivalent to the original split-the-joined-path dance.)
        if not (entry.endswith(".js") and len(entry) == 10):
            continue
        all_tweets.extend(import_tweets(os.path.join(tweets_dir, entry)))
    return all_tweets
def add_unixtime_to_tweets(tweets_list):
    '''Adds a new field to all tweets: "unix_timestamp", with seconds-since-utc-epoch.

    Mutates each tweet dict in place (derived from its "created_at" field);
    returns None.
    '''
    for tw in tweets_list:
        created = tw["created_at"]
        tw['unix_timestamp'] = twitter_timestamp_to_unix(created)
if __name__ == "__main__":
    import argparse

    # Command-line interface: choose the archive directory and, optionally,
    # an output file; otherwise results are printed to stdout.
    parser = argparse.ArgumentParser(
        description=("A little script to concatenate JSON tweets provided by "
                     "twitter, and to add a unix timestamp to each for ease of sorting and "
                     "comparison with other tools. Prints or saves to JSON."),
        epilog="by Cathal Garey, released under GNU Affero GPL v3.")
    parser.add_argument("-d", "--directory", default=".",
                        help="Directory to seek monthly tweet archive files in. Default is current.")
    parser.add_argument("-o", "--output-file",
                        help="File to save output to rather than printing to stdout.")
    options = parser.parse_args()

    tweet_list = aggregate_tweets(options.directory)
    add_unixtime_to_tweets(tweet_list)

    if not options.output_file:
        # No output file given: dump to stdout.
        print(json.dumps(tweet_list, indent=1))
    else:
        with open(options.output_file, "w") as save_fh:
            json.dump(tweet_list, save_fh, indent=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment