@cathalgarvey
Created April 14, 2013 13:06
A script to concatenate the monthly JSON tweet files provided in Twitter's tweet archive, and to add a UTC Unix timestamp to each tweet for easy parsing with other tools.
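For context, an assumption not stated in the gist: in the 2013-era archive the monthly files live under data/js/tweets/, and each one opens with a JavaScript assignment line before the JSON array, which is why import_tweets below discards the first line. A file head looks roughly like this:

Grailbird.data.tweets_2013_04 =
 [ {
  "created_at" : "Sat Apr 13 18:04:17 +0000 2013",
  "text" : "...",
  ...
 } ]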
#!/usr/bin/env python3
import calendar
import datetime
import os
import json

# Twitter's "created_at" format, e.g. "Sat Apr 13 18:04:17 +0000 2013".
timestamp_format = '%a %b %d %H:%M:%S %z %Y'

def twitter_timestamp_to_obj(time_string):
    'Returns a timezone-aware datetime object.'
    return datetime.datetime.strptime(time_string, timestamp_format)

def twitter_timestamp_to_unix(time_string):
    'Returns seconds since the UTC epoch for a Twitter "created_at" string.'
    timest = twitter_timestamp_to_obj(time_string)
    # calendar.timegm treats the tuple as UTC; time.mktime would assume local time.
    return calendar.timegm(timest.utctimetuple())
def import_tweets(tweetfile):
    'Parses one monthly archive file and returns its tweets, newest first.'
    with open(tweetfile) as File:
        firstline, linebuffer = True, []
        for line in File:
            # The first line is a JavaScript assignment, not JSON, so skip it.
            if firstline:
                firstline = False
                continue
            linebuffer.append(line.strip())
        tweet_objects = json.loads('\n'.join(linebuffer))
        tweet_objects = sorted(tweet_objects,
                               key=lambda x: twitter_timestamp_to_unix(x["created_at"]))
        return tweet_objects[::-1]
def aggregate_tweets(tweets_dir):
    'Runs through the monthly "20XX_YY.js" files Twitter provides in its archive and returns a single list containing all tweets.'
    all_tweets = []
    for x in sorted(os.listdir(tweets_dir)):
        # Ignore files that don't look like "2011_05.js" (ten characters, ".js" suffix).
        if not x.endswith(".js") or len(x) != 10:
            continue
        all_tweets.extend(import_tweets(os.path.join(tweets_dir, x)))
    return all_tweets
def add_unixtime_to_tweets(tweets_list):
    'Adds a new field to all tweets: "unix_timestamp", with seconds-since-utc-epoch.'
    for tweet in tweets_list:
        tweet['unix_timestamp'] = twitter_timestamp_to_unix(tweet["created_at"])
if __name__ == "__main__":
    import argparse
    ArgP = argparse.ArgumentParser(
        description=("A little script to concatenate JSON tweets provided by "
                     "Twitter, and to add a unix timestamp to each for ease of sorting "
                     "and comparison with other tools. Prints or saves to JSON."),
        epilog="by Cathal Garvey, released under GNU Affero GPL v3.")
    ArgP.add_argument("-d", "--directory", default=".",
                      help="Directory to seek monthly tweet archive files in. Default is current.")
    ArgP.add_argument("-o", "--output-file",
                      help="File to save output to rather than printing to stdout.")
    args = ArgP.parse_args()
    tweet_list = aggregate_tweets(args.directory)
    add_unixtime_to_tweets(tweet_list)
    if args.output_file:
        with open(args.output_file, "w") as SaveF:
            json.dump(tweet_list, SaveF, indent=1)
    else:
        print(json.dumps(tweet_list, indent=1))
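A brief usage sketch (the script filename and paths are illustrative, not part of the gist): run the script over the archive's monthly files, e.g. python3 concat_tweets.py -d data/js/tweets -o all_tweets.json, after which the added unix_timestamp field makes downstream filtering straightforward:

import calendar
import datetime
import json

# Load the concatenated output written by the script above (hypothetical filename).
with open("all_tweets.json") as f:
    tweets = json.load(f)

# Keep only tweets from 2013 onward, using the added unix_timestamp field.
cutoff = calendar.timegm(datetime.datetime(2013, 1, 1).timetuple())
recent = [t for t in tweets if t["unix_timestamp"] >= cutoff]
print(len(recent), "tweets since 2013-01-01 UTC")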