Created
December 14, 2015 07:22
-
-
Save jsundram/3d99b0d4b65f8ed639d6 to your computer and use it in GitHub Desktop.
compress some data for tweety
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
metadata: | |
"id": "55677993", | |
"name": "Jason Sundram", | |
"numFollowers": 2010, | |
"numTweets": 3345, | |
"screen_name": "jsundram", | |
tweets: | |
'148125787172765696', { | |
u'c': u'Sat Dec 17 19:41:54 +0000 2011', | |
"h": [ | |
"cocktail", | |
"fact" | |
], | |
u'name': u'jsundram', | |
u'retweet': {u'id': u'147479157947891712', u'name': u'somebitsLinks'}, | |
u'stats': {u'favorites': 0, u'retweets': 11}, | |
u'text': u'RT @somebitsLinks: Perceived color brightness: A little color theory for you http://t.co/U7DhlOxq', | |
u'um': [{u'name': u'somebitsLinks', u'user_id': 358426597}]} | |
extract to global metadata | |
=> | |
tweet_id, time, hashtags, faves, retweets, text, mentions | |
""" | |
import csv | |
from datetime import datetime | |
import json | |
def parse_time(t):
    """Convert a Twitter created-at string to integer seconds since the Unix epoch."""
    parsed = datetime.strptime(t, "%a %b %d %H:%M:%S +0000 %Y")
    since_epoch = parsed - datetime(1970, 1, 1)
    # strptime without %f always yields microsecond == 0, so days + seconds
    # is the exact whole-second offset.
    return since_epoch.days * 86400 + since_epoch.seconds
def flatten(tweet_id, tweet):
    """Turn one (id, tweet-dict) pair into a flat CSV row plus its raw mentions.

    Returns (row, mentions) where row matches the header order:
    tweet_id, time, hashtags, faves, retweets, text, mentions.
    """
    mentions = tweet.get('um', [])
    stats = tweet['stats']
    row = [
        tweet_id,
        parse_time(tweet['c']),
        json.dumps(tweet.get('h', [])),
        stats['favorites'],
        stats['retweets'],
        tweet['text'].encode('utf-8'),
        # Usernames live once in the metadata table, so rows carry only the
        # numeric user ids.
        json.dumps([m['user_id'] for m in mentions]),
    ]
    return row, mentions
def process(filename, metadata, out_filename='jsundram_tweets.csv'):
    """Read one user's tweet dump (JSON) and write a flattened CSV of tweets.

    Also records user metadata into the shared `metadata` dict as
    id -> [name, followers, tweets, screen_name]; mentioned users for whom
    we have no dump get placeholder counts of -1.

    :param filename: path to the source JSON file.
    :param metadata: dict updated in place with user-metadata rows.
    :param out_filename: destination CSV path (defaults to the previously
        hard-coded name, so existing callers are unaffected).
    """
    with open(filename) as f:
        data = json.load(f)
    metadata[data['id']] = [
        data['name'],
        data['numFollowers'],
        data['numTweets'],
        data['screen_name'],
    ]
    # NOTE(review): this script targets Python 2 (flatten() emits utf-8
    # bytes); under Python 3 the CSV should be opened with newline='' and
    # receive str, not bytes — confirm before porting.
    with open(out_filename, 'w') as f:
        w = csv.writer(f)
        w.writerow('tweet_id, time, hashtags, faves, retweets, text, mentions'.split(', '))
        # .items() works on Python 2 and 3 alike; .iteritems() is py2-only.
        for tweet_id, tweet in data['tweets'].items():
            row, mentions = flatten(tweet_id, tweet)
            w.writerow(row)
            # Update metadata; it is written separately by main(). -1 marks
            # unknown follower/tweet counts for users we only saw mentioned.
            for m in mentions:
                if m['user_id'] not in metadata:
                    metadata[m['user_id']] = ['', -1, -1, m['name']]
def main():
    """Flatten the tweet dump, then write the shared user-metadata table."""
    metadata = {}
    process('data/jsundram-min.json', metadata)
    with open('jsundram-metadata.csv', 'w') as f:
        w = csv.writer(f)
        w.writerow('id, name, followers, tweets, screenname'.split(', '))
        # .items() works on Python 2 and 3 alike; .iteritems() is py2-only.
        # NOTE(review): keys mix a str id with int user_ids; sorted() over
        # mixed types works on Python 2 but raises TypeError on Python 3 —
        # confirm before porting (e.g. use key=str).
        for (k, v) in sorted(metadata.items()):
            w.writerow([k] + v)
    # on the js side, just use queue.js and wait for jsundram-min.csv and
    # jsundram-metadata.csv; 50% smaller


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment