Skip to content

Instantly share code, notes, and snippets.

@jsundram
Created December 14, 2015 07:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jsundram/3d99b0d4b65f8ed639d6 to your computer and use it in GitHub Desktop.
Save jsundram/3d99b0d4b65f8ed639d6 to your computer and use it in GitHub Desktop.
compress some data for tweety
"""
metadata:
"id": "55677993",
"name": "Jason Sundram",
"numFollowers": 2010,
"numTweets": 3345,
"screen_name": "jsundram",
tweets:
'148125787172765696', {
u'c': u'Sat Dec 17 19:41:54 +0000 2011',
"h": [
"cocktail",
"fact"
],
u'name': u'jsundram',
u'retweet': {u'id': u'147479157947891712', u'name': u'somebitsLinks'},
u'stats': {u'favorites': 0, u'retweets': 11},
u'text': u'RT @somebitsLinks: Perceived color brightness: A little color theory for you http://t.co/U7DhlOxq',
u'um': [{u'name': u'somebitsLinks', u'user_id': 358426597}]}
extract to global metadata
=>
tweet_id, time, hashtags, faves, retweets, text, mentions
"""
import csv
from datetime import datetime
import json
def parse_time(t):
    """Convert a Twitter-style timestamp string to integer seconds since the Unix epoch."""
    epoch = datetime(1970, 1, 1)
    parsed = datetime.strptime(t, "%a %b %d %H:%M:%S +0000 %Y")
    elapsed = parsed - epoch
    return int(elapsed.total_seconds())
def flatten(tweet_id, tweet):
    """Flatten one tweet dict into a CSV row.

    Returns (row, mentions): row is
    [tweet_id, time, hashtags_json, faves, retweets, text, mention_ids_json],
    mentions is the raw 'um' mention list so the caller can fold the
    user_id -> username mapping into the global metadata.
    """
    mentions = tweet.get('um', [])
    row = [
        tweet_id,
        parse_time(tweet['c']),
        json.dumps(tweet.get('h', [])),
        tweet['stats']['favorites'],
        tweet['stats']['retweets'],
        # Python 3 fix: the csv module takes str directly; encoding to
        # bytes here would write the b'...' repr into the file.
        tweet['text'],
        # store userid to username mapping just once, not in every row;
        # write user_ids list only
        json.dumps([m['user_id'] for m in mentions]),
    ]
    return row, mentions
def process(filename, metadata):
    """Read one user's tweet dump from `filename`, write the flattened
    tweets to jsundram_tweets.csv, and record user info into `metadata`
    (mutated in place; keyed by user id, value is
    [name, followers, tweets, screen_name]).
    """
    with open(filename) as f:
        data = json.load(f)

    metadata[data['id']] = [
        data['name'],
        data['numFollowers'],
        data['numTweets'],
        data['screen_name'],
    ]

    # newline='' keeps the Python 3 csv module from emitting blank rows
    # on platforms that translate line endings.
    with open('jsundram_tweets.csv', 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow('tweet_id, time, hashtags, faves, retweets, text, mentions'.split(', '))
        # Python 3 fix: dict.iteritems() no longer exists; items() is lazy enough.
        for tweet_id, tweet in data['tweets'].items():
            row, mentions = flatten(tweet_id, tweet)
            w.writerow(row)
            # Update metadata; we will write that separately. Mentioned
            # users we have no stats for get -1 placeholder counts.
            for m in mentions:
                if m['user_id'] not in metadata:
                    metadata[m['user_id']] = ['', -1, -1, m['name']]
def main():
    """Drive the pipeline: flatten the tweet dump, then write the
    collected user metadata as a second CSV."""
    metadata = {}
    process('data/jsundram-min.json', metadata)
    with open('jsundram-metadata.csv', 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow('id, name, followers, tweets, screenname'.split(', '))
        # Python 3 fix: keys mix str ids (the dump's own 'id') with int
        # user_ids from mentions; comparing them directly raises TypeError,
        # so sort on the string form of the key instead.
        for user_id, fields in sorted(metadata.items(), key=lambda kv: str(kv[0])):
            w.writerow([user_id] + fields)
    # on the js side, just use queue.js and wait for jsundram-min.csv
    # and jsundram-metadata.csv; 50% smaller


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment