Skip to content

Instantly share code, notes, and snippets.

@cholick

cholick/parse.py Secret

Created Sep 30, 2017
Embed
What would you like to do?
Parse twitter archive
import csv
import html
import datetime
import functools
# Print some stats around a Twitter archive https://support.twitter.com/articles/20170160
class Tweet:
    """One row of the ``tweets.csv`` file from a Twitter archive.

    The row is read purely by column position (indices 0 through 9), so it
    must have at least ten fields. Every attribute keeps the raw string from
    the row except ``timestamp`` (parsed into a ``datetime``) and ``text``
    (HTML entities such as ``&amp;`` are unescaped).
    """

    # Layout of the timestamp column. The "+0000" suffix is matched as
    # literal text, so the resulting datetime carries no tzinfo.
    TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S +0000"

    def __init__(self, row):
        """Populate the tweet from ``row``, an indexable sequence of strings.

        Raises:
            IndexError: if ``row`` has fewer than ten fields.
            ValueError: if field 3 does not match ``TIMESTAMP_FORMAT``.
        """
        self.tweet_id = row[0]
        self.in_reply_to_status_id = row[1]
        self.in_reply_to_user_id = row[2]
        self.timestamp = datetime.datetime.strptime(row[3], self.TIMESTAMP_FORMAT)
        self.source = row[4]
        self.text = html.unescape(row[5])
        self.retweeted_status_id = row[6]
        self.retweeted_status_user_id = row[7]
        self.retweeted_status_timestamp = row[8]
        self.expanded_urls = row[9]

    def __repr__(self):
        return "{}(tweet_id={!r}, timestamp={!r})".format(
            type(self).__name__, self.tweet_id, self.timestamp
        )

    def include(self):
        """Return True when the tweet should be counted in the statistics.

        A tweet is excluded when any reply field is non-empty, when any
        retweet field is non-empty, or when its text starts with ``@``.
        """
        reply_fields = (
            self.in_reply_to_status_id,
            self.in_reply_to_user_id,
        )
        retweet_fields = (
            self.retweeted_status_id,
            self.retweeted_status_timestamp,
            self.retweeted_status_user_id,
        )
        if any(value != '' for value in reply_fields):
            return False
        if any(value != '' for value in retweet_fields):
            return False
        # Text beginning with "@" is excluded even when the reply columns
        # are empty.
        return not self.text.startswith('@')
# Group the text length of every included tweet by the year it was posted.
# newline='' is what the csv module requires so that line breaks inside quoted
# fields are parsed correctly; the explicit encoding keeps the read independent
# of the platform default (assumes the archive CSV is UTF-8 -- TODO confirm).
years = {}
with open('tweets.csv', newline='', encoding='utf-8') as raw_tweets:
    rows = csv.reader(raw_tweets)
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(rows, None)
    for raw_tweet in rows:
        tweet = Tweet(raw_tweet)
        if tweet.include():
            years.setdefault(tweet.timestamp.year, []).append(len(tweet.text))

# One output line per year, in the order years were first seen in the file:
# year, tweet count, mean length, share of tweets with length >= 135, and
# share with length exactly 140.
for year, lengths in years.items():
    len_all = len(lengths)
    average = sum(lengths) / len_all
    len_140 = sum(1 for length in lengths if length == 140)
    len_135 = sum(1 for length in lengths if length >= 135)
    print(
        year, len_all,
        "{:.0f}".format(average),
        "{:.0f}%".format(100 * len_135 / len_all),
        "{:.0f}%".format(100 * len_140 / len_all)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment