Last active
October 16, 2016 00:35
-
-
Save edsu/ca3260c7ee050206d4a3071448f43836 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
follower_counts.py reads a batch of line oriented twitter data and | |
tracks the follower counts for supplied users over time. It is useful | |
if you have a collection of twitter data and you are interested in seeing | |
how their number of followers grows and drops over time. | |
So if you collected twitter data during a presidential campaign you could do | |
something like this: | |
./follower_counts --user trump,hillary tweets.json > report.csv | |
or if you wanted to see how their followers changed during a debate you can | |
report the stats every minute: | |
./follower_counts --granularity minute --user trump,hillary tweets.json > report.csv | |
""" | |
import csv | |
import sys | |
import json | |
import gzip | |
import time | |
import logging | |
import datetime | |
import optparse | |
import dateutil.parser | |
def add_counts(counts, dt, t):
    """Record the tweet author's follower count for time bucket dt.

    Keeps the highest followers_count observed for each (user, dt) pair,
    so a user's number for a bucket only ever grows within that bucket.
    Mutates counts in place and also returns it.

    counts -- accumulator dict: {screen_name: {dt: followers_count}}
    dt     -- time bucket string (already formatted at the chosen granularity)
    t      -- decoded tweet dict with a 'user' object
    """
    user = t['user']['screen_name']
    followers_count = t['user']['followers_count']
    buckets = counts.setdefault(user, {})
    if dt not in buckets:
        # log only the first observation for this user/bucket
        logging.info("%s %s %s", dt, user, followers_count)
        # record the first value unconditionally: the old `> get(dt, 0)`
        # comparison silently dropped a legitimate count of 0
        buckets[dt] = followers_count
    elif followers_count > buckets[dt]:
        buckets[dt] = followers_count
    return counts
def process_file(fh, dt_format, users, counts):
    """Read line-oriented tweet JSON from fh and accumulate follower counts.

    For each tweet whose author (or retweeted author) is in users --
    or every tweet when users is empty -- add its follower count to
    counts via add_counts, bucketed by created_at formatted with
    dt_format. Mutates counts in place and also returns it.

    fh        -- iterable of JSON lines (plain or gzip file object)
    dt_format -- strftime format defining the time-bucket granularity
    users     -- list of lowercased screen names to track ([] = track all)
    counts    -- accumulator dict passed through to add_counts
    """
    try:
        for line in fh:
            try:
                tweet = json.loads(line)
            except ValueError:
                # skip truncated or otherwise non-JSON lines
                # (was a bare except:, which also swallowed KeyboardInterrupt)
                continue
            if 'user' not in tweet or 'created_at' not in tweet:
                # skip delete notices and other non-tweet stream messages,
                # which previously raised KeyError
                continue
            dt = dateutil.parser.parse(tweet['created_at']).strftime(dt_format)
            if not users or tweet['user']['screen_name'].lower() in users:
                add_counts(counts, dt, tweet)
            rt = tweet.get('retweeted_status')
            if rt and (not users or rt['user']['screen_name'].lower() in users):
                # the embedded original tweet carries its author's count too
                add_counts(counts, dt, rt)
    except IOError:
        pass  # a gzip file that was cut off raises IOError mid-iteration
    return counts
def minmax(counts):
    """Return the (earliest, latest) time-bucket strings seen in counts.

    Bucket strings are strftime output at a fixed granularity, so plain
    string comparison orders them chronologically. Returns (None, None)
    when counts holds no buckets at all.
    """
    all_buckets = [bucket for per_user in counts.values() for bucket in per_user]
    if not all_buckets:
        return (None, None)
    return (min(all_buckets), max(all_buckets))
def write_csv(filename, dt_format, dt_incr, counts):
    """Write the accumulated counts as a CSV time series.

    One row per time bucket from the earliest to the latest bucket seen,
    stepping by dt_incr; one column per user (sorted by screen name).
    Cells are empty when a user had no tweet in that bucket.

    filename  -- output path, or a falsy value to write to stdout
    dt_format -- strftime format the buckets were created with
    dt_incr   -- datetime.timedelta between consecutive rows
    counts    -- {screen_name: {bucket: followers_count}}
    """
    min_day, max_day = minmax(counts)
    if min_day is None:
        return  # nothing was counted; previously this crashed parsing None
    # bucket strings were produced by strftime(dt_format), so strptime
    # round-trips them exactly -- no need for dateutil here
    day = datetime.datetime.strptime(min_day, dt_format)
    last = datetime.datetime.strptime(max_day, dt_format)
    # dict keys are a view in Python 3; the old keys()/.sort() combo broke
    users = sorted(counts)
    if filename:
        # newline="" is the documented mode for csv files; close on the way out
        out = open(filename, "w", newline="")
    else:
        out = sys.stdout
    try:
        writer = csv.writer(out)
        writer.writerow(["time"] + users)
        while day <= last:
            bucket = day.strftime(dt_format)
            writer.writerow([bucket] + [counts[u].get(bucket) for u in users])
            day += dt_incr
    finally:
        if filename:
            out.close()
if __name__ == "__main__":
    # Command line: ./follower_counts.py [-u a,b] [-g day|hour|minute]
    #               [-c out.csv] [-l log] file [file ...]
    parser = optparse.OptionParser()
    parser.add_option("-u", "--user", dest="user")
    parser.add_option("-g", "--granularity", dest="granularity", default="day", choices=["day", "hour", "minute"])
    parser.add_option("-c", "--csv", dest="csv")
    parser.add_option("-l", "--log", dest="log", default="follower_counts.log")
    (options, files) = parser.parse_args()
    # screen names are compared case-insensitively; [] means track everyone
    if options.user:
        users = options.user.lower().split(",")
    else:
        users = []
    if len(files) == 0:
        parser.error("you must supply one or more filenames")
    logging.basicConfig(filename=options.log, level=logging.INFO)
    counts = {}
    # the granularity picks both the bucket key format and the row step
    # used when writing the CSV; the two must stay in sync
    if options.granularity == "day":
        dt_format = "%Y-%m-%d"
        dt_incr = datetime.timedelta(days=1)
    elif options.granularity == "hour":
        dt_format = "%Y-%m-%d %H:00:00"
        dt_incr = datetime.timedelta(hours=1)
    elif options.granularity == "minute":
        dt_format = "%Y-%m-%d %H:%M:00"
        dt_incr = datetime.timedelta(minutes=1)
    # accumulate across every input file into one counts dict
    for filename in files:
        logging.info("processing %s", filename)
        if filename.endswith(".gz"):
            fh = gzip.open(filename)
        else:
            fh = open(filename)
        # NOTE(review): fh is never closed; fine for a one-shot script but
        # worth a `with` if this grows
        process_file(fh, dt_format, users, counts)
    write_csv(options.csv, dt_format, dt_incr, counts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment