#!/usr/bin/env python
"""
follower_counts.py reads a batch of line oriented twitter data and
tracks the follower counts for supplied users over time. It is useful
if you have a collection of twitter data and you are interested in seeing
how their number of followers grows and drops over time.
So if you collected twitter data during a presidential campaign you could do
something like this:
./follower_counts --user trump,hillary tweets.json > report.csv
or if you wanted to see how their followers changed during a debate you can
report the stats every minute:
./follower_counts --granularity minute --user trump,hillary tweets.json > report.csv
"""
import csv
import sys
import json
import gzip
import logging
import datetime
import optparse
import dateutil.parser

def add_counts(counts, dt, t):
    # Remember the highest follower count seen for this user in this time slot.
    user = t['user']['screen_name']
    followers_count = t['user']['followers_count']
    if user not in counts:
        counts[user] = {}
    if dt not in counts[user]:
        logging.info("%s %s %s", dt, user, followers_count)
    if followers_count > counts[user].get(dt, 0):
        counts[user][dt] = followers_count
    return counts

def process_file(fh, dt_format, users, counts):
    try:
        for line in fh:
            try:
                tweet = json.loads(line)
            except ValueError:
                # skip lines that aren't valid JSON
                continue
            dt = dateutil.parser.parse(tweet['created_at']).strftime(dt_format)
            if not users or tweet['user']['screen_name'].lower() in users:
                add_counts(counts, dt, tweet)
            # a retweet also carries a snapshot of the original author's profile
            if 'retweeted_status' in tweet and (not users or tweet['retweeted_status']['user']['screen_name'].lower() in users):
                add_counts(counts, dt, tweet['retweeted_status'])
    except (IOError, EOFError):
        pass  # can happen when a gzip file was cut off
    return counts

def minmax(counts):
    min_day = None
    max_day = None
    for user in counts.keys():
        for day in counts[user].keys():
            if not min_day or day < min_day:
                min_day = day
            if not max_day or day > max_day:
                max_day = day
    return (min_day, max_day)

def write_csv(filename, dt_format, dt_incr, counts):
    # Write one row per time slot between the earliest and latest slot seen,
    # with a column of follower counts for each user (sorted by screen name).
    min_day, max_day = minmax(counts)
    day = dateutil.parser.parse(min_day)
    max_day = dateutil.parser.parse(max_day)
    users = sorted(counts.keys())
    if filename:
        data = csv.writer(open(filename, "w"))
    else:
        data = csv.writer(sys.stdout)
    data.writerow(["time"] + users)
    while day <= max_day:
        row = [day.strftime(dt_format)]
        for user in users:
            row.append(counts[user].get(day.strftime(dt_format)))
        data.writerow(row)
        day += dt_incr
if __name__ == "__main__":
parser = optparse.OptionParser()
parser.add_option("-u", "--user", dest="user")
parser.add_option("-g", "--granularity", dest="granularity", default="day", choices=["day", "hour", "minute"])
parser.add_option("-c", "--csv", dest="csv")
parser.add_option("-l", "--log", dest="log", default="follower_counts.log")
(options, files) = parser.parse_args()
if options.user:
users = options.user.lower().split(",")
else:
users = []
if len(files) == 0:
parser.error("you must supply one or more filenames")
logging.basicConfig(filename=options.log, level=logging.INFO)
counts = {}
if options.granularity == "day":
dt_format = "%Y-%m-%d"
dt_incr = datetime.timedelta(days=1)
elif options.granularity == "hour":
dt_format = "%Y-%m-%d %H:00:00"
dt_incr = datetime.timedelta(hours=1)
elif options.granularity == "minute":
dt_format = "%Y-%m-%d %H:%M:00"
dt_incr = datetime.timedelta(minutes=1)
for filename in files:
logging.info("processing %s", filename)
if filename.endswith(".gz"):
fh = gzip.open(filename)
else:
fh = open(filename)
process_file(fh, dt_format, users, counts)
write_csv(options.csv, dt_format, dt_incr, counts)
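
# Example of the resulting report (day granularity, illustrative numbers):
# the first column is the time slot, followed by one column per user in
# sorted order; cells are left empty for slots where a user was not seen.
#
#   time,hillary,trump
#   2016-09-26,9100000,11900000
#   2016-09-27,,11950000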