Skip to content

Instantly share code, notes, and snippets.

@collina
Last active December 22, 2015 01:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save collina/6395530 to your computer and use it in GitHub Desktop.
Save collina/6395530 to your computer and use it in GitHub Desktop.
Code to Process Tor Project's Direct Connecting User Statistics
#-*- coding: utf-8 -*-
#
# :authors: Collin Anderson
# :license: CC0
# :dataset: https://metrics.torproject.org/csv/direct-users.csv
# :example: python direct-users-transform.py direct-users.csv
import csv, sys, datetime
import pylab
import matplotlib
import matplotlib.pyplot as pyplot
import matplotlib.dates as mdates
import pycountry
# --- Tunable settings -------------------------------------------------------
# Plot window: only dates strictly between these two bounds are graphed.
startdate = datetime.datetime(2013, 8, 13)
enddate = datetime.datetime(2014, 8, 30)
# When True, plot each country's growth relative to its first value in the
# window; when False, plot the raw user counts.
cumulative = False
# A country is plotted only if at least one of its values exceeds this.
threshold = 1000
# Restrict the plot to these upper-case country codes, e.g. ['SY'];
# None means every country is considered.
countries = None
# Read the CSV named on the command line into ``dictionary``: one entry per
# data row, keyed by that row's 'date' column, holding the remaining columns
# as a {header: raw string value} dict.
if len(sys.argv) < 2:
    sys.exit("usage: python %s direct-users.csv" % sys.argv[0])
dictionary = {}
rdictionary = {}  # filled in later by the pivot step
# The csv module wants a binary file on Python 2 and a text file opened with
# newline='' on Python 3.
if sys.version_info[0] < 3:
    csv_in_file = open(sys.argv[1], 'rb')
else:
    csv_in_file = open(sys.argv[1], 'r', newline='')
# The context manager closes the file once every row has been consumed.
with csv_in_file:
    linereader = csv.reader(csv_in_file, delimiter=',', quotechar='|')
    # The first row names the columns ('date' plus one column per series).
    headers = next(linereader)
    for line in linereader:
        if not line:
            continue  # csv yields [] for blank lines; they carry no 'date'
        pline = dict(zip(headers, line))
        key = pline.pop('date')
        dictionary[key] = pline
# pivot the table: dictionary[date string][cc] -> rdictionary[cc][datetime]
for day_text, row in dictionary.items():
    day = datetime.datetime.strptime(day_text, "%Y-%m-%d")
    for cc, measurement in row.items():
        # Missing measurements ('NA', None or an empty cell) count as zero.
        if measurement in ['NA', None, 0, '0', '']:
            measurement = 0
        rdictionary.setdefault(cc, {})[day] = float(measurement)
# The x axis is taken from one country's series, keeping only the dates that
# fall strictly between startdate and enddate, in ascending order.
# NOTE(review): this assumes every country has the same set of dates - confirm.
# An empty table yields an empty list rather than raising.
first_series = next(iter(rdictionary.values()), {})
rdates = sorted(k for k in first_series if startdate < k < enddate)
# Default font settings handed to matplotlib: serif family, normal weight,
# size 9.
font = dict(family='serif', weight='normal', size=9)
matplotlib.rc('font', **font)
# One figure holding a single axes; the axes object is bound to ``plot``.
figure, plot = pyplot.subplots()
# Draw one line per country that passes the country filter and the threshold.
for cc, records in rdictionary.items():
    if countries is not None and cc.upper() not in countries:
        continue
    # Prefer the country's full name as its label; fall back to the raw code
    # when pycountry has no entry (signalled by KeyError, or by a None result
    # whose .name access raises AttributeError).
    # NOTE(review): newer pycountry releases appear to call this field
    # alpha_2 rather than alpha2 - confirm against the installed version.
    try:
        label = pycountry.countries.get(alpha2=cc.upper()).name
    except (KeyError, AttributeError):
        label = cc
    # rdates is already limited to the plot window, so no second filter is
    # needed here.
    records = [records[k] for k in rdates]
    # Skip the 'all' aggregate and any country that never exceeds the
    # threshold inside the window.
    if cc in ['all'] or not any(r > threshold for r in records):
        print('Skipping %s' % label)
        continue
    if cumulative is True:
        # Growth relative to the first value in the window. A zero is treated
        # as one user (Tor doesn't report < 8), which also keeps the baseline
        # from ever being zero.
        baseline = records[0] or 1
        records = [((r or 1) - baseline) / baseline for r in records]
    plot.plot(rdates, records, label=label)
    # Write the label next to the line's last data point.
    ann_xy = (mdates.date2num(rdates[-1]), records[-1])
    plot.annotate(label, ann_xy, size=7)
# Format the axes, title the figure and display it.
import matplotlib.ticker  # used below; don't rely on pyplot having loaded it
Xs = pylab.gca()
dt_format = '%b %d'
Xs.xaxis.set_major_formatter(mdates.DateFormatter(dt_format))
if cumulative is True:
    # Plotted values are fractions (0.5 == 50%), so show ticks as percentages.
    Xs.yaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda x, p: '%.0f%%' % (x * 100)))
pylab.grid(True)
# Title with the first and last plotted dates; when the window holds no data
# fall back to the configured bounds instead of raising IndexError.
if rdates:
    title_from, title_to = rdates[0], rdates[-1]
else:
    title_from, title_to = startdate, enddate
pylab.suptitle(
    "Directly Connecting Users for Tor: %s to %s"
    % (title_from.strftime('%Y-%m-%d'), title_to.strftime('%Y-%m-%d')),
    size=16)
# Ticks are rendered with dt_format (abbreviated month name plus day), so the
# axis label describes that rather than a numeric mm/dd form.
pylab.xlabel("Time (month day)")
pylab.ylabel("Directly Connecting Users")
pylab.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment