Skip to content

Instantly share code, notes, and snippets.

@ketralnis
Created July 12, 2011 06:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ketralnis/1077498 to your computer and use it in GitHub Desktop.
Save ketralnis/1077498 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2.7
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt # For plotting graphs.
from contextlib import contextmanager
from collections import namedtuple
import urllib
import json
import os, os.path
import random
from votemining import to36
class data(object):
    """Iterator over the rows of a comma-separated text file.

    Each line is stripped and split on ','.  When ``types`` is non-empty,
    every field is converted with the callable at the same position in
    ``types``; ``zip`` does the pairing, so a row is truncated to the shorter
    of the two sequences.  Rows are returned as lists.

    The underlying file is closed as soon as the last line has been read.
    Iterating again after that (or after the file was closed by hand) simply
    stops instead of failing on the closed file.
    """

    def __init__(self, f, types):
        """Open the file at path ``f``.

        ``types`` is a sequence of per-column converters such as
        ``(int, float)``, or a falsy value to get the raw strings.
        """
        self.f = open(f)
        self.i = iter(self.f)
        self.types = types

    def __iter__(self):
        return self

    def next(self):
        """Return the next row as a list; raise StopIteration at end of file.

        Whatever exception a converter raises on a malformed field is
        propagated unchanged.
        """
        if self.f.closed:
            # Already exhausted (or closed by the caller): behave like any
            # finished iterator rather than raising ValueError.
            raise StopIteration
        try:
            # The builtin next() works on both Python 2 and Python 3 file
            # iterators, unlike the Python-2-only ``.next()`` method.
            line = next(self.i)
        except StopIteration:
            self.f.close()
            raise
        fields = line.strip().split(',')
        if self.types:
            fields = [convert(raw) for (convert, raw) in zip(self.types, fields)]
        return fields

    # Python 3 spells the iterator protocol method ``__next__``.
    __next__ = next

    def all(self):
        """Read every remaining row into a list, closing the file afterwards."""
        try:
            return list(self)
        finally:
            # Release the handle even when a conversion error aborts the read.
            self.f.close()
def memoize(fn):
    """Decorator caching ``fn``'s results keyed on its positional arguments.

    The cache is an unbounded dict that lives as long as the returned
    wrapper.  Only positional arguments are accepted, and they must be
    hashable (an unhashable argument raises TypeError from the dict lookup).
    The wrapper carries over ``fn``'s name and docstring where available.
    """
    import functools

    memo = {}

    def _fn(*a):
        # ``a`` is already a tuple, so it can be used as the cache key as-is.
        try:
            return memo[a]
        except KeyError:
            ret = memo[a] = fn(*a)
            return ret

    try:
        functools.update_wrapper(_fn, fn)
    except AttributeError:
        # On Python 2, callables lacking __name__/__doc__ make update_wrapper
        # fail; memoization itself does not need the metadata.
        pass
    return _fn
@memoize
def reddit_title(linkid):
    """Fetch the title of the link ``linkid`` from reddit's by_id JSON endpoint.

    ``linkid`` is passed through ``to36`` (presumably a base-36 encoder;
    confirm in ``votemining``) to build the ``t3_...`` name used in the URL.
    Returns the title string, or '(unknown)' when the response does not
    contain a link entry.  Results are memoized, so each id is fetched at
    most once per run.  Network errors from ``urlopen`` propagate.
    """
    id36 = to36(linkid)
    url = 'http://www.reddit.com/by_id/t3_%s.json' % id36
    response = urllib.urlopen(url)
    try:
        js = json.loads(response.read())
    finally:
        # Do not leave the HTTP connection open once the body is read.
        response.close()
    try:
        title = js['data']['children'][0]['data']['title']
    except (IndexError, KeyError):
        # Empty listing (IndexError) or a payload without the expected keys,
        # e.g. an error document (KeyError).
        return '(unknown)'
    # Single-argument form prints the same text on Python 2 and 3.
    print('%s -> %s' % (linkid, title))
    return title
@contextmanager
def csvplot(fname, pngname, types=()):
    """Context manager pairing a CSV reader with the shared pyplot figure.

    Yields ``(d, plt)`` where ``d`` is a ``data`` iterator over ``fname``
    (rows converted with ``types``; the empty default leaves fields as
    strings) and ``plt`` is ``matplotlib.pyplot``.  When the body finishes
    without error, the figure is written to ``data/pngs/<pngname>.png``;
    that directory is not created here.  The figure is cleared and the CSV
    file closed in every case, so a failed plot cannot leak into the next.
    """
    d = data(fname, types)
    try:
        yield (d, plt)
        # Only reached when the body did not raise: never save a half-drawn plot.
        plt.savefig('data/pngs/' + pngname + '.png')
    finally:
        # The reader only closes itself when fully consumed; make sure of it.
        d.f.close()
        plt.clf()
@contextmanager
def plot(pngname):
    """Context manager yielding pyplot and saving the figure on success.

    Yields ``matplotlib.pyplot``.  When the body finishes without error, the
    current figure is written to ``data/pngs/<pngname>.png`` (the directory
    is not created here).  The figure is cleared in every case, so artists
    from a failed plot do not show up in the next one.
    """
    try:
        yield plt
        # Only reached when the body did not raise.
        plt.savefig('data/pngs/' + pngname + '.png')
    finally:
        plt.clf()
# Pie chart of vote directions.  Each CSV row is (direction, value) with the
# direction parsed as an int (-1, 0 or 1) and the value as a float.
with csvplot('data/dirhist.csv', 'dirhist', (int, float)) as (d, plt):
    plt.title('Vote directions')
    # Two-element rows turn straight into a direction -> value mapping.
    dirs = dict(d)
    # A missing direction raises KeyError, exactly as a direct lookup would.
    downs, nones, ups = [dirs[direction] for direction in (-1, 0, 1)]
    plt.pie(
        [downs, nones, ups],
        labels=['downs', 'nones', 'ups'],
        autopct='%1.1f%%',
    )
# Bar chart of vote counts per hour.  Each CSV row is (hour, count), both ints.
with csvplot('data/timeofdayhist.csv', 'timeofdayhist', (int, int)) as (d, plt):
    plt.title('Number of votes per hour')
    plt.ylabel('# votes')
    plt.xlabel('hour in GMT')
    d = list(d)
    hour_values = [row[0] for row in d]
    vote_counts = [row[1] for row in d]
    # The tick offsets and x-limits below assume each bar starts at its x
    # value.  matplotlib >= 2.0 centres bars by default, so ask for edge
    # alignment explicitly.
    plt.bar(hour_values, vote_counts, align='edge')
    plt.xticks([hour + 0.5 for hour in hour_values], hour_values)
    if d:
        # Span from the first hour to one past the last; skipped for an empty
        # file, where indexing would raise IndexError.
        plt.xlim(d[0][0], d[-1][0] + 1)
def _bucket_bars(rows):
    """Turn (bucket_min, count) rows into bar data for a bucketed histogram.

    Consecutive rows are paired: each bucket is labelled with its own minimum
    and the next row's minimum.  The final row therefore only supplies the
    upper bound of the bucket before it; its own count is not plotted.  A
    minimum of -1 is labelled '<=0'.

    Returns ``(xs, ys, ticks, labels)``: bar positions 0..n-1, bar heights,
    tick positions (bar position + 0.5) and tick labels.
    """
    xs = []
    ys = []
    ticks = []
    labels = []
    for n, ((thismin, thiscount), (nextmin, _)) in enumerate(zip(rows, rows[1:])):
        xs.append(n)
        ys.append(thiscount)
        ticks.append(n + 0.5)
        if thismin == -1:
            labels.append('<=0')
        else:
            labels.append('(%s..%s]' % (thismin, nextmin))
    return xs, ys, ticks, labels


def _plot_bucket_histogram(plt, rows):
    """Draw the bucketed bar chart for ``rows`` on ``plt`` with rotated labels."""
    xs, ys, ticks, labels = _bucket_bars(list(rows))
    # Tick positions assume bars start at their x value; matplotlib >= 2.0
    # centres bars by default, so request edge alignment explicitly.
    plt.bar(xs, ys, align='edge')
    plt.xticks(ticks, labels, rotation=30, size='small')


with csvplot('data/scorehist.csv', 'scorehist', (int, int)) as (d, plt):
    plt.title('Histogram of score per link')
    plt.ylabel('# links')
    plt.xlabel('score ranges')
    _plot_bucket_histogram(plt, d)

with csvplot('data/numvoteshist.csv', 'numvoteshist', (int, int)) as (d, plt):
    plt.title('Histogram of number of votes received per link')
    plt.ylabel('# links')
    plt.xlabel('# of votes')
    # A '<=0' bucket (minimum of -1) shouldn't happen for vote counts, but
    # the shared helper labels it the same way if it does.
    _plot_bucket_histogram(plt, d)
# Index the per-link score files so links can be picked for analysis below.
# Layout: data/scorebytime/<subdir>/<linkid>.<ext>; ``links`` maps the
# integer link id (the file name up to its first '.') to the file's path.
d = 'data/scorebytime'
links = {}
for subdir_name in os.listdir(d):
    subdir_path = os.path.join(d, subdir_name)
    for file_name in os.listdir(subdir_path):
        link_id = int(file_name.partition('.')[0])
        links[link_id] = os.path.join(subdir_path, file_name)
# Ten plots, each overlaying the score progression of up to 100 random links.
for x in range(10):
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size on small datasets.
    picks = random.sample(list(links), min(100, len(links)))
    with plot('randommulti_%d' % x) as plt:
        plt.title('Progression of scores of some random links over time')
        plt.xlabel('timestamp')
        plt.ylabel('score')
        # plt.plot takes alternating x and y sequences: x1, y1, x2, y2, ...
        graphs = []
        for pick in picks:
            xs = []
            ys = []
            for timestamp, score in data(links[pick], (float, int)):
                xs.append(timestamp)
                ys.append(score)
            graphs.extend([xs, ys])
        # TODO: titles (and possibly a legend)
        plt.plot(*graphs)
# The 15 links with the biggest score files on disk, used here as the links
# with the largest numbers of votes; each gets a plot of its own.
links_by_size = sorted(
    links.keys(),
    key=lambda linkid: os.stat(links[linkid]).st_size,
    reverse=True,
)
largest = links_by_size[:15]
for x in largest:
    with plot('choicesingle_%d' % x) as plt:
        # The title lookup goes through reddit_title (a network call that
        # is memoized per link id).
        title = 'Progression of a link over time: ' + reddit_title(x)
        plt.title(title)
        plt.xlabel('timestamp')
        plt.ylabel('score')
        # Rows are (timestamp, score) pairs; split them into the two axes.
        points = [(timestamp, score)
                  for timestamp, score in data(links[x], (float, int))]
        xs = [timestamp for timestamp, _ in points]
        ys = [score for _, score in points]
        plt.plot(xs, ys)
# One plot overlaying the score progression of every link in ``largest``.
# NOTE(review): the title reads 'larged-scored', which looks like a typo;
# the string is left exactly as it was.
with plot('largestmulti_score') as plt:
    plt.title('Progression of some larged-scored links over time')
    plt.xlabel('timestamp')
    plt.ylabel('score')
    # TODO: titles in legend?
    # plt.plot takes alternating x and y sequences: x1, y1, x2, y2, ...
    graphs = []
    for pick in largest:
        points = [(timestamp, score)
                  for timestamp, score in data(links[pick], (float, int))]
        graphs.append([timestamp for timestamp, _ in points])
        graphs.append([score for _, score in points])
    plt.plot(*graphs)
# One line per link: parallel lists of timestamps (xs) and ranks (ys).
Graph = namedtuple('Graph', ('xs', 'ys'))
# Number of front-page positions shown on the rank axis.
respect_top = 25
# we have a three-day dataset; this starts at the second day
after = 1309832976.63


def before(h):
    """Return the timestamp ``h`` hours after ``after``."""
    return after + (h * 60 * 60)


# Replay the front-page snapshots over windows of increasing length, plotting
# each link's rank over time.
for hours in 6, 12, 24, 48:
    with plot('chunkedreplay_%d' % hours) as plt:
        # TODO: labels?
        plt.title('Progression of the front page over %d hours' % hours)
        plt.xlabel('timestamp')
        plt.ylabel('rank')
        # The end of the window is constant for the whole pass over the file.
        window_end = before(hours)
        clinks = {}
        for timestamp, linkid, hot, rank in data('data/snapshots.csv', [float, int, float, int]):
            # NOTE(review): rank is still 0-indexed at this point, so this
            # test admits ranks 0..respect_top (respect_top + 1 positions).
            # Confirm whether ``>=`` was intended.
            if rank > respect_top or timestamp < after or timestamp > window_end:
                continue
            if linkid not in clinks:
                clinks[linkid] = Graph([], [])
            clinks[linkid].xs.append(timestamp)
            # this was 0-indexed on generation
            rank += 1
            clinks[linkid].ys.append(rank)
        # plt.plot takes alternating x and y sequences: x1, y1, x2, y2, ...
        graphs = []
        for g in clinks.values():
            graphs.append(g.xs)
            graphs.append(g.ys)
        plt.plot(*graphs)
        # reverse the y axis so rank 1 is at the top (this must be done after
        # the graph has been plotted)
        ax = plt.gca()
        ax.set_ylim([respect_top, 1])
        # range() excludes its stop value; +1 keeps the tick at respect_top.
        plt.yticks(range(1, respect_top + 1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment