# ketralnis/graphs.py

## graphs.py
#!/usr/bin/env python2.7

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt   # For plotting graphs.

from contextlib import contextmanager
from collections import namedtuple
import urllib
import json
import os, os.path
import random

from votemining import to36

class data(object):
    """Iterate over the rows of a comma-separated text file.

    Each line is stripped and split on ','. When ``types`` is non-empty the
    fields are converted one by one with the matching callable in ``types``
    (for example ``(int, float)``); otherwise they stay strings.

    The file is closed once the last line has been consumed, or earlier by
    calling :meth:`close`.
    """

    def __init__(self, f, types):
        # f is the path of the file to read; types is a sequence of
        # per-column converters (a falsy value disables conversion).
        self.f = open(f)
        self.i = iter(self.f)
        self.types = types

    def __iter__(self):
        return self

    def next(self):
        """Return the next row as a list of fields.

        Raises StopIteration (after closing the file) when no lines remain.
        """
        try:
            # The builtin next() works with both Python 2 and Python 3 file
            # iterators, unlike calling the .next() method directly.
            line = next(self.i)
        except StopIteration:
            self.close()
            raise
        fields = line.strip().split(',')
        if self.types:
            # zip() stops at the shorter input, so surplus fields (or
            # surplus converters) are dropped silently.
            fields = [conv(raw) for (conv, raw) in zip(self.types, fields)]
        return fields

    # Python 3 spells the iterator-protocol method __next__.
    __next__ = next

    def close(self):
        """Close the underlying file; calling it more than once is harmless."""
        self.f.close()

    def all(self):
        """Consume the remaining rows and return them as a list."""
        return list(self)

def memoize(fn):
    """Return a wrapper that caches ``fn``'s results by positional arguments.

    The arguments must be hashable because they form the cache key. Keyword
    arguments are not supported. The cache is unbounded and lives as long as
    the returned wrapper does.
    """
    import functools  # local import keeps this helper self-contained

    cache = {}

    @functools.wraps(fn)  # keep fn's __name__ and __doc__ on the wrapper
    def _fn(*a):
        # *a is already a tuple, so it can serve as the dict key directly.
        try:
            return cache[a]
        except KeyError:
            result = cache[a] = fn(*a)
            return result
    return _fn

@memoize
def reddit_title(linkid):
    id36 = to36(linkid)
    url = 'http://www.reddit.com/by_id/t3_%s.json' % to36(linkid)
    js = json.loads(urllib.urlopen(url).read())
    try:
        title = js['data']['children'][0]['data']['title']
    except IndexError:
        return '(unknown)'
    print linkid, '->', title
    return title

@contextmanager
def csvplot(fname, pngname, types=()):
    """Context manager pairing a CSV reader with the pyplot module.

    Yields ``(d, plt)`` where ``d`` is a ``data`` reader over ``fname`` using
    ``types`` as the per-column converters (the empty default means no
    conversion). When the with-body finishes without raising, the current
    figure is saved as ``data/pngs/<pngname>.png``.

    Whether or not the body raised, the figure is cleared and the CSV file is
    closed, so a failed plot cannot leak state into the next one.
    """
    d = data(fname, types)
    try:
        yield (d, plt)
        plt.savefig('data/pngs/'+pngname+'.png')
    finally:
        plt.clf()
        # closing an already-closed file is a no-op
        d.f.close()

@contextmanager
def plot(pngname):
    """Context manager that yields ``plt`` and saves the figure on success.

    When the with-body finishes without raising, the current figure is saved
    as ``data/pngs/<pngname>.png``. The figure is cleared afterwards whether
    or not the body raised, so a failed plot cannot leak into the next one.
    """
    try:
        yield plt
        plt.savefig('data/pngs/'+pngname+'.png')
    finally:
        plt.clf()

# Pie chart of the three vote directions (-1, 0 and 1).
with csvplot('data/dirhist.csv', 'dirhist', (int, float)) as (d, plt):
    plt.title('Vote directions')
    # every row is a [direction, amount] pair, so the rows form a mapping
    amounts = dict(d)
    wedges = [amounts[direction] for direction in (-1, 0, 1)]
    plt.pie(wedges, labels=['downs', 'nones', 'ups'], autopct='%1.1f%%')

# Bar chart with one bar per row of the time-of-day histogram.
with csvplot('data/timeofdayhist.csv', 'timeofdayhist', (int, int)) as (d, plt):
    plt.title('Number of votes per hour')
    plt.ylabel('# votes')
    plt.xlabel('hour in GMT')
    rows = list(d)
    hour_col = [row[0] for row in rows]
    vote_col = [row[1] for row in rows]
    plt.bar(hour_col, vote_col)
    # ticks sit 0.5 to the right of each bar's x position, labelled by hour
    plt.xticks([hr + 0.5 for hr in hour_col], hour_col)
    plt.xlim(hour_col[0], hour_col[-1] + 1)

def _plot_bucket_bars(rows, plt):
    """Draw one bar per histogram bucket described by consecutive ``rows``.

    ``rows`` is a list of ``[bucket_min, count]`` pairs. Bar ``n`` shows the
    count of row ``n`` and is labelled with the range that ends at the next
    row's minimum. The last row therefore only supplies the upper bound of
    the final bar and is not drawn itself.
    """
    xs = []
    ys = []
    ticks = []
    labels = []
    for n, ((thismin, thiscount), (nextmin, _nextcount)) in enumerate(zip(rows, rows[1:])):
        xs.append(n)
        ys.append(thiscount)
        ticks.append(n+0.5)
        if thismin == -1:
            # the bucket starting at -1 is labelled as "zero or below"
            labels.append('<=0')
        else:
            labels.append('(%s..%s]' % (thismin, nextmin))
    plt.bar(xs, ys)
    plt.xticks(ticks, labels, rotation=30, size='small')

with csvplot('data/scorehist.csv', 'scorehist', (int, int)) as (d, plt):
    plt.title('Histogram of score per link')
    plt.ylabel('# links')
    plt.xlabel('score ranges')
    _plot_bucket_bars(list(d), plt)

with csvplot('data/numvoteshist.csv', 'numvoteshist', (int, int)) as (d, plt):
    plt.title('Histogram of number of votes received per link')
    plt.ylabel('# links')
    plt.xlabel('# of votes')
    # a -1 bucket (labelled '<=0') shouldn't happen for vote counts
    _plot_bucket_bars(list(d), plt)

# now let's pick some links at random to analyse
# First index every per-link file by the integer before the first '.' in its
# name; the files live one directory level below data/scorebytime.
root = 'data/scorebytime'
links = {}
for subdir in os.listdir(root):
    bucket = os.path.join(root, subdir)
    for fname in os.listdir(bucket):
        stem = fname.split('.')[0]
        links[int(stem)] = os.path.join(bucket, fname)

# Ten figures, each overlaying the score history of up to 100 random links.
for x in range(10):
    # random.sample() raises ValueError when asked for more items than the
    # population holds, so never request more links than exist.
    picks = random.sample(list(links), min(100, len(links)))
    with plot('randommulti_%d' % x) as plt:
        plt.title('Progression of scores of some random links over time')
        plt.xlabel('timestamp')
        plt.ylabel('score')

        # plt.plot() is given alternating x/y sequences: x1, y1, x2, y2, ...
        graphs = []
        for pick in picks:
            xs = []
            ys = []
            for timestamp, score in data(links[pick], (float, int)):
                xs.append(timestamp)
                ys.append(score)
            graphs.extend([xs, ys])
        # TODO: titles
        plt.plot(*graphs)

# the few with the largest numbers of votes
# (ranked here by the size on disk of each link's file)
def _filesize(linkid):
    return os.stat(links[linkid]).st_size

by_size = sorted(links.keys(), key=_filesize, reverse=True)
largest = by_size[:15]

# One figure per chosen link, titled with the link's reddit title.
for x in largest:
    with plot('choicesingle_%d' % x) as plt:
        plt.title('Progression of a link over time: ' + reddit_title(x))
        plt.xlabel('timestamp')
        plt.ylabel('score')

        series = data(links[x], (float, int)).all()
        plt.plot([stamp for stamp, _ in series],
                 [points for _, points in series])

# All of the chosen links overlaid on a single figure.
with plot('largestmulti_score') as plt:
    # NOTE(review): 'larged' in the title looks like a typo; kept as-is here.
    plt.title('Progression of some larged-scored links over time')
    plt.xlabel('timestamp')
    plt.ylabel('score')

    # TODO: titles in legend?

    # plt.plot() is given alternating x/y sequences: x1, y1, x2, y2, ...
    graphs = []
    for pick in largest:
        series = data(links[pick], (float, int)).all()
        graphs.append([stamp for stamp, _ in series])
        graphs.append([points for _, points in series])
    plt.plot(*graphs)

# A pair of parallel point lists for one plotted line.
Graph = namedtuple('Graph', ['xs', 'ys'])

# Only ranks up to this value are shown on the replay charts.
respect_top = 25

# we have a three-day dataset; this starts at the second day
after = 1309832976.63

def before(h):
    # timestamp lying h hours past `after`
    return after + (h * 60 * 60)

# One chart per window length: the rank of every link seen in the top
# `respect_top` between `after` and `before(hours)`.
for hours in 6, 12, 24, 48:

    with plot('chunkedreplay_%d' % hours) as plt:
        # TODO: labels?
        plt.title('Progression of the front page over %d hours' % hours)
        plt.xlabel('timestamp')
        plt.ylabel('rank')

        # the window end is the same for every row, so compute it once
        window_end = before(hours)

        clinks = {}
        for timestamp, linkid, hot, rank in data('data/snapshots.csv', [float, int, float, int]):
            # rank was 0-indexed on generation, so the top `respect_top`
            # entries are 0 .. respect_top-1; anything from respect_top up
            # is outside the chart.
            if rank >= respect_top or timestamp < after or timestamp > window_end:
                continue

            if linkid not in clinks:
                clinks[linkid] = Graph([], [])
            clinks[linkid].xs.append(timestamp)
            # shift to 1-indexed ranks for display
            clinks[linkid].ys.append(rank + 1)

        # plt.plot() is given alternating x/y sequences: x1, y1, x2, y2, ...
        graphs = []
        for g in clinks.values():
            graphs.append(g.xs)
            graphs.append(g.ys)
        plt.plot(*graphs)

        # reverse the axes (this must be done after the graph has been plotted in )
        ax = plt.gca()
        ax.set_ylim([respect_top,1])
        # one tick per rank, including the last one (respect_top itself)
        plt.yticks(range(1, respect_top + 1))
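	# NOTE(review): everything from here to the end of the file repeats the
	# script above; the copy is redundant and should be removed.
	#!/usr/bin/env python2.7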

	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt # For plotting graphs.

	from contextlib import contextmanager
	from collections import namedtuple
	import urllib
	import json
	import os, os.path
	import random

	from votemining import to36

	class data(object):
	    """Iterate over the rows of a comma-separated text file.

	    NOTE(review): this repeats the ``data`` class defined earlier in this
	    file.

	    Each line is stripped and split on ','. When ``types`` is non-empty the
	    fields are converted one by one with the matching callable in ``types``
	    (for example ``(int, float)``); otherwise they stay strings.

	    The file is closed once the last line has been consumed, or earlier by
	    calling :meth:`close`.
	    """

	    def __init__(self, f, types):
	        # f is the path of the file to read; types is a sequence of
	        # per-column converters (a falsy value disables conversion).
	        self.f = open(f)
	        self.i = iter(self.f)
	        self.types = types

	    def __iter__(self):
	        return self

	    def next(self):
	        """Return the next row as a list of fields.

	        Raises StopIteration (after closing the file) when no lines remain.
	        """
	        try:
	            # The builtin next() works with both Python 2 and Python 3
	            # file iterators, unlike calling .next() directly.
	            line = next(self.i)
	        except StopIteration:
	            self.close()
	            raise
	        fields = line.strip().split(',')
	        if self.types:
	            # zip() stops at the shorter input, so surplus fields (or
	            # surplus converters) are dropped silently.
	            fields = [conv(raw) for (conv, raw) in zip(self.types, fields)]
	        return fields

	    # Python 3 spells the iterator-protocol method __next__.
	    __next__ = next

	    def close(self):
	        """Close the underlying file; calling it more than once is harmless."""
	        self.f.close()

	    def all(self):
	        """Consume the remaining rows and return them as a list."""
	        return list(self)

	def memoize(fn):
	    """Return a wrapper that caches ``fn``'s results by positional arguments.

	    NOTE(review): this repeats the ``memoize`` helper defined earlier in
	    this file.

	    The arguments must be hashable because they form the cache key.
	    Keyword arguments are not supported. The cache is unbounded and lives
	    as long as the returned wrapper does.
	    """
	    import functools  # local import keeps this helper self-contained

	    cache = {}

	    @functools.wraps(fn)  # keep fn's __name__ and __doc__ on the wrapper
	    def _fn(*a):
	        # *a is already a tuple, so it can serve as the dict key directly.
	        try:
	            return cache[a]
	        except KeyError:
	            result = cache[a] = fn(*a)
	            return result
	    return _fn

	@memoize
	def reddit_title(linkid):
	id36 = to36(linkid)
	url = 'http://www.reddit.com/by_id/t3_%s.json' % to36(linkid)
	js = json.loads(urllib.urlopen(url).read())
	try:
	title = js['data']['children'][0]['data']['title']
	except IndexError:
	return '(unknown)'
	print linkid, '->', title
	return title

	@contextmanager
	def csvplot(fname, pngname, types=()):
	    """Context manager pairing a CSV reader with the pyplot module.

	    NOTE(review): this repeats the ``csvplot`` helper defined earlier in
	    this file.

	    Yields ``(d, plt)`` where ``d`` is a ``data`` reader over ``fname``
	    using ``types`` as the per-column converters (the empty default means
	    no conversion). When the with-body finishes without raising, the
	    current figure is saved as ``data/pngs/<pngname>.png``. The figure is
	    cleared and the CSV file closed whether or not the body raised.
	    """
	    d = data(fname, types)
	    try:
	        yield (d, plt)
	        plt.savefig('data/pngs/'+pngname+'.png')
	    finally:
	        plt.clf()
	        # closing an already-closed file is a no-op
	        d.f.close()

	@contextmanager
	def plot(pngname):
	    """Context manager that yields ``plt`` and saves the figure on success.

	    NOTE(review): this repeats the ``plot`` helper defined earlier in this
	    file.

	    When the with-body finishes without raising, the current figure is
	    saved as ``data/pngs/<pngname>.png``. The figure is cleared afterwards
	    whether or not the body raised.
	    """
	    try:
	        yield plt
	        plt.savefig('data/pngs/'+pngname+'.png')
	    finally:
	        plt.clf()

	# NOTE(review): repeats the 'dirhist' and 'timeofdayhist' charts drawn
	# earlier in this file.

	# Pie chart of the three vote directions (-1, 0 and 1).
	with csvplot('data/dirhist.csv', 'dirhist', (int, float)) as (d, plt):
	    plt.title('Vote directions')
	    # every row is a [direction, amount] pair, so the rows form a mapping
	    amounts = dict(d)
	    wedges = [amounts[direction] for direction in (-1, 0, 1)]
	    plt.pie(wedges, labels=['downs', 'nones', 'ups'], autopct='%1.1f%%')

	# Bar chart with one bar per row of the time-of-day histogram.
	with csvplot('data/timeofdayhist.csv', 'timeofdayhist', (int, int)) as (d, plt):
	    plt.title('Number of votes per hour')
	    plt.ylabel('# votes')
	    plt.xlabel('hour in GMT')
	    rows = list(d)
	    hour_col = [row[0] for row in rows]
	    vote_col = [row[1] for row in rows]
	    plt.bar(hour_col, vote_col)
	    # ticks sit 0.5 to the right of each bar's x position
	    plt.xticks([hr + 0.5 for hr in hour_col], hour_col)
	    plt.xlim(hour_col[0], hour_col[-1] + 1)

	# NOTE(review): repeats the 'scorehist' and 'numvoteshist' charts drawn
	# earlier in this file.
	def _plot_bucket_bars(rows, plt):
	    """Draw one bar per histogram bucket described by consecutive ``rows``.

	    ``rows`` is a list of ``[bucket_min, count]`` pairs. Bar ``n`` shows
	    the count of row ``n`` and is labelled with the range that ends at the
	    next row's minimum. The last row therefore only supplies the upper
	    bound of the final bar and is not drawn itself.
	    """
	    xs = []
	    ys = []
	    ticks = []
	    labels = []
	    for n, ((thismin, thiscount), (nextmin, _nextcount)) in enumerate(zip(rows, rows[1:])):
	        xs.append(n)
	        ys.append(thiscount)
	        ticks.append(n+0.5)
	        if thismin == -1:
	            # the bucket starting at -1 is labelled as "zero or below"
	            labels.append('<=0')
	        else:
	            labels.append('(%s..%s]' % (thismin, nextmin))
	    plt.bar(xs, ys)
	    plt.xticks(ticks, labels, rotation=30, size='small')

	with csvplot('data/scorehist.csv', 'scorehist', (int, int)) as (d, plt):
	    plt.title('Histogram of score per link')
	    plt.ylabel('# links')
	    plt.xlabel('score ranges')
	    _plot_bucket_bars(list(d), plt)

	with csvplot('data/numvoteshist.csv', 'numvoteshist', (int, int)) as (d, plt):
	    plt.title('Histogram of number of votes received per link')
	    plt.ylabel('# links')
	    plt.xlabel('# of votes')
	    # a -1 bucket (labelled '<=0') shouldn't happen for vote counts
	    _plot_bucket_bars(list(d), plt)

	# now let's pick some links at random to analyse
	# NOTE(review): repeats the link index and 'randommulti' charts built
	# earlier in this file.
	# Index every per-link file by the integer before the first '.' in its
	# name; the files live one directory level below data/scorebytime.
	d = 'data/scorebytime'
	links = {}
	for h in os.listdir(d):
	    for l in os.listdir(os.path.join(d, h)):
	        links[int(l.split('.')[0])] = os.path.join(d, h, l)

	# Ten figures, each overlaying the score history of up to 100 random links.
	for x in range(10):
	    # random.sample() raises ValueError when asked for more items than the
	    # population holds, so never request more links than exist.
	    picks = random.sample(list(links), min(100, len(links)))
	    with plot('randommulti_%d' % x) as plt:
	        plt.title('Progression of scores of some random links over time')
	        plt.xlabel('timestamp')
	        plt.ylabel('score')

	        # plt.plot() is given alternating x/y sequences: x1, y1, x2, y2, ...
	        graphs = []
	        for pick in picks:
	            xs = []
	            ys = []
	            for timestamp, score in data(links[pick], (float, int)):
	                xs.append(timestamp)
	                ys.append(score)
	            graphs.extend([xs, ys])
	        # TODO: titles
	        plt.plot(*graphs)

	# the few with the largest numbers of votes
	# (ranked here by the size on disk of each link's file)
	# NOTE(review): repeats the 'choicesingle' and 'largestmulti_score'
	# charts drawn earlier in this file.
	largest = sorted(links.keys(),
	                 key=lambda x: os.stat(links[x]).st_size,
	                 reverse=True)[:15]

	# One figure per chosen link, titled with the link's reddit title.
	for x in largest:
	    with plot('choicesingle_%d' % x) as plt:
	        title = 'Progression of a link over time: ' + reddit_title(x)

	        plt.title(title)
	        plt.xlabel('timestamp')
	        plt.ylabel('score')

	        xs = []
	        ys = []
	        for timestamp, score in data(links[x], (float, int)):
	            xs.append(timestamp)
	            ys.append(score)
	        plt.plot(xs, ys)

	# All of the chosen links overlaid on a single figure.
	with plot('largestmulti_score') as plt:
	    # NOTE(review): 'larged' in the title looks like a typo; kept as-is.
	    plt.title('Progression of some larged-scored links over time')
	    plt.xlabel('timestamp')
	    plt.ylabel('score')

	    # TODO: titles in legend?

	    # plt.plot() is given alternating x/y sequences: x1, y1, x2, y2, ...
	    graphs = []
	    for pick in largest:
	        xs = []
	        ys = []
	        for timestamp, score in data(links[pick], (float, int)):
	            xs.append(timestamp)
	            ys.append(score)
	        graphs.extend([xs, ys])
	    plt.plot(*graphs)

	# NOTE(review): repeats the 'chunkedreplay' charts drawn earlier in this
	# file.
	# A pair of parallel point lists for one plotted line.
	Graph = namedtuple('Graph', ('xs', 'ys'))

	# Only ranks up to this value are shown on the replay charts.
	respect_top = 25
	# we have a three-day dataset; this starts at the second day
	after = 1309832976.63
	# timestamp lying h hours past `after`
	before = lambda h: after+(h*60*60)

	for hours in 6, 12, 24, 48:

	    with plot('chunkedreplay_%d' % hours) as plt:
	        # TODO: labels?
	        plt.title('Progression of the front page over %d hours' % hours)
	        plt.xlabel('timestamp')
	        plt.ylabel('rank')

	        # the window end is the same for every row, so compute it once
	        window_end = before(hours)

	        clinks = {}
	        for timestamp, linkid, hot, rank in data('data/snapshots.csv', [float, int, float, int]):
	            # rank was 0-indexed on generation, so the top `respect_top`
	            # entries are 0 .. respect_top-1; anything from respect_top
	            # up is outside the chart.
	            if rank >= respect_top or timestamp < after or timestamp > window_end:
	                continue

	            if linkid not in clinks:
	                clinks[linkid] = Graph([], [])
	            clinks[linkid].xs.append(timestamp)
	            # shift to 1-indexed ranks for display
	            clinks[linkid].ys.append(rank + 1)

	        # plt.plot() is given alternating x/y sequences: x1, y1, x2, y2, ...
	        graphs = []
	        for g in clinks.values():
	            graphs.append(g.xs)
	            graphs.append(g.ys)
	        plt.plot(*graphs)

	        # reverse the axes (this must be done after the graph has been plotted in )
	        ax = plt.gca()
	        ax.set_ylim([respect_top,1])
	        # one tick per rank, including the last one (respect_top itself)
	        plt.yticks(range(1, respect_top + 1))