Skip to content

Instantly share code, notes, and snippets.

@westurner
Created March 23, 2014 00:17
Show Gist options
  • Save westurner/9716436 to your computer and use it in GitHub Desktop.
Save westurner/9716436 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <markdowncell>
# Draw a co-occurrence matrix with matplotlib.matshow (matplotlib.imshow)
#
# https://en.wikipedia.org/wiki/Co-occurrence_matrix
#
# * https://en.wikipedia.org/wiki/Matplotlib
# * http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.matshow
# * http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow
#
# Created with IPython notebook in an Anaconda 1.9.1 environment
#
# * http://ipython.org/notebook.html
# * http://docs.continuum.io/anaconda/install.html
# * http://docs.continuum.io/anaconda/pkgs.html
# <codecell>
%pylab inline --no-import-all
# import numpy as np
# import matplotlib.pyplot as plt
# import pylab
import collections
import itertools
from pprint import pformat
# <codecell>
# Example constraints: each entry pairs a tuple of category names with the
# number of rows that should carry exactly that combination of categories.
DATA_CONSTRAINTS = (
    (("english",), 5),
    (("math",), 7),
    (("social studies",), 3),
    (("science",), 3),
    (("english", "science"), 1),
    (("english", "social studies"), 1),
    (("math", "science"), 2),
    (("science", "social studies"), 1),
)
def generate_data(constraints):
    """
    Expand category/count constraints into a flat list of rows.

    Args:
        constraints: iterable of ``(categories, count)`` entries, where
            ``categories`` is a sequence of category names and ``count`` is
            the number of rows that should carry exactly those categories.
    Returns:
        list of category tuples (each tuple sorted), ``count`` copies per
        constraint, in the order the constraints were given.
    Raises:
        ValueError: if the generated rows do not reproduce the constraints
            exactly, e.g. the same category combination is listed more than
            once, or a count is zero or negative.
    """
    def _standardize_constraints(constraints):
        # Sort each category tuple so that ('b', 'a') and ('a', 'b') are
        # treated as the same combination.
        return [(tuple(sorted(c[0])), c[1]) for c in constraints]

    def _fits_constraints(rows, standardized):
        # Count identical rows and compare against the requested counts.
        counts = collections.Counter(tuple(sorted(row)) for row in rows)
        return sorted(counts.items()) == sorted(standardized)

    # Standardize once so one-shot iterables are only consumed a single time.
    standardized = _standardize_constraints(constraints)

    data = []
    for categories, count in standardized:
        # Tuples are immutable, so repeating the same object is safe.
        data.extend([categories] * count)

    if not _fits_constraints(data, standardized):
        raise ValueError(
            "generated data does not satisfy constraints: %r" % (standardized,))
    return data
# Build the example dataset from the module-level DATA_CONSTRAINTS.
data = generate_data(DATA_CONSTRAINTS)
# Bare expression: displayed as the cell output when run in the notebook.
data
# <codecell>
def iter_adjacencies(data):
    """
    Yield the co-occurrence edges contributed by every row of ``data``.

    Args:
        data: iterable of rows, each row a sequence of categories
    Returns:
        iterable of ``((row_n, row), (category_x, category_y))`` pairs. For
        each row this is first one self edge ``(c, c)`` per category, then
        one edge for every unordered pair of categories in the row.
    """
    for row_n, row in enumerate(data):
        source = (row_n, row)
        for category in row:
            yield source, (category, category)
        # All categories in a row co-occur with each other, not only those
        # in neighbouring positions, so emit every 2-combination. Rows with
        # fewer than two categories produce no pair edges.
        for pair in itertools.combinations(row, 2):
            yield source, pair
# Materialize the edge generator so the edges can be inspected and reused.
adj_list = list(iter_adjacencies(data))
# Bare expression: displayed as the cell output when run in the notebook.
adj_list
# <codecell>
def build_array_from_adj_list(data, adj_list):
    """
    Build a symmetric co-occurrence count matrix from an edge list.

    Prints the row counts, the category index mapping, the matrix shape,
    the matrix itself and the per-category totals along the way.

    Args:
        data: iterable of hashable rows, each row an iterable of categories;
            used to determine the set of categories and their indices
        adj_list: iterable of ``(row, (category_x, category_y))`` pairs
    Returns:
        ``(adj, categories)`` where ``adj`` is a square numpy array of
        counts and ``categories`` is an OrderedDict mapping each category
        (in sorted order) to its row/column index in ``adj``.
    Raises:
        KeyError: if an edge names a category that does not occur in
            ``data``.
        Exception: if the row and column sums of ``adj`` differ.
    """
    print(pformat(list(collections.Counter(data).items())))

    unique_categories = sorted(set(item for row in data for item in row))
    categories = collections.OrderedDict(
        (category, index) for index, category in enumerate(unique_categories))
    print("Indices: %s" % categories)

    adjacency_dimensions = len(categories), len(categories)
    print(adjacency_dimensions)
    adj = np.zeros(adjacency_dimensions)

    for _row, (category_x, category_y) in adj_list:
        # Index directly rather than with .get(): a None index would be
        # treated by numpy as a new axis and increment the whole matrix.
        x, y = categories[category_x], categories[category_y]
        adj[x, y] += 1
        if x != y:
            # Mirror off-diagonal edges to keep the matrix symmetric.
            adj[y, x] += 1
    print(adj)

    # Sanity check: a symmetric matrix has equal row and column sums.
    subtotal_0 = np.sum(adj, axis=0)
    subtotal_1 = np.sum(adj, axis=1)
    if not np.all(np.equal(subtotal_0, subtotal_1)):
        raise Exception("Should be the same")

    # list() so the pairs are printed on Python 3 too, where zip is lazy.
    totals = list(zip(categories, subtotal_0))
    print("Totals: %s" % totals)
    return adj, categories
# Build the count matrix and the category -> index mapping for plotting.
adj, categories = build_array_from_adj_list(data, adj_list)
# <codecell>
def draw_co_ocurrence_diagram(adj, categories, figsize=(4,5)):
    """
    Plot ``adj`` as a greyscale matrix with one tick per category.

    Args:
        adj: matrix of counts to display
        categories: mapping whose keys label the rows and columns; its
            length determines the number of ticks
        figsize: value stored in ``pylab.rcParams['figure.figsize']``
            before plotting
    """
    # NOTE: this assigns into pylab.rcParams, so the setting outlives the call.
    pylab.rcParams['figure.figsize'] = figsize
    plt.matshow(adj, cmap="Greys")

    # The same positions and labels are used on both axes.
    positions = np.arange(len(categories))
    labels = categories.keys()
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)

    plt.colorbar(orientation='horizontal')
# Render the matrix built above using the default figure size.
draw_co_ocurrence_diagram(adj, categories)
# <codecell>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment