Skip to content

Instantly share code, notes, and snippets.

@westurner
Created March 23, 2014 00:17
Show Gist options
  • Save westurner/9716436 to your computer and use it in GitHub Desktop.
Save westurner/9716436 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "categorical_vis"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "Draw a co-occurrence matrix with matplotlib.matshow (matplotlib.imshow)\n\nhttps://en.wikipedia.org/wiki/Co-occurrence_matrix\n\n* https://en.wikipedia.org/wiki/Matplotlib\n* http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.matshow\n* http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow\n\nCreated with IPython notebook in an Anaconda 1.9.1 environment\n\n* http://ipython.org/notebook.html\n* http://docs.continuum.io/anaconda/install.html\n* http://docs.continuum.io/anaconda/pkgs.html"
},
{
"cell_type": "code",
"collapsed": false,
"input": "%pylab inline --no-import-all\n# import numpy as np\n# import matplotlib.pyplot as plt\n# import pylab\nimport collections\nfrom pprint import pformat",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Populating the interactive namespace from numpy and matplotlib\n"
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": "DATA_CONSTRAINTS = (\n (('english',), 5),\n (('math',), 7),\n (('social studies',), 3),\n (('science',), 3),\n (('english', 'science'), 1),\n (('english', 'social studies',), 1),\n (('math', 'science'), 2),\n (('science', 'social studies',), 1)\n)\n\ndef generate_data(constraints):\n \"\"\"\n Args:\n constraints: ((categories_tuple,), count_int)\n\n Returns:\n list of category tuples satisfying constraints\n\n \"\"\"\n def _standardize_constraints(constraints):\n return [(tuple(sorted(c[0])), c[1]) for c in constraints]\n\n def fits_constraints(data, constraints):\n dataset = list(data)\n counts = collections.Counter(tuple(sorted(elem)) for elem in dataset)\n _constraints = _standardize_constraints(constraints)\n return sorted(counts.iteritems()) == sorted(_constraints)\n\n def generate_data(constraints):\n _constraints = _standardize_constraints(constraints)\n for categories, count in _constraints:\n for n in xrange(count):\n yield categories\n\n data = list(generate_data(constraints))\n if fits_constraints(data, constraints):\n return data\n raise Exception(\"uh\") # XXX\ndata = generate_data(DATA_CONSTRAINTS)\ndata",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": "[('english',),\n ('english',),\n ('english',),\n ('english',),\n ('english',),\n ('math',),\n ('math',),\n ('math',),\n ('math',),\n ('math',),\n ('math',),\n ('math',),\n ('social studies',),\n ('social studies',),\n ('social studies',),\n ('science',),\n ('science',),\n ('science',),\n ('english', 'science'),\n ('english', 'social studies'),\n ('math', 'science'),\n ('math', 'science'),\n ('science', 'social studies')]"
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": "def iter_adjacencies(data):\n \"\"\"\n Args:\n data: iterable of categories\n Returns:\n iterable of (row, (category_x, category_y)) pairs with self edges\n \"\"\"\n for row_n, row in enumerate(data):\n _len = len(row)\n for category in row:\n yield (row_n,row), (category, category)\n if _len > 1:\n for i in xrange(_len-1):\n yield (row_n,row), (row[i], row[i+1])\n\nadj_list = list(iter_adjacencies(data))\nadj_list",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": "[((0, ('english',)), ('english', 'english')),\n ((1, ('english',)), ('english', 'english')),\n ((2, ('english',)), ('english', 'english')),\n ((3, ('english',)), ('english', 'english')),\n ((4, ('english',)), ('english', 'english')),\n ((5, ('math',)), ('math', 'math')),\n ((6, ('math',)), ('math', 'math')),\n ((7, ('math',)), ('math', 'math')),\n ((8, ('math',)), ('math', 'math')),\n ((9, ('math',)), ('math', 'math')),\n ((10, ('math',)), ('math', 'math')),\n ((11, ('math',)), ('math', 'math')),\n ((12, ('social studies',)), ('social studies', 'social studies')),\n ((13, ('social studies',)), ('social studies', 'social studies')),\n ((14, ('social studies',)), ('social studies', 'social studies')),\n ((15, ('science',)), ('science', 'science')),\n ((16, ('science',)), ('science', 'science')),\n ((17, ('science',)), ('science', 'science')),\n ((18, ('english', 'science')), ('english', 'english')),\n ((18, ('english', 'science')), ('science', 'science')),\n ((18, ('english', 'science')), ('english', 'science')),\n ((19, ('english', 'social studies')), ('english', 'english')),\n ((19, ('english', 'social studies')), ('social studies', 'social studies')),\n ((19, ('english', 'social studies')), ('english', 'social studies')),\n ((20, ('math', 'science')), ('math', 'math')),\n ((20, ('math', 'science')), ('science', 'science')),\n ((20, ('math', 'science')), ('math', 'science')),\n ((21, ('math', 'science')), ('math', 'math')),\n ((21, ('math', 'science')), ('science', 'science')),\n ((21, ('math', 'science')), ('math', 'science')),\n ((22, ('science', 'social studies')), ('science', 'science')),\n ((22, ('science', 'social studies')), ('social studies', 'social studies')),\n ((22, ('science', 'social studies')), ('science', 'social studies'))]"
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": "def build_array_from_adj_list(data, adj_list):\n print(pformat(collections.Counter(data).items()))\n \n categories = collections.OrderedDict(\n (v,k) for k,v in enumerate(sorted(set(item for row in data for item in row))))\n print(\"Indices: %s\" % categories)\n \n adjacency_dimensions = len(categories), len(categories)\n print(adjacency_dimensions)\n \n adj = np.zeros(adjacency_dimensions)\n #print(adj)\n \n for row, adjacencies in adj_list:\n x, y = categories.get(adjacencies[0]), categories.get(adjacencies[1])\n adj[x][y] += 1\n if x != y:\n adj[y][x] += 1\n print(adj)\n \n subtotal_0 = np.sum(adj, axis=0)\n subtotal_1 = np.sum(adj, axis=1)\n if not np.all(np.equal(subtotal_0, subtotal_1)):\n raise Exception(\"Should be the same\")\n \n totals = zip(categories.keys(), subtotal_0)\n print(\"Totals: %s\" % totals)\n return adj, categories\n\nadj, categories = build_array_from_adj_list(data, adj_list)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[(('english',), 5),\n (('math', 'science'), 2),\n (('math',), 7),\n (('science',), 3),\n (('english', 'science'), 1),\n (('science', 'social studies'), 1),\n (('social studies',), 3),\n (('english', 'social studies'), 1)]\nIndices: OrderedDict([('english', 0), ('math', 1), ('science', 2), ('social studies', 3)])\n(4, 4)\n[[ 7. 0. 1. 1.]\n [ 0. 9. 2. 0.]\n [ 1. 2. 7. 1.]\n [ 1. 0. 1. 5.]]\nTotals: [('english', 9.0), ('math', 11.0), ('science', 11.0), ('social studies', 7.0)]\n"
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": "def draw_co_ocurrence_diagram(adj, categories, figsize=(4,5)):\n pylab.rcParams['figure.figsize'] = figsize\n plt.matshow(adj, cmap=\"Greys\")\n ticks = (np.arange(len(categories)), categories.keys())\n plt.xticks(*ticks, rotation=90)\n plt.yticks(*ticks)\n plt.colorbar(orientation='horizontal')\n\ndraw_co_ocurrence_diagram(adj, categories)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAAAUUAAAFcCAYAAABfrawOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XtcVHX+P/DXEQTFC4aZiaKOqNxhhkFREYNwJVe0ldRE\nTR/p1pYbkaWirm62m7ctN8BVy/JWyyasD32EongNRU0FHIyV1LiMl/JSKXKTEDi/P/wx3w8XEULm\nzIyv5+PBwznDmfN5z4AvPudzLh9JlmUZREQEAGijdAFERKaEoUhEJGAoEhEJGIpERAKGIhGRgKFI\nRCRgKBJZoJKSElRVVQEALly4gKSkJNy7d0/hqsyDxPMUiSyPr68vjh07htu3byMgIACDBg2CjY0N\n4uPjlS7N5LGnSGSBZFmGnZ0dduzYgdmzZ+O///0v/ve//yldlllgKBJZqG+++Qbx8fEYM2YMAKC6\nulrhiswDQ5HIAsXExGDFihUYP348PDw8kJeXh+DgYKXLMgscUySyYKWlpejQoYPSZZgV9hSJLNCJ\nEyfg7u4OV1dXAEBWVhZmz56tcFXmgaFIZIHeeustpKSk4MknnwQAqNVqHDlyROGqzANDkchC9e7d\nu9aytbW1QpWYF35KRBaod+/eOH78OACgoqICcXFxcHNzU7gq88ADLUQW6KeffkJUVBQOHjwIWZYx\natQoxMXFoWvXrkqXZvIYikREAu4+E1mQVatWITo6GpGRkfW+J0kS4uLiFKjKvDAUiSyIu7s7AECr\n1db7niRJxi7HLHH3mYhIwJ4ikQUZO3as4bEkSRD7PJIkISkpSYmyzApDkciCvPPOOwCAnTt34vr1\n65g2bRpkWcaXX36J7t27K1ydeeDuM5EF0mq1yMzMfOhzVB+vaCGyQGVlZcjLyzMs5+fno6ysTMGK\nzAd3n4ks0EcffYTg4GCoVCoAgF6vx4YNGxSuyjxw95nIQpWXl+P8+fOQJAmurq6wtbVVuiSzwFAk\nskBbt26tdfS55hzF6dOnK1mWWeDuM5EFSk9PNwRheXk5Dh06BF9fX4ZiE7CnSPQYKCwsxIsvvoh9\n+/YpXYrJ49FnoseAnZ0dCgoKlC7DLHD3mcgCiVe2VFdXIycnB5MmTVKwIvPB3WciC3TkyBHDQRZr\na2v06dMHTk5OCldlHrj7TGSBkpOTERQUhKCgIAwfPhxOTk6Ijo5WuiyzwFAkskAHDhyo99yePXsU\nqMT8cEyRyIKsX78e69atQ15eHry8vAzPFxcXIyAgQMHKzAfHFIksyJ07d3D79m0sWLAAq1atMowr\ndu7cGQ4ODgpXZx4YikQNKCsrw5UrV+Di4qJ0Kb9Jbm4uevXqhXbt2uHrr79GdnY2pk+fji5duihd\nmsnjmCJRHUlJSdBoNAgNDQUA6HQ6jBs3TuGqmmfChAmwtrZGbm4u/vSnP+HKlSuYMmWK0mWZBY4p\nmpDjx49Dr9ejsrISwP3rVXlZlvEtXboUp06dQnBwMABAo9EgPz9f4aqaR5IkWFtbY8eOHYiMjERk\nZCQ0Go3SZZkFhqKJmDZtGvLz86FWq2FlZWV4nqFofG3btq23m9mmjXntVNnY2OA///kPPv/8c+za\ntQsAcO/ePYWrMg8MRRORmZmJnJwczrhmAjw8PBAfH4/Kykp8//33iIuLw7Bhw5Quq1k2bdqETz75\nBH/5y1+gUqlQUFCAl156SemyzAIPtJiIiRMnIjY2Fo6OjkqX8tgrLS3FsmXLsH//fgBAaGgolixZ\ngnbt2ilcGRkDQ1FhNdeolpSUQKfTYfDgwYabgXL2NSLj4+6zwmpmX6uLu9HKGTlyJLZv324YV7x1\n6xYiIiJ4263HBENRYUFBQQDu9xTbt28PKysrXLhwARcuXMDo0aOVLe4x9fPPP9c60OLg4IAbN24o\nWBEZE0PRRIwYMQLHjh3D7du3ERoaikGDBiEhIQHx8f
FKl/bYsbKywqVLl9CnTx8A9yd9Mpejz+It\nw+ricEzTMBRNhCzLsLOzw8aNGzF79mzMnz8fPj4+Spf1WFq2bBkCAwMxYsQIAMDRo0fNZia8Bw3H\nUNMxFE3IN998g/j4eGzcuBHA/ZuDkvE999xzyMzMxMmTJyFJEmJiYvDkk08qXVaT1AzH0G/HUDQR\nMTExWLFiBcaPHw8PDw/k5eUZrqgwN1VVVbhx44bhyhwA6N27t4IVNV9FRQUcHBxQWVmJnJwcADD0\nHM3BxYsXsWjRIpw7dw7l5eUA7u8+m9uVOUrgKTn0SK1ZswbvvfcennrqqVpX5mRnZytYVfNER0cj\nISEB7u7utd5DzZUh5iAgIADvvfce3n77bezatQubN29GVVUV/v73vytdmsljKCosKioKsbGxDQ6Q\nm+PAuLOzM06fPo2uXbsqXcpvNnDgQGRnZ5v15PG+vr44c+YMvLy8DH+Qap6jxnH3WWE1l15ZygB5\n79690blzZ6XLaBFnZ2dUVFSYdSi2a9cOVVVV6N+/P/71r3/B0dERpaWlSpdlFthTpEdi9erVAICc\nnBycP38eYWFhsLGxAXC/x/v2228rWV6zhIeH4+zZswgJCal1dVFcXJzClTXd6dOn4ebmhsLCQixZ\nsgRFRUWYP38+hgwZonRpJo89RYWJt4yvS5IkfPvtt0as5rcrLi6GJEno3bs3nJycUFFRgYqKCqXL\n+k3GjRuHcePGGa4qkmXZ7K4wGjx4MACgU6dO2LJli7LFmBn2FBWm1+sb/X7fvn2NUsejkpiYWG9+\n4YaeM3VlZWW4fPkyXF1dlS6lWSxtjFoJDEV6pDQaDXQ63UOfM2VJSUmYN28efv31V+j1euh0Orz7\n7rtmESiZmZnQarVITU2t9z1JkvDMM88Yvygzw91nE9GpU6d6z9nb22PQoEFYvXo1+vXrp0BVTbd3\n717s2bMHP/zwA958803DhEnFxcVo27atwtU1jznfeVur1QIA/Pz8DNfSA/fPHa05X5Eax1A0EVFR\nUXByckJERAQAYNu2bcjLy4NGo8HMmTMb/MtvShwdHaHVavHVV19Bq9XWmkXuo48+Uri65rGEO2+H\nhITg0KFD6NixI4D7wwGhoaE4ceKEwpWZPu4+mwhvb+96B1XUajWysrLg4+ODs2fPKlRZ81RUVBiO\nOpurmTNnIiQkBCtXrsSOHTsQFxeHe/fu4eOPP1a6tCar+d152HNUn3n9+bNgdnZ2SEhIQHV1Naqr\nq5GYmGi407M5HfnU6/WYMGEC3N3doVKpoFKpTH7Xv641a9bg3LlzsLW1RUREBDp37oyYmBily2qW\nDh06IDMz07CckZGB9u3bK1iR+WBP0UTk5eUhKioKJ0+eBAAMGTIEMTEx6NmzJzIzMzF8+HCFK2wa\nXl5mGtLT0zF58mT06NEDAHDt2jUkJCTAz89P4cpMH0ORHilzvrzM0k5nqaiowMWLFwEALi4uZnfA\nSyk80GIibt68iU8//bTevM+bNm1SuLLmMefLy2qmk23okktzGsIA7gfi+vXrcfToUQD3byn22muv\nMRibgD1FEzF06FCMGDECWq3WcKRTkiS88MILClfWPOnp6fUuL5s3b55ZXV4mTg0B/N/pLB06dFC4\nsqabNWsWKisrMWPGDMiyjC+++ALW1tb47LPPlC7N5DEUTYSlHBlMT0/H8uXLDT1eWZbRpk0bs7lc\nEQD8/f1rnc5SXFxsdqezNHQ2Q0PPUX3cfTYRYWFhSE5OxpgxY5QupUWmTp2KDz/8EJ6enmZ3bl+N\nX3/91RCIwP0T68vKyhSsqPmsra2Rm5uL/v37A7h/IM/amv/dm4KfkomIiYnB8uXLYWNjYxj3kSQJ\nRUVFClfWPN26dcO4ceOULqNF7OzsDJfLAeZ5OssHH3yAZ599FiqVCsD9U6U2b96scFXmgbvPJqKq\nqgrx8fEoKCjAu+
++i0uXLuH69evw9/dXurRm2b9/PxISEjBy5Mhatw4LDw9XuLKmq3s6y/Xr17Ft\n2zazO52lvLwcFy5cgCRJcHFxMev7QxqTee7fWKA///nPOHXqFLZt2wbg/i7bG2+8oXBVzbd161ac\nPXsWKSkp2L17N3bv3m1Wt/EHgIKCAuh0Oqxfvx6/+93v4OLiYnZDAYmJiaioqICPjw+++uorRERE\nmMVpUSZBJpOgVqtr/SvLsuzt7a1UOb/ZwIED5erqaqXLaBFPT09ZlmU5LS1NfuaZZ+Rdu3bJgwcP\nVriq5mnoPQwaNEjhqsyDef35s2A2NjaoqqoyLP/0009m1zsBgGHDhhlmvzNXNafi7N69G6+88grC\nwsLM7oa5Db2He/fuKVyVeeCBFhMRGRmJ8ePH4+bNm1i0aBG2b9+O999/X+mymu2bb76BWq2GSqWq\ndSt/czoVpGfPnnj11Vdx4MABLFiwAOXl5WY3B7clvAel8ECLCfnuu+9w6NAhAPdv/eTm5qZwRc33\noDuJm9MdxEtLS5GSkgJvb28MGDAA165dQ3Z2NkaNGqV0aU1mCe9BKQxFIiKB+Q1aERG1IoYiEZGA\nB1pawNzunELUmixlJI6h2EKhoaGttm3x2tXWkpKS0qrbX7p0KZYuXdqqbbT2pZArVqzAwoULW7WN\n1mSM+u3t7Vt1+8bE3WciIgFDkYhIwFA0YQ4ODkqX0GJBQUFKl9Bi5jI/zoOYe/3GxvMUW0CSpFYd\nUzSG1h5TNAZzu72aJbK3t7eYAy3sKRIRCRiKREQChiIRkYChSEQkYCgSEQkYikREAoYiEZGAoUhE\nJGAoEhEJGIpERAKGIhGRgKFIRCRgKBIRCR6LUNTr9fDy8gIAZGRkICoq6oHrpqamYuzYscYqjYhM\nzGM3HYGfnx/8/PyULoOITJTJ9xT//e9/w9/fHxqNBq+99hqqqqrQsWNHLF68GGq1GkOHDsXNmzcB\nAHl5eRgyZAi8vb2xePFidOrUqd72xJ7gkSNHoNFooNFo4Ovri5KSEgBASUkJJk6cCDc3N0ybNs14\nb5aIFGfSofjdd98hMTERJ06cgE6ng5WVFeLj41FWVoahQ4ciKysLI0aMwKeffgoAiIqKwpw5c/Dt\nt9/CycnpodtfvXo11q1bB51Oh2PHjqF9+/YAAJ1Oh9jYWOTk5CA/Px/Hjx9v1fdJRKbDpEPx0KFD\nyMzMhJ+fHzQaDQ4fPoyCggLY2NhgzJgxAACtVgu9Xg8AOHnyJCZOnAgAiIiIeOj2AwICMGfOHKxZ\nswa3b9+GlZUVAGDw4MFwdHSEJElQq9WG7ROR5TP5McUZM2Zg+fLltZ778MMPDY/btGmDysrK37Tt\n6OhohIWFITk5GQEBAdi3bx8AwNbW1rCOlZVVo9vPzc01PHZwcLCIeVWIHiYtLQ3Hjh1TuoxWYdKh\nGBISgueffx5z5sxBt27dcOvWLRQXFz9w/SFDhmD79u2YNGkStm3b9tDt5+XlwcPDAx4eHkhPT8eF\nCxeaPX9ta8/LTGSKAgMDERgYaFheuXKlgtU8Wia9++zm5ob3338fo0aNgo+PD0JDQ3H9+nVIkmRY\nR5Ikw3JMTAz++c9/Qq1WIy8vr1bA1X0NAMTGxsLLyws+Pj6wsbHB6NGj663b0DIRWS6Lms3v7t27\nhoMl27ZtQ0JCAnbu3Nlq7XE2P9PA2fyUZ0mz+Zn07nNzZWZm4o033oAsy3jiiSewadMmpUsiIjNj\nUaE4fPhwZGVlKV0GEZkxkx5TJCIyNoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFIRCRgKBIRCRiK\nREQChiIRkYChSEQkYCgSEQkYikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRwKImrjI2SZLMfrIe\nS5ipMD8/X+kSWkylUildQotYwv+FGuwpEhEJGIpERAKGIhGRgKFIRCRgKBIRCRiK
REQChiIRkYCh\nSEQkYCgSEQkYikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFIRCRgKBIRCRiKREQChiIRkYCh\nSEQkYCgCOHv2LPbu3WtYXrp0KVavXq1gRUSkFIYiAJ1Ohz179hiWLWGGOyL6bSwmFPV6PVxdXfHy\nyy/DxcUFU6dOxf79+xEQEICBAwciPT0d6enpGDZsGHx9fREQEICLFy+ioqICf/3rX5GQkACNRoPE\nxEQAQE5ODoKDg+Hs7Iw1a9Yo/O6IyFgsZt5nvV6PAQMGICsrC+7u7hg0aBB8fHywceNGJCUlYfPm\nzfjiiy/Qvn17WFlZ4eDBg/j444+xfft2bN26FZmZmYiLiwNwf/f5wIEDSE1NRVFREVxcXHDjxg1Y\nWVnVatMS5rq1hF4x531WniX8X6hhrXQBj5JKpYKHhwcAwMPDAyNHjgQAeHp6Qq/Xo7CwEC+99BJy\nc3MhSRIqKysBALIs1/qBSpKEsLAwtG3bFl27dsVTTz2FGzduwNHRsV6bS5cuNTwOCgpCUFBQ671B\nIhORmpqK1NRUpctoFRYVira2tobHbdq0gY2NjeFxZWUllixZgpCQEOzcuROXLl1qNMBqXgsAVlZW\nhgCtSwxFosdF3Q7Ae++9p1wxj5jFjCk+jCzLKCoqMvT2Nm/ebPhe586dUVxcrFRpRGRCLCoU646P\nictt2rTBvHnzsHDhQvj6+qKqqsrw/eDgYOTk5NQ60GIJY21E1HwWc6BFCZYwuGwJ4c8DLcqzhP8L\nNSyqp0hE1FIMRSIiAUORiEjAUCQiEjAUiYgEDEUiIgFDkYhIwFAkIhIwFImIBAxFIiIBQ5GISMBQ\nJCISMBSJiAQMRSIiAUORiEjAUCQiEjAUiYgEDEUiIgFDkYhIYFFTnCqhqKhI6RJaxBLmN3n99deV\nLqHFaiZMI+Wxp0hEJGAoEhEJGIpERAKGIhGRgKFIRCRgKBIRCRiKREQChiIRkYChSEQkYCgSEQkY\nikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFIRCRgKBIRCRiKREQChiIRkcCsQzEzMxNRUVFK\nl0FEFsSsZ/PTarXQarVKl0FEFsQke4qlpaUYM2YM1Go1vLy8kJiYiPT0dAwbNgxqtRr+/v4oKSlB\namoqxo4da3jNzJkz4e/vD19fXyQlJQEAtmzZgvDwcIwePRoDBw5EdHS0oZ2UlBRotVqo1WqMHDmy\n0e0Q0ePBJHuKKSkp6NmzJ5KTkwHcn1tZo9EgMTERWq0WJSUlaN++fa3XLFu2DCEhIdi0aRMKCwvh\n7+9vCLqzZ88iKysLNjY2cHFxwZtvvgkbGxu8+uqrSEtLQ58+fVBYWNjoduzs7Iz7IRCRIkwyFL29\nvTF37lwsWLAAYWFhsLe3R48ePQy7yh07dqz3mv3792PXrl348MMPAQC//vorLl++DEmSEBISgk6d\nOgEA3N3dodfrcevWLYwYMQJ9+vQBAHTp0uWB27ly5QpcXFwarHXFihWGx8OHD0dgYOAj+hSITFda\nWhqOHTumdBmtwiRDccCAAdDpdEhOTsbixYsRHBzcpNft2LEDAwYMqPXcqVOnYGtra1i2srJCZWUl\nJElq1nYeZOHChU1aj8iSBAYG1uoArFy5UsFqHi2THFO8du0a2rVrh6lTp2Lu3Lk4ffo0rl+/joyM\nDABAcXExqqqqar0mNDQUcXFxhmWdTgcAkGW53vYlScKQIUNw9OhR6PV6AMCtW7ca3Q4RPR5MsqeY\nnZ2NefPmoU2bNrCxscH69etRXV2NyMhI3L17F3Z2djhw4AAkSTL0+JYsWYK33noL3t7eqK6uRr9+\n/ZCUlFRrHdGTTz6JDRs2IDw8HNXV1ejevTv27dv3wO0Q0eNBkhvqSlGTSJKEO3fuKF1Gi/zyyy9K\nl9Bir7/+utIltFhiYqLSJbSIvb19g3tl5sgk
d5+JiJTCUCQiEjAUiYgEDEUiIgFDkYhIwFAkIhIw\nFImIBAxFIiIBQ5GISMBQJCISMBSJiAQMRSIiAUORiEjAUCQiEjAUiYgEDEUiIgFDkYhIwFAkIhIw\nFImIBCY5cRUZj0qlUrqEFjP3+U0AYO3atUqXQP8fe4pERAKGIhGRgKFIRCRgKBIRCRiKREQChiIR\nkYChSEQkYCgSEQkYikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFIRCRgKBIRCRiKREQChiIR\nkYChSEQkYCgSEQlaPRQzMzMRFRXV6DqpqakYO3Zsk7Z3584drF+/vtl1iG3s2rULq1atavY2iMjy\ntXooarVaxMbGPrLt3b59G+vWrWvRNsaOHYvo6OhHVBERWZJGQ7G0tBRjxoyBWq2Gl5eXYSrJQ4cO\nwdfXF97e3pg1axYqKioAAOnp6QgICIBarYa/vz9KSkpq9dBOnz6NYcOGwdfXFwEBAbh48WKjxZ07\ndw7+/v7QaDRQq9XIzc3FggULkJeXB41Gg/nz5+PIkSO1eplvvPEGtm7dCgBISUmBm5sbtFotdu7c\naVhny5YtiIyMBAD89NNPmDBhAgYPHozBgwfjxIkTAIAjR45Ao9FAo9HA19cXJSUlzfpgicg8NTrv\nc0pKCnr27Ink5GQAQFFREcrLy/Hyyy/j8OHD6N+/P2bMmIH169fj9ddfx+TJk5GYmAitVouSkhK0\nb9++1vbc3NyQlpYGKysrHDx4EIsWLcL27dsf2P4nn3yCqKgoTJkyBZWVlaisrMSqVatw7tw56HQ6\nAPd3i0WSJEGSJJSXl+PVV1/F119/DWdnZ7z44ouQJKleG1FRUZgzZw4CAgJw+fJlPPfcc8jJycHq\n1auxbt06DB06FGVlZbC1tW3SB0pE5q3RUPT29sbcuXOxYMEChIWFYfjw4Th79ixUKhX69+8PAJgx\nYwbWrl2LkJAQ9OjRA1qtFgDQsWPHetsrLCzE9OnTkZubC0mScO/evUaLGzp0KJYtW4arV68iPDwc\n/fv3hyzLD31Tsizj/PnzUKlUcHZ2BgBMmzYNGzZsqLfuwYMH8d133xmWi4uLUVpaioCAAMyZMwdT\np05FeHg4evbs2WBbK1asMDwePnw4AgMDH1ofkbnLz89HQUGB0mW0ikZDccCAAdDpdEhOTsbixYsR\nEhKC559/vtY6TQmpGkuWLEFISAh27tyJS5cuISgoqNH1IyIiMGTIEOzevRu///3v8cknn0ClUtV+\nA9bWqK6uNiyXl5cDQL1e4YPqlGUZp06dgo2NTa3no6OjERYWhuTkZAQEBGDfvn1wcXGp9/qFCxc2\n+h6ILFG/fv3Qr18/w/KhQ4cUrObRanRM8dq1a2jXrh2mTp2KuXPnQqfTwcXFBXq9Hnl5eQCAL774\nAkFBQXBxccG1a9eQkZEB4H6Pq6qqqtb2ioqK4OjoCADYvHnzQ4srKCiASqVCZGQknn/+eWRnZ6Nz\n584oLi42rNOnTx/k5OSgoqIChYWFOHToECRJgqurK/R6PfLz8wEAX375ZYNtjBo1CnFxcYblrKws\nAEBeXh48PDwwf/58DBo0CBcuXHhovURk/hoNxezsbMOBjr/97W9YvHgxbG1tsXnzZkycOBHe3t6w\ntrbGa6+9hrZt2yIhIQGRkZFQq9UIDQ1FeXm5YYwPAObPn4+FCxfC19cXVVVVtXpzDY33JSYmwtPT\nExqNBufOncP06dPh4OCAgIAAeHl5ITo6Gk5OTpg0aRI8PT3x4osvwtfXFwBga2uLDRs2YMyYMdBq\ntejevbuhDbGmuLg4ZGRkwMfHBx4eHoZd7NjYWHh5ecHHxwc2NjYYPXr0I/i4icjUSXJz9n+pFkmS\ncOfOHaXLaJHOnTsrXUKLFRUVKV1Ci61du1bpElpk0aJFzRpKM2W8ooWISMBQJCISMBSJiAQMRSIi\nAUORiEjA
UCQiEjAUiYgEDEUiIgFDkYhIwFAkIhIwFImIBAxFIiIBQ5GISMBQJCISMBSJiAQMRROW\nlpamdAktVndiMXNk7j+HmrvPU9MwFE3YsWPHlC6hxSwhFM3952CpE0y1FoYiEZGAoUhEJOAcLS3Q\n0GRbRI8rS4mSRud9psZZyi8BEf0f7j4TEQkYikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFI\nRCRgKBIRCRiKREQChiIRkYChSEQkYCgSEQkYikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFI\nRCRgKBIRCRiKREQChiIRkYChSEQkYCgSEQkYikREAoYiEZGAoUhEJGAoEhEJGIpERAKGIhGRgKFI\nRCRgKBIRCRiKREQChiIRkYChSEQkYCgSEQkYikREAoYiEZGAoUhEJLBWuoCGdOrUCSUlJUqXQUQW\npmPHjiguLm50HUmWZdlI9TSZJEmorq42LNeU2NR/H/S4KesvX74cCxcurLVOY6+v+/2Hva6xdWJi\nYhAVFdXo9up+ifU39vWgOmq+PvvsM8yaNavR14qPq6ura60jLovrN7Zezdf27dsRHh7+0PVqnmts\n+43V39C6Bw8exLPPPltv/aZstyXry7KM9PR0aLXaZr2mqe+77nuu+29ubi5UKpXhuYY+U7GNus/V\nfc+NvaZu/bdu3YK9vf0Df77Neb8N/U486DUNZUNDuPtMRCRgKBIRCRiKdQwfPlyxtv39/RVrW6PR\nKNa2u7u7Ym2rVCrF2nZ0dFSs7SeeeEKxttu1a6dY203BMcUmrPOw7z1o3Ue1ft3XPmicpCljTA/b\n1sOer3n8KMcUgfpjcsYaU3zQ+sYYU2zOV93XtHRMse5zDX2mrTWm+LDxwea8X44pEhG1MoYiEZGA\noUhEJGAoEhEJGIp1pKWlKdb2yZMnFWv7zJkzirWdk5OjWNv5+fmKtf3jjz8q1vatW7cUa7u8vFyx\ntpuCoVjHsWPHFGv71KlTirWt0+kUa1vJUCwoKFCsbSVD8fbt24q1zVAkIjIjDEUiIoHJnrxNRPSo\nNeUuOSZ56zATzGkiekxw95mISMBQJCISmFQopqSkwNXVFQMGDMCqVauM1u7MmTPRvXt3eHl5Ga3N\nGleuXEFwcDA8PDzg6emJuLg4o7VdXl4Of39/qNVquLu7Y+HChUZru0ZVVRU0Gg3Gjh1r1Hb79u0L\nb29vaDQaDB482KhtFxYWYsKECXBzc4O7u7vRzk+9cOECNBqN4cve3t6ov28rVqyAh4cHvLy8MGXK\nFPz6668YIYH4AAAFAElEQVRGazs2NhZeXl7w9PREbGxs4yvLJqKyslJ2dnaWCwoK5IqKCtnHx0fO\nyckxSttHjx6Vz5w5I3t6ehqlPdG1a9dknU4ny7IsFxcXywMHDjTa+5ZlWS4tLZVlWZbv3bsn+/v7\ny2lpaUZrW5ZlefXq1fKUKVPksWPHGrXdvn37yr/88otR26wxffp0eePGjbIs3//cCwsLjV5DVVWV\n/PTTT8uXL182SnsFBQWySqWSy8vLZVmW5UmTJslbtmwxStvZ2dmyp6enfPfuXbmyslIeOXKknJub\n+8D1TaanePr0afTv3x99+/ZF27ZtMXnyZHz11VdGaTswMFCx+8s9/fTTUKvVAO4fGXNzczPqSb12\ndnYAgIqKClRVVcHBwcFobV+9ehV79uzBH//4R0UOrinR5p07d5CWloaZM2cCAKytrWFvb2/0Og4e\nPAhnZ2c4OTkZpb3OnTujbdu2KCsrQ2VlJcrKytCzZ0+jtH3+/Hn4+/ujXbt2sLKywjPPPIMdO3Y8\ncH2TCcUffvih1g+oV69e+OGHHxSsyPj0ej10Op1RbzZbXV0NtVqN7t27Izg42Kg3fJ0zZw4++OAD\ntGlj/F9DSZIwcuRI+Pn54dNPPzVauwUFBejWrRtefvll+Pr64pVXXkFZWZ
nR2q+xbds2TJkyxWjt\nOTg44J133kHv3r3h6OiILl26YOTIkUZp29PTE2lpabh16xbKysqQnJyMq1evPnB9kwnFx/3cxJKS\nEkyYMAGxsbHo2LGj0dpt06YNsrKycPXqVRw9ehSpqalGaXf37t146qmnoNFoFOmxHT9+HDqdDnv3\n7sXatWuNds17ZWUlzpw5g9mzZ+PMmTPo0KEDVq5caZS2a1RUVGDXrl2YOHGi0drMy8tDTEwM9Ho9\nfvzxR5SUlCA+Pt4obbu6uiI6OhqjRo3C6NGjodFoGv1DbDKh2LNnT1y5csWwfOXKFfTq1UvBiozn\n3r17eOGFFzBt2jT84Q9/UKQGe3t7jBkzBhkZGUZp78SJE0hKSoJKpUJERAQOHz6M6dOnG6VtAOjR\nowcAoFu3bhg/fjxOnz5tlHZ79eqFXr16YdCgQQCACRMmGP1mHHv37oVWq0W3bt2M1mZGRgaGDRuG\nrl27wtraGuHh4Thx4oTR2p85cyYyMjJw5MgRdOnSBS4uLg9c12RC0c/PD99//z30ej0qKiqQkJCA\ncePGKV1Wq5NlGbNmzYK7uzveeusto7b9888/o7CwEABw9+5dHDhwwGhztSxfvhxXrlxBQUEBtm3b\nhmeffRaff/65UdouKyszXNVQWlqK/fv3G+3Mg6effhpOTk64ePEigPtjex4eHkZpu8aXX36JiIgI\no7bp6uqKkydP4u7du5Dl+1PLGnOo5ubNmwCAy5cvY+fOnY0PHRjl8E8T7dmzRx44cKDs7OwsL1++\n3GjtTp48We7Ro4dsY2Mj9+rVS960aZPR2k5LS5MlSZJ9fHxktVotq9Vqee/evUZp+9tvv5U1Go3s\n4+Mje3l5yf/4xz+M0m5dqampRj36nJ+fL/v4+Mg+Pj6yh4eHUX/XZFmWs7KyZD8/P9nb21seP368\nUY8+l5SUyF27dpWLioqM1maNVatWye7u7rKnp6c8ffp0uaKiwmhtBwYGyu7u7rKPj498+PDhRtc1\nyWufiYiUYjK7z0REpoChSEQkYCgSEQkYikREAoYiEZGAoUhEJPh/tMPJGJ+CzicAAAAASUVORK5C\nYII=\n",
"text": "<matplotlib.figure.Figure at 0x96c550c>"
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
}
],
"metadata": {}
}
]
}
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <markdowncell>
# Draw a co-occurrence matrix with matplotlib.matshow (matplotlib.imshow)
#
# https://en.wikipedia.org/wiki/Co-occurrence_matrix
#
# * https://en.wikipedia.org/wiki/Matplotlib
# * http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.matshow
# * http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow
#
# Created with IPython notebook in an Anaconda 1.9.1 environment
#
# * http://ipython.org/notebook.html
# * http://docs.continuum.io/anaconda/install.html
# * http://docs.continuum.io/anaconda/pkgs.html
# <codecell>
%pylab inline --no-import-all
# import numpy as np
# import matplotlib.pyplot as plt
# import pylab
import collections
from pprint import pformat
# <codecell>
# Each entry is ((category, ...), count): how many rows should carry exactly
# that combination of categories.
DATA_CONSTRAINTS = (
    (('english',), 5),
    (('math',), 7),
    (('social studies',), 3),
    (('science',), 3),
    (('english', 'science'), 1),
    (('english', 'social studies'), 1),
    (('math', 'science'), 2),
    (('science', 'social studies'), 1),
)
def generate_data(constraints):
    """
    Expand (categories, count) constraints into a flat list of rows.

    Args:
        constraints: iterable of (categories_tuple, count_int) pairs

    Returns:
        list of category tuples satisfying constraints; each tuple is
        sorted and rows appear in the order the constraints were given

    Raises:
        ValueError: if the generated rows do not reproduce the constraints
            exactly (for example the same category combination is listed
            more than once, or a count is not positive)

    """
    def _standardize_constraints(constraints):
        # Sort every category tuple so ('b', 'a') and ('a', 'b') are equal.
        return [(tuple(sorted(c[0])), c[1]) for c in constraints]

    def _fits_constraints(rows, constraints):
        counts = collections.Counter(tuple(sorted(row)) for row in rows)
        # .items() works on Python 2 and 3 (.iteritems() is Python 2 only).
        return sorted(counts.items()) == sorted(
            _standardize_constraints(constraints))

    def _iter_rows(constraints):
        for categories, count in _standardize_constraints(constraints):
            for _ in range(count):
                yield categories

    # Materialize once: the constraints are walked twice (generation and
    # validation), which would exhaust a one-shot iterator.
    constraints = list(constraints)
    data = list(_iter_rows(constraints))
    if _fits_constraints(data, constraints):
        return data
    raise ValueError(
        "generated data does not match constraints: %r" % (constraints,))
# Expand the constraints into the concrete list of category rows.
data = generate_data(DATA_CONSTRAINTS)
# Bare expression: shown as the notebook cell's output.
data
# <codecell>
def iter_adjacencies(data):
    """
    Yield the co-occurrence edges implied by each row of categories.

    Args:
        data: iterable of category tuples (rows)
    Returns:
        iterable of ((row_n, row), (category_x, category_y)) pairs: first a
        self edge for every category in the row, then one edge for every
        pair of categories in the row (in row order)
    """
    # Function-scope import so the block does not rely on file-level imports.
    import itertools

    for row_n, row in enumerate(data):
        for category in row:
            yield (row_n, row), (category, category)
        # All categories in a row co-occur with each other, not only the
        # neighbouring ones, so emit every pair rather than just
        # (row[i], row[i+1]). Rows of length <= 2 are unaffected.
        for pair in itertools.combinations(row, 2):
            yield (row_n, row), pair
# Materialize the generator so the edges can be displayed and reused.
adj_list = list(iter_adjacencies(data))
# Bare expression: shown as the notebook cell's output.
adj_list
# <codecell>
def build_array_from_adj_list(data, adj_list):
    """
    Build a symmetric co-occurrence count matrix from an adjacency list.

    Prints the row counts, the category-to-index mapping, the matrix shape,
    the matrix itself and the per-category totals along the way.

    Args:
        data: iterable of category tuples (rows); defines the category set
        adj_list: iterable of (row, (category_x, category_y)) pairs
    Returns:
        (adj, categories) where adj is a square numpy array of counts and
        categories is an OrderedDict mapping each category to its row/column
        index (categories are indexed in sorted order)
    Raises:
        KeyError: if adj_list names a category that does not occur in data
        Exception: if the row sums and column sums of adj differ
    """
    # data is walked twice below; materialize it so one-shot iterables work.
    data = list(data)
    # list(...) keeps the output readable on Python 3 (no dict_items wrapper).
    print(pformat(list(collections.Counter(data).items())))

    labels = sorted(set(item for row in data for item in row))
    categories = collections.OrderedDict(
        (label, index) for index, label in enumerate(labels))
    print("Indices: %s" % categories)

    adjacency_dimensions = len(categories), len(categories)
    print(adjacency_dimensions)

    adj = np.zeros(adjacency_dimensions)

    for _row, adjacencies in adj_list:
        # Index with [] rather than .get(): a missing category would yield
        # None, which numpy treats as newaxis, so the += below would
        # silently increment every cell instead of failing.
        x, y = categories[adjacencies[0]], categories[adjacencies[1]]
        adj[x, y] += 1
        if x != y:
            # Mirror off-diagonal edges to keep the matrix symmetric.
            adj[y, x] += 1
    print(adj)

    # Sanity check: a symmetric matrix has equal row and column sums.
    subtotal_0 = np.sum(adj, axis=0)
    subtotal_1 = np.sum(adj, axis=1)
    if not np.all(np.equal(subtotal_0, subtotal_1)):
        raise Exception("Should be the same")

    # list(...) so the totals print as pairs on Python 3, where zip is lazy.
    totals = list(zip(categories.keys(), subtotal_0))
    print("Totals: %s" % totals)
    return adj, categories
# Build the count matrix and the category -> index mapping for plotting.
adj, categories = build_array_from_adj_list(data, adj_list)
# <codecell>
def draw_co_ocurrence_diagram(adj, categories, figsize=(4,5)):
    """
    Plot the matrix ``adj`` with matshow, labelling both axes by category.

    Args:
        adj: matrix to draw
        categories: mapping whose keys label the ticks, in iteration order
        figsize: value stored in pylab.rcParams['figure.figsize']; note
            that this rcParams assignment is global and is not undone
    """
    pylab.rcParams['figure.figsize'] = figsize
    plt.matshow(adj, cmap="Greys")
    # One tick per category on both axes, sharing positions and labels.
    positions = np.arange(len(categories))
    labels = categories.keys()
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)
    plt.colorbar(orientation='horizontal')
# Draw the matrix built above using the default figsize.
draw_co_ocurrence_diagram(adj, categories)
# <codecell>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment