<!DOCTYPE html>
<meta charset="utf-8">
<body>
<style>

#chart {
  height: 500px;
}

.node rect {
  cursor: move;
  fill-opacity: .9;
  shape-rendering: crispEdges;
}

.node text {
  font: 12px serif;
  pointer-events: none;
  text-shadow: 0 1px 0 #fff;
}

.link {
  fill: none;
  stroke: #000;
  stroke-opacity: .2;
}

.link:hover {
  stroke-opacity: .5;
}

</style>
<p id="chart"></p>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script>
d3.sankey = function() {
  var sankey = {},
      nodeWidth = 24,
      nodePadding = 8,
      size = [1, 1],
      nodes = [],
      links = [];

  sankey.nodeWidth = function(_) {
    if (!arguments.length) return nodeWidth;
    nodeWidth = +_;
    return sankey;
  };

  sankey.nodePadding = function(_) {
    if (!arguments.length) return nodePadding;
    nodePadding = +_;
    return sankey;
  };

  sankey.nodes = function(_) {
    if (!arguments.length) return nodes;
    nodes = _;
    return sankey;
  };

  sankey.links = function(_) {
    if (!arguments.length) return links;
    links = _;
    return sankey;
  };

  sankey.size = function(_) {
    if (!arguments.length) return size;
    size = _;
    return sankey;
  };

  sankey.layout = function(iterations) {
    computeNodeLinks();
    computeNodeValues();
    computeNodeBreadths();
    computeNodeDepths(iterations);
    computeLinkDepths();
    return sankey;
  };

  sankey.relayout = function() {
    computeLinkDepths();
    return sankey;
  };

  sankey.link = function() {
    var curvature = .5;

    function link(d) {
      var x0 = d.source.x + d.source.dx,
          x1 = d.target.x,
          xi = d3.interpolateNumber(x0, x1),
          x2 = xi(curvature),
          x3 = xi(1 - curvature),
          y0 = d.source.y + d.sy + d.dy / 2,
          y1 = d.target.y + d.ty + d.dy / 2;
      return "M" + x0 + "," + y0
           + "C" + x2 + "," + y0
           + " " + x3 + "," + y1
           + " " + x1 + "," + y1;
    }

    link.curvature = function(_) {
      if (!arguments.length) return curvature;
      curvature = +_;
      return link;
    };

    return link;
  };
  // Populate the sourceLinks and targetLinks for each node.
  // Also, if the source and target are not objects, assume they are indices.
  function computeNodeLinks() {
    nodes.forEach(function(node) {
      node.sourceLinks = [];
      node.targetLinks = [];
    });
    links.forEach(function(link) {
      var source = link.source,
          target = link.target;
      if (typeof source === "number") source = link.source = nodes[link.source];
      if (typeof target === "number") target = link.target = nodes[link.target];
      source.sourceLinks.push(link);
      target.targetLinks.push(link);
    });
  }

  // Compute the value (size) of each node by summing the associated links.
  function computeNodeValues() {
    nodes.forEach(function(node) {
      node.value = Math.max(
        d3.sum(node.sourceLinks, value),
        d3.sum(node.targetLinks, value)
      );
    });
  }

  // Iteratively assign the breadth (x-position) for each node.
  // Nodes are assigned the maximum breadth of incoming neighbors plus one;
  // nodes with no incoming links are assigned breadth zero, while
  // nodes with no outgoing links are assigned the maximum breadth.
  function computeNodeBreadths() {
    var remainingNodes = nodes,
        nextNodes,
        x = 0;

    while (remainingNodes.length) {
      nextNodes = [];
      remainingNodes.forEach(function(node) {
        node.x = x;
        node.dx = nodeWidth;
        node.sourceLinks.forEach(function(link) {
          nextNodes.push(link.target);
        });
      });
      remainingNodes = nextNodes;
      ++x;
    }

    moveSinksRight(x);
    scaleNodeBreadths((size[0] - nodeWidth) / (x - 1));
  }

  function moveSourcesRight() {
    nodes.forEach(function(node) {
      if (!node.targetLinks.length) {
        node.x = d3.min(node.sourceLinks, function(d) { return d.target.x; }) - 1;
      }
    });
  }

  function moveSinksRight(x) {
    nodes.forEach(function(node) {
      if (!node.sourceLinks.length) {
        node.x = x - 1;
      }
    });
  }

  function scaleNodeBreadths(kx) {
    nodes.forEach(function(node) {
      node.x *= kx;
    });
  }
  function computeNodeDepths(iterations) {
    var nodesByBreadth = d3.nest()
        .key(function(d) { return d.x; })
        .sortKeys(d3.ascending)
        .entries(nodes)
        .map(function(d) { return d.values; });

    initializeNodeDepth();
    resolveCollisions();
    for (var alpha = 1; iterations > 0; --iterations) {
      relaxRightToLeft(alpha *= .99);
      resolveCollisions();
      relaxLeftToRight(alpha);
      resolveCollisions();
    }

    function initializeNodeDepth() {
      var ky = d3.min(nodesByBreadth, function(nodes) {
        return (size[1] - (nodes.length - 1) * nodePadding) / d3.sum(nodes, value);
      });

      nodesByBreadth.forEach(function(nodes) {
        nodes.forEach(function(node, i) {
          node.y = i;
          node.dy = node.value * ky;
        });
      });

      links.forEach(function(link) {
        link.dy = link.value * ky;
      });
    }

    function relaxLeftToRight(alpha) {
      nodesByBreadth.forEach(function(nodes, breadth) {
        nodes.forEach(function(node) {
          if (node.targetLinks.length) {
            var y = d3.sum(node.targetLinks, weightedSource) / d3.sum(node.targetLinks, value);
            node.y += (y - center(node)) * alpha;
          }
        });
      });

      function weightedSource(link) {
        return center(link.source) * link.value;
      }
    }

    function relaxRightToLeft(alpha) {
      nodesByBreadth.slice().reverse().forEach(function(nodes) {
        nodes.forEach(function(node) {
          if (node.sourceLinks.length) {
            var y = d3.sum(node.sourceLinks, weightedTarget) / d3.sum(node.sourceLinks, value);
            node.y += (y - center(node)) * alpha;
          }
        });
      });

      function weightedTarget(link) {
        return center(link.target) * link.value;
      }
    }

    function resolveCollisions() {
      nodesByBreadth.forEach(function(nodes) {
        var node,
            dy,
            y0 = 0,
            n = nodes.length,
            i;

        // Push any overlapping nodes down.
        nodes.sort(ascendingDepth);
        for (i = 0; i < n; ++i) {
          node = nodes[i];
          dy = y0 - node.y;
          if (dy > 0) node.y += dy;
          y0 = node.y + node.dy + nodePadding;
        }

        // If the bottommost node goes outside the bounds, push it back up.
        dy = y0 - nodePadding - size[1];
        if (dy > 0) {
          y0 = node.y -= dy;

          // Push any overlapping nodes back up.
          for (i = n - 2; i >= 0; --i) {
            node = nodes[i];
            dy = node.y + node.dy + nodePadding - y0;
            if (dy > 0) node.y -= dy;
            y0 = node.y;
          }
        }
      });
    }

    function ascendingDepth(a, b) {
      return a.y - b.y;
    }
  }
  function computeLinkDepths() {
    nodes.forEach(function(node) {
      node.sourceLinks.sort(ascendingTargetDepth);
      node.targetLinks.sort(ascendingSourceDepth);
    });
    nodes.forEach(function(node) {
      var sy = 0, ty = 0;
      node.sourceLinks.forEach(function(link) {
        link.sy = sy;
        sy += link.dy;
      });
      node.targetLinks.forEach(function(link) {
        link.ty = ty;
        ty += link.dy;
      });
    });

    function ascendingSourceDepth(a, b) {
      return a.source.y - b.source.y;
    }

    function ascendingTargetDepth(a, b) {
      return a.target.y - b.target.y;
    }
  }

  function center(node) {
    return node.y + node.dy / 2;
  }

  function value(link) {
    return link.value;
  }

  return sankey;
};
</script>
<script>
var links = [ { "source" : 0, "target" : 11, "value" : 100 }, { "source" : 0, "target" : 19, "value" : 87 }, { "source" : 1, "target" : 0, "value" : 66 }, { "source" : 2, "target" : 6, "value" : 77 }, { "source" : 3, "target" : 21, "value" : 185 }, { "source" : 3, "target" : 10, "value" : 67 }, { "source" : 3, "target" : 0, "value" : 296 }, { "source" : 4, "target" : 12, "value" : 51 }, { "source" : 4, "target" : 6, "value" : 112 }, { "source" : 5, "target" : 0, "value" : 68 }, { "source" : 6, "target" : 21, "value" : 363 }, { "source" : 6, "target" : 10, "value" : 209 }, { "source" : 6, "target" : 24, "value" : 366 }, { "source" : 7, "target" : 12, "value" : 182 }, { "source" : 7, "target" : 6, "value" : 130 }, { "source" : 8, "target" : 12, "value" : 138 }, { "source" : 8, "target" : 6, "value" : 105 }, { "source" : 9, "target" : 5, "value" : 120 }, { "source" : 9, "target" : 12, "value" : 1550 }, { "source" : 9, "target" : 22, "value" : 111 }, { "source" : 9, "target" : 13, "value" : 130 }, { "source" : 9, "target" : 3, "value" : 612 }, { "source" : 9, "target" : 1, "value" : 128 }, { "source" : 9, "target" : 15, "value" : 698 }, { "source" : 10, "target" : 11, "value" : 86 }, { "source" : 10, "target" : 18, "value" : 63 }, { "source" : 12, "target" : 10, "value" : 93 }, { "source" : 12, "target" : 0, "value" : 411 }, { "source" : 12, "target" : 24, "value" : 210 }, { "source" : 13, "target" : 21, "value" : 58 }, { "source" : 13, "target" : 0, "value" : 122 }, { "source" : 14, "target" : 6, "value" : 105 }, { "source" : 15, "target" : 24, "value" : 95 }, { "source" : 15, "target" : 21, "value" : 74 }, { "source" : 15, "target" : 0, "value" : 251 }, { "source" : 16, "target" : 0, "value" : 106 }, { "source" : 17, "target" : 3, "value" : 98 }, { "source" : 17, "target" : 12, "value" : 127 }, { "source" : 17, "target" : 6, "value" : 841 }, { "source" : 17, "target" : 22, "value" : 218 }, { "source" : 20, "target" : 5, "value" : 80 }, { "source" : 20, "target" : 6, "value" : 1999 }, { "source" : 20, "target" : 13, "value" : 335 }, { "source" : 20, "target" : 1, "value" : 72 }, { "source" : 20, "target" : 3, "value" : 491 }, { "source" : 20, "target" : 15, "value" : 227 }, { "source" : 20, "target" : 16, "value" : 182 }, { "source" : 21, "target" : 18, "value" : 65 }, { "source" : 22, "target" : 10, "value" : 72 }, { "source" : 22, "target" : 0, "value" : 95 }, { "source" : 23, "target" : 15, "value" : 107 }, { "source" : 23, "target" : 12, "value" : 311 }, { "source" : 23, "target" : 6, "value" : 484 }, { "source" : 24, "target" : 18, "value" : 73 } ] ;
var nodes = [ { "name" : "funny" }, { "name" : "atheism" }, { "name" : "AdviceAnimals" }, { "name" : "WTF" }, { "name" : "atheism" }, { "name" : "gaming" }, { "name" : "funny" }, { "name" : "aww" }, { "name" : "reddit.com" }, { "name" : "funny" }, { "name" : "gifs" }, { "name" : "GifSound" }, { "name" : "pics" }, { "name" : "aww" }, { "name" : "gaming" }, { "name" : "gifs" }, { "name" : "reddit.com" }, { "name" : "gifs" }, { "name" : "funny" }, { "name" : "WTF" }, { "name" : "pics" }, { "name" : "pics" }, { "name" : "GifSound" }, { "name" : "WTF" }, { "name" : "WTF" } ] ;
var margin = {top: 1, right: 1, bottom: 6, left: 1},
    width = 600 - margin.left - margin.right,
    height = 500 - margin.top - margin.bottom;

var formatNumber = d3.format(",.0f"),
    format = function(d) { return formatNumber(d) + " submissions"; },
    color = d3.scale.category20();

var svg = d3.select("#chart").append("svg")
    .attr("width", width + margin.left + margin.right)
    .attr("height", height + margin.top + margin.bottom)
  .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

var sankey = d3.sankey()
    .nodes(d3.values(nodes))
    .links(links)
    .nodeWidth(15)
    .nodePadding(10)
    .size([width, height])
    .layout(32);

var path = sankey.link();

var link = svg.append("g").selectAll(".link")
    .data(sankey.links())
  .enter().append("path")
    .attr("class", "link")
    .attr("d", path)
    .style("stroke-width", function(d) { return Math.max(1, d.dy); })
    .sort(function(a, b) { return b.dy - a.dy; });

link.append("title")
    .text(function(d) { return d.source.name + " → " + d.target.name + "\n" + format(d.value); });

var node = svg.append("g").selectAll(".node")
    .data(sankey.nodes())
  .enter().append("g")
    .attr("class", "node")
    .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
    .call(d3.behavior.drag()
      .origin(function(d) { return d; })
      .on("dragstart", function() { this.parentNode.appendChild(this); })
      .on("drag", dragmove));

node.append("rect")
    .attr("height", function(d) { return d.dy; })
    .attr("width", sankey.nodeWidth())
    .style("fill", function(d) { return d.color = color(d.name.replace(/ .*/, "")); })
    .style("stroke", function(d) { return d3.rgb(d.color).darker(2); })
  .append("title")
    .text(function(d) { return d.name + "\n" + format(d.value); });

node.append("text")
    .attr("x", -6)
    .attr("y", function(d) { return d.dy / 2; })
    .attr("dy", ".35em")
    .attr("text-anchor", "end")
    .attr("transform", null)
    .text(function(d) { return d.name; })
  .filter(function(d) { return d.x < width / 2; })
    .attr("x", 6 + sankey.nodeWidth())
    .attr("text-anchor", "start");

function dragmove(d) {
  d3.select(this).attr("transform", "translate(" + d.x + ","
      + (d.y = Math.max(0, Math.min(height - d.dy, d3.event.y))) + ")");
  sankey.relayout();
  link.attr("d", path);
}
</script>
</body>
{
"metadata": {
"name": "",
"signature": "sha256:55a6975379af1311b33f0008769415352a95457fc9ef5db824df6bf8820fe1a4"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I'm currently working on the final project for my data visualization course. The dataset that I've chosen to work with can be [downloaded here](http://snap.stanford.edu/data/web-Reddit.html) -- it's a compendium Reddit resubmissions over a period of several years (ie, images that were submitted to more than one and/or to multiple subreddits). I waffled for a long time trying to decide what the best way to visualize the *flow* of images through various subreddits would be, but just in the nick of time, I stumbled across Christopher Gandrud's new [d3Network package for R](http://christophergandrud.github.io/d3Network/), and that was enough cause for me to settle on a Sankey diagram. If you've never heard of Reddit, the illustrious CPG Grey will enlighten you."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.display import YouTubeVideo\n",
"YouTubeVideo('tlI022aUWQQ')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
" <iframe\n",
" width=\"400\"\n",
" height=300\"\n",
" src=\"https://www.youtube.com/embed/tlI022aUWQQ\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 1,
"text": [
"<IPython.lib.display.YouTubeVideo at 0x10780e110>"
]
}
],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The task of massaging columnar data consisting of an image ID, subreddit name, and timestamp for each submission into a more networky format suitable for this type of visualization was interesting enough that I thought it might be a good post. If nothing else, Christopher's awesome package deserves some love.\n",
"\n",
"Python is my go-to language for data munging of this calibre, so we will use a Pandas -> NetworkX -> R -> D3 worflow. Without further ado, lets load the Python modules we will need and take a look at the data."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Load modules and data\n",
"\n",
"import pandas as pd # For reading/munging the data\n",
"import networkx as nx # For creating a graph structure\n",
"from networkx.readwrite import json_graph # For exporting a graph structure\n",
"from itertools import islice # For some more interesting munging\n",
"\n",
"%load_ext rmagic \n",
"from IPython.display import HTML # To display results when we're done\n",
"\n",
"!csvclean redditSubmissions.csv\n",
"d = pd.read_csv('redditSubmissions_out.csv')\n",
"d.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"6 errors logged to redditSubmissions_err.csv\r\n"
]
},
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>#image_id</th>\n",
" <th>unixtime</th>\n",
" <th>rawtime</th>\n",
" <th>title</th>\n",
" <th>total_votes</th>\n",
" <th>reddit_id</th>\n",
" <th>number_of_upvotes</th>\n",
" <th>subreddit</th>\n",
" <th>number_of_downvotes</th>\n",
" <th>localtime</th>\n",
" <th>score</th>\n",
" <th>number_of_comments</th>\n",
" <th>username</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td> 1333172439</td>\n",
" <td> 2012-03-31T12:40:39.590113-07:00</td>\n",
" <td> And here's a downvote.</td>\n",
" <td> 63470</td>\n",
" <td> rmqjs</td>\n",
" <td> 32657</td>\n",
" <td> funny</td>\n",
" <td> 30813</td>\n",
" <td> 1333197639</td>\n",
" <td> 1844</td>\n",
" <td> 622</td>\n",
" <td> Animates_Everything</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 1333178161</td>\n",
" <td> 2012-03-31T14:16:01.093638-07:00</td>\n",
" <td> Expectation</td>\n",
" <td> 35</td>\n",
" <td> rmun4</td>\n",
" <td> 29</td>\n",
" <td> GifSound</td>\n",
" <td> 6</td>\n",
" <td> 1333203361</td>\n",
" <td> 23</td>\n",
" <td> 3</td>\n",
" <td> Gangsta_Raper</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 0</td>\n",
" <td> 1333199913</td>\n",
" <td> 2012-03-31T20:18:33.192906-07:00</td>\n",
" <td> Downvote</td>\n",
" <td> 41</td>\n",
" <td> rna86</td>\n",
" <td> 32</td>\n",
" <td> GifSound</td>\n",
" <td> 9</td>\n",
" <td> 1333225113</td>\n",
" <td> 23</td>\n",
" <td> 0</td>\n",
" <td> Gangsta_Raper</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0</td>\n",
" <td> 1333252330</td>\n",
" <td> 2012-04-01T10:52:10-07:00</td>\n",
" <td> Every time I downvote something</td>\n",
" <td> 10</td>\n",
" <td> ro7e4</td>\n",
" <td> 6</td>\n",
" <td> GifSound</td>\n",
" <td> 4</td>\n",
" <td> 1333277530</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> Gangsta_Raper</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 0</td>\n",
" <td> 1333272954</td>\n",
" <td> 2012-04-01T16:35:54.393381-07:00</td>\n",
" <td> Downvote &amp;quot;Dies Irae&amp;quot;</td>\n",
" <td> 65</td>\n",
" <td> rooof</td>\n",
" <td> 57</td>\n",
" <td> GifSound</td>\n",
" <td> 8</td>\n",
" <td> 1333298154</td>\n",
" <td> 49</td>\n",
" <td> 0</td>\n",
" <td> Gangsta_Raper</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 13 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
" #image_id unixtime rawtime \\\n",
"0 0 1333172439 2012-03-31T12:40:39.590113-07:00 \n",
"1 0 1333178161 2012-03-31T14:16:01.093638-07:00 \n",
"2 0 1333199913 2012-03-31T20:18:33.192906-07:00 \n",
"3 0 1333252330 2012-04-01T10:52:10-07:00 \n",
"4 0 1333272954 2012-04-01T16:35:54.393381-07:00 \n",
"\n",
" title total_votes reddit_id number_of_upvotes \\\n",
"0 And here's a downvote. 63470 rmqjs 32657 \n",
"1 Expectation 35 rmun4 29 \n",
"2 Downvote 41 rna86 32 \n",
"3 Every time I downvote something 10 ro7e4 6 \n",
"4 Downvote &quot;Dies Irae&quot; 65 rooof 57 \n",
"\n",
" subreddit number_of_downvotes localtime score number_of_comments \\\n",
"0 funny 30813 1333197639 1844 622 \n",
"1 GifSound 6 1333203361 23 3 \n",
"2 GifSound 9 1333225113 23 0 \n",
"3 GifSound 4 1333277530 2 0 \n",
"4 GifSound 8 1333298154 49 0 \n",
"\n",
" username \n",
"0 Animates_Everything \n",
"1 Gangsta_Raper \n",
"2 Gangsta_Raper \n",
"3 Gangsta_Raper \n",
"4 Gangsta_Raper \n",
"\n",
"[5 rows x 13 columns]"
]
}
],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that I first called `csvclean`, from the [csvkit suite of command line utilities](http://csvkit.readthedocs.org/en/latest/index.html). The bang (!) symbol calls the command line from IPython. `csvclean` fixes a couple of formatting errors in the original dataset that interfere with R/Panda's parsing functions (something to do with quotes or commas in the \"title\" field, I believe). The repaired CSV is saved with a `_out` prepended to the filename. Nothing fancy is required for the `read_csv` call in our case.\n",
"\n",
"Now for the hard/interesting part. How do we map the flow of each image submission through the various subreddits?\n",
" - First we sort by image and (crucially) timetamp on line 4.\n",
" - On line 5, I simply extract the 3 columns that we care about. \n",
" - Now we drop resubmissions of each image to the *same* subreddit with drop_duplicates on line 6, which only keeps each image's *first* submission to a particular subreddit (why we sorted first). \n",
" - The last thing we need Pandas for is to group by image ID (line 7).\n",
"\n",
"On line 7, we pull the list of subreddits (now unique and nicely ordered) for each image. The nested list comprehension is necessary only because calling `.subreddit` on the groupby object `g` returns a tuple by default, and we'd rather have a list of lists."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Identify the order in which each image is submitted to various subreddits, \n",
"# removing repeats within a subreddit\n",
"\n",
"g = d.sort(['#image_id', 'unixtime'])\\\n",
" .ix[:,['#image_id', 'unixtime', 'subreddit']]\\\n",
" .drop_duplicates(cols = ['#image_id', 'subreddit'])\\\n",
" .groupby('#image_id')\n",
"flow = [[el for el in x[1]] for x in list(g.subreddit)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
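{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the shape of `flow` concrete: each entry is the ordered list of distinct subreddits that one image visited. An image first posted to \"pics\" and later reposted to \"funny\" and \"WTF\" would contribute an entry like this (illustrative values, not actual rows from the dataset):\n",
"\n",
"```python\n",
"flow[42] # hypothetical index\n",
"# ['pics', 'funny', 'WTF']\n",
"```"
]
},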
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we need a function `window` that rolls along each of the lists in `flow` and connects every subsequent pair of subreddits that a particular image was submitted to. We'll do this with the help of the wonderful `itertools` module, creating two dimensional tuples that encode the \"from\" subreddit and the \"to\" subreddit, respectively. In lines 14 and 15, we apply the function and flatten the result to a single list.\n",
"\n",
"In order to truly capture the \"flow,\" however, we need to distinguish between the \"gifs\" subreddit node where images are popping up for the first time and the \"gifs\" subreddit node when the image has already appeared in another subreddit (say, \"pics\"). The `enumerate` in line 14 does this by tacking on the ordinality to the name of the node, admittedly very hacky, but we have a lot of tuples floating around already."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Roll along the list of subreddits each image has been submitted to, \n",
"# creating an edge tuple for each subsequent pair\n",
"def window(seq, n=2):\n",
" '''Returns a sliding window (of width n) over data from the iterable\n",
" s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...'''\n",
" it = iter(seq)\n",
" result = tuple(islice(it, n))\n",
" if len(result) == n:\n",
" yield result \n",
" for elem in it:\n",
" result = result[1:] + (elem,)\n",
" yield result\n",
"\n",
"sankey = [list(window([str(i) + x for i, x in enumerate(sub)])) for sub in flow]\n",
"sankey = [item for sublist in sankey for item in sublist] # flatten"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
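{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check on lines 14 and 15 above, here is what the `enumerate` tagging and `window` produce for a single made-up flow (hypothetical subreddit sequence, not taken from the data):\n",
"\n",
"```python\n",
"sub = ['pics', 'funny', 'WTF'] # hypothetical flow\n",
"tagged = [str(i) + x for i, x in enumerate(sub)]\n",
"# ['0pics', '1funny', '2WTF']\n",
"list(window(tagged))\n",
"# [('0pics', '1funny'), ('1funny', '2WTF')]\n",
"```"
]
},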
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At last we have a list of edges that on some level describes the flow that we are trying to get at. Now we can just iterate through them and use `NetworkX` to create the graph and weight the edges appropriately. In lines 10--13, I prune back the tiny edges that clutter up the diagram, and then the nodes that are no longer associated with any edges. Last but not least, we export the structure to a JSON in line 16."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create network structure\n",
"S = nx.DiGraph()\n",
"for edge in sankey:\n",
" if S.has_edge(*edge):\n",
" S[edge[0]][edge[1]]['weight'] +=1\n",
" else:\n",
" S.add_edge(*edge, weight = 1)\n",
" \n",
"# Trim edges\n",
"S.remove_edges_from([x for x in S.edges(data=True) if x[2]['weight'] < 50])\n",
"flagged = [x for x, el in S.out_degree().items() if (x[0] != '3') & (el == 0)]\n",
"S.remove_edges_from([x for x in S.edges(data=True) if x[1] in flagged])\n",
"S.remove_nodes_from([x for x, n in S.degree().items() if n == 0])\n",
"\n",
"# Export\n",
"json_graph.dump(S, open('sankey.json', 'w'))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
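{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the exported `sankey.json` uses NetworkX's node-link format: a `nodes` array whose entries carry the ordinality-tagged names as `id`, and a `links` array of integer index pairs with the accumulated weights. The shape is roughly the following (an abridged sketch; names and weights are illustrative):\n",
"\n",
"```python\n",
"# abridged sketch of sankey.json; values are illustrative\n",
"{\"nodes\": [{\"id\": \"0pics\"}, {\"id\": \"1funny\"}],\n",
" \"links\": [{\"source\": 0, \"target\": 1, \"weight\": 335}]}\n",
"```\n",
"\n",
"These `nodes` and `links` arrays are what `JSONtoDF` pulls back out on the R side."
]
},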
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Time for R!\n",
"\n",
"We will need to make sure that `d3Network` is installed to the the instance of R that is used by IPython's `Rmagic` via `Rpy2`. It was different for me (I think?) so if you are running something like this for the first time, include the lines that are commented out.\n",
"\n",
"The `%%R` denotes block-level R magicks in IPython (`%R` will give you line-level magicks)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%R\n",
"#install.packages('devtools')\n",
"#library(devtools)\n",
"#devtools::install_github(\"christophergandrud/d3Network\")\n",
"library(d3Network)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is finally the point at which Christopher Gandrud's package simplifies everything for us. We simply read in the nodes and linkes (edges) from the JSON file (they get converted to two dataframes). Note that we have to strip the janky ordinality numbers that we tacked onto the node names (line 3). Now that different nodes have the same names, the package will even make sure that each subreddit node has the same color every time it appears!\n",
"\n",
"The call to `d3Sankey` points to the the nodes dataframe, the links dataframe, the name of the sources/targets in the links dataframe, the name of the column that holds the link weights, and then some display configuration stuff."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%R\n",
"nodes <- JSONtoDF(file = paste0('sankey.json'), array = 'nodes')\n",
"nodes$id <- substring(nodes$id, 2)\n",
"links <- JSONtoDF(file = paste0('sankey.json'), array = 'links')\n",
"d3Sankey(Nodes = nodes, Links = links, Source = 'source',\n",
" Target = 'target', Value = 'weight', NodeID = 'id', \n",
" width = 600, height = 500, fontsize = 12,\n",
" standAlone = TRUE, iframe = TRUE, file = '../extra/sankey.html')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"text": [
"<iframe src='../extra/sankey.html' height=535 width=618></iframe>"
]
}
],
"prompt_number": 7
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can render the resulting iframe directly in our IPython notebook! Hover over edges for some nice brushing or click and drag the nodes to untangle a relationship you're interested in."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"HTML('<iframe src=\"../extra/sankey.html\" height=540 width=700 frameBorder=\"0\"></iframe>')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<iframe src=\"../extra/sankey.html\" height=540 width=700 frameBorder=\"0\"></iframe>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"<IPython.core.display.HTML at 0x10780e8d0>"
]
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This type of \"tiered\" Sankey diagram is a little unconventional, but so far its the best way I can come up with to visualize the interesting phenomenon of submisison flow through Reddit. Leave a comment if this gives you any interesting ideas, I'd love to hear them!"
]
}
],
"metadata": {}
}
]
}