Skip to content

Instantly share code, notes, and snippets.

@mimno
Last active August 29, 2015 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mimno/11482432 to your computer and use it in GitHub Desktop.
Save mimno/11482432 to your computer and use it in GitHub Desktop.

We just sampled two groups of points (red and blue), and fit a regression line to each one.

How much confidence do we have in these fitted linear models? This page shows three variations of randomization tests that show us what regression lines from "similar" datasets would look like. The buttons at the top will sample a random, but similar, dataset in one of three ways. We will then fit regression lines for the randomized dataset, and then go back to the original data.

Comparing the "real" line to these replicated lines can tell us whether the original line tells us something interesting about the dataset, or if it's just fitting random noise. The numbers at the top will tell us how many of the replicated models have had a greater slope than the original model of the same color.

  1. Bootstrap. How sure are we of the slope of the lines? This test samples with replacement from the original data. Some data points may appear several times, others not at all. If the slope of the line depends a lot on a small number of outliers, we should see lots of variation in the replicated lines.

  2. Permuting y. Is the slope of the lines significant? This test randomly shuffles the y values of the data points, leaving the x values unchanged. Since shuffling removes any connection between x and y, we should expect the regression lines to be roughly flat. But for a small sample, we might get a steeper slope by chance. Does the slope of our original model lie within the range of slopes that we get by random chance?

  3. Permuting class. Are the blue points really different from the red points? This test keeps the points in the same x,y positions, but randomly swaps their class, so some blue points become red, and vice versa. Each shuffled class is then a random mix of both original classes, so we should see regression lines that are close to each other and somewhere in the middle of the original regression lines. If the two classes are really different, the original lines should lie clearly outside the range of these shuffled lines.

Copy this gist and try modifying the parameters used to generate the two classes of points. What happens to each test if the two classes have different sample sizes? Try varying the difference in slope and standard deviation (which I'm calling error).

<html>
<head>
<!-- D3 v3 is loaded over HTTPS so the script is not blocked as mixed content
     when this page itself is served over HTTPS. The charset is declared
     because the D3 v3 source contains UTF-8 characters. -->
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<style>
body { font-family: "Open Sans", "Calibri", "Verdana"; font-size: small;}
.red { color: red; }
.blue { color: blue; }
</style>
</head>
<body>
<!-- Controls: one button per randomization test, a reset button, and the
     running counts of replicated slopes that exceeded the original slope. -->
<div id="header">
<button id="bootstrap">Bootstrap</button>
<button id="shuffleY">Shuffle Y</button>
<button id="shuffleClass">Shuffle Class</button>
<button id="clear">Clear</button>
Randomized slopes &gt; original slope: <span class="blue"><span id="gtblue">0</span>/<span class="samples">0</span></span> <span class="red"><span id="gtred">0</span>/<span class="samples">0</span></span>
</div>
<!-- Container the script appends its <svg> element to. -->
<div id="plot"></div>
<script>
// Size of the drawing area, in pixels.
var width = 800;
var height = 600;
// The <svg> element, created inside #plot, that everything is drawn into.
var svg = d3.select("#plot").append("svg");
svg.attr("width", width).attr("height", height);
// Called with no arguments, d3.random.normal() returns a function that
// draws samples from a standard normal distribution (mean 0, std. dev. 1).
var normalGenerator = d3.random.normal();
// Generate n random points scattered around the line y = slope * x.
//   n:     number of points to generate
//   slope: slope of the underlying line (its intercept is 0)
//   error: multiplier applied to the standard-normal noise added to y
//   color: value stored as each point's `fill`
// Returns an array of { x, y, fill } objects. Every point takes two draws
// from normalGenerator: first the x value, then the noise for y.
var pointGenerator = function (n, slope, error, color) {
  var generated = [];
  while (generated.length < n) {
    var xValue = normalGenerator();
    var noise = error * normalGenerator();
    generated.push({ x: xValue, y: slope * xValue + noise, fill: color });
  }
  return generated;
};
// The original dataset: 30 red points followed by 30 blue points. Both
// classes use the same error (0.3) but different slopes (-1.5 vs. -0.5).
var points = [].concat(
  pointGenerator(30, -1.5, 0.3, "#f00"),
  pointGenerator(30, -0.5, 0.3, "#00f")
);
// Fit a least-squares regression line y = slope * x + intercept.
//   p: array of objects with numeric `x` and `y` properties
// Returns { slope, intercept }. The slope is the sum of cross-deviations
// (x - meanX) * (y - meanY) divided by the sum of squared x deviations;
// the intercept makes the line pass through (meanX, meanY).
var linearModel = function (p) {
  var getX = function (point) { return point.x; };
  var getY = function (point) { return point.y; };
  var meanX = d3.mean(p, getX);
  var meanY = d3.mean(p, getY);
  var crossDeviationSum = d3.sum(p, function (point) {
    return (point.x - meanX) * (point.y - meanY);
  });
  var squaredDeviationSum = d3.sum(p, function (point) {
    return (point.x - meanX) * (point.x - meanX);
  });
  var slope = crossDeviationSum / squaredDeviationSum;
  return { slope: slope, intercept: meanY - slope * meanX };
};
// [min, max] of the data along each axis, taken over all points.
var xDomain = d3.extent(points, function (d) { return d.x; });
var yDomain = d3.extent(points, function (d) { return d.y; });
// Blank margin, in pixels, kept between the data and the SVG edges.
var padding = 20;
// Linear maps from data coordinates to pixel coordinates. The y range is
// reversed so that larger y values are drawn nearer the top of the SVG.
var xScale = d3.scale.linear()
  .domain(xDomain)
  .range([padding, width - padding]);
var yScale = d3.scale.linear()
  .domain(yDomain)
  .range([height - padding, padding]);
// Small accessors and predicates, meant to be passed as callbacks.

// The fill colour stored on a point.
function fill(point) {
  return point.fill;
}

// Horizontal pixel position of a point.
function x(point) {
  return xScale(point.x);
}

// Vertical pixel position of a point.
function y(point) {
  return yScale(point.y);
}

// True when the point's fill is the red class colour.
function isRed(point) {
  return "#f00" === point.fill;
}

// True when the point's fill is the blue class colour.
function isBlue(point) {
  return "#00f" === point.fill;
}
// Draw the regression line of `model` across the full x extent of the data.
//   model: { slope, intercept } as returned by linearModel
//   color: stroke colour of the line
//   type:  used as the line's class attribute; "replicated" lines are drawn
//          thin and translucent, any other type thick and fully opaque
// The class is set immediately; the geometry and styling are applied
// through a transition that is delayed by 500 ms.
var line = function (model, color, type) {
  var isReplicate = type === "replicated";
  var predict = function (xValue) {
    return xValue * model.slope + model.intercept;
  };
  var left = xDomain[0];
  var right = xDomain[1];

  var segment = svg.append("line").attr("class", type);
  segment.transition().delay(500)
    .attr("x1", xScale(left))
    .attr("y1", yScale(predict(left)))
    .attr("x2", xScale(right))
    .attr("y2", yScale(predict(right)))
    .style("stroke", color)
    .style("stroke-width", isReplicate ? 2 : 4)
    .style("opacity", isReplicate ? 0.3 : 1.0);
};
// Draw one translucent circle per data point, coloured by its class.
var circles = svg.selectAll("circle")
  .data(points)
  .enter()
  .append("circle")
  .attr("cx", x)
  .attr("cy", y)
  .attr("r", 4)
  .style("fill", fill)
  .style("opacity", 0.3);

// Fit the models for the original data and draw them as "real" lines;
// the blue line is appended before the red one.
var realBlueModel = linearModel(points.filter(isBlue));
var realRedModel = linearModel(points.filter(isRed));
line(realBlueModel, "#00f", "real");
line(realRedModel, "#f00", "real");

// Running totals shown in the header: the number of replicates drawn so
// far, and how many of them had a slope greater than the original slope
// of the same colour.
var samples = 0;
var greaterThanRed = 0;
var greaterThanBlue = 0;
// Bootstrap: build a replicate by sampling points.length points with
// replacement from the original data, show it briefly, fit and draw a line
// for each class, then restore the original data and update the counters.
d3.select("#bootstrap").on("click", function () {
  var randomizedPoints = [];
  for (var i = 0; i < points.length; i++) {
    randomizedPoints.push( points[ Math.floor(Math.random() * points.length) ] );
  }

  // Without a key function, data() rebinds by index, so circle i now
  // represents whichever point was drawn i-th, and that point may belong
  // to the other class. The fill is therefore animated along with the
  // position; otherwise the circles on screen would show class colours
  // that disagree with the data the replicated lines are fit to.
  circles.data(randomizedPoints).transition().duration(500)
    .attr("cx", x).attr("cy", y)
    .style("fill", fill);

  var blueModel = linearModel(randomizedPoints.filter(isBlue));
  line(blueModel, "#00f", "replicated");
  var redModel = linearModel(randomizedPoints.filter(isRed));
  line(redModel, "#f00", "replicated");

  // After the replicate has been visible for a moment, move every circle
  // back to its original point and restore its original colour.
  circles.data(points).transition().delay(1000).duration(500)
    .attr("cx", x).attr("cy", y)
    .style("fill", fill);

  // Count this replicate and whether each slope exceeded the original.
  samples++;
  if (redModel.slope > realRedModel.slope) { greaterThanRed++; }
  if (blueModel.slope > realBlueModel.slope) { greaterThanBlue++; }
  d3.selectAll(".samples").text(samples);
  d3.select("#gtred").text(greaterThanRed);
  d3.select("#gtblue").text(greaterThanBlue);
});
// Permute y: build a replicate in which every point keeps its x value and
// class but receives the y value of a randomly chosen point (a random
// permutation over all points, regardless of class). Show it briefly, fit
// and draw a line per class, then restore the data and update the counters.
d3.select("#shuffleY").on("click", function () {
  var order = d3.shuffle(d3.range(0, points.length));
  var shuffled = points.map(function (point, index) {
    return { x: point.x, y: points[order[index]].y, fill: point.fill };
  });

  // Only y changes, so only the vertical position needs to be animated.
  circles.data(shuffled).transition().duration(500).attr("cy", y);

  var blueFit = linearModel(shuffled.filter(isBlue));
  line(blueFit, "#00f", "replicated");
  var redFit = linearModel(shuffled.filter(isRed));
  line(redFit, "#f00", "replicated");

  // Put the circles back at their original heights after a pause.
  circles.data(points).transition().delay(1000).duration(500).attr("cy", y);

  // Count this replicate and whether each slope exceeded the original.
  samples += 1;
  greaterThanRed += (redFit.slope > realRedModel.slope) ? 1 : 0;
  greaterThanBlue += (blueFit.slope > realBlueModel.slope) ? 1 : 0;
  d3.selectAll(".samples").text(samples);
  d3.select("#gtred").text(greaterThanRed);
  d3.select("#gtblue").text(greaterThanBlue);
});
// Permute class: build a replicate in which every point keeps its x,y
// position but receives the class colour of a randomly chosen point (a
// random permutation over all points). Show it briefly, fit and draw a
// line per class, then restore the data and update the counters.
d3.select("#shuffleClass").on("click", function () {
  var order = d3.shuffle(d3.range(0, points.length));
  var relabeled = points.map(function (point, index) {
    return { x: point.x, y: point.y, fill: points[order[index]].fill };
  });

  // Positions are unchanged; show the swapped colours at full opacity so
  // the reassignment is easy to see.
  circles.data(relabeled).transition().duration(500)
    .style("opacity", 1.0)
    .style("fill", fill);

  var blueFit = linearModel(relabeled.filter(isBlue));
  line(blueFit, "#00f", "replicated");
  var redFit = linearModel(relabeled.filter(isRed));
  line(redFit, "#f00", "replicated");

  // Fade back to the original colours and opacity after a pause.
  circles.data(points).transition().delay(1000).duration(500)
    .style("opacity", 0.3)
    .style("fill", fill);

  // Count this replicate and whether each slope exceeded the original.
  samples += 1;
  greaterThanRed += (redFit.slope > realRedModel.slope) ? 1 : 0;
  greaterThanBlue += (blueFit.slope > realBlueModel.slope) ? 1 : 0;
  d3.selectAll(".samples").text(samples);
  d3.select("#gtred").text(greaterThanRed);
  d3.select("#gtblue").text(greaterThanBlue);
});
// Clear: reset all counters to zero, refresh the numbers in the header,
// and remove every replicated line. Lines of other classes (the "real"
// lines) are left in place.
d3.select("#clear").on("click", function () {
  samples = 0;
  greaterThanRed = 0;
  greaterThanBlue = 0;

  d3.selectAll(".samples").text(samples);
  d3.select("#gtred").text(greaterThanRed);
  d3.select("#gtblue").text(greaterThanBlue);

  d3.selectAll("line.replicated").remove();
});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment