Skip to content

Instantly share code, notes, and snippets.

@homerhanumat
Last active February 26, 2018 11:39
Show Gist options
  • Save homerhanumat/d6570cbb6920f86c3e2bb6482fac63b0 to your computer and use it in GitHub Desktop.
Save homerhanumat/d6570cbb6920f86c3e2bb6482fac63b0 to your computer and use it in GitHub Desktop.
Regression to the Mean
license: MIT
height: 1000
border: no
<html>
<head>
<h3>Regression to the Mean</h3>
<style>
/* Page layout: auto side margins on a table-display body, with a small gap above. */
body {
margin: 0 auto;
margin-top: 1em;
display: table;
font-family: "Helvetica Neue", sans-serif;
}
/* Keep the explanatory paragraphs to a readable width. */
p {
max-width: 700px;
}
svg {
padding-bottom: 16px;
}
/* Least-squares regression line (class set in rhoHandler). */
.regression {
stroke-width: 2px;
stroke: blue;
}
/* Standard-deviation line, drawn dashed (class set in rhoHandler). */
.sdline {
stroke-width: 2px;
stroke: red;
stroke-dasharray: 10, 5;
}
/* The two text lines under the chart: regression equation and r. */
.equation {
font-size: 12px;
margin-top: 5px;
text-align: center;
}
/* Axis titles ("x" and "y"). */
.axistext {
font-size: 14px;
}
#selector {
margin-left: 50px;
}
/* Group holding the draggable rectangle; hidden until selectHandler shows it. */
.kmeans {
visibility: hidden;
}
/* Path of near-neighbor means; hidden until selectHandler shows it. */
.meansPlot {
visibility: hidden;
}
/* Dot marking the mean inside the rectangle; hidden until selectHandler or the drag handler shows it. */
.meanpoint {
visibility: hidden;
}
/* The draggable rectangle is drawn half-transparent. */
.kmeans rect {
opacity: 0.5;
}
/* Horizontal line marking the mean y-value inside the rectangle. */
.meanlevel {
stroke-width: 3px;
stroke: darkgreen;
}
</style>
</head>
<body>
<div>
<select id="selector">
<option value="noshow" selected>Do not show y-means</option>
<option value="rect">Show mean of moving range</option>
<option value="all">Show all means</option>
</select>
<label for="rho">Target Correlation: </label>
<input id="rho" type="number" value="0.5" step="0.01" min="0" max="1">
</div>
<div class="chart"></div>
<div class="equation"></div>
<div class="equation"></div>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script>
// started from
//:http://blockbuilder.org/uredkar/e87f0cf2925cd5a78ba919bf7a279e1d
// setup: plot-area size is a 450 x 450 drawing minus the margins kept for axes
const margin = { top: 33, right: 5, bottom: 20, left: 50 };
const width = 450 - margin.left - margin.right;
const height = 450 - margin.top - margin.bottom;

// number of points in the scatterplot
const numberOfPoints = 500;

// global holding state shared between the handlers
const elemInfo = {
  // latest mean of the y-values inside the draggable rectangle
  meanInRange: undefined,
};

// add listeners
document.querySelector("#selector").addEventListener("input", selectHandler);
const rhoInput = document.querySelector("#rho");
rhoInput.addEventListener("input", rhoHandler);

// kick-off: a synthetic input event draws the first chart
rhoInput.dispatchEvent(new Event('input'));
/// FUNCTIONS..................................................
/**
 * "input" listener for the #rho field: simulates a fresh sample with the
 * requested correlation and rebuilds the whole chart from scratch.
 *
 * Side effects: empties and refills the .chart container, resets the
 * #selector dropdown to "noshow", rewrites both .equation lines and
 * re-initialises elemInfo.meanInRange for the new sample.
 *
 * @param {Event} e - the input event (unused; the value is read from `this`).
 */
function rhoHandler(e) {
  const rho = Number(this.value);
  // A correlation outside [-1, 1] makes bvn() take the square root of a
  // negative number, so every y would be NaN. Keep the current chart instead.
  if (!(Math.abs(rho) <= 1)) {
    return;
  }
  // Arguments are positional: `bvn(n = ..., rho = ...)` is an assignment
  // expression in JavaScript and used to leak an implicit global `n`.
  const data = bvn(numberOfPoints, rho);

  document.querySelector(".chart").innerHTML = "";
  document.querySelector("#selector").value = "noshow";

  const svg = d3.select(".chart").append("svg")
    .attr("width", width + margin.left + margin.right)
    .attr("height", height + margin.top + margin.bottom)
    .append("g")
    .attr("transform", `translate(${margin.left},${margin.top})`);

  const x = d3.scaleLinear().range([0, width]);
  const y = d3.scaleLinear().range([height, 0]);
  const xAxis = d3.axisBottom().scale(x);
  const yAxis = d3.axisLeft().scale(y);

  // axis title variables
  const yaxistext = "y";
  const xaxistext = "x";

  // text label for the x axis
  svg.append("text")
    .attr("class", "axistext")
    .attr("transform", `translate(${width - margin.left} ,${height + margin.top})`)
    .style("text-anchor", "middle")
    .text(xaxistext);

  // text label for the y axis
  svg.append("text")
    .attr("class", "axistext")
    .attr("transform", "rotate(-90)")
    .attr("y", 0 - margin.left)
    .attr("x", 0 - (height / 2))
    .attr("dy", "1em")
    .style("text-anchor", "middle")
    .text(yaxistext);

  // calculate scales, then add basic axes and all points
  y.domain(d3.extent(data, (d) => d.y));
  x.domain(d3.extent(data, (d) => d.x));
  svg.append("g")
    .attr("class", "x axis")
    .attr("transform", `translate(0,${height})`)
    .call(xAxis);
  svg.append("g")
    .attr("class", "y axis")
    .call(yAxis);
  svg.selectAll(".point")
    .data(data)
    .enter().append("circle")
    .attr("class", "point")
    .attr("r", 2)
    .attr("cy", (d) => y(d.y))
    .attr("cx", (d) => x(d.x));

  // r, slope, regression and SD lines
  const xSeries = data.map((d) => d.x);
  const ySeries = data.map((d) => d.y);
  const { slope, intercept, rSquare, xBar, yBar, sdRatio } =
    leastSquares(xSeries, ySeries);
  const [xMin, xMax] = d3.extent(xSeries);

  // regression line equation (plain text, so textContent is sufficient)
  const equations = document.getElementsByClassName("equation");
  equations[0].textContent =
    "y = " + round(slope, 2) + "x + " + round(intercept, 2);
  // r carries the sign of the slope; sqrt(rSquare) alone is never negative
  const r = Math.sign(slope) * Math.sqrt(rSquare);
  equations[1].textContent = "r = " + round(r, 2);

  // add trendline across the full x-range
  const fitAt = (xv) => intercept + slope * xv;
  svg.append("line")
    .attr("class", "regression")
    .attr("x1", x(xMin))
    .attr("y1", y(fitAt(xMin)))
    .attr("x2", x(xMax))
    .attr("y2", y(fitAt(xMax)));

  // add SD line: through the point of averages with slope sd(y) / sd(x)
  const sdLineAt = (xv) => yBar + sdRatio * (xv - xBar);
  svg.append("line")
    .attr("class", "sdline")
    .attr("x1", x(xMin))
    .attr("y1", y(sdLineAt(xMin)))
    .attr("x2", x(xMax))
    .attr("y2", y(sdLineAt(xMax)));

  // draggable rectangle, initially centred on the mean of x
  const bandPx = 40;
  const bandLeft = x(xBar) - bandPx / 2;
  // Start the shared state from the new sample, so a mean left over from the
  // previous chart cannot decide what selectHandler shows.
  const initialMean = meanInRange(data, x.invert(bandLeft),
    x.invert(bandLeft + bandPx));
  elemInfo.meanInRange = initialMean;
  const initialLevel = y(Number.isNaN(initialMean) ? yBar : initialMean);

  const kmeans = svg.append("g")
    .attr("class", "kmeans")
    .attr("transform", `translate(${bandLeft}, 0)`);
  kmeans.append("rect")
    .attr("x", 0)
    .attr("y", 0)
    .attr("width", bandPx)
    .attr("height", height)
    .attr("fill", "palegreen")
    .attr("cursor", "move")
    .attr("pointer-events", "all");
  const meanLine = kmeans.append("line")
    .attr("class", "meanlevel")
    .attr("x1", 0)
    .attr("x2", bandPx)
    .attr("y1", initialLevel)
    .attr("y2", initialLevel);
  const meanPoint = kmeans.append("circle")
    .attr("class", "meanpoint")
    .attr("r", 4)
    .attr("fill", "darkgreen")
    .attr("cx", bandPx / 2)
    .attr("cy", initialLevel);

  // `function` (not an arrow) because d3 binds `this` to the dragged group
  const dragHandler = d3.drag().on("drag", function () {
    const left = d3.event.x;
    // ignore drags that would push the rectangle outside the plot area
    if (left < 0 || left > width - bandPx) {
      return;
    }
    d3.select(this).attr("transform", `translate(${left}, 0)`);
    const mean = meanInRange(data, x.invert(left), x.invert(left + bandPx));
    elemInfo.meanInRange = mean;
    if (isNaN(mean)) {
      // no points under the rectangle: nothing to mark
      meanLine.style("visibility", "hidden");
      meanPoint.style("visibility", "hidden");
    } else {
      meanLine.attr("y1", y(mean)).attr("y2", y(mean));
      meanPoint.attr("cy", y(mean));
      meanLine.style("visibility", "visible");
      meanPoint.style("visibility", "visible");
    }
  });
  dragHandler(kmeans);

  // make line graph of near-neighbor means over 200 steps across the x-range
  const bandWidth = 0.2;
  const step = (xMax - xMin) / 200;
  const pathPoints = [];
  for (const xv of d3.range(xMin, xMax, step)) {
    const mean = meanInRange(data, xv - bandWidth, xv + bandWidth);
    if (!isNaN(mean)) {
      pathPoints.push({ x: xv, y: mean });
    }
  }
  const line = d3.line()
    .x((d) => x(d.x))
    .y((d) => y(d.y));
  svg.append("path")
    .datum(pathPoints)
    .attr("class", "meansPlot")
    .attr("fill", "none")
    .attr("stroke", "darkgray")
    .attr("stroke-linejoin", "round")
    .attr("stroke-linecap", "round")
    .attr("stroke-width", 2.0)
    .attr("d", line);
}
/**
 * Box-Muller transform: maps two uniform draws to a pair of values
 * `u` and `v` (used by bvn as independent standard normals).
 *
 * @param {number} a - uniform draw; Math.random() yields values in [0, 1).
 * @param {number} b - uniform draw giving the angle as a fraction of a turn.
 * @returns {{u: number, v: number}} the transformed pair.
 */
function gaussian(a, b) {
  // Math.random() can return exactly 0, and log(0) is -Infinity; substitute
  // the smallest positive double so the radius stays finite.
  const safeA = a === 0 ? Number.MIN_VALUE : a;
  const radius = Math.sqrt(-2 * Math.log(safeA));
  const angle = 2 * Math.PI * b;
  return {
    u: radius * Math.cos(angle),
    v: radius * Math.sin(angle),
  };
}
/**
 * Simulate `n` points from a bivariate normal distribution.
 *
 * Each point is built from two independent draws (u, v) returned by
 * gaussian(): x = mu1 + sigma1 * u, and
 * y = mu2 + sigma2 * (rho * u + sqrt(1 - rho^2) * v).
 *
 * @param {number} [n=200] - number of points to generate.
 * @param {number|string} [rho=0.5] - target correlation; coerced with
 *   Number(). For |rho| > 1 the square root is NaN, so every y is NaN.
 * @param {number} [mu1=0] - mean of x.
 * @param {number} [sigma1=1] - standard deviation of x.
 * @param {number} [mu2=0] - mean of y.
 * @param {number} [sigma2=1] - standard deviation of y.
 * @returns {Array<{x: number, y: number}>} the simulated points.
 */
function bvn(n = 200, rho = 0.5, mu1 = 0, sigma1 = 1,
  mu2 = 0, sigma2 = 1) {
  const r = Number(rho);
  // loop-invariant: weight of the independent component of y
  const residualWeight = Math.sqrt(1 - r * r);
  const data = [];
  for (let i = 0; i < n; i++) {
    // generate two independent normals (box-muller)
    const { u, v } = gaussian(Math.random(), Math.random());
    data.push({
      x: mu1 + sigma1 * u,
      // The regression term is rho * sigma2 * u, i.e.
      // rho * (sigma2 / sigma1) * (x - mu1). The earlier form used
      // (u - mu1), which only agrees when mu1 = 0 and sigma1 = 1.
      y: mu2 + sigma2 * (r * u + residualWeight * v),
    });
  }
  return data;
}
/**
 * "input" listener for the #selector dropdown: switches the visibility of
 * the draggable-rectangle group, its mean line and mean point, and the
 * near-neighbor means path according to the chosen option.
 *
 * @param {Event} e - the input event (unused; the value is read from `this`).
 */
function selectHandler(e) {
  const HIDDEN = "hidden";
  const VISIBLE = "visible";
  const targets = [".kmeans", ".meanlevel", ".meanpoint", ".meansPlot"];

  // visibility for each entry of `targets`, in the same order
  let plan;
  switch (this.value) {
    case "noshow":
      plan = [HIDDEN, HIDDEN, HIDDEN, HIDDEN];
      break;
    case "rect": {
      // global isNaN on purpose: an undefined mean must count as "no mean"
      const meanState = isNaN(elemInfo.meanInRange) ? HIDDEN : VISIBLE;
      plan = [VISIBLE, meanState, meanState, HIDDEN];
      break;
    }
    case "all":
      plan = [HIDDEN, HIDDEN, HIDDEN, VISIBLE];
      break;
    default:
      // unknown option: leave everything as it is
      return;
  }

  targets.forEach((selector, i) => {
    d3.select(selector).style("visibility", plan[i]);
  });
}
/**
 * Mean of the y-values of all points whose x lies in the closed
 * interval [a, b].
 *
 * @param {Array<{x: number, y: number}>} data - points to scan.
 * @param {number} a - lower bound on x (inclusive).
 * @param {number} b - upper bound on x (inclusive).
 * @returns {number} the mean, or NaN (0 / 0) when no point qualifies.
 */
function meanInRange(data, a, b) {
  const inside = data.filter((point) => point.x >= a && point.x <= b);
  const total = inside.reduce((acc, point) => acc + point.y, 0);
  // deliberately unguarded: an empty selection yields NaN for callers to test
  return total / inside.length;
}
// round decimals
function round(value, decimals) {
return Number(Math.round(value + 'e' + decimals) + 'e-' + decimals);
}
/**
 * Simple least-squares fit of y on x, plus the summary statistics the chart
 * needs.
 *
 * @param {number[]} xSeries - x-values.
 * @param {number[]} ySeries - y-values, paired with xSeries by index.
 * @returns {{slope: number, intercept: number, rSquare: number,
 *   xBar: number, yBar: number, sdRatio: number}} the fitted slope and
 *   intercept, the squared correlation, the two means, and
 *   sqrt(ssYY / ssXX).
 */
function leastSquares(xSeries, ySeries) {
  // no initial value on purpose: an empty series raises a TypeError
  const total = (values) => values.reduce((acc, v) => acc + v);
  const square = (v) => Math.pow(v, 2);

  const xBar = total(xSeries) * 1.0 / xSeries.length;
  const yBar = total(ySeries) * 1.0 / ySeries.length;

  // deviations from the respective means
  const dx = xSeries.map((v) => v - xBar);
  const dy = ySeries.map((v) => v - yBar);

  const ssXX = total(dx.map(square));
  const ssYY = total(dy.map(square));
  const ssXY = total(dx.map((d, i) => d * dy[i]));

  const sdRatio = Math.sqrt(ssYY / ssXX);
  const slope = ssXY / ssXX;
  const intercept = yBar - (xBar * slope);
  const rSquare = square(ssXY) / (ssXX * ssYY);

  return { slope, intercept, rSquare, xBar, yBar, sdRatio };
}
</script>
<p>This is an instructional app to illustrate regression to the mean. The
red dotted line is the <em>standard deviation line</em> that
goes through the point of averages of the x and y-values on
the scatterplot. Its slope is the ratio of the standard deviation
of the y-values to that of the x-values. The blue line is the
regression line, which also contains the point of averages but which
has a slope equal to the correlation coefficient times the slope of
the SD-line. Most beginning students regard the SD-line as a superior
summary of the scatterplot.
</p>
<p>Nevertheless the regression line is much better for predicting
y-value from a known x-value. The point of the app is to realize this.
</p>
<p>There is an option to compute means of y-values in a
draggable rectangle or to view a plot of near-neighbor means.
Either way, the regression line emerges as the superior predictor
of the mean y-value for a given x-value.
</p>
<p>If an individual is <em>s</em> standard deviations above the mean with respect
to her x-value, she is predicted to be <em>rs</em> standard deviations above
the mean with respect to her y-value: above average, but not so strikingly above
average as her x-value was.</p>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment