Skip to content

Instantly share code, notes, and snippets.

@jensgrubert
Forked from mbostock/.block
Last active April 18, 2017 18:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jensgrubert/7777399 to your computer and use it in GitHub Desktop.
Save jensgrubert/7777399 to your computer and use it in GitHub Desktop.
Histogram and Kernel density estimation

Kernel density estimation is a method of estimating the probability distribution of a random variable based on a random sample. In contrast to a histogram, kernel density estimation produces a smooth estimate. The smoothness can be tuned via the kernel’s bandwidth parameter. With the correct choice of bandwidth, important features of the distribution can be seen, while an incorrect choice results in undersmoothing or oversmoothing and obscured features.

This example shows a histogram and a kernel density estimation for times between eruptions of Old Faithful Geyser in Yellowstone National Park, taken from R’s faithful dataset. The data follow a bimodal distribution; short eruptions are followed by a wait time averaging about 55 minutes, and long eruptions by a wait time averaging about 80 minutes. In recent years, wait times have been increasing, possibly due to the effects of earthquakes on the geyser’s geohydrology.

This example is based on the version by Mike Bostock. It makes the parameters for changing histogram and kernel density plot parameters more accessible.

[79,54,74,62,85,55,88,85,51,85,54,84,78,47,83,52,62,84,52,79,51,47,78,69,74,83,55,76,78,79,73,77,66,80,74,52,48,80,59,90,80,58,84,58,73,83,64,53,82,59,75,90,54,80,54,83,71,64,77,81,59,84,48,82,60,92,78,78,65,73,82,56,79,71,62,76,60,78,76,83,75,82,70,65,73,88,76,80,48,86,60,90,50,78,63,72,84,75,51,82,62,88,49,83,81,47,84,52,86,81,75,59,89,79,59,81,50,85,59,87,53,69,77,56,88,81,45,82,55,90,45,83,56,89,46,82,51,86,53,79,81,60,82,77,76,59,80,49,96,53,77,77,65,81,71,70,81,93,53,89,45,86,58,78,66,76,63,88,52,93,49,57,77,68,81,81,73,50,85,74,55,77,83,83,51,78,84,46,83,55,81,57,76,84,77,81,87,77,51,78,60,82,91,53,78,46,77,84,49,83,71,80,49,75,64,76,53,94,55,76,50,82,54,75,78,79,78,78,70,79,70,54,86,50,90,54,54,77,79,64,75,47,86,63,85,82,57,82,67,74,54,83,73,73,88,80,71,83,56,79,78,84,58,83,43,60,75,81,46,90,46,74]
<!DOCTYPE html>
<meta charset="utf-8">
<title>Histogram and Kernel Density Plot</title>
<style>
<!-- based on http://bl.ocks.org/mbostock/4341954 -->
body {
font: 10px sans-serif;
}
.bar {
fill: #bbb;
shape-rendering: crispEdges;
}
.line {
fill: none;
stroke: #000;
stroke-width: 1.5px;
}
.axis path,
.axis line {
fill: none;
stroke: #000;
shape-rendering: crispEdges;
}
.y.axis path {
display: none;
}
</style>
<body>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script>
// USER DEFINABLE VARIABLES START
var numHistBins = 10; // number of bins for the histogram
var calcHistBinsAutmoatic = true; // if true, the number of bins are calculated automatically and
// numHistBins is overwritten
var showKDP = true; // show the kernel density plot?
var bandwith = 4; // bandwith (smoothing constant) h of the kernel density estimator
var dataFN = "./faithful.json"; // the filename of the data to be visualized
// USER DEFINABLE VARIABLES END
var margin = {top: 20, right: 30, bottom: 30, left: 40},
width = 960 - margin.left - margin.right,
height = 500 - margin.top - margin.bottom;
// the x-scale parameters
var x = d3.scale.linear()
.domain([30, 110])
.range([0, width]);
// the y-scale parameters
var y = d3.scale.linear()
.domain([0, .15])
.range([height, 0]);
var xAxis = d3.svg.axis()
.scale(x)
.orient("bottom");
var yAxis = d3.svg.axis()
.scale(y)
.orient("left")
.tickFormat(d3.format("%"));
var line = d3.svg.line()
.x(function(d) { return x(d[0]); })
.y(function(d) { return y(d[1]); });
// the histogram function
var histogram;
var svg = d3.select("body").append("svg")
.attr("width", width + margin.left + margin.right)
.attr("height", height + margin.top + margin.bottom)
.append("g")
.attr("transform", "translate(" + margin.left + "," + margin.top + ")");
// draw the background
svg.append("g")
.attr("class", "x axis")
.attr("transform", "translate(0," + height + ")")
.call(xAxis)
.append("text")
.attr("class", "label")
.attr("x", width)
.attr("y", -6)
.style("text-anchor", "end")
.text("Time between Eruptions (min.)");
svg.append("g")
.attr("class", "y axis")
.call(yAxis);
// draw the histogram and kernel density plot
d3.json(dataFN, function(error, faithful) {
// calculate the number of histogram bins
if( calcHistBinsAutmoatic == true) {
numHistBins = Math.ceil(Math.sqrt(faithful.length)); // global variable
}
// the histogram function
histogram = d3.layout.histogram()
.frequency(false)
.bins(numHistBins);
//.bins(x.ticks(500));
var data = histogram(faithful);
//var kde = kernelDensityEstimator(epanechnikovKernel(7), x.ticks(100));
var kde = kernelDensityEstimator(epanechnikovKernel(bandwith), x.ticks(100));
//alert("kde is " + kde.toSource());
//console.log(svg.datum(kde(faithful)));
svg.selectAll(".bar")
.data(data)
.enter().insert("rect", ".axis")
.attr("class", "bar")
.attr("x", function(d) { return x(d.x) + 1; })
.attr("y", function(d) { return y(d.y); })
.attr("width", x(data[0].dx + data[0].x) - x(data[0].x) - 1)
.attr("height", function(d) { return height - y(d.y); });
// show the kernel density plot
if(showKDP == true) {
svg.append("path")
.datum(kde(faithful))
.attr("class", "line")
.attr("d", line);
}
});
function kernelDensityEstimator(kernel, x) {
return function(sample) {
return x.map(function(x) {
//console.log(x + " ... " + d3.mean(sample, function(v) { return kernel(x - v); }));
return [x, d3.mean(sample, function(v) { return kernel(x - v); })];
});
};
}
function epanechnikovKernel(bandwith) {
return function(u) {
//return Math.abs(u /= bandwith) <= 1 ? .75 * (1 - u * u) / bandwith : 0;
if(Math.abs(u = u / bandwith) <= 1) {
return 0.75 * (1 - u * u) / bandwith;
} else return 0;
};
}
</script>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment