Extracted from simple-statistics.
-
-
Save tmcw/4977508 to your computer and use it in GitHub Desktop.
// # [Jenks natural breaks optimization](http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization) | |
// | |
// Implementations: [1](http://danieljlewis.org/files/2010/06/Jenks.pdf) (python), | |
// [2](https://github.com/vvoovv/djeo-jenks/blob/master/main.js) (buggy), | |
// [3](https://github.com/simogeo/geostats/blob/master/lib/geostats.js#L407) (works) | |
function jenks(data, n_classes) { | |
// Compute the matrices required for Jenks breaks. These matrices | |
// can be used for any classing of data with `classes <= n_classes` | |
function getMatrices(data, n_classes) { | |
// in the original implementation, these matrices are referred to | |
// as `LC` and `OP` | |
// | |
// * lower_class_limits (LC): optimal lower class limits | |
// * variance_combinations (OP): optimal variance combinations for all classes | |
var lower_class_limits = [], | |
variance_combinations = [], | |
// loop counters | |
i, j, | |
// the variance, as computed at each step in the calculation | |
variance = 0; | |
// Initialize and fill each matrix with zeroes | |
for (i = 0; i < data.length + 1; i++) { | |
var tmp1 = [], tmp2 = []; | |
for (j = 0; j < n_classes + 1; j++) { | |
tmp1.push(0); | |
tmp2.push(0); | |
} | |
lower_class_limits.push(tmp1); | |
variance_combinations.push(tmp2); | |
} | |
for (i = 1; i < n_classes + 1; i++) { | |
lower_class_limits[1][i] = 1; | |
variance_combinations[1][i] = 0; | |
// in the original implementation, 9999999 is used but | |
// since Javascript has `Infinity`, we use that. | |
for (j = 2; j < data.length + 1; j++) { | |
variance_combinations[j][i] = Infinity; | |
} | |
} | |
for (var l = 2; l < data.length + 1; l++) { | |
// `SZ` originally. this is the sum of the values seen thus | |
// far when calculating variance. | |
var sum = 0, | |
// `ZSQ` originally. the sum of squares of values seen | |
// thus far | |
sum_squares = 0, | |
// `WT` originally. This is the number of | |
w = 0, | |
// `IV` originally | |
i4 = 0; | |
// in several instances, you could say `Math.pow(x, 2)` | |
// instead of `x * x`, but this is slower in some browsers | |
// introduces an unnecessary concept. | |
for (var m = 1; m < l + 1; m++) { | |
// `III` originally | |
var lower_class_limit = l - m + 1, | |
val = data[lower_class_limit - 1]; | |
// here we're estimating variance for each potential classing | |
// of the data, for each potential number of classes. `w` | |
// is the number of data points considered so far. | |
w++; | |
// increase the current sum and sum-of-squares | |
sum += val; | |
sum_squares += val * val; | |
// the variance at this point in the sequence is the difference | |
// between the sum of squares and the total x 2, over the number | |
// of samples. | |
variance = sum_squares - (sum * sum) / w; | |
i4 = lower_class_limit - 1; | |
if (i4 !== 0) { | |
for (j = 2; j < n_classes + 1; j++) { | |
// if adding this element to an existing class | |
// will increase its variance beyond the limit, break | |
// the class at this point, setting the lower_class_limit | |
// at this point. | |
if (variance_combinations[l][j] >= | |
(variance + variance_combinations[i4][j - 1])) { | |
lower_class_limits[l][j] = lower_class_limit; | |
variance_combinations[l][j] = variance + | |
variance_combinations[i4][j - 1]; | |
} | |
} | |
} | |
} | |
lower_class_limits[l][1] = 1; | |
variance_combinations[l][1] = variance; | |
} | |
// return the two matrices. for just providing breaks, only | |
// `lower_class_limits` is needed, but variances can be useful to | |
// evaluage goodness of fit. | |
return { | |
lower_class_limits: lower_class_limits, | |
variance_combinations: variance_combinations | |
}; | |
} | |
// the second part of the jenks recipe: take the calculated matrices | |
// and derive an array of n breaks. | |
function breaks(data, lower_class_limits, n_classes) { | |
var k = data.length - 1, | |
kclass = [], | |
countNum = n_classes; | |
// the calculation of classes will never include the upper and | |
// lower bounds, so we need to explicitly set them | |
kclass[n_classes] = data[data.length - 1]; | |
kclass[0] = data[0]; | |
// the lower_class_limits matrix is used as indexes into itself | |
// here: the `k` variable is reused in each iteration. | |
while (countNum > 1) { | |
kclass[countNum - 1] = data[lower_class_limits[k][countNum] - 2]; | |
k = lower_class_limits[k][countNum] - 1; | |
countNum--; | |
} | |
return kclass; | |
} | |
if (n_classes > data.length) return null; | |
// sort data in numerical order, since this is expected | |
// by the matrices function | |
data = data.slice().sort(function (a, b) { return a - b; }); | |
// get our basic matrices | |
var matrices = getMatrices(data, n_classes), | |
// we only need lower class limits here | |
lower_class_limits = matrices.lower_class_limits; | |
// extract n_classes out of the computed matrices | |
return breaks(data, lower_class_limits, n_classes); | |
} |
@simogeo That does not appear to be a bug in the Jenks implementation, but in the usage of d3.scale.threshold. In this case, 8 breaks should be computed (8 values in the domain), corresponding to 9 colors (9 values in the range). The example you linked uses 9 breaks rather than 8, so there’s one too many values in the domain and thus some counties with an undefined class.
@mbostock Thanks for the explanation and thanks for d3, which is just an amazing library!
Hi Tom,
I don't want to be impolite, just I don't understand something. I read your article where you stated that your implementation is "literate and highly descriptive", however this implementation is the same as the others. I found the same "mistakes" and unnecessary assignments that I found in the other implementations. Does this version supposed to be corrected by us or is that version the "literate and highly descriptive" that you have written?
Thanks for your work and time,
Sorry for the misunderstanding,
I seem to recall there being a more readable version with the code on the left-hand side of the page and many nice descriptions on the right. I could be misremembering, though.
Sadly Jenks had been dropped from simple-statistics in favor of ckmeans: https://github.com/simple-statistics/simple-statistics/blob/4db0dd820ebb5bc9bd7635715a3ef8a4678e180e/CHANGELOG.md#jenks---ckmeans
@r-barnes, you probably mean http://web.archive.org/web/20130317024515/http://macwright.org/simple-statistics/docs/simple_statistics.html#section-96 . It is not online anymore from what I can see and I am not sure the formatting is correct in the archive. There is jenks code before its "Jenks natural breaks optimization" header.
Hi Tom.
You did a nice/hard job starting coding Jenks algorithm from Jenks writing but since I see black states into the given example, I just wonder if the script is stable or not.
simo, (geostats library author/maintainer).
By the way, the JS implementation embedded into geostats has been coded by Doug Curl (as mentioned inside code itself).