This page contains C++ code for an O(k×n×log(n)) method that classifies an array of numeric values such that the variance per class is minimal, known as Fisher's Natural Breaks Classification.
/*
copyright
This code is written by Maarten Hilferink, © Object Vision BV, and is provided
under the GNU GPL v3.0 license.
https://www.geodms.nl/CalcNaturalBreaks
This page contains C++ code for an O(k×n×log(n)) method for the classification of
an array of numeric values such that the variance per class is minimal, known as
Fisher's Natural Breaks Classification. This algorithm is an improvement on
Jenks' Natural Breaks Optimization method, which is an O(k×n²) algorithm. See:
Fisher's Natural Breaks Classification complexity proof.
This code is derived from clc/dll/src1/CalcClassBreaks.cpp in our Subversion
repository, which implements the ClassifyJenksFisher operation in the [GeoDms].
It is independently compilable and runnable, finds the Jenks-Fisher class breaks
for an array of random numbers, and is made available here.
*/
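/*
Illustrative example (added, not from the original text): for the nine values
{1, 2, 3, 10, 11, 12, 30, 31, 32} and k = 3, the natural breaks are {1, 10, 30};
the classes {1,2,3}, {10,11,12} and {30,31,32} minimize the within-class variance,
which is the quantity the Jenks-Fisher criterion optimizes. The first break is
always the minimum of the dataset.
*/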
#include <algorithm>
#include <assert.h>
#include <iostream>
#include <vector>
typedef std::size_t SizeT;
typedef SizeT CountType;
typedef std::pair<double, CountType> ValueCountPair;
typedef std::vector<double> LimitsContainer;
typedef std::vector<ValueCountPair> ValueCountPairContainer;
// helper funcs
template <typename T> T Min(T a, T b) { return (a<b) ? a : b; }
SizeT GetTotalCount(const ValueCountPairContainer& vcpc)
{
SizeT sum = 0;
ValueCountPairContainer::const_iterator
i = vcpc.begin(),
e = vcpc.end();
for(sum = 0; i!=e; ++i)
sum += (*i).second;
return sum;
}
// helper struct JenksFisher
struct JenksFisher // captures the intermediate data and methods for the calculation of Natural Class Breaks.
{
JenksFisher(const ValueCountPairContainer& vcpc, SizeT k)
: m_M(vcpc.size())
, m_K(k)
, m_BufSize(vcpc.size()-(k-1))
, m_PrevSSM(m_BufSize)
, m_CurrSSM(m_BufSize)
, m_CB(m_BufSize * (m_K-1))
, m_CBPtr()
{
m_CumulValues.reserve(vcpc.size());
double cwv=0;
CountType cw = 0, w;
for(SizeT i=0; i!=m_M; ++i)
{
assert(!i || vcpc[i].first > vcpc[i-1].first); // PRECONDITION: the value sequence must be strictly increasing
w = vcpc[i].second;
assert(w > 0); // PRECONDITION: all weights must be positive
cw += w;
assert(cw > w || !i); // No overflow? No loss of precision?
cwv += w * vcpc[i].first;
m_CumulValues.push_back(ValueCountPair(cwv, cw));
if (i < m_BufSize)
m_PrevSSM[i] = cwv * cwv / cw; // prepare SSM for first class. Last (k-1) values are omitted
}
}
double GetW(SizeT b, SizeT e)
// Gets sum of weights for elements b..e.
{
assert(b); // First element always belongs to class 0, thus queries should never include it.
assert(b<=e);
assert(e<m_M);
double res = m_CumulValues[e].second;
res -= m_CumulValues[b-1].second;
return res;
}
double GetWV(SizeT b, SizeT e)
// Gets sum of weighted values for elements b..e
{
assert(b);
assert(b<=e);
assert(e<m_M);
double res = m_CumulValues[e].first;
res -= m_CumulValues[b-1].first;
return res;
}
double GetSSM(SizeT b, SizeT e)
// Gets the Squared Mean for elements b..e, multiplied by weight.
// Note that n*mean^2 = sum^2/n when mean := sum/n
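// For weighted data the analogous identity is W*mean^2 = (sum of w*v)^2 / (sum of w),
// which is exactly GetWV(b,e)^2 / GetW(b,e) as computed below.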
{
double res = GetWV(b,e);
return res * res / GetW(b,e);
}
SizeT FindMaxBreakIndex(SizeT i, SizeT bp, SizeT ep)
// finds CB[i+m_NrCompletedRows] given that
// the result is at least bp+(m_NrCompletedRows-1)
// and less than ep+(m_NrCompletedRows-1)
// Complexity: O(ep-bp) <= O(m)
{
assert(bp < ep);
assert(bp <= i);
assert(ep <= i+1);
assert(i < m_BufSize);
assert(ep <= m_BufSize);
double minSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows);
SizeT foundP = bp;
while (++bp < ep)
{
double currSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows);
if (currSSM > minSSM)
{
minSSM = currSSM;
foundP = bp;
}
}
m_CurrSSM[i] = minSSM;
return foundP;
}
void CalcRange(SizeT bi, SizeT ei, SizeT bp, SizeT ep)
// find CB[i+m_NrCompletedRows]
// for all i>=bi and i<ei given that
// the results are at least bp+(m_NrCompletedRows-1)
// and less than ep+(m_NrCompletedRows-1)
// Complexity: O(log(ei-bi)*Max((ei-bi),(ep-bp))) <= O(m*log(m))
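// Divide-and-conquer: the optimal break position found for the middle element mi
// bounds the candidate ranges for the two halves, as encoded in the recursive calls below.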
{
assert(bi <= ei);
assert(ep <= ei);
assert(bp <= bi);
if (bi == ei)
return;
assert(bp < ep);
SizeT mi = (bi + ei)/2;
SizeT mp = FindMaxBreakIndex(mi, bp, Min<SizeT>(ep, mi+1));
assert(bp <= mp);
assert(mp < ep);
assert(mp <= mi);
// solve first half of the sub-problems with lower 'half' of possible outcomes
CalcRange(bi, mi, bp, Min<SizeT>(mi, mp+1));
m_CBPtr[ mi ] = mp; // store result for the middle element.
// solve second half of the sub-problems with upper 'half' of possible outcomes
CalcRange(mi+1, ei, mp, ep);
}
void CalcAll()
// complexity: O(m*log(m)*k)
{
if (m_K>=2)
{
m_CBPtr = m_CB.begin();
for (m_NrCompletedRows=1; m_NrCompletedRows<m_K-1; ++m_NrCompletedRows)
{
CalcRange(0, m_BufSize, 0, m_BufSize); // complexity: O(m*log(m))
m_PrevSSM.swap(m_CurrSSM);
m_CBPtr += m_BufSize;
}
}
}
SizeT m_M, m_K, m_BufSize;
ValueCountPairContainer m_CumulValues;
std::vector<double> m_PrevSSM;
std::vector<double> m_CurrSSM;
std::vector<SizeT> m_CB;
std::vector<SizeT>::iterator m_CBPtr;
SizeT m_NrCompletedRows;
};
// GetValueCountPairs
//
// GetValueCountPairs sorts chunks of values and then merges them in order to minimize extra memory and work when many values are equal.
// This is done recursively while retaining used intermediary buffers in order to minimize heap allocations.
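// Example (illustrative): the input values {3.0, 1.0, 3.0, 2.0} yield the sorted
// unique pairs {(1.0,1), (2.0,1), (3.0,2)}.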
const SizeT BUFFER_SIZE = 1024;
void GetCountsDirect(ValueCountPairContainer& vcpc, const double* values, SizeT size)
{
assert(size <= BUFFER_SIZE);
assert(size > 0);
assert(vcpc.empty());
double buffer[BUFFER_SIZE];
std::copy(values, values+size, buffer);
std::sort(buffer, buffer+size);
double currValue = buffer[0];
SizeT currCount = 1;
for (SizeT index = 1; index != size; ++index)
{
if (currValue < buffer[index])
{
vcpc.push_back(ValueCountPair(currValue, currCount));
currValue = buffer[index];
currCount = 1;
}
else
++currCount;
}
vcpc.push_back(ValueCountPair(currValue, currCount));
}
struct CompareFirst
{
bool operator () (const ValueCountPair& lhs, const ValueCountPair& rhs)
{
return lhs.first < rhs.first;
}
};
void MergeToLeft(ValueCountPairContainer& vcpcLeft, const ValueCountPairContainer& vcpcRight, ValueCountPairContainer& vcpcDummy)
{
assert(vcpcDummy.empty());
vcpcDummy.swap(vcpcLeft);
vcpcLeft.resize(vcpcRight.size() + vcpcDummy.size());
std::merge(vcpcRight.begin(), vcpcRight.end(), vcpcDummy.begin(), vcpcDummy.end(), vcpcLeft.begin(), CompareFirst());
ValueCountPairContainer::iterator
currPair = vcpcLeft.begin(),
lastPair = vcpcLeft.end();
ValueCountPairContainer::iterator index = currPair+1;
while (index != lastPair && currPair->first < index->first)
{
currPair = index;
++index;
}
double currValue = currPair->first;
SizeT currCount = currPair->second;
for (; index != lastPair;++index)
{
if (currValue < index->first)
{
*currPair++ = ValueCountPair(currValue, currCount);
currValue = index->first;
currCount = index->second;
}
else
currCount += index->second;
}
*currPair++ = ValueCountPair(currValue, currCount);
vcpcLeft.erase(currPair, lastPair);
vcpcDummy.clear();
}
struct ValueCountPairContainerArray : std::vector<ValueCountPairContainer>
{
void resize(SizeT k)
{
assert(capacity() >= k);
while (size() < k)
{
push_back(ValueCountPairContainer());
back().reserve(BUFFER_SIZE);
}
}
void GetValueCountPairs(ValueCountPairContainer& vcpc, const double* values, SizeT size, unsigned int nrUsedContainers)
{
assert(vcpc.empty());
if (size <= BUFFER_SIZE)
GetCountsDirect(vcpc, values, size);
else
{
resize(nrUsedContainers+2);
unsigned int m = size/2;
GetValueCountPairs(vcpc, values, m, nrUsedContainers);
GetValueCountPairs(begin()[nrUsedContainers], values + m, size - m, nrUsedContainers+1);
MergeToLeft(vcpc, begin()[nrUsedContainers], begin()[nrUsedContainers+1]);
begin()[nrUsedContainers].clear();
}
assert(GetTotalCount(vcpc) == size);
}
};
void GetValueCountPairs(ValueCountPairContainer& vcpc, const double* values, SizeT n)
{
vcpc.clear();
if(n)
{
ValueCountPairContainerArray vcpca;
// max nr of halvings is log2(max cardinality / BUFFER_SIZE); max cardinality is SizeT(-1)
vcpca.reserve(3+8*sizeof(SizeT)-10);
vcpca.GetValueCountPairs(vcpc, values, n, 0);
assert(vcpc.size());
}
}
void ClassifyJenksFisherFromValueCountPairs(LimitsContainer& breaksArray, SizeT k, const ValueCountPairContainer& vcpc)
{
breaksArray.resize(k);
SizeT m = vcpc.size();
assert(k <= m); // PRECONDITION
if (!k)
return;
JenksFisher jf(vcpc, k);
if (k > 1)
{
jf.CalcAll();
SizeT lastClassBreakIndex = jf.FindMaxBreakIndex(jf.m_BufSize-1, 0, jf.m_BufSize);
while (--k)
{
breaksArray[k] = vcpc[lastClassBreakIndex+k].first;
assert(lastClassBreakIndex < jf.m_BufSize);
if (k > 1)
{
jf.m_CBPtr -= jf.m_BufSize;
lastClassBreakIndex = jf.m_CBPtr[lastClassBreakIndex];
}
}
assert(jf.m_CBPtr == jf.m_CB.begin());
}
assert( k == 0 );
breaksArray[0] = vcpc[0].first; // break for the first class is the minimum of the dataset.
}
// test code
#include "boost/random.hpp"
int main(int c, char** argv)
{
const double rangeMin = 0.0;
const double rangeMax = 10.0;
typedef boost::uniform_real<double> NumberDistribution;
typedef boost::mt19937 RandomNumberGenerator;
typedef boost::variate_generator<RandomNumberGenerator&, NumberDistribution> Generator;
NumberDistribution distribution(rangeMin, rangeMax);
RandomNumberGenerator generator;
generator.seed(0); // fixed seed for reproducible results
Generator numberGenerator(generator, distribution);
const int n = 1000000;
const int k = 10;
std::cout << "Generating random numbers..." << std::endl;
std::vector<double> values;
values.reserve(n);
for (int i=0; i!=n; ++i)
{
double v = numberGenerator();
values.push_back(v*v); // populate a distribution slightly more interesting than uniform, with a lower density at higher values.
}
assert(values.size() == n);
std::cout << "Generating sortedUniqueValueCounts ..." << std::endl;
ValueCountPairContainer sortedUniqueValueCounts;
GetValueCountPairs(sortedUniqueValueCounts, &values[0], n);
std::cout << "Finding Jenks ClassBreaks..." << std::endl;
LimitsContainer resultingbreaksArray;
ClassifyJenksFisherFromValueCountPairs(resultingbreaksArray, k, sortedUniqueValueCounts);
std::cout << "Reporting results..." << std::endl;
for (double breakValue: resultingbreaksArray)
std::cout << breakValue << std::endl << std::endl;
std::cout << "Press a char and <enter> to terminate" << std::endl;
char ch;
std::cin >> ch; // wait for user to enter a key
} // main
/*
* https://github.com/pschoepf/naturalbreaks/blob/master/src/main/javascript/JenksFisher.js
* Port of Jenks/Fisher breaks originally created in C by Maarten Hilferink.
*
* Copyright (C) {2015} {Philipp Schoepf}
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Internal helper used for assertions.
*
* @param {boolean} correct The boolean input.
* @param {String} msg Message to be thrown.
*/
function assert(correct, msg) {
//alert("correct="+correct+",\n\nactual="+actual);
if (!correct)
throw new Error("ASSERTION FAILED: " + msg);
}
/**
* Internal helper to swap two arrays.
*
* @param {Array} a Array one.
* @param {Array} b Array two and the holder of the result.
*/
function swapArrays(a, b) {
var temp = a.slice(0);
a.length = 0;
a.push.apply(a, b);
b.length = 0;
b.push.apply(b, temp);
}
/**
* Constructor that initializes main variables used in fisher calculation of natural breaks.
*
* @param vcpc Ordered list of pairs of values to occurrence counts.
* @param k Number of breaks to find.
*/
function JenksFisher(/*array of {val:double, count:double}*/ vcpc, /*int*/ k) {
if (!(this instanceof JenksFisher)) {
return new JenksFisher(vcpc, k);
}
this.cumulValues = [/*{cwv: cumulative weighted value as double, cw: cumulative weight as double}*/];
this.numValues = 0;
this.numBreaks = 0;
this.bufferSize = 0;
this.previousSSM = [];
this.currentSSM = [];
this.classBreaks = [];
this.classBreaksIndex = 0;
this.completedRows = 0;
this.numValues = vcpc.length;
this.numBreaks = k;
this.bufferSize = (vcpc.length - (k - 1));
this.previousSSM = new Array(this.bufferSize);
this.currentSSM = new Array(this.bufferSize);
this.classBreaks = new Array(this.bufferSize * (this.numBreaks - 1));
this.classBreaksIndex = 0;
this.completedRows = 0;
var cwv = 0.0;
var cw = 0, w = 0;
//console.debug("init:: values and weight:"+vcpc.length+", buffSize:"+this.bufferSize+", content:"+JSON.stringify(vcpc));
for (var i = 0; i != this.numValues; ++i)
{
assert(!i || vcpc[i].val > vcpc[i - 1].val); // PRECONDITION: the value sequence must be strictly increasing
w = vcpc[i].count;
assert(w > 0); // PRECONDITION: all weights must be positive
cw += w;
assert(cw >= w); // No overflow? No loss of precision?
cwv += w * vcpc[i].val;
this.cumulValues.push({cwv: cwv, cw: cw});
if (i < this.bufferSize)
this.previousSSM[i] = cwv * cwv / cw; // prepare SSM for first class. Last (k-1) values are omitted
}
//console.debug("init:: CumulValues:"+this.cumulValues);
//console.debug("init:: previousSSM:"+this.previousSSM)
}
/**
* Gets sum of weights for elements with index b..e.
*
* @param b index of begin element
* @param e index of end element
* @return sum of weights.
*/
JenksFisher.prototype.getSumOfWeights = function (/*int*/ b, /*int*/ e)
{
assert(b); // First element always belongs to class 0, thus queries should never include it.
assert(b <= e);
assert(e < this.numValues);
var res = this.cumulValues[e].cw;
res -= this.cumulValues[b - 1].cw;
return res;
}
/**
* Gets sum of weighted values for elements with index b..e
*
* @param b index of begin element.
* @param e index of end element
* @return the cumul. sum of the values*weight
*/
JenksFisher.prototype.getSumOfWeightedValues = function (/*int*/ b, /*int*/ e)
// Gets sum of weighted values for elements b..e
{
assert(b);
assert(b <= e);
assert(e < this.numValues);
var res = this.cumulValues[e].cwv;
res -= this.cumulValues[b - 1].cwv;
return res;
}
/**
* Gets the Squared Mean for elements within index b..e, multiplied by weight. Note that
* n*mean^2 = sum^2/n when mean := sum/n
*
* @param b index of begin element
* @param e index of end element
* @return the sum of squared mean
*/
JenksFisher.prototype.getSSM = function (/*int*/ b, /*int*/ e)
{
var res = this.getSumOfWeightedValues(b, e);
return res * res / this.getSumOfWeights(b, e);
}
/**
* Finds CB[i+completedRows] given that the result is at least
* bp+(completedRows-1) and less than ep+(completedRows-1)
* Complexity: O(ep-bp) <= O(m)
*
* @param i index of the element whose optimal break is sought
* @param bp lowest candidate break position (inclusive)
* @param ep highest candidate break position (exclusive)
*
* @return the index
*/
JenksFisher.prototype.findMaxBreakIndex = function (/*int*/ i, /*int*/ bp, /*int*/ ep)
{
assert(bp < ep);
assert(bp <= i);
assert(ep <= i + 1);
assert(i < this.bufferSize);
assert(ep <= this.bufferSize);
var minSSM = this.previousSSM[bp] + this.getSSM(bp + this.completedRows, i + this.completedRows);
var foundP = bp;
while (++bp < ep)
{
var currSSM = this.previousSSM[bp] + this.getSSM(bp + this.completedRows, i + this.completedRows);
if (currSSM > minSSM)
{
//console.debug("findMaxBreakIndex:: new minSSM: " + minSSM+ ", currSSM:"+ currSSM);
minSSM = currSSM;
foundP = bp;
}
}
this.currentSSM[i] = minSSM;
return foundP;
};
/**
* Find CB[i+completedRows] for all i>=bi and i<ei given that the
* results are at least bp+(completedRows-1) and less than
* ep+(completedRows-1)
* Complexity: O(log(ei-bi)*Max((ei-bi),(ep-bp)))
* <= O(m*log(m))
*
*
* @param bi
* @param ei
* @param bp
* @param ep
* @return
*/
JenksFisher.prototype.calcRange = function (/*int*/ bi, /*int*/ ei, /*int*/ bp, /*int*/ ep)
{
assert(bi <= ei);
assert(ep <= ei);
assert(bp <= bi);
if (bi === ei)
return;
assert(bp < ep);
var mi = Math.floor((bi + ei) / 2);
var mp = this.findMaxBreakIndex(mi, bp, Math.min(ep, mi + 1));
assert(bp <= mp);
assert(mp < ep);
assert(mp <= mi);
// solve first half of the sub-problems with lower 'half' of possible outcomes
this.calcRange(bi, mi, bp, Math.min(mi, mp + 1));
this.classBreaks[this.classBreaksIndex + mi ] = mp; // store result for the middle element.
// solve second half of the sub-problems with upper 'half' of possible outcomes
this.calcRange(mi + 1, ei, mp, ep);
};
/**
* Starting point of calculation of breaks.
*
* complexity: O(m*log(m)*k)
*/
JenksFisher.prototype.calcAll = function ()
{
if (this.numBreaks >= 2)
{
this.classBreaksIndex = 0;
for (this.completedRows = 1; this.completedRows < this.numBreaks - 1; ++this.completedRows)
{
this.calcRange(0, this.bufferSize, 0, this.bufferSize); // complexity: O(m*log(m))
swapArrays(this.previousSSM, this.currentSSM);
this.classBreaksIndex += this.bufferSize;
}
}
};
/**
* Does the internal processing to actually create the breaks.
*
* @param k number of breaks
* @param vcpc asc ordered input of values and their occurrence counts.
*/
function classifyJenksFisherFromValueCountPairs(/*double[]*/ breaksArray, /*int*/ k, /*{val:value,count:count}*/ vcpc)
{
var m = vcpc.length;
assert(k <= m); // PRECONDITION
if (!k)
return;
var jf = new JenksFisher(vcpc, k);
if (k > 1) {
// runs the actual calculation
jf.calcAll();
var lastClassBreakIndex = jf.findMaxBreakIndex(jf.bufferSize - 1, 0, jf.bufferSize);
while (--k) {
// assign the break values to the result
breaksArray[k] = vcpc[lastClassBreakIndex + k].val;
assert(lastClassBreakIndex < jf.bufferSize);
if (k > 1)
{
jf.classBreaksIndex -= jf.bufferSize;
lastClassBreakIndex = jf.classBreaks[jf.classBreaksIndex + lastClassBreakIndex];
}
}
assert(jf.classBreaksIndex === 0); // back at the first row of class breaks, mirroring the C++ assert
}
assert(k === 0);
breaksArray[0] = vcpc[0].val; // break for the first class is the minimum of the dataset.
}
/**
* Main entry point for creation of Jenks-Fisher natural breaks.
*
* @param {type} values - array of the values; they do not need to be sorted.
* @param {type} k - number of breaks to create
* @returns {Array} - Array with breaks
*/
function createJenksFisherBreaksArray(/*double*/ values,/*int*/ k)
{
// create pairs of value->number of occurrences (weight), which is the input for the JF algorithm
var sortedUniqueValueCounts = getValueCountPairs(values);
var breaksArray = new Array(k);
if (sortedUniqueValueCounts.length>k){
classifyJenksFisherFromValueCountPairs(breaksArray, k, sortedUniqueValueCounts);
}
else {
breaksArray = [];
sortedUniqueValueCounts.forEach(function(el){
breaksArray.push(el.val)});
}
return breaksArray;
}
/**
* create list with weighted unique values, sorted ASC by values.
* @param {type} rawValues
* @returns [{val:value as int,count:weight as int},...]
*/
function getValueCountPairs(rawValues) {
var result = {};
rawValues.forEach(function (el) {
var k = "k_"+Number(el).toString();
if (result[k] == null){
result[k] = 1;
}else{
result[k] = result[k] + 1;
}
});
var sortedResults = [];
Object.keys(result).forEach(function (key) {
sortedResults.push({val: Number(key.substring(2)), count: result[key]});
});
sortedResults.sort(function (a, b) {
return a.val - b.val;
});
return sortedResults;
}
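// Usage sketch (added for illustration, not part of the original port): assuming the
// functions above are loaded in a browser console or a Node script, this classifies
// 1000 random values into 5 classes, mirroring the C++ test program above.
var sampleValues = [];
for (var s = 0; s < 1000; ++s) {
var r = Math.random() * 10;
sampleValues.push(r * r); // same squared-uniform distribution as the C++ test
}
var sampleBreaks = createJenksFisherBreaksArray(sampleValues, 5);
console.log("Jenks-Fisher breaks:", sampleBreaks);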