Skip to content

Instantly share code, notes, and snippets.

@sklam
Last active February 23, 2018 04:48
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sklam/4979921 to your computer and use it in GitHub Desktop.
Save sklam/4979921 to your computer and use it in GitHub Desktop.
import json
from pprint import pprint as pp
import numpy as np
from numba import autojit, typeof, int32
INF = float('inf')
@autojit
def jenks_matrics_init(data, n_classes, ):
    """Allocate and seed the two Jenks DP working matrices.

    Returns (lower_class_limits, variance_combinations), each of shape
    (len(data) + 1, n_classes + 1) with dtype data.dtype.  Row 1 is seeded
    as the DP base case; deeper rows of variance_combinations start at
    +inf so any real classing immediately beats them.
    """
    n_data = len(data)
    shape = (n_data + 1, n_classes + 1)
    lower_class_limits = np.zeros(shape, dtype=data.dtype)
    variance_combinations = np.zeros(shape, dtype=data.dtype)
    for cls in xrange(1, n_classes + 1):
        lower_class_limits[1, cls] = 1.
        variance_combinations[1, cls] = 0.
        for row in xrange(2, n_data + 1):
            # nothing computed for this prefix yet: worst possible variance
            variance_combinations[row, cls] = INF
    return lower_class_limits, variance_combinations
@autojit
def jenks_matrices(data, n_classes, lower_class_limits, variance_combinations):
    """Fill the Jenks natural-breaks DP matrices in place.

    data is assumed sorted ascending (jenks() sorts before calling).  For
    each prefix length l and class count j, variance_combinations[l, j]
    holds the minimal total within-class variance, and
    lower_class_limits[l, j] the 1-based start index of the last class in
    that optimal split.  Both matrices are mutated in place and returned.
    """
    variance = 0.0
    for l in xrange(2, len(data) + 1):
        # running sums over the candidate last class (a suffix of data[:l]);
        # renamed from `sum` so the builtin is not shadowed
        sum_vals = 0.0
        sum_squares = 0.0
        # `w` is the number of data points considered so far
        w = 0.0
        for m in xrange(1, l + 1):
            # 1-based start index of a last class holding the m rightmost
            # points of data[:l]  (`III` in the original Fortran)
            lower_class_limit = l - m + 1
            val = data[lower_class_limit - 1]
            w += 1
            sum_vals += val
            sum_squares += val * val
            # within-class sum of squared deviations for this candidate
            # class: sum(x^2) - (sum(x))^2 / n
            variance = sum_squares - (sum_vals * sum_vals) / w
            i4 = lower_class_limit - 1
            if i4 != 0:
                for j in xrange(2, n_classes + 1):
                    # adopt this split if it is no worse than the best so far
                    if variance_combinations[l, j] >= (variance + variance_combinations[i4, j - 1]):
                        lower_class_limits[l, j] = lower_class_limit
                        variance_combinations[l, j] = variance + variance_combinations[i4, j - 1]
        # single-class case: the whole prefix is one class starting at 1,
        # and `variance` now holds the full-prefix variance (m == l pass)
        lower_class_limits[l, 1] = 1.
        variance_combinations[l, 1] = variance
    return lower_class_limits, variance_combinations
def get_jenks_breaks(data, lower_class_limits, n_classes):
    """Walk lower_class_limits backwards and extract the break values.

    data must be the sorted array the matrices were built from.  Returns a
    length-(n_classes + 1) array: element 0 is the data minimum, element
    n_classes the maximum, and the values between are the class breaks.
    """
    # NOTE(review): the canonical Jenks implementation starts the walk at
    # k = len(data); len(data) - 1 looks off by one -- confirm against a
    # reference before relying on exact break positions.
    k = int(len(data) - 1)
    kclass = np.zeros(n_classes + 1, dtype=data.dtype)
    kclass[n_classes] = data[len(data) - 1]
    kclass[0] = data[0]
    # `range` instead of Python-2-only `xrange`: the iteration is identical
    # in Python 2, and the function stays importable under Python 3.
    for count_num in range(n_classes, 1, -1):
        elt = int(lower_class_limits[k, count_num] - 2)
        kclass[count_num - 1] = data[elt]
        k = int(lower_class_limits[k, count_num] - 1)
    return kclass
def jenks(data, n_classes):
    """Compute Jenks natural breaks splitting `data` into `n_classes`.

    Sorts `data` IN PLACE before running the DP.  Returns the array of
    break values, or None when more classes than data points are asked for.
    """
    if n_classes > len(data):
        return
    data.sort()
    limits, variances = jenks_matrics_init(data, n_classes)
    jenks_matrices(data, n_classes, limits, variances)
    return get_jenks_breaks(data, limits, n_classes)
def main():
    """Load test.json, compute 5-class Jenks breaks, pretty-print them."""
    # `with` closes the file handle; the original left it dangling open
    with open('test.json') as fh:
        data = np.array(json.load(fh))
    pp(jenks(data, 5).tolist())


if __name__ == "__main__":
    main()
jenks2$ time python numbajenks2.py
[0.0028109620325267315,
2.0935479691252112,
4.205495140049607,
6.178148351609707,
8.09175917180255,
9.997982932254672]
real 0m0.865s
user 0m0.735s
sys 0m0.127s
jenks2$ time python jenks2.py
[0.0028109620325267315,
2.0935479691252112,
4.205495140049607,
6.178148351609707,
8.09175917180255,
9.997982932254672]
real 0m4.390s
user 0m4.373s
sys 0m0.016s
Or,
%timeit jenks2.main()
1 loops, best of 3: 4.58 s per loop
%timeit numbajenks2.main()
1 loops, best of 3: 18.3 ms per loop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment