public
Last active

  • Download Gist
numbajenks2.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
import json
from pprint import pprint as pp
 
import numpy as np
from numba import autojit, typeof, int32
 
# Sentinel used to seed the variance matrix: any real classing beats it.
INF = float('inf')
 
@autojit
def jenks_matrics_init(data, n_classes):
    """Allocate and seed the two Jenks dynamic-programming matrices.

    Parameters
    ----------
    data : numpy.ndarray
        1-D array of values to classify (assumed sorted by the caller).
    n_classes : int
        Number of classes (breaks) requested.

    Returns
    -------
    (lower_class_limits, variance_combinations)
        Two arrays of shape ``(len(data) + 1, n_classes + 1)`` with
        ``dtype=data.dtype``; row/column 0 are unused padding so the DP
        in ``jenks_matrices`` can index from 1.
    """
    # Fill the matrices with len(data)+1 rows of n_classes+1 zeros.
    n_data = len(data)
    lower_class_limits = np.zeros((n_data + 1, n_classes + 1), dtype=data.dtype)
    variance_combinations = np.zeros((n_data + 1, n_classes + 1), dtype=data.dtype)

    # NOTE: use range (not Py2-only xrange) for consistency with
    # jenks_matrices and Python 3 compatibility.
    for i in range(1, n_classes + 1):
        lower_class_limits[1, i] = 1.
        variance_combinations[1, i] = 0.
        # Longer prefixes start at +inf so any real classing computed by
        # jenks_matrices compares as an improvement.
        for j in range(2, n_data + 1):
            variance_combinations[j, i] = INF

    return lower_class_limits, variance_combinations
 
@autojit
def jenks_matrices(data, n_classes, lower_class_limits, variance_combinations):
    """Fill the Jenks DP matrices in place and return them.

    For every prefix length ``l`` and class count ``j`` this records the
    lower class limit giving the minimal within-class variance sum, using
    the matrices seeded by ``jenks_matrics_init``.

    Parameters
    ----------
    data : numpy.ndarray
        Sorted 1-D array of values.
    n_classes : int
        Number of classes requested.
    lower_class_limits, variance_combinations : numpy.ndarray
        Matrices from ``jenks_matrics_init``; mutated in place.
    """
    variance = 0.0
    for l in range(2, len(data) + 1):
        # Running totals over the candidate class ending at element l.
        # Renamed from `sum` to avoid shadowing the builtin.
        running_sum = 0.0
        sum_squares = 0.0
        w = 0.0
        for m in range(1, l + 1):
            # `III` originally
            lower_class_limit = l - m + 1
            val = data[lower_class_limit - 1]

            # here we're estimating variance for each potential classing
            # of the data, for each potential number of classes. `w`
            # is the number of data points considered so far.
            w += 1

            # increase the current sum and sum-of-squares
            running_sum += val
            sum_squares += val * val

            # the variance at this point in the sequence is the difference
            # between the sum of squares and the mean squared, over the
            # number of samples.
            variance = sum_squares - (running_sum * running_sum) / w

            i4 = lower_class_limit - 1

            if i4 != 0:
                # Keep this classing for (l, j) if it is no worse than the
                # best found so far.
                for j in range(2, n_classes + 1):
                    if variance_combinations[l, j] >= (variance + variance_combinations[i4, j - 1]):
                        lower_class_limits[l, j] = lower_class_limit
                        variance_combinations[l, j] = variance + variance_combinations[i4, j - 1]

        # A single class over the first l elements is always limit 1 with
        # the full-prefix variance.
        lower_class_limits[l, 1] = 1.
        variance_combinations[l, 1] = variance

    return lower_class_limits, variance_combinations
 
def get_jenks_breaks(data, lower_class_limits, n_classes):
    """Trace the ``lower_class_limits`` matrix backwards to extract breaks.

    Parameters
    ----------
    data : numpy.ndarray
        Sorted 1-D array of values.
    lower_class_limits : numpy.ndarray
        Matrix produced by ``jenks_matrices``.
    n_classes : int
        Number of classes requested.

    Returns
    -------
    numpy.ndarray
        ``n_classes + 1`` break values; element 0 is the data minimum and
        element ``n_classes`` the data maximum.
    """
    k = int(len(data) - 1)
    kclass = np.zeros(n_classes + 1, dtype=data.dtype)

    # The outermost breaks are always the data extremes.
    kclass[n_classes] = data[len(data) - 1]
    kclass[0] = data[0]

    # Walk back from the last row, resolving one class boundary per step.
    # (range, not Py2-only xrange, for consistency with the other loops.)
    for count_num in range(n_classes, 1, -1):
        elt = int(lower_class_limits[k, count_num] - 2)
        kclass[count_num - 1] = data[elt]
        k = int(lower_class_limits[k, count_num] - 1)
    return kclass
 
def jenks(data, n_classes):
    """Compute Jenks natural breaks over `data` with `n_classes` classes.

    Returns an array of n_classes + 1 break values, or None when more
    classes are requested than there are data points.

    NOTE: sorts the caller's array in place.
    """
    if n_classes > len(data):
        return None

    data.sort()

    limits, variances = jenks_matrics_init(data, n_classes)
    jenks_matrices(data, n_classes, limits, variances)

    return get_jenks_breaks(data, limits, n_classes)
 
def main():
    """Load sample data from test.json and print its 5-class Jenks breaks."""
    # `with` guarantees the file handle is closed; the original
    # `json.load(open('test.json'))` leaked it.
    with open('test.json') as f:
        rawdata = json.load(f)
    data = np.array(rawdata)
    pp(jenks(data, 5).tolist())

if __name__ == "__main__":
    main()
time.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
jenks2$ time python numbajenks2.py
[0.0028109620325267315,
2.0935479691252112,
4.205495140049607,
6.178148351609707,
8.09175917180255,
9.997982932254672]
 
real 0m0.865s
user 0m0.735s
sys 0m0.127s
 
jenks2$ time python jenks2.py
[0.0028109620325267315,
2.0935479691252112,
4.205495140049607,
6.178148351609707,
8.09175917180255,
9.997982932254672]
 
real 0m4.390s
user 0m4.373s
sys 0m0.016s
 
Alternatively, measured with IPython's %timeit:
 
%timeit jenks2.main()
1 loops, best of 3: 4.58 s per loop
 
%timeit numbajenks2.main()
1 loops, best of 3: 18.3 ms per loop

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.