Skip to content

Instantly share code, notes, and snippets.

@sklam
Last active February 23, 2018 04:48
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sklam/4979921 to your computer and use it in GitHub Desktop.
Save sklam/4979921 to your computer and use it in GitHub Desktop.
import json
from pprint import pprint as pp
import numpy as np
from numba import autojit, typeof, int32
INF = float('inf')
@autojit
def jenks_matrics_init(data, n_classes, ):
    """Allocate and seed the two Jenks DP working matrices.

    Returns (lower_class_limits, variance_combinations), each of shape
    (len(data) + 1, n_classes + 1) with dtype data.dtype.  Row 1 is seeded
    as the DP base case; deeper rows of variance_combinations start at
    +inf so any real classing immediately beats them.
    """
    n_data = len(data)
    shape = (n_data + 1, n_classes + 1)
    lower_class_limits = np.zeros(shape, dtype=data.dtype)
    variance_combinations = np.zeros(shape, dtype=data.dtype)
    for cls in xrange(1, n_classes + 1):
        lower_class_limits[1, cls] = 1.
        variance_combinations[1, cls] = 0.
        for row in xrange(2, n_data + 1):
            # nothing computed for this prefix yet: worst possible variance
            variance_combinations[row, cls] = INF
    return lower_class_limits, variance_combinations
@autojit
def jenks_matrices(data, n_classes, lower_class_limits, variance_combinations):
    """Fill the Jenks natural-breaks DP matrices in place.

    data is assumed sorted ascending (jenks() sorts before calling).  For
    each prefix length l and class count j, variance_combinations[l, j]
    holds the minimal total within-class variance, and
    lower_class_limits[l, j] the 1-based start index of the last class in
    that optimal split.  Both matrices are mutated in place and returned.
    """
    variance = 0.0
    for l in xrange(2, len(data) + 1):
        # running sums over the candidate last class (a suffix of data[:l]);
        # renamed from `sum` so the builtin is not shadowed
        sum_vals = 0.0
        sum_squares = 0.0
        # `w` is the number of data points considered so far
        w = 0.0
        for m in xrange(1, l + 1):
            # 1-based start index of a last class holding the m rightmost
            # points of data[:l]  (`III` in the original Fortran)
            lower_class_limit = l - m + 1
            val = data[lower_class_limit - 1]
            w += 1
            sum_vals += val
            sum_squares += val * val
            # within-class sum of squared deviations for this candidate
            # class: sum(x^2) - (sum(x))^2 / n
            variance = sum_squares - (sum_vals * sum_vals) / w
            i4 = lower_class_limit - 1
            if i4 != 0:
                for j in xrange(2, n_classes + 1):
                    # adopt this split if it is no worse than the best so far
                    if variance_combinations[l, j] >= (variance + variance_combinations[i4, j - 1]):
                        lower_class_limits[l, j] = lower_class_limit
                        variance_combinations[l, j] = variance + variance_combinations[i4, j - 1]
        # single-class case: the whole prefix is one class starting at 1,
        # and `variance` now holds the full-prefix variance (m == l pass)
        lower_class_limits[l, 1] = 1.
        variance_combinations[l, 1] = variance
    return lower_class_limits, variance_combinations
def get_jenks_breaks(data, lower_class_limits, n_classes):
    """Walk lower_class_limits backwards and extract the break values.

    data must be the sorted array the matrices were built from.  Returns a
    length-(n_classes + 1) array: element 0 is the data minimum, element
    n_classes the maximum, and the values between are the class breaks.
    """
    # NOTE(review): the canonical Jenks implementation starts the walk at
    # k = len(data); len(data) - 1 looks off by one -- confirm against a
    # reference before relying on exact break positions.
    k = int(len(data) - 1)
    kclass = np.zeros(n_classes + 1, dtype=data.dtype)
    kclass[n_classes] = data[len(data) - 1]
    kclass[0] = data[0]
    # `range` instead of Python-2-only `xrange`: the iteration is identical
    # in Python 2, and the function stays importable under Python 3.
    for count_num in range(n_classes, 1, -1):
        elt = int(lower_class_limits[k, count_num] - 2)
        kclass[count_num - 1] = data[elt]
        k = int(lower_class_limits[k, count_num] - 1)
    return kclass
def jenks(data, n_classes):
    """Compute Jenks natural breaks splitting `data` into `n_classes`.

    Sorts `data` IN PLACE before running the DP.  Returns the array of
    break values, or None when more classes than data points are asked for.
    """
    if n_classes > len(data):
        return
    data.sort()
    limits, variances = jenks_matrics_init(data, n_classes)
    jenks_matrices(data, n_classes, limits, variances)
    return get_jenks_breaks(data, limits, n_classes)
def main():
    """Load test.json, compute 5-class Jenks breaks, pretty-print them."""
    # `with` closes the file handle; the original left it dangling open
    with open('test.json') as fh:
        data = np.array(json.load(fh))
    pp(jenks(data, 5).tolist())


if __name__ == "__main__":
    main()
jenks2$ time python numbajenks2.py
[0.0028109620325267315,
2.0935479691252112,
4.205495140049607,
6.178148351609707,
8.09175917180255,
9.997982932254672]
real 0m0.865s
user 0m0.735s
sys 0m0.127s
jenks2$ time python jenks2.py
[0.0028109620325267315,
2.0935479691252112,
4.205495140049607,
6.178148351609707,
8.09175917180255,
9.997982932254672]
real 0m4.390s
user 0m4.373s
sys 0m0.016s
Or,
%timeit jenks2.main()
1 loops, best of 3: 4.58 s per loop
%timeit numbajenks2.main()
1 loops, best of 3: 18.3 ms per loop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment