Skip to content

Instantly share code, notes, and snippets.

@gsakkis
Created April 21, 2011 20:46
Show Gist options
  • Save gsakkis/935425 to your computer and use it in GitHub Desktop.
Save gsakkis/935425 to your computer and use it in GitHub Desktop.
automatic binning for integer data histograms
import sys
from itertools import groupby
from operator import itemgetter
def bincount_histogram(nums, num_bins=sys.maxint, min_bin_count=1):
num_bins = max(num_bins, 1)
min_bin_count = max(min_bin_count, 1)
# initialize the bins of width=1
bin_edges, counts = [], []
for num, group in groupby(sorted(nums)):
bin_edges.append(num)
counts.append(sum(1 for _ in group))
# append the rightmost edge
bin_edges.append(bin_edges[-1] + 1)
num_counts = len(counts)
while num_counts:
# find the least populated bin
j, min_count = min(enumerate(counts), key=itemgetter(1))
# stop looping as soon as the bins have been reduced to `num_bins`
# (or less) and no bin contains less than `min_bin_count`
if num_counts <= num_bins and min_count >= min_bin_count:
break
# merge the min_count bin with a neighbor
if j == 0 or j < num_counts-1 and counts[j-1] > counts[j+1]:
# leftmost bin or with left neighbor bigger than right:
# merge into right neighbor
del bin_edges[j+1]
counts[j+1] += min_count
else:
# rightmost bin or with right neighbor bigger or equal than left:
# merge into left neighbor
del bin_edges[j]
counts[j-1] += min_count
del counts[j]
num_counts -= 1
return counts, bin_edges
def validate_bincount_histogram(nums, num_bins=sys.maxint, min_bin_count=1):
assert all(isinstance(x,int) for x in nums)
assert len(nums) >= max(1, min_bin_count)
counts, bin_edges = bincount_histogram(nums, num_bins, min_bin_count)
assert len(bin_edges) == len(counts) + 1
assert bin_edges == sorted(bin_edges)
assert sum(counts) == len(nums)
assert len(counts) <= num_bins
assert all(c>=min_bin_count for c in counts)
from numpy import histogram
counts2, bin_edges2 = histogram(nums, bins=bin_edges)
assert counts == list(counts2)
assert bin_edges == list(bin_edges2)
if __name__ == '__main__':
from numpy.random import randint
i = 0
for _ in xrange(100):
for size in 10, 100:
for num_bins in sys.maxint, randint(1,size+1):
for min_bin_count in 1, randint(1,size+1):
for k in 0.1, 0.5, 1.0, 2.0, 10:
nums = randint(-k*size, k*size, size=size)
try:
validate_bincount_histogram(nums, num_bins, min_bin_count)
i += 1
except AssertionError:
print "nums:", nums
print "num_bins:", num_bins
print "min_bin_count:", min_bin_count
raise
print '%d tests ran successfully' % i
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment