@GaelVaroquaux
Created June 6, 2011 10:46
Computing bins for a roughly equal partitioning of the data
import numpy as np
def _compute_bins(x, n_bins=10):
    """ Find optimal bins from a univariate distribution

    Parameters
    ===========
    x: 1D array-like
        Samples

    n_bins: integer
        Number of bins to return. The algorithm will return at most
        n_bins, but may return fewer.

    Returns
    ========
    bins: 1D array, length: n_bins + 1
        The bounds of the bins: the first value is the lower bound of
        the first bin, the second value is the upper bound of the
        first bin and the lower bound of the second bin... the last
        value is the upper bound of the last bin.

    Notes
    ======
    This function tries to return bins defined by quantiles, dividing
    the sample set into bins with an equal number of observations.
    However, if the sample has macroscopically occupied states (for
    instance with a sparse vector, where many coefficients are zero),
    the function will single these out.
    """
    # Use quantiles as bins to divide the distribution evenly
    distribution = np.asarray(x).astype(float)
    distribution.sort()
    eps = np.finfo(float).eps
    # n_bins bins need n_bins + 1 edges: evenly-spaced indices into
    # the sorted sample give approximate quantiles
    bin_indices = np.linspace(0, len(distribution) - 1,
                              n_bins + 1).astype(int)
    bins = distribution[bin_indices]
    unique_bins = np.unique(bins)
    if unique_bins.size != n_bins + 1:
        # Some bins are equal; this can be the case for sparse
        # distributions, in which many coefficients are near zero. In
        # this case, take new quantiles regularly spaced in between
        # the runs of zero-sized bins
        posterized_bins = np.searchsorted(unique_bins, bins)
        edges = np.where(np.diff(posterized_bins, n=2))[0].tolist()
        edges.insert(0, -1)
        edges.append(n_bins - 2)
        new_indices = [0]
        scale = (n_bins + 1) / float(unique_bins.size)
        for index_start, index_end in zip(edges[::2], edges[1::2]):
            # Only one value is specified for the end of a bin and the
            # start of the next: remove the duplicated value
            new_indices.pop()
            # np.linspace requires an integer number of samples
            new_indices.extend(np.linspace(
                bin_indices[index_start + 1],
                bin_indices[index_end + 1],
                int(scale * (index_end - index_start + 1))))
        new_indices[-1] = -1
        new_indices = np.array(new_indices, dtype=int)
        bins = distribution[new_indices]
        # Finally, clean up leftover duplicated bins, rather than
        # iterating on this process:
        bins = np.unique(bins)
        # If we are still low on bins (because there are very few
        # unique values in our observations), create some more by
        # subdividing bins
        if bins.size < .5 * (n_bins + 1):
            bins = np.r_[bins, .5 * (bins[1:] + bins[:-1])]
            bins.sort()
    # Avoid aliasing the bins with the data
    bins += 10 * eps
    bins[-1] = distribution[-1] + .01 * max(-distribution[0],
                                            distribution[-1])
    bins[0] = distribution[0] - eps
    return bins
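

A minimal usage sketch, not part of the original gist: the sparse sample below (90% exact zeros, a fraction chosen purely for illustration) makes plain quantiles collapse onto zero, which exercises the fix-up branch above.

if __name__ == '__main__':
    # Illustrative sparse sample: most coefficients are exactly zero
    rng = np.random.RandomState(0)
    x = np.zeros(1000)
    x[:100] = rng.exponential(size=100)
    bins = _compute_bins(x, n_bins=10)
    # Count how many observations fall into each resulting bin
    counts, _ = np.histogram(x, bins=bins)
    print("bins:  ", bins)
    print("counts:", counts)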