Skip to content

Instantly share code, notes, and snippets.

@henryiii
Forked from jpivarski/hist.py
Last active April 22, 2019 09:04
Show Gist options
  • Save henryiii/d545a673ea2b3225cb985c9c02ac958b to your computer and use it in GitHub Desktop.
Save henryiii/d545a673ea2b3225cb985c9c02ac958b to your computer and use it in GitHub Desktop.
Functional histogram slice proposal
import numbers
import math
import numpy
class Binning:
"Abstract superclass of all binning types. Only Regular is implemented."
def __init__(self, *args, **kwargs):
raise NotImplementedError
def __repr__(self):
raise NotImplementedError
def __len__(self):
raise NotImplementedError
def __eq__(self, other):
raise NotImplementedError
def __ne__(self, other):
raise NotImplementedError
def sliced(self, start, stop, axis, counts, wantflows=True):
"""
Return a sliced binning and counts for start:stop at a given axis.
* start and stop may (individually) be None, integers, or functions (binning, isleft) → integer or None; a function would be called with this binning and isleft=True for start and isleft=False for stop and its return value would be used in its place
* axis is an integer greater than or equal to 0 and less than len(counts.shape)
* counts is a Numpy array of bin counts for all axes
* wantflows (bool) controls whether the presence of start and stop create underflow and overflow bins (see below for details)
The start and stop index coordinates are 0-based, negative counts from the right, and are independent of whether under/overflow exist ("external"). A value of 0 is the left edge of the first regular bin and a value of num is the right edge of the last regular bin.
If start is not None and wantflows is True, the return values will have an underflow bin with all sliced data added to it. Note that start=0 does not slice, but guarantees that an (empty) underflow bin will exist in the output. If start is None or wantflows is False, the return values will only have an underflow bin if this binning does.
If stop is not None and wantflows is True, the return values will have an overflow bin with all sliced data added to it. Note that stop=num does not slice, but guarantees that an (empty) overflow bin will exist in the output. If stop is None or wantflows is False, the return values will only have an overflow bin if this binning does.
"""
raise NotImplementedError
def left(self, index):
"""
Convert an array index position into the left (low) edge of the data coordinate interval.
This index coordinate is 0-based, negative counts from the right, and it depends on whether under/overflow exist ("internal"). An index of 0 is the underflow bin if it exists and the first regular bin otherwise. An index of -1 is the overflow bin if it exists and the last regular bin otherwise.
"""
raise NotImplementedError
def right(self, index):
"""
Convert an array index position into the right (high) edge of the data coordinate interval.
This index coordinate is 0-based, negative counts from the right, and it depends on whether under/overflow exist ("internal"). An index of 0 is the underflow bin if it exists and the first regular bin otherwise. An index of -1 is the overflow bin if it exists and the last regular bin otherwise.
"""
raise NotImplementedError
def index(self, x, isleft=True, clip=False):
"""
Convert a data coordinate x into an array index position.
This index coordinate is 0-based, negative counts from the right, and it depends on whether under/overflow exist ("internal"). An index of 0 is the underflow bin if it exists and the first regular bin otherwise. An index of -1 is the overflow bin if it exists and the last regular bin otherwise.
If isleft is True, round down to the nearest index (as when filling a histogram); otherwise, round up.
If clip is False, raise an error for out-of-range x values; otherwise, return 0 if isleft, else len(binning). Underflow and overflow are considered in-range and will not clip.
x=NaN always raises an error (no nanflow).
"""
raise NotImplementedError
class Regular(Binning):
"""
Binning of data coordinates from low to high into num regular (equal-sized) bins.
* num must be a positive integer
* low is an inclusive lower endpoint (float)
* high is an exclusive upper endpoint (float)
* if hasunder (bool), array index 0 is the underflow bin and regular bins start at 1; otherwise, regular bins start at 0
* if hasover (bool), the last array index is the overflow bin
If not hasunder, data coordinates less than low are not valid. Otherwise, data coordinates all the way down to (and including) -inf are valid.
If not hasover, data coordinates greater than or equal to high are not valid. Otherwise, data coordinates all the way up to (and including) +inf are valid.
NaN is never a valid data coordinate.
The len of this binning is the total number of bins, including underflow and overflow.
"""
def __init__(self, num, low, high, hasunder=False, hasover=False):
self.num = int(num)
self.low = float(low)
self.high = float(high)
self.hasunder = bool(hasunder)
self.hasover = bool(hasover)
def __repr__(self):
hasunder = ", hasunder=True" if self.hasunder else ""
hasover = ", hasover=True" if self.hasover else ""
return "Regular({0}, {1}, {2}{3}{4})".format(self.num, self.low, self.high, hasunder, hasover)
def __len__(self):
return self.num + int(self.hasunder) + int(self.hasover)
def __eq__(self, other):
return isinstance(other, Regular) and (self.num, self.low, self.high, self.hasunder, self.hasover) == (other.num, other.low, other.high, other.hasunder, other.hasover)
def __ne__(self, other):
return not self.__eq__(other)
def sliced(self, start, stop, axis, counts, *, projection=False):
if counts.shape[axis] != len(self):
raise ValueError("counts length does not fit this binning")
if hasattr(start, 'value'):
start = self.index(start.value, isleft=True, clip=True)
if hasattr(stop, 'value'):
stop = self.index(stop.value, isleft=True, clip=True)
hasunder = self.hasunder
if start is not None:
hasunder = not projection
hasover = self.hasover
if stop is not None:
hasover = not projection
if start is None:
start = 0
if stop is None:
stop = self.num
if start < 0:
start += self.num
if stop < 0:
stop += self.num
if not 0 <= start < self.num:
start = 0
if not 0 <= stop < self.num:
stop = self.num
num = stop - start
if num <= 0:
raise ValueError("slice starting index must be strictly greater than slice stopping index")
sliced_counts = numpy.empty(counts.shape[:axis] + (num + int(hasunder) + int(hasover),) + counts.shape[axis + 1:], counts.dtype)
def at(where):
return (slice(None),) * axis + (where,)
if hasunder:
sliced_counts[at(0)] = counts[at(slice(None, start + int(self.hasunder)))].sum(axis=axis)
if hasover:
sliced_counts[at(-1)] = counts[at(slice(stop + int(self.hasunder), None))].sum(axis=axis)
sliced_counts[at(slice(int(hasunder), num + int(hasunder)))] = counts[at(slice(start + int(self.hasunder), stop + int(self.hasunder)))]
return Regular(num, self.left(start + int(self.hasunder)), self.right(stop + int(self.hasunder) - 1), hasunder=hasunder, hasover=hasover), sliced_counts
def _fixnegative(self, index):
if not isinstance(index, (numbers.Integral, numpy.int_)):
raise TypeError("index must be an integer")
if index < 0:
index += len(self)
if not 0 <= index < len(self):
raise ValueError("index out of range")
return index
def left(self, index):
index = self._fixnegative(index)
if self.hasunder and index == 0:
return float("-inf")
else:
return (index - int(self.hasunder)) * (self.high - self.low) / self.num + self.low
def right(self, index):
index = self._fixnegative(index)
if self.hasover and index == len(self) - 1:
return float("inf")
else:
return (1 + index - int(self.hasunder)) * (self.high - self.low) / self.num + self.low
def index(self, x, isleft=True, clip=False):
if math.isnan(x):
raise ValueError("x must not be NaN")
if x < self.low:
if self.hasunder:
return 0 if isleft else 1
elif clip:
return 0
else:
raise ValueError("x is below range and binning has no underflow")
elif x >= self.high:
if self.hasover:
return len(self) - 1 if isleft else len(self)
elif clip:
return len(self)
else:
raise ValueError("x is above range and binning has no overflow")
else:
approximate = math.floor if isleft else math.ceil
return int(approximate((x - self.low)/(self.high - self.low) * self.num)) + int(self.hasunder)
class Histogram:
"""
Simple histogram class.
* non-keyword arguments must be Binnings
* if provided, counts may be a prefilled array or a number to prefill all bins
Histograms may be sliced, projected, and rebinned using square bracket syntax:
histogram[i0:i1, j0:j1, k0:k1] # slicing in multiple dimensions (remainder goes to under/overflow)
histogram[loc(x0):loc(x1), loc(y0):loc(y1)] # slicing using data coordinates x and y
histogram[i0:i1, ::project, k0:k1] # projecting a dimension
histogram[i0:i1, ::rebin(5), k0:k1] # rebinning a dimension (incomplete last bin goes to overflow)
histogram[i0:i1, j0:j1:project, k0:k1] # slice and project, summing over only slice
histogram[i0:i1, j0:j1:rebin(5), k0:k1] # slice and rebin (remainder and incomplete last bin go to under/overflow)
and any combination thereof. Non-functions are never accepted as the third argument to a slice.
The first two arguments of each slice (start and stop) may be None (unspecified), an integer, or a function (binning, isleft) → integer. loc(x) returns such a function.
The third argument of a slice may be None (no summation: projection or rebinning) or a function (binning, axis, counts) → (binning, counts). project is such a function, and rebin(n) returns such a function.
"""
def __init__(self, *binnings, counts=None):
if len(binnings) == 0:
raise TypeError("Histogram must not have zero dimensions")
if any(not isinstance(binning, Binning) for binning in binnings):
raise TypeError("non-keyword arguments to Histogram must all be binnings")
self.binnings = binnings
if counts is None:
counts = numpy.zeros([len(binning) for binning in self.binnings], int)
if isinstance(counts, numbers.Integral):
counts = numpy.full([len(binning) for binning in self.binnings], counts)
if not isinstance(counts, numpy.ndarray):
counts = numpy.array(counts)
self.counts = counts
assert len(binnings) == len(counts.shape)
assert all(len(binning) == length for binning, length in zip(binnings, counts.shape))
def __repr__(self):
return "Histogram({0})".format(", ".join(repr(binning) for binning in self.binnings))
def __getitem__(self, slices):
if not isinstance(slices, tuple):
slices = (slices,)
# TODO: turn Ellipsis into empty slices here...
if len(slices) > len(self.binnings):
raise IndexError("too many slices for Histogram of dimension {0}".format(len(self.binnings)))
slices = slices + (slice(None),) * (len(self.binnings) - len(slices))
binnings = []
counts = self.counts
for item, binning in zip(slices, self.binnings):
if not isinstance(item, slice):
raise TypeError("only slices are allowed")
start, stop, step = item.start, item.stop, item.step
# the current axis depends on the dimensionality of the output
axis = len(binnings)
# slice the binning, creating under/overflows if necessary
if start is not None or stop is not None:
binning, counts = binning.sliced(start, stop, axis, counts, projection=getattr(step, "projection", False))
# apply the project/rebin function
if step is not None:
if not callable(step):
raise TypeError("when slicing a Histogram, the slice's third argument must be callable")
original_shape = counts.shape
# signature: (binning for axis=axis, axis number, whole counts array) → new binning (or None), new counts
binning, counts = step(binning, axis, counts)
# ensure that the (possibly user-supplied) function is sane
if binning is None:
assert original_shape[:axis] + original_shape[axis + 1:] == counts.shape
else:
assert original_shape[:axis] == counts.shape[:axis]
assert original_shape[axis + 1:] == counts.shape[axis + 1:]
assert len(binning) == counts.shape[axis]
# also determines whether the axis number will be increased in the next round
if binning is not None:
binnings.append(binning)
return Histogram(*binnings, counts=counts)
def __eq__(self, other):
return isinstance(other, Histogram) and self.binnings == other.binnings and numpy.array_equal(self.counts, other.counts)
def __ne__(self, other):
return not self.__eq__(other)
class loc:
"When used in the start or stop of a Histogram's slice, x is taken to be the position in data coordinates."
def __init__(self, x):
self.value = x
class project:
"When used in the step of a Histogram's slice, project sums over and eliminates what remains of the axis after slicing."
def __new__(cls, binning, axis, counts):
return None, numpy.add.reduce(counts, axis=axis)
projection = True
class rebin:
"When used in the step of a Histogram's slice, rebin(n) combines bins, scaling their widths by a factor of n. If the number of bins is not divisible by n, the remainder is added to the overflow bin."
def __init__(self, factor):
self.factor = factor
def __call__(self, binning, axis, counts):
factor = self.factor
if isinstance(binning, Regular):
indexes = (numpy.arange(0, binning.num, factor),)
num, remainder = divmod(binning.num, factor)
high, hasover = binning.high, binning.hasover
if binning.hasunder:
indexes[0][:] += 1
indexes = ([0],) + indexes
if remainder == 0:
if binning.hasover:
indexes = indexes + ([binning.num + int(binning.hasunder)],)
else:
high = binning.left(indexes[-1][-1])
hasover = True
binning = Regular(num, binning.low, high, hasunder=binning.hasunder, hasover=hasover)
counts = numpy.add.reduceat(counts, numpy.concatenate(indexes), axis=axis)
return binning, counts
else:
raise NotImplementedError(type(binning))
projection = False
# slicing a 1-dimensional histogram moves data to under and overflow bins
h = Histogram(Regular(10, -5, 5), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
assert h[3:8] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2])
assert h[loc(-2):loc(3)] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2])
# rebinning by an evenly divisible factor does not create overflow bins
h = Histogram(Regular(12, 0, 6), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
assert h[::rebin(3)] == Histogram(Regular(4, 0, 6), counts=[3, 3, 3, 3])
assert h[::rebin(4)] == Histogram(Regular(3, 0, 6), counts=[4, 4, 4])
# but rebinning by a factor that doesn't evenly divide the number of bins makes an overflow
assert h[::rebin(5)] == Histogram(Regular(2, 0, 5, hasover=True), counts=[5, 5, 2])
# slicing and rebinning creates overflows and merges them if necessary
assert h[3:-3:rebin(2)] == Histogram(Regular(3, 1.5, 4.5, True, True), counts=[3, 2, 2, 2, 3])
assert h[3:-3:rebin(3)] == Histogram(Regular(2, 1.5, 4.5, True, True), counts=[3, 3, 3, 3])
assert h[3:-3:rebin(4)] == Histogram(Regular(1, 1.5, 3.5, True, True), counts=[3, 4, 5]) # merged
# you can project dimensions if doing so leaves you with at least one (zero-dimensional histograms are not allowed)
h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30), counts=[[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]])
assert h[::project] == Histogram(Regular(3, 0, 30), counts=[4, 4, 4])
assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3])
# you can also slice and project; the sliced-away endpoints are not counted in the sum (otherwise, there would be no point to slicing before a projection)
assert h[loc(20)::project] == Histogram(Regular(3, 0, 30), counts=[2, 2, 2])
assert h[:, :-1:project] == Histogram(Regular(4, 0, 40), counts=[2, 2, 2, 2])
assert h[:, 1:-1:project] == Histogram(Regular(4, 0, 40), counts=[1, 1, 1, 1])
# but under/overflow bins are not counted if a slice explicitly rejects them
h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30, True, True), counts=[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]])
assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[5, 5, 5, 5])
assert h[:, 0::project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4])
assert h[:, :3:project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4])
assert h[:, 0:3:project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3])
print('success')
from hist import *
# slicing a 1-dimensional histogram moves data to under and overflow bins
h = Histogram(Regular(10, -5, 5), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
assert h[3:8] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2])
assert h[loc(-2):loc(3)] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2])
# rebinning by an evenly divisible factor does not create overflow bins
h = Histogram(Regular(12, 0, 6), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
assert h[::rebin(3)] == Histogram(Regular(4, 0, 6), counts=[3, 3, 3, 3])
assert h[::rebin(4)] == Histogram(Regular(3, 0, 6), counts=[4, 4, 4])
# but rebinning by a factor that doesn't evenly divide the number of bins makes an overflow
assert h[::rebin(5)] == Histogram(Regular(2, 0, 5, hasover=True), counts=[5, 5, 2])
# slicing and rebinning creates overflows and merges them if necessary
assert h[3:-3:rebin(2)] == Histogram(Regular(3, 1.5, 4.5, True, True), counts=[3, 2, 2, 2, 3])
assert h[3:-3:rebin(3)] == Histogram(Regular(2, 1.5, 4.5, True, True), counts=[3, 3, 3, 3])
assert h[3:-3:rebin(4)] == Histogram(Regular(1, 1.5, 3.5, True, True), counts=[3, 4, 5]) # merged
# you can project dimensions if doing so leaves you with at least one (zero-dimensional histograms are not allowed)
h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30), counts=[[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]])
assert h[::project] == Histogram(Regular(3, 0, 30), counts=[4, 4, 4])
assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3])
# you can also slice and project; the sliced-away endpoints are not counted in the sum (otherwise, there would be no point to slicing before a projection)
assert h[loc(20)::project] == Histogram(Regular(3, 0, 30), counts=[2, 2, 2])
assert h[:, :-1:project] == Histogram(Regular(4, 0, 40), counts=[2, 2, 2, 2])
assert h[:, 1:-1:project] == Histogram(Regular(4, 0, 40), counts=[1, 1, 1, 1])
# but under/overflow bins are not counted if a slice explicitly rejects them
h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30, True, True), counts=[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]])
assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[5, 5, 5, 5])
assert h[:, 0::project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4])
assert h[:, :3:project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4])
assert h[:, 0:3:project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment