Instantly share code, notes, and snippets.

# henryiii/hist.py

Forked from jpivarski/hist.py
Last active April 22, 2019 09:04
Star You must be signed in to star a gist
Functional histogram slice proposal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 import numbers import math import numpy class Binning: "Abstract superclass of all binning types. Only Regular is implemented." def __init__(self, *args, **kwargs): raise NotImplementedError def __repr__(self): raise NotImplementedError def __len__(self): raise NotImplementedError def __eq__(self, other): raise NotImplementedError def __ne__(self, other): raise NotImplementedError def sliced(self, start, stop, axis, counts, wantflows=True): """ Return a sliced binning and counts for start:stop at a given axis. * start and stop may (individually) be None, integers, or functions (binning, isleft) → integer or None; a function would be called with this binning and isleft=True for start and isleft=False for stop and its return value would be used in its place * axis is an integer greater than or equal to 0 and less than len(counts.shape) * counts is a Numpy array of bin counts for all axes * wantflows (bool) controls whether the presence of start and stop create underflow and overflow bins (see below for details) The start and stop index coordinates are 0-based, negative counts from the right, and are independent of whether under/overflow exist ("external"). A value of 0 is the left edge of the first regular bin and a value of num is the right edge of the last regular bin. If start is not None and wantflows is True, the return values will have an underflow bin with all sliced data added to it. Note that start=0 does not slice, but guarantees that an (empty) underflow bin will exist in the output. If start is None or wantflows is False, the return values will only have an underflow bin if this binning does. If stop is not None and wantflows is True, the return values will have an overflow bin with all sliced data added to it. Note that stop=num does not slice, but guarantees that an (empty) overflow bin will exist in the output. If stop is None or wantflows is False, the return values will only have an overflow bin if this binning does. """ raise NotImplementedError def left(self, index): """ Convert an array index position into the left (low) edge of the data coordinate interval. This index coordinate is 0-based, negative counts from the right, and it depends on whether under/overflow exist ("internal"). An index of 0 is the underflow bin if it exists and the first regular bin otherwise. An index of -1 is the overflow bin if it exists and the last regular bin otherwise. """ raise NotImplementedError def right(self, index): """ Convert an array index position into the right (high) edge of the data coordinate interval. This index coordinate is 0-based, negative counts from the right, and it depends on whether under/overflow exist ("internal"). An index of 0 is the underflow bin if it exists and the first regular bin otherwise. An index of -1 is the overflow bin if it exists and the last regular bin otherwise. """ raise NotImplementedError def index(self, x, isleft=True, clip=False): """ Convert a data coordinate x into an array index position. This index coordinate is 0-based, negative counts from the right, and it depends on whether under/overflow exist ("internal"). An index of 0 is the underflow bin if it exists and the first regular bin otherwise. An index of -1 is the overflow bin if it exists and the last regular bin otherwise. If isleft is True, round down to the nearest index (as when filling a histogram); otherwise, round up. If clip is False, raise an error for out-of-range x values; otherwise, return 0 if isleft, else len(binning). Underflow and overflow are considered in-range and will not clip. x=NaN always raises an error (no nanflow). """ raise NotImplementedError class Regular(Binning): """ Binning of data coordinates from low to high into num regular (equal-sized) bins. * num must be a positive integer * low is an inclusive lower endpoint (float) * high is an exclusive upper endpoint (float) * if hasunder (bool), array index 0 is the underflow bin and regular bins start at 1; otherwise, regular bins start at 0 * if hasover (bool), the last array index is the overflow bin If not hasunder, data coordinates less than low are not valid. Otherwise, data coordinates all the way down to (and including) -inf are valid. If not hasover, data coordinates greater than or equal to high are not valid. Otherwise, data coordinates all the way up to (and including) +inf are valid. NaN is never a valid data coordinate. The len of this binning is the total number of bins, including underflow and overflow. """ def __init__(self, num, low, high, hasunder=False, hasover=False): self.num = int(num) self.low = float(low) self.high = float(high) self.hasunder = bool(hasunder) self.hasover = bool(hasover) def __repr__(self): hasunder = ", hasunder=True" if self.hasunder else "" hasover = ", hasover=True" if self.hasover else "" return "Regular({0}, {1}, {2}{3}{4})".format(self.num, self.low, self.high, hasunder, hasover) def __len__(self): return self.num + int(self.hasunder) + int(self.hasover) def __eq__(self, other): return isinstance(other, Regular) and (self.num, self.low, self.high, self.hasunder, self.hasover) == (other.num, other.low, other.high, other.hasunder, other.hasover) def __ne__(self, other): return not self.__eq__(other) def sliced(self, start, stop, axis, counts, *, projection=False): if counts.shape[axis] != len(self): raise ValueError("counts length does not fit this binning") if hasattr(start, 'value'): start = self.index(start.value, isleft=True, clip=True) if hasattr(stop, 'value'): stop = self.index(stop.value, isleft=True, clip=True) hasunder = self.hasunder if start is not None: hasunder = not projection hasover = self.hasover if stop is not None: hasover = not projection if start is None: start = 0 if stop is None: stop = self.num if start < 0: start += self.num if stop < 0: stop += self.num if not 0 <= start < self.num: start = 0 if not 0 <= stop < self.num: stop = self.num num = stop - start if num <= 0: raise ValueError("slice starting index must be strictly greater than slice stopping index") sliced_counts = numpy.empty(counts.shape[:axis] + (num + int(hasunder) + int(hasover),) + counts.shape[axis + 1:], counts.dtype) def at(where): return (slice(None),) * axis + (where,) if hasunder: sliced_counts[at(0)] = counts[at(slice(None, start + int(self.hasunder)))].sum(axis=axis) if hasover: sliced_counts[at(-1)] = counts[at(slice(stop + int(self.hasunder), None))].sum(axis=axis) sliced_counts[at(slice(int(hasunder), num + int(hasunder)))] = counts[at(slice(start + int(self.hasunder), stop + int(self.hasunder)))] return Regular(num, self.left(start + int(self.hasunder)), self.right(stop + int(self.hasunder) - 1), hasunder=hasunder, hasover=hasover), sliced_counts def _fixnegative(self, index): if not isinstance(index, (numbers.Integral, numpy.int_)): raise TypeError("index must be an integer") if index < 0: index += len(self) if not 0 <= index < len(self): raise ValueError("index out of range") return index def left(self, index): index = self._fixnegative(index) if self.hasunder and index == 0: return float("-inf") else: return (index - int(self.hasunder)) * (self.high - self.low) / self.num + self.low def right(self, index): index = self._fixnegative(index) if self.hasover and index == len(self) - 1: return float("inf") else: return (1 + index - int(self.hasunder)) * (self.high - self.low) / self.num + self.low def index(self, x, isleft=True, clip=False): if math.isnan(x): raise ValueError("x must not be NaN") if x < self.low: if self.hasunder: return 0 if isleft else 1 elif clip: return 0 else: raise ValueError("x is below range and binning has no underflow") elif x >= self.high: if self.hasover: return len(self) - 1 if isleft else len(self) elif clip: return len(self) else: raise ValueError("x is above range and binning has no overflow") else: approximate = math.floor if isleft else math.ceil return int(approximate((x - self.low)/(self.high - self.low) * self.num)) + int(self.hasunder) class Histogram: """ Simple histogram class. * non-keyword arguments must be Binnings * if provided, counts may be a prefilled array or a number to prefill all bins Histograms may be sliced, projected, and rebinned using square bracket syntax: histogram[i0:i1, j0:j1, k0:k1] # slicing in multiple dimensions (remainder goes to under/overflow) histogram[loc(x0):loc(x1), loc(y0):loc(y1)] # slicing using data coordinates x and y histogram[i0:i1, ::project, k0:k1] # projecting a dimension histogram[i0:i1, ::rebin(5), k0:k1] # rebinning a dimension (incomplete last bin goes to overflow) histogram[i0:i1, j0:j1:project, k0:k1] # slice and project, summing over only slice histogram[i0:i1, j0:j1:rebin(5), k0:k1] # slice and rebin (remainder and incomplete last bin go to under/overflow) and any combination thereof. Non-functions are never accepted as the third argument to a slice. The first two arguments of each slice (start and stop) may be None (unspecified), an integer, or a function (binning, isleft) → integer. loc(x) returns such a function. The third argument of a slice may be None (no summation: projection or rebinning) or a function (binning, axis, counts) → (binning, counts). project is such a function, and rebin(n) returns such a function. """ def __init__(self, *binnings, counts=None): if len(binnings) == 0: raise TypeError("Histogram must not have zero dimensions") if any(not isinstance(binning, Binning) for binning in binnings): raise TypeError("non-keyword arguments to Histogram must all be binnings") self.binnings = binnings if counts is None: counts = numpy.zeros([len(binning) for binning in self.binnings], int) if isinstance(counts, numbers.Integral): counts = numpy.full([len(binning) for binning in self.binnings], counts) if not isinstance(counts, numpy.ndarray): counts = numpy.array(counts) self.counts = counts assert len(binnings) == len(counts.shape) assert all(len(binning) == length for binning, length in zip(binnings, counts.shape)) def __repr__(self): return "Histogram({0})".format(", ".join(repr(binning) for binning in self.binnings)) def __getitem__(self, slices): if not isinstance(slices, tuple): slices = (slices,) # TODO: turn Ellipsis into empty slices here... if len(slices) > len(self.binnings): raise IndexError("too many slices for Histogram of dimension {0}".format(len(self.binnings))) slices = slices + (slice(None),) * (len(self.binnings) - len(slices)) binnings = [] counts = self.counts for item, binning in zip(slices, self.binnings): if not isinstance(item, slice): raise TypeError("only slices are allowed") start, stop, step = item.start, item.stop, item.step # the current axis depends on the dimensionality of the output axis = len(binnings) # slice the binning, creating under/overflows if necessary if start is not None or stop is not None: binning, counts = binning.sliced(start, stop, axis, counts, projection=getattr(step, "projection", False)) # apply the project/rebin function if step is not None: if not callable(step): raise TypeError("when slicing a Histogram, the slice's third argument must be callable") original_shape = counts.shape # signature: (binning for axis=axis, axis number, whole counts array) → new binning (or None), new counts binning, counts = step(binning, axis, counts) # ensure that the (possibly user-supplied) function is sane if binning is None: assert original_shape[:axis] + original_shape[axis + 1:] == counts.shape else: assert original_shape[:axis] == counts.shape[:axis] assert original_shape[axis + 1:] == counts.shape[axis + 1:] assert len(binning) == counts.shape[axis] # also determines whether the axis number will be increased in the next round if binning is not None: binnings.append(binning) return Histogram(*binnings, counts=counts) def __eq__(self, other): return isinstance(other, Histogram) and self.binnings == other.binnings and numpy.array_equal(self.counts, other.counts) def __ne__(self, other): return not self.__eq__(other) class loc: "When used in the start or stop of a Histogram's slice, x is taken to be the position in data coordinates." def __init__(self, x): self.value = x class project: "When used in the step of a Histogram's slice, project sums over and eliminates what remains of the axis after slicing." def __new__(cls, binning, axis, counts): return None, numpy.add.reduce(counts, axis=axis) projection = True class rebin: "When used in the step of a Histogram's slice, rebin(n) combines bins, scaling their widths by a factor of n. If the number of bins is not divisible by n, the remainder is added to the overflow bin." def __init__(self, factor): self.factor = factor def __call__(self, binning, axis, counts): factor = self.factor if isinstance(binning, Regular): indexes = (numpy.arange(0, binning.num, factor),) num, remainder = divmod(binning.num, factor) high, hasover = binning.high, binning.hasover if binning.hasunder: indexes[0][:] += 1 indexes = ([0],) + indexes if remainder == 0: if binning.hasover: indexes = indexes + ([binning.num + int(binning.hasunder)],) else: high = binning.left(indexes[-1][-1]) hasover = True binning = Regular(num, binning.low, high, hasunder=binning.hasunder, hasover=hasover) counts = numpy.add.reduceat(counts, numpy.concatenate(indexes), axis=axis) return binning, counts else: raise NotImplementedError(type(binning)) projection = False # slicing a 1-dimensional histogram moves data to under and overflow bins h = Histogram(Regular(10, -5, 5), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) assert h[3:8] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2]) assert h[loc(-2):loc(3)] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2]) # rebinning by an evenly divisible factor does not create overflow bins h = Histogram(Regular(12, 0, 6), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) assert h[::rebin(3)] == Histogram(Regular(4, 0, 6), counts=[3, 3, 3, 3]) assert h[::rebin(4)] == Histogram(Regular(3, 0, 6), counts=[4, 4, 4]) # but rebinning by a factor that doesn't evenly divide the number of bins makes an overflow assert h[::rebin(5)] == Histogram(Regular(2, 0, 5, hasover=True), counts=[5, 5, 2]) # slicing and rebinning creates overflows and merges them if necessary assert h[3:-3:rebin(2)] == Histogram(Regular(3, 1.5, 4.5, True, True), counts=[3, 2, 2, 2, 3]) assert h[3:-3:rebin(3)] == Histogram(Regular(2, 1.5, 4.5, True, True), counts=[3, 3, 3, 3]) assert h[3:-3:rebin(4)] == Histogram(Regular(1, 1.5, 3.5, True, True), counts=[3, 4, 5]) # merged # you can project dimensions if doing so leaves you with at least one (zero-dimensional histograms are not allowed) h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30), counts=[[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]) assert h[::project] == Histogram(Regular(3, 0, 30), counts=[4, 4, 4]) assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3]) # you can also slice and project; the sliced-away endpoints are not counted in the sum (otherwise, there would be no point to slicing before a projection) assert h[loc(20)::project] == Histogram(Regular(3, 0, 30), counts=[2, 2, 2]) assert h[:, :-1:project] == Histogram(Regular(4, 0, 40), counts=[2, 2, 2, 2]) assert h[:, 1:-1:project] == Histogram(Regular(4, 0, 40), counts=[1, 1, 1, 1]) # but under/overflow bins are not counted if a slice explicitly rejects them h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30, True, True), counts=[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]) assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[5, 5, 5, 5]) assert h[:, 0::project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4]) assert h[:, :3:project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4]) assert h[:, 0:3:project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3]) print('success')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 from hist import * # slicing a 1-dimensional histogram moves data to under and overflow bins h = Histogram(Regular(10, -5, 5), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) assert h[3:8] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2]) assert h[loc(-2):loc(3)] == Histogram(Regular(5, -2, 3, True, True), counts=[3, 1, 1, 1, 1, 1, 2]) # rebinning by an evenly divisible factor does not create overflow bins h = Histogram(Regular(12, 0, 6), counts=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) assert h[::rebin(3)] == Histogram(Regular(4, 0, 6), counts=[3, 3, 3, 3]) assert h[::rebin(4)] == Histogram(Regular(3, 0, 6), counts=[4, 4, 4]) # but rebinning by a factor that doesn't evenly divide the number of bins makes an overflow assert h[::rebin(5)] == Histogram(Regular(2, 0, 5, hasover=True), counts=[5, 5, 2]) # slicing and rebinning creates overflows and merges them if necessary assert h[3:-3:rebin(2)] == Histogram(Regular(3, 1.5, 4.5, True, True), counts=[3, 2, 2, 2, 3]) assert h[3:-3:rebin(3)] == Histogram(Regular(2, 1.5, 4.5, True, True), counts=[3, 3, 3, 3]) assert h[3:-3:rebin(4)] == Histogram(Regular(1, 1.5, 3.5, True, True), counts=[3, 4, 5]) # merged # you can project dimensions if doing so leaves you with at least one (zero-dimensional histograms are not allowed) h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30), counts=[[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]) assert h[::project] == Histogram(Regular(3, 0, 30), counts=[4, 4, 4]) assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3]) # you can also slice and project; the sliced-away endpoints are not counted in the sum (otherwise, there would be no point to slicing before a projection) assert h[loc(20)::project] == Histogram(Regular(3, 0, 30), counts=[2, 2, 2]) assert h[:, :-1:project] == Histogram(Regular(4, 0, 40), counts=[2, 2, 2, 2]) assert h[:, 1:-1:project] == Histogram(Regular(4, 0, 40), counts=[1, 1, 1, 1]) # but under/overflow bins are not counted if a slice explicitly rejects them h = Histogram(Regular(4, 0, 40), Regular(3, 0, 30, True, True), counts=[[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]) assert h[:, ::project] == Histogram(Regular(4, 0, 40), counts=[5, 5, 5, 5]) assert h[:, 0::project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4]) assert h[:, :3:project] == Histogram(Regular(4, 0, 40), counts=[4, 4, 4, 4]) assert h[:, 0:3:project] == Histogram(Regular(4, 0, 40), counts=[3, 3, 3, 3])