Skip to content

Instantly share code, notes, and snippets.

@andyfaff
Created April 1, 2016 04:35
Show Gist options
  • Save andyfaff/9a7930f3d93a99f478cdadae4d1bb3b2 to your computer and use it in GitHub Desktop.
Save andyfaff/9a7930f3d93a99f478cdadae4d1bb3b2 to your computer and use it in GitHub Desktop.
scipy.stats sampling from arbitrary distribution
import numpy as np
import scipy.interpolate as interpolate
from scipy.integrate import simps
from scipy.stats import rv_continuous
class arbitrary(rv_continuous):
def __init__(self, hist, bin_edges, xtol=1e-14, seed=None):
_hist = np.array(hist, float)
_bin_edges = np.asfarray(bin_edges)
if _bin_edges.size - 1 != hist.size:
raise ValueError("For a histogram the bins_edges array must be 1"
" larger than data.")
# need to normalise to density
# perhaps should use different integration method. If there are
# only a couple of samples then simple rectangular integration
# may be more applicable, etc.
bin_centres = (_bin_edges[1:] + _bin_edges[:-1]) * 0.5
total_area = simps(_hist, bin_centres)
_hist /= total_area
self._hist, self._bin_edges = _hist, _bin_edges
cum_values = np.zeros(bin_edges.shape)
cum_values[1:] = np.cumsum(_hist * np.diff(_bin_edges))
self._cum_values = cum_values
self._inv_cdf = interpolate.interp1d(cum_values, _bin_edges)
super(arbitrary, self).__init__(a=self._bin_edges[0],
b=self._bin_edges[-1],
xtol=xtol, seed=seed)
def _parse_args_rvs(*args, **kwds):
return args, 0, 1, kwds['size']
def _parse_args(*args, **kwds):
return args, 0, 1
def _parse_args_stats(*args, **kwds):
return args, 0, 1, kwds['moments']
ns = {'_parse_args_rvs': _parse_args_rvs,
'_parse_args': _parse_args,
'_parse_args_stats': _parse_args_stats}
for name, func in ns.items():
setattr(self, name, func)
def _pdf(self, x):
# find out where x is in _bin_edges
loc = np.searchsorted(self._bin_edges, x)
if loc == 0 or loc == self._bin_edges.size:
# x wasn't in the support of the data
return 0
else:
# x is somewhere in the histogram
return self._hist[loc - 1]
def _cdf(self, x):
return self._cum_values(x)
def _ppf(self, x):
return self._inv_cdf(x)
def fit(self, data, *args, **kwds):
raise NotImplementedError("fit method not supported for arbitrary"
" distribution")
def fit_loc_scale(self, data, *args):
raise NotImplementedError("fit_loc_scale not supported for arbitrary"
" distribution")
@ev-br
Copy link

ev-br commented Apr 1, 2016

A couple of quick comments on technicalities: I think you don't need to redo constructing _parse... functions, it'll happen automatically in the superclass constructor. The in-bounds check is also done in the superclass (RV_continuous.PDF method) as long as you have a and b attributes defined.

W.r.t. interpolation, are you certain you want linear interpolation and not e.g. monotone splines (Pchip or akima)?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment