Skip to content

Instantly share code, notes, and snippets.

@xmodar
Created October 22, 2018 10:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xmodar/360c91438332b16bc459efc3cec9a262 to your computer and use it in GitHub Desktop.
Save xmodar/360c91438332b16bc459efc3cec9a262 to your computer and use it in GitHub Desktop.
If you want to histogram samples of data, use Freedman–Diaconis rule which is the default method to choose the number of bins in Matlab. Here is an implementation in PyTorch:
import torch
def _indexer_into(x, dim=0, keepdim=False):
'''indexes into x along dim.'''
def indexer(i):
# (e.g., x[:, 2, :] is indexer(2) if dim == 1)
out = x[[slice(None, None)] * dim + [i, ...]]
return out.unsqueeze(dim) if keepdim and x.dim() != out.dim() else out
return indexer
def percentile(x, q, dim=0, keepdim=False, sort=True):
'''The percentile of data samples.
Args:
x: Data tensor.
q: The percentage in [0, 100].
dim: The dimension along which we perform the operation.
keepdim: Whether to keep the dimension of operation.
sort: Set to False only if x is sorted to save computation.
Returns:
The `q`th percentile of the data.
'''
if sort:
x = x.data.sort(dim)[0]
if not (0 <= q <= 100):
raise ValueError('`q` must be in between 0 and 100.')
value = (q / 100) * x.size(dim) - 1
if value < 0:
return 0
index = int(value)
frac = value - index
next_index = min(index + 1, x.size(dim) - 1)
X = _indexer_into(x.data, dim, keepdim)
return (1 - frac) * X(index) + frac * X(next_index)
def inter_quartile_range(x, dim=0, keepdim=False, sort=True):
'''IQR: the inter-quartile range of samples of data.
Args:
x: Data tensor.
dim: The dimension along which we perform the operation.
keepdim: Whether to keep the dimension of operation.
sort: Set to False only if x is sorted to save computation.
Returns:
The IQR of the data.
'''
if sort:
x = x.data.sort(dim)[0]
q1 = percentile(x, 25, dim=dim, keepdim=keepdim, sort=False)
q3 = percentile(x, 75, dim=dim, keepdim=keepdim, sort=False)
return q3 - q1
def num_hist_bins(x, min_bins=1, max_bins=1e6,
dim=0, keepdim=False, sort=True):
'''Freedman-Diaconis rule for the number of bins in a histogram.
Args:
x: Data tensor.
dim: The dimension along which we perform the operation.
keepdim: Whether to keep the dimension of operation.
min_bins: The minimum number of bins.
max_bins: The maximum number of bins.
sort: Set to False only if x is sorted to save computation.
Returns:
The appropriate number of bins to histogram the given data.
'''
if sort:
x = x.data.sort(dim)[0]
long = lambda v: torch.tensor(v, dtype=torch.long, device=x.device)
iqr = inter_quartile_range(x, dim=dim, keepdim=keepdim, sort=False)
bin_width = 2 * x.size(dim)**(-1 / 3) * iqr
X = _indexer_into(x.data, dim, keepdim)
num_bins = ((X(-1) - X(0)) / bin_width).round().long()
return num_bins.clamp(min=long(min_bins), max=long(max_bins))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment