Created
October 22, 2018 10:58
-
-
Save xmodar/360c91438332b16bc459efc3cec9a262 to your computer and use it in GitHub Desktop.
If you want to histogram samples of data, use Freedman–Diaconis rule which is the default method to choose the number of bins in Matlab. Here is an implementation in PyTorch:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
def _indexer_into(x, dim=0, keepdim=False): | |
'''indexes into x along dim.''' | |
def indexer(i): | |
# (e.g., x[:, 2, :] is indexer(2) if dim == 1) | |
out = x[[slice(None, None)] * dim + [i, ...]] | |
return out.unsqueeze(dim) if keepdim and x.dim() != out.dim() else out | |
return indexer | |
def percentile(x, q, dim=0, keepdim=False, sort=True): | |
'''The percentile of data samples. | |
Args: | |
x: Data tensor. | |
q: The percentage in [0, 100]. | |
dim: The dimension along which we perform the operation. | |
keepdim: Whether to keep the dimension of operation. | |
sort: Set to False only if x is sorted to save computation. | |
Returns: | |
The `q`th percentile of the data. | |
''' | |
if sort: | |
x = x.data.sort(dim)[0] | |
if not (0 <= q <= 100): | |
raise ValueError('`q` must be in between 0 and 100.') | |
value = (q / 100) * x.size(dim) - 1 | |
if value < 0: | |
return 0 | |
index = int(value) | |
frac = value - index | |
next_index = min(index + 1, x.size(dim) - 1) | |
X = _indexer_into(x.data, dim, keepdim) | |
return (1 - frac) * X(index) + frac * X(next_index) | |
def inter_quartile_range(x, dim=0, keepdim=False, sort=True): | |
'''IQR: the inter-quartile range of samples of data. | |
Args: | |
x: Data tensor. | |
dim: The dimension along which we perform the operation. | |
keepdim: Whether to keep the dimension of operation. | |
sort: Set to False only if x is sorted to save computation. | |
Returns: | |
The IQR of the data. | |
''' | |
if sort: | |
x = x.data.sort(dim)[0] | |
q1 = percentile(x, 25, dim=dim, keepdim=keepdim, sort=False) | |
q3 = percentile(x, 75, dim=dim, keepdim=keepdim, sort=False) | |
return q3 - q1 | |
def num_hist_bins(x, min_bins=1, max_bins=1e6, | |
dim=0, keepdim=False, sort=True): | |
'''Freedman-Diaconis rule for the number of bins in a histogram. | |
Args: | |
x: Data tensor. | |
dim: The dimension along which we perform the operation. | |
keepdim: Whether to keep the dimension of operation. | |
min_bins: The minimum number of bins. | |
max_bins: The maximum number of bins. | |
sort: Set to False only if x is sorted to save computation. | |
Returns: | |
The appropriate number of bins to histogram the given data. | |
''' | |
if sort: | |
x = x.data.sort(dim)[0] | |
long = lambda v: torch.tensor(v, dtype=torch.long, device=x.device) | |
iqr = inter_quartile_range(x, dim=dim, keepdim=keepdim, sort=False) | |
bin_width = 2 * x.size(dim)**(-1 / 3) * iqr | |
X = _indexer_into(x.data, dim, keepdim) | |
num_bins = ((X(-1) - X(0)) / bin_width).round().long() | |
return num_bins.clamp(min=long(min_bins), max=long(max_bins)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment