Skip to content

Instantly share code, notes, and snippets.

@adriancaruana
Last active January 12, 2024 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adriancaruana/e1b72a7c62259d27431869a5460114df to your computer and use it in GitHub Desktop.
Save adriancaruana/e1b72a7c62259d27431869a5460114df to your computer and use it in GitHub Desktop.
Estimate the percentile and ratio values of a distribution from its histogram
import numpy as np
from numpy.typing import ArrayLike, NDArray
def clip_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float):
# Apply threshold to the distribution to exclude all values below the threshold. Note: Since
# we're using histograms, we might have the threshold fall within a bin, so we need to
# interpolate the value of the bin to the threshold, and artificially reduce the ratio for
# the bin by the fractional amount of the bin that's below the threshold.
# Start by finding the value in bin edges that is closest to the threshold, but not greater than it
bin_idx = np.searchsorted(bin_edges, threshold, side='right') - 1
left_bin_edge = bin_edges[bin_idx]
right_bin_edge = bin_edges[bin_idx + 1]
# Calculate the fractional amount of the bin that's below the threshold
fractional_position = (threshold - left_bin_edge) / (right_bin_edge - left_bin_edge)
# Apply the threshold to the bin which contains the threshold
histogram[bin_idx] = histogram[bin_idx] * (1 - fractional_position)
# Zero out all bins below the threshold
histogram = np.where((np.array(bin_edges[:-1]) >= threshold), histogram, 0)
return histogram
def ratio_above_clipped_histogram(histogram: ArrayLike, bin_edges: ArrayLike, threshold: float):
"""
Estimates the proportion of the distribution that is above a given threshold based on its histogram,
linearly interpolating between bin edges.
Example usage:
```
# Generate random samples from the normal distribution
samples = np.random.uniform(0, 1, 10000)
# Create a histogram of the samples
histogram, bin_edges = np.histogram(samples, bins=100)
# Get the ratio of samples above the threshold of 0.5
actual_threshold, ratio = ratio_above_threshold_from_histogram(histogram, bin_edges, 0.5)
print(f"Threshold: {actual_threshold}, Ratio above threshold: {ratio:.2f}")
```
Parameters:
- histogram (array-like): The histogram counts for each bin.
- bin_edges (array-like): The values at the edges of each bin.
- threshold (float): The desired threshold value.
Returns:
- float: The estimated proportion of the distribution that is above the threshold.
"""
bin_edges = np.array(bin_edges)
histogram = np.array(histogram)
assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges."
assert np.sum(histogram) > 0, "The histogram must not be empty."
if not (min(bin_edges) < threshold < max(bin_edges)):
print("Warning: threshold not within `bin_edges` range.")
# Apply threshold to the distribution to exclude all values below the threshold.
clipped_histogram = clip_histogram(histogram, bin_edges, threshold)
# Calculate ratio above the threshold
ratio = np.sum(clipped_histogram) / np.sum(histogram)
return ratio
def percentile_from_clipped_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float, percentile: int = 50):
"""
Estimates the value of a given percentile of a distribution from a histogram of the
distribution. This function linearly interpolates between the bin edges.
Example usage:
```
# Generate random samples from the normal distribution
samples = np.random.normal(5, 2, 10000)
# Create a histogram of the samples
histogram, bin_edges = np.histogram(samples, bins=100)
# Get the 25th, 50th, and 75th percentiles
percentiles = [25, 50, 75]
# Specify a threshold to clip the histogram
threshold = 0.1
for percentile in percentiles:
_, estimated_percentile = percentile_from_thresholded_histogram(histogram, bin_edges, threshold, percentile)
```
Args:
histogram (array-like): The histogram counts for each bin.
bin_edges (array-like): The values at the edges of each bin.
threshold (float): The desired threshold value.
percentile (float): The desired percentile value. Default is 50 (median).
Returns:
float or None: The percentile value, or None if the histogram or thresholded histogram is empty.
"""
bin_edges = np.array(bin_edges)
histogram = np.array(histogram)
assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges."
assert np.sum(histogram) > 0, "The histogram must not be empty."
assert 0 <= percentile <= 100, f"Percentile must be in the range 0 to 100. Got: {percentile=}."
if not (min(bin_edges) < threshold < max(bin_edges)):
print("Warning: threshold not within `bin_edges` range.")
#Apply threshold to the distribution to exclude all values below the threshold, and renormalise.
histogram = clip_histogram(histogram, bin_edges, threshold)
if histogram.sum() == 0:
return None
histogram /= histogram.sum()
# We now want to find the percentile (e.g., median if percentile=50) of the distribution that
# remains after thresholding. The percentile may fall part-way through a bin, so we need to
# interpolate the value of the bin to the percentile.
# Calculate the cumulative sum of the histogram.
cumulative_sum = np.cumsum(histogram)
# Find the index of the bin that contains the desired percentile.
target_bin_idx = np.searchsorted(cumulative_sum, percentile / 100)
# Find the fractional position.
if target_bin_idx == 0:
left_bin_edge = bin_edges[0]
right_bin_edge = bin_edges[1]
fractional_position = (percentile / 100) / histogram[0]
else:
left_bin_edge = bin_edges[target_bin_idx]
right_bin_edge = bin_edges[target_bin_idx + 1]
C = cumulative_sum[target_bin_idx - 1]
N = histogram[target_bin_idx]
M = percentile / 100
fractional_position = (M - C) / N
# Interpolate to find the percentile value.
percentile_value = left_bin_edge + fractional_position * (right_bin_edge - left_bin_edge)
return percentile_value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment