Last active
January 12, 2024 05:38
-
-
Save adriancaruana/e1b72a7c62259d27431869a5460114df to your computer and use it in GitHub Desktop.
Estimate the percentile and ratio values of a distribution from its histogram
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from numpy.typing import ArrayLike, NDArray | |
def clip_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float): | |
# Apply threshold to the distribution to exclude all values below the threshold. Note: Since | |
# we're using histograms, we might have the threshold fall within a bin, so we need to | |
# interpolate the value of the bin to the threshold, and artificially reduce the ratio for | |
# the bin by the fractional amount of the bin that's below the threshold. | |
# Start by finding the value in bin edges that is closest to the threshold, but not greater than it | |
bin_idx = np.searchsorted(bin_edges, threshold, side='right') - 1 | |
left_bin_edge = bin_edges[bin_idx] | |
right_bin_edge = bin_edges[bin_idx + 1] | |
# Calculate the fractional amount of the bin that's below the threshold | |
fractional_position = (threshold - left_bin_edge) / (right_bin_edge - left_bin_edge) | |
# Apply the threshold to the bin which contains the threshold | |
histogram[bin_idx] = histogram[bin_idx] * (1 - fractional_position) | |
# Zero out all bins below the threshold | |
histogram = np.where((np.array(bin_edges[:-1]) >= threshold), histogram, 0) | |
return histogram | |
def ratio_above_clipped_histogram(histogram: ArrayLike, bin_edges: ArrayLike, threshold: float): | |
""" | |
Estimates the proportion of the distribution that is above a given threshold based on its histogram, | |
linearly interpolating between bin edges. | |
Example usage: | |
``` | |
# Generate random samples from the normal distribution | |
samples = np.random.uniform(0, 1, 10000) | |
# Create a histogram of the samples | |
histogram, bin_edges = np.histogram(samples, bins=100) | |
# Get the ratio of samples above the threshold of 0.5 | |
actual_threshold, ratio = ratio_above_threshold_from_histogram(histogram, bin_edges, 0.5) | |
print(f"Threshold: {actual_threshold}, Ratio above threshold: {ratio:.2f}") | |
``` | |
Parameters: | |
- histogram (array-like): The histogram counts for each bin. | |
- bin_edges (array-like): The values at the edges of each bin. | |
- threshold (float): The desired threshold value. | |
Returns: | |
- float: The estimated proportion of the distribution that is above the threshold. | |
""" | |
bin_edges = np.array(bin_edges) | |
histogram = np.array(histogram) | |
assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges." | |
assert np.sum(histogram) > 0, "The histogram must not be empty." | |
if not (min(bin_edges) < threshold < max(bin_edges)): | |
print("Warning: threshold not within `bin_edges` range.") | |
# Apply threshold to the distribution to exclude all values below the threshold. | |
clipped_histogram = clip_histogram(histogram, bin_edges, threshold) | |
# Calculate ratio above the threshold | |
ratio = np.sum(clipped_histogram) / np.sum(histogram) | |
return ratio | |
def percentile_from_clipped_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float, percentile: int = 50): | |
""" | |
Estimates the value of a given percentile of a distribution from a histogram of the | |
distribution. This function linearly interpolates between the bin edges. | |
Example usage: | |
``` | |
# Generate random samples from the normal distribution | |
samples = np.random.normal(5, 2, 10000) | |
# Create a histogram of the samples | |
histogram, bin_edges = np.histogram(samples, bins=100) | |
# Get the 25th, 50th, and 75th percentiles | |
percentiles = [25, 50, 75] | |
# Specify a threshold to clip the histogram | |
threshold = 0.1 | |
for percentile in percentiles: | |
_, estimated_percentile = percentile_from_thresholded_histogram(histogram, bin_edges, threshold, percentile) | |
``` | |
Args: | |
histogram (array-like): The histogram counts for each bin. | |
bin_edges (array-like): The values at the edges of each bin. | |
threshold (float): The desired threshold value. | |
percentile (float): The desired percentile value. Default is 50 (median). | |
Returns: | |
float or None: The percentile value, or None if the histogram or thresholded histogram is empty. | |
""" | |
bin_edges = np.array(bin_edges) | |
histogram = np.array(histogram) | |
assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges." | |
assert np.sum(histogram) > 0, "The histogram must not be empty." | |
assert 0 <= percentile <= 100, f"Percentile must be in the range 0 to 100. Got: {percentile=}." | |
if not (min(bin_edges) < threshold < max(bin_edges)): | |
print("Warning: threshold not within `bin_edges` range.") | |
#Apply threshold to the distribution to exclude all values below the threshold, and renormalise. | |
histogram = clip_histogram(histogram, bin_edges, threshold) | |
if histogram.sum() == 0: | |
return None | |
histogram /= histogram.sum() | |
# We now want to find the percentile (e.g., median if percentile=50) of the distribution that | |
# remains after thresholding. The percentile may fall part-way through a bin, so we need to | |
# interpolate the value of the bin to the percentile. | |
# Calculate the cumulative sum of the histogram. | |
cumulative_sum = np.cumsum(histogram) | |
# Find the index of the bin that contains the desired percentile. | |
target_bin_idx = np.searchsorted(cumulative_sum, percentile / 100) | |
# Find the fractional position. | |
if target_bin_idx == 0: | |
left_bin_edge = bin_edges[0] | |
right_bin_edge = bin_edges[1] | |
fractional_position = (percentile / 100) / histogram[0] | |
else: | |
left_bin_edge = bin_edges[target_bin_idx] | |
right_bin_edge = bin_edges[target_bin_idx + 1] | |
C = cumulative_sum[target_bin_idx - 1] | |
N = histogram[target_bin_idx] | |
M = percentile / 100 | |
fractional_position = (M - C) / N | |
# Interpolate to find the percentile value. | |
percentile_value = left_bin_edge + fractional_position * (right_bin_edge - left_bin_edge) | |
return percentile_value |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment