adriancaruana/estimate_from_histogram.py

## estimate_from_histogram.py
import numpy as np
from numpy.typing import ArrayLike, NDArray


def clip_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float):
    # Apply threshold to the distribution to exclude all values below the threshold. Note: Since
    # we're using histograms, we might have the threshold fall within a bin, so we need to
    # interpolate the value of the bin to the threshold, and artificially reduce the ratio for
    # the bin by the fractional amount of the bin that's below the threshold.

    # Start by finding the value in bin edges that is closest to the threshold, but not greater than it
    bin_idx = np.searchsorted(bin_edges, threshold, side='right') - 1
    left_bin_edge = bin_edges[bin_idx]
    right_bin_edge = bin_edges[bin_idx + 1]
    # Calculate the fractional amount of the bin that's below the threshold
    fractional_position = (threshold - left_bin_edge) / (right_bin_edge - left_bin_edge)
    # Apply the threshold to the bin which contains the threshold
    histogram[bin_idx] = histogram[bin_idx] * (1 - fractional_position)
    # Zero out all bins below the threshold
    histogram = np.where((np.array(bin_edges[:-1]) >= threshold), histogram, 0)

    return histogram


def ratio_above_clipped_histogram(histogram: ArrayLike, bin_edges: ArrayLike, threshold: float):
    """
    Estimates the proportion of the distribution that is above a given threshold based on its histogram,
    linearly interpolating between bin edges.

    Example usage:
    ```
    # Generate random samples from the normal distribution
    samples = np.random.uniform(0, 1, 10000)
    # Create a histogram of the samples
    histogram, bin_edges = np.histogram(samples, bins=100)
    # Get the ratio of samples above the threshold of 0.5
    actual_threshold, ratio = ratio_above_threshold_from_histogram(histogram, bin_edges, 0.5)
    print(f"Threshold: {actual_threshold}, Ratio above threshold: {ratio:.2f}")
    ```

    Parameters:
    - histogram (array-like): The histogram counts for each bin.
    - bin_edges (array-like): The values at the edges of each bin.
    - threshold (float): The desired threshold value.

    Returns:
    - float: The estimated proportion of the distribution that is above the threshold.
    """

    bin_edges = np.array(bin_edges)
    histogram = np.array(histogram)

    assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges."
    assert np.sum(histogram) > 0, "The histogram must not be empty."
    if not (min(bin_edges) < threshold < max(bin_edges)):
        print("Warning: threshold not within `bin_edges` range.")

    # Apply threshold to the distribution to exclude all values below the threshold.
    clipped_histogram = clip_histogram(histogram, bin_edges, threshold)
    # Calculate ratio above the threshold
    ratio = np.sum(clipped_histogram) / np.sum(histogram)

    return ratio


def percentile_from_clipped_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float, percentile: int = 50):
    """
    Estimates the value of a given percentile of a distribution from a histogram of the
    distribution. This function linearly interpolates between the bin edges.

    Example usage:
    ```
    # Generate random samples from the normal distribution
    samples = np.random.normal(5, 2, 10000)
    # Create a histogram of the samples
    histogram, bin_edges = np.histogram(samples, bins=100)
    # Get the 25th, 50th, and 75th percentiles
    percentiles = [25, 50, 75]
    # Specify a threshold to clip the histogram
    threshold = 0.1
    for percentile in percentiles:
        _, estimated_percentile = percentile_from_thresholded_histogram(histogram, bin_edges, threshold, percentile)
    ```

    Args:
        histogram (array-like): The histogram counts for each bin.
        bin_edges (array-like): The values at the edges of each bin.
        threshold (float): The desired threshold value.
        percentile (float): The desired percentile value. Default is 50 (median).

    Returns:
        float or None: The percentile value, or None if the histogram or thresholded histogram is empty.
    """

    bin_edges = np.array(bin_edges)
    histogram = np.array(histogram)

    assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges."
    assert np.sum(histogram) > 0, "The histogram must not be empty."
    assert 0 <= percentile <= 100, f"Percentile must be in the range 0 to 100. Got: {percentile=}."
    if not (min(bin_edges) < threshold < max(bin_edges)):
        print("Warning: threshold not within `bin_edges` range.")

    #Apply threshold to the distribution to exclude all values below the threshold, and renormalise.
    histogram = clip_histogram(histogram, bin_edges, threshold)
    if histogram.sum() == 0:
        return None
    histogram /= histogram.sum()

    # We now want to find the percentile (e.g., median if percentile=50) of the distribution that
    # remains after thresholding. The percentile may fall part-way through a bin, so we need to
    # interpolate the value of the bin to the percentile.

    # Calculate the cumulative sum of the histogram.
    cumulative_sum = np.cumsum(histogram)
    # Find the index of the bin that contains the desired percentile.
    target_bin_idx = np.searchsorted(cumulative_sum, percentile / 100)
    # Find the fractional position.
    if target_bin_idx == 0:
        left_bin_edge = bin_edges[0]
        right_bin_edge = bin_edges[1]
        fractional_position = (percentile / 100) / histogram[0]
    else:
        left_bin_edge = bin_edges[target_bin_idx]
        right_bin_edge = bin_edges[target_bin_idx + 1]
        C = cumulative_sum[target_bin_idx - 1]
        N = histogram[target_bin_idx]
        M = percentile / 100
        fractional_position = (M - C) / N

    # Interpolate to find the percentile value.
    percentile_value = left_bin_edge + fractional_position * (right_bin_edge - left_bin_edge)

    return percentile_value
	import numpy as np
	from numpy.typing import ArrayLike, NDArray


	def clip_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float):
	# Apply threshold to the distribution to exclude all values below the threshold. Note: Since
	# we're using histograms, we might have the threshold fall within a bin, so we need to
	# interpolate the value of the bin to the threshold, and artificially reduce the ratio for
	# the bin by the fractional amount of the bin that's below the threshold.

	# Start by finding the value in bin edges that is closest to the threshold, but not greater than it
	bin_idx = np.searchsorted(bin_edges, threshold, side='right') - 1
	left_bin_edge = bin_edges[bin_idx]
	right_bin_edge = bin_edges[bin_idx + 1]
	# Calculate the fractional amount of the bin that's below the threshold
	fractional_position = (threshold - left_bin_edge) / (right_bin_edge - left_bin_edge)
	# Apply the threshold to the bin which contains the threshold
	histogram[bin_idx] = histogram[bin_idx] * (1 - fractional_position)
	# Zero out all bins below the threshold
	histogram = np.where((np.array(bin_edges[:-1]) >= threshold), histogram, 0)

	return histogram


	def ratio_above_clipped_histogram(histogram: ArrayLike, bin_edges: ArrayLike, threshold: float):
	"""
	Estimates the proportion of the distribution that is above a given threshold based on its histogram,
	linearly interpolating between bin edges.

	Example usage:
	```
	# Generate random samples from the normal distribution
	samples = np.random.uniform(0, 1, 10000)
	# Create a histogram of the samples
	histogram, bin_edges = np.histogram(samples, bins=100)
	# Get the ratio of samples above the threshold of 0.5
	actual_threshold, ratio = ratio_above_threshold_from_histogram(histogram, bin_edges, 0.5)
	print(f"Threshold: {actual_threshold}, Ratio above threshold: {ratio:.2f}")
	```

	Parameters:
	- histogram (array-like): The histogram counts for each bin.
	- bin_edges (array-like): The values at the edges of each bin.
	- threshold (float): The desired threshold value.

	Returns:
	- float: The estimated proportion of the distribution that is above the threshold.
	"""

	bin_edges = np.array(bin_edges)
	histogram = np.array(histogram)

	assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges."
	assert np.sum(histogram) > 0, "The histogram must not be empty."
	if not (min(bin_edges) < threshold < max(bin_edges)):
	print("Warning: threshold not within `bin_edges` range.")

	# Apply threshold to the distribution to exclude all values below the threshold.
	clipped_histogram = clip_histogram(histogram, bin_edges, threshold)
	# Calculate ratio above the threshold
	ratio = np.sum(clipped_histogram) / np.sum(histogram)

	return ratio


	def percentile_from_clipped_histogram(histogram: NDArray, bin_edges: NDArray, threshold: float, percentile: int = 50):
	"""
	Estimates the value of a given percentile of a distribution from a histogram of the
	distribution. This function linearly interpolates between the bin edges.

	Example usage:
	```
	# Generate random samples from the normal distribution
	samples = np.random.normal(5, 2, 10000)
	# Create a histogram of the samples
	histogram, bin_edges = np.histogram(samples, bins=100)
	# Get the 25th, 50th, and 75th percentiles
	percentiles = [25, 50, 75]
	# Specify a threshold to clip the histogram
	threshold = 0.1
	for percentile in percentiles:
	_, estimated_percentile = percentile_from_thresholded_histogram(histogram, bin_edges, threshold, percentile)
	```

	Args:
	histogram (array-like): The histogram counts for each bin.
	bin_edges (array-like): The values at the edges of each bin.
	threshold (float): The desired threshold value.
	percentile (float): The desired percentile value. Default is 50 (median).

	Returns:
	float or None: The percentile value, or None if the histogram or thresholded histogram is empty.
	"""

	bin_edges = np.array(bin_edges)
	histogram = np.array(histogram)

	assert len(histogram) == len(bin_edges) - 1, "The histogram should have one less entry than bin edges."
	assert np.sum(histogram) > 0, "The histogram must not be empty."
	assert 0 <= percentile <= 100, f"Percentile must be in the range 0 to 100. Got: {percentile=}."
	if not (min(bin_edges) < threshold < max(bin_edges)):
	print("Warning: threshold not within `bin_edges` range.")

	#Apply threshold to the distribution to exclude all values below the threshold, and renormalise.
	histogram = clip_histogram(histogram, bin_edges, threshold)
	if histogram.sum() == 0:
	return None
	histogram /= histogram.sum()

	# We now want to find the percentile (e.g., median if percentile=50) of the distribution that
	# remains after thresholding. The percentile may fall part-way through a bin, so we need to
	# interpolate the value of the bin to the percentile.

	# Calculate the cumulative sum of the histogram.
	cumulative_sum = np.cumsum(histogram)
	# Find the index of the bin that contains the desired percentile.
	target_bin_idx = np.searchsorted(cumulative_sum, percentile / 100)
	# Find the fractional position.
	if target_bin_idx == 0:
	left_bin_edge = bin_edges[0]
	right_bin_edge = bin_edges[1]
	fractional_position = (percentile / 100) / histogram[0]
	else:
	left_bin_edge = bin_edges[target_bin_idx]
	right_bin_edge = bin_edges[target_bin_idx + 1]
	C = cumulative_sum[target_bin_idx - 1]
	N = histogram[target_bin_idx]
	M = percentile / 100
	fractional_position = (M - C) / N

	# Interpolate to find the percentile value.
	percentile_value = left_bin_edge + fractional_position * (right_bin_edge - left_bin_edge)

	return percentile_value