@cgreer
Last active October 9, 2023 17:09
Bag of Little Bootstraps (w/ example usage)
import random
from typing import (
    List,
    Dict,
    Callable,
    Union,
)

import numpy

Number = Union[int, float]
Array = numpy.array  # Used both as a constructor and as a loose type alias

# A sample stored by the counts of its unique values.
SampleCounts = Dict[Number, int]  # unique_value: count

#############################
# Estimators
#############################

SampleValues = Array  # [float, ...]
ValueCounts = Array  # [int, ...]
Estimate = float
Estimator = Callable[[SampleValues, ValueCounts], Estimate]

def mean_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    sum_values = (counts * values).sum()
    num_values = counts.sum()
    return sum_values / num_values
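
# A hedged sketch (not part of the original gist) of an alternative
# estimator: a median over the (values, counts) representation, using a
# cumulative-count search instead of expanding the sample. The example
# below mentions swapping the mean out for a median; this is one way to
# do that. For even totals it returns the lower median.
def median_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    order = numpy.argsort(values)
    sorted_values = values[order]
    cumulative = numpy.cumsum(counts[order])
    midpoint = cumulative[-1] / 2.0
    # First index whose cumulative count reaches the midpoint
    idx = int(numpy.searchsorted(cumulative, midpoint))
    return float(sorted_values[idx])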

#############################
# Estimate Quality Assessors
#############################

Estimates = Array  # [Estimate, ...]
EstimateAssessor = Callable[[Estimates], Estimate]


def standard_deviation(estimates: Estimates) -> Estimate:
    return estimates.std()


def percentile_05(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 5)


def percentile_95(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 95)


def percentile_NN(nn) -> EstimateAssessor:
    assert 0 <= nn <= 100

    def p_nn(estimates: Estimates) -> Estimate:
        return numpy.percentile(estimates, nn)
    return p_nn
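
# Usage note: percentile_NN builds an assessor on the fly, so
# percentile_05 is equivalent to percentile_NN(5) and percentile_95 to
# percentile_NN(95). For example, percentile_NN(50) assesses the median
# of the bootstrap estimates:
#
#   median_assessor = percentile_NN(50)
#   median_assessor(Array([1.0, 2.0, 3.0]))  # -> 2.0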

#############################
# Algorithm
#############################

def bag_little_bootstraps(
    data: SampleCounts,
    estimator: Estimator,
    estimate_assessors: List[EstimateAssessor],
    s=20,
    r=50,
    b_power=0.65,
) -> List[Estimate]:
    '''
    Bag of little bootstraps:
    https://arxiv.org/pdf/1112.5016.pdf

    The default args should be sane enough for a variety of situations,
    but if you need to be precise you can tune them by doing the
    relative-error analysis from the paper with various values.

    Params
        data: Input data you sampled from some population
        estimator: Function that calculates an estimate (e.g. mean_estimator)
        estimate_assessors: Functions that assess estimates (e.g. standard_deviation)
        s: Number of subsamples
        r: Number of bootstrap resamples per subsample
        b_power: Determines the size of each subsample (b = data_size ** b_power)
    '''
    data_values = list(data.keys())  # Unique values
    data_value_counts = list(data.values())  # Unique value counts
    data_size = sum(data_value_counts)

    # Determine b
    # - "b" is the subsample size
    b = int(data_size ** b_power)
    uniform_subsample_weights = Array([1.0 / b] * b)

    # Every assessor gets s subset assessments
    assessor_estimates = []  # [[est_1, est_2, ..., est_s], ...]
    for _ in estimate_assessors:
        assessor_estimates.append(Array([0.0] * s))

    # Calculate assessor estimates for every subset
    for s_j in range(s):
        # Pick a random subset of the data
        # - The size of the sample is "b"
        # - Sample w/out replacement
        # - (random.sample's counts= argument requires Python 3.9+)
        sample_subset = Array(
            random.sample(
                data_values,
                counts=data_value_counts,
                k=b,
            )
        )

        # Bootstrap resamples
        # - Calculate an estimate for every resample
        b_estimates = Array([0.0] * r)
        for b_k in range(r):
            # Create a bootstrap resample
            # - Bootstrap resample size is the size of the original sample,
            #   not the size of the subset sample.
            b_counts = numpy.random.multinomial(
                data_size,
                pvals=uniform_subsample_weights,
            )

            # Calculate estimate of the resampled data
            b_estimates[b_k] = estimator(sample_subset, b_counts)

        # Calculate an estimate quality assessment for each assessor
        for i, estimate_assessor in enumerate(estimate_assessors):
            assessor_estimates[i][s_j] = estimate_assessor(b_estimates)

    # Average assessor assessments over subsets to get final estimates
    final_assessment_estimates = []
    for sample_ests in assessor_estimates:
        final_assessment_estimates.append(sample_ests.mean())
    return final_assessment_estimates
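
# Rough sense of scale (illustrative numbers, not from the original gist):
# with the default b_power = 0.65, each subsample is far smaller than the
# data, e.g.
#   int(1_000_000 ** 0.65)  -> 7_943 points per subsample
#   int(14_000_000 ** 0.65) -> roughly 44,000 points per subsample
# so each multinomial draw spans only b support points, which is what makes
# BLB cheap relative to a classical bootstrap over all n points.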

def mean_characterization_example():
    '''
    Use BLB to estimate the percentiles of the sampling distribution of the mean.
    '''
    from collections import Counter

    # Create a sample from Uniform(0, 100)
    # - Discretize it to 0.1
    N = 1_000_000
    print(f"\nCreating data, size: {N} ...")
    data = [round(random.random() * 100, 1) for _ in range(N)]
    data = Counter(data)

    # Run BLB
    # - Can swap out mean_estimator for a median estimator, ...
    # - Can swap out percentile_05 for standard_deviation, ...
    print("BLB Bootstrapping...")
    estimator = mean_estimator
    # estimate_assessors = [percentile_05, percentile_95]
    estimate_assessors = [percentile_NN(x) for x in range(101)]
    assessor_stats = bag_little_bootstraps(
        data,
        estimator=estimator,
        estimate_assessors=estimate_assessors,
    )
    print(f" 1st percentile: {assessor_stats[1]}")
    print(f"99th percentile: {assessor_stats[99]}")

if __name__ == "__main__":
    mean_characterization_example()

@aladinoster

Thanks for this! I just computed a 3Q estimator of a huge dataset >14M lines in a few seconds!

@cgreer
Author

cgreer commented Oct 9, 2023

> Thanks for this! I just computed a 3Q estimator of a huge dataset >14M lines in a few seconds!

You're welcome!

It's been a while since I implemented that, so it might be worth double-checking it. Maybe do a slow bootstrap one time over the 14M dataset w/ multinomial and see if it gives the same answer?

The research you do seems really interesting!
