@cgreer
Last active October 9, 2023 17:09
Bag of Little Bootstraps (w/ example usage)
import random
from typing import (
    List,
    Dict,
    Callable,
    Union,
)

import numpy

Number = Union[int, float]
Array = numpy.array  # Used both as a constructor and as a loose type alias

# A sample stored by the counts of its unique values.
SampleCounts = Dict[Number, int]  # unique_value: count

#############################
# Estimators
#############################

SampleValues = Array  # [float, ...]
ValueCounts = Array  # [int, ...]
Estimate = float
Estimator = Callable[[SampleValues, ValueCounts], Estimate]

def mean_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    sum_values = (counts * values).sum()
    num_values = counts.sum()
    return sum_values / num_values
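
# A hedged sketch (not part of the original gist) of an alternative
# estimator: a median over the (values, counts) representation, using a
# cumulative-count search instead of expanding the sample. The example
# below mentions swapping the mean out for a median; this is one way to
# do that. For even totals it returns the lower median.
def median_estimator(
    values: SampleValues,
    counts: ValueCounts,
) -> Estimate:
    order = numpy.argsort(values)
    sorted_values = values[order]
    cumulative = numpy.cumsum(counts[order])
    midpoint = cumulative[-1] / 2.0
    # First index whose cumulative count reaches the midpoint
    idx = int(numpy.searchsorted(cumulative, midpoint))
    return float(sorted_values[idx])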

#############################
# Estimate Quality Assessors
#############################

Estimates = Array  # [Estimate, ...]
EstimateAssessor = Callable[[Estimates], Estimate]


def standard_deviation(estimates: Estimates) -> Estimate:
    return estimates.std()


def percentile_05(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 5)


def percentile_95(estimates: Estimates) -> Estimate:
    return numpy.percentile(estimates, 95)


def percentile_NN(nn) -> EstimateAssessor:
    assert 0 <= nn <= 100

    def p_nn(estimates: Estimates) -> Estimate:
        return numpy.percentile(estimates, nn)
    return p_nn
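
# Usage note: percentile_NN builds an assessor on the fly, so
# percentile_05 is equivalent to percentile_NN(5) and percentile_95 to
# percentile_NN(95). For example, percentile_NN(50) assesses the median
# of the bootstrap estimates:
#
#   median_assessor = percentile_NN(50)
#   median_assessor(Array([1.0, 2.0, 3.0]))  # -> 2.0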

#############################
# Algorithm
#############################

def bag_little_bootstraps(
    data: SampleCounts,
    estimator: Estimator,
    estimate_assessors: List[EstimateAssessor],
    s=20,
    r=50,
    b_power=0.65,
) -> List[Estimate]:
    '''
    Bag of little bootstraps:
    https://arxiv.org/pdf/1112.5016.pdf

    The default args should be sane enough for a variety of situations,
    but if you need to be precise you can tune them by doing the
    relative-error analysis from the paper with various values.

    Params
        data: Input data you sampled from some population
        estimator: Function that calculates an estimate (e.g. mean_estimator)
        estimate_assessors: Functions that assess estimates (e.g. standard_deviation)
        s: Number of subsamples
        r: Number of bootstrap resamples per subsample
        b_power: Determines the size of each subsample (b = data_size ** b_power)
    '''
    data_values = list(data.keys())  # Unique values
    data_value_counts = list(data.values())  # Unique value counts
    data_size = sum(data_value_counts)

    # Determine b
    # - "b" is the subsample size
    b = int(data_size ** b_power)
    uniform_subsample_weights = Array([1.0 / b] * b)

    # Every assessor gets s subset assessments
    assessor_estimates = []  # [[est_1, est_2, ..., est_s], ...]
    for _ in estimate_assessors:
        assessor_estimates.append(Array([0.0] * s))

    # Calculate assessor estimates for every subset
    for s_j in range(s):
        # Pick a random subset of the data
        # - The size of the sample is "b"
        # - Sample w/out replacement
        # - (random.sample's counts= argument requires Python 3.9+)
        sample_subset = Array(
            random.sample(
                data_values,
                counts=data_value_counts,
                k=b,
            )
        )

        # Bootstrap resamples
        # - Calculate an estimate for every resample
        b_estimates = Array([0.0] * r)
        for b_k in range(r):
            # Create a bootstrap resample
            # - Bootstrap resample size is the size of the original sample,
            #   not the size of the subset sample.
            b_counts = numpy.random.multinomial(
                data_size,
                pvals=uniform_subsample_weights,
            )

            # Calculate estimate of the resampled data
            b_estimates[b_k] = estimator(sample_subset, b_counts)

        # Calculate an estimate quality assessment for each assessor
        for i, estimate_assessor in enumerate(estimate_assessors):
            assessor_estimates[i][s_j] = estimate_assessor(b_estimates)

    # Average assessor assessments over subsets to get final estimates
    final_assessment_estimates = []
    for sample_ests in assessor_estimates:
        final_assessment_estimates.append(sample_ests.mean())
    return final_assessment_estimates
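
# Rough sense of scale (illustrative numbers, not from the original gist):
# with the default b_power = 0.65, each subsample is far smaller than the
# data, e.g.
#   int(1_000_000 ** 0.65)  -> 7_943 points per subsample
#   int(14_000_000 ** 0.65) -> roughly 44,000 points per subsample
# so each multinomial draw spans only b support points, which is what makes
# BLB cheap relative to a classical bootstrap over all n points.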

def mean_characterization_example():
    '''
    Use BLB to estimate the percentiles of the sampling distribution of the mean.
    '''
    from collections import Counter

    # Create a sample from Uniform(0, 100)
    # - Discretize it to 0.1
    N = 1_000_000
    print(f"\nCreating data, size: {N} ...")
    data = [round(random.random() * 100, 1) for _ in range(N)]
    data = Counter(data)

    # Run BLB
    # - Can swap out mean_estimator for a median estimator, ...
    # - Can swap out percentile_05 for standard_deviation, ...
    print("BLB Bootstrapping...")
    estimator = mean_estimator
    # estimate_assessors = [percentile_05, percentile_95]
    estimate_assessors = [percentile_NN(x) for x in range(101)]
    assessor_stats = bag_little_bootstraps(
        data,
        estimator=estimator,
        estimate_assessors=estimate_assessors,
    )
    print(f" 1st percentile: {assessor_stats[1]}")
    print(f"99th percentile: {assessor_stats[99]}")

if __name__ == "__main__":
    mean_characterization_example()

@aladinoster

Thanks for this! I just computed a 3Q estimator of a huge dataset >14M lines in a few seconds!

@cgreer
Author

cgreer commented Oct 9, 2023

> Thanks for this! I just computed a 3Q estimator of a huge dataset >14M lines in a few seconds!

You're welcome!

It's been a while since I implemented that, so it might be worth double-checking it. Maybe do a slow bootstrap one time over the 14M dataset w/ multinomial and see if it gives the same answer?

The research you do seems really interesting!
