Skip to content

Instantly share code, notes, and snippets.

@JotaRata
Created June 21, 2024 00:42
Show Gist options
  • Save JotaRata/0796446c83a944b0c811d893b5a627a3 to your computer and use it in GitHub Desktop.
Save JotaRata/0796446c83a944b0c811d893b5a627a3 to your computer and use it in GitHub Desktop.
Bootstrap sampling methods
import random
from typing import Any, Callable, Concatenate, List, ParamSpec, ParamSpecKwargs, Tuple, cast
import numpy as np
from numpy.typing import NDArray,ArrayLike
from pandas import DataFrame, Series
# ---------------- Bootstrapping -------------------------------
_ArrayLike = NDArray[np.float_] | None
_NumFunction = Callable[..., float]
def sampler(sample : List[float],
stat : _NumFunction = np.nanmedian,
est : _NumFunction = np.nanmedian,
iter : int = 1000,
include_error : bool = False,
print_values : bool = False) -> Tuple[float, float] | float:
'''
## Bootstrap sampler
Randomly samples a list of values, computing the estimator function each time and returning the statistic of the computed values.
### Parameters:
* sample (list or array) : Input list of values
* stat (function) : The statistic to compute from the subsample estimator values, default is np.nanmedian
* est (function) : The estimator function computed for each sub-sample, default is np.nanmedian
* iter (int) : Number of sub-samples to compute, default is 1000
* include_error (bool) : Whether to include the standard deviation of the estimated values along the statistic, if True then the return output is going to be a tuple of two floats, default is False.
* print_values (bool) : For debugging purposes, whether to print the estimated values on screen.
### Returns:
A single float or a tuple of floats if include_error is enabled.
### Notes:
The function works by creating a sub-sample of the input of the same length then calling the estimator function for this sub-sample and appending the result value into a temporal list, after repeating this process a by number of iteration the final statistic is computed from the list of estimated values.
It's important that the estimator function and the statistic function both take a list or array of values and return a single float, for multivariate sampling see bootstrap.sampler_mv.
The process of computing bootstrap statistics can be slow, for further optimizations consider compiling this module with mypyc.
'''
values : List[float] = list[float]()
_size = len(sample)
if _size < 3:
if include_error:
return np.nan, np.nan
else:
return np.nan
i : int
for i in range(iter):
boot_sample : List[float] = np.random.choice(sample, replace = True, size = len(sample)).tolist()
boot_mean = est(boot_sample)
values.append(boot_mean)
if print_values:
print('values: ', values)
if include_error:
return stat(values), cast(float, np.nanstd(values))
else:
return stat(values)
def sampler_mv(sample : Tuple[ List[float], List[float]],
stat : _NumFunction = np.nanmedian,
est : Callable[..., float] | None = None,
iter : int = 1000,
indices : List[int]|None = None,
include_error : bool = False,
print_values : bool = False) -> Tuple[float, float] | float:
'''
## Multivariate bootstrap sampler
Randomly samples a list of vectors, computing the estimator function each time and returning the statistic of the computed values.
### Parameters:
* sample (list or array of 2D vectors) : Input rows and columns, it must be provided as a tuple of lists or a 2D array.
* stat (function) : The statistic to compute from the subsample estimator values, default is np.nanmedian
* est (function) : The estimator function computed for each sub-sample, this function must accept two float values and return a single float.
* iter (int) : Number of sub-samples to compute, default is 1000
* include_error (bool) : Whether to include the standard deviation of the estimated values along the statistic, if True then the return output is going to be a tuple of two floats, default is False.
* print_values (bool) : For debugging purposes, whether to print the estimated values on screen.
### Returns:
A single float or a tuple of floats if include_error is enabled.
### Notes:
The function works by creating a sub-sample of the input of the same length then calling the estimator function for this sub-sample and appending the result value into a temporal list, after repeating this process a by number of iteration the final statistic is computed from the list of estimated values.
Unlike boostrap.sampler, the estimator function must be given or an AssertionError may be raised. The estimator function must accept two list of values and return a single float value, the statistic function can be whatever function that accepts a list of values and returns a single value instead.
The process of computing bootstrap statistics can be slow, for further optimizations consider compiling this module with mypyc.
'''
values : List[float] = list[float]()
assert stat is not None and est is not None
_sample0, _sample1 = sample
assert _sample0 is not None and _sample1 is not None
_size : int = len(_sample0)
assert _size == len(_sample1)
assert est is not None
if _size < 3:
if include_error:
return np.nan, np.nan
else:
return np.nan
if indices is None:
indices = list(range(_size))
for _ in range(iter):
boot_sample : List[int] = np.random.choice(indices, replace = True, size = len(indices)).tolist()
_s0 = [ _sample0[i] for i in boot_sample ]
_s1 = [ _sample1[i] for i in boot_sample ]
boot = est(_s0, _s1)
values.append(boot)
if print_values:
print('values: ', values)
if include_error:
return stat(values), cast(float, np.nanstd(values))
else:
return stat(values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment