Created
June 21, 2024 00:42
-
-
Save JotaRata/0796446c83a944b0c811d893b5a627a3 to your computer and use it in GitHub Desktop.
Bootstrap sampling methods
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
from typing import Any, Callable, Concatenate, List, ParamSpec, ParamSpecKwargs, Tuple, cast | |
import numpy as np | |
from numpy.typing import NDArray,ArrayLike | |
from pandas import DataFrame, Series | |
# ---------------- Bootstrapping ------------------------------- | |
_ArrayLike = NDArray[np.float_] | None | |
_NumFunction = Callable[..., float] | |
def sampler(sample : List[float], | |
stat : _NumFunction = np.nanmedian, | |
est : _NumFunction = np.nanmedian, | |
iter : int = 1000, | |
include_error : bool = False, | |
print_values : bool = False) -> Tuple[float, float] | float: | |
''' | |
## Bootstrap sampler | |
Randomly samples a list of values, computing the estimator function each time and returning the statistic of the computed values. | |
### Parameters: | |
* sample (list or array) : Input list of values | |
* stat (function) : The statistic to compute from the subsample estimator values, default is np.nanmedian | |
* est (function) : The estimator function computed for each sub-sample, default is np.nanmedian | |
* iter (int) : Number of sub-samples to compute, default is 1000 | |
* include_error (bool) : Whether to include the standard deviation of the estimated values along the statistic, if True then the return output is going to be a tuple of two floats, default is False. | |
* print_values (bool) : For debugging purposes, whether to print the estimated values on screen. | |
### Returns: | |
A single float or a tuple of floats if include_error is enabled. | |
### Notes: | |
The function works by creating a sub-sample of the input of the same length then calling the estimator function for this sub-sample and appending the result value into a temporal list, after repeating this process a by number of iteration the final statistic is computed from the list of estimated values. | |
It's important that the estimator function and the statistic function both take a list or array of values and return a single float, for multivariate sampling see bootstrap.sampler_mv. | |
The process of computing bootstrap statistics can be slow, for further optimizations consider compiling this module with mypyc. | |
''' | |
values : List[float] = list[float]() | |
_size = len(sample) | |
if _size < 3: | |
if include_error: | |
return np.nan, np.nan | |
else: | |
return np.nan | |
i : int | |
for i in range(iter): | |
boot_sample : List[float] = np.random.choice(sample, replace = True, size = len(sample)).tolist() | |
boot_mean = est(boot_sample) | |
values.append(boot_mean) | |
if print_values: | |
print('values: ', values) | |
if include_error: | |
return stat(values), cast(float, np.nanstd(values)) | |
else: | |
return stat(values) | |
def sampler_mv(sample : Tuple[ List[float], List[float]], | |
stat : _NumFunction = np.nanmedian, | |
est : Callable[..., float] | None = None, | |
iter : int = 1000, | |
indices : List[int]|None = None, | |
include_error : bool = False, | |
print_values : bool = False) -> Tuple[float, float] | float: | |
''' | |
## Multivariate bootstrap sampler | |
Randomly samples a list of vectors, computing the estimator function each time and returning the statistic of the computed values. | |
### Parameters: | |
* sample (list or array of 2D vectors) : Input rows and columns, it must be provided as a tuple of lists or a 2D array. | |
* stat (function) : The statistic to compute from the subsample estimator values, default is np.nanmedian | |
* est (function) : The estimator function computed for each sub-sample, this function must accept two float values and return a single float. | |
* iter (int) : Number of sub-samples to compute, default is 1000 | |
* include_error (bool) : Whether to include the standard deviation of the estimated values along the statistic, if True then the return output is going to be a tuple of two floats, default is False. | |
* print_values (bool) : For debugging purposes, whether to print the estimated values on screen. | |
### Returns: | |
A single float or a tuple of floats if include_error is enabled. | |
### Notes: | |
The function works by creating a sub-sample of the input of the same length then calling the estimator function for this sub-sample and appending the result value into a temporal list, after repeating this process a by number of iteration the final statistic is computed from the list of estimated values. | |
Unlike boostrap.sampler, the estimator function must be given or an AssertionError may be raised. The estimator function must accept two list of values and return a single float value, the statistic function can be whatever function that accepts a list of values and returns a single value instead. | |
The process of computing bootstrap statistics can be slow, for further optimizations consider compiling this module with mypyc. | |
''' | |
values : List[float] = list[float]() | |
assert stat is not None and est is not None | |
_sample0, _sample1 = sample | |
assert _sample0 is not None and _sample1 is not None | |
_size : int = len(_sample0) | |
assert _size == len(_sample1) | |
assert est is not None | |
if _size < 3: | |
if include_error: | |
return np.nan, np.nan | |
else: | |
return np.nan | |
if indices is None: | |
indices = list(range(_size)) | |
for _ in range(iter): | |
boot_sample : List[int] = np.random.choice(indices, replace = True, size = len(indices)).tolist() | |
_s0 = [ _sample0[i] for i in boot_sample ] | |
_s1 = [ _sample1[i] for i in boot_sample ] | |
boot = est(_s0, _s1) | |
values.append(boot) | |
if print_values: | |
print('values: ', values) | |
if include_error: | |
return stat(values), cast(float, np.nanstd(values)) | |
else: | |
return stat(values) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment