Skip to content

Instantly share code, notes, and snippets.

@Zmey56
Created June 28, 2022 19:15
Show Gist options
  • Save Zmey56/e0a848618ef60b3be94b061544d91c8a to your computer and use it in GitHub Desktop.
Save Zmey56/e0a848618ef60b3be94b061544d91c8a to your computer and use it in GitHub Desktop.
def get_bootstrap(
    data_column_1,  # numeric values of the first sample
    data_column_2,  # numeric values of the second sample
    boot_it=10000,  # number of bootstrap subsamples
    statistic=np.mean,  # statistic of interest
    bootstrap_conf_level=0.95,  # confidence level of the returned interval
):
    """Bootstrap the difference between two samples and test it against zero.

    On each of ``boot_it`` iterations both inputs are resampled with
    replacement to the length of the longer one, the resamples are subtracted
    element-wise and ``statistic`` is applied to the differences. A normal
    distribution with the mean and standard deviation of the collected
    statistics is then used to compute a two-sided p-value for a difference
    of zero.

    Args:
        data_column_1: First sample. Must support ``len()`` and
            ``.sample(n, replace=True).values`` (e.g. a pandas Series).
        data_column_2: Second sample, same requirements.
        boot_it: Number of bootstrap iterations.
        statistic: Callable applied to the array of resampled differences.
        bootstrap_conf_level: Confidence level used for the quantile
            interval of the bootstrap statistics.

    Returns:
        A dict with:
            ``"p_value"``: two-sided p-value from the normal approximation.
            ``"quants"``: DataFrame holding the lower and upper quantiles of
            the bootstrap statistics for ``bootstrap_conf_level``.

    Raises:
        ValueError: If either sample is empty.
    """
    if len(data_column_1) == 0 or len(data_column_2) == 0:
        raise ValueError("both samples must contain at least one value")

    # Both resamples use the length of the longer input so that they can be
    # subtracted element-wise.
    boot_len = max(len(data_column_1), len(data_column_2))

    boot_data = []
    for _ in range(boot_it):
        # replace=True -> sampling with replacement
        samples_1 = data_column_1.sample(boot_len, replace=True).values
        samples_2 = data_column_2.sample(boot_len, replace=True).values
        boot_data.append(statistic(samples_1 - samples_2))

    # Quantile interval of the bootstrap distribution.
    left_quant = (1 - bootstrap_conf_level) / 2
    right_quant = 1 - (1 - bootstrap_conf_level) / 2
    quants = pd.DataFrame(boot_data).quantile([left_quant, right_quant])

    boot_mean = np.mean(boot_data)
    boot_std = np.std(boot_data)

    if boot_std == 0:
        # Every bootstrap statistic is identical; norm.cdf with scale=0 would
        # return NaN, so resolve the degenerate case directly.
        p_value = 1.0 if boot_mean == 0 else 0.0
    else:
        # Tail mass on either side of zero under the fitted normal.
        p_1 = norm.cdf(x=0, loc=boot_mean, scale=boot_std)
        p_2 = norm.cdf(x=0, loc=-boot_mean, scale=boot_std)
        p_value = float(min(p_1, p_2) * 2)

    return {"p_value": p_value, "quants": quants}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment