Last active
June 29, 2020 11:24
-
-
Save abhijeet-talaulikar/f9108734571ed02fcf56c3e675133c81 to your computer and use it in GitHub Desktop.
Utilities for measuring the variance of a statistical model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _draw_bootstrap_sample(rng, X, y, frac = 1.0): | |
""" | |
Function to draw a single bootstrap sub-sample from given data. | |
Use frac to adjust the size of the sample as a fraction of data. Defaults to size of given data. | |
""" | |
sample_indices = np.arange(X.shape[0]) | |
bootstrap_indices = rng.choice(sample_indices, | |
size=int(sample_indices.shape[0] * frac), | |
replace=True) | |
return X[bootstrap_indices], y[bootstrap_indices] | |
def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test, | |
loss='0-1_loss', num_rounds=200, random_seed=None, bootstrap_frac = 1.0, | |
single_sample=False, single_sample_index=0): | |
""" | |
Extended from: mlxtend library (URL:http://rasbt.github.io/mlxtend/user_guide/evaluate/bias_variance_decomp/) | |
Function to generate "num_rounds" realizations of the given ML estimator using bootstrap sub-samples. | |
""" | |
supported = ['0-1_loss', 'mse'] | |
if loss not in supported: | |
raise NotImplementedError('loss must be one of the following: %s' % | |
supported) | |
rng = np.random.RandomState(random_seed) | |
# Array with predictions of all test set observations for "num_round" bootstrap sub-samples | |
all_pred = np.zeros((num_rounds, y_test.shape[0]), dtype=np.int) | |
for i in range(num_rounds): | |
X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train, frac = bootstrap_frac) | |
pred = estimator.fit(X_boot, y_boot).predict(X_test) | |
all_pred[i] = pred | |
# Only if "single_sample" is set to True | |
# A slice of "all_pred" to get predictions of a single observation from "all_pred" | |
# Provide observation index in "single_sample_index | |
single_sample_estimates = None | |
if loss == '0-1_loss': | |
# This is in the case of classification model whose outcomes are 0 or 1 | |
main_predictions = np.apply_along_axis(lambda x: | |
np.argmax(np.bincount(x)), | |
axis=0, | |
arr=all_pred) | |
avg_expected_loss = (main_predictions != y_test).sum()/y_test.size | |
avg_expected_loss = np.apply_along_axis(lambda x: | |
(x != y_test).mean(), | |
axis=1, | |
arr=all_pred).mean() | |
avg_bias = np.sum(main_predictions != y_test) / y_test.size | |
var = np.zeros(pred.shape) | |
for pred in all_pred: | |
var += (pred != main_predictions).astype(np.int) | |
var /= num_rounds | |
avg_var = var.sum()/y_test.shape[0] | |
single_sample_estimates = np.zeros(num_rounds) | |
if single_sample == True: | |
single_sample_estimates = all_pred[:,single_sample_index] | |
else: | |
# This is in the case of regression model whose outcomes are a continuous real value. | |
avg_expected_loss = np.apply_along_axis( | |
lambda x: | |
((x - y_test)**2).mean(), | |
axis=1, | |
arr=all_pred).mean() | |
main_predictions = np.mean(all_pred, axis=0) | |
avg_bias = np.sum((main_predictions - y_test)**2) / y_test.size | |
avg_var = np.sum((main_predictions - all_pred)**2) / all_pred.size | |
single_sample_estimates = np.zeros(num_rounds) | |
if single_sample == True: | |
single_sample_estimates = all_pred[:,single_sample_index] | |
return avg_expected_loss, avg_bias, avg_var, all_pred, single_sample_estimates |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment