Skip to content

Instantly share code, notes, and snippets.

@abhijeet-talaulikar
Last active June 29, 2020 11:24
Show Gist options
  • Save abhijeet-talaulikar/f9108734571ed02fcf56c3e675133c81 to your computer and use it in GitHub Desktop.
Save abhijeet-talaulikar/f9108734571ed02fcf56c3e675133c81 to your computer and use it in GitHub Desktop.
Utilities for estimating the bias and variance of a statistical model via bootstrap resampling
def _draw_bootstrap_sample(rng, X, y, frac = 1.0):
"""
Function to draw a single bootstrap sub-sample from given data.
Use frac to adjust the size of the sample as a fraction of data. Defaults to size of given data.
"""
sample_indices = np.arange(X.shape[0])
bootstrap_indices = rng.choice(sample_indices,
size=int(sample_indices.shape[0] * frac),
replace=True)
return X[bootstrap_indices], y[bootstrap_indices]
def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test,
                         loss='0-1_loss', num_rounds=200, random_seed=None,
                         bootstrap_frac = 1.0,
                         single_sample=False, single_sample_index=0):
    """
    Extended from: mlxtend library (URL:http://rasbt.github.io/mlxtend/user_guide/evaluate/bias_variance_decomp/)

    Estimate the bias-variance decomposition of "estimator" by refitting it
    on "num_rounds" bootstrap sub-samples of the training data and comparing
    the resulting predictions on a fixed test set.

    Parameters
    ----------
    estimator : object exposing fit(X, y) and predict(X).
    X_train, y_train : training data that bootstrap sub-samples are drawn from.
    X_test, y_test : fixed evaluation data.
    loss : '0-1_loss' (classification) or 'mse' (regression).
    num_rounds : number of bootstrap rounds (model refits).
    random_seed : seed for the bootstrap RNG (None for nondeterministic).
    bootstrap_frac : size of each bootstrap sample as a fraction of X_train.
    single_sample : if True, also return the per-round predictions for the
        single test observation at "single_sample_index".
    single_sample_index : index of that observation within X_test.

    Returns
    -------
    (avg_expected_loss, avg_bias, avg_var, all_pred, single_sample_estimates)
    where all_pred has shape (num_rounds, len(y_test)) and
    single_sample_estimates is all_pred[:, single_sample_index] when
    single_sample is True, else an array of zeros.

    Raises
    ------
    NotImplementedError : if "loss" is not a supported loss.
    """
    supported = ['0-1_loss', 'mse']
    if loss not in supported:
        raise NotImplementedError('loss must be one of the following: %s' %
                                  supported)
    rng = np.random.RandomState(random_seed)

    # Predictions of all test observations for "num_rounds" bootstrap rounds.
    # Use an integer dtype only for classification: regression predictions
    # are continuous and would be silently truncated by an int array.
    # (np.int was removed in NumPy 1.24 -- use the builtin int instead.)
    pred_dtype = int if loss == '0-1_loss' else float
    all_pred = np.zeros((num_rounds, y_test.shape[0]), dtype=pred_dtype)
    for i in range(num_rounds):
        X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train,
                                                frac=bootstrap_frac)
        all_pred[i] = estimator.fit(X_boot, y_boot).predict(X_test)

    if loss == '0-1_loss':
        # Classification with integer class labels.
        # "Main" prediction = per-observation majority vote over all rounds.
        main_predictions = np.apply_along_axis(
            lambda x: np.argmax(np.bincount(x)),
            axis=0,
            arr=all_pred)
        # Expected loss: mean 0-1 error of each round's predictions,
        # averaged over rounds.
        avg_expected_loss = np.apply_along_axis(
            lambda x: (x != y_test).mean(),
            axis=1,
            arr=all_pred).mean()
        # Bias: 0-1 error of the majority-vote prediction.
        avg_bias = np.sum(main_predictions != y_test) / y_test.size
        # Variance: per-observation fraction of rounds that disagree with
        # the majority vote, averaged over the test set.
        avg_var = (all_pred != main_predictions).mean(axis=0).sum() / y_test.shape[0]
    else:
        # Regression ('mse') with continuous real-valued outcomes.
        avg_expected_loss = np.apply_along_axis(
            lambda x: ((x - y_test)**2).mean(),
            axis=1,
            arr=all_pred).mean()
        # "Main" prediction = per-observation mean over all rounds.
        main_predictions = np.mean(all_pred, axis=0)
        avg_bias = np.sum((main_predictions - y_test)**2) / y_test.size
        avg_var = np.sum((main_predictions - all_pred)**2) / all_pred.size

    # Per-round predictions for one chosen test observation; returned as
    # zeros unless "single_sample" is set.
    single_sample_estimates = np.zeros(num_rounds)
    if single_sample:
        single_sample_estimates = all_pred[:, single_sample_index]

    return avg_expected_loss, avg_bias, avg_var, all_pred, single_sample_estimates
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment