Last active
June 29, 2020 11:24
-
-
Save abhijeet-talaulikar/f9108734571ed02fcf56c3e675133c81 to your computer and use it in GitHub Desktop.
Utilities for measuring the variance of a statistical model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _draw_bootstrap_sample(rng, X, y, frac = 1.0): | |
""" | |
Function to draw a single bootstrap sub-sample from given data. | |
Use frac to adjust the size of the sample as a fraction of data. Defaults to size of given data. | |
""" | |
sample_indices = np.arange(X.shape[0]) | |
bootstrap_indices = rng.choice(sample_indices, | |
size=int(sample_indices.shape[0] * frac), | |
replace=True) | |
return X[bootstrap_indices], y[bootstrap_indices] | |
def bias_variance_decomp(estimator, X_train, y_train, X_test, y_test, | |
loss='0-1_loss', num_rounds=200, random_seed=None, bootstrap_frac = 1.0, | |
single_sample=False, single_sample_index=0): | |
""" | |
Extended from: mlxtend library (URL:http://rasbt.github.io/mlxtend/user_guide/evaluate/bias_variance_decomp/) | |
Function to generate "num_rounds" realizations of the given ML estimator using bootstrap sub-samples. | |
""" | |
supported = ['0-1_loss', 'mse'] | |
if loss not in supported: | |
raise NotImplementedError('loss must be one of the following: %s' % | |
supported) | |
rng = np.random.RandomState(random_seed) | |
# Array with predictions of all test set observations for "num_round" bootstrap sub-samples | |
all_pred = np.zeros((num_rounds, y_test.shape[0]), dtype=np.int) | |
for i in range(num_rounds): | |
X_boot, y_boot = _draw_bootstrap_sample(rng, X_train, y_train, frac = bootstrap_frac) | |
pred = estimator.fit(X_boot, y_boot).predict(X_test) | |
all_pred[i] = pred | |
# Only if "single_sample" is set to True | |
# A slice of "all_pred" to get predictions of a single observation from "all_pred" | |
# Provide observation index in "single_sample_index | |
single_sample_estimates = None | |
if loss == '0-1_loss': | |
# This is in the case of classification model whose outcomes are 0 or 1 | |
main_predictions = np.apply_along_axis(lambda x: | |
np.argmax(np.bincount(x)), | |
axis=0, | |
arr=all_pred) | |
avg_expected_loss = (main_predictions != y_test).sum()/y_test.size | |
avg_expected_loss = np.apply_along_axis(lambda x: | |
(x != y_test).mean(), | |
axis=1, | |
arr=all_pred).mean() | |
avg_bias = np.sum(main_predictions != y_test) / y_test.size | |
var = np.zeros(pred.shape) | |
for pred in all_pred: | |
var += (pred != main_predictions).astype(np.int) | |
var /= num_rounds | |
avg_var = var.sum()/y_test.shape[0] | |
single_sample_estimates = np.zeros(num_rounds) | |
if single_sample == True: | |
single_sample_estimates = all_pred[:,single_sample_index] | |
else: | |
# This is in the case of regression model whose outcomes are a continuous real value. | |
avg_expected_loss = np.apply_along_axis( | |
lambda x: | |
((x - y_test)**2).mean(), | |
axis=1, | |
arr=all_pred).mean() | |
main_predictions = np.mean(all_pred, axis=0) | |
avg_bias = np.sum((main_predictions - y_test)**2) / y_test.size | |
avg_var = np.sum((main_predictions - all_pred)**2) / all_pred.size | |
single_sample_estimates = np.zeros(num_rounds) | |
if single_sample == True: | |
single_sample_estimates = all_pred[:,single_sample_index] | |
return avg_expected_loss, avg_bias, avg_var, all_pred, single_sample_estimates |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment