Skip to content

Instantly share code, notes, and snippets.

@phydev
Last active May 25, 2024 23:06
Show Gist options
  • Save phydev/e109810a06071448a137f515039fc203 to your computer and use it in GitHub Desktop.
Save phydev/e109810a06071448a137f515039fc203 to your computer and use it in GitHub Desktop.
Implementation of bootstrap .632+ estimator exemplified with breast cancer dataset
from typing import Type, Dict
import numpy as np
import pandas as pd
def bootstrap(data: Type[pd.DataFrame],
n_resamples: int,
id_column: str,
seed: int=42) -> Type[pd.DataFrame]:
"""
Draw N resamples with replacements from the given data.
Parameters:
data (pd.DataFrame): The input data from which to draw bootstrap samples.
n_resamples (int): The number of bootstrap samples to draw.
id_column (str): The column name representing the study ID.
seed (int, optional): The random seed for reproducibility. Defaults to 42.
Returns:
pd.DataFrame: A DataFrame containing the bootstrap samples with an additional 'bs_id' column.
"""
# select only the study id to save storage and memory
data = data[[id_column]]
# Initialize an empty data frame
bootstrap_samples = pd.DataFrame()
# Set the seed
np.random.seed(seed)
# Number of rows in original data frame
n_rows = len(data)
# For loop to create bootstrap samples
for sample in range(1, n_resamples + 1):
# Randomly select rows from the original data frame
bootstrap_sample = data.sample(n_rows, replace=True)
bootstrap_sample['bs_id'] = sample
# Add bootstrap sample to bootstrap_samples
bootstrap_samples = pd.concat([bootstrap_samples, bootstrap_sample], ignore_index=True)
# Re-order the columns
cols = ["bs_id"] + [col for col in bootstrap_samples.columns if col != "bs_id"]
bootstrap_samples = bootstrap_samples[cols]
return bootstrap_samples
def get_oob_samples(data: Type[pd.DataFrame],
bootstrap_samples: Type[pd.DataFrame],
id_column: str) -> Dict:
"""
This function generates a dictionary containing the out-of-bag (OOB) samples for each bootstrap sample.
Parameters:
data (pd.DataFrame): The original data.
bootstrap_samples (pd.DataFrame): The bootstrap samples.
id_column (str): The column name representing the unique identifier for each sample.
Returns:
Dict: A dictionary containing the out-of-bag samples for each bootstrap sample.
"""
# Get the unique bootstrap sample ids
unique_bs_ids = bootstrap_samples['bs_id'].unique()
# Initialize a dictionary to store OOB samples for each bootstrap sample
oob_samples = {}
# Iterate over each bootstrap sample id
for bs_id in unique_bs_ids:
# Get the indices of the current bootstrap sample
in_bag_indices = bootstrap_samples[bootstrap_samples['bs_id'] == bs_id][id_column].values
# Get the indices of the original data
original_indices = data[id_column].values
# Find the out-of-bag indices
oob_indices = np.setdiff1d(original_indices, in_bag_indices)
# Store the OOB indices in the dictionary
oob_samples[bs_id] = oob_indices
return oob_samples
def compute_632plus_estimator(
bootstrap_theta: float,
oob_theta: Type[np.ndarray],
non_information_theta: float =0.5):
"""
This function calculates the .632+ bootstrap estimator, which is a method for estimating the performance of a model
trained using bootstrap resampling. The estimator takes into account both the in-sample performance (bootstrap_theta)
and the out-of-bag performance (oob_theta) to provide a more accurate estimate of the model's performance.
Parameters:
bootstrap_theta (float): The performance of the model on the bootstrap sample.
oob_theta (np.ndarray): An array containing the performance of the model on the out-of-bag samples.
non_information_theta (float, optional): The performance of a non-informative model. Defaults to 0.5.
Returns:
float: The .632+ bootstrap estimator.
"""
# Calculate the average OOB performance
avg_oob_theta = np.mean(oob_theta)
# Calculate the relative overfitting rate R
R = (avg_oob_theta - bootstrap_theta) / (non_information_theta - avg_oob_theta)
# Calculate the weight omega
omega = 0.632 / (1 - 0.368 * R)
# Calculate the .632+ estimator
theta_632plus = (1 - omega) * bootstrap_theta + omega * avg_oob_theta
return theta_632plus
if __name__ == "__main__":
import sklearn
cancer = sklearn.datasets.load_breast_cancer()
df = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
columns= np.append(cancer['feature_names'], ['target']))
# create unique ids for the observations
df = df.assign(patient_id = np.arange(1, df.shape[0]+1))
df.index = df.loc[:, 'patient_id']
n_bootstraps = 1000
bootstrap_ids = bootstrap(df, n_resamples = n_bootstraps, id_column = "patient_id")
oob_indices = get_oob_samples(df, bootstrap_ids, "patient_id")
# check the percentage of observations in resamples and oob
# the theory says that it should be around 63.2% for resamples
# and 36.8% for oob
oob_perc = np.zeros(n_bootstraps)
bs_perc = np.zeros(n_bootstraps)
for bs in range(0,n_bootstraps):
oob_perc[bs] = oob_indices[bs+1].__len__()/569*100
bs_perc[bs]= 100-oob_perc[bs]
print(oob_perc.mean(),bs_perc.mean(), oob_perc.std())
predictors = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension']
roc_auc_oob = np.zeros(n_bootstraps)
# train your model - No splitting, we use the whole data
X, y = df[predictors], df['target']
# Create a pipeline with a scaler and logistic regression
main_model = sklearn.pipeline.Pipeline([
('scaler', sklearn.preprocessing.StandardScaler()),
('logreg', sklearn.linear_model.LogisticRegression(random_state=42))
])
main_model.fit(X, y)
y_pred = main_model.predict(X)
roc_auc_orig = sklearn.metrics.roc_auc_score(y, y_pred)
for bs_id in range(2, n_bootstraps):
# select bootstrap
bs = df.loc[bootstrap_ids[bootstrap_ids['bs_id']==bs_id].loc[:, 'patient_id'], :]
oob = df.loc[oob_indices[bs_id], :]
X_train, y_train, X_test, y_test = bs[predictors], bs['target'], oob[predictors], oob['target']
# Create a pipeline with a scaler and logistic regression
pipeline = sklearn.pipeline.Pipeline([
('scaler', sklearn.preprocessing.StandardScaler()),
('logreg', sklearn.linear_model.LogisticRegression(random_state=42))
])
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)
# Predict on the test data
y_pred = pipeline.predict_proba(X_test)[:, 1]
# Evaluate the model
roc_auc_oob[bs_id] = sklearn.metrics.roc_auc_score(y_test, y_pred)
# Calculate the .632+ estimator
theta_632plus = compute_632plus_estimator(bootstrap_theta = roc_auc_orig, oob_theta = roc_auc_oob, non_information_theta=0.5)
cis = np.quantile(roc_auc_oob, [0.025, 0.975])
print(f'.632+ Estimator: {theta_632plus:.4f}')
print(f'mean oob: {roc_auc_oob.mean():.4f}')
print(f'Confidence intervals: {cis[0]:.4f}, {cis[1]:.4f}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment