Last active
May 25, 2024 23:06
-
-
Save phydev/e109810a06071448a137f515039fc203 to your computer and use it in GitHub Desktop.
Implementation of bootstrap .632+ estimator exemplified with breast cancer dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Type, Dict | |
import numpy as np | |
import pandas as pd | |
def bootstrap(data: Type[pd.DataFrame], | |
n_resamples: int, | |
id_column: str, | |
seed: int=42) -> Type[pd.DataFrame]: | |
""" | |
Draw N resamples with replacements from the given data. | |
Parameters: | |
data (pd.DataFrame): The input data from which to draw bootstrap samples. | |
n_resamples (int): The number of bootstrap samples to draw. | |
id_column (str): The column name representing the study ID. | |
seed (int, optional): The random seed for reproducibility. Defaults to 42. | |
Returns: | |
pd.DataFrame: A DataFrame containing the bootstrap samples with an additional 'bs_id' column. | |
""" | |
# select only the study id to save storage and memory | |
data = data[[id_column]] | |
# Initialize an empty data frame | |
bootstrap_samples = pd.DataFrame() | |
# Set the seed | |
np.random.seed(seed) | |
# Number of rows in original data frame | |
n_rows = len(data) | |
# For loop to create bootstrap samples | |
for sample in range(1, n_resamples + 1): | |
# Randomly select rows from the original data frame | |
bootstrap_sample = data.sample(n_rows, replace=True) | |
bootstrap_sample['bs_id'] = sample | |
# Add bootstrap sample to bootstrap_samples | |
bootstrap_samples = pd.concat([bootstrap_samples, bootstrap_sample], ignore_index=True) | |
# Re-order the columns | |
cols = ["bs_id"] + [col for col in bootstrap_samples.columns if col != "bs_id"] | |
bootstrap_samples = bootstrap_samples[cols] | |
return bootstrap_samples | |
def get_oob_samples(data: Type[pd.DataFrame], | |
bootstrap_samples: Type[pd.DataFrame], | |
id_column: str) -> Dict: | |
""" | |
This function generates a dictionary containing the out-of-bag (OOB) samples for each bootstrap sample. | |
Parameters: | |
data (pd.DataFrame): The original data. | |
bootstrap_samples (pd.DataFrame): The bootstrap samples. | |
id_column (str): The column name representing the unique identifier for each sample. | |
Returns: | |
Dict: A dictionary containing the out-of-bag samples for each bootstrap sample. | |
""" | |
# Get the unique bootstrap sample ids | |
unique_bs_ids = bootstrap_samples['bs_id'].unique() | |
# Initialize a dictionary to store OOB samples for each bootstrap sample | |
oob_samples = {} | |
# Iterate over each bootstrap sample id | |
for bs_id in unique_bs_ids: | |
# Get the indices of the current bootstrap sample | |
in_bag_indices = bootstrap_samples[bootstrap_samples['bs_id'] == bs_id][id_column].values | |
# Get the indices of the original data | |
original_indices = data[id_column].values | |
# Find the out-of-bag indices | |
oob_indices = np.setdiff1d(original_indices, in_bag_indices) | |
# Store the OOB indices in the dictionary | |
oob_samples[bs_id] = oob_indices | |
return oob_samples | |
def compute_632plus_estimator( | |
bootstrap_theta: float, | |
oob_theta: Type[np.ndarray], | |
non_information_theta: float =0.5): | |
""" | |
This function calculates the .632+ bootstrap estimator, which is a method for estimating the performance of a model | |
trained using bootstrap resampling. The estimator takes into account both the in-sample performance (bootstrap_theta) | |
and the out-of-bag performance (oob_theta) to provide a more accurate estimate of the model's performance. | |
Parameters: | |
bootstrap_theta (float): The performance of the model on the bootstrap sample. | |
oob_theta (np.ndarray): An array containing the performance of the model on the out-of-bag samples. | |
non_information_theta (float, optional): The performance of a non-informative model. Defaults to 0.5. | |
Returns: | |
float: The .632+ bootstrap estimator. | |
""" | |
# Calculate the average OOB performance | |
avg_oob_theta = np.mean(oob_theta) | |
# Calculate the relative overfitting rate R | |
R = (avg_oob_theta - bootstrap_theta) / (non_information_theta - avg_oob_theta) | |
# Calculate the weight omega | |
omega = 0.632 / (1 - 0.368 * R) | |
# Calculate the .632+ estimator | |
theta_632plus = (1 - omega) * bootstrap_theta + omega * avg_oob_theta | |
return theta_632plus | |
if __name__ == "__main__": | |
import sklearn | |
cancer = sklearn.datasets.load_breast_cancer() | |
df = pd.DataFrame(np.c_[cancer['data'], cancer['target']], | |
columns= np.append(cancer['feature_names'], ['target'])) | |
# create unique ids for the observations | |
df = df.assign(patient_id = np.arange(1, df.shape[0]+1)) | |
df.index = df.loc[:, 'patient_id'] | |
n_bootstraps = 1000 | |
bootstrap_ids = bootstrap(df, n_resamples = n_bootstraps, id_column = "patient_id") | |
oob_indices = get_oob_samples(df, bootstrap_ids, "patient_id") | |
# check the percentage of observations in resamples and oob | |
# the theory says that it should be around 63.2% for resamples | |
# and 36.8% for oob | |
oob_perc = np.zeros(n_bootstraps) | |
bs_perc = np.zeros(n_bootstraps) | |
for bs in range(0,n_bootstraps): | |
oob_perc[bs] = oob_indices[bs+1].__len__()/569*100 | |
bs_perc[bs]= 100-oob_perc[bs] | |
print(oob_perc.mean(),bs_perc.mean(), oob_perc.std()) | |
predictors = ['mean radius', 'mean texture', 'mean perimeter', 'mean area', | |
'mean smoothness', 'mean compactness', 'mean concavity', | |
'mean concave points', 'mean symmetry', 'mean fractal dimension', | |
'radius error', 'texture error', 'perimeter error', 'area error', | |
'smoothness error', 'compactness error', 'concavity error', | |
'concave points error', 'symmetry error', 'fractal dimension error', | |
'worst radius', 'worst texture', 'worst perimeter', 'worst area', | |
'worst smoothness', 'worst compactness', 'worst concavity', | |
'worst concave points', 'worst symmetry', 'worst fractal dimension'] | |
roc_auc_oob = np.zeros(n_bootstraps) | |
# train your model - No splitting, we use the whole data | |
X, y = df[predictors], df['target'] | |
# Create a pipeline with a scaler and logistic regression | |
main_model = sklearn.pipeline.Pipeline([ | |
('scaler', sklearn.preprocessing.StandardScaler()), | |
('logreg', sklearn.linear_model.LogisticRegression(random_state=42)) | |
]) | |
main_model.fit(X, y) | |
y_pred = main_model.predict(X) | |
roc_auc_orig = sklearn.metrics.roc_auc_score(y, y_pred) | |
for bs_id in range(2, n_bootstraps): | |
# select bootstrap | |
bs = df.loc[bootstrap_ids[bootstrap_ids['bs_id']==bs_id].loc[:, 'patient_id'], :] | |
oob = df.loc[oob_indices[bs_id], :] | |
X_train, y_train, X_test, y_test = bs[predictors], bs['target'], oob[predictors], oob['target'] | |
# Create a pipeline with a scaler and logistic regression | |
pipeline = sklearn.pipeline.Pipeline([ | |
('scaler', sklearn.preprocessing.StandardScaler()), | |
('logreg', sklearn.linear_model.LogisticRegression(random_state=42)) | |
]) | |
# Fit the pipeline on the training data | |
pipeline.fit(X_train, y_train) | |
# Predict on the test data | |
y_pred = pipeline.predict_proba(X_test)[:, 1] | |
# Evaluate the model | |
roc_auc_oob[bs_id] = sklearn.metrics.roc_auc_score(y_test, y_pred) | |
# Calculate the .632+ estimator | |
theta_632plus = compute_632plus_estimator(bootstrap_theta = roc_auc_orig, oob_theta = roc_auc_oob, non_information_theta=0.5) | |
cis = np.quantile(roc_auc_oob, [0.025, 0.975]) | |
print(f'.632+ Estimator: {theta_632plus:.4f}') | |
print(f'mean oob: {roc_auc_oob.mean():.4f}') | |
print(f'Confidence intervals: {cis[0]:.4f}, {cis[1]:.4f}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment