phydev/bootstrap_breast.py

## bootstrap_breast.py
from typing import Type, Dict
import numpy as np
import pandas as pd

def bootstrap(data: Type[pd.DataFrame],
              n_resamples: int,
              id_column: str,
              seed: int=42) -> Type[pd.DataFrame]:
    """
    Draw N resamples with replacements from the given data.

    Parameters:
    data (pd.DataFrame): The input data from which to draw bootstrap samples.
    n_resamples (int): The number of bootstrap samples to draw.
    id_column (str): The column name representing the study ID.
    seed (int, optional): The random seed for reproducibility. Defaults to 42.

    Returns:
    pd.DataFrame: A DataFrame containing the bootstrap samples with an additional 'bs_id' column.

    """

    # select only the study id to save storage and memory
    data = data[[id_column]]

    # Initialize an empty data frame
    bootstrap_samples = pd.DataFrame()

    # Set the seed
    np.random.seed(seed)

    # Number of rows in original data frame
    n_rows = len(data)

    # For loop to create bootstrap samples
    for sample in range(1, n_resamples + 1):
        # Randomly select rows from the original data frame
        bootstrap_sample = data.sample(n_rows, replace=True)
        bootstrap_sample['bs_id'] = sample

        # Add bootstrap sample to bootstrap_samples
        bootstrap_samples = pd.concat([bootstrap_samples, bootstrap_sample], ignore_index=True)

    # Re-order the columns
    cols = ["bs_id"] + [col for col in bootstrap_samples.columns if col != "bs_id"]
    bootstrap_samples = bootstrap_samples[cols]


    return bootstrap_samples


def get_oob_samples(data: Type[pd.DataFrame],
                    bootstrap_samples: Type[pd.DataFrame],
                    id_column: str) -> Dict:
    """
    This function generates a dictionary containing the out-of-bag (OOB) samples for each bootstrap sample.

    Parameters:
        data (pd.DataFrame): The original data.
        bootstrap_samples (pd.DataFrame): The bootstrap samples.
        id_column (str): The column name representing the unique identifier for each sample.

    Returns:
        Dict: A dictionary containing the out-of-bag samples for each bootstrap sample.
    """

    # Get the unique bootstrap sample ids
    unique_bs_ids = bootstrap_samples['bs_id'].unique()

    # Initialize a dictionary to store OOB samples for each bootstrap sample
    oob_samples = {}

    # Iterate over each bootstrap sample id
    for bs_id in unique_bs_ids:
        # Get the indices of the current bootstrap sample
        in_bag_indices = bootstrap_samples[bootstrap_samples['bs_id'] == bs_id][id_column].values

        # Get the indices of the original data
        original_indices = data[id_column].values

        # Find the out-of-bag indices
        oob_indices = np.setdiff1d(original_indices, in_bag_indices)

        # Store the OOB indices in the dictionary
        oob_samples[bs_id] = oob_indices


    return oob_samples


def compute_632plus_estimator(
        bootstrap_theta: float,
        oob_theta: Type[np.ndarray],
        non_information_theta: float =0.5):
    """
    This function calculates the .632+ bootstrap estimator, which is a method for estimating the performance of a model
    trained using bootstrap resampling. The estimator takes into account both the in-sample performance (bootstrap_theta)
    and the out-of-bag performance (oob_theta) to provide a more accurate estimate of the model's performance.

    Parameters:
        bootstrap_theta (float): The performance of the model on the bootstrap sample.
        oob_theta (np.ndarray): An array containing the performance of the model on the out-of-bag samples.
        non_information_theta (float, optional): The performance of a non-informative model. Defaults to 0.5.

    Returns:
        float: The .632+ bootstrap estimator.
    """

    # Calculate the average OOB performance
    avg_oob_theta = np.mean(oob_theta)

    # Calculate the relative overfitting rate R
    R = (avg_oob_theta - bootstrap_theta) / (non_information_theta - avg_oob_theta)

    # Calculate the weight omega
    omega = 0.632 / (1 - 0.368 * R)

    # Calculate the .632+ estimator
    theta_632plus = (1 - omega) * bootstrap_theta + omega * avg_oob_theta

    return theta_632plus


if __name__ == "__main__":

    import sklearn

    cancer = sklearn.datasets.load_breast_cancer()

    df = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
                    columns= np.append(cancer['feature_names'], ['target']))

    # create unique ids for the observations
    df = df.assign(patient_id = np.arange(1, df.shape[0]+1))

    df.index = df.loc[:, 'patient_id']

    n_bootstraps = 1000
    bootstrap_ids = bootstrap(df, n_resamples = n_bootstraps, id_column = "patient_id")
    oob_indices = get_oob_samples(df, bootstrap_ids, "patient_id")


    # check the percentage of observations in resamples and oob
    # the theory says that it should be around 63.2% for resamples
    # and 36.8% for oob
    oob_perc = np.zeros(n_bootstraps)
    bs_perc = np.zeros(n_bootstraps)

    for bs in range(0,n_bootstraps):
        oob_perc[bs] = oob_indices[bs+1].__len__()/569*100
        bs_perc[bs]= 100-oob_perc[bs]

    print(oob_perc.mean(),bs_perc.mean(), oob_perc.std())

    predictors = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry', 'mean fractal dimension',
        'radius error', 'texture error', 'perimeter error', 'area error',
        'smoothness error', 'compactness error', 'concavity error',
        'concave points error', 'symmetry error', 'fractal dimension error',
        'worst radius', 'worst texture', 'worst perimeter', 'worst area',
        'worst smoothness', 'worst compactness', 'worst concavity',
        'worst concave points', 'worst symmetry', 'worst fractal dimension']


    roc_auc_oob = np.zeros(n_bootstraps)

    # train your model - No splitting, we use the whole data
    X, y = df[predictors], df['target']

    # Create a pipeline with a scaler and logistic regression
    main_model = sklearn.pipeline.Pipeline([
        ('scaler', sklearn.preprocessing.StandardScaler()),
        ('logreg', sklearn.linear_model.LogisticRegression(random_state=42))
    ])

    main_model.fit(X, y)

    y_pred = main_model.predict(X)
    roc_auc_orig = sklearn.metrics.roc_auc_score(y, y_pred)


    for bs_id in range(2, n_bootstraps):

        # select bootstrap
        bs = df.loc[bootstrap_ids[bootstrap_ids['bs_id']==bs_id].loc[:, 'patient_id'], :]
        oob = df.loc[oob_indices[bs_id], :]

        X_train, y_train, X_test, y_test = bs[predictors], bs['target'], oob[predictors], oob['target']


        # Create a pipeline with a scaler and logistic regression
        pipeline = sklearn.pipeline.Pipeline([
            ('scaler', sklearn.preprocessing.StandardScaler()),
            ('logreg', sklearn.linear_model.LogisticRegression(random_state=42))
        ])

        # Fit the pipeline on the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict_proba(X_test)[:, 1]

        # Evaluate the model
        roc_auc_oob[bs_id] = sklearn.metrics.roc_auc_score(y_test, y_pred)

    # Calculate the .632+ estimator
    theta_632plus = compute_632plus_estimator(bootstrap_theta = roc_auc_orig, oob_theta = roc_auc_oob, non_information_theta=0.5)

    cis = np.quantile(roc_auc_oob, [0.025, 0.975])

    print(f'.632+ Estimator: {theta_632plus:.4f}')
    print(f'mean oob: {roc_auc_oob.mean():.4f}')
    print(f'Confidence intervals: {cis[0]:.4f}, {cis[1]:.4f}')
	from typing import Type, Dict
	import numpy as np
	import pandas as pd

	def bootstrap(data: Type[pd.DataFrame],
	n_resamples: int,
	id_column: str,
	seed: int=42) -> Type[pd.DataFrame]:
	"""
	Draw N resamples with replacements from the given data.

	Parameters:
	data (pd.DataFrame): The input data from which to draw bootstrap samples.
	n_resamples (int): The number of bootstrap samples to draw.
	id_column (str): The column name representing the study ID.
	seed (int, optional): The random seed for reproducibility. Defaults to 42.

	Returns:
	pd.DataFrame: A DataFrame containing the bootstrap samples with an additional 'bs_id' column.

	"""

	# select only the study id to save storage and memory
	data = data[[id_column]]

	# Initialize an empty data frame
	bootstrap_samples = pd.DataFrame()

	# Set the seed
	np.random.seed(seed)

	# Number of rows in original data frame
	n_rows = len(data)

	# For loop to create bootstrap samples
	for sample in range(1, n_resamples + 1):
	# Randomly select rows from the original data frame
	bootstrap_sample = data.sample(n_rows, replace=True)
	bootstrap_sample['bs_id'] = sample

	# Add bootstrap sample to bootstrap_samples
	bootstrap_samples = pd.concat([bootstrap_samples, bootstrap_sample], ignore_index=True)

	# Re-order the columns
	cols = ["bs_id"] + [col for col in bootstrap_samples.columns if col != "bs_id"]
	bootstrap_samples = bootstrap_samples[cols]


	return bootstrap_samples



	def get_oob_samples(data: Type[pd.DataFrame],
	bootstrap_samples: Type[pd.DataFrame],
	id_column: str) -> Dict:
	"""
	This function generates a dictionary containing the out-of-bag (OOB) samples for each bootstrap sample.

	Parameters:
	data (pd.DataFrame): The original data.
	bootstrap_samples (pd.DataFrame): The bootstrap samples.
	id_column (str): The column name representing the unique identifier for each sample.

	Returns:
	Dict: A dictionary containing the out-of-bag samples for each bootstrap sample.
	"""

	# Get the unique bootstrap sample ids
	unique_bs_ids = bootstrap_samples['bs_id'].unique()

	# Initialize a dictionary to store OOB samples for each bootstrap sample
	oob_samples = {}

	# Iterate over each bootstrap sample id
	for bs_id in unique_bs_ids:
	# Get the indices of the current bootstrap sample
	in_bag_indices = bootstrap_samples[bootstrap_samples['bs_id'] == bs_id][id_column].values

	# Get the indices of the original data
	original_indices = data[id_column].values

	# Find the out-of-bag indices
	oob_indices = np.setdiff1d(original_indices, in_bag_indices)

	# Store the OOB indices in the dictionary
	oob_samples[bs_id] = oob_indices


	return oob_samples


	def compute_632plus_estimator(
	bootstrap_theta: float,
	oob_theta: Type[np.ndarray],
	non_information_theta: float =0.5):
	"""
	This function calculates the .632+ bootstrap estimator, which is a method for estimating the performance of a model
	trained using bootstrap resampling. The estimator takes into account both the in-sample performance (bootstrap_theta)
	and the out-of-bag performance (oob_theta) to provide a more accurate estimate of the model's performance.

	Parameters:
	bootstrap_theta (float): The performance of the model on the bootstrap sample.
	oob_theta (np.ndarray): An array containing the performance of the model on the out-of-bag samples.
	non_information_theta (float, optional): The performance of a non-informative model. Defaults to 0.5.

	Returns:
	float: The .632+ bootstrap estimator.
	"""

	# Calculate the average OOB performance
	avg_oob_theta = np.mean(oob_theta)

	# Calculate the relative overfitting rate R
	R = (avg_oob_theta - bootstrap_theta) / (non_information_theta - avg_oob_theta)

	# Calculate the weight omega
	omega = 0.632 / (1 - 0.368 * R)

	# Calculate the .632+ estimator
	theta_632plus = (1 - omega) * bootstrap_theta + omega * avg_oob_theta

	return theta_632plus


	if __name__ == "__main__":

	import sklearn

	cancer = sklearn.datasets.load_breast_cancer()

	df = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
	columns= np.append(cancer['feature_names'], ['target']))

	# create unique ids for the observations
	df = df.assign(patient_id = np.arange(1, df.shape[0]+1))

	df.index = df.loc[:, 'patient_id']

	n_bootstraps = 1000
	bootstrap_ids = bootstrap(df, n_resamples = n_bootstraps, id_column = "patient_id")
	oob_indices = get_oob_samples(df, bootstrap_ids, "patient_id")


	# check the percentage of observations in resamples and oob
	# the theory says that it should be around 63.2% for resamples
	# and 36.8% for oob
	oob_perc = np.zeros(n_bootstraps)
	bs_perc = np.zeros(n_bootstraps)

	for bs in range(0,n_bootstraps):
	oob_perc[bs] = oob_indices[bs+1].__len__()/569*100
	bs_perc[bs]= 100-oob_perc[bs]

	print(oob_perc.mean(),bs_perc.mean(), oob_perc.std())

	predictors = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
	'mean smoothness', 'mean compactness', 'mean concavity',
	'mean concave points', 'mean symmetry', 'mean fractal dimension',
	'radius error', 'texture error', 'perimeter error', 'area error',
	'smoothness error', 'compactness error', 'concavity error',
	'concave points error', 'symmetry error', 'fractal dimension error',
	'worst radius', 'worst texture', 'worst perimeter', 'worst area',
	'worst smoothness', 'worst compactness', 'worst concavity',
	'worst concave points', 'worst symmetry', 'worst fractal dimension']


	roc_auc_oob = np.zeros(n_bootstraps)

	# train your model - No splitting, we use the whole data
	X, y = df[predictors], df['target']

	# Create a pipeline with a scaler and logistic regression
	main_model = sklearn.pipeline.Pipeline([
	('scaler', sklearn.preprocessing.StandardScaler()),
	('logreg', sklearn.linear_model.LogisticRegression(random_state=42))
	])

	main_model.fit(X, y)

	y_pred = main_model.predict(X)
	roc_auc_orig = sklearn.metrics.roc_auc_score(y, y_pred)


	for bs_id in range(2, n_bootstraps):

	# select bootstrap
	bs = df.loc[bootstrap_ids[bootstrap_ids['bs_id']==bs_id].loc[:, 'patient_id'], :]
	oob = df.loc[oob_indices[bs_id], :]

	X_train, y_train, X_test, y_test = bs[predictors], bs['target'], oob[predictors], oob['target']


	# Create a pipeline with a scaler and logistic regression
	pipeline = sklearn.pipeline.Pipeline([
	('scaler', sklearn.preprocessing.StandardScaler()),
	('logreg', sklearn.linear_model.LogisticRegression(random_state=42))
	])

	# Fit the pipeline on the training data
	pipeline.fit(X_train, y_train)

	# Predict on the test data
	y_pred = pipeline.predict_proba(X_test)[:, 1]

	# Evaluate the model
	roc_auc_oob[bs_id] = sklearn.metrics.roc_auc_score(y_test, y_pred)

	# Calculate the .632+ estimator
	theta_632plus = compute_632plus_estimator(bootstrap_theta = roc_auc_orig, oob_theta = roc_auc_oob, non_information_theta=0.5)

	cis = np.quantile(roc_auc_oob, [0.025, 0.975])

	print(f'.632+ Estimator: {theta_632plus:.4f}')
	print(f'mean oob: {roc_auc_oob.mean():.4f}')
	print(f'Confidence intervals: {cis[0]:.4f}, {cis[1]:.4f}')