import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, brier_score_loss
from sklearn.utils import resample
import matplotlib.pyplot as plt
# Load in the data to be used.
data = load_breast_cancer()
X, y = data['data'], data['target']
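# Quick look at what we loaded (a sanity check, not in the original gist):
# 569 samples, 30 features, binary target; the prevalence of the positive
# class matters when judging the calibration curve later.
print(X.shape, y.mean())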
# The model we will be using will be an L2 penalized logistic regression.
# We will have to select the appropriate penalty strength, and that requires cross validation.
# There are some convergence issues, so pick a stupidly large max iteration count.
pipeline_components = [
    ('scaling', StandardScaler()),
    ('logistic_regression', LogisticRegression(penalty='l2', max_iter=1_000_000))
]
a_model = Pipeline(pipeline_components)
param_grid = {'logistic_regression__C': 1/np.logspace(-2, 2, base = np.exp(1))}
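# Sanity check on the grid (my note, not in the original gist): np.logspace
# defaults to 50 points, so this grid runs from C = e^2 down to C = e^-2.
print(param_grid['logistic_regression__C'].min(), param_grid['logistic_regression__C'].max())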
# Choose a proper scoring rule for model selection.
# brier_score_loss is a loss (lower is better), so we need greater_is_better=False;
# sklearn then negates the score internally so grid search still maximizes.
brier_score = make_scorer(brier_score_loss, greater_is_better=False, needs_proba=True, pos_label=1)
# According to Frank Harrell, 100 repeats of 10-fold CV is about as good as the optimism-corrected bootstrap
outer_cv = RepeatedKFold(n_splits=10, n_repeats=100)
# We will choose our regularization strength through 10-fold CV
inner_cv = KFold(n_splits=10)
gscv = GridSearchCV(a_model, param_grid=param_grid, cv=inner_cv, scoring=brier_score, verbose=0)
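# Rough cost accounting (my arithmetic): each GridSearchCV call fits
# 50 C values x 10 inner folds (plus one refit), and the outer loop below
# runs 10 folds x 100 repeats = 1,000 such calls, i.e. on the order of
# 500,000 pipeline fits.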
# Now let's estimate how well a model selected via 10-fold CV would do, using repeated CV.
# This is going to take a long time, so it is gated behind `if False`.
if False:
    cv_results = cross_val_score(gscv, X, y, cv=outer_cv, scoring=brier_score)
    # Scores come back negated because the scorer has greater_is_better=False.
    print(-cv_results.mean())
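    # A rough sense of the spread across the 1,000 scores (not a formal CI):
    print(np.percentile(-cv_results, [2.5, 97.5]))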
# Construct the apparent calibration
prange = np.linspace(0, 1, 25)
# Fit our model on all the data
best_model = gscv.fit(X, y).best_estimator_
# Estimate the risks from the best model
predicted_p = best_model.predict_proba(X)[:, 1]
# Compute the apparent calibration
apparent_cal = lowess(y, predicted_p, it=0, xvals=prange)
# Do the optimism-corrected bootstrap for the calibration curve
nsim = 500
optimism = np.zeros((nsim, prange.size))
for i in range(nsim):
    # Bootstrap the original dataset
    Xb, yb = resample(X, y)
    # Fit the model, including the hyperparameter selection, on the bootstrapped data
    fit = gscv.fit(Xb, yb).best_estimator_
    # Get the risk estimates from the model fit on the bootstrapped data
    predicted_risk_bs = fit.predict_proba(Xb)[:, 1]
    # Fit a calibration curve to the predicted risk on the bootstrapped data
    smooth_p_bs = lowess(yb, predicted_risk_bs, it=0, xvals=prange)
    # Apply the bootstrap model to the original data
    predicted_risk_orig = fit.predict_proba(X)[:, 1]
    # Fit a calibration curve on the original data using predictions from the bootstrap model
    smooth_p_bs_orig = lowess(y, predicted_risk_orig, it=0, xvals=prange)
    # Optimism: apparent (bootstrap) performance minus performance on the original data
    optimism[i] = smooth_p_bs - smooth_p_bs_orig
# Here is the bias corrected calibration curve
bias_corrected_cal = apparent_cal - optimism.mean(0)
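# Optional guard (an assumption on my part, not in the original gist): lowess
# will extrapolate to xvals outside the range of the predicted risks, so you
# may want to mask prange, apparent_cal, and bias_corrected_cal with this
# boolean index before plotting.
support = (prange >= predicted_p.min()) & (prange <= predicted_p.max())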
# and finally the plot
fig, ax = plt.subplots(dpi=240)
ax.set_aspect('equal')
ax.scatter(predicted_p, y, s=1, alpha=0.4, c='k')
ax.plot(prange, apparent_cal, 'red', label='Non-Parametric Estimate')
ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
ax.plot(prange, bias_corrected_cal, 'C0', label='Optimism Corrected Calibration')
ax.legend(loc='upper left', prop={'size': 6})
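# Finish the figure with axis labels and render it (standard matplotlib calls;
# the label wording is mine).
ax.set_xlabel('Predicted risk')
ax.set_ylabel('Observed outcome')
plt.show()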