Skip to content

Instantly share code, notes, and snippets.

@alekfrohlich
Last active May 3, 2024 12:36
Show Gist options
  • Save alekfrohlich/11a47ce0d19f846e024c0c5602cf60f0 to your computer and use it in GitHub Desktop.
Save alekfrohlich/11a47ce0d19f846e024c0c5602cf60f0 to your computer and use it in GitHub Desktop.
Machine learning experiments of my master's thesis without the dataset.
# Code for plotting predictive value and precision-recall curves along with their associated confidence bands.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
def estimator_bias(m: int, k: int) -> tuple:
    '''Compute the bias terms appearing in Lemma X.

    Both terms share the factor ``m * sqrt(pi / (2 * (m - 1)))``; the first
    is divided by ``2 * k`` and the second by ``2 * (m - k)``.

    Args:
        m: Sample size. The formula divides by ``m - 1``, so ``m`` must not be 1.
        k: Index of the operating point (presumably the number of top-scoring
            samples flagged positive -- confirm against the thesis). The
            formula divides by ``k`` and ``m - k``, so ``0 < k < m`` is expected.

    Returns:
        A pair ``(ppv_bias, npv_bias)``, in the order callers unpack it.
    '''
    # Factor common to both bias terms; computed once instead of twice.
    shared = m * np.sqrt(np.pi / (2 * (m - 1)))
    return (
        shared / (2 * k),
        shared / (2 * (m - k)),
    )
def large_deviation_bound(delta: float, m: int, k: float, *args) -> tuple:
    '''Compute the large deviation bound as in X.

    Args:
        delta: Confidence parameter; enters through ``log(4 * m / delta)``.
        m: Sample size.
        k: Index of the operating point; the PPV term is divided by ``k`` and
            the NPV term by ``m - k``.
        *args: Accepted and ignored, so this function can be called with the
            same argument list as ``uniform_bound``.

    Returns:
        A pair ``(ppv_bound, npv_bound)``; each is a deviation term plus the
        matching bias from ``estimator_bias``.
    '''
    bias_ppv, bias_npv = estimator_bias(m, k)
    # Numerator shared by the PPV and NPV deviation terms.
    deviation = np.sqrt((m * np.log(4*m / delta)) / 2)
    return (
        deviation / k + bias_ppv,
        deviation / (m - k) + bias_npv,
    )
def uniform_bound(delta: float, m: int, k: int, vc) -> tuple:
    '''Compute the uniform generalization bound as in X.

    Args:
        delta: Confidence parameter; enters through ``log(8 * m / delta)``.
        m: Sample size.
        k: Index of the operating point; the PPV term is divided by ``k`` and
            the NPV term by ``m - k``.
        vc: Complexity parameter (presumably the VC dimension of the
            hypothesis class -- confirm against the thesis). Must be a
            non-zero number, since the formula evaluates ``log(e * m / vc)``.

    Returns:
        A pair ``(ppv_bound, npv_bound)``; each is a complexity term plus the
        matching bias from ``estimator_bias``.
    '''
    bias_ppv, bias_npv = estimator_bias(m, k)
    # Numerator shared by the PPV and NPV complexity terms.
    complexity = np.sqrt(2*m*(np.log(8*m / delta) + 2*vc*np.log(np.e*m / vc)))
    return (
        complexity / k + bias_ppv,
        complexity / (m - k) + bias_npv,
    )
def plot_curves(axes: tuple, xgb: XGBClassifier, X: pd.DataFrame, y: pd.Series, train=False, vc=None, delta=0.05) -> tuple:
    '''Plot predictive value curves with confidence bands and a precision-recall curve.

    Args:
        axes: Pair ``(ax1, ax2)`` of matplotlib axes. ``ax1`` receives the
            PPV/NPV curves and their bands, ``ax2`` the precision-recall curve.
        xgb: Fitted classifier; column 1 of ``predict_proba(X)`` is used as
            the score.
        X: Feature matrix passed to ``xgb.predict_proba``.
        y: Binary labels aligned with the rows of ``X``.
        train: If true, the bands use ``uniform_bound`` (which needs ``vc``);
            otherwise they use ``large_deviation_bound``.
        vc: Complexity parameter forwarded to the bound; required when
            ``train`` is true.
        delta: Confidence parameter forwarded to the bound. Defaults to the
            previously hard-coded value 0.05.

    Returns:
        The pair ``(ax1, ax2)`` that was passed in, after plotting.

    Raises:
        ValueError: If ``train`` is true and ``vc`` is None.
    '''
    # Validate with an exception rather than `assert`, which is stripped under -O.
    if train and vc is None:
        raise ValueError('vc must be provided when train=True')
    bound = uniform_bound if train else large_deviation_bound

    m = y.size
    scores = xgb.predict_proba(X)[:, 1]

    # Empirical curves come from the shared helper instead of a duplicated
    # copy of the same thresholding logic.
    alphas, ppvs, npvs, recalls, _ = classification_curves(y, scores)

    # Evaluate the bound once per k; column 0 is the PPV width, column 1 the NPV width.
    # - The values must be capped, otherwise Numpy will generate NaNs and the
    #   confidence bands for training won't appear in the plot.
    # - reshape keeps the (0, 2) shape when there are no operating points (m <= 1).
    widths = np.nan_to_num(
        np.array([bound(delta, m, k, vc) for k in range(1, m)], dtype=float).reshape(-1, 2),
        nan=100,
    )

    ax1, ax2 = axes

    # Plot predictive value curves along their confidence bands
    for values, width, label in ((ppvs, widths[:, 0], 'PPV'), (npvs, widths[:, 1], 'NPV')):
        ax1.plot(alphas, values, label=label)
        ax1.fill_between(
            alphas,
            np.maximum(values - width, 0),
            np.minimum(values + width, 1.1),
            alpha=0.3,
        )

    # Plot precision-recall curve
    ax2.plot(recalls, ppvs, label='PRC')

    # Configure plots
    ax1.set_ylim(0.3, 1.1)
    ax1.set_title('Predictive Value Curves')
    ax1.set_xlabel('Alpha')
    ax1.set_ylabel('Predictive Values')
    ax1.legend()

    ax2.set_ylim(0.3, 1.1)
    ax2.set_title('Precision-Recall Curve')
    ax2.set_xlabel('Recall')
    ax2.set_ylabel('Precision')
    ax2.legend()

    return (ax1, ax2)
def classification_curves(y_true: np.array, probas_pred: np.array) -> tuple:
    '''Compute PPV, NPV, sensitivity and specificity at every empirical threshold.

    For ``k = 1, ..., m - 1`` the threshold qfhat(k) (as in X) is the midpoint
    between the k-th and (k+1)-th largest scores; a sample is predicted
    positive when its score is strictly greater than that threshold.

    The counts are obtained with binary searches on the sorted scores of each
    class, so no per-threshold confusion matrix is built and degenerate cases
    (single-class labels, tied scores) do not fail.

    Args:
        y_true: Binary labels (0 or 1), one per sample.
        probas_pred: Scores, one per sample; higher means more likely positive.

    Returns:
        A tuple ``(alphas, ppvs, npvs, sens, spec)`` of arrays of length
        ``m - 1``, where ``alphas[k - 1] == k / m``. A ratio whose denominator
        is zero is NaN (Numpy emits a RuntimeWarning in that case).

    Raises:
        ValueError: If the inputs differ in length or ``y_true`` contains a
            value other than 0 and 1.
    '''
    labels = np.asarray(y_true).ravel()
    # The score dtype is deliberately left untouched so thresholds are compared
    # in the same precision as the scores themselves.
    scores = np.asarray(probas_pred).ravel()
    if labels.shape != scores.shape:
        raise ValueError('y_true and probas_pred must have the same length')
    if not np.isin(labels, (0, 1)).all():
        raise ValueError('y_true must contain only the binary labels 0 and 1')

    m = labels.size
    ordered = np.sort(scores)
    # thresholds[k - 1] == (ordered[m - k - 1] + ordered[m - k]) / 2 for k = 1..m-1.
    thresholds = (ordered[:-1] + ordered[1:])[::-1] / 2

    is_positive = labels == 1
    pos_scores = np.sort(scores[is_positive])
    neg_scores = np.sort(scores[~is_positive])

    # searchsorted(..., side='right') counts scores <= threshold, so the
    # remainder is the number of scores strictly above it.
    tp = pos_scores.size - np.searchsorted(pos_scores, thresholds, side='right')
    fp = neg_scores.size - np.searchsorted(neg_scores, thresholds, side='right')
    fn = pos_scores.size - tp
    tn = neg_scores.size - fp

    ppvs = tp / (tp + fp)
    npvs = tn / (tn + fn)
    sens = tp / (tp + fn)
    spec = tn / (tn + fp)
    return (np.arange(1, m) / m, ppvs, npvs, sens, spec)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment