Last active
May 3, 2024 12:36
-
-
Save alekfrohlich/11a47ce0d19f846e024c0c5602cf60f0 to your computer and use it in GitHub Desktop.
Machine learning experiments of my master's thesis without the dataset.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code for plotting predictive value and precision-recall curves along with their associated confidence bands.
import numpy as np | |
import pandas as pd | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
from xgboost import XGBClassifier | |
from sklearn.metrics import confusion_matrix | |
def estimator_bias(m: int, k: int) -> tuple:
    '''Compute the pair of bias terms appearing in Lemma X.

    Both terms share the factor ``m * sqrt(pi / (2 * (m - 1)))``; the first is
    divided by ``2 * k`` and the second by ``2 * (m - k)``.

    Args:
        m: Sample size. ``m - 1`` is used as a divisor.
        k: Index of the operating point. ``k`` and ``m - k`` are used as
            divisors.

    Returns:
        A 2-tuple ``(ppv_bias, npv_bias)``, in the order the bound functions
        in this file unpack it.
    '''
    # The factor does not depend on k, so compute it once for both terms.
    shared_factor = m * np.sqrt(np.pi / (2 * (m - 1)))
    return (
        shared_factor / (2 * k),
        shared_factor / (2 * (m - k)),
    )
def large_deviation_bound(delta: float, m: int, k: int, *args) -> tuple:
    '''Compute the large deviation bound as in X.

    Each returned value is the deviation term
    ``sqrt(m * log(4 * m / delta) / 2)`` divided by ``k`` (PPV) or ``m - k``
    (NPV), plus the matching bias term from ``estimator_bias``.

    Args:
        delta: Value dividing ``4 * m`` inside the logarithm
            (presumably the confidence level -- confirm against the thesis).
        m: Sample size.
        k: Index of the operating point. ``k`` and ``m - k`` are used as
            divisors.
        *args: Ignored. Accepted so this function can be called with the same
            arguments as ``uniform_bound`` (which takes an extra ``vc``).

    Returns:
        A 2-tuple ``(ppv_bound, npv_bound)``.
    '''
    ppv_bias, npv_bias = estimator_bias(m, k)
    # The square-root term is shared by both bounds; only the divisor differs.
    deviation = np.sqrt((m * np.log(4*m / delta)) / 2)
    return (
        deviation / k + ppv_bias,
        deviation / (m - k) + npv_bias,
    )
def uniform_bound(delta: float, m: int, k: int, vc: float) -> tuple:
    '''Compute the uniform generalization bound as in X.

    Each returned value is the complexity term
    ``sqrt(2 * m * (log(8 * m / delta) + 2 * vc * log(e * m / vc)))`` divided
    by ``k`` (PPV) or ``m - k`` (NPV), plus the matching bias term from
    ``estimator_bias``.

    Args:
        delta: Value dividing ``8 * m`` inside the logarithm
            (presumably the confidence level -- confirm against the thesis).
        m: Sample size.
        k: Index of the operating point. ``k`` and ``m - k`` are used as
            divisors.
        vc: Complexity parameter of the bound (named like a VC dimension --
            confirm against the thesis). Must be non-zero, since it is a
            divisor inside the logarithm.

    Returns:
        A 2-tuple ``(ppv_bound, npv_bound)``. Entries are NaN whenever the
        expression under the square root is negative.
    '''
    ppv_bias, npv_bias = estimator_bias(m, k)
    # The complexity term is shared by both bounds; only the divisor differs.
    complexity = np.sqrt(2*m*(np.log(8*m / delta) + 2*vc*np.log(np.e*m / vc)))
    return (
        complexity / k + ppv_bias,
        complexity / (m - k) + npv_bias,
    )
def plot_curves(axes: tuple, xgb: XGBClassifier, X: pd.DataFrame, y: pd.Series, train=False, vc=None) -> tuple:
    '''Plot predictive value curves with confidence bands and a precision-recall curve.

    The positive-class scores ``xgb.predict_proba(X)[:, 1]`` are turned into
    PPV, NPV and recall curves by ``classification_curves``. PPV and NPV are
    drawn on the first axis against ``k / m`` together with a band of
    half-width given by ``uniform_bound`` (when ``train`` is truthy) or
    ``large_deviation_bound`` (otherwise), evaluated at ``delta = 0.05``.
    PPV against recall is drawn on the second axis.

    Args:
        axes: Pair ``(ax1, ax2)`` of axes to draw on.
        xgb: Fitted classifier exposing ``predict_proba``.
        X: Features passed to ``xgb.predict_proba``.
        y: Labels aligned with the rows of ``X``.
        train: Selects the bound; truthy uses ``uniform_bound``.
        vc: Forwarded to the bound function; required when ``train`` is truthy.

    Returns:
        The same ``(ax1, ax2)`` pair, after plotting and configuration.

    Raises:
        ValueError: If ``train`` is truthy and ``vc`` is None.
    '''
    # Validating parameters
    # - An explicit raise is used because `assert` is stripped under `python -O`.
    if train and vc is None:
        raise ValueError('vc must be provided when train is True')

    bound = uniform_bound if train else large_deviation_bound
    m = y.size
    scores = xgb.predict_proba(X)[:, 1]

    # Computing empirical predictive values (specificity is not plotted)
    alphas, ppvs, npvs, recalls, _ = classification_curves(y, scores)

    # Evaluate the bound once per k; column 0 is the PPV bound, column 1 the NPV bound.
    # - The values must be capped, otherwise Numpy will generate NaNs and the confidence bands
    #   for training won't appear in the plot.
    raw_bounds = np.array([bound(0.05, m, k, vc) for k in range(1, m)], dtype=float).reshape(-1, 2)
    capped_bounds = np.nan_to_num(raw_bounds, nan=100)
    ppv_bounds = capped_bounds[:, 0]
    npv_bounds = capped_bounds[:, 1]

    ax1, ax2 = axes

    # Plot predictive value curves along their confidence bands
    ax1.plot(alphas, ppvs, label='PPV')
    ax1.fill_between(alphas, np.maximum(ppvs-ppv_bounds, 0), np.minimum(ppvs+ppv_bounds, 1.1), alpha=0.3)
    ax1.plot(alphas, npvs, label='NPV')
    ax1.fill_between(alphas, np.maximum(npvs-npv_bounds, 0), np.minimum(npvs+npv_bounds, 1.1), alpha=0.3)

    # Plot precision-recall curve
    ax2.plot(recalls, ppvs, label='PRC')

    # Configure plots
    ax1.set_ylim(0.3, 1.1)
    ax1.set_title('Predictive Value Curves')
    ax1.set_xlabel('Alpha')
    ax1.set_ylabel('Predictive Values')
    ax1.legend()

    ax2.set_ylim(0.3, 1.1)
    ax2.set_title('Precision-Recall Curve')
    ax2.set_xlabel('Recall')
    ax2.set_ylabel('Precision')
    ax2.legend()

    return (ax1, ax2)
def classification_curves(y_true: np.array, probas_pred: np.array) -> tuple:
    '''Compute PPV, NPV, sensitivity and specificity at every midpoint threshold.

    For ``k = 1, ..., m - 1`` the threshold (qfhat as in X) is the midpoint of
    the two consecutive sorted scores ``sorted_scores[m-k-1]`` and
    ``sorted_scores[m-k]``. A sample is predicted positive when its score is
    strictly greater than the threshold, so with distinct scores exactly the
    ``k`` highest-scored samples are predicted positive.

    The confusion counts for all thresholds are obtained from one sort and a
    prefix sum of the positive labels instead of one confusion matrix per
    threshold. This also keeps the function working when labels and
    predictions all fall in a single class.

    Args:
        y_true: Binary labels; every entry must equal 0 or 1.
        probas_pred: Scores, one per label.

    Returns:
        A 5-tuple ``(alphas, ppvs, npvs, sens, spec)`` of arrays with
        ``m - 1`` entries each, where ``alphas[k-1] == k / m``. A ratio whose
        denominator is zero is NaN.

    Raises:
        ValueError: If the inputs differ in length or a label is not 0 or 1.
    '''
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(probas_pred).ravel()
    m = labels.size
    if scores.size != m:
        raise ValueError('y_true and probas_pred must have the same length')
    if not np.isin(labels, (0, 1)).all():
        raise ValueError('y_true must contain only the labels 0 and 1')

    # Ascending order; NaN scores (if any) sort last and never exceed a threshold.
    order = np.argsort(scores, kind='stable')
    sorted_scores = scores[order]
    n_valid = m - np.count_nonzero(np.isnan(sorted_scores))

    # positives_upto[i] = number of positive labels among the i lowest scores.
    positives_upto = np.concatenate(([0], np.cumsum(labels[order] == 1)))
    total_pos = positives_upto[m]
    total_neg = m - total_pos

    # Midpoints of consecutive sorted scores, reversed so entry k-1 is qfhat(k).
    thresholds = ((sorted_scores[:-1] + sorted_scores[1:]) / 2)[::-1]

    # Index of the first sorted score strictly above each threshold.
    first_above = np.minimum(
        np.searchsorted(sorted_scores, thresholds, side='right'), n_valid
    )

    tp = positives_upto[n_valid] - positives_upto[first_above]
    fp = (n_valid - first_above) - tp
    fn = total_pos - tp
    tn = total_neg - fp

    # Computing classification_curves
    ppvs = tp / (tp + fp)
    npvs = tn / (tn + fn)
    sens = tp / (tp + fn)
    spec = tn / (tn + fp)

    return (np.arange(1, m) / m, ppvs, npvs, sens, spec)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment