alekfrohlich/curves.py

## curves.py
# Code for plotting predictive value and precision-recall curves along with their associated confidence bands.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

def estimator_bias(m: int, k: int) -> tuple:
    '''Compute the bias terms appearing in Lemma X.

    Args:
        m: Sample size. Must be greater than 1, since the formula divides by (m - 1).
        k: Number of samples predicted positive; expected to satisfy 0 < k < m.

    Returns:
        A pair (ppv_bias, npv_bias): the shared factor m * sqrt(pi / (2 * (m - 1)))
        divided by 2*k for the PPV and by 2*(m - k) for the NPV.
    '''
    # Both terms share the same numerator; compute it once.
    scale = m * np.sqrt(np.pi / (2 * (m - 1)))
    return (
        scale / (2 * k),
        scale / (2 * (m - k)),
    )

def large_deviation_bound(delta: float, m: int, k: float, *args) -> tuple:
    '''Compute the large deviation bound as in X.

    Args:
        delta: Confidence parameter; the value enters through log(4*m / delta).
        m: Sample size.
        k: Number of samples predicted positive.
        *args: Ignored. Accepted so this function can be called with the same
            argument list as uniform_bound.

    Returns:
        A pair (ppv_bound, npv_bound), each the sum of a deviation term and the
        corresponding bias term from estimator_bias.
    '''
    bias_ppv, bias_npv = estimator_bias(m, k)

    # The deviation numerator is common to both predictive values; only the
    # denominator (k versus m - k) differs.
    deviation = np.sqrt((m * np.log(4*m / delta)) / 2)

    ppv_bound = deviation / k + bias_ppv
    npv_bound = deviation / (m - k) + bias_npv
    return (ppv_bound, npv_bound)

def uniform_bound(delta: float, m: int, k: int, vc) -> tuple:
    '''Compute the uniform generalization bound as in X.

    Args:
        delta: Confidence parameter; the value enters through log(8*m / delta).
        m: Sample size.
        k: Number of samples predicted positive.
        vc: Complexity parameter used in the term 2*vc*log(e*m / vc)
            (presumably the VC dimension of the hypothesis class).

    Returns:
        A pair (ppv_bound, npv_bound), each the sum of a complexity term and the
        corresponding bias term from estimator_bias.
    '''
    bias_ppv, bias_npv = estimator_bias(m, k)

    # The complexity numerator is common to both predictive values; only the
    # denominator (k versus m - k) differs.
    complexity = np.sqrt(2*m*(np.log(8*m / delta) + 2*vc*np.log(np.e*m / vc)))

    ppv_bound = complexity / k + bias_ppv
    npv_bound = complexity / (m - k) + bias_npv
    return (ppv_bound, npv_bound)

def plot_curves(axes: tuple, xgb: XGBClassifier, X: pd.DataFrame, y: pd.Series, train=False, vc=None, delta: float = 0.05) -> tuple:
    '''Plot predictive value curves with confidence bands and the precision-recall curve.

    Args:
        axes: Pair (ax1, ax2) of matplotlib axes. ax1 receives the PPV/NPV
            curves and their bands, ax2 the precision-recall curve.
        xgb: Fitted classifier; predict_proba(X)[:, 1] is used as the score.
        X: Feature matrix passed to xgb.predict_proba.
        y: Binary labels aligned with the rows of X.
        train: If true, the bands use uniform_bound (which needs vc);
            otherwise they use large_deviation_bound.
        vc: Complexity parameter forwarded to the bound function; required
            when train is true.
        delta: Confidence parameter forwarded to the bound function.

    Returns:
        The pair (ax1, ax2) that was passed in, after drawing on it.

    Raises:
        ValueError: if train is true and vc is None.
    '''
    # Explicit check instead of assert, so it survives `python -O`.
    if train and vc is None:
        raise ValueError('vc must be provided when train=True')
    bound = uniform_bound if train else large_deviation_bound

    m = y.size
    scores = xgb.predict_proba(X)[:, 1]

    # Empirical curves for every cutoff k = 1..m-1 (alpha = k / m); the
    # sensitivity returned by classification_curves is the recall.
    alphas, ppvs, npvs, recalls, _ = classification_curves(y, scores)

    # Evaluate the bound once per k and keep both components.
    #   - NaNs (produced when the square root argument is negative) are replaced
    #     by a large value so the band is capped below instead of disappearing
    #     from the plot.
    #   - reshape keeps the array two-dimensional even when there are no cutoffs.
    half_widths = np.nan_to_num(
        np.array([bound(delta, m, k, vc) for k in range(1, m)], dtype=float).reshape(-1, 2),
        nan=100,
    )
    ppv_band = half_widths[:, 0]
    npv_band = half_widths[:, 1]

    ax1, ax2 = axes

    # Plot predictive value curves along their confidence bands
    ax1.plot(alphas, ppvs, label='PPV')
    ax1.fill_between(alphas, np.maximum(ppvs-ppv_band, 0), np.minimum(ppvs+ppv_band, 1.1), alpha=0.3)

    ax1.plot(alphas, npvs, label='NPV')
    ax1.fill_between(alphas, np.maximum(npvs-npv_band, 0), np.minimum(npvs+npv_band, 1.1), alpha=0.3)

    # Plot precision-recall curve
    ax2.plot(recalls, ppvs, label='PRC')

    # Configure plots
    ax1.set_ylim(0.3, 1.1)
    ax1.set_title('Predictive Value Curves')
    ax1.set_xlabel('Alpha')
    ax1.set_ylabel('Predictive Values')
    ax1.legend()

    ax2.set_ylim(0.3, 1.1)
    ax2.set_title('Precision-Recall Curve')
    ax2.set_xlabel('Recall')
    ax2.set_ylabel('Precision')
    ax2.legend()

    return (ax1, ax2)

def classification_curves(y_true: np.ndarray, probas_pred: np.ndarray) -> tuple:
    '''Compute PPV, NPV, sensitivity and specificity for every top-k cutoff.

    For each k in 1..m-1 the threshold (qfhat as in X) is the midpoint between
    the k-th and (k+1)-th largest scores; samples scoring strictly above it are
    predicted positive.

    Args:
        y_true: 1-D array-like of binary labels (0 = negative, 1 = positive).
        probas_pred: 1-D array-like of scores, same length as y_true.
            Assumed to contain no NaNs.

    Returns:
        Tuple (alphas, ppvs, npvs, sens, spec) of arrays of length m - 1, where
        alphas[k-1] = k / m. Entries whose denominator is zero are NaN (NumPy
        emits a RuntimeWarning for them).

    Raises:
        ValueError: if y_true contains labels other than 0 and 1.
    '''
    labels = np.asarray(y_true)
    scores = np.asarray(probas_pred)
    m = labels.size

    if not np.isin(labels, (0, 1)).all():
        raise ValueError('y_true must contain only the binary labels 0 and 1')

    # Sort once; every cutoff is then answered from cumulative counts instead
    # of building one confusion matrix per k.
    order = np.argsort(scores, kind='stable')
    sorted_scores = scores[order]

    ks = np.arange(1, m)
    thresholds = (sorted_scores[m - ks - 1] + sorted_scores[m - ks]) / 2

    # Number of scores strictly greater than each threshold. With ties this can
    # differ from k, exactly as `scores > threshold` would.
    n_predicted_pos = m - np.searchsorted(sorted_scores, thresholds, side='right')

    # cum_tp[j] = number of true positives among the j highest-scoring samples.
    # A strict threshold never splits a group of tied scores, so the order
    # inside a tie does not affect the counts that are looked up.
    positives_desc = (labels[order] == 1)[::-1]
    cum_tp = np.concatenate(([0], np.cumsum(positives_desc)))
    total_pos = cum_tp[-1]

    tp = cum_tp[n_predicted_pos]
    fp = n_predicted_pos - tp
    fn = total_pos - tp
    tn = (m - total_pos) - fp

    ppvs = tp / (tp + fp)
    npvs = tn / (tn + fn)
    sens = tp / (tp + fn)
    spec = tn / (tn + fp)

    return (ks / m, ppvs, npvs, sens, spec)


## machine_learning_experiments_msc.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              machine_learning_experiments_msc.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	# Code for plotting predictive value and precision-recall curves along with their associated confidence bands.
	import numpy as np
	import pandas as pd
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	from xgboost import XGBClassifier
	from sklearn.metrics import confusion_matrix

	def estimator_bias(m: int, k: int) -> tuple:
	    '''Compute the bias terms appearing in Lemma X.

	    Args:
	        m: Sample size. Must be greater than 1, since the formula divides by (m - 1).
	        k: Number of samples predicted positive; expected to satisfy 0 < k < m.

	    Returns:
	        A pair (ppv_bias, npv_bias): the shared factor m * sqrt(pi / (2 * (m - 1)))
	        divided by 2*k for the PPV and by 2*(m - k) for the NPV.
	    '''
	    # Both terms share the same numerator; compute it once.
	    scale = m * np.sqrt(np.pi / (2 * (m - 1)))
	    return (
	        scale / (2 * k),
	        scale / (2 * (m - k)),
	    )

	def large_deviation_bound(delta: float, m: int, k: float, *args) -> tuple:
	    '''Compute the large deviation bound as in X.

	    Args:
	        delta: Confidence parameter; the value enters through log(4*m / delta).
	        m: Sample size.
	        k: Number of samples predicted positive.
	        *args: Ignored. Accepted so this function can be called with the same
	            argument list as uniform_bound.

	    Returns:
	        A pair (ppv_bound, npv_bound), each the sum of a deviation term and the
	        corresponding bias term from estimator_bias.
	    '''
	    ppv_bias, npv_bias = estimator_bias(m, k)

	    # The deviation numerator is common to both predictive values; only the
	    # denominator (k versus m - k) differs.
	    deviation = np.sqrt((m * np.log(4*m / delta)) / 2)

	    return (
	        deviation / k + ppv_bias,
	        deviation / (m - k) + npv_bias,
	    )

	def uniform_bound(delta: float, m: int, k: int, vc) -> tuple:
	    '''Compute the uniform generalization bound as in X.

	    Args:
	        delta: Confidence parameter; the value enters through log(8*m / delta).
	        m: Sample size.
	        k: Number of samples predicted positive.
	        vc: Complexity parameter used in the term 2*vc*log(e*m / vc)
	            (presumably the VC dimension of the hypothesis class).

	    Returns:
	        A pair (ppv_bound, npv_bound), each the sum of a complexity term and the
	        corresponding bias term from estimator_bias.
	    '''
	    ppv_bias, npv_bias = estimator_bias(m, k)

	    # Multiplication operators written out explicitly: 2*m*(...), 8*m,
	    # 2*vc*np.log(...) and np.e*m.
	    complexity = np.sqrt(2*m*(np.log(8*m / delta) + 2*vc*np.log(np.e*m / vc)))

	    return (
	        complexity / k + ppv_bias,
	        complexity / (m - k) + npv_bias,
	    )

	def plot_curves(axes: tuple, xgb: XGBClassifier, X: pd.DataFrame, y: pd.Series, train=False, vc=None, delta: float = 0.05) -> tuple:
	    '''Plot predictive value curves with confidence bands and the precision-recall curve.

	    Args:
	        axes: Pair (ax1, ax2) of matplotlib axes. ax1 receives the PPV/NPV
	            curves and their bands, ax2 the precision-recall curve.
	        xgb: Fitted classifier; predict_proba(X)[:, 1] is used as the score.
	        X: Feature matrix passed to xgb.predict_proba.
	        y: Binary labels aligned with the rows of X.
	        train: If true, the bands use uniform_bound (which needs vc);
	            otherwise they use large_deviation_bound.
	        vc: Complexity parameter forwarded to the bound function; required
	            when train is true.
	        delta: Confidence parameter forwarded to the bound function.

	    Returns:
	        The pair (ax1, ax2) that was passed in, after drawing on it.

	    Raises:
	        ValueError: if train is true and vc is None.
	    '''
	    # Explicit check instead of assert, so it survives `python -O`.
	    if train and vc is None:
	        raise ValueError('vc must be provided when train=True')
	    bound = uniform_bound if train else large_deviation_bound

	    m = y.size
	    scores = xgb.predict_proba(X)[:, 1]

	    # Empirical curves for every cutoff k = 1..m-1 (alpha = k / m); the
	    # sensitivity returned by classification_curves is the recall.
	    alphas, ppvs, npvs, recalls, _ = classification_curves(y, scores)

	    # Evaluate the bound once per k and keep both components.
	    #   - NaNs (produced when the square root argument is negative) are replaced
	    #     by a large value so the band is capped below instead of disappearing
	    #     from the plot.
	    #   - reshape keeps the array two-dimensional even when there are no cutoffs.
	    half_widths = np.nan_to_num(
	        np.array([bound(delta, m, k, vc) for k in range(1, m)], dtype=float).reshape(-1, 2),
	        nan=100,
	    )
	    ppv_band = half_widths[:, 0]
	    npv_band = half_widths[:, 1]

	    ax1, ax2 = axes

	    # Plot predictive value curves along their confidence bands
	    ax1.plot(alphas, ppvs, label='PPV')
	    ax1.fill_between(alphas, np.maximum(ppvs-ppv_band, 0), np.minimum(ppvs+ppv_band, 1.1), alpha=0.3)

	    ax1.plot(alphas, npvs, label='NPV')
	    ax1.fill_between(alphas, np.maximum(npvs-npv_band, 0), np.minimum(npvs+npv_band, 1.1), alpha=0.3)

	    # Plot precision-recall curve
	    ax2.plot(recalls, ppvs, label='PRC')

	    # Configure plots
	    ax1.set_ylim(0.3, 1.1)
	    ax1.set_title('Predictive Value Curves')
	    ax1.set_xlabel('Alpha')
	    ax1.set_ylabel('Predictive Values')
	    ax1.legend()

	    ax2.set_ylim(0.3, 1.1)
	    ax2.set_title('Precision-Recall Curve')
	    ax2.set_xlabel('Recall')
	    ax2.set_ylabel('Precision')
	    ax2.legend()

	    return (ax1, ax2)

	def classification_curves(y_true: np.ndarray, probas_pred: np.ndarray) -> tuple:
	    '''Compute PPV, NPV, sensitivity and specificity for every top-k cutoff.

	    For each k in 1..m-1 the threshold (qfhat as in X) is the midpoint between
	    the k-th and (k+1)-th largest scores; samples scoring strictly above it are
	    predicted positive.

	    Args:
	        y_true: 1-D array-like of binary labels (0 = negative, 1 = positive).
	        probas_pred: 1-D array-like of scores, same length as y_true.
	            Assumed to contain no NaNs.

	    Returns:
	        Tuple (alphas, ppvs, npvs, sens, spec) of arrays of length m - 1, where
	        alphas[k-1] = k / m. Entries whose denominator is zero are NaN (NumPy
	        emits a RuntimeWarning for them).

	    Raises:
	        ValueError: if y_true contains labels other than 0 and 1.
	    '''
	    labels = np.asarray(y_true)
	    scores = np.asarray(probas_pred)
	    m = labels.size

	    if not np.isin(labels, (0, 1)).all():
	        raise ValueError('y_true must contain only the binary labels 0 and 1')

	    # Sort once; every cutoff is then answered from cumulative counts instead
	    # of building one confusion matrix per k.
	    order = np.argsort(scores, kind='stable')
	    sorted_scores = scores[order]

	    ks = np.arange(1, m)
	    thresholds = (sorted_scores[m - ks - 1] + sorted_scores[m - ks]) / 2

	    # Number of scores strictly greater than each threshold. With ties this can
	    # differ from k, exactly as `scores > threshold` would.
	    n_predicted_pos = m - np.searchsorted(sorted_scores, thresholds, side='right')

	    # cum_tp[j] = number of true positives among the j highest-scoring samples.
	    # A strict threshold never splits a group of tied scores, so the order
	    # inside a tie does not affect the counts that are looked up.
	    positives_desc = (labels[order] == 1)[::-1]
	    cum_tp = np.concatenate(([0], np.cumsum(positives_desc)))
	    total_pos = cum_tp[-1]

	    tp = cum_tp[n_predicted_pos]
	    fp = n_predicted_pos - tp
	    fn = total_pos - tp
	    tn = (m - total_pos) - fp

	    ppvs = tp / (tp + fp)
	    npvs = tn / (tn + fn)
	    sens = tp / (tp + fn)
	    spec = tn / (tn + fp)

	    return (ks / m, ppvs, npvs, sens, spec)