Binary Classification, a visual refresher
 venv pos.txt neg.txt plot.png

## Binary Classification: a visual refresher

Contents:

• A visual refresher of binary classification statistics: refresher.png.
• Python code to produce Gaussian random data: generate.py.
• Python code to produce typical plots: plot.py. This includes:
• probability density functions for pos and neg sets,
• cumulative probability distributions for pos and neg sets,
• ROC curve (FPR vs TPR),
• precision-recall curve (PPV vs TPR).

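Caution: everything in the refresher and in the code hardcodes the assumption that the statistic of interest takes lower values for negative samples than for positive ones, so a sample is classified as positive when its value lies above the threshold. If your statistic is oriented the other way, you need to update the way tpr and fpr are calculated from the cumulative distributions.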

Code usage:

(venv) $ pip install numpy matplotlib
(venv) $ python generate.py # produces pos.txt and neg.txt
(venv) $ python plot.py     # consumes pos.txt and neg.txt, produces plot.png
 #!/usr/bin/env python3
"""Draw Gaussian samples for the two classes and dump them to text files.

Running this script writes ``pos.txt`` and ``neg.txt`` to the current
working directory, one sample per line.
"""
import numpy as np

# Number of samples drawn for each class.
n_samples = {'pos': 10000, 'neg': 50000}

# [mean, standard deviation] of the normal distribution of each class.
moments = {
    'neg': [37, .5],
    'pos': [39, 1],
}

# Positive samples are drawn first, then negative ones.
pos_mean, pos_std = moments['pos']
pos = np.random.normal(pos_mean, pos_std, n_samples['pos'])

neg_mean, neg_std = moments['neg']
neg = np.random.normal(neg_mean, neg_std, n_samples['neg'])

# Values are separated by newlines; nothing is written after the last one.
for filename, samples in (('pos.txt', pos), ('neg.txt', neg)):
    with open(filename, 'w') as f:
        f.write('\n'.join(str(value) for value in samples))
 #!/usr/bin/env python3
"""Plot the usual binary-classification diagnostics for two sample sets.

Run as a script, this reads ``pos.txt`` and ``neg.txt`` from the current
directory and writes ``plot.png`` with four panels: probability densities,
cumulative distributions, the ROC curve and the precision-recall curve.

NOTE: everything below hardcodes the assumption that negative samples tend
to have lower values than positive ones, i.e. a sample is called positive
when its value lies above the threshold.
"""
import bisect

import numpy as np


def generate_bins(pos, neg, step):
    """Return equally spaced bin edges covering every value of both sets.

    The first edge is the overall minimum and the last edge is at or above
    the overall maximum, so no sample falls outside the histogram range.

    Raises:
        ValueError: if ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')
    lower = min(np.min(pos), np.min(neg))
    upper = max(np.max(pos), np.max(neg))
    # np.arange(lower, upper, step) stops short of `upper`, which makes
    # np.histogram silently drop the largest samples. Count the bins
    # explicitly instead, and always keep at least one bin so identical
    # inputs still produce a usable histogram.
    n_bins = max(int(np.ceil((upper - lower) / step)), 1)
    edges = lower + step * np.arange(n_bins + 1)
    # Guard against rounding leaving the last edge a hair below the maximum.
    edges[-1] = max(edges[-1], upper)
    return edges


def estimate_probability_densities(values, bins):
    """Return ``(bin_centers, densities)`` of a normalised histogram."""
    hist, bin_edges = np.histogram(values, bins=bins, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    return bin_centers, hist


def estimate_cumulative_probabilities(values, bins):
    """Return ``(bin_centers, cumulative)`` for ``values``.

    ``cumulative[k]`` is the probability mass accumulated up to the right
    edge of bin ``k``; the bin centers are returned as plotting positions.
    """
    densities, bin_edges = np.histogram(values, bins=bins, density=True)
    centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    # The values produced by np.histogram are pointwise probability density
    # estimates; multiply each by its own bin width before integrating.
    # Using the per-bin width also covers uneven bins and a single bin.
    cumulative_dist = np.cumsum(densities * np.diff(bin_edges))
    return centers, cumulative_dist


def _error_rates(pos, neg, bins):
    """Return ``(fpr, tpr)`` with one entry per bin of ``bins``."""
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)
    # Rounding in the cumulative sum can overshoot 1 by a few ulps; clip so
    # the rates stay inside [0, 1].
    fpr = np.clip(1 - tnr, 0, 1)
    tpr = np.clip(1 - fnr, 0, 1)
    return fpr, tpr


def _candidate_index(bins, candidate_threshold, n_points):
    """Return the index of the curve point matching the threshold.

    Point ``k`` of a cumulative curve corresponds to edge ``k + 1``, hence
    the ``- 1``. The result is clamped so that a threshold outside the
    binned range selects the nearest end point instead of raising.
    """
    idx = bisect.bisect_left(bins, candidate_threshold) - 1
    return min(max(idx, 0), n_points - 1)


def plot_densities(ax, pos, neg, bins, candidate_threshold):
    """Draw both probability densities and mark the candidate threshold."""
    pos_x, pos_y = estimate_probability_densities(pos, bins)
    neg_x, neg_y = estimate_probability_densities(neg, bins)
    ax.plot(pos_x, pos_y, color='g', label='pos')
    ax.plot(neg_x, neg_y, color='r', label='neg')
    # The marker spans the full height of the taller density curve.
    ax.vlines(candidate_threshold, 0, max(np.max(pos_y), np.max(neg_y)),
              color='b', alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_ylabel('probability density')
    ax.legend(loc='upper right', fontsize=10)


def plot_cumulatives(ax, pos, neg, bins, candidate_threshold):
    """Draw both cumulative distributions and mark the threshold."""
    ax.plot(*estimate_cumulative_probabilities(pos, bins),
            color='g', label='pos')
    ax.plot(*estimate_cumulative_probabilities(neg, bins),
            color='r', label='neg')
    ax.vlines(candidate_threshold, 0, 1,
              color='b', alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_ylabel('cumulative probability')
    ax.legend(loc='lower right', fontsize=10)


def plot_roc(ax, pos, neg, bins, candidate_threshold):
    """Draw the ROC curve (FPR on x, TPR on y) and the candidate point."""
    fpr, tpr = _error_rates(pos, neg, bins)
    ax.plot(fpr, tpr, color='k', label='ROC curve')
    # Thin diagonal: the curve of a classifier no better than chance.
    ax.plot(fpr, fpr, color='k', lw=.2)
    cand_bin_idx = _candidate_index(bins, candidate_threshold, len(fpr))
    ax.scatter([fpr[cand_bin_idx]], [tpr[cand_bin_idx]],
               c='b', s=50, alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend(loc='lower right', fontsize=10)


def plot_precision_recall(ax, pos, neg, bins, candidate_threshold):
    """Draw the precision-recall curve (PPV on x, TPR on y)."""
    fpr, tpr = _error_rates(pos, neg, bins)
    true_pos = tpr * len(pos)
    predicted_pos = true_pos + fpr * len(neg)
    # Precision is undefined where nothing is predicted positive; leave
    # those points as NaN instead of dividing zero by zero.
    ppv = np.full_like(predicted_pos, np.nan, dtype=float)
    np.divide(true_pos, predicted_pos, out=ppv, where=predicted_pos > 0)
    ax.plot(ppv, tpr, color='k', label='precision-recall curve')
    cand_bin_idx = _candidate_index(bins, candidate_threshold, len(tpr))
    ax.scatter([ppv[cand_bin_idx]], [tpr[cand_bin_idx]],
               c='b', s=50, alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_xlabel('PPV')
    ax.set_ylabel('TPR')
    ax.legend(loc='center left', fontsize=10)


def main():
    """Read ``pos.txt`` and ``neg.txt`` and write the figure ``plot.png``."""
    # Selecting the backend is a process-wide side effect, so matplotlib is
    # only imported and configured when the script is actually run;
    # importing this module for its helpers leaves the backend alone.
    import matplotlib
    matplotlib.use('agg')
    from matplotlib import pyplot as plt

    pos = np.loadtxt('pos.txt')
    neg = np.loadtxt('neg.txt')

    threshold = 37.7
    step = 1e-1
    bins = generate_bins(pos, neg, step)

    fig = plt.figure(figsize=(12, 8))
    panels = (plot_densities, plot_cumulatives,
              plot_roc, plot_precision_recall)
    for position, plot_panel in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, position)
        plot_panel(ax, pos, neg, bins, threshold)

    fig.tight_layout()
    fig.savefig('plot.png', dpi=180)


if __name__ == '__main__':
    main()