Skip to content

Instantly share code, notes, and snippets.

@amirkdv amirkdv/.gitignore
Last active May 7, 2020

Embed
What would you like to do?
Binary Classification, a visual refresher
venv
pos.txt
neg.txt
plot.png

Binary Classification: a visual refresher

Contents:

  • A visual refresher of binary classification statistics: refresher.png.
  • python code to produce Gaussian random data: generate.py.
  • python code to produce typical plots: plot.py. This includes:
    • probability density functions for pos and neg sets,
    • cumulative probability distributions for pos and neg sets,
    • ROC curve (FPR vs TPR),
    • precision-recall curve (PPV vs TPR).

Caution: both the refresher and the code hardcode the assumption that the statistic of interest takes lower values for negative samples than for positive ones, i.e. a sample is predicted positive when its value lies above the threshold. If your statistic behaves the other way around, you need to update the way tpr and fpr are calculated from the cumulative distributions.

Code usage:

(venv) $ pip install numpy matplotlib
(venv) $ python generate.py # produces pos.txt and neg.txt
(venv) $ python plot.py     # consumes pos.txt and neg.txt, produces plot.png
#!/usr/bin/env python3
import numpy as np
# Number of samples drawn for each class.
n_samples = {'pos': 10000, 'neg': 50000}
# [loc, scale] passed to np.random.normal for each class.
moments = {'neg': [37, .5], 'pos': [39, 1]}

# Draw the positive set first, then the negative set.
samples = {}
for label in ('pos', 'neg'):
    center, spread = moments[label]
    samples[label] = np.random.normal(loc=center, scale=spread,
                                      size=n_samples[label])
pos, neg = samples['pos'], samples['neg']

# One value per line, no trailing newline.
for path, drawn in (('pos.txt', pos), ('neg.txt', neg)):
    with open(path, 'w') as f:
        f.write('\n'.join(str(value) for value in drawn))
#!/usr/bin/env python3
import bisect
import matplotlib
matplotlib.use('agg')
import numpy as np
from matplotlib import pyplot as plt
def generate_bins(pos, neg, step):
    """Return evenly spaced histogram bin edges covering both sample sets.

    The edges start at the smallest value found in ``pos`` or ``neg`` and
    advance by ``step`` until the largest value is covered, so that no sample
    falls outside the binned range.

    Args:
        pos: values of the positive samples.
        neg: values of the negative samples.
        step: width of every bin.

    Returns:
        1-D array of bin edges; always at least two edges (one bin).
    """
    lower = min(np.min(pos), np.min(neg))
    upper = max(np.max(pos), np.max(neg))
    # np.arange(lower, upper, step) stops short of ``upper``, which makes
    # np.histogram drop the largest samples. Count the bins needed to reach
    # ``upper`` instead, and keep at least one bin when all values coincide.
    n_bins = max(1, int(np.ceil((upper - lower) / step)))
    edges = lower + step * np.arange(n_bins + 1)
    # Guard against floating point rounding leaving the last edge a hair
    # below the largest sample.
    edges[-1] = max(edges[-1], upper)
    return edges
def estimate_probability_densities(values, bins):
    """Estimate the probability density of ``values`` with a histogram.

    Args:
        values: the samples to bin.
        bins: bin specification forwarded to ``np.histogram``.

    Returns:
        (centers, densities): the midpoint of every bin and the density
        estimate ``np.histogram(..., density=True)`` reports for it.
    """
    densities, edges = np.histogram(values, bins=bins, density=True)
    left_edges, right_edges = edges[:-1], edges[1:]
    midpoints = (left_edges + right_edges) / 2
    return midpoints, densities
def estimate_cumulative_probabilities(values, bins):
    """Estimate the cumulative distribution of ``values`` over ``bins``.

    Args:
        values: the samples to bin.
        bins: bin specification forwarded to ``np.histogram``.

    Returns:
        (centers, cumulative): the midpoint of every bin and, for each bin,
        the fraction of the binned samples that fall in that bin or in any
        bin before it.
    """
    # The histogram is computed here directly because the bin edges are
    # needed to obtain each bin's own width.
    densities, edges = np.histogram(values, bins=bins, density=True)
    centers = (edges[:-1] + edges[1:]) / 2
    # The values produced by np.histogram are pointwise probability density
    # estimates; each one has to be multiplied by its bin width when
    # integrating. Using per-bin widths (rather than a single width read off
    # the first two bins) keeps this correct for non-uniform bins and for a
    # histogram with a single bin.
    widths = np.diff(edges)
    cumulative_dist = np.cumsum(densities * widths)
    return centers, cumulative_dist
def plot_densities(ax, pos, neg, bins, candidate_threshold):
    """Draw the estimated probability densities of both sample sets.

    The positive set is drawn in green, the negative set in red, and the
    candidate threshold as a vertical blue line reaching up to the highest
    density value of either curve.

    Args:
        ax: matplotlib axes to draw on.
        pos: values of the positive samples.
        neg: values of the negative samples.
        bins: histogram bins shared by both sets.
        candidate_threshold: x position of the threshold marker.
    """
    peaks = []
    for samples, color, label in ((pos, 'g', 'pos'), (neg, 'r', 'neg')):
        centers, density = estimate_probability_densities(samples, bins)
        ax.plot(centers, density, color=color, label=label)
        peaks.append(np.max(density))
    ax.vlines(candidate_threshold, 0, max(peaks),
              color='b', alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_ylabel('probability density')
    ax.legend(loc='upper right', fontsize=10)
def plot_cumulatives(ax, pos, neg, bins, candidate_threshold):
    """Draw the estimated cumulative distributions of both sample sets.

    The positive set is drawn in green, the negative set in red, and the
    candidate threshold as a vertical blue line spanning 0 to 1.

    Args:
        ax: matplotlib axes to draw on.
        pos: values of the positive samples.
        neg: values of the negative samples.
        bins: histogram bins shared by both sets.
        candidate_threshold: x position of the threshold marker.
    """
    for samples, color, label in ((pos, 'g', 'pos'), (neg, 'r', 'neg')):
        centers, cumulative = estimate_cumulative_probabilities(samples, bins)
        ax.plot(centers, cumulative, color=color, label=label)
    ax.vlines(candidate_threshold, 0, 1, color='b', alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_ylabel('cumulative probability')
    ax.legend(loc='lower right', fontsize=10)
def plot_roc(ax, pos, neg, bins, candidate_threshold):
    """Draw the ROC curve (TPR against FPR) and mark the candidate threshold.

    The rates are derived from the cumulative distributions under the
    convention that a sample is predicted positive when its value lies above
    the threshold: the cumulative distribution of the negatives is the TNR
    and that of the positives is the FNR.

    Args:
        ax: matplotlib axes to draw on.
        pos: values of the positive samples.
        neg: values of the negative samples.
        bins: sorted sequence of histogram bin edges shared by both sets.
        candidate_threshold: threshold whose (FPR, TPR) point is highlighted.
    """
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)
    fpr = 1 - tnr
    tpr = 1 - fnr
    ax.plot(fpr, tpr, color='k', label='ROC curve')
    # Diagonal reference line (TPR == FPR).
    ax.plot(fpr, fpr, color='k', lw=.2)
    # Entry k of the cumulative arrays already accounts for everything up to
    # bin edge k + 1, so the entry matching the threshold is the one just
    # before the first edge that is >= threshold. Clamp the index so that a
    # threshold outside the binned range maps to the nearest end of the curve
    # instead of raising IndexError or wrapping around.
    first_edge_at_or_above = bisect.bisect_left(bins, candidate_threshold)
    cand_bin_idx = min(max(first_edge_at_or_above - 1, 0), len(fpr) - 1)
    ax.scatter([fpr[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend(loc='lower right', fontsize=10)
def plot_precision_recall(ax, pos, neg, bins, candidate_threshold):
    """Draw the precision-recall curve and mark the candidate threshold.

    Precision (PPV) is placed on the x axis and recall (TPR) on the y axis.
    The rates are derived from the cumulative distributions under the
    convention that a sample is predicted positive when its value lies above
    the threshold.

    Args:
        ax: matplotlib axes to draw on.
        pos: values of the positive samples.
        neg: values of the negative samples.
        bins: sorted sequence of histogram bin edges shared by both sets.
        candidate_threshold: threshold whose (PPV, TPR) point is highlighted.
    """
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)
    tpr = 1 - fnr
    fpr = 1 - tnr
    # Scale the rates by the class sizes to get expected counts.
    true_positives = tpr * len(pos)
    predicted_positives = true_positives + fpr * len(neg)
    # PPV is undefined where nothing is predicted positive (typically the
    # last bins, where both rates reach zero). Leave those entries as NaN
    # rather than evaluating 0 / 0.
    ppv = np.full_like(predicted_positives, np.nan, dtype=float)
    np.divide(true_positives, predicted_positives, out=ppv,
              where=predicted_positives > 0)
    ax.plot(ppv, tpr, color='k', label='precision-recall curve')
    # Entry k of the cumulative arrays already accounts for everything up to
    # bin edge k + 1, so the entry matching the threshold is the one just
    # before the first edge that is >= threshold. Clamp the index so that a
    # threshold outside the binned range maps to the nearest end of the curve
    # instead of raising IndexError or wrapping around.
    first_edge_at_or_above = bisect.bisect_left(bins, candidate_threshold)
    cand_bin_idx = min(max(first_edge_at_or_above - 1, 0), len(tpr) - 1)
    ax.scatter([ppv[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_xlabel('PPV')
    ax.set_ylabel('TPR')
    ax.legend(loc='center left', fontsize=10)
if __name__ == '__main__':
    # Load the two sample sets from the text files in the working directory.
    pos = np.loadtxt('pos.txt')
    neg = np.loadtxt('neg.txt')
    threshold = 37.7
    step = 0.1
    bins = generate_bins(pos, neg, step)

    fig = plt.figure(figsize=(12, 8))
    # The four panels fill a 2x2 grid in reading order.
    panels = (plot_densities, plot_cumulatives, plot_roc, plot_precision_recall)
    for slot, draw_panel in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, slot)
        draw_panel(ax, pos, neg, bins, threshold)

    fig.tight_layout()
    fig.savefig('plot.png', dpi=180)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.