Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@amirkdv
Last active May 7, 2020 21:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amirkdv/ec50cb4254bcf17a25f13f4e8bdb8ee2 to your computer and use it in GitHub Desktop.
Save amirkdv/ec50cb4254bcf17a25f13f4e8bdb8ee2 to your computer and use it in GitHub Desktop.
Binary Classification, a visual refresher
venv
pos.txt
neg.txt
plot.png

Binary Classification: a visual refresher

Contents:

  • A visual refresher of binary classification statistics: refresher.png.
  • Python code to produce Gaussian random data: generate.py.
  • Python code to produce typical plots: plot.py. This includes:
    • probability density functions for pos and neg sets,
    • cumulative probability distributions for pos and neg sets,
    • ROC curve (FPR on the x-axis, TPR on the y-axis),
    • precision-recall curve (PPV on the x-axis, TPR on the y-axis).

Caution: both the refresher and the code hard-code the assumption that negative samples have lower values of the statistic of interest than positive samples, i.e. a sample is classified as positive when its value exceeds the threshold. If your statistic behaves the other way around, you need to update the way tpr and fpr are calculated from the cumulative distributions.

Code usage:

(venv) $ pip install numpy matplotlib
(venv) $ python generate.py # produces pos.txt and neg.txt
(venv) $ python plot.py     # consumes pos.txt and neg.txt, produces plot.png
#!/usr/bin/env python3
import numpy as np
# Number of samples to draw for each class.
n_samples = dict(pos=10000, neg=50000)

# [loc, scale] passed to np.random.normal for each class, i.e. the mean and
# standard deviation of the Gaussian the class is drawn from.
moments = dict(neg=[37, .5], pos=[39, 1])

# Draw the positive set first, then the negative set.
pos_loc, pos_scale = moments['pos']
neg_loc, neg_scale = moments['neg']
pos = np.random.normal(pos_loc, pos_scale, n_samples['pos'])
neg = np.random.normal(neg_loc, neg_scale, n_samples['neg'])

# One value per line, no trailing newline.
for path, sample in (('pos.txt', pos), ('neg.txt', neg)):
    with open(path, 'w') as f:
        f.write('\n'.join(str(value) for value in sample))
#!/usr/bin/env python3
import bisect
import matplotlib
matplotlib.use('agg')
import numpy as np
from matplotlib import pyplot as plt
def generate_bins(pos, neg, step):
    """Return equal-width histogram bin edges covering both sample sets.

    The first edge is the smallest value found in ``pos`` or ``neg``; further
    edges follow every ``step`` until the largest value is covered, so the
    last edge is always >= the overall maximum and no sample falls outside
    the bins. At least two edges (one bin) are always returned.

    Args:
        pos: array-like of values for the positive set.
        neg: array-like of values for the negative set.
        step: width of every bin; must be positive.

    Returns:
        1-D numpy array of monotonically increasing bin edges.

    Raises:
        ValueError: if ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')
    lower = min(np.min(pos), np.min(neg))
    upper = max(np.max(pos), np.max(neg))
    # np.arange(lower, upper, step) stops *before* upper, which would leave
    # the largest samples outside every bin. Count the bins needed to reach
    # upper instead, and always keep at least one bin.
    n_bins = max(int(np.ceil((upper - lower) / step)), 1)
    edges = lower + step * np.arange(n_bins + 1)
    # Guard against floating point rounding leaving the last edge a hair
    # short of the maximum.
    if edges[-1] < upper:
        edges = np.append(edges, edges[-1] + step)
    return edges
def estimate_probability_densities(values, bins):
    """Estimate a probability density from a histogram of ``values``.

    Args:
        values: array-like of samples.
        bins: bin specification forwarded to ``np.histogram``.

    Returns:
        Tuple ``(midpoints, densities)``: the midpoint of every bin and the
        density estimate ``np.histogram(..., density=True)`` gives for it.
    """
    densities, edges = np.histogram(values, bins=bins, density=True)
    # Each bin is represented by the midpoint of its two edges.
    left_edges, right_edges = edges[:-1], edges[1:]
    midpoints = (left_edges + right_edges) / 2
    return midpoints, densities
def estimate_cumulative_probabilities(values, bins):
    """Estimate the cumulative distribution of ``values`` from a histogram.

    Args:
        values: array-like of samples.
        bins: bin specification forwarded to ``np.histogram``; bins may have
            unequal widths and there may be only a single bin.

    Returns:
        Tuple ``(bin_centers, cumulative_dist)``. ``cumulative_dist[i]`` is
        the estimated probability mass accumulated over bins ``0..i``, i.e.
        up to the right edge of bin ``i``; ``bin_centers[i]`` is the midpoint
        of that bin.
    """
    densities, bin_edges = np.histogram(values, bins=bins, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    # The values produced by np.histogram are pointwise probability density
    # estimates. Multiplying each one by the width of its own bin turns it
    # into the probability mass of that bin; using per-bin widths (rather
    # than the distance between the first two centers) stays correct for
    # non-uniform bins and does not fail when there is only one bin.
    bin_masses = densities * np.diff(bin_edges)
    cumulative_dist = np.cumsum(bin_masses)
    return bin_centers, cumulative_dist
def plot_densities(ax, pos, neg, bins, candidate_threshold):
    """Draw the estimated probability densities of both sets on ``ax``.

    The positive set is drawn in green, the negative set in red, and the
    candidate threshold as a translucent blue vertical line reaching from
    zero up to the taller of the two density peaks.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set.
        neg: values of the negative set.
        bins: histogram bins used for the density estimates.
        candidate_threshold: x position of the candidate threshold marker.
    """
    peaks = []
    for label, sample, colour in (('pos', pos, 'g'), ('neg', neg, 'r')):
        centers, density = estimate_probability_densities(sample, bins)
        ax.plot(centers, density, color=colour, label=label)
        peaks.append(np.max(density))
    ax.vlines(candidate_threshold, 0, max(peaks),
              color='b', alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_ylabel('probability density')
    ax.legend(loc='upper right', fontsize=10)
def plot_cumulatives(ax, pos, neg, bins, candidate_threshold):
    """Draw the estimated cumulative distributions of both sets on ``ax``.

    The positive set is drawn in green, the negative set in red, and the
    candidate threshold as a translucent blue vertical line from 0 to 1.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set.
        neg: values of the negative set.
        bins: histogram bins used for the cumulative estimates.
        candidate_threshold: x position of the candidate threshold marker.
    """
    for label, sample, colour in (('pos', pos, 'g'), ('neg', neg, 'r')):
        centers, cumulative = estimate_cumulative_probabilities(sample, bins)
        ax.plot(centers, cumulative, color=colour, label=label)
    ax.vlines(candidate_threshold, 0, 1, color='b', alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_ylabel('cumulative probability')
    ax.legend(loc='lower right', fontsize=10)
def plot_roc(ax, pos, neg, bins, candidate_threshold):
    """Draw the ROC curve (FPR on x, TPR on y) and mark the candidate threshold.

    Hardcodes the assumption that samples above the threshold are classified
    as positive: the cumulative distribution of the negative set is taken as
    the true negative rate and that of the positive set as the false negative
    rate.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set.
        neg: values of the negative set.
        bins: sorted histogram bin edges shared by both sets.
        candidate_threshold: threshold whose (FPR, TPR) point is highlighted.
    """
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)
    fpr = 1 - tnr
    tpr = 1 - fnr
    ax.plot(fpr, tpr, color='k', label='ROC curve')
    # chance-level diagonal for reference
    ax.plot(fpr, fpr, color='k', lw=.2)
    # bisect_left returns the index of the first bin *edge* at or above the
    # threshold, while fpr/tpr hold one value per *bin*: entry i is the rate
    # at the right edge of bin i, i.e. at edge i + 1. The entry for edge
    # ``edge_idx`` is therefore ``edge_idx - 1``. Clamp so that a threshold
    # outside the binned range marks the nearest end of the curve instead of
    # raising IndexError.
    edge_idx = bisect.bisect_left(bins, candidate_threshold)
    cand_bin_idx = min(max(edge_idx - 1, 0), len(fpr) - 1)
    ax.scatter([fpr[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend(loc='lower right', fontsize=10)
def plot_precision_recall(ax, pos, neg, bins, candidate_threshold):
    """Draw the precision-recall curve (PPV on x, TPR on y) with the candidate.

    Hardcodes the assumption that samples above the threshold are classified
    as positive: the cumulative distribution of the negative set is taken as
    the true negative rate and that of the positive set as the false negative
    rate.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set.
        neg: values of the negative set.
        bins: sorted histogram bin edges shared by both sets.
        candidate_threshold: threshold whose (PPV, TPR) point is highlighted.
    """
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)
    tpr = 1 - fnr
    fpr = 1 - tnr
    # PPV = TP / (TP + FP). Once the threshold passes every sample nothing is
    # classified as positive and the ratio is 0/0; leave those points as NaN
    # instead of dividing by zero.
    true_positives = tpr * len(pos)
    predicted_positives = true_positives + fpr * len(neg)
    ppv = np.full(predicted_positives.shape, np.nan)
    np.divide(true_positives, predicted_positives, out=ppv,
              where=predicted_positives > 0)
    ax.plot(ppv, tpr, color='k', label='precision-recall curve')
    # bisect_left returns the index of the first bin *edge* at or above the
    # threshold, while ppv/tpr hold one value per *bin*: entry i is the rate
    # at the right edge of bin i, i.e. at edge i + 1. The entry for edge
    # ``edge_idx`` is therefore ``edge_idx - 1``. Clamp so that a threshold
    # outside the binned range marks the nearest end of the curve instead of
    # raising IndexError.
    edge_idx = bisect.bisect_left(bins, candidate_threshold)
    cand_bin_idx = min(max(edge_idx - 1, 0), len(tpr) - 1)
    ax.scatter([ppv[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')
    ax.grid(True)
    ax.set_xlabel('PPV')
    ax.set_ylabel('TPR')
    ax.legend(loc='center left', fontsize=10)
if __name__ == '__main__':
    # Load both sample sets from the working directory.
    pos, neg = (np.loadtxt(path) for path in ('pos.txt', 'neg.txt'))

    threshold = 37.7
    step = 1e-1
    bins = generate_bins(pos, neg, step)

    fig = plt.figure(figsize=(12, 8))
    # One panel per plot, filling the 2x2 grid in subplot order 1..4.
    panels = (plot_densities, plot_cumulatives, plot_roc, plot_precision_recall)
    for slot, draw_panel in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, slot)
        draw_panel(ax, pos, neg, bins, threshold)

    fig.tight_layout()
    fig.savefig('plot.png', dpi=180)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment