amirkdv/.gitignore

## .gitignore
venv
pos.txt
neg.txt
plot.png

## 0_notes.md

      
    Raw
  

              0_notes.md
            
          
    Binary Classification: a visual refresher

Contents:

A visual refresher of binary classification statistics: refresher.png.
Python code to produce Gaussian random data: generate.py.
Python code to produce typical plots: plot.py. This includes:

probability density functions for pos and neg sets,
cumulative probability distributions for pos and neg sets,
ROC curve (FPR vs TPR),
precision-recall curve (PPV vs TPR).


Caution: both the refresher and the code hardcode the assumption that the
statistic of interest takes lower values for negative samples than for positive
ones. If your statistic behaves the other way around, you need to update the
way tpr and fpr are calculated from the cumulative distributions.
Code usage:
(venv) $ pip install numpy matplotlib
(venv) $ python generate.py # produces pos.txt and neg.txt
(venv) $ python plot.py     # consumes pos.txt and neg.txt, produces plot.png

  
## generate.py
#!/usr/bin/env python3
import numpy as np

# Number of samples to draw for each class.
n_samples = {'pos': 10000, 'neg': 50000}

# Gaussian parameters for each class, stored as [mean, standard deviation].
moments = {'neg': [37, .5], 'pos': [39, 1]}

# Draw the positive set first, then the negative set.
pos_mean, pos_std = moments['pos']
pos = np.random.normal(loc=pos_mean, scale=pos_std, size=n_samples['pos'])

neg_mean, neg_std = moments['neg']
neg = np.random.normal(loc=neg_mean, scale=neg_std, size=n_samples['neg'])

# Dump each set to its own text file, one value per line (no trailing newline).
for path, samples in (('pos.txt', pos), ('neg.txt', neg)):
    with open(path, 'w') as f:
        f.write('\n'.join(str(value) for value in samples))

## plot.py
#!/usr/bin/env python3
import bisect
import matplotlib
matplotlib.use('agg')

import numpy as np
from matplotlib import pyplot as plt


def generate_bins(pos, neg, step):
    """Return equally spaced bin edges covering every value in both sets.

    Args:
        pos: array-like of values for the positive set.
        neg: array-like of values for the negative set.
        step: width of each bin; must be positive.

    Returns:
        A 1-D array of bin edges that starts at the smallest value found in
        either set and whose last edge is at or beyond the largest value.

    Raises:
        ValueError: if ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')

    lower = min(np.min(pos), np.min(neg))
    upper = max(np.max(pos), np.max(neg))

    # np.arange(lower, upper, step) stops *before* upper, which leaves the
    # largest samples outside the last bin where np.histogram silently drops
    # them. Build the edges from a bin count instead so the range is covered;
    # always keep at least one bin so identical inputs still yield valid edges.
    n_bins = max(int(np.ceil((upper - lower) / step)), 1)
    edges = lower + step * np.arange(n_bins + 1)

    # Guard against floating point rounding leaving the last edge a hair
    # below the maximum.
    if edges[-1] < upper:
        edges = np.append(edges, edges[-1] + step)
    return edges


def estimate_probability_densities(values, bins):
    """Estimate the probability density of ``values`` with a histogram.

    Args:
        values: array-like of samples.
        bins: bin specification forwarded to ``np.histogram``.

    Returns:
        A ``(bin_centers, densities)`` pair: the midpoint of every bin and the
        density-normalized histogram height of that bin.
    """
    densities, edges = np.histogram(values, bins=bins, density=True)
    left_edges, right_edges = edges[:-1], edges[1:]
    midpoints = 0.5 * (left_edges + right_edges)
    return midpoints, densities


def estimate_cumulative_probabilities(values, bins):
    """Estimate the cumulative distribution of ``values`` with a histogram.

    Args:
        values: array-like of samples.
        bins: bin specification forwarded to ``np.histogram``.

    Returns:
        A ``(bin_centers, cumulative_dist)`` pair. ``cumulative_dist[i]`` is
        the fraction of the binned samples that fall in bins ``0..i``.
    """
    # The histogram is taken here directly because the bin edges are needed
    # to obtain the true width of every bin.
    densities, edges = np.histogram(values, bins=bins, density=True)
    centers = (edges[:-1] + edges[1:]) / 2

    # The values produced by np.histogram are pointwise probability density
    # estimates. Each one is multiplied by the width of its own bin before
    # integrating, so the result is correct for non-uniform bins and does not
    # fail when there is only a single bin.
    cumulative_dist = np.cumsum(densities * np.diff(edges))

    return centers, cumulative_dist


def plot_densities(ax, pos, neg, bins, candidate_threshold):
    """Draw the estimated density of both sets plus the candidate threshold.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set (drawn in green).
        neg: values of the negative set (drawn in red).
        bins: bin specification passed to the density estimator.
        candidate_threshold: x position of the vertical marker line.
    """
    curves = [
        (estimate_probability_densities(samples, bins), color, label)
        for samples, color, label in ((pos, 'g', 'pos'), (neg, 'r', 'neg'))
    ]
    for (centers, density), color, label in curves:
        ax.plot(centers, density, color=color, label=label)

    # The threshold marker spans from zero up to the tallest density peak.
    tallest = max(np.max(density) for (_, density), _, _ in curves)
    ax.vlines(candidate_threshold, 0, tallest,
              color='b', alpha=.5, label='candidate')

    ax.grid(True)
    ax.set_ylabel('probability density')
    ax.legend(loc='upper right', fontsize=10)


def plot_cumulatives(ax, pos, neg, bins, candidate_threshold):
    """Draw the cumulative distribution of both sets plus the threshold.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set (drawn in green).
        neg: values of the negative set (drawn in red).
        bins: bin specification passed to the cumulative estimator.
        candidate_threshold: x position of the vertical marker line.
    """
    for samples, color, label in ((pos, 'g', 'pos'), (neg, 'r', 'neg')):
        centers, cumulative = estimate_cumulative_probabilities(samples, bins)
        ax.plot(centers, cumulative, color=color, label=label)

    # Cumulative probabilities live in [0, 1], so the marker spans that range.
    ax.vlines(candidate_threshold, 0, 1, color='b', alpha=.5, label='candidate')

    ax.grid(True)
    ax.set_ylabel('cumulative probability')
    ax.legend(loc='lower right', fontsize=10)


def plot_roc(ax, pos, neg, bins, candidate_threshold):
    """Draw the ROC curve (FPR vs TPR) and mark the candidate threshold.

    The rates are derived from the cumulative distributions under the
    assumption that negative samples have lower values than positive ones:
    the cumulative distribution of ``neg`` is used as the true negative rate
    and that of ``pos`` as the false negative rate.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set.
        neg: values of the negative set.
        bins: sorted sequence of bin edges.
        candidate_threshold: threshold whose operating point is highlighted.
    """
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)

    fpr = 1 - tnr
    tpr = 1 - fnr

    ax.plot(fpr, tpr, color='k', label='ROC curve')
    # Thin diagonal: the line where TPR equals FPR.
    ax.plot(fpr, fpr, color='k', lw=.2)

    # bisect_left returns the index of the first edge >= threshold, i.e. the
    # *right* edge of the bin containing the threshold; that bin's own index
    # is one less. Clamp so that a threshold outside the binned range marks
    # the nearest end of the curve instead of raising IndexError.
    cand_bin_idx = bisect.bisect_left(bins, candidate_threshold) - 1
    cand_bin_idx = min(max(cand_bin_idx, 0), len(fpr) - 1)
    ax.scatter([fpr[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')

    ax.grid(True)
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend(loc='lower right', fontsize=10)


def plot_precision_recall(ax, pos, neg, bins, candidate_threshold):
    """Draw the precision-recall curve (PPV vs TPR) and mark the threshold.

    The rates are derived from the cumulative distributions under the
    assumption that negative samples have lower values than positive ones.
    Precision is computed from the rates weighted by the size of each set.

    Args:
        ax: axes object to draw on.
        pos: values of the positive set.
        neg: values of the negative set.
        bins: sorted sequence of bin edges.
        candidate_threshold: threshold whose operating point is highlighted.
    """
    _, tnr = estimate_cumulative_probabilities(neg, bins)
    _, fnr = estimate_cumulative_probabilities(pos, bins)

    tpr = 1 - fnr
    fpr = 1 - tnr

    true_pos = tpr * len(pos)
    called_pos = true_pos + fpr * len(neg)
    # At the high end of the range nothing is classified as positive, which
    # makes precision 0/0. Leave those points as NaN rather than dividing
    # (and emitting a runtime warning).
    ppv = np.full(called_pos.shape, np.nan)
    np.divide(true_pos, called_pos, out=ppv, where=called_pos > 0)
    ax.plot(ppv, tpr, color='k', label='precision-recall curve')

    # bisect_left returns the index of the first edge >= threshold, i.e. the
    # *right* edge of the bin containing the threshold; that bin's own index
    # is one less. Clamp so that a threshold outside the binned range marks
    # the nearest end of the curve instead of raising IndexError.
    cand_bin_idx = bisect.bisect_left(bins, candidate_threshold) - 1
    cand_bin_idx = min(max(cand_bin_idx, 0), len(tpr) - 1)
    ax.scatter([ppv[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')

    ax.grid(True)
    ax.set_xlabel('PPV')
    ax.set_ylabel('TPR')
    ax.legend(loc='center left', fontsize=10)


if __name__ == '__main__':
    # Candidate decision threshold and histogram bin width.
    threshold = 37.7
    step = 1e-1

    pos = np.loadtxt('pos.txt')
    neg = np.loadtxt('neg.txt')
    bins = generate_bins(pos, neg, step)

    fig = plt.figure(figsize=(12, 8))

    # One panel per plot, filling a 2x2 grid in row-major order.
    panels = (plot_densities, plot_cumulatives, plot_roc, plot_precision_recall)
    for position, draw_panel in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, position)
        draw_panel(ax, pos, neg, bins, threshold)

    fig.tight_layout()
    fig.savefig('plot.png', dpi=180)

## refresher.png

      
    Raw
  

              refresher.png
	#!/usr/bin/env python3
	import numpy as np

	n_samples = {
	'pos': 10000,
	'neg': 50000
	}
	moments = {
	'neg': [37, .5],
	'pos': [39, 1],
	}

	pos = np.random.normal(loc=moments['pos'][0], scale=moments['pos'][1], size=n_samples['pos'])
	neg = np.random.normal(loc=moments['neg'][0], scale=moments['neg'][1], size=n_samples['neg'])

	with open('pos.txt', 'w') as f:
	f.write('\n'.join(map(str, pos)))

	with open('neg.txt', 'w') as f:
	f.write('\n'.join(map(str, neg)))
	#!/usr/bin/env python3
	import bisect
	import matplotlib
	matplotlib.use('agg')

	import numpy as np
	from matplotlib import pyplot as plt


	def generate_bins(pos, neg, step):
	lower = min(np.min(pos), np.min(neg))
	upper = max(np.max(pos), np.max(neg))
	return np.arange(lower, upper, step)


	def estimate_probability_densities(values, bins):
	hist, bin_edges = np.histogram(values, bins=bins, density=True)
	bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
	return bin_centers, hist


	def estimate_cumulative_probabilities(values, bins):
	centers, densities = estimate_probability_densities(values, bins)
	cumulative_hist = np.cumsum(densities)
	# the values produced by np.histogram are pointwise probability density
	# estimates. We need to multiply them by bin width when integrating to get
	# cumulative distribution
	# NOTE hardcodes assumption that bin widths are constant
	step = centers[1] - centers[0]
	cumulative_dist = cumulative_hist * step

	return centers, cumulative_dist


	def plot_densities(ax, pos, neg, bins, candidate_threshold):
	pos_x, pos_y = estimate_probability_densities(pos, bins)
	neg_x, neg_y = estimate_probability_densities(neg, bins)

	ax.plot(pos_x, pos_y, color='g', label='pos')
	ax.plot(neg_x, neg_y, color='r', label='neg')

	ax.vlines(candidate_threshold, 0, max(np.max(pos_y), np.max(neg_y)),
	color='b', alpha=.5, label='candidate')

	ax.grid(True)
	ax.set_ylabel('probability density')
	ax.legend(loc='upper right', fontsize=10)


	def plot_cumulatives(ax, pos, neg, bins, candidate_threshold):
	ax.plot(*estimate_cumulative_probabilities(pos, bins), color='g', label='pos')
	ax.plot(*estimate_cumulative_probabilities(neg, bins), color='r', label='neg')

	ax.vlines(candidate_threshold, 0, 1, color='b', alpha=.5, label='candidate')

	ax.grid(True)
	ax.set_ylabel('cumulative probability')
	ax.legend(loc='lower right', fontsize=10)


	def plot_roc(ax, pos, neg, bins, candidate_threshold):
	_, tnr = estimate_cumulative_probabilities(neg, bins)
	_, fnr = estimate_cumulative_probabilities(pos, bins)

	fpr = 1 - tnr
	tpr = 1 - fnr

	ax.plot(fpr, tpr, color='k', label='ROC curve')
	ax.plot(fpr, fpr, color='k', lw=.2)

	cand_bin_idx = bisect.bisect_left(bins, candidate_threshold)
	ax.scatter([fpr[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')

	ax.grid(True)
	ax.set_xlabel('FPR')
	ax.set_ylabel('TPR')
	ax.legend(loc='lower right', fontsize=10)


	def plot_precision_recall(ax, pos, neg, bins, candidate_threshold):
	_, tnr = estimate_cumulative_probabilities(neg, bins)
	_, fnr = estimate_cumulative_probabilities(pos, bins)

	tpr = 1 - fnr
	fpr = 1 - tnr
	ppv = tpr * len(pos) / (tpr * len(pos) + fpr * len(neg))
	ax.plot(ppv, tpr, color='k', label='precision-recall curve')

	cand_bin_idx = bisect.bisect_left(bins, candidate_threshold)
	ax.scatter([ppv[cand_bin_idx]], [tpr[cand_bin_idx]], c='b', s=50, alpha=.5, label='candidate')

	ax.grid(True)
	ax.set_xlabel('PPV')
	ax.set_ylabel('TPR')
	ax.legend(loc='center left', fontsize=10)


	if __name__ == '__main__':
	pos = np.loadtxt('pos.txt')
	neg = np.loadtxt('neg.txt')
	threshold = 37.7
	step = 1e-1

	bins = generate_bins(pos, neg, step)

	fig = plt.figure(figsize=(12, 8))

	ax = fig.add_subplot(2, 2, 1)
	plot_densities(ax, pos, neg, bins, threshold)

	ax = fig.add_subplot(2, 2, 2)
	plot_cumulatives(ax, pos, neg, bins, threshold)

	ax = fig.add_subplot(2, 2, 3)
	plot_roc(ax, pos, neg, bins, threshold)

	ax = fig.add_subplot(2, 2, 4)
	plot_precision_recall(ax, pos, neg, bins, threshold)

	fig.tight_layout()
	fig.savefig('plot.png', dpi=180)