Fancy correlation plots with some sort of outlier detection.
# Released under The MIT License (MIT)
# http://opensource.org/licenses/MIT
# Copyright (c) 2015 Martin Billinger

import numpy as np
import scipy as sp
import scipy.stats
import matplotlib
import matplotlib.pyplot
import matplotlib.gridspec
from matplotlib.colors import LinearSegmentedColormap


def plot_correlation(x, y, measure=sp.stats.pearsonr, alpha=0.05, n_reps=10000, fig=None):
    """This function shows useful information about the correlation of two variables.

    1. A scatter plot with Pearson's correlation coefficient r and p-value p.
       Points in the scatter plot are colored by "sample impact", which is the
       amount the correlation changes when a particular sample is removed
       (details below).
    2. Bootstrap distribution of r with confidence interval.
    3. Distribution of r under the null hypothesis (no correlation), obtained
       by random permutation of the variables. A confidence interval of the
       p-value is obtained from this distribution.

    Note that the correlation can be very sensitive to outliers. In the example
    an outlier is added to 14 samples from two uncorrelated variables. This
    outlier leads to a significant correlation (r=0.73, p=0.002). The
    permutation test does not help, while bootstrapping does a little better:
    the bootstrap confidence interval of r overlaps 0 and the distribution is
    clearly bimodal. The modes correspond to bootstrap resamples that include
    or exclude the outlier.

    The "sample impact" metric uses this information to quantify how much
    impact each sample has on the resulting correlation. For each sample, the
    average correlation is computed over all bootstrap resamples that include
    the sample, and again over all resamples that exclude it. The sample
    impact is the difference between these two averages.

    The sample impact is useful for outlier detection, since outliers often
    have a high impact. However, not every high-impact sample is an outlier.
    Especially with small sample sizes, most individual samples have a high
    impact on the correlation. This highlights the problem of interpreting
    correlations computed from small samples. (The example at the bottom of
    this file shows one way to flag high-impact samples.)
    """
    if fig is None:
        fig = matplotlib.pyplot.figure()
    elif not isinstance(fig, matplotlib.figure.Figure):
        fig = matplotlib.pyplot.figure(**fig)

    # observed correlation and parametric p-value
    r0, p0 = measure(x, y)

    # permutation test: p-value with a binomial confidence interval
    r_perm = permutationcorrelation(x, y, n_reps=n_reps, measure=lambda a, b: measure(a, b)[0])
    p = np.sum(np.abs(r_perm) >= np.abs(r0)) / n_reps
    kcl, kcu = sp.stats.binom.interval(1 - alpha/2, n_reps, np.clip(p, 1e-15, 1.0))
    pcl = kcl / n_reps
    pcu = kcu / n_reps

    # bootstrap distribution of r, sample impact, and percentile confidence interval
    r_boot, r_diff = bootstrapcorrelation(x, y, n_reps=n_reps, n_boot=x.size, measure=lambda a, b: measure(a, b)[0])
    rbs = np.sort(r_boot)
    rcl = rbs[int(np.floor(r_boot.size * alpha / 2))]
    rcu = rbs[int(np.floor(r_boot.size * (1 - alpha / 2)))]

    gs = matplotlib.gridspec.GridSpec(2, 2)
    ax1 = fig.add_subplot(gs[:, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    ax3 = fig.add_subplot(gs[1, 1])

    mappable = ax1.scatter(x, y, c=np.absolute(r_diff), s=100, vmin=0, vmax=0.2, cmap=cmap)
    ax1.set_title('r={:.2f} (p={:.3f})'.format(r0, p0))
    fig.colorbar(mappable, ax=ax1, label='sample impact')

    ax2.hist(r_boot, bins=40, range=(-1, 1))
    ax2.axvline(rcl, color='r')
    ax2.axvline(rcu, color='r')
    ax2.set_yticks([])
    ax2.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax2.set_xticklabels([])
    ax2.set_title('Bootstrap: r $\\approx$ {:.2f} ... {:.2f} ({}% CI)'.format(rcl, rcu, (1 - alpha) * 100))

    ax3.hist(r_perm, bins=40, range=(-1, 1))
    ax3.axvline(r0, color='r')
    ax3.set_yticks([])
    ax3.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax3.set_title('random permutation test: p $\\approx$ {:.3f} ... {:.3f} ({}% CI)'.format(pcl, pcu, (1 - alpha) * 100))

    fig.tight_layout()
    return r_boot, r_diff


def bootstrapcorrelation(x, y, n_reps=10000, n_boot=None, measure=None):
    if measure is None:
        measure = lambda a, b: np.corrcoef(a, b)[0, 1]
    x = np.ravel(x)
    y = np.ravel(y)
    n_boot = x.size if n_boot is None else n_boot
    r = np.empty(n_reps)
    mask = np.zeros((n_reps, x.size), dtype=int)
    for rep in range(n_reps):
        # draw a resample with replacement and record how often each sample was drawn
        i = np.random.randint(x.size, size=n_boot)
        j, c = np.unique(i, return_counts=True)
        mask[rep, j] += c
        r[rep] = measure(x[i], y[i])
    # sample impact: mean r over resamples that include a sample
    # minus mean r over resamples that exclude it
    r_incl = np.empty(x.size)
    r_excl = np.empty(x.size)
    for i in range(x.size):
        r_incl[i] = np.nanmean(r[mask[:, i] > 0])
        r_excl[i] = np.nanmean(r[mask[:, i] == 0])
    return r, r_incl - r_excl


def permutationcorrelation(x, y, n_reps=10000, measure=None):
    if measure is None:
        measure = lambda a, b: np.corrcoef(a, b)[0, 1]
    x = np.ravel(x.copy())
    y = np.ravel(y.copy())
    r = np.empty(n_reps)
    for rep in range(n_reps):
        # shuffle both variables independently to destroy any association
        np.random.shuffle(x)
        np.random.shuffle(y)
        r[rep] = measure(x, y)
    return r


# colormap from green (low impact) over yellow to red (high impact)
cdict = {'red':   ((0.0, 0.0, 0.0),
                   (0.5, 1.0, 1.0),
                   (1.0, 1.0, 1.0)),
         'green': ((0.0, 1.0, 1.0),
                   (0.5, 1.0, 1.0),
                   (1.0, 0.0, 0.0)),
         'blue':  ((0.0, 0.0, 0.0),
                   (0.5, 0.0, 0.0),
                   (1.0, 0.0, 0.0))}
cmap = LinearSegmentedColormap('goodbad', cdict)


if __name__ == '__main__':
    np.random.seed(42)
    x = np.random.randn(25) * 10
    y = np.random.randn(25)

    # one outlier
    x[0] = 50
    y[0] = 5
    fig = matplotlib.pyplot.figure(figsize=[12, 4])
    r_boot, r_diff = plot_correlation(x, y, fig=fig)

    # another outlier
    x[1] = -10
    y[1] = 5
    fig = matplotlib.pyplot.figure(figsize=[12, 4])
    r_boot, r_diff = plot_correlation(x, y, fig=fig)
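
    # A minimal sketch (not part of the original gist) of how the returned
    # sample-impact values could be used for crude outlier screening: flag
    # samples whose absolute impact on r exceeds a threshold. The 0.1 cutoff
    # is an arbitrary assumption chosen for illustration only.
    suspects = np.where(np.abs(r_diff) > 0.1)[0]
    print('high-impact samples (potential outliers):', suspects)

    # display the figures when run with a non-interactive backend
    matplotlib.pyplot.show()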