Fancy correlation plots with some sort of outlier detection.
# Released under The MIT License (MIT)
# http://opensource.org/licenses/MIT
# Copyright (c) 2015 Martin Billinger

import numpy as np
import scipy as sp
import scipy.stats
import matplotlib
import matplotlib.pyplot
import matplotlib.gridspec
from matplotlib.colors import LinearSegmentedColormap


def plot_correlation(x, y, measure=sp.stats.pearsonr, alpha=0.05, n_reps=10000, fig=None):
    """This function shows useful information about the correlation of two variables.

    1. A scatter plot with Pearson's correlation coefficient r and p-value p.
       Points in the scatter plot are colored by "sample impact", which is the
       amount the correlation changes when a particular sample is removed
       (details below).
    2. Bootstrap distribution of r with confidence interval.
    3. Distribution of r under the null hypothesis (no correlation), obtained
       by random permutation of the variables. A confidence interval of the
       p-value is obtained from this distribution.

    Note that the correlation can be very sensitive to outliers. In the example
    an outlier is added to 14 samples from two uncorrelated variables. This
    outlier leads to a significant correlation (r=0.73, p=0.002). The
    permutation test does not help, while bootstrapping does a little better:
    the bootstrap confidence interval of r overlaps 0 and the distribution is
    clearly bimodal. The modes correspond to bootstrap resamples that include
    or exclude the outlier.

    The "sample impact" metric uses this information to quantify how much
    impact each sample has on the resulting correlation. For each sample, the
    average correlation is computed over all bootstrap resamples that include
    the sample, and again over all resamples that exclude it. The sample
    impact is the difference between these two averages.

    The sample impact is useful for outlier detection, since outliers often
    have a high impact. However, not every high-impact sample is an outlier.
    Especially with small sample sizes, most individual samples have a high
    impact on the correlation. This highlights the problem of interpreting
    correlations computed from small samples. (The example at the bottom of
    this file shows one way to flag high-impact samples.)
    """
    if fig is None:
        fig = matplotlib.pyplot.figure()
    elif not isinstance(fig, matplotlib.figure.Figure):
        fig = matplotlib.pyplot.figure(**fig)

    # observed correlation and parametric p-value
    r0, p0 = measure(x, y)

    # permutation test: p-value with a binomial confidence interval
    r_perm = permutationcorrelation(x, y, n_reps=n_reps, measure=lambda a, b: measure(a, b)[0])
    p = np.sum(np.abs(r_perm) >= np.abs(r0)) / n_reps
    kcl, kcu = sp.stats.binom.interval(1 - alpha/2, n_reps, np.clip(p, 1e-15, 1.0))
    pcl = kcl / n_reps
    pcu = kcu / n_reps

    # bootstrap distribution of r, sample impact, and percentile confidence interval
    r_boot, r_diff = bootstrapcorrelation(x, y, n_reps=n_reps, n_boot=x.size, measure=lambda a, b: measure(a, b)[0])
    rbs = np.sort(r_boot)
    rcl = rbs[int(np.floor(r_boot.size * alpha / 2))]
    rcu = rbs[int(np.floor(r_boot.size * (1 - alpha / 2)))]

    gs = matplotlib.gridspec.GridSpec(2, 2)
    ax1 = fig.add_subplot(gs[:, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    ax3 = fig.add_subplot(gs[1, 1])

    mappable = ax1.scatter(x, y, c=np.absolute(r_diff), s=100, vmin=0, vmax=0.2, cmap=cmap)
    ax1.set_title('r={:.2f} (p={:.3f})'.format(r0, p0))
    fig.colorbar(mappable, ax=ax1, label='sample impact')

    ax2.hist(r_boot, bins=40, range=(-1, 1))
    ax2.axvline(rcl, color='r')
    ax2.axvline(rcu, color='r')
    ax2.set_yticks([])
    ax2.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax2.set_xticklabels([])
    ax2.set_title('Bootstrap: r $\\approx$ {:.2f} ... {:.2f} ({}% CI)'.format(rcl, rcu, (1 - alpha) * 100))

    ax3.hist(r_perm, bins=40, range=(-1, 1))
    ax3.axvline(r0, color='r')
    ax3.set_yticks([])
    ax3.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax3.set_title('random permutation test: p $\\approx$ {:.3f} ... {:.3f} ({}% CI)'.format(pcl, pcu, (1 - alpha) * 100))

    fig.tight_layout()
    return r_boot, r_diff


def bootstrapcorrelation(x, y, n_reps=10000, n_boot=None, measure=None):
    if measure is None:
        measure = lambda a, b: np.corrcoef(a, b)[0, 1]
    x = np.ravel(x)
    y = np.ravel(y)
    n_boot = x.size if n_boot is None else n_boot
    r = np.empty(n_reps)
    mask = np.zeros((n_reps, x.size), dtype=int)
    for rep in range(n_reps):
        # draw a resample with replacement and record how often each sample was drawn
        i = np.random.randint(x.size, size=n_boot)
        j, c = np.unique(i, return_counts=True)
        mask[rep, j] += c
        r[rep] = measure(x[i], y[i])
    # sample impact: mean r over resamples that include a sample
    # minus mean r over resamples that exclude it
    r_incl = np.empty(x.size)
    r_excl = np.empty(x.size)
    for i in range(x.size):
        r_incl[i] = np.nanmean(r[mask[:, i] > 0])
        r_excl[i] = np.nanmean(r[mask[:, i] == 0])
    return r, r_incl - r_excl


def permutationcorrelation(x, y, n_reps=10000, measure=None):
    if measure is None:
        measure = lambda a, b: np.corrcoef(a, b)[0, 1]
    x = np.ravel(x.copy())
    y = np.ravel(y.copy())
    r = np.empty(n_reps)
    for rep in range(n_reps):
        # shuffle both variables independently to destroy any association
        np.random.shuffle(x)
        np.random.shuffle(y)
        r[rep] = measure(x, y)
    return r


# colormap from green (low impact) over yellow to red (high impact)
cdict = {'red':   ((0.0, 0.0, 0.0),
                   (0.5, 1.0, 1.0),
                   (1.0, 1.0, 1.0)),
         'green': ((0.0, 1.0, 1.0),
                   (0.5, 1.0, 1.0),
                   (1.0, 0.0, 0.0)),
         'blue':  ((0.0, 0.0, 0.0),
                   (0.5, 0.0, 0.0),
                   (1.0, 0.0, 0.0))}
cmap = LinearSegmentedColormap('goodbad', cdict)


if __name__ == '__main__':
    np.random.seed(42)
    x = np.random.randn(25) * 10
    y = np.random.randn(25)

    # one outlier
    x[0] = 50
    y[0] = 5
    fig = matplotlib.pyplot.figure(figsize=[12, 4])
    r_boot, r_diff = plot_correlation(x, y, fig=fig)

    # another outlier
    x[1] = -10
    y[1] = 5
    fig = matplotlib.pyplot.figure(figsize=[12, 4])
    r_boot, r_diff = plot_correlation(x, y, fig=fig)
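
    # A minimal sketch (not part of the original gist) of how the returned
    # sample-impact values could be used for crude outlier screening: flag
    # samples whose absolute impact on r exceeds a threshold. The 0.1 cutoff
    # is an arbitrary assumption chosen for illustration only.
    suspects = np.where(np.abs(r_diff) > 0.1)[0]
    print('high-impact samples (potential outliers):', suspects)

    # display the figures when run with a non-interactive backend
    matplotlib.pyplot.show()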