Last active
December 8, 2015 15:40
-
-
Save mbillingr/7a2c1ab6a0ecc471f452 to your computer and use it in GitHub Desktop.
Fancy correlation plots with some sort of outlier detection.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Released under The MIT License (MIT) | |
# http://opensource.org/licenses/MIT | |
# Copyright (c) 2015 Martin Billinger | |
import numpy as np | |
import scipy as sp | |
import matplotlib | |
from matplotlib.colors import LinearSegmentedColormap | |
def plot_correlation(x, y, measure=sp.stats.pearsonr, alpha=0.05, n_reps=10000, fig=None): | |
"""This function shows useful information about the correlation of two variables. | |
1. A scatter plot with pearson's correlation coefficient r, and p-value p | |
Points in the scatter plot are colored by "sample impact". This is the | |
amount the correlation changes when removing a particular sample (details | |
see below). | |
2. Bootstrap distribution of r with confidence interval. | |
3. Distribution of r under the null hypothesis (no correlation), obtained | |
with random permutation of the variables. A confidence interval of | |
the p-value is obtained from this distribution. | |
Note that the correlation can be very sensitive to outliers In the example | |
an outlier is added to 14 samples from two uncorrelated variables. This | |
outlier leads to a significant correlation (r=0.73, p=0.002). Also, the | |
permutation test does not help, while bootstrapping does a little bit | |
better. The bootstrap confidence interval of r overlaps 0 and the | |
distribution is clearly bimodal. The modes correspond to bootstrap resamples | |
that include or exclude the outlier. | |
The "sample impact" metric uses this information to quantify how much | |
impact each sample has on the resulting correlation. For each sample, the | |
average correlation is computed from all bootstrap resamples that include | |
the sample, and again for all resamples that do not include it. The sample | |
impact is the difference of these two correlations. | |
The sample impact is useful for outlier detection since outliers will often | |
have a high impact. However, not every high-impact sample is an outlier. | |
Especially with small sample sizes most individual samples have a high | |
impact on correlation. This highlights the problematic of interpreting | |
correlations in small sample sizes. | |
""" | |
if fig is None: | |
fig = matplotlib.pyplot.figure() | |
elif not isinstance(fig, matplotlib.figure.Figure): | |
fig = matplotlib.pyplot.figure(**fig) | |
r0, p0 = measure(x, y) | |
r_perm = permutationcorrelation(x, y, n_reps=n_reps, measure=lambda a, b: measure(a, b)[0]) | |
p = sp.sum(np.abs(r_perm) >= np.abs(r0)) / n_reps | |
kcl, kcu = sp.stats.binom.interval(1 - alpha/2, n_reps, sp.clip(p, 1e-15, 1.0)) | |
pcl = kcl / n_reps | |
pcu = kcu / n_reps | |
r_boot, r_diff = bootstrapcorrelation(x, y, n_reps=n_reps, n_boot=x.size, measure=lambda a, b: measure(a, b)[0]) | |
rbs = sp.sort(r_boot) | |
rcl = rbs[sp.floor(r_boot.size*alpha/2)] | |
rcu = rbs[sp.floor(r_boot.size*(1 - alpha/2))] | |
gs = matplotlib.gridspec.GridSpec(2, 2) | |
ax1 = fig.add_subplot(gs[:, 0]) | |
ax2 = fig.add_subplot(gs[0, 1]) | |
ax3 = fig.add_subplot(gs[1, 1]) | |
mappable = ax1.scatter(x, y, c=sp.absolute(r_diff), s=100, vmin=0, vmax=0.2, cmap=cmap) | |
#plt.colorbar() | |
ax1.set_title('r={:.2f} (p={:.3f})'.format(r0, p0)) | |
cb = fig.colorbar(mappable, ax=ax1, label='sample impact') | |
ax2.hist(r_boot, bins=40, range=(-1, 1)) | |
ax2.axvline(rcl, color='r') | |
ax2.axvline(rcu, color='r') | |
ax2.set_yticks([]) | |
ax2.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) | |
ax2.set_xticklabels([]) | |
ax2.set_title('Bootstrap: r $\\approx$ {:.2f} ... {:.2f} ({}% CI)'.format(rcl, rcu, (1-alpha)*100)) | |
ax3.hist(r_perm, bins=40, range=(-1, 1)) | |
ax3.axvline(r0, color='r') | |
ax3.set_yticks([]) | |
ax3.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) | |
ax3.set_title('random permutation test: p $\\approx$ {:.3f} ... {:.3f} ({}% CI)'.format(pcl, pcu, (1-alpha)*100)) | |
fig.tight_layout() | |
return r_boot, r_diff | |
def bootstrapcorrelation(x, y, n_reps=10000, n_boot=None, measure=None): | |
if measure is None: | |
measure = lambda a, b: sp.corrcoef(a, b)[0, 1] | |
x = sp.ravel(x) | |
y = sp.ravel(y) | |
n_boot = x.size if n_boot is None else n_boot | |
r = sp.empty(n_reps) | |
mask = sp.zeros((n_reps, x.size), dtype=int) | |
for rep in range(n_reps): | |
i = sp.random.randint(x.size, size=n_boot) | |
j, c = sp.unique(i, return_counts=True) | |
mask[rep, j] += c | |
r[rep] = measure(x[i], y[i]) | |
r_incl = sp.empty(x.size) | |
r_excl = sp.empty(x.size) | |
for i in range(x.size): | |
#r_incl[i] = sp.average(r, weights=mask[:, i]) | |
r_incl[i] = sp.nanmean(r[mask[:, i] > 0]) | |
r_excl[i] = sp.nanmean(r[mask[:, i] == 0]) | |
return r, r_incl - r_excl | |
def permutationcorrelation(x, y, n_reps=10000, measure=None): | |
if measure is None: | |
measure = lambda a, b: sp.corrcoef(a, b)[0, 1] | |
x = sp.ravel(x.copy()) | |
y = sp.ravel(y.copy()) | |
r = sp.empty(n_reps) | |
for rep in range(n_reps): | |
sp.random.shuffle(x) | |
sp.random.shuffle(y) | |
r[rep] = measure(x, y) | |
return r | |
cdict = {'red': ((0.0, 0.0, 0.0), | |
(0.5, 1.0, 1.0), | |
(1.0, 1.0, 1.0)), | |
'green': ((0.0, 1.0, 1.0), | |
(0.5, 1.0, 1.0), | |
(1.0, 0.0, 0.0)), | |
'blue': ((0.0, 0.0, 0.0), | |
(0.5, 0.0, 0.0), | |
(1.0, 0.0, 0.0))} | |
cmap = LinearSegmentedColormap('goodbad', cdict) | |
if __name__ == '__main__': | |
sp.random.seed(42) | |
x = sp.random.randn(25) * 10 | |
y = sp.random.randn(25) | |
# one outlier | |
x[0] = 50 | |
y[0] = 5 | |
fig = matplotlib.pyplot.figure(figsize=[12, 4]) | |
r_boot, r_diff = plot_correlation(x, y, fig=fig) | |
# another outlier | |
x[1] = -10 | |
y[1] = 5 | |
fig = matplotlib.pyplot.figure(figsize=[12, 4]) | |
r_boot, r_diff = plot_correlation(x, y, fig=fig) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment