Skip to content

Instantly share code, notes, and snippets.

@raphaelvallat
Forked from wladston/distcorr.py
Last active December 12, 2023 06:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save raphaelvallat/386da9eff858b8bd2e647bc1be4c7566 to your computer and use it in GitHub Desktop.
Save raphaelvallat/386da9eff858b8bd2e647bc1be4c7566 to your computer and use it in GitHub Desktop.
Distance correlation with permutation test
import numpy as np
import multiprocessing
from joblib import Parallel, delayed
from scipy.spatial.distance import pdist, squareform
def _dcorr(y, n2, A, dcov2_xx):
"""Helper function for distance correlation bootstrapping.
"""
# Pairwise Euclidean distances
b = squareform(pdist(y, metric='euclidean'))
# Double centering
B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
# Compute squared distance covariances
dcov2_yy = np.vdot(B, B) / n2
dcov2_xy = np.vdot(A, B) / n2
return np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
def distcorr(x, y, n_boot=1000, seed=None):
"""Compute the distance correlation between two N-dimensional arrays.
Statistical significance (p-value) is evaluated with a permutation test.
Parameters
----------
x, y : np.ndarray
Input arrays
n_boot : int or None
Number of bootstrap to perform.
If None, no bootstrapping is performed and the function
only returns the distance correlation (no p-value).
Default is 1000 (thus giving a precision of 0.001).
seed : int or None
Random state seed.
Returns
-------
dcor : float
Distance correlation (range from 0 to 1).
pval : float
P-value.
Notes
-----
From Wikipedia:
Distance correlation is a measure of dependence between two paired
random vectors of arbitrary, not necessarily equal, dimension. The population
distance correlation coefficient is zero if and only if the random vectors
are independent. Thus, distance correlation measures both linear and
nonlinear association between two random variables or random vectors.
This is in contrast to Pearson's correlation, which can only detect
linear association between two random variables.
This function uses the Joblib package for parallel bootstrapping.
References
----------
.. [1] https://en.wikipedia.org/wiki/Distance_correlation
.. [2] https://gist.github.com/satra/aa3d19a12b74e9ab7941
.. [3] https://gist.github.com/wladston/c931b1495184fbb99bec
.. [4] https://joblib.readthedocs.io/en/latest/
Examples
--------
1. With two 1D vectors
>>> a = [1, 2, 3, 4, 5]
>>> b = [1, 2, 9, 4, 4]
>>> distcorr(a, b, seed=9)
(0.7626762424168667, 0.334)
2. With two 2D arrays and no p-value
>>> import numpy as np
>>> np.random.seed(123)
>>> a = np.random.random((10, 10))
>>> b = np.random.random((10, 10))
>>> distcorr(a, b, n_boot=None)
0.8799633012275321
"""
x = np.asarray(x)
y = np.asarray(y)
if x.ndim == 1:
x = x[:, None]
if y.ndim == 1:
y = y[:, None]
assert x.shape[0] == y.shape[0], 'x and y must have same number of samples'
# Extract number of samples
n = x.shape[0]
n2 = n**2
# Process first array to avoid redundancy when performing bootstrap
a = squareform(pdist(x, metric='euclidean'))
A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
dcov2_xx = np.vdot(A, A) / n2
# Process second array and compute final distance correlation
dcor = _dcorr(y, n2, A, dcov2_xx)
# Compute p-value using a bootstrap procedure
if n_boot is not None and n_boot > 1:
# Define random seed and permutation
rng = np.random.RandomState(seed)
bootsam = rng.random_sample((n, n_boot)).argsort(axis=0)
num_cores = multiprocessing.cpu_count()
bootstat = Parallel(n_jobs=num_cores)(delayed(_dcorr)(y[bootsam[:, i]], n2, A, dcov2_xx) for i in range(n_boot))
pval = np.greater_equal(bootstat, dcor).sum() / n_boot
return dcor, pval
else:
return dcor
# Examples
print('Example 1')
a = [1, 2, 3, 4, 5]
b = [1, 2, 9, 4, 4]
print(distcorr(a, b, seed=9))
print('Example 2')
np.random.seed(123)
a = np.random.random((10, 10))
b = np.random.random((10, 10))
print(distcorr(a, b, n_boot=None))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment