from scipy.spatial.distance import pdist, squareform | |
import numpy as np | |
import copy | |
def distcorr(Xval, Yval, pval=True, nruns=500): | |
""" Compute the distance correlation function, returning the p-value. | |
Based on Satra/distcorr.py (gist aa3d19a12b74e9ab7941) | |
>>> a = [1,2,3,4,5] | |
>>> b = np.array([1,2,9,4,4]) | |
>>> distcorr(a, b) | |
(0.76267624241686671, 0.266) | |
""" | |
X = np.atleast_1d(Xval) | |
Y = np.atleast_1d(Yval) | |
if np.prod(X.shape) == len(X): | |
X = X[:, None] | |
if np.prod(Y.shape) == len(Y): | |
Y = Y[:, None] | |
X = np.atleast_2d(X) | |
Y = np.atleast_2d(Y) | |
n = X.shape[0] | |
if Y.shape[0] != X.shape[0]: | |
raise ValueError('Number of samples must match') | |
a = squareform(pdist(X)) | |
b = squareform(pdist(Y)) | |
A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean() | |
B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean() | |
dcov2_xy = (A * B).sum() / float(n * n) | |
dcov2_xx = (A * A).sum() / float(n * n) | |
dcov2_yy = (B * B).sum() / float(n * n) | |
dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy)) | |
if pval: | |
greater = 0 | |
for i in range(nruns): | |
Y_r = copy.copy(Yval) | |
np.random.shuffle(Y_r) | |
if distcorr(Xval, Y_r, pval=False) > dcor: | |
greater += 1 | |
return (dcor, greater / float(nruns)) | |
else: | |
return dcor |
This comment has been minimized.
This comment has been minimized.
This also gives different results from |
This comment has been minimized.
This comment has been minimized.
I believe the p-value should be calculated using nruns and not n, since the p-value can be greater than 1.0 using n. So |
This comment has been minimized.
This comment has been minimized.
Hi @B1azingB1ade, thanks for your comments! I have changed the code to use np.random.shuffle. According to the docs, it only shuffles along the first axis: https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.random.shuffle.html… they seem to be cooking a way to do this: numpy/numpy#5173. @kirk86 yes, it seems like "correlation distance" is different from "distance correlation". @rmill040, correct! Thanks for spotting this bug, I just fixed it. |
This comment has been minimized.
This comment has been minimized.
Hi, |
This comment has been minimized.
This comment has been minimized.
I tried running |
This comment has been minimized.
Hi! dcor also applies to 2D arrays. The original function computes the correct dcor, but I think that the pvalue part needs the following two changes: