Created
June 20, 2012 15:10
-
-
Save ffinkernagel/2960386 to your computer and use it in GitHub Desktop.
Distance correlation - not quadratic in space
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2012, Florian Finkernagel. All right reserved. | |
## Redistribution and use in source and binary forms, with or without | |
## modification, are permitted provided that the following conditions are | |
## met: | |
## * Redistributions of source code must retain the above copyright | |
## notice, this list of conditions and the following disclaimer. | |
## * Redistributions in binary form must reproduce the above | |
## copyright notice, this list of conditions and the following | |
## disclaimer in the documentation and/or other materials provided | |
## with the distribution. | |
## * Neither the name of Florian Finkernagel nor the names of its | |
## contributors may be used to endorse or promote products derived | |
## from this software without specific prior written permission. | |
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
## OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
"""An implementation of Distance Correlation (see http://en.wikipedia.org/wiki/Distance_correlation ) | |
that is not quadratic in space requirements (only in runtime).""" | |
import unittest | |
import timeit | |
import sys | |
import random | |
import numpy as np | |
cimport numpy as np | |
import cython | |
ctypedef np.double_t DTYPE_t | |
#purely for speed reasons | |
cdef inline double myabs(double a) : return a if a >= 0. else -1 * a | |
def dcov_all(x, y): | |
'Calculate distance covariance, distance correlation, distance variance of x sample and distance variance of y sample' | |
x = np.array(x, dtype=np.double) | |
y = np.array(y, dtype=np.double) | |
dnx = D_N(x) | |
dny = D_N(y) | |
denom = float(dnx.dim * dnx.dim) | |
dc = dnx.product_sum(dny) / denom | |
dvx = dnx.squared_sum() / denom | |
dvy = dny.squared_sum() / denom | |
dr = dc / (np.sqrt(dvx) * np.sqrt(dvy)) | |
return dc, dr, dvx, dvy | |
class D_N: | |
"""Inner helper of dcov_all. Cache different means that are required for calculating | |
the matrix members on the fly""" | |
def __init__(self, x): | |
self.x = np.array(x) | |
self.dim = x.shape[0] | |
self.calculate_means() | |
@cython.boundscheck(False) | |
def calculate_means(self): | |
cdef int dim = self.dim | |
cdef DTYPE_t value | |
cdef DTYPE_t sum_total = 0 | |
cdef np.ndarray[DTYPE_t, ndim=1] sum_0 = np.zeros(dim, dtype=np.double) | |
cdef np.ndarray[DTYPE_t, ndim=1] sum_1 = np.zeros(dim, dtype=np.double) | |
cdef np.ndarray[DTYPE_t, ndim=1] x = self.x | |
cdef unsigned int ii | |
cdef unsigned int jj | |
for ii in range(dim): | |
for jj in range(dim): | |
value = myabs(x[jj] - x[ii]) | |
sum_total += value | |
sum_1[jj] += value | |
sum_0[ii] += value | |
self.mean = sum_total / (self.dim**2) | |
self.mean_0 = sum_0 / (self.dim) | |
self.mean_1 = sum_1 / (self.dim) | |
return | |
@cython.boundscheck(False) | |
def squared_sum(self): | |
cdef np.ndarray[DTYPE_t, ndim=1] mean_0 = self.mean_0 | |
cdef np.ndarray[DTYPE_t, ndim=1] mean_1 = self.mean_1 | |
cdef DTYPE_t mean = self.mean | |
cdef DTYPE_t squared_sum = 0 | |
cdef DTYPE_t dist | |
cdef DTYPE_t d | |
cdef np.ndarray[DTYPE_t, ndim=1] x = self.x | |
cdef unsigned int dim = self.dim | |
cdef unsigned int ii | |
cdef unsigned int jj | |
for ii in range(dim): | |
for jj in range(dim): | |
dist = myabs(x[jj] - x[ii]) | |
d = dist - mean_0[ii] - mean_1[jj] + mean | |
squared_sum += d * d | |
return squared_sum | |
@cython.boundscheck(False) | |
def product_sum(self, other): | |
cdef np.ndarray[DTYPE_t, ndim=1] mean_0_here = self.mean_0 | |
cdef np.ndarray[DTYPE_t, ndim=1] mean_1_here = self.mean_1 | |
cdef DTYPE_t mean_here = self.mean | |
cdef np.ndarray[DTYPE_t, ndim=1] mean_0_there = other.mean_0 | |
cdef np.ndarray[DTYPE_t, ndim=1] mean_1_there = other.mean_1 | |
cdef DTYPE_t mean_there = other.mean | |
cdef DTYPE_t d_here | |
cdef DTYPE_t d_there | |
cdef DTYPE_t product_sum = 0 | |
cdef np.ndarray[DTYPE_t, ndim=1] x = self.x | |
cdef np.ndarray[DTYPE_t, ndim=1] y = other.x | |
cdef unsigned int dim = self.dim | |
cdef unsigned int ii | |
cdef unsigned int jj | |
for ii in range(dim): | |
for jj in range(dim): | |
d_here = myabs(x[jj] - x[ii]) - mean_0_here[ii] - mean_1_here[jj] + mean_here | |
d_there = myabs(y[jj] - y[ii]) - mean_0_there[ii] - mean_1_there[jj] + mean_there | |
product_sum += d_here * d_there | |
return product_sum | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment