Skip to content

Instantly share code, notes, and snippets.

@dustinstansbury
Last active January 5, 2022 00:22
Show Gist options
  • Save dustinstansbury/a880568dca5fcbec087bc120a6779e13 to your computer and use it in GitHub Desktop.
from scipy import stats
from typing import Tuple, Union
import numpy as np
import pandas as pd
def xi_corr(
    x: np.ndarray, y: np.ndarray, return_p_values: bool = False
) -> Union[float, Tuple[float, float]]:
    """Implementation of Chatterjee's correlation, which can capture both monotonic
    and non-monotonic coupling between variables. The metric incorporates degree
    of uncertainty, so as the length of x, y increases, so too does the confidence
    in the correlation.

    Parameters
    ----------
    x, y : np.ndarray or array-like
        Vectors of equal length. Any array-like (list, tuple, ndarray) is
        accepted; pairs containing NaN are dropped before estimation.
    return_p_values : bool
        Whether to return the p-value for the correlation estimate. Note that
        we use the asymptotic limit, assuming no ties amongst values.

    Returns
    -------
    xi : float
        The Chatterjee correlation.
    p_value : float (optional)
        If return_p_values=True, returns the asymptotic p-value for the
        correlation. Just like any p-value, as the length of x, y increases,
        the p-value decreases if there is any relationship between the variables.

    Raises
    ------
    ValueError
        If fewer than two non-NaN (x, y) pairs remain after preprocessing.

    References
    ----------
    - "A new coefficient of correlation": https://arxiv.org/pdf/1909.10140.pdf
    - R implementation: https://souravchatterjee.su.domains//xi.R
    """

    def _preprocess(x, y):
        # Pair the inputs column-wise in a DataFrame to take advantage of
        # pandas' rank methods, and drop rows where either value is missing.
        # np.asarray generalizes the API to any array-like input.
        df = pd.DataFrame(
            np.vstack([np.asarray(x, dtype=float).ravel(),
                       np.asarray(y, dtype=float).ravel()]).T,
            columns=["x", "y"],
        )
        return df.dropna(axis=0, how="any")

    xy = _preprocess(x, y)
    n = len(xy)
    if n < 2:
        # The estimator needs at least one consecutive pair of ranks;
        # the original silently emitted NaN with runtime warnings here.
        raise ValueError("xi_corr requires at least two non-NaN (x, y) pairs")

    # Rank x with ties broken randomly: shuffle the rows, take first-come
    # ranks, then restore the original row order. This is equivalent to
    # rank(method='random'), which isn't readily available in
    # scipy.stats.rankdata or pandas' rank.
    pi = xy.x.sample(frac=1).rank(method="first").reindex_like(xy)

    # f[i] = fraction of observations with y_j <= y_i (empirical CDF)
    f = xy.y.rank(method="max").values / n
    # Reorder f by increasing (tie-broken) rank of x
    f = f[np.argsort(pi)]
    # g[i] = fraction of observations with y_j >= y_i
    g = (-xy.y).rank(method="max").values / n

    # xi = 1 - A1 / C, per equation (1) of the paper
    A1 = np.mean(np.abs(f[:-1] - f[1:])) * (n - 1) / (2 * n)
    C = np.mean(g * (1 - g))
    xi = 1 - (A1 / C)
    if return_p_values:
        # Asymptotic null distribution (no ties): sqrt(n) * xi ~ N(0, 2/5)
        pval = 1 - stats.norm.cdf(np.sqrt(n) * xi, scale=np.sqrt(2.0 / 5))
        return xi, pval
    return xi
def xi_corr2(x, y, return_p_values: bool = False):
    """Vectorized implementation of Chatterjee's xi that's supposed to be
    >300x faster than the rank-based pandas implementation.

    Pairs containing NaN are dropped before estimation, matching the
    preprocessing performed by `xi_corr`.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal length.
    return_p_values : bool
        Whether to also return the asymptotic (no-ties) p-value.

    Returns
    -------
    xi : float
        The Chatterjee correlation.
    p_value : float (optional)
        Returned only if return_p_values=True.

    Reference: https://github.com/czbiohub/xicor/issues/17
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    # Drop pairs with missing values so results agree with xi_corr, which
    # drops NaNs during preprocessing (original returned nan here).
    valid = ~(np.isnan(x) | np.isnan(y))
    x, y = x[valid], y[valid]
    n = x.size
    # Reorder y by increasing x
    y = y[np.argsort(x, kind="quicksort")]
    # r[i] = number of j with y[j] <= y[i] ("max" rank via cumulative counts)
    _, inverse, counts = np.unique(y, return_inverse=True, return_counts=True)
    r = np.cumsum(counts)[inverse]
    # left[i] = number of j with y[j] >= y[i] (rank of -y)
    _, inverse, counts = np.unique(-y, return_inverse=True, return_counts=True)
    left = np.cumsum(counts)[inverse]
    # Equation (1) of the paper, in integer-rank form
    xi = 1 - n * np.abs(np.diff(r)).sum() / (2 * (left * (n - left)).sum())
    if return_p_values:
        # Asymptotic null distribution (no ties): sqrt(n) * xi ~ N(0, 2/5)
        pval = 1 - stats.norm.cdf(np.sqrt(n) * xi, scale=np.sqrt(2.0 / 5))
        return xi, pval
    return xi
@dustinstansbury
Copy link
Author

dustinstansbury commented Jan 4, 2022

Let's try out both algorithms

import numpy as np
from chaterjee_corr import xi_corr, xi_corr2

# Compare both implementations on random data of increasing length N;
# xi for independent (x, y) should shrink toward 0 as N grows, while the
# coupled cases (x vs. x, x^2, x^3, sin(x), sqrt(|x|)) should approach 1.
for n_obs in [10, 100, 1000, 10000]:
    s = f"N={n_obs}"
    print('-'*50)
    print(f"{s:^50}")
    print('-'*50)
    # Independent standard-normal vectors (null case)
    x = np.random.randn(n_obs)
    y = np.random.randn(n_obs)
    for algo in [xi_corr, xi_corr2]:

        # NOTE(review): the original comment said "For fun add some nans",
        # but no NaNs are actually injected below — likely leftover from an
        # earlier draft.
        print(f"{algo.__name__}(x, y)", algo(x, y, True))
        print(f"{algo.__name__}(x, x):", algo(x, x, True))
        print(f"{algo.__name__}(x, x^2):", algo(x, x ** 2, True))
        print(f"{algo.__name__}(x, x^3):", algo(x, x ** 3, True))
        print(f"{algo.__name__}(x, sin(x)):", algo(x, np.sin(x)))
        print(f"{algo.__name__}(x, sqrt(|x|)):", algo(x, np.abs(x) ** 0.5, True))

"""
--------------------------------------------------
                       N=10
--------------------------------------------------
xi_corr(x, y) (-0.15151515151515182, 0.7756475012431828)
xi_corr(x, x): (0.7272727272727273, 0.00013825695781910508)
xi_corr(x, x^2): (0.5454545454545455, 0.003193011641353549)
xi_corr(x, x^3): (0.7272727272727273, 0.00013825695781910508)
xi_corr(x, sin(x)): 0.696969696969697
xi_corr(x, sqrt(|x|)): (0.5454545454545455, 0.003193011641353549)
xi_corr2(x, y) (-0.1515151515151516, 0.7756475012431825)
xi_corr2(x, x): (0.7272727272727273, 0.00013825695781910508)
xi_corr2(x, x^2): (0.5454545454545454, 0.003193011641353549)
xi_corr2(x, x^3): (0.7272727272727273, 0.00013825695781910508)
xi_corr2(x, sin(x)): 0.696969696969697
xi_corr2(x, sqrt(|x|)): (0.5454545454545454, 0.003193011641353549)
--------------------------------------------------
                      N=100
--------------------------------------------------
xi_corr(x, y) (-0.16351635163516387, 0.9951369854408957)
xi_corr(x, x): (0.9702970297029703, 0.0)
xi_corr(x, x^2): (0.9411941194119412, 0.0)
xi_corr(x, x^3): (0.9702970297029703, 0.0)
xi_corr(x, sin(x)): 0.9567956795679569
xi_corr(x, sqrt(|x|)): (0.9411941194119412, 0.0)
xi_corr2(x, y) (-0.16351635163516343, 0.9951369854408956)
xi_corr2(x, x): (0.9702970297029703, 0.0)
xi_corr2(x, x^2): (0.9411941194119412, 0.0)
xi_corr2(x, x^3): (0.9702970297029703, 0.0)
xi_corr2(x, sin(x)): 0.9567956795679567
xi_corr2(x, sqrt(|x|)): (0.9411941194119412, 0.0)
--------------------------------------------------
                      N=1000
--------------------------------------------------
xi_corr(x, y) (-0.031245031245031196, 0.9408856310111086)
xi_corr(x, x): (0.997002997002997, 0.0)
xi_corr(x, x^2): (0.994011994011994, 0.0)
xi_corr(x, x^3): (0.997002997002997, 0.0)
xi_corr(x, sin(x)): 0.9941109941109941
xi_corr(x, sqrt(|x|)): (0.994011994011994, 0.0)
xi_corr2(x, y) (-0.031245031245031196, 0.9408856310111086)
xi_corr2(x, x): (0.997002997002997, 0.0)
xi_corr2(x, x^2): (0.994011994011994, 0.0)
xi_corr2(x, x^3): (0.997002997002997, 0.0)
xi_corr2(x, sin(x)): 0.9941109941109941
xi_corr2(x, sqrt(|x|)): (0.994011994011994, 0.0)
--------------------------------------------------
                     N=10000
--------------------------------------------------
xi_corr(x, y) (-0.002851350028513666, 0.6739468532599158)
xi_corr(x, x): (0.9997000299970003, 0.0)
xi_corr(x, x^2): (0.9994000899940009, 0.0)
xi_corr(x, x^3): (0.9997000299970003, 0.0)
xi_corr(x, sin(x)): 0.9991856799918568
xi_corr(x, sqrt(|x|)): (0.9994000899940009, 0.0)
xi_corr2(x, y) (-0.002851350028513444, 0.6739468532599031)
xi_corr2(x, x): (0.9997000299970003, 0.0)
xi_corr2(x, x^2): (0.9994000899940009, 0.0)
xi_corr2(x, x^3): (0.9997000299970003, 0.0)
xi_corr2(x, sin(x)): 0.9991856799918568
xi_corr2(x, sqrt(|x|)): (0.9994000899940009, 0.0)
"""

Results look the same for both implementations.

@dustinstansbury
Copy link
Author

dustinstansbury commented Jan 5, 2022

In [38]: %timeit xi_corr(x, y)
3.99 ms ± 22.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [39]: %timeit xi_corr2(x, y)
1.86 ms ± 5.81 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Not 300x faster, but the second implementation does look a little over 2x faster.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment