vmonaco/runs_test.py

## runs_test.py
"""
Multivariate Wald-Wolfowitz test for two samples in separate CSV files.

See:
  Friedman, Jerome H., and Lawrence C. Rafsky.
  "Multivariate generalizations of the Wald-Wolfowitz and Smirnov two-sample tests."
  The Annals of Statistics (1979): 697-717.

Given multivariate sample X of length m and sample Y of length n, test the null hypothesis:

    H_0: The samples were generated by the same distribution

The algorithm uses a KD-tree to construct the minimum spanning tree, therefore is O(Nk log(Nk))
instead of O(N^3), where N = m + n is the total number of observations. Though approximate,
results are generally valid. See also:

Monaco, John V.
"Classification and authentication of one-dimensional behavioral biometrics."
Biometrics (IJCB), 2014 IEEE International Joint Conference on. IEEE, 2014.

The input files should be CSV files with no header and row observations, for example:

X.csv
-----
1,2
2,2
3,1

Y.csv
-----
1,1
2,4
3,2
4,2

Usage:

$ python X.csv Y.csv

> W = 0.485, 5 runs
> p = 0.6862
> Fail to reject H_0 at 0.05 significance level
> The samples appear to have similar distribution
"""
import sys
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.neighbors import kneighbors_graph
from scipy.sparse.csgraph import minimum_spanning_tree

SIGNIFICANCE = 0.05


def mst_edges(V, k):
    """
    Construct the approximate minimum spanning tree from vectors V

    :param: V: 2D array, sequence of vectors
    :param: k: int the number of neighbor to consider for each vector
    :return: V ndarray of edges forming the MST
    """

    # k = len(X)-1 gives the exact MST
    k = min(len(V) - 1, k)

    # generate a sparse graph using the k nearest neighbors of each point
    G = kneighbors_graph(V, n_neighbors=k, mode='distance')

    # Compute the minimum spanning tree of this graph
    full_tree = minimum_spanning_tree(G, overwrite=True)

    return np.array(full_tree.nonzero()).T


def ww_test(X, Y, k=10):
    """
    Multi-dimensional Wald-Wolfowitz test

    :param X: multivariate sample X as a numpy ndarray
    :param Y: multivariate sample Y as a numpy ndarray
    :param k: number of neighbors to consider for each vector
    :return: W the WW test statistic, R the number of runs
    """
    m, n = len(X), len(Y)
    N = m + n

    XY = np.concatenate([X, Y]).astype(np.float)

    # XY += np.random.normal(0, noise_scale, XY.shape)

    edges = mst_edges(XY, k)

    labels = np.array([0] * m + [1] * n)

    c = labels[edges]
    runs_edges = edges[c[:, 0] == c[:, 1]]

    # number of runs is the total number of observations minus edges within each run
    R = N - len(runs_edges)

    # expected value of R
    e_R = ((2.0 * m * n) / N) + 1

    # variance of R is _numer/_denom
    _numer = 2 * m * n * (2 * m * n - N)
    _denom = N ** 2 * (N - 1)

    # see Eq. 1 in Friedman 1979
    # W approaches a standard normal distribution
    W = (R - e_R) / np.sqrt(_numer/_denom)

    return W, R


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: $ python runs_test.py [X] [Y]')
        sys.exit(1)

    X = pd.read_csv(sys.argv[1], header=None).values
    Y = pd.read_csv(sys.argv[2], header=None).values

    W, R = ww_test(X, Y)
    pvalue = stats.norm.cdf(W)  # one sided test
    reject = pvalue <= SIGNIFICANCE

    print('W = %.3f, %d runs' % (W, R))
    print('p = %.4f' % pvalue)
    print('%s H_0 at 0.05 significance level' % ('Reject' if reject else 'Fail to reject'))
    print('The samples appear to have %s distribution' % ('different' if reject else 'similar'))
	"""
	Multivariate Wald-Wolfowitz test for two samples in separate CSV files.

	See:
	Friedman, Jerome H., and Lawrence C. Rafsky.
	"Multivariate generalizations of the Wald-Wolfowitz and Smirnov two-sample tests."
	The Annals of Statistics (1979): 697-717.

	Given multivariate sample X of length m and sample Y of length n, test the null hypothesis:

	H_0: The samples were generated by the same distribution

	The algorithm uses a KD-tree to construct the minimum spanning tree, therefore is O(Nk log(Nk))
	instead of O(N^3), where N = m + n is the total number of observations. Though approximate,
	results are generally valid. See also:

	Monaco, John V.
	"Classification and authentication of one-dimensional behavioral biometrics."
	Biometrics (IJCB), 2014 IEEE International Joint Conference on. IEEE, 2014.

	The input files should be CSV files with no header and row observations, for example:

	X.csv
	-----
	1,2
	2,2
	3,1

	Y.csv
	-----
	1,1
	2,4
	3,2
	4,2

	Usage:

	$ python X.csv Y.csv

	> W = 0.485, 5 runs
	> p = 0.6862
	> Fail to reject H_0 at 0.05 significance level
	> The samples appear to have similar distribution
	"""
	import sys
	import numpy as np
	import pandas as pd
	import scipy.stats as stats
	from sklearn.neighbors import kneighbors_graph
	from scipy.sparse.csgraph import minimum_spanning_tree

	SIGNIFICANCE = 0.05


	def mst_edges(V, k):
	"""
	Construct the approximate minimum spanning tree from vectors V

	:param: V: 2D array, sequence of vectors
	:param: k: int the number of neighbor to consider for each vector
	:return: V ndarray of edges forming the MST
	"""

	# k = len(X)-1 gives the exact MST
	k = min(len(V) - 1, k)

	# generate a sparse graph using the k nearest neighbors of each point
	G = kneighbors_graph(V, n_neighbors=k, mode='distance')

	# Compute the minimum spanning tree of this graph
	full_tree = minimum_spanning_tree(G, overwrite=True)

	return np.array(full_tree.nonzero()).T


	def ww_test(X, Y, k=10):
	"""
	Multi-dimensional Wald-Wolfowitz test

	:param X: multivariate sample X as a numpy ndarray
	:param Y: multivariate sample Y as a numpy ndarray
	:param k: number of neighbors to consider for each vector
	:return: W the WW test statistic, R the number of runs
	"""
	m, n = len(X), len(Y)
	N = m + n

	XY = np.concatenate([X, Y]).astype(np.float)

	# XY += np.random.normal(0, noise_scale, XY.shape)

	edges = mst_edges(XY, k)

	labels = np.array([0] * m + [1] * n)

	c = labels[edges]
	runs_edges = edges[c[:, 0] == c[:, 1]]

	# number of runs is the total number of observations minus edges within each run
	R = N - len(runs_edges)

	# expected value of R
	e_R = ((2.0 * m * n) / N) + 1

	# variance of R is _numer/_denom
	_numer = 2 * m * n * (2 * m * n - N)
	_denom = N ** 2 * (N - 1)

	# see Eq. 1 in Friedman 1979
	# W approaches a standard normal distribution
	W = (R - e_R) / np.sqrt(_numer/_denom)

	return W, R


	if __name__ == '__main__':
	if len(sys.argv) != 3:
	print('Usage: $ python runs_test.py [X] [Y]')
	sys.exit(1)

	X = pd.read_csv(sys.argv[1], header=None).values
	Y = pd.read_csv(sys.argv[2], header=None).values

	W, R = ww_test(X, Y)
	pvalue = stats.norm.cdf(W) # one sided test
	reject = pvalue <= SIGNIFICANCE

	print('W = %.3f, %d runs' % (W, R))
	print('p = %.4f' % pvalue)
	print('%s H_0 at 0.05 significance level' % ('Reject' if reject else 'Fail to reject'))
	print('The samples appear to have %s distribution' % ('different' if reject else 'similar'))