josef-pkt/kruskal_permuted.py

## kruskal_permuted.py
# -*- coding: utf-8 -*-
"""scipy.stats.kruskal with random permutation p-values

Created on Thu Oct 06 16:52:14 2011

Author: scipy developers, Christopher Kuster and edited by Josef Perktold
License: Scipy, BSD-3

https://github.com/scipy/scipy/pull/87
"""

import numpy as np
from scipy import stats
from scipy.stats.stats import rankdata, square_of_sums, tiecorrect

chisqprob = stats.chi2.sf

def kruskal(data, nrep=100000):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    The Kruskal-Wallis H-test tests the null hypothesis that the population
    median of all of the groups are equal. It is a non-parametric version of
    ANOVA. The test works on 2 or more independent samples, which may have
    different sizes. Note that rejecting the null hypothesis does not
    indicate which of the groups differs. Post-hoc comparisons between
    groups are required to determine which groups are different.

    Parameters
    ----------
    data : list of array_like
        An iterable with two or more arrays with the sample measurements can
        be given as arguments.
    nrep : int
        Number of random permutations to be used in the p-value calculations.
        If nrep=0, then the permutation p-value is not calculated.

    Returns
    -------
    H-statistic : float
    The Kruskal-Wallis H statistic, corrected for ties
    p-value-chi2 : float
        The p-value for the test using the assumption that H has a chi
        square distribution
    p-value-perm : float
        The p-value for the test based on random permutation

    Notes
    -----
    Due to the assumption that H has a chi square distribution, the number
    of samples in each group must not be too small. A typical rule is
    that each sample must have at least 5 measurements.

    In contrast to scipy.stats.kruskal, the samples are only a single
    argument and need to be in an iterable (tuple, list or similar).

    References
    ----------
    .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance

    """
    args = map(np.asarray, data) # convert to a numpy array
    na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array
    if na < 2:
        raise ValueError("Need at least two groups in stats.kruskal()")
    n = map(len, args)

    alldata = np.concatenate(args)

    ranked = rankdata(alldata) # Rank the data
    T = tiecorrect(ranked) # Correct for ties
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    j = np.insert(np.cumsum(n),0,0)
    ssbn = 0
    for i in range(na):
        ssbn += square_of_sums(ranked[j[i]:j[i+1]])/float(n[i]) # Compute sum^2/n for each group

    totaln = np.sum(n)
    h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
    df = len(args) - 1
    h = h / float(T)
    #return h, chisqprob(h,df)

    pval = np.nan
    if nrep > 0:
        #bootstrap by ckuster

        # functional approximation (current implementation)
        #    pval = chisqprob(h,df)
        # approximation via simulation (alternative, very slow implementation)
        tot_count = 0
        pas_count = 0
        # There are a few ways of deciding when to end the following loop
        #     1) convergence of p-value (this could take a long time)
        #     2) maximum iteration count
        #     3) maximum time spent
        # I think SPSS (now PASW) has at least 1 and 3 from this list asoptions
        while (tot_count < nrep):  # this is the stupid way
            np.random.shuffle(ranked)
            ssbn = 0.0
            for i in range(na):
                ssbn = ssbn + square_of_sums(ranked[j[i]:j[i+1]])/float(n[i])
                # Compute sum^2/n for each group
            totaln = np.sum(n)
            htest = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
            htest = htest / float(T)
            tot_count += 1
            if htest >= h:
                pas_count += 1
        pval = float(pas_count)/float(tot_count)
    return h, chisqprob(h,df), pval

if __name__ == '__main__':

    seed = np.random.randint(1000000)
    print 'seed', seed
    #seed= 544859
    np.random.seed(seed)
    #use array is an iterable over rows
    print kruskal((np.array([[2,0.5,3]]).T*np.random.randn(3,10)**2))
    np.random.seed(seed)
    print kruskal((np.array([[3,0.5,5]]).T*np.random.random_integers(0,7,size=(3,10))**2))
    np.random.seed(seed)
    x = np.array([[1,0.5,2]]).T+np.random.random_integers(0,7,size=(3,10))
    print kruskal(x), 'nrep=10000'
    print kruskal(x, nrep=1000), 'nrep=1000'
    print kruskal(x, nrep=0), 'nrep=0'
    print kruskal(x.tolist(), nrep=1000), 'nrep=1000, list'

    print '\nwith unequal sizes and small samples the chisquare approximation ' \
          'might not be not very good'
    x = ([4.0, 7.0, 1.0, 4.0, 3.0, 1.0, 5.0, 4.0, 1.0, 7.0],
         [1.0, 4.0, 1.0],
         [7.0, 4.0, 9.0, 9.0])
    print kruskal(x, nrep=10000), 'nrep=10000, unequal sizes'
	# -- coding: utf-8 --
	"""scipy.stats.kruskal with random permutation p-values

	Created on Thu Oct 06 16:52:14 2011

	Author: scipy developers, Christopher Kuster and edited by Josef Perktold
	License: Scipy, BSD-3

	https://github.com/scipy/scipy/pull/87
	"""

	import numpy as np
	from scipy import stats
	from scipy.stats.stats import rankdata, square_of_sums, tiecorrect

	chisqprob = stats.chi2.sf

	def kruskal(data, nrep=100000):
	"""
	Compute the Kruskal-Wallis H-test for independent samples

	The Kruskal-Wallis H-test tests the null hypothesis that the population
	median of all of the groups are equal. It is a non-parametric version of
	ANOVA. The test works on 2 or more independent samples, which may have
	different sizes. Note that rejecting the null hypothesis does not
	indicate which of the groups differs. Post-hoc comparisons between
	groups are required to determine which groups are different.

	Parameters
	----------
	data : list of array_like
	An iterable with two or more arrays with the sample measurements can
	be given as arguments.
	nrep : int
	Number of random permutations to be used in the p-value calculations.
	If nrep=0, then the permutation p-value is not calculated.

	Returns
	-------
	H-statistic : float
	The Kruskal-Wallis H statistic, corrected for ties
	p-value-chi2 : float
	The p-value for the test using the assumption that H has a chi
	square distribution
	p-value-perm : float
	The p-value for the test based on random permutation

	Notes
	-----
	Due to the assumption that H has a chi square distribution, the number
	of samples in each group must not be too small. A typical rule is
	that each sample must have at least 5 measurements.

	In contrast to scipy.stats.kruskal, the samples are only a single
	argument and need to be in an iterable (tuple, list or similar).

	References
	----------
	.. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance

	"""
	args = map(np.asarray, data) # convert to a numpy array
	na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array
	if na < 2:
	raise ValueError("Need at least two groups in stats.kruskal()")
	n = map(len, args)

	alldata = np.concatenate(args)

	ranked = rankdata(alldata) # Rank the data
	T = tiecorrect(ranked) # Correct for ties
	if T == 0:
	raise ValueError('All numbers are identical in kruskal')

	j = np.insert(np.cumsum(n),0,0)
	ssbn = 0
	for i in range(na):
	ssbn += square_of_sums(ranked[j[i]:j[i+1]])/float(n[i]) # Compute sum^2/n for each group

	totaln = np.sum(n)
	h = 12.0 / (totaln(totaln+1)) ssbn - 3*(totaln+1)
	df = len(args) - 1
	h = h / float(T)
	#return h, chisqprob(h,df)

	pval = np.nan
	if nrep > 0:
	#bootstrap by ckuster

	# functional approximation (current implementation)
	# pval = chisqprob(h,df)
	# approximation via simulation (alternative, very slow implementation)
	tot_count = 0
	pas_count = 0
	# There are a few ways of deciding when to end the following loop
	# 1) convergence of p-value (this could take a long time)
	# 2) maximum iteration count
	# 3) maximum time spent
	# I think SPSS (now PASW) has at least 1 and 3 from this list asoptions
	while (tot_count < nrep): # this is the stupid way
	np.random.shuffle(ranked)
	ssbn = 0.0
	for i in range(na):
	ssbn = ssbn + square_of_sums(ranked[j[i]:j[i+1]])/float(n[i])
	# Compute sum^2/n for each group
	totaln = np.sum(n)
	htest = 12.0 / (totaln(totaln+1)) ssbn - 3*(totaln+1)
	htest = htest / float(T)
	tot_count += 1
	if htest >= h:
	pas_count += 1
	pval = float(pas_count)/float(tot_count)
	return h, chisqprob(h,df), pval

	if __name__ == '__main__':

	seed = np.random.randint(1000000)
	print 'seed', seed
	#seed= 544859
	np.random.seed(seed)
	#use array is an iterable over rows
	print kruskal((np.array([[2,0.5,3]]).Tnp.random.randn(3,10)*2))
	np.random.seed(seed)
	print kruskal((np.array([[3,0.5,5]]).Tnp.random.random_integers(0,7,size=(3,10))*2))
	np.random.seed(seed)
	x = np.array([[1,0.5,2]]).T+np.random.random_integers(0,7,size=(3,10))
	print kruskal(x), 'nrep=10000'
	print kruskal(x, nrep=1000), 'nrep=1000'
	print kruskal(x, nrep=0), 'nrep=0'
	print kruskal(x.tolist(), nrep=1000), 'nrep=1000, list'

	print '\nwith unequal sizes and small samples the chisquare approximation ' \
	'might not be not very good'
	x = ([4.0, 7.0, 1.0, 4.0, 3.0, 1.0, 5.0, 4.0, 1.0, 7.0],
	[1.0, 4.0, 1.0],
	[7.0, 4.0, 9.0, 9.0])
	print kruskal(x, nrep=10000), 'nrep=10000, unequal sizes'