Created
October 7, 2011 13:49
-
-
Save josef-pkt/1270325 to your computer and use it in GitHub Desktop.
scipy.stats.kruskal with random permutation p-values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""scipy.stats.kruskal with random permutation p-values | |
Created on Thu Oct 06 16:52:14 2011 | |
Author: scipy developers, Christopher Kuster and edited by Josef Perktold | |
License: Scipy, BSD-3 | |
https://github.com/scipy/scipy/pull/87 | |
""" | |
import numpy as np | |
from scipy import stats | |
from scipy.stats.stats import rankdata, square_of_sums, tiecorrect | |
chisqprob = stats.chi2.sf | |
def kruskal(data, nrep=100000): | |
""" | |
Compute the Kruskal-Wallis H-test for independent samples | |
The Kruskal-Wallis H-test tests the null hypothesis that the population | |
median of all of the groups are equal. It is a non-parametric version of | |
ANOVA. The test works on 2 or more independent samples, which may have | |
different sizes. Note that rejecting the null hypothesis does not | |
indicate which of the groups differs. Post-hoc comparisons between | |
groups are required to determine which groups are different. | |
Parameters | |
---------- | |
data : list of array_like | |
An iterable with two or more arrays with the sample measurements can | |
be given as arguments. | |
nrep : int | |
Number of random permutations to be used in the p-value calculations. | |
If nrep=0, then the permutation p-value is not calculated. | |
Returns | |
------- | |
H-statistic : float | |
The Kruskal-Wallis H statistic, corrected for ties | |
p-value-chi2 : float | |
The p-value for the test using the assumption that H has a chi | |
square distribution | |
p-value-perm : float | |
The p-value for the test based on random permutation | |
Notes | |
----- | |
Due to the assumption that H has a chi square distribution, the number | |
of samples in each group must not be too small. A typical rule is | |
that each sample must have at least 5 measurements. | |
In contrast to scipy.stats.kruskal, the samples are only a single | |
argument and need to be in an iterable (tuple, list or similar). | |
References | |
---------- | |
.. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance | |
""" | |
args = map(np.asarray, data) # convert to a numpy array | |
na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array | |
if na < 2: | |
raise ValueError("Need at least two groups in stats.kruskal()") | |
n = map(len, args) | |
alldata = np.concatenate(args) | |
ranked = rankdata(alldata) # Rank the data | |
T = tiecorrect(ranked) # Correct for ties | |
if T == 0: | |
raise ValueError('All numbers are identical in kruskal') | |
j = np.insert(np.cumsum(n),0,0) | |
ssbn = 0 | |
for i in range(na): | |
ssbn += square_of_sums(ranked[j[i]:j[i+1]])/float(n[i]) # Compute sum^2/n for each group | |
totaln = np.sum(n) | |
h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1) | |
df = len(args) - 1 | |
h = h / float(T) | |
#return h, chisqprob(h,df) | |
pval = np.nan | |
if nrep > 0: | |
#bootstrap by ckuster | |
# functional approximation (current implementation) | |
# pval = chisqprob(h,df) | |
# approximation via simulation (alternative, very slow implementation) | |
tot_count = 0 | |
pas_count = 0 | |
# There are a few ways of deciding when to end the following loop | |
# 1) convergence of p-value (this could take a long time) | |
# 2) maximum iteration count | |
# 3) maximum time spent | |
# I think SPSS (now PASW) has at least 1 and 3 from this list asoptions | |
while (tot_count < nrep): # this is the stupid way | |
np.random.shuffle(ranked) | |
ssbn = 0.0 | |
for i in range(na): | |
ssbn = ssbn + square_of_sums(ranked[j[i]:j[i+1]])/float(n[i]) | |
# Compute sum^2/n for each group | |
totaln = np.sum(n) | |
htest = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1) | |
htest = htest / float(T) | |
tot_count += 1 | |
if htest >= h: | |
pas_count += 1 | |
pval = float(pas_count)/float(tot_count) | |
return h, chisqprob(h,df), pval | |
if __name__ == '__main__': | |
seed = np.random.randint(1000000) | |
print 'seed', seed | |
#seed= 544859 | |
np.random.seed(seed) | |
#use array is an iterable over rows | |
print kruskal((np.array([[2,0.5,3]]).T*np.random.randn(3,10)**2)) | |
np.random.seed(seed) | |
print kruskal((np.array([[3,0.5,5]]).T*np.random.random_integers(0,7,size=(3,10))**2)) | |
np.random.seed(seed) | |
x = np.array([[1,0.5,2]]).T+np.random.random_integers(0,7,size=(3,10)) | |
print kruskal(x), 'nrep=10000' | |
print kruskal(x, nrep=1000), 'nrep=1000' | |
print kruskal(x, nrep=0), 'nrep=0' | |
print kruskal(x.tolist(), nrep=1000), 'nrep=1000, list' | |
print '\nwith unequal sizes and small samples the chisquare approximation ' \ | |
'might not be not very good' | |
x = ([4.0, 7.0, 1.0, 4.0, 3.0, 1.0, 5.0, 4.0, 1.0, 7.0], | |
[1.0, 4.0, 1.0], | |
[7.0, 4.0, 9.0, 9.0]) | |
print kruskal(x, nrep=10000), 'nrep=10000, unequal sizes' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment