Skip to content

Instantly share code, notes, and snippets.

@josef-pkt
Last active January 1, 2023 15:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save josef-pkt/7035711 to your computer and use it in GitHub Desktop.
Save josef-pkt/7035711 to your computer and use it in GitHub Desktop.
Cuzick's non-parametric test of trend
'''Cuzick's non-parametric test of trend
Author : Josef Perktold
'''
import numpy as np
from scipy import stats
from scipy.stats.stats import rankdata, tiecorrect, square_of_sums
class Bunch(object):
pass
# written based on scipy.stats.kruskal
def trend_cuzick(data_list, levels=None):
"""
Compute the Cuzick trend test for independent samples
Parameters
----------
sample1, sample2, ... : array_like
Two or more arrays with the sample measurements can be given as
arguments.
levels : None, or array_like
These are the numberic values associated with the groups. If it is None,
then integers 1, 2, 3, ... are used.
Returns
-------
res : a Bunch instance with
p_value, test statistic and additional attributes
Notes
-----
The p-value is based on the approximation by the normal distribution.
To be a good approximation the number of samples in each group should
not be too small.
"""
args = list(map(np.asarray, data_list)) # convert to a numpy array
na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array
if na < 2:
raise ValueError("Need at least two groups in stats.kruskal()")
n = np.asarray(list(map(len, args)))
if levels is None:
levels = np.arange(1, na + 1)
else:
levels = np.asarray(levels)
alldata = np.concatenate(args)
ranked = rankdata(alldata) # Rank the data
T = tiecorrect(ranked) # Correct for ties
if T == 0:
raise ValueError('All numbers are identical in kruskal')
L = (levels * n).sum()
# group indices interval
idx = np.insert(np.cumsum(n), 0, 0)
ranksum = [ranked[idx[i]:idx[i+1]].sum() for i in range(na)]
# could use reduceat instead
value = (levels * ranksum).sum() # T in Stata manual
nobs = n.sum()
#tie_correction is np.sqrt(T)
var = (nobs + 1.) / 12. * (nobs * (levels**2 * n).sum() - L**2) * T
se = np.sqrt(var)
value_standardized = (value - 0.5 * (nobs + 1) * L) / se #/ np.sqrt(T)
res = Bunch()
res.value = value
res.p_value = stats.norm.sf(value_standardized)*2
res.statistic = value_standardized
res.ranksum = ranksum
res.tiecorrection = T
res.nobs_groups = n
res.n_groups = na
res.nobs = nobs
return res
#group exposure
data = np.array('''\
1 1.4
1 1.4
1 1.4
1 1.6
1 2.3
1 2.3
2 .9
2 1
2 1.1
2 1.1
2 1.2
2 1.2
2 1.5
2 1.9
2 2.2
2 2.6
2 2.6
2 2.6
2 2.8
2 2.8
2 3.2
2 3.5
2 4.3
2 5.1
3 .8
3 1.7
3 1.7
3 1.7
3 3.4
3 7.1
3 8.9
3 13.5'''.split(), float).reshape(-1, 2)
groups = data[:,0].astype(int)
xli = [data[data[:,0] == k, 1] for k in [1, 2, 3]]
res = trend_cuzick(xli)
from pprint import pprint
pprint(vars(res))
r = {}
r['p'] = .1287642613976359
r['z'] = 1.51899298636755
r['T'] = 1142
r['N'] = 32
pprint(r)
print 'diff p-value', res.p_value - r['p']
print 'diff statistic', res.statistic - r['z']
print 'diff value', res.value - r['T']
print '\n changing levels'
res2 = trend_cuzick(xli, levels=[1, 2, 5])
pprint(vars(res2))
@josef-pkt
Copy link
Author

<__main__.Bunch object at 0x03F276D0>
{'n_groups': 3,
 'nobs': 32,
 'nobs_groups': array([ 6, 18,  8]),
 'p_value': 0.12876426139763578,
 'ranksum': [76.0, 290.0, 162.0],
 'statistic': 1.5189929863675509,
 'tiecorrection': 0.9970674486803519,
 'value': 1142.0}
{'N': 32, 'T': 1142, 'p': 0.1287642613976359, 'z': 1.51899298636755}
diff p-value -1.11022302463e-16
diff statistic 8.881784197e-16
diff value 0.0

 changing levels
{'n_groups': 3,
 'nobs': 32,
 'nobs_groups': array([ 6, 18,  8]),
 'p_value': 0.14312133949843658,
 'ranksum': [76.0, 290.0, 162.0],
 'statistic': 1.4642657937713346,
 'tiecorrection': 0.9970674486803519,
 'value': 1466.0}

@Majte
Copy link

Majte commented Jan 1, 2023

Great! Love it! Although it's been ages since this been worked on, I'd like to propose a very minor correction. You need to take the abs of the z score for the p value, as otherwise your p is going to be larger than 1 for the reverse case, e.g. levels = [3, 2, 1]

res.p_value = stats.norm.sf(np.abs(value_standardized))*2

And one more thing... 'square_of_sums' module is not used and I believe not part of the stats,stats module anymore

Regards, Majte

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment