josef-pkt/nonparametric_cuzicktrend.py

## nonparametric_cuzicktrend.py
'''Cuzick's non-parametric test of trend

Author : Josef Perktold

'''
import numpy as np

from scipy import stats
from scipy.stats.stats import rankdata, tiecorrect, square_of_sums

class Bunch(object):
    pass


# written based on scipy.stats.kruskal
def trend_cuzick(data_list, levels=None):
    """
    Compute the Cuzick trend test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
       Two or more arrays with the sample measurements can be given as
       arguments.
    levels : None, or array_like
       These are the numberic values associated with the groups. If it is None,
       then integers 1, 2, 3, ... are used.

    Returns
    -------
    res : a Bunch instance with
        p_value, test statistic and additional attributes

    Notes
    -----
    The p-value is based on the approximation by the normal distribution.
    To be a good approximation the number of samples in each group should
    not be too small.

    """
    args = list(map(np.asarray, data_list)) # convert to a numpy array
    na = len(args)               # Kruskal-Wallis on 'na' groups, each in it's own array
    if na < 2:
        raise ValueError("Need at least two groups in stats.kruskal()")
    n = np.asarray(list(map(len, args)))

    if levels is None:
        levels = np.arange(1, na + 1)
    else:
        levels = np.asarray(levels)

    alldata = np.concatenate(args)

    ranked = rankdata(alldata)  # Rank the data
    T = tiecorrect(ranked)      # Correct for ties
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    L = (levels * n).sum()

    # group indices interval
    idx = np.insert(np.cumsum(n), 0, 0)
    ranksum = [ranked[idx[i]:idx[i+1]].sum() for i in range(na)]
    # could use reduceat instead

    value = (levels * ranksum).sum()  # T in Stata manual

    nobs = n.sum()
    #tie_correction is np.sqrt(T)
    var = (nobs + 1.) / 12. * (nobs * (levels**2 * n).sum() - L**2) * T
    se = np.sqrt(var)

    value_standardized = (value - 0.5 * (nobs + 1) * L) / se #/ np.sqrt(T)

    res = Bunch()
    res.value = value
    res.p_value = stats.norm.sf(value_standardized)*2
    res.statistic = value_standardized
    res.ranksum = ranksum
    res.tiecorrection = T
    res.nobs_groups = n
    res.n_groups = na
    res.nobs = nobs
    return res

#group	exposure
data = np.array('''\
1	1.4
1	1.4
1	1.4
1	1.6
1	2.3
1	2.3
2	.9
2	1
2	1.1
2	1.1
2	1.2
2	1.2
2	1.5
2	1.9
2	2.2
2	2.6
2	2.6
2	2.6
2	2.8
2	2.8
2	3.2
2	3.5
2	4.3
2	5.1
3	.8
3	1.7
3	1.7
3	1.7
3	3.4
3	7.1
3	8.9
3	13.5'''.split(), float).reshape(-1, 2)

groups = data[:,0].astype(int)
xli = [data[data[:,0] == k, 1] for k in [1, 2, 3]]

res = trend_cuzick(xli)
from pprint import pprint
pprint(vars(res))

r = {}
r['p'] =  .1287642613976359
r['z'] =  1.51899298636755
r['T'] =  1142
r['N'] =  32
pprint(r)
print 'diff p-value', res.p_value - r['p']
print 'diff statistic', res.statistic - r['z']
print 'diff value', res.value - r['T']

print '\n changing levels'
res2 = trend_cuzick(xli, levels=[1, 2, 5])
pprint(vars(res2))
	'''Cuzick's non-parametric test of trend

	Author : Josef Perktold

	'''
	import numpy as np

	from scipy import stats
	from scipy.stats.stats import rankdata, tiecorrect, square_of_sums

	class Bunch(object):
	pass


	# written based on scipy.stats.kruskal
	def trend_cuzick(data_list, levels=None):
	"""
	Compute the Cuzick trend test for independent samples

	Parameters
	----------
	sample1, sample2, ... : array_like
	Two or more arrays with the sample measurements can be given as
	arguments.
	levels : None, or array_like
	These are the numberic values associated with the groups. If it is None,
	then integers 1, 2, 3, ... are used.

	Returns
	-------
	res : a Bunch instance with
	p_value, test statistic and additional attributes

	Notes
	-----
	The p-value is based on the approximation by the normal distribution.
	To be a good approximation the number of samples in each group should
	not be too small.

	"""
	args = list(map(np.asarray, data_list)) # convert to a numpy array
	na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array
	if na < 2:
	raise ValueError("Need at least two groups in stats.kruskal()")
	n = np.asarray(list(map(len, args)))

	if levels is None:
	levels = np.arange(1, na + 1)
	else:
	levels = np.asarray(levels)

	alldata = np.concatenate(args)

	ranked = rankdata(alldata) # Rank the data
	T = tiecorrect(ranked) # Correct for ties
	if T == 0:
	raise ValueError('All numbers are identical in kruskal')

	L = (levels * n).sum()

	# group indices interval
	idx = np.insert(np.cumsum(n), 0, 0)
	ranksum = [ranked[idx[i]:idx[i+1]].sum() for i in range(na)]
	# could use reduceat instead

	value = (levels * ranksum).sum() # T in Stata manual

	nobs = n.sum()
	#tie_correction is np.sqrt(T)
	var = (nobs + 1.) / 12. * (nobs * (levels*2 n).sum() - L*2) T
	se = np.sqrt(var)

	value_standardized = (value - 0.5 * (nobs + 1) * L) / se #/ np.sqrt(T)

	res = Bunch()
	res.value = value
	res.p_value = stats.norm.sf(value_standardized)*2
	res.statistic = value_standardized
	res.ranksum = ranksum
	res.tiecorrection = T
	res.nobs_groups = n
	res.n_groups = na
	res.nobs = nobs
	return res

	#group exposure
	data = np.array('''\
	1 1.4
	1 1.4
	1 1.4
	1 1.6
	1 2.3
	1 2.3
	2 .9
	2 1
	2 1.1
	2 1.1
	2 1.2
	2 1.2
	2 1.5
	2 1.9
	2 2.2
	2 2.6
	2 2.6
	2 2.6
	2 2.8
	2 2.8
	2 3.2
	2 3.5
	2 4.3
	2 5.1
	3 .8
	3 1.7
	3 1.7
	3 1.7
	3 3.4
	3 7.1
	3 8.9
	3 13.5'''.split(), float).reshape(-1, 2)

	groups = data[:,0].astype(int)
	xli = [data[data[:,0] == k, 1] for k in [1, 2, 3]]

	res = trend_cuzick(xli)
	from pprint import pprint
	pprint(vars(res))

	r = {}
	r['p'] = .1287642613976359
	r['z'] = 1.51899298636755
	r['T'] = 1142
	r['N'] = 32
	pprint(r)
	print 'diff p-value', res.p_value - r['p']
	print 'diff statistic', res.statistic - r['z']
	print 'diff value', res.value - r['T']

	print '\n changing levels'
	res2 = trend_cuzick(xli, levels=[1, 2, 5])
	pprint(vars(res2))