ricardoV94/dunn.py

## dunn.py
import numpy as np
from scipy import stats
from itertools import combinations
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.libqsturng import psturng
import warnings


def kw_dunn(groups, to_compare=None, alpha=0.05, method='bonf'):
    """

    Kruskal-Wallis 1-way ANOVA with Dunn's multiple comparison test

    Arguments:
    ---------------
    groups: sequence
        arrays corresponding to k mutually independent samples from
        continuous populations

    to_compare: sequence
        tuples specifying the indices of pairs of groups to compare, e.g.
        [(0, 1), (0, 2)] would compare group 0 with 1 & 2. by default, all
        possible pairwise comparisons between groups are performed.

    alpha: float
        family-wise error rate used for correcting for multiple comparisons
        (see statsmodels.stats.multitest.multipletests for details)

    method: string
        method used to adjust p-values to account for multiple corrections (see
        statsmodels.stats.multitest.multipletests for options)

    Returns:
    ---------------
    H: float
        Kruskal-Wallis H-statistic

    p_omnibus: float
        p-value corresponding to the global null hypothesis that the medians of
        the groups are all equal

    Z_pairs: float array
        Z-scores computed for the absolute difference in mean ranks for each
        pairwise comparison

    p_corrected: float array
        corrected p-values for each pairwise comparison, corresponding to the
        null hypothesis that the pair of groups has equal medians. note that
        these are only meaningful if the global null hypothesis is rejected.

    reject: bool array
        True for pairs where the null hypothesis can be rejected for the given
        alpha

    Reference:
    ---------------
    Gibbons, J. D., & Chakraborti, S. (2011). Nonparametric Statistical
    Inference (5th ed., pp. 353-357). Boca Raton, FL: Chapman & Hall.

    """

    # omnibus test (K-W ANOVA)
    # -------------------------------------------------------------------------

    groups = [np.array(gg) for gg in groups]

    k = len(groups)

    n = np.array([len(gg) for gg in groups])
    if np.any(n < 5):
        warnings.warn("Sample sizes < 5 are not recommended (K-W test assumes "
                      "a chi square distribution)")

    allgroups = np.concatenate(groups)
    N = len(allgroups)
    ranked = stats.rankdata(allgroups)

    # correction factor for ties
    T = stats.tiecorrect(ranked)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    # sum of ranks for each group
    j = np.insert(np.cumsum(n), 0, 0)
    R = np.empty(k, dtype=np.float)
    for ii in range(k):
        R[ii] = ranked[j[ii]:j[ii + 1]].sum()

    # the Kruskal-Wallis H-statistic
    H = (12. / (N * (N + 1.))) * ((R ** 2.) / n).sum() - 3 * (N + 1)

    # apply correction factor for ties
    H /= T

    df_omnibus = k - 1
    p_omnibus = stats.chisqprob(H, df_omnibus)

    # multiple comparisons
    # -------------------------------------------------------------------------

    # by default we compare every possible pair of groups
    if to_compare is None:
        to_compare = tuple(combinations(range(k), 2))

    ncomp = len(to_compare)

    Z_pairs = np.empty(ncomp, dtype=np.float)
    p_uncorrected = np.empty(ncomp, dtype=np.float)
    Rmean = R / n

    for pp, (ii, jj) in enumerate(to_compare):

        # standardized score
        # Added correction suggested by @jazon33y. The funtion now produces the same output as that of R library(dunn.test)
        ts3_ts = list(np.unique(allgroups, return_counts=True)[1])
        E_ts3_ts = sum([x**3 - x for x in ts3_ts if x>1])

        if sum([x>1 for x in ts3_ts]) > 0:
            yi = np.abs(Rmean[ii] - Rmean[jj])
            theta10 = (N * (N + 1)) / 12
            theta11 =  E_ts3_ts / ( 12* (N - 1) )
            theta2 = (1 / n[ii] + 1 / n[jj])
            theta = np.sqrt( (theta10 - theta11) * theta2 )
            Zij = yi / theta
        else:
            Zij = (np.abs(Rmean[ii] - Rmean[jj]) /
                   np.sqrt((1. / 12.) * N * (N + 1) * (1. / n[ii] + 1. / n[jj])))
        Z_pairs[pp] = Zij

    # corresponding p-values obtained from upper quantiles of the standard
    # normal distribution
    p_uncorrected = stats.norm.sf(Z_pairs) * 2.

    # correction for multiple comparisons
    reject, p_corrected, alphac_sidak, alphac_bonf = multipletests(
        p_uncorrected, method=method
    )

    return H, p_omnibus, Z_pairs, p_corrected, reject
	import numpy as np
	from scipy import stats
	from itertools import combinations
	from statsmodels.stats.multitest import multipletests
	from statsmodels.stats.libqsturng import psturng
	import warnings


	def kw_dunn(groups, to_compare=None, alpha=0.05, method='bonf'):
	"""

	Kruskal-Wallis 1-way ANOVA with Dunn's multiple comparison test

	Arguments:
	---------------
	groups: sequence
	arrays corresponding to k mutually independent samples from
	continuous populations

	to_compare: sequence
	tuples specifying the indices of pairs of groups to compare, e.g.
	[(0, 1), (0, 2)] would compare group 0 with 1 & 2. by default, all
	possible pairwise comparisons between groups are performed.

	alpha: float
	family-wise error rate used for correcting for multiple comparisons
	(see statsmodels.stats.multitest.multipletests for details)

	method: string
	method used to adjust p-values to account for multiple corrections (see
	statsmodels.stats.multitest.multipletests for options)

	Returns:
	---------------
	H: float
	Kruskal-Wallis H-statistic

	p_omnibus: float
	p-value corresponding to the global null hypothesis that the medians of
	the groups are all equal

	Z_pairs: float array
	Z-scores computed for the absolute difference in mean ranks for each
	pairwise comparison

	p_corrected: float array
	corrected p-values for each pairwise comparison, corresponding to the
	null hypothesis that the pair of groups has equal medians. note that
	these are only meaningful if the global null hypothesis is rejected.

	reject: bool array
	True for pairs where the null hypothesis can be rejected for the given
	alpha

	Reference:
	---------------
	Gibbons, J. D., & Chakraborti, S. (2011). Nonparametric Statistical
	Inference (5th ed., pp. 353-357). Boca Raton, FL: Chapman & Hall.

	"""

	# omnibus test (K-W ANOVA)
	# -------------------------------------------------------------------------

	groups = [np.array(gg) for gg in groups]

	k = len(groups)

	n = np.array([len(gg) for gg in groups])
	if np.any(n < 5):
	warnings.warn("Sample sizes < 5 are not recommended (K-W test assumes "
	"a chi square distribution)")

	allgroups = np.concatenate(groups)
	N = len(allgroups)
	ranked = stats.rankdata(allgroups)

	# correction factor for ties
	T = stats.tiecorrect(ranked)
	if T == 0:
	raise ValueError('All numbers are identical in kruskal')

	# sum of ranks for each group
	j = np.insert(np.cumsum(n), 0, 0)
	R = np.empty(k, dtype=np.float)
	for ii in range(k):
	R[ii] = ranked[j[ii]:j[ii + 1]].sum()

	# the Kruskal-Wallis H-statistic
	H = (12. / (N * (N + 1.))) * ((R ** 2.) / n).sum() - 3 * (N + 1)

	# apply correction factor for ties
	H /= T

	df_omnibus = k - 1
	p_omnibus = stats.chisqprob(H, df_omnibus)

	# multiple comparisons
	# -------------------------------------------------------------------------

	# by default we compare every possible pair of groups
	if to_compare is None:
	to_compare = tuple(combinations(range(k), 2))

	ncomp = len(to_compare)

	Z_pairs = np.empty(ncomp, dtype=np.float)
	p_uncorrected = np.empty(ncomp, dtype=np.float)
	Rmean = R / n

	for pp, (ii, jj) in enumerate(to_compare):

	# standardized score
	# Added correction suggested by @jazon33y. The funtion now produces the same output as that of R library(dunn.test)
	ts3_ts = list(np.unique(allgroups, return_counts=True)[1])
	E_ts3_ts = sum([x**3 - x for x in ts3_ts if x>1])

	if sum([x>1 for x in ts3_ts]) > 0:
	yi = np.abs(Rmean[ii] - Rmean[jj])
	theta10 = (N * (N + 1)) / 12
	theta11 = E_ts3_ts / ( 12* (N - 1) )
	theta2 = (1 / n[ii] + 1 / n[jj])
	theta = np.sqrt( (theta10 - theta11) * theta2 )
	Zij = yi / theta
	else:
	Zij = (np.abs(Rmean[ii] - Rmean[jj]) /
	np.sqrt((1. / 12.) * N * (N + 1) * (1. / n[ii] + 1. / n[jj])))
	Z_pairs[pp] = Zij

	# corresponding p-values obtained from upper quantiles of the standard
	# normal distribution
	p_uncorrected = stats.norm.sf(Z_pairs) * 2.

	# correction for multiple comparisons
	reject, p_corrected, alphac_sidak, alphac_bonf = multipletests(
	p_uncorrected, method=method
	)

	return H, p_omnibus, Z_pairs, p_corrected, reject