jeanbaptisteb/fei.py

## fei.py
"""
Python implementation of the פ (Fei) effect size for goodness-of-fit tests.
This is a beta version. Confidence intervals may differ from the original R implementation (see the "Returns" section below).

Reference:  Ben-Shachar, M.S.; Patil, I.; Thériault, R.; Wiernik, B.M.; Lüdecke, D. Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi-Squared Statistic. Mathematics 2023, 11, 1982. https://doi.org/10.3390/math11091982
For the original R implementation of Fei, see the "effectsize" library https://easystats.github.io/effectsize/reference/phi.html.
"""
from scipy.stats import chisquare, ncx2
from scipy.optimize import least_squares
from collections import namedtuple
import numpy as np
import math
def fei(obs, exp, ci=0.95, alternative="greater"):
    """
    Parameters
    ----------
    obs : list or numpy array
        Observed counts. Must be a one dimensional table of counts.
        Example: obs=[10,50,120,20]
    exp : list or numpy array
        Expected counts or probabilities. Must be a one dimensional table of counts or probabilities.
        Examples:
            uniform counts: exp=[50, 50, 50, 50]
            uniform probabilities: exp=[0.25, 0.25, 0.25, 0.25]
            non-uniform counts: exp=[50,10,90,50]
            non-uniform probabilities: exp=[0.25, 0.05, 0.45, 0.25]
    ci: float
        Confidence interval level.
    alternative: string
        Alternative hypothesis. Controls the type of confidence interval returned.
        Possible values:
            - "greater" (default)
            - "two.sided"
            - "less"
        If "greater" is selected, the upper confidence interval limit will always be 1.
        If "less" is selected, the lower confidence interval limit will always be 0.


    Returns
    -------
    statistic : float
        Returns the fei coefficient, as described by Ben-Shachar et al. (2023).
    interval: tuple of two floats
        Returns the confidence interval.
        Note that if "less" set as the alternative, the lower interval limit will always be 0, while if it's set
        to "greater", the upper interval limit will always be 1.
        The confidence interval is calculated by using the non-centrality parameter method (aka the "pivot method"),
        similarly to the R implementation of Fei in the library effectsize and to what is described in Ben-Shachar et al. (2023).
        However, the "pivot" calculation uses SciPy's least squares optimization function, which may result in
        different confidence intervals than the R implementation of Fei in a few cases.
        For details about SciPy's least squares optimization function, see https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.least_squares.html

    Reference
    -------
    Ben-Shachar, M.S.; Patil, I.; Thériault, R.; Wiernik, B.M.; Lüdecke, D. Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi-Squared Statistic. Mathematics 2023, 11, 1982. https://doi.org/10.3390/math11091982
    """

    result_tuple = namedtuple('Fei', ['statistic', 'interval'])
    # =============================================================================
    # Fei statistic calculations
    # =============================================================================
    obs = np.array(obs)
    exp = np.array(exp)
    if len(obs.shape) > 1 or len(exp.shape) >1:
        raise ValueError(f"Observed counts and expected counts/probabilities must be both one-dimensional tables. You passed a {len(obs.shape)}-dimensional table for observed counts, and a {len(exp.shape)}-dimensional table for expected counts/probabilities.")
    N=obs.sum()
    if exp.sum() <= 1:
        exp = exp*N
    exp_probas = exp/exp.sum()
    chi, p =  chisquare(f_obs=obs, f_exp=exp)
    min_p = exp_probas.min()
    fei = np.sqrt(chi/(N*((1/min_p )-1)))

    # =============================================================================
    # confidence intervals calculations
    # =============================================================================
    def ncp_fun(nc, chi, dof, q):
        #internal function to calculate confidence intervals according to the "pivot method"
        p1 = ncx2.cdf(chi,df=dof, nc=nc)
        res = p1 - q
        return res
    alpha = 1-ci
    dof = len(obs)-1
    lower_ci = 0
    upper_ci = 1

    if alternative in ["two-sided", "two.sided"]:
        upper_ncp = least_squares(ncp_fun,
                           chi,
                           args=(chi, dof, alpha/2),
                           bounds = (chi, N * ((1/min_p)-1)),
                           ).x[0]
        upper_ci = np.sqrt(upper_ncp/ (N * ((1/min_p)-1)))
        if chi == 0 or \
            math.isclose(ncx2.ppf(1-alpha, df=dof, nc=0, loc=0, scale=1), chi,
                         abs_tol=1e-3):
            lower_ncp = 0
        else:
            lower_ncp = least_squares(ncp_fun,
                               chi,
                               args=(chi, dof, 1-alpha/2),
                               bounds=(-0.0,chi),
                               ).x[0]
        lower_ci = np.sqrt(lower_ncp/ (N * ((1/min_p)-1)))

    elif alternative == "less":
        upper_ncp = least_squares(ncp_fun,
                           chi,
                           args=(chi, dof, alpha),
                           bounds=(chi,N * ((1/min_p)-1)),
                           ).x[0]
        upper_ci= np.sqrt(upper_ncp / (N * ((1/min_p)-1)))

    elif alternative == "greater":

        if chi == 0 or \
            math.isclose(ncx2.ppf(1-alpha, df=dof, nc=0, loc=0, scale=1), chi,
                         abs_tol=1e-3):
            upper_ncp = 0
        else:
            upper_ncp= least_squares(ncp_fun,
                                    chi,
                                    args=(chi, dof, 1-alpha),
                                    bounds=(0,chi),

                                    ).x[0]

        lower_ci= np.sqrt(upper_ncp / (N * ((1/min_p)-1)))

    result = result_tuple(fei, (lower_ci, upper_ci))
    return result

## fei_basic_examples.py
# =============================================================================
# Examples from https://easystats.github.io/effectsize/articles/xtabs.html#goodness-of-fit
# =============================================================================
O = [89, 37, 130, 28, 2] # observed group sizes
E = [.40, .20, .20, .15, .05] # expected group freq
fei(O, E) # == 0.14933

# Observed perfectly matches Expected
O1 = np.array(E) * 286
fei(O1, E) # == 0

# Observed deviates maximally from Expected:
O2 = [0,0,0,0,286]
fei(O2,E) # ~= 1
	"""
	Python implementation of the פ (Fei) effect size for goodness-of-fit tests.
	This is a beta version. Confidence intervals may differ from the original R implementation (see the "Returns" section below).

	Reference: Ben-Shachar, M.S.; Patil, I.; Thériault, R.; Wiernik, B.M.; Lüdecke, D. Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi-Squared Statistic. Mathematics 2023, 11, 1982. https://doi.org/10.3390/math11091982
	For the original R implementation of Fei, see the "effectsize" library https://easystats.github.io/effectsize/reference/phi.html.
	"""
	from scipy.stats import chisquare, ncx2
	from scipy.optimize import least_squares
	from collections import namedtuple
	import numpy as np
	import math
	def fei(obs, exp, ci=0.95, alternative="greater"):
	"""
	Parameters
	----------
	obs : list or numpy array
	Observed counts. Must be a one dimensional table of counts.
	Example: obs=[10,50,120,20]
	exp : list or numpy array
	Expected counts or probabilities. Must be a one dimensional table of counts or probabilities.
	Examples:
	uniform counts: exp=[50, 50, 50, 50]
	uniform probabilities: exp=[0.25, 0.25, 0.25, 0.25]
	non-uniform counts: exp=[50,10,90,50]
	non-uniform probabilities: exp=[0.25, 0.05, 0.45, 0.25]
	ci: float
	Confidence interval level.
	alternative: string
	Alternative hypothesis. Controls the type of confidence interval returned.
	Possible values:
	- "greater" (default)
	- "two.sided"
	- "less"
	If "greater" is selected, the upper confidence interval limit will always be 1.
	If "less" is selected, the lower confidence interval limit will always be 0.


	Returns
	-------
	statistic : float
	Returns the fei coefficient, as described by Ben-Shachar et al. (2023).
	interval: tuple of two floats
	Returns the confidence interval.
	Note that if "less" set as the alternative, the lower interval limit will always be 0, while if it's set
	to "greater", the upper interval limit will always be 1.
	The confidence interval is calculated by using the non-centrality parameter method (aka the "pivot method"),
	similarly to the R implementation of Fei in the library effectsize and to what is described in Ben-Shachar et al. (2023).
	However, the "pivot" calculation uses SciPy's least squares optimization function, which may result in
	different confidence intervals than the R implementation of Fei in a few cases.
	For details about SciPy's least squares optimization function, see https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.least_squares.html

	Reference
	-------
	Ben-Shachar, M.S.; Patil, I.; Thériault, R.; Wiernik, B.M.; Lüdecke, D. Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data That Use the Chi-Squared Statistic. Mathematics 2023, 11, 1982. https://doi.org/10.3390/math11091982
	"""

	result_tuple = namedtuple('Fei', ['statistic', 'interval'])
	# =============================================================================
	# Fei statistic calculations
	# =============================================================================
	obs = np.array(obs)
	exp = np.array(exp)
	if len(obs.shape) > 1 or len(exp.shape) >1:
	raise ValueError(f"Observed counts and expected counts/probabilities must be both one-dimensional tables. You passed a {len(obs.shape)}-dimensional table for observed counts, and a {len(exp.shape)}-dimensional table for expected counts/probabilities.")
	N=obs.sum()
	if exp.sum() <= 1:
	exp = exp*N
	exp_probas = exp/exp.sum()
	chi, p = chisquare(f_obs=obs, f_exp=exp)
	min_p = exp_probas.min()
	fei = np.sqrt(chi/(N*((1/min_p )-1)))

	# =============================================================================
	# confidence intervals calculations
	# =============================================================================
	def ncp_fun(nc, chi, dof, q):
	#internal function to calculate confidence intervals according to the "pivot method"
	p1 = ncx2.cdf(chi,df=dof, nc=nc)
	res = p1 - q
	return res
	alpha = 1-ci
	dof = len(obs)-1
	lower_ci = 0
	upper_ci = 1

	if alternative in ["two-sided", "two.sided"]:
	upper_ncp = least_squares(ncp_fun,
	chi,
	args=(chi, dof, alpha/2),
	bounds = (chi, N * ((1/min_p)-1)),
	).x[0]
	upper_ci = np.sqrt(upper_ncp/ (N * ((1/min_p)-1)))
	if chi == 0 or \
	math.isclose(ncx2.ppf(1-alpha, df=dof, nc=0, loc=0, scale=1), chi,
	abs_tol=1e-3):
	lower_ncp = 0
	else:
	lower_ncp = least_squares(ncp_fun,
	chi,
	args=(chi, dof, 1-alpha/2),
	bounds=(-0.0,chi),
	).x[0]
	lower_ci = np.sqrt(lower_ncp/ (N * ((1/min_p)-1)))

	elif alternative == "less":
	upper_ncp = least_squares(ncp_fun,
	chi,
	args=(chi, dof, alpha),
	bounds=(chi,N * ((1/min_p)-1)),
	).x[0]
	upper_ci= np.sqrt(upper_ncp / (N * ((1/min_p)-1)))

	elif alternative == "greater":

	if chi == 0 or \
	math.isclose(ncx2.ppf(1-alpha, df=dof, nc=0, loc=0, scale=1), chi,
	abs_tol=1e-3):
	upper_ncp = 0
	else:
	upper_ncp= least_squares(ncp_fun,
	chi,
	args=(chi, dof, 1-alpha),
	bounds=(0,chi),

	).x[0]

	lower_ci= np.sqrt(upper_ncp / (N * ((1/min_p)-1)))

	result = result_tuple(fei, (lower_ci, upper_ci))
	return result
	# =============================================================================
	# Examples from https://easystats.github.io/effectsize/articles/xtabs.html#goodness-of-fit
	# =============================================================================
	O = [89, 37, 130, 28, 2] # observed group sizes
	E = [.40, .20, .20, .15, .05] # expected group freq
	fei(O, E) # == 0.14933

	# Observed perfectly matches Expected
	O1 = np.array(E) * 286
	fei(O1, E) # == 0

	# Observed deviates maximally from Expected:
	O2 = [0,0,0,0,286]
	fei(O2,E) # ~= 1