Likelihood ratio tests for Python statsmodels GLMs
# This version is more complicated, but can handle > 2 models and throws an error on non-nested models.
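# The test statistic at each step is -2 * (ll_simpler - ll_richer); under the
# null hypothesis this is asymptotically chi-square distributed, with degrees
# of freedom equal to the number of added parameters (Wilks' theorem). That is
# the distribution stats.chi2.sf evaluates below.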
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
import pandas as pd
def likelihood_ratio_test(*models):
    '''Model comparison via likelihood ratio chi-square test

    See https://en.wikipedia.org/wiki/Likelihood-ratio_test

    Arguments:
        *models: The nested models you want to compare,
            e.g. likelihood_ratio_test(m0, m1, m2)

    Returns:
        pandas DataFrame with columns
        - `new_parameters` (names of new parameters added at each step),
        - `test_statistic`,
        - `df` (number of parameters added),
        - `p_value` (for the null hypothesis that adding these parameters
          does not improve model fit).
    '''
    logliks = [m.llf for m in models]
    degrees_of_freedom = [m.df_resid for m in models]
    delta_ll = np.diff(logliks) * -1   # ll_i - ll_{i+1}
    delta_df = np.diff(degrees_of_freedom) * -1  # Parameters added at each step
    test_statistics = -2 * delta_ll
    p_values = [stats.chi2.sf(x, df) for x, df in zip(test_statistics, delta_df)]
    # Get names of new parameters added in each model
    parameter_names = [m.params.index.values for m in models]
    new_parameters = [
        np.setdiff1d(parameter_names[i + 1], parameter_names[i]).tolist()
        for i in range(len(parameter_names) - 1)
    ]
    new_parameters = [repr(p) for p in new_parameters]  # Could be made prettier
    # Ensure the models are nested:
    # all parameters in model i should also be in model i+1
    non_nested_parameters = [
        np.setdiff1d(parameter_names[i], parameter_names[i + 1]).tolist()
        for i in range(len(parameter_names) - 1)
    ]
    some_non_nested = np.any([len(p) > 0 for p in non_nested_parameters])
    if some_non_nested:
        raise ValueError('Model parameters are not nested')
    return pd.DataFrame({'new_parameters': new_parameters,
                         'test_statistic': test_statistics,
                         'df': delta_df,
                         'p_value': p_values})
# Downloading example data from https://stats.oarc.ucla.edu/r/dae/poisson-regression
awards = pd.read_csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
model_formulae = ['num_awards ~ 1',
'num_awards ~ C(prog)',
'num_awards ~ C(prog) + math']
m0, m1, m2 = [smf.poisson(f, data = awards).fit(disp=0) for f in model_formulae]
likelihood_ratio_test(m0, m1, m2)
# new_parameters test_statistic df p_value
# 0 ['C(prog)[T.2]', 'C(prog)[T.3]'] 53.212261 2.0 2.786791e-12
# 1 ['math'] 45.010354 1.0 1.959954e-11
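# A quick sketch (not part of the original example) of the nesting check: these
# two models each contain a parameter the other lacks, so the function should
# raise an error rather than report a meaningless test.
m_a = smf.poisson('num_awards ~ C(prog)', data=awards).fit(disp=0)
m_b = smf.poisson('num_awards ~ math', data=awards).fit(disp=0)
try:
    likelihood_ratio_test(m_a, m_b)
except ValueError as e:
    print(e)  # Model parameters are not nested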
# Simpler version: compares exactly two models.
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
import pandas as pd
def likelihood_ratio_test(model0, model1):
    '''Model comparison via likelihood ratio chi-square test

    I have no idea why this still isn't implemented in statsmodels...
    See https://en.wikipedia.org/wiki/Likelihood-ratio_test

    Arguments:
        model0: Model without the parameters of interest
        model1: Model with the parameters of interest

    Returns:
        Single-row pandas DataFrame with `test_statistic`,
        `df` (number of additional parameters in model1), and
        `p_value` for the null hypothesis that adding these parameters
        does not improve model fit.
    '''
    models = [model0, model1]
    ll_0, ll_1 = [m.llf for m in models]
    df_0, df_1 = [m.df_resid for m in models]
    delta_ll = ll_0 - ll_1
    delta_df = df_0 - df_1  # Number of parameters added in model1
    test_statistic = -2 * delta_ll
    p_value = stats.chi2.sf(test_statistic, df=delta_df)
    # Nice label for the parameters added in model1
    parameter_names = [m.params.index.values for m in models]
    new_parameters = np.setdiff1d(parameter_names[1], parameter_names[0])
    return pd.DataFrame({'new_parameters': str(new_parameters),
                         'test_statistic': test_statistic,
                         'df': delta_df,
                         'p_value': p_value}, index=[0])
# Example 1
data = pd.DataFrame({'group': ['A', 'B', 'C'], 'count': [100, 110, 150]})
m_null = smf.poisson('count ~ 1', data = data).fit()
m_alt = smf.poisson('count ~ group', data = data).fit()
print(likelihood_ratio_test(m_null, m_alt))
# new_parameters test_statistic df p_value
# 0 ['group[T.B]' 'group[T.C]'] 11.336251 2.0 0.003454
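# The same numbers by hand (illustrative sketch): the statistic is
# -2 * (ll_null - ll_alt), referred to a chi-square distribution with
# (df_resid_null - df_resid_alt) degrees of freedom.
stat = -2 * (m_null.llf - m_alt.llf)
print(stat, stats.chi2.sf(stat, df=m_null.df_resid - m_alt.df_resid))
# 11.336251 0.003454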
# Example 2 - With df = 1, the result matches the LLR p-value reported by statsmodels
data2 = pd.DataFrame({'group': ['A', 'B'], 'count': [100, 125]})
m_null = smf.poisson('count ~ 1', data = data2).fit()
m_alt = smf.poisson('count ~ group', data = data2).fit()
result = likelihood_ratio_test(m_null, m_alt)
print(m_alt.summary())  # Note the Wald p = 0.096 for group[T.B], and LLR p-value = 0.09524
# Poisson Regression Results
# ==============================================================================
# Dep. Variable: count No. Observations: 2
# Model: Poisson Df Residuals: 0
# Method: MLE Df Model: 1
# Date: Thu, 04 Aug 2022 Pseudo R-squ.: 0.1751
# Time: 11:32:39 Log-Likelihood: -6.5561
# converged: True LL-Null: -7.9479
# Covariance Type: nonrobust LLR p-value: 0.09524
# ==============================================================================
# coef std err z P>|z| [0.025 0.975]
# ------------------------------------------------------------------------------
# Intercept 4.6052 0.100 46.052 0.000 4.409 4.801
# group[T.B] 0.2231 0.134 1.663 0.096 -0.040 0.486
# ==============================================================================
print(result)
# new_parameters test_statistic df p_value
# 0 ['group[T.B]'] 2.783522 1.0 0.095239
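# Cross-check: when the comparison is against the intercept-only null model,
# statsmodels already exposes the same likelihood ratio test on the fitted
# results as `llr` and `llr_pvalue`; these should reproduce the numbers above
# (and the LLR p-value of 0.09524 printed in the summary).
print(m_alt.llr, m_alt.llr_pvalue)
# 2.783522 0.095239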