Likelihood ratio tests for Python statsmodels GLMs
# This version is more complicated, but can handle > 2 models and throws an error on non-nested models.
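# The test statistic at each step is -2 * (ll_simpler - ll_richer); under the
# null hypothesis this is asymptotically chi-square distributed, with degrees
# of freedom equal to the number of added parameters (Wilks' theorem). That is
# the distribution stats.chi2.sf evaluates below.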
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
import pandas as pd
def likelihood_ratio_test(*models):
    '''Model comparison via likelihood ratio chi-square test

    See https://en.wikipedia.org/wiki/Likelihood-ratio_test

    Arguments:
        *models: The nested models you want to compare,
            e.g. likelihood_ratio_test(m0, m1, m2)

    Returns:
        pandas DataFrame with columns
        - `new_parameters` (names of new parameters added at each step),
        - `test_statistic`,
        - `df` (number of parameters added),
        - `p_value` (for the null hypothesis that adding these parameters
          does not improve model fit).
    '''
    logliks = [m.llf for m in models]
    degrees_of_freedom = [m.df_resid for m in models]
    delta_ll = np.diff(logliks) * -1   # ll_i - ll_{i+1}
    delta_df = np.diff(degrees_of_freedom) * -1  # Parameters added at each step
    test_statistics = -2 * delta_ll
    p_values = [stats.chi2.sf(x, df) for x, df in zip(test_statistics, delta_df)]
    # Get names of new parameters added in each model
    parameter_names = [m.params.index.values for m in models]
    new_parameters = [
        np.setdiff1d(parameter_names[i + 1], parameter_names[i]).tolist()
        for i in range(len(parameter_names) - 1)
    ]
    new_parameters = [repr(p) for p in new_parameters]  # Could be made prettier
    # Ensure the models are nested:
    # all parameters in model i should also be in model i+1
    non_nested_parameters = [
        np.setdiff1d(parameter_names[i], parameter_names[i + 1]).tolist()
        for i in range(len(parameter_names) - 1)
    ]
    some_non_nested = np.any([len(p) > 0 for p in non_nested_parameters])
    if some_non_nested:
        raise ValueError('Model parameters are not nested')
    return pd.DataFrame({'new_parameters': new_parameters,
                         'test_statistic': test_statistics,
                         'df': delta_df,
                         'p_value': p_values})
# Downloading example data from https://stats.oarc.ucla.edu/r/dae/poisson-regression
awards = pd.read_csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
model_formulae = ['num_awards ~ 1',
'num_awards ~ C(prog)',
'num_awards ~ C(prog) + math']
m0, m1, m2 = [smf.poisson(f, data = awards).fit(disp=0) for f in model_formulae]
likelihood_ratio_test(m0, m1, m2)
# new_parameters test_statistic df p_value
# 0 ['C(prog)[T.2]', 'C(prog)[T.3]'] 53.212261 2.0 2.786791e-12
# 1 ['math'] 45.010354 1.0 1.959954e-11
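# A quick sketch (not part of the original example) of the nesting check: these
# two models each contain a parameter the other lacks, so the function should
# raise an error rather than report a meaningless test.
m_a = smf.poisson('num_awards ~ C(prog)', data=awards).fit(disp=0)
m_b = smf.poisson('num_awards ~ math', data=awards).fit(disp=0)
try:
    likelihood_ratio_test(m_a, m_b)
except ValueError as e:
    print(e)  # Model parameters are not nested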
# Simpler version: compares exactly two models.
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
import pandas as pd
def likelihood_ratio_test(model0, model1):
    '''Model comparison via likelihood ratio chi-square test

    I have no idea why this still isn't implemented in statsmodels...
    See https://en.wikipedia.org/wiki/Likelihood-ratio_test

    Arguments:
        model0: Model without the parameters of interest
        model1: Model with the parameters of interest

    Returns:
        Single-row pandas DataFrame with `test_statistic`,
        `df` (number of additional parameters in model1), and
        `p_value` for the null hypothesis that adding these parameters
        does not improve model fit.
    '''
    models = [model0, model1]
    ll_0, ll_1 = [m.llf for m in models]
    df_0, df_1 = [m.df_resid for m in models]
    delta_ll = ll_0 - ll_1
    delta_df = df_0 - df_1  # Number of parameters added in model1
    test_statistic = -2 * delta_ll
    p_value = stats.chi2.sf(test_statistic, df=delta_df)
    # Nice label for the parameters added in model1
    parameter_names = [m.params.index.values for m in models]
    new_parameters = np.setdiff1d(parameter_names[1], parameter_names[0])
    return pd.DataFrame({'new_parameters': str(new_parameters),
                         'test_statistic': test_statistic,
                         'df': delta_df,
                         'p_value': p_value}, index=[0])
# Example 1
data = pd.DataFrame({'group': ['A', 'B', 'C'], 'count': [100, 110, 150]})
m_null = smf.poisson('count ~ 1', data = data).fit()
m_alt = smf.poisson('count ~ group', data = data).fit()
print(likelihood_ratio_test(m_null, m_alt))
# new_parameters test_statistic df p_value
# 0 ['group[T.B]' 'group[T.C]'] 11.336251 2.0 0.003454
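# The same numbers by hand (illustrative sketch): the statistic is
# -2 * (ll_null - ll_alt), referred to a chi-square distribution with
# (df_resid_null - df_resid_alt) degrees of freedom.
stat = -2 * (m_null.llf - m_alt.llf)
print(stat, stats.chi2.sf(stat, df=m_null.df_resid - m_alt.df_resid))
# 11.336251 0.003454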
# Example 2 - With df = 1, the result matches the LLR p-value reported by statsmodels
data2 = pd.DataFrame({'group': ['A', 'B'], 'count': [100, 125]})
m_null = smf.poisson('count ~ 1', data = data2).fit()
m_alt = smf.poisson('count ~ group', data = data2).fit()
result = likelihood_ratio_test(m_null, m_alt)
print(m_alt.summary())  # Note the Wald p = 0.096 for group[T.B], and LLR p-value = 0.09524
# Poisson Regression Results
# ==============================================================================
# Dep. Variable: count No. Observations: 2
# Model: Poisson Df Residuals: 0
# Method: MLE Df Model: 1
# Date: Thu, 04 Aug 2022 Pseudo R-squ.: 0.1751
# Time: 11:32:39 Log-Likelihood: -6.5561
# converged: True LL-Null: -7.9479
# Covariance Type: nonrobust LLR p-value: 0.09524
# ==============================================================================
# coef std err z P>|z| [0.025 0.975]
# ------------------------------------------------------------------------------
# Intercept 4.6052 0.100 46.052 0.000 4.409 4.801
# group[T.B] 0.2231 0.134 1.663 0.096 -0.040 0.486
# ==============================================================================
print(result)
# new_parameters test_statistic df p_value
# 0 ['group[T.B]'] 2.783522 1.0 0.095239
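# Cross-check: when the comparison is against the intercept-only null model,
# statsmodels already exposes the same likelihood ratio test on the fitted
# results as `llr` and `llr_pvalue`; these should reproduce the numbers above
# (and the LLR p-value of 0.09524 printed in the summary).
print(m_alt.llr, m_alt.llr_pvalue)
# 2.783522 0.095239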