Last active
August 4, 2022 14:12
-
-
Save EoinTravers/c9fe5959aae4806d596c1a09c8c9af4d to your computer and use it in GitHub Desktop.
Likelhood ratio tests for python statsmodels GLMs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This version is more complicated, but can handle > 2 models and throws an error on non-nested models. | |
import numpy as np | |
from scipy import stats | |
import statsmodels.formula.api as smf | |
import pandas as pd | |
def likelihood_ratio_test(*models): | |
'''Model comparison via likelihood ratio chi-square test | |
See https://en.wikipedia.org/wiki/Likelihood-ratio_test | |
Arguments: | |
*models: The nested models you want to compared, e.g. likelihood_ratio_test(m0, m1, m2) | |
Returns: | |
pandas DataFrame with columns | |
- `new_parameters` (names of new parameters added at each step), | |
- `test_statistic`, | |
- `df` (number of parameters added) | |
- `p_value` (for the null hypothesis that adding these parameters does not improve model fit). | |
''' | |
logliks = [m.llf for m in models] | |
degrees_of_freedom = [m.df_resid for m in models] | |
delta_ll = np.diff(logliks) * -1 | |
delta_df = np.diff(degrees_of_freedom) * -1 | |
test_statistics = -2 * delta_ll | |
p_values = [stats.chi2.sf(x, df) for x, df, in zip(test_statistics, delta_df)] | |
# Get names of new parameters added in each model | |
parameter_names = [m.params.index.values for m in models] | |
new_parameters = [ | |
np.setdiff1d(parameter_names[i+1], parameter_names[i]).tolist() | |
for i in range(len(parameter_names) - 1) | |
] | |
new_parameters = [repr(p) for p in new_parameters] # Could be made prettier | |
# Ensure the models are nested | |
# All parameters in model i should also be in model i+1 | |
non_nested_parameters = [ | |
np.setdiff1d(parameter_names[i], parameter_names[i+1]).tolist() | |
for i in range(len(parameter_names) - 1) | |
] | |
some_non_nested = np.any([len(p) > 0 for p in non_nested_parameters]) | |
if some_non_nested: | |
raise Exception('Model parameters are not nested') | |
return pd.DataFrame({'new_parameters' : new_parameters, | |
'test_statistic': test_statistics, | |
'df' : delta_df, | |
'p_value' : p_values}) | |
# Downloading example data from https://stats.oarc.ucla.edu/r/dae/poisson-regression | |
awards = pd.read_csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv") | |
model_formulae = ['num_awards ~ 1', | |
'num_awards ~ C(prog)', | |
'num_awards ~ C(prog) + math'] | |
m0, m1, m2 = [smf.poisson(f, data = awards).fit(disp=0) for f in model_formulae] | |
likelihood_ratio_test(m0, m1, m2) | |
# new_parameters test_statistic df p_value | |
# 0 ['C(prog)[T.2]', 'C(prog)[T.3]'] 53.212261 2.0 2.786791e-12 | |
# 1 ['math'] 45.010354 1.0 1.959954e-11 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy import stats | |
import statsmodels.formula.api as smf | |
import pandas as pd | |
def likelihood_ratio_test(model0, model1): | |
'''Model comparison via likelihood ratio chi-square test | |
I have no idea why this still isn't implemented in statsmodels... | |
See https://en.wikipedia.org/wiki/Likelihood-ratio_test | |
Arguments: | |
model0: Model without the parameters of interest | |
model1: Model with the parameters of interest | |
Returns: | |
Single-row pandas DataFrame with `test_statistic`, | |
`df` (number of additional parametrs in model1), and | |
`p_value` for the null hypothesis that adding these parameters | |
does not improve model fit. | |
''' | |
models = [model0, model1] | |
ll_0, ll_1 = [m.llf for m in models] | |
df_0, df_1 = [m.df_resid for m in models] | |
delta_ll = ll_0 - ll_1 | |
delta_df = df_0 - df_1 | |
test_statistic = -2 * delta_ll | |
p_value = stats.chi2.sf(test_statistic, df = delta_df) | |
# Nice label | |
parameter_names = [m.params.index.values for m in models] | |
new_parameters = np.setdiff1d(parameter_names[1], parameter_names[0]) | |
return pd.DataFrame({'new_parameters' : str(new_parameters), | |
'test_statistic': test_statistic, | |
'df' : delta_df, | |
'p_value' : p_value}, index = [0]) | |
# Example 1 | |
data = pd.DataFrame({'group' : ['A', 'B', 'C'],'count' : [100, 110, 150]}) | |
m_null = smf.poisson('count ~ 1', data = data).fit() | |
m_alt = smf.poisson('count ~ group', data = data).fit() | |
print(likelihood_ratio_test(m_null, m_alt)) | |
# new_parameters test_statistic df p_value | |
# 0 ['group[T.B]' 'group[T.C]'] 11.336251 2.0 0.003454 | |
# Example 2 - Results match for DF = 1 | |
data2 = pd.DataFrame({'group' : ['A', 'B'],'count' : [100, 125]}) | |
m_null = smf.poisson('count ~ 1', data = data2).fit() | |
m_alt = smf.poisson('count ~ group', data = data2).fit() | |
result = likelihood_ratio_test(m_null, m_alt) | |
print(m_alt.summary()) # Note p = 0.096 | |
# Poisson Regression Results | |
# ============================================================================== | |
# Dep. Variable: count No. Observations: 2 | |
# Model: Poisson Df Residuals: 0 | |
# Method: MLE Df Model: 1 | |
# Date: Thu, 04 Aug 2022 Pseudo R-squ.: 0.1751 | |
# Time: 11:32:39 Log-Likelihood: -6.5561 | |
# converged: True LL-Null: -7.9479 | |
# Covariance Type: nonrobust LLR p-value: 0.09524 | |
# ============================================================================== | |
# coef std err z P>|z| [0.025 0.975] | |
# ------------------------------------------------------------------------------ | |
# Intercept 4.6052 0.100 46.052 0.000 4.409 4.801 | |
# group[T.B] 0.2231 0.134 1.663 0.096 -0.040 0.486 | |
# ============================================================================== | |
print(likelihood_ratio_test(m_null, m_alt)) | |
# new_parameters test_statistic df p_value | |
# 0 ['group[T.B]'] 2.783522 1.0 0.095239 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment