comparing GLM gaussian using var_weights and WLS (updated after fit_constrained bug fix, rerun based on 3856)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Comparing GLM-Gaussian with WLS\n",
"\n",
"This notebook checks and illustrates the equivalence of GLM-Gaussian using var_weights and WLS.\n",
"\n",
"We only check params and bse systematically and compare a few other results."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import statsmodels.api as sm\n",
"import statsmodels.formula.api as smf\n",
"from statsmodels.iolib.summary2 import summary_col"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data and Model"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = sm.datasets.cpunish.load_pandas()\n",
"endog = data.endog\n",
"data = data.exog\n",
"data['EXECUTIONS'] = endog\n",
"data['INCOME'] /= 1000\n",
"aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2,\n",
" 1])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model = smf.glm(\n",
" 'EXECUTIONS ~ INCOME + SOUTH - 1',\n",
" data=data,\n",
" family=sm.families.Gaussian(link=sm.families.links.identity()),\n",
" var_weights=aweights\n",
")\n",
"wlsmodel = smf.wls(\n",
" 'EXECUTIONS ~ INCOME + SOUTH - 1',\n",
" data=data,\n",
" weights=aweights)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Models with nonrobust standard errors"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"res_glm = model.fit(rtol=1e-25, atol=1e-25, use_t=True)\n",
"res_wls = wlsmodel.fit()"
]
},
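{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick numerical check of the equivalence (a sketch added here, not executed, so no output is stored): compare params and bse of the two nonrobust fits directly with `np.allclose`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# direct numerical check: GLM-Gaussian with var_weights vs WLS (nonrobust)\n",
"print(np.allclose(res_glm.params, res_wls.params))\n",
"print(np.allclose(res_glm.bse, res_wls.bse))"
]
},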
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:44 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.046 1.097 0.290 -0.048 0.150\n",
"SOUTH 2.9783 2.497 1.193 0.252 -2.344 8.301\n",
"==============================================================================\n"
]
}
],
"source": [
"print(res_glm.summary())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 3.269\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.0663\n",
"Time: 19:39:45 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.046 1.097 0.290 -0.048 0.150\n",
"SOUTH 2.9783 2.497 1.193 0.252 -2.344 8.301\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"print(res_wls.summary())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(<class 'statsmodels.stats.contrast.ContrastResults'>\n",
" Test for Constraints \n",
" ==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
" ------------------------------------------------------------------------------\n",
" c0 0.0509 0.046 1.097 0.290 -0.048 0.150\n",
" c1 2.9783 2.497 1.193 0.252 -2.344 8.301\n",
" ==============================================================================,\n",
" <class 'statsmodels.stats.contrast.ContrastResults'>\n",
" Test for Constraints \n",
" ==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
" ------------------------------------------------------------------------------\n",
" c0 0.0509 0.046 1.097 0.290 -0.048 0.150\n",
" c1 2.9783 2.497 1.193 0.252 -2.344 8.301\n",
" ==============================================================================)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm.t_test(np.eye(2)), res_wls.t_test(np.eye(2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Models with heteroscedasticity robust standard errors"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:45 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: HC0\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.014 3.602 0.000 0.023 0.079\n",
"SOUTH 2.9783 1.885 1.580 0.114 -0.717 6.674\n",
"==============================================================================\n",
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 8.471\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.00345\n",
"Time: 19:39:45 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: HC0 \n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.014 3.602 0.000 0.023 0.079\n",
"SOUTH 2.9783 1.885 1.580 0.114 -0.717 6.674\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors are heteroscedasticity robust (HC0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"res_glm_hc = model.fit(rtol=1e-25, atol=1e-25, cov_type='HC0')\n",
"res_wls_hc = wlsmodel.fit(cov_type='HC0')\n",
"print(res_glm_hc.summary())\n",
"print(res_wls_hc.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The default distribution for inference if a nonrobust cov_type is selected is the normal distribution. We can switch back to the t distribution by adding the keyword argument `use_t=True` in the call of fit."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:45 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: HC0\n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.014 3.602 0.003 0.021 0.081\n",
"SOUTH 2.9783 1.885 1.580 0.135 -1.040 6.997\n",
"==============================================================================\n",
"\n",
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 8.471\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.00345\n",
"Time: 19:39:45 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: HC0 \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.014 3.602 0.003 0.021 0.081\n",
"SOUTH 2.9783 1.885 1.580 0.135 -1.040 6.997\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors are heteroscedasticity robust (HC0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"res_glm_hc = model.fit(rtol=1e-25, atol=1e-25, cov_type='HC0', use_t=True)\n",
"res_wls_hc = wlsmodel.fit(cov_type='HC0', use_t=True)\n",
"print(res_glm_hc.summary())\n",
"print()\n",
"print(res_wls_hc.summary())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME -1.561251e-17\n",
"SOUTH -2.664535e-15\n",
"dtype: float64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_hc.bse - res_wls_hc.bse"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(109.50710112028318, 109.50710112028317)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_hc.aic, res_wls_hc.aic"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1217.3661709162607, 111.1735278083956)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_hc.bic, res_wls_hc.bic"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(1, 2.0)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_hc.df_model, res_wls_hc.df_model"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(15, 15.0)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_hc.df_resid, res_wls_hc.df_resid"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summary table with nonrobust and HC"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>glm</th> <th>wls</th> <th>glm_hc</th> <th>wls_hc</th> \n",
"</tr>\n",
"<tr>\n",
" <th>INCOME</th> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> \n",
"</tr>\n",
"<tr>\n",
" <th></th> <td>(0.0464)</td> <td>(0.0464)</td> <td>(0.0141)</td> <td>(0.0141)</td>\n",
"</tr>\n",
"<tr>\n",
" <th>SOUTH</th> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> \n",
"</tr>\n",
"<tr>\n",
" <th></th> <td>(2.4972)</td> <td>(2.4972)</td> <td>(1.8854)</td> <td>(1.8854)</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<class 'statsmodels.iolib.summary2.Summary'>\n",
"\"\"\"\n",
"\n",
"==========================================\n",
" glm wls glm_hc wls_hc \n",
"------------------------------------------\n",
"INCOME 0.0509 0.0509 0.0509 0.0509 \n",
" (0.0464) (0.0464) (0.0141) (0.0141)\n",
"SOUTH 2.9783 2.9783 2.9783 2.9783 \n",
" (2.4972) (2.4972) (1.8854) (1.8854)\n",
"==========================================\n",
"Standard errors in parentheses.\n",
"\"\"\""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summary_col([res_glm, res_wls, res_glm_hc, res_wls_hc], \n",
" model_names='glm wls glm_hc wls_hc'.split())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#summary_col?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Models with other robust standard errors\n",
"\n",
"Note that we are just making up some cluster or serial correlation, this has no meaning in the actual data used in the example."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:45 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: cluster\n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.024 2.081 0.129 -0.027 0.129\n",
"SOUTH 2.9783 1.441 2.067 0.131 -1.607 7.564\n",
"==============================================================================\n",
"\n",
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 2.326\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.245\n",
"Time: 19:39:45 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: cluster \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.024 2.081 0.129 -0.027 0.129\n",
"SOUTH 2.9783 1.441 2.067 0.131 -1.607 7.564\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors are robust tocluster correlation (cluster)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"m:\\josef_new\\eclipse_ws\\statsmodels\\statsmodels_py34_pr\\statsmodels\\genmod\\generalized_linear_model.py:1445: SpecificationWarning: cov_type not fully supported with var_weights\n",
" SpecificationWarning)\n",
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"cov_type='cluster'\n",
"cov_kwds = {'groups': (np.arange(model.nobs) // 5)}\n",
"\n",
"res_glm_clu = model.fit(rtol=1e-25, atol=1e-25, cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"res_wls_clu = wlsmodel.fit(cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"print(res_glm_clu.summary())\n",
"print()\n",
"print(res_wls_clu.summary())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:45 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: hac\n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.018 2.780 0.014 0.012 0.090\n",
"SOUTH 2.9783 1.972 1.510 0.152 -1.226 7.182\n",
"==============================================================================\n",
"\n",
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 4.068\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.0388\n",
"Time: 19:39:45 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: hac \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.018 2.780 0.014 0.012 0.090\n",
"SOUTH 2.9783 1.972 1.510 0.152 -1.226 7.182\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 2 lags and with small sample correction\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"m:\\josef_new\\eclipse_ws\\statsmodels\\statsmodels_py34_pr\\statsmodels\\genmod\\generalized_linear_model.py:1445: SpecificationWarning: cov_type not fully supported with var_weights\n",
" SpecificationWarning)\n",
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"cov_type='hac'\n",
"cov_kwds = {'maxlags': 2, 'use_correction':True}\n",
"\n",
"res_glm_hac = model.fit(rtol=1e-25, atol=1e-25, cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"res_wls_hac = wlsmodel.fit(cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"print(res_glm_hac.summary())\n",
"print()\n",
"print(res_wls_hac.summary())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:46 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: hac-panel\n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.016 3.244 0.048 0.001 0.101\n",
"SOUTH 2.9783 1.926 1.547 0.220 -3.150 9.107\n",
"==============================================================================\n",
"\n",
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 5.997\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.0895\n",
"Time: 19:39:46 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: hac-panel \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.016 3.244 0.048 0.001 0.101\n",
"SOUTH 2.9783 1.926 1.547 0.220 -3.150 9.107\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors are robust tocluster correlation (hac-panel)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"m:\\josef_new\\eclipse_ws\\statsmodels\\statsmodels_py34_pr\\statsmodels\\genmod\\generalized_linear_model.py:1445: SpecificationWarning: cov_type not fully supported with var_weights\n",
" SpecificationWarning)\n",
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"cov_type='hac-panel'\n",
"cov_kwds = {'groups': (np.arange(model.nobs) // 5), 'maxlags': 1, 'use_correction':True}\n",
"\n",
"res_glm_hacp = model.fit(rtol=1e-25, atol=1e-25, cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"res_wls_hacp = wlsmodel.fit(cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"print(res_glm_hacp.summary())\n",
"print()\n",
"print(res_wls_hacp.summary())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 83.991\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:46 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: hac-groupsum\n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.006 8.867 0.000 0.036 0.066\n",
"SOUTH 2.9783 0.262 11.352 0.000 2.304 3.653\n",
"==============================================================================\n",
"\n",
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS R-squared: 0.304\n",
"Model: WLS Adj. R-squared: 0.211\n",
"Method: Least Squares F-statistic: 241.0\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 1.07e-05\n",
"Time: 19:39:46 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 109.5\n",
"Df Residuals: 15 BIC: 111.2\n",
"Df Model: 2 \n",
"Covariance Type: hac-groupsum \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.006 8.867 0.000 0.036 0.066\n",
"SOUTH 2.9783 0.262 11.352 0.000 2.304 3.653\n",
"==============================================================================\n",
"Omnibus: 33.202 Durbin-Watson: 0.741\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 65.147\n",
"Skew: 2.820 Prob(JB): 7.14e-15\n",
"Kurtosis: 10.756 Cond. No. 67.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Driscoll and Kraay Standard Errors are robust to cluster correlation (hac-groupsum)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"m:\\josef_new\\eclipse_ws\\statsmodels\\statsmodels_py34_pr\\statsmodels\\genmod\\generalized_linear_model.py:1445: SpecificationWarning: cov_type not fully supported with var_weights\n",
" SpecificationWarning)\n",
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"cov_type='hac-groupsum'\n",
"cov_kwds = {'time': np.tile(np.arange(3), 6)[:model.nobs], 'maxlags': 2, 'use_correction':True}\n",
"\n",
"res_glm_hacg = model.fit(rtol=1e-25, atol=1e-25, cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"res_wls_hacg = wlsmodel.fit(cov_type=cov_type, cov_kwds=cov_kwds, use_t=True)\n",
"print(res_glm_hacg.summary())\n",
"print()\n",
"print(res_wls_hacg.summary())"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'hac-groupsum'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_hacg.cov_type"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summary other robust standard errors"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>glm_clu</th> <th>wls_clu</th> <th>glm_hac</th> <th>wls_hac</th> <th>glm_hacp</th> <th>wls_hacp</th> <th>glm_hacg</th> <th>wls_hacg</th>\n",
"</tr>\n",
"<tr>\n",
" <th>INCOME</th> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> <td>0.0509</td> \n",
"</tr>\n",
"<tr>\n",
" <th></th> <td>(0.0245)</td> <td>(0.0245)</td> <td>(0.0183)</td> <td>(0.0183)</td> <td>(0.0157)</td> <td>(0.0157)</td> <td>(0.0057)</td> <td>(0.0057)</td>\n",
"</tr>\n",
"<tr>\n",
" <th>SOUTH</th> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> <td>2.9783</td> \n",
"</tr>\n",
"<tr>\n",
" <th></th> <td>(1.4409)</td> <td>(1.4409)</td> <td>(1.9724)</td> <td>(1.9724)</td> <td>(1.9257)</td> <td>(1.9257)</td> <td>(0.2624)</td> <td>(0.2624)</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<class 'statsmodels.iolib.summary2.Summary'>\n",
"\"\"\"\n",
"\n",
"==============================================================================\n",
" glm_clu wls_clu glm_hac wls_hac glm_hacp wls_hacp glm_hacg wls_hacg\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0509 0.0509 0.0509 0.0509 0.0509 0.0509 0.0509 0.0509 \n",
" (0.0245) (0.0245) (0.0183) (0.0183) (0.0157) (0.0157) (0.0057) (0.0057)\n",
"SOUTH 2.9783 2.9783 2.9783 2.9783 2.9783 2.9783 2.9783 2.9783 \n",
" (1.4409) (1.4409) (1.9724) (1.9724) (1.9257) (1.9257) (0.2624) (0.2624)\n",
"==============================================================================\n",
"Standard errors in parentheses.\n",
"\"\"\""
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_list = [res_glm, res_wls, res_glm_hc, res_wls_hc, res_glm_clu, res_wls_clu, \n",
" res_glm_hac, res_wls_hac, res_glm_hacp, res_wls_hacp,\n",
" res_glm_hacg, res_wls_hacg]\n",
"model_names=('glm wls glm_hc wls_hc glm_clu wls_clu glm_hac wls_hac '\n",
" 'glm_hacp wls_hacp glm_hacg wls_hacg').split()\n",
"summary_col(model_list[4:], model_names=model_names[4:])"
]
},
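{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick numerical check (a sketch added here, not executed, so no output is stored): loop over the GLM/WLS pairs in `model_list` and verify that the robust standard errors agree."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# pairwise check: each GLM result should match the corresponding WLS result\n",
"for res_g, res_w, name_g, name_w in zip(model_list[::2], model_list[1::2],\n",
"                                        model_names[::2], model_names[1::2]):\n",
"    print(name_g, name_w, np.allclose(res_g.bse, res_w.bse))"
]
},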
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Interlude: some background information on robust standard errors and a list of methods and attributes"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on method get_robustcov_results in module statsmodels.regression.linear_model:\n",
"\n",
"get_robustcov_results(cov_type='HC1', use_t=None, **kwds) method of statsmodels.regression.linear_model.RegressionResults instance\n",
" create new results instance with robust covariance as default\n",
" \n",
" Parameters\n",
" ----------\n",
" cov_type : string\n",
" the type of robust sandwich estimator to use. see Notes below\n",
" use_t : bool\n",
" If true, then the t distribution is used for inference.\n",
" If false, then the normal distribution is used.\n",
" If `use_t` is None, then an appropriate default is used, which is\n",
" `true` if the cov_type is nonrobust, and `false` in all other\n",
" cases.\n",
" kwds : depends on cov_type\n",
" Required or optional arguments for robust covariance calculation.\n",
" see Notes below\n",
" \n",
" Returns\n",
" -------\n",
" results : results instance\n",
" This method creates a new results instance with the\n",
" requested robust covariance as the default covariance of\n",
" the parameters. Inferential statistics like p-values and\n",
" hypothesis tests will be based on this covariance matrix.\n",
" \n",
" Notes\n",
" -----\n",
" The following covariance types and required or optional arguments are\n",
" currently available:\n",
" \n",
" - 'fixed scale' and optional keyword argument 'scale' which uses\n",
" a predefined scale estimate with default equal to one.\n",
" - 'HC0', 'HC1', 'HC2', 'HC3' and no keyword arguments:\n",
" heteroscedasticity robust covariance\n",
" - 'HAC' and keywords\n",
" \n",
" - `maxlag` integer (required) : number of lags to use\n",
" - `kernel` string (optional) : kernel, default is Bartlett\n",
" - `use_correction` bool (optional) : If true, use small sample\n",
" correction\n",
" \n",
" - 'cluster' and required keyword `groups`, integer group indicator\n",
" \n",
" - `groups` array_like, integer (required) :\n",
" index of clusters or groups\n",
" - `use_correction` bool (optional) :\n",
" If True the sandwich covariance is calculated with a small\n",
" sample correction.\n",
" If False the sandwich covariance is calculated without\n",
" small sample correction.\n",
" - `df_correction` bool (optional)\n",
" If True (default), then the degrees of freedom for the\n",
" inferential statistics and hypothesis tests, such as\n",
" pvalues, f_pvalue, conf_int, and t_test and f_test, are\n",
" based on the number of groups minus one instead of the\n",
" total number of observations minus the number of explanatory\n",
" variables. `df_resid` of the results instance is adjusted.\n",
" If False, then `df_resid` of the results instance is not\n",
" adjusted.\n",
" \n",
" - 'hac-groupsum' Driscoll and Kraay, heteroscedasticity and\n",
" autocorrelation robust standard errors in panel data\n",
" keywords\n",
" \n",
" - `time` array_like (required) : index of time periods\n",
" - `maxlag` integer (required) : number of lags to use\n",
" - `kernel` string (optional) : kernel, default is Bartlett\n",
" - `use_correction` False or string in ['hac', 'cluster'] (optional) :\n",
" If False the the sandwich covariance is calulated without\n",
" small sample correction.\n",
" If `use_correction = 'cluster'` (default), then the same\n",
" small sample correction as in the case of 'covtype='cluster''\n",
" is used.\n",
" - `df_correction` bool (optional)\n",
" adjustment to df_resid, see cov_type 'cluster' above\n",
" # TODO: we need more options here\n",
" \n",
" - 'hac-panel' heteroscedasticity and autocorrelation robust standard\n",
" errors in panel data.\n",
" The data needs to be sorted in this case, the time series\n",
" for each panel unit or cluster need to be stacked. The\n",
" membership to a timeseries of an individual or group can\n",
" be either specified by group indicators or by increasing\n",
" time periods.\n",
" \n",
" keywords\n",
" \n",
" - either `groups` or `time` : array_like (required)\n",
" `groups` : indicator for groups\n",
" `time` : index of time periods\n",
" - `maxlag` integer (required) : number of lags to use\n",
" - `kernel` string (optional) : kernel, default is Bartlett\n",
" - `use_correction` False or string in ['hac', 'cluster'] (optional) :\n",
" If False the sandwich covariance is calculated without\n",
" small sample correction.\n",
" - `df_correction` bool (optional)\n",
" adjustment to df_resid, see cov_type 'cluster' above\n",
" # TODO: we need more options here\n",
" \n",
" Reminder:\n",
" `use_correction` in \"hac-groupsum\" and \"hac-panel\" is not bool,\n",
" needs to be in [False, 'hac', 'cluster']\n",
" \n",
" TODO: Currently there is no check for extra or misspelled keywords,\n",
" except in the case of cov_type `HCx`\n",
"\n"
]
}
],
"source": [
"help(res_wls.get_robustcov_results)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.tile(np.arange(3), 6)[:model.nobs]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['__class__',\n",
" '__delattr__',\n",
" '__dict__',\n",
" '__dir__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__le__',\n",
" '__lt__',\n",
" '__module__',\n",
" '__ne__',\n",
" '__new__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__setattr__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__subclasshook__',\n",
" '__weakref__',\n",
" '_check_inputs',\n",
" '_data_attr',\n",
" '_estimate_x2_scale',\n",
" '_fit_gradient',\n",
" '_fit_irls',\n",
" '_get_init_kwds',\n",
" '_handle_data',\n",
" '_has_freq_weights',\n",
" '_has_var_weights',\n",
" '_init_keys',\n",
" '_offset_exposure',\n",
" '_setup_binomial',\n",
" '_update_history',\n",
" 'data',\n",
" 'df_model',\n",
" 'df_resid',\n",
" 'endog',\n",
" 'endog_names',\n",
" 'estimate_scale',\n",
" 'estimate_tweedie_power',\n",
" 'exog',\n",
" 'exog_names',\n",
" 'family',\n",
" 'fit',\n",
" 'fit_constrained',\n",
" 'fit_regularized',\n",
" 'formula',\n",
" 'freq_weights',\n",
" 'from_formula',\n",
" 'get_distribution',\n",
" 'hessian',\n",
" 'hessian_factor',\n",
" 'history',\n",
" 'information',\n",
" 'initialize',\n",
" 'iweights',\n",
" 'k_constant',\n",
" 'loglike',\n",
" 'loglike_mu',\n",
" 'mu',\n",
" 'n_trials',\n",
" 'nobs',\n",
" 'normalized_cov_params',\n",
" 'pinv_wexog',\n",
" 'predict',\n",
" 'scale',\n",
" 'scaletype',\n",
" 'score',\n",
" 'score_factor',\n",
" 'score_obs',\n",
" 'score_test',\n",
" 'var_weights',\n",
" 'weights',\n",
" 'wnobs']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(model)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['__class__',\n",
" '__delattr__',\n",
" '__dict__',\n",
" '__dir__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__le__',\n",
" '__lt__',\n",
" '__module__',\n",
" '__ne__',\n",
" '__new__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__setattr__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__subclasshook__',\n",
" '__weakref__',\n",
" '_cache',\n",
" '_data_attr',\n",
" '_data_attr_model',\n",
" '_endog',\n",
" '_freq_weights',\n",
" '_get_robustcov_results',\n",
" '_iweights',\n",
" '_n_trials',\n",
" '_var_weights',\n",
" 'aic',\n",
" 'bic',\n",
" 'bse',\n",
" 'conf_int',\n",
" 'converged',\n",
" 'cov_kwds',\n",
" 'cov_params',\n",
" 'cov_params_default',\n",
" 'cov_type',\n",
" 'data_in_cache',\n",
" 'deviance',\n",
" 'df_model',\n",
" 'df_resid',\n",
" 'f_test',\n",
" 'family',\n",
" 'fit_history',\n",
" 'fittedvalues',\n",
" 'get_prediction',\n",
" 'initialize',\n",
" 'k_constant',\n",
" 'llf',\n",
" 'llnull',\n",
" 'load',\n",
" 'method',\n",
" 'mle_settings',\n",
" 'model',\n",
" 'mu',\n",
" 'nobs',\n",
" 'normalized_cov_params',\n",
" 'null',\n",
" 'null_deviance',\n",
" 'params',\n",
" 'pearson_chi2',\n",
" 'pinv_wexog',\n",
" 'plot_added_variable',\n",
" 'plot_ceres_residuals',\n",
" 'plot_partial_residuals',\n",
" 'predict',\n",
" 'pvalues',\n",
" 'remove_data',\n",
" 'resid_anscombe',\n",
" 'resid_deviance',\n",
" 'resid_pearson',\n",
" 'resid_response',\n",
" 'resid_working',\n",
" 'save',\n",
" 'scale',\n",
" 'summary',\n",
" 'summary2',\n",
" 't_test',\n",
" 'tvalues',\n",
" 'use_t',\n",
" 'wald_test',\n",
" 'wald_test_terms']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(res_glm_hc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Estimation with linear constraints\n",
"\n",
"GLM has a method `fit_constrained` to impose linear equality constraints in the estimation. OLS and WLS do not have a corresponding method, but we can estimate a model with simple constraints by moving the constraint part to the left hand side, i.e. incorporate it in endog. For a simple constraint that sets one or some of the parameters to a predefined level can be estimated as\n",
"\n",
"$y - a x_c = b x_{uc}$\n",
"\n",
"where $a$ contains the fixed parameters for exog $x_c$ and $b$ contains the parameters to be estimated for the unconstraint exog $x_{uc}$."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### with var_weights"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sval = 3\n",
"res_glm_c = model.fit_constrained('SOUTH=%f' % sval)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 16\n",
"Model Family: Gaussian Df Model: 0\n",
"Link Function: identity Scale: 78.742\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:46 Pearson chi2: 1.26e+03\n",
"No. Iterations: 1 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0507 0.036 1.408 0.159 -0.020 0.121\n",
"SOUTH 3.0000 0 inf 0.000 3.000 3.000\n",
"==============================================================================\n",
"\n",
"Model has been estimated subject to linear equality constraints.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"m:\\josef_new\\eclipse_ws\\statsmodels\\statsmodels_py34_pr\\statsmodels\\base\\model.py:1079: RuntimeWarning: divide by zero encountered in true_divide\n",
" return self.params / self.bse\n"
]
}
],
"source": [
"print(res_glm_c.summary())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 16\n",
"Model Family: Gaussian Df Model: 0\n",
"Link Function: identity Scale: 78.742\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:46 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0507 0.036 1.408 0.159 -0.020 0.121\n",
"==============================================================================\n"
]
}
],
"source": [
"model_o = smf.glm('EXECUTIONS ~ INCOME - 1',\n",
" data=data,\n",
" family=sm.families.Gaussian(link=sm.families.links.identity()),\n",
" offset=sval*data['SOUTH'],\n",
" var_weights=aweights\n",
" )\n",
"res_o = model_o.fit()\n",
"print(res_o.summary())"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y_corrected R-squared: 0.110\n",
"Model: WLS Adj. R-squared: 0.055\n",
"Method: Least Squares F-statistic: 1.983\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.178\n",
"Time: 19:39:46 Log-Likelihood: -52.754\n",
"No. Observations: 17 AIC: 107.5\n",
"Df Residuals: 16 BIC: 108.3\n",
"Df Model: 1 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0507 0.036 1.408 0.178 -0.026 0.127\n",
"==============================================================================\n",
"Omnibus: 33.145 Durbin-Watson: 0.743\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 64.900\n",
"Skew: 2.816 Prob(JB): 8.08e-15\n",
"Kurtosis: 10.740 Cond. No. 1.00\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"data['y_corrected'] = data['EXECUTIONS'] - sval * data['SOUTH']\n",
"wlsmodel_c = smf.wls('y_corrected ~ INCOME- 1',\n",
" data=data, weights=aweights)\n",
"\n",
"res_wls_c = wlsmodel_c.fit()\n",
"print(res_wls_c.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### without weights"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model0 = smf.glm(\n",
" 'EXECUTIONS ~ INCOME + SOUTH - 1',\n",
" data=data,\n",
" family=sm.families.Gaussian(link=sm.families.links.identity()),\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we do not use var_weights, then the parameter for SOUTH more than doubles, as can be seen from the following estimation. However, we still impose the same constrained value as before. \n",
"This is just for illustration and does not try to interpret the actual data."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 15\n",
"Model Family: Gaussian Df Model: 1\n",
"Link Function: identity Scale: 68.810\n",
"Method: IRLS Log-Likelihood: -59.024\n",
"Date: Mon, 08 Jan 2018 Deviance: 1032.1\n",
"Time: 19:39:46 Pearson chi2: 1.03e+03\n",
"No. Iterations: 3 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0505 0.070 0.720 0.471 -0.087 0.188\n",
"SOUTH 6.4594 3.910 1.652 0.099 -1.205 14.123\n",
"==============================================================================\n"
]
}
],
"source": [
"res0 = model0.fit()\n",
"print(res0.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next we compute and compare three ways of imposing the constraint that the SOUTH coefficient is 3."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 16\n",
"Model Family: Gaussian Df Model: 0\n",
"Link Function: identity Scale: 67.875\n",
"Method: IRLS Log-Likelihood: -59.457\n",
"Date: Mon, 08 Jan 2018 Deviance: 1086.0\n",
"Time: 19:39:46 Pearson chi2: 1.09e+03\n",
"No. Iterations: 1 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0876 0.056 1.569 0.117 -0.022 0.197\n",
"SOUTH 3.0000 0 inf 0.000 3.000 3.000\n",
"==============================================================================\n",
"\n",
"Model has been estimated subject to linear equality constraints.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"m:\\josef_new\\eclipse_ws\\statsmodels\\statsmodels_py34_pr\\statsmodels\\base\\model.py:1079: RuntimeWarning: divide by zero encountered in true_divide\n",
" return self.params / self.bse\n"
]
}
],
"source": [
"res0_c = model0.fit_constrained('SOUTH=%f' % sval)\n",
"print(res0_c.summary())"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 16\n",
"Model Family: Gaussian Df Model: 0\n",
"Link Function: identity Scale: 67.875\n",
"Method: IRLS Log-Likelihood: -59.457\n",
"Date: Mon, 08 Jan 2018 Deviance: 1086.0\n",
"Time: 19:39:47 Pearson chi2: 1.09e+03\n",
"No. Iterations: 3 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0876 0.056 1.569 0.117 -0.022 0.197\n",
"==============================================================================\n"
]
}
],
"source": [
"model0_o = smf.glm(\n",
" 'EXECUTIONS ~ INCOME - 1',\n",
" data=data,\n",
" family=sm.families.Gaussian(link=sm.families.links.identity()),\n",
" offset=sval*data['SOUTH']\n",
" )\n",
"res0_o = model0_o.fit()\n",
"print(res0_o.summary())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" WLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y_corrected R-squared: 0.133\n",
"Model: WLS Adj. R-squared: 0.079\n",
"Method: Least Squares F-statistic: 2.460\n",
"Date: Mon, 08 Jan 2018 Prob (F-statistic): 0.136\n",
"Time: 19:39:47 Log-Likelihood: -59.457\n",
"No. Observations: 17 AIC: 120.9\n",
"Df Residuals: 16 BIC: 121.7\n",
"Df Model: 1 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0876 0.056 1.569 0.136 -0.031 0.206\n",
"==============================================================================\n",
"Omnibus: 40.598 Durbin-Watson: 0.806\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 107.312\n",
"Skew: 3.379 Prob(JB): 4.98e-24\n",
"Kurtosis: 13.287 Cond. No. 1.00\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\josef\\Downloads\\WinPython-64bit-3.4.4.5Qt5\\python-3.4.4.amd64\\lib\\site-packages\\scipy\\stats\\stats.py:1327: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=17\n",
" \"anyway, n=%i\" % int(n))\n"
]
}
],
"source": [
"olsmodel_c = smf.wls('y_corrected ~ INCOME- 1', data=data)\n",
"res_ols_c = olsmodel_c.fit()\n",
"print(res_ols_c.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summary table\n",
"\n",
"As we can see in the summary table the unconstrained parameter and its standard errors agree across the three models for both weighted and unweighted versions."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>glm_c_w</th> <th>glm_offset_w</th> <th>wls_c</th> <th>glm_c</th> <th>glm_offset</th> <th>ols_c</th> \n",
"</tr>\n",
"<tr>\n",
" <th>INCOME</th> <td>0.0507</td> <td>0.0507</td> <td>0.0507</td> <td>0.0876</td> <td>0.0876</td> <td>0.0876</td> \n",
"</tr>\n",
"<tr>\n",
" <th></th> <td>(0.0360)</td> <td>(0.0360)</td> <td>(0.0360)</td> <td>(0.0558)</td> <td>(0.0558)</td> <td>(0.0558)</td>\n",
"</tr>\n",
"<tr>\n",
" <th>SOUTH</th> <td>3.0000</td> <td></td> <td></td> <td>3.0000</td> <td></td> <td></td> \n",
"</tr>\n",
"<tr>\n",
" <th></th> <td>(0.0000)</td> <td></td> <td></td> <td>(0.0000)</td> <td></td> <td></td> \n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<class 'statsmodels.iolib.summary2.Summary'>\n",
"\"\"\"\n",
"\n",
"==================================================================\n",
" glm_c_w glm_offset_w wls_c glm_c glm_offset ols_c \n",
"------------------------------------------------------------------\n",
"INCOME 0.0507 0.0507 0.0507 0.0876 0.0876 0.0876 \n",
" (0.0360) (0.0360) (0.0360) (0.0558) (0.0558) (0.0558)\n",
"SOUTH 3.0000 3.0000 \n",
" (0.0000) (0.0000) \n",
"==================================================================\n",
"Standard errors in parentheses.\n",
"\"\"\""
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_list = [res_glm_c, res_o, res_wls_c, res0_c, res0_o, res_ols_c]\n",
"model_names = 'glm_c_w glm_offset_w wls_c glm_c glm_offset ols_c'.split()\n",
"summary_col(model_list, model_names=model_names)"
]
},
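{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick numerical check of the statement above (a sketch added here, not executed, so no output is stored): print the INCOME coefficient and its standard error for all six variants; they should agree within the weighted group and within the unweighted group."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# INCOME estimate and bse: identical within the first three (weighted)\n",
"# and within the last three (unweighted) results\n",
"for name, res in zip(model_names, model_list):\n",
"    print('%-12s %.6f %.6f' % (name, res.params['INCOME'], res.bse['INCOME']))"
]
},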
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### More on fit_constrained"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.055837892461225488, 0.055837892461225488)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res0_c.bse.values[0], res0_o.bse.values[0]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.050672264109255509, 0.050672264109255509)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_o.params.values[0], res_glm_c.params.values[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `fit_constrained` method adds some additional information about the constraints in the results instance, and attaches that reparameterized auxiliary model. The auxiliary model is in this case the same as the glm offset version above. We print the summary for the offset version again for easier comparison."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LinearConstraint(['INCOME', 'SOUTH'], array([[ 0., 1.]]), array([[ 3.]]))"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_c.constraints"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 17\n",
"Model: GLM Df Residuals: 16\n",
"Model Family: Gaussian Df Model: 0\n",
"Link Function: identity Scale: 78.742\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:47 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"x1 0.0507 0.036 1.408 0.159 -0.020 0.121\n",
"==============================================================================\n"
]
}
],
"source": [
"print(res_glm_c.results_constrained.summary())"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: EXECUTIONS No. Observations: 17\n",
"Model: GLM Df Residuals: 16\n",
"Model Family: Gaussian Df Model: 0\n",
"Link Function: identity Scale: 78.742\n",
"Method: IRLS Log-Likelihood: -52.754\n",
"Date: Mon, 08 Jan 2018 Deviance: 1259.9\n",
"Time: 19:39:47 Pearson chi2: 1.26e+03\n",
"No. Iterations: 3 Covariance Type: nonrobust\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"INCOME 0.0507 0.036 1.408 0.159 -0.020 0.121\n",
"==============================================================================\n"
]
}
],
"source": [
"print(res_o.summary())"
]
},
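{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick numerical check (a small addition to the notebook, assuming the results objects above are in scope), the attached auxiliary results should agree with the offset version up to floating point noise."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: compare the reparameterized auxiliary results attached by\n",
"# fit_constrained with the explicit offset model fitted above.\n",
"aux = res_glm_c.results_constrained\n",
"np.allclose(aux.params, res_o.params.values), np.allclose(aux.bse, res_o.bse.values)"
]
},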
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_c.results_constrained.model.var_weights"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm_c.model.var_weights"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_o.model.var_weights"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Warning\n",
"\n",
"And a warning at the end. `fit_constrained` uses the instance of the unconstrained model and estimates the constrained model in an auxiliary model. The results from the auxiliary constrained model are transformed back to the original model. Harameter estimates and covariance of the parameter estimates and consequently also Wald inference are based on the constrained model. However, the model instance is for the full model and it has to be kept in mind that methods do not impose the constraints directly. \n",
"\n",
"As an example, score and hessian are derivatives with respect to the parameters of the unconstrained model but evaluated at the constrained parameter estimates. This provides immediately the foundation for score or lagrange multiplier tests."
]
},
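{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small illustration (an addition, only a sketch using the model's `score` and `hessian` methods, not a full score test implementation), we can evaluate the score of the unconstrained model at the constrained parameter estimates; this is the main ingredient of a score (LM) test of the constraint."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: score and Hessian of the unconstrained GLM evaluated at the\n",
"# constrained parameter estimates (building blocks of a score/LM test).\n",
"params_c = np.asarray(res_glm_c.params)\n",
"score_c = res_glm_c.model.score(params_c)\n",
"hess_c = res_glm_c.model.hessian(params_c)\n",
"score_c, hess_c"
]
},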
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Estimation with fit_regularized\n",
"\n",
"Some comparison of elastic net regularized fit between GLM and OLS/WLS with and without var_weights.\n",
"\n",
"We currently do not have standard errors and inference after fit_regularized, so we only compare the parameter estimates and the fitted values."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First we check whether fit_regularized is close to the unregularized version if the penalization weight alpha is small.\n",
"As we can see the parameter estimates are close if alpha is 1e-8."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### without weights\n",
"\n",
"`res0` and `model0` is the basic GLM model without weights, which we can compare with the corresponding OLS model."
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050515\n",
"SOUTH 6.459415\n",
"dtype: float64"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res0.params"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050515\n",
"SOUTH 6.459412\n",
"dtype: float64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res0_r = model0.fit_regularized(alpha=1e-8)\n",
"res0_r.params"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050515\n",
"SOUTH 6.459415\n",
"dtype: float64"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"olsmodel = smf.ols(\n",
" 'EXECUTIONS ~ INCOME + SOUTH - 1',\n",
" data=data)\n",
"res_ols_r = olsmodel.fit_regularized(alpha=1e-8)\n",
"res_ols_r.params"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1.7012421391626731e-06"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.max(np.abs(res0_r.fittedvalues - res_ols_r.fittedvalues))"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['_cache',\n",
" '_data_attr',\n",
" 'fittedvalues',\n",
" 'initialize',\n",
" 'k_constant',\n",
" 'model',\n",
" 'params',\n",
" 'predict',\n",
" 'summary']"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[i for i in dir(res0_r) if not i[:2] == '__']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### with var_weights\n",
"\n",
"`res_glm` and `model` is the basic GLM model with var_weights."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050913\n",
"SOUTH 2.978327\n",
"dtype: float64"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_glm.params"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050913\n",
"SOUTH 2.978326\n",
"dtype: float64"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_r = model.fit_regularized(alpha=1e-8)\n",
"res_r.params"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050913\n",
"SOUTH 2.978327\n",
"dtype: float64"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_wls_r = wlsmodel.fit_regularized(alpha=1e-8)\n",
"res_wls_r.params"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6.6853358138274643e-07"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.max(np.abs(res_r.fittedvalues - res_wls_r.fittedvalues))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### elastic net penalization"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050913\n",
"SOUTH 2.978325\n",
"dtype: float64"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_r = model.fit_regularized(alpha=1e-8, L1_wt=0.5)\n",
"res_r.params"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"INCOME 0.050913\n",
"SOUTH 2.978327\n",
"dtype: float64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_wls_r = wlsmodel.fit_regularized(alpha=1e-8, L1_wt=0.5)\n",
"res_wls_r.params"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1.3323735625903055e-06"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.max(np.abs(res_r.fittedvalues - res_wls_r.fittedvalues))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Larger penalization\n",
"\n",
"The parameterization for the penalization weight does not agree between GLM and OLS. If we increase the penalization weight, then there is a larger difference between the estimated parameters in GLM and OLS/WLS in both cases with and without var_weights. "
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GLM</th>\n",
" <th>WLS</th>\n",
" <th>diff</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>INCOME</th>\n",
" <td>0.051131</td>\n",
" <td>0.050921</td>\n",
" <td>0.000209</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SOUTH</th>\n",
" <td>2.958712</td>\n",
" <td>2.977605</td>\n",
" <td>-0.018893</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GLM WLS diff\n",
"INCOME 0.051131 0.050921 0.000209\n",
"SOUTH 2.958712 2.977605 -0.018893"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res_r = model.fit_regularized(alpha=1e-4, L1_wt=0.5)\n",
"res_wls_r = wlsmodel.fit_regularized(alpha=1e-4, L1_wt=0.5)\n",
"import pandas as pd\n",
"out = pd.concat((res_r.params, res_wls_r.params, res_r.params - res_wls_r.params), axis=1)\n",
"out.columns = 'GLM WLS diff'.split()\n",
"out"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can use a brief grid search to find the penalization weight that approximately minimizes the difference between GLM and WLS regularized parameter estimates."
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.4e-06 5.13371108348e-05\n",
"3.40816326531e-06 4.97284968684e-05\n",
"3.41632653061e-06 4.81198842079e-05\n",
"3.42448979592e-06 4.65112728518e-05\n",
"3.43265306122e-06 4.49026628004e-05\n",
"3.44081632653e-06 4.32940540538e-05\n",
"3.44897959184e-06 4.16854466132e-05\n",
"3.45714285714e-06 4.00768404765e-05\n",
"3.46530612245e-06 3.84682356449e-05\n",
"3.47346938776e-06 3.6859632119e-05\n",
"3.48163265306e-06 3.52510298978e-05\n",
"3.48979591837e-06 3.36424289817e-05\n",
"3.49795918367e-06 3.203382937e-05\n",
"3.50612244898e-06 3.04252310634e-05\n",
"3.51428571429e-06 2.88166340621e-05\n",
"3.52244897959e-06 2.7208038365e-05\n",
"3.5306122449e-06 2.55994439735e-05\n",
"3.5387755102e-06 2.39908508872e-05\n",
"3.54693877551e-06 2.23822591043e-05\n",
"3.55510204082e-06 2.07736686284e-05\n",
"3.56326530612e-06 1.91650794568e-05\n",
"3.57142857143e-06 1.75564915899e-05\n",
"3.57959183673e-06 1.59479050277e-05\n",
"3.58775510204e-06 1.43393197707e-05\n",
"3.59591836735e-06 1.27307358184e-05\n",
"3.60408163265e-06 1.11221531713e-05\n",
"3.61224489796e-06 9.51357182899e-06\n",
"3.62040816327e-06 7.90499179182e-06\n",
"3.62857142857e-06 6.29641305938e-06\n",
"3.63673469388e-06 4.68783563212e-06\n",
"3.64489795918e-06 3.07925951004e-06\n",
"3.65306122449e-06 1.47068469136e-06\n",
"3.6612244898e-06 1.3788882125e-07\n",
"3.6693877551e-06 1.74646102913e-06\n",
"3.67755102041e-06 3.35503193227e-06\n",
"3.68571428571e-06 4.96360152979e-06\n",
"3.69387755102e-06 6.57216982303e-06\n",
"3.70204081633e-06 8.18073681108e-06\n",
"3.71020408163e-06 9.78930249484e-06\n",
"3.71836734694e-06 1.13978668725e-05\n",
"3.72653061224e-06 1.30064299459e-05\n",
"3.73469387755e-06 1.46149917151e-05\n",
"3.74285714286e-06 1.62235521781e-05\n",
"3.75102040816e-06 1.78321113378e-05\n",
"3.75918367347e-06 1.94406691909e-05\n",
"3.76734693878e-06 2.10492257402e-05\n",
"3.77551020408e-06 2.26577809843e-05\n",
"3.78367346939e-06 2.42663349228e-05\n",
"3.79183673469e-06 2.58748875579e-05\n",
"3.8e-06 2.74834388874e-05\n"
]
}
],
"source": [
"res_wls_r = wlsmodel.fit_regularized(alpha=1e-4, L1_wt=0.5)\n",
"for pw in np.linspace(3.4e-6, 3.8e-6):\n",
" res_r = model.fit_regularized(alpha=pw, L1_wt=0.5)\n",
" print(pw, np.max(np.abs(res_r.params - res_wls_r.params)))"
]
},
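{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside (an addition, not part of the original comparison), the manual grid above could also be refined with a bounded scalar minimizer from scipy, using the same objective and the bracket suggested by the grid."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: refine the grid search with scipy's bounded scalar minimizer.\n",
"from scipy.optimize import minimize_scalar\n",
"\n",
"res_wls_r = wlsmodel.fit_regularized(alpha=1e-4, L1_wt=0.5)\n",
"\n",
"def param_gap(pw):\n",
"    # max absolute difference between GLM and WLS regularized estimates\n",
"    res_r = model.fit_regularized(alpha=pw, L1_wt=0.5)\n",
"    return np.max(np.abs(res_r.params - res_wls_r.params))\n",
"\n",
"opt = minimize_scalar(param_gap, bounds=(3.4e-6, 3.8e-6), method='bounded')\n",
"opt.x, opt.fun"
]
},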
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GLM</th>\n",
" <th>WLS</th>\n",
" <th>diff</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>INCOME</th>\n",
" <td>0.050921</td>\n",
" <td>0.050921</td>\n",
" <td>1.526545e-09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SOUTH</th>\n",
" <td>2.977605</td>\n",
" <td>2.977605</td>\n",
" <td>-1.378888e-07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GLM WLS diff\n",
"INCOME 0.050921 0.050921 1.526545e-09\n",
"SOUTH 2.977605 2.977605 -1.378888e-07"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pw0 = 3.6612244898e-06\n",
"res_r = model.fit_regularized(alpha=pw0, L1_wt=0.5)\n",
"out = pd.concat((res_r.params, res_wls_r.params, res_r.params - res_wls_r.params), axis=1)\n",
"out.columns = 'GLM WLS diff'.split()\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"65.66753106718858"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"((model.endog - res_r.fittedvalues)**2).sum() / res0.model.nobs #df_resid"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"68.809555690191615"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res0.scale"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"27.313266443670777"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1e-4 / pw0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note, the disagreement between the parameterization weights also affects the parameter estimates when we do not use var_weights. Also the penalization weight adjustment differes across models, and should be related how the variance of the error term affects the relative weight between likelihood or sum of squares."
]
},
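{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an exploratory sketch (an addition; the grid bounds are a rough guess and should be widened if the minimum lands on an edge), we can repeat the grid search for the unweighted models to find the GLM penalization weight that best matches the regularized OLS estimates at alpha=1e-4."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: grid search for the GLM alpha that best matches regularized OLS\n",
"# in the unweighted case (mirrors the weighted grid search above).\n",
"res_ols_r = olsmodel.fit_regularized(alpha=1e-4)\n",
"gaps = [(np.max(np.abs(model0.fit_regularized(alpha=pw, L1_wt=0.5).params\n",
"                       - res_ols_r.params)), pw)\n",
"        for pw in np.linspace(1e-6, 1e-5, 51)]\n",
"min(gaps)  # (max abs difference, implied GLM penalization weight)"
]
},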
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GLM</th>\n",
" <th>OLS</th>\n",
" <th>diff</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>INCOME</th>\n",
" <td>0.051473</td>\n",
" <td>0.050519</td>\n",
" <td>0.000954</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SOUTH</th>\n",
" <td>6.369753</td>\n",
" <td>6.459041</td>\n",
" <td>-0.089288</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GLM OLS diff\n",
"INCOME 0.051473 0.050519 0.000954\n",
"SOUTH 6.369753 6.459041 -0.089288"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res0_r = model0.fit_regularized(alpha=1e-4, L1_wt=0.5)\n",
"res_ols_r = olsmodel.fit_regularized(alpha=1e-4)\n",
"out = pd.concat((res0_r.params, res_ols_r.params, res0_r.params - res_ols_r.params), axis=1)\n",
"out.columns = 'GLM OLS diff'.split()\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GLM</th>\n",
" <th>OLS</th>\n",
" <th>diff</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>INCOME</th>\n",
" <td>0.050551</td>\n",
" <td>0.050519</td>\n",
" <td>0.000032</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SOUTH</th>\n",
" <td>6.456094</td>\n",
" <td>6.459041</td>\n",
" <td>-0.002947</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GLM OLS diff\n",
"INCOME 0.050551 0.050519 0.000032\n",
"SOUTH 6.456094 6.459041 -0.002947"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res0_r = model0.fit_regularized(alpha=pw0, L1_wt=0.5)\n",
"res_ols_r = olsmodel.fit_regularized(alpha=1e-4)\n",
"out = pd.concat((res0_r.params, res_ols_r.params, res0_r.params - res_ols_r.params), axis=1)\n",
"out.columns = 'GLM OLS diff'.split()\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}