Created
May 23, 2016 10:40
-
-
Save vitillo/0bdd2dc09c1071fedcee37234a687edf to your computer and use it in GitHub Desktop.
Hypothesis test for count data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import pandas as pd\n", | |
"\n", | |
"%load_ext rpy2.ipython" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with open(\"crashes.json\") as f:\n", | |
" crashes = pd.DataFrame(json.load(f))\n", | |
" \n", | |
"# Extracted from https://gist.github.com/bsmedberg/59d421cdea64bed439747993053a39e5b" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>branch</th>\n", | |
" <th>crashes</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>control</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" branch crashes\n", | |
"0 control 0\n", | |
"1 control 0\n", | |
"2 control 0\n", | |
"3 control 0\n", | |
"4 control 0\n", | |
"5 control 0\n", | |
"6 control 0\n", | |
"7 control 0\n", | |
"8 control 0\n", | |
"9 control 0" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"crashes[:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Install dependencies:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"install <- function(mypkg){\n", | |
" if (!is.element(mypkg, installed.packages()[,1]))\n", | |
" install.packages(mypkg, repos=\"http://cran.rstudio.com/\", quiet=TRUE)\n", | |
"}\n", | |
" \n", | |
"options(warn=-1)\n", | |
" \n", | |
"install(\"pscl\")\n", | |
"install(\"MASS\")\n", | |
" \n", | |
"library(MASS)\n", | |
"library(pscl)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Load data:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"%%R -i crashes\n", | |
"\n", | |
"crashes <- within(crashes, {\n", | |
" branch <- factor(branch, levels = c(\"control\", \"aggressive\", \"conservative\"))\n", | |
"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"\n", | |
"Call:\n", | |
"glm.nb(formula = crashes ~ branch, data = crashes, init.theta = 0.002076156342, \n", | |
" link = log)\n", | |
"\n", | |
"Deviance Residuals: \n", | |
" Min 1Q Median 3Q Max \n", | |
"-0.0870 -0.0870 -0.0843 -0.0840 9.7319 \n", | |
"\n", | |
"Coefficients:\n", | |
" Estimate Std. Error z value Pr(>|z|) \n", | |
"(Intercept) -4.53125 0.03532 -128.281 < 2e-16 ***\n", | |
"branchaggressive -0.13304 0.05020 -2.650 0.00805 ** \n", | |
"branchconservative -0.15072 0.05033 -2.995 0.00275 ** \n", | |
"---\n", | |
"Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n", | |
"\n", | |
"(Dispersion parameter for Negative Binomial(0.0021) family taken to be 1)\n", | |
"\n", | |
" Null deviance: 15054 on 1380992 degrees of freedom\n", | |
"Residual deviance: 15043 on 1380990 degrees of freedom\n", | |
"AIC: 81076\n", | |
"\n", | |
"Number of Fisher Scoring iterations: 1\n", | |
"\n", | |
"\n", | |
" Theta: 0.0020762 \n", | |
" Std. Err.: 3.72e-05 \n", | |
"\n", | |
" 2 x log-likelihood: -81068.2150000 \n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"summary(nb <- glm.nb(crashes ~ branch, data = crashes))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The model has the following form: $$ \\log(\\mathbf{E}(crashes | branch)) = \\beta_0 + \\beta_1\\mathbb{1}_{aggressive}(branch) + \\beta_2\\mathbb{1}_{conservative}(branch) $$ i.e.:\n", | |
"$$ \\mathbf{E}(crashes | branch) = e^{\\beta_0 + \\beta_1\\mathbb{1}_{aggressive}(branch) + \\beta_2\\mathbb{1}_{conservative}(branch)} $$" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/vitillo/.pyenv/versions/anaconda2-2.5.0/lib/python2.7/site-packages/rpy2/rinterface/__init__.py:185: RRuntimeWarning: Waiting for profiling to be done...\n", | |
"\n", | |
" warnings.warn(x, RRuntimeWarning)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
" Estimate 2.5 % 97.5 %\n", | |
"(Intercept) 0.01076721 0.01005236 0.0115454\n", | |
"branchaggressive 0.87542951 0.79338343 0.9659509\n", | |
"branchconservative 0.86008451 0.77927677 0.9492689\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"estimate <- exp(cbind(Estimate = coef(nb), confint(nb)))\n", | |
"estimate" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1] \"The aggressive branch reduced the number of plugin crashes by 12.46% wrt. the control branch.\"\n", | |
"[1] \"The 95% confidence interval of the estimate is (3.404907%, 20.661657%).\"\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"print(sprintf(\"The aggressive branch reduced the number of plugin crashes by %.2f%% wrt. the control branch.\", 100*(1 - estimate[2, 1])))\n", | |
"print(sprintf(\"The 95%% confidence interval of the estimate is (%f%%, %f%%).\", 100*(1 - estimate[2, 3]), 100*(1 - estimate[2, 2])))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Is there overdispersion or would a Poisson model fit the data equally well?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Likelihood ratio test of H0: Poisson, as restricted NB model:\n", | |
"n.b., the distribution of the test-statistic under H0 is non-standard\n", | |
"e.g., see help(odTest) for details/references\n", | |
"\n", | |
"Critical value of test statistic at the alpha= 0.05 level: 2.7055 \n", | |
"Chi-Square Test Statistic = 108508.5961 p-value = < 2.2e-16 \n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"odTest(nb)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Goodness of fit:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1] 1\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"1 - pchisq(nb$deviance, nb$df.residual)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"References:\n", | |
"\n", | |
"http://advan.physiology.org/content/34/3/128\n", | |
"\n", | |
"http://datavoreconsulting.com/programming-tips/count-data-glms-choosing-poisson-negative-binomial-zero-inflated-poisson/" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment