Last active
February 9, 2018 07:57
-
-
Save drorata/f001d2f03812f4ca29c545e131e857d3 to your computer and use it in GitHub Desktop.
Reproducing a blog post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This notebook is a fork of the data and steps executed in [this nice blog post](http://hamelg.blogspot.de/2015/11/python-for-data-analysis-part-25-chi.html).\n", | |
"In particular, the $\\chi^2$ statistic and the $p$-value are computed both \"manually\" and with `scipy.stats.chi2_contingency` directly." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy import stats" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def comp_expected(df):\n", | |
"    \"\"\"\n", | |
"    Build the table of counts expected if rows and columns were independent.\n", | |
"\n", | |
"    Each cell is (row total * column total) / grand total of `df`.\n", | |
"    The result is a DataFrame carrying the same index and columns as `df`.\n", | |
"    \"\"\"\n", | |
"    row_totals = df.sum(axis=1)\n", | |
"    column_totals = df.sum(axis=0)\n", | |
"    grand_total = df.sum().sum()\n", | |
"    expected_counts = np.outer(row_totals, column_totals) / grand_total\n", | |
"    return pd.DataFrame(expected_counts, columns=df.columns, index=df.index)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def chi_squared_stat(observed, expected):\n", | |
"    \"\"\"\n", | |
"    Pearson chi-squared statistic of two equally shaped tables.\n", | |
"\n", | |
"    Sums (observed - expected)**2 / expected over every cell.\n", | |
"    \"\"\"\n", | |
"    return (((observed - expected) ** 2) / expected).sum().sum()\n", | |
"\n", | |
"\n", | |
"def p_value(observed, chi_squared_stat):\n", | |
"    \"\"\"\n", | |
"    Upper-tail probability of `chi_squared_stat` under a chi-squared\n", | |
"    distribution with (rows - 1) * (columns - 1) degrees of freedom,\n", | |
"    where rows and columns are taken from `observed.shape`.\n", | |
"\n", | |
"    The survival function is used instead of `1 - cdf`: the subtraction\n", | |
"    cancels to exactly 0.0 once the tail is smaller than float precision\n", | |
"    and loses digits well before that.\n", | |
"    \"\"\"\n", | |
"    dof = (observed.shape[0] - 1) * (observed.shape[1] - 1)\n", | |
"    return stats.chi2.sf(x=chi_squared_stat, df=dof)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def compare_computations(observed, correction=True):\n", | |
"    \"\"\"\n", | |
"    Print the manually computed chi-squared statistic and p-value for\n", | |
"    `observed` next to the output of `stats.chi2_contingency`, followed by\n", | |
"    whether the two statistics and the two p-values are close.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    observed : pd.DataFrame\n", | |
"        Contingency table of observed counts.\n", | |
"    correction : bool, default True\n", | |
"        Forwarded to `stats.chi2_contingency`. With the default, SciPy applies\n", | |
"        Yates' continuity correction when there is one degree of freedom\n", | |
"        (e.g. a 2x2 table), which the manual computation does not do; pass\n", | |
"        False to compare like with like.\n", | |
"    \"\"\"\n", | |
"    print(observed)\n", | |
"    # Compute the statistic once and reuse it for the p-value.\n", | |
"    chi2_stat = chi_squared_stat(observed, comp_expected(observed))\n", | |
"    pval = p_value(observed, chi2_stat)\n", | |
"    chi2_contingency = stats.chi2_contingency(observed, correction=correction)\n", | |
"    print('chi2_stat = {}\\npval = {}\\nchi2_contingency output = {}'.format(\n", | |
"        chi2_stat,\n", | |
"        pval,\n", | |
"        chi2_contingency\n", | |
"    ))\n", | |
"    # NOTE: np.isclose has a default absolute tolerance of 1e-8, so two very\n", | |
"    # small p-values are reported as close regardless of their ratio.\n", | |
"    print(\"chi2_stat identical - {}\".format(np.isclose(chi2_stat, chi2_contingency[0])))\n", | |
"    print(\"pval identical - {}\".format(np.isclose(pval, chi2_contingency[1])))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The observed data here has the form of the result of an A/B/C/D/E test.\n", | |
"The null hypothesis is that the result is independent of the variant (control or one of the variations).\n", | |
"The objective is to check whether the null hypothesis holds or not." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Result 1</th>\n", | |
" <th>Result 2</th>\n", | |
" <th>Result 3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>Control</th>\n", | |
" <td>21</td>\n", | |
" <td>7</td>\n", | |
" <td>32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>VAR1</th>\n", | |
" <td>65</td>\n", | |
" <td>25</td>\n", | |
" <td>64</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>VAR2</th>\n", | |
" <td>107</td>\n", | |
" <td>50</td>\n", | |
" <td>94</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>VAR3</th>\n", | |
" <td>15</td>\n", | |
" <td>8</td>\n", | |
" <td>15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>VAR4</th>\n", | |
" <td>189</td>\n", | |
" <td>6</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Result 1 Result 2 Result 3\n", | |
"Control 21 7 32\n", | |
"VAR1 65 25 64\n", | |
"VAR2 107 50 94\n", | |
"VAR3 15 8 15\n", | |
"VAR4 189 6 2" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Observed counts: one row per test arm, one column per result\n", | |
"observed_base = pd.DataFrame(\n", | |
"    [\n", | |
"        [21, 7, 32],\n", | |
"        [65, 25, 64],\n", | |
"        [107, 50, 94],\n", | |
"        [15, 8, 15],\n", | |
"        [189, 6, 2],\n", | |
"    ],\n", | |
"    index=[\"Control\", \"VAR1\", \"VAR2\", \"VAR3\", \"VAR4\"],\n", | |
"    columns=[\"Result 1\", \"Result 2\", \"Result 3\"],\n", | |
")\n", | |
"observed_base" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Result 1 Result 2 Result 3\n", | |
"Control 21 7 32\n", | |
"VAR1 65 25 64\n", | |
"VAR2 107 50 94\n", | |
"VAR3 15 8 15\n", | |
"VAR4 189 6 2\n", | |
"chi2_stat = 180.49674748377237\n", | |
"pval = 0.0\n", | |
"chi2_contingency output = (180.4967474837724, 8.09675550680417e-35, 8, array([[ 34.02857143, 8.22857143, 17.74285714],\n", | |
" [ 87.34 , 21.12 , 45.54 ],\n", | |
" [142.35285714, 34.42285714, 74.22428571],\n", | |
" [ 21.55142857, 5.21142857, 11.23714286],\n", | |
" [111.72714286, 27.01714286, 58.25571429]]))\n", | |
"chi2_stat identical - True\n", | |
"pval identical - True\n" | |
] | |
} | |
], | |
"source": [ | |
"# Run the comparison on the full 5x3 table of observed counts\n", | |
"compare_computations(observed_base.copy())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Next, we take subsets of the observed data and run the same procedure.\n", | |
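"When taking a $2 \\times 2$ subset of the data, the \"manually\" generated numbers no longer align with those obtained by `stats.chi2_contingency`: for tables with one degree of freedom it applies Yates' continuity correction by default (`correction=True`), which the manual computation does not." | |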
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Result 1 Result 2 Result 3\n", | |
"Control 21 7 32\n", | |
"VAR1 65 25 64\n", | |
"chi2_stat = 2.4949588283163875\n", | |
"pval = 0.28722786769299047\n", | |
"chi2_contingency output = (2.4949588283163875, 0.2872278676929904, 2, array([[24.11214953, 8.97196262, 26.91588785],\n", | |
" [61.88785047, 23.02803738, 69.08411215]]))\n", | |
"chi2_stat identical - True\n", | |
"pval identical - True\n" | |
] | |
} | |
], | |
"source": [ | |
"# First two rows (Control, VAR1) with all three result columns: a 2x3 table\n", | |
"compare_computations(observed_base.head(2).copy())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Result 1 Result 2\n", | |
"Control 21 7\n", | |
"VAR1 65 25\n", | |
"VAR2 107 50\n", | |
"VAR3 15 8\n", | |
"VAR4 189 6\n", | |
"chi2_stat = 56.703292391691704\n", | |
"pval = 1.4278023208191826e-11\n", | |
"chi2_contingency output = (56.703292391691704, 1.4278052288246689e-11, 4, array([[ 22.54766734, 5.45233266],\n", | |
" [ 72.47464503, 17.52535497],\n", | |
" [126.42799189, 30.57200811],\n", | |
" [ 18.52129817, 4.47870183],\n", | |
" [157.02839757, 37.97160243]]))\n", | |
"chi2_stat identical - True\n", | |
"pval identical - True\n" | |
] | |
} | |
], | |
"source": [ | |
"# All five rows with the first two result columns: a 5x2 table\n", | |
"compare_computations(observed_base.iloc[:, 0:2].copy())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Result 1 Result 2\n", | |
"Control 21 7\n", | |
"VAR1 65 25\n", | |
"chi2_stat = 0.08337370801033595\n", | |
"pval = 0.7727764799468594\n", | |
"chi2_contingency output = (0.00205882013658177, 0.9638090266779162, 1, array([[20.40677966, 7.59322034],\n", | |
" [65.59322034, 24.40677966]]))\n", | |
"chi2_stat identical - False\n", | |
"pval identical - False\n" | |
] | |
} | |
], | |
"source": [ | |
"# First two rows and first two result columns: a 2x2 table (one degree of freedom)\n", | |
"compare_computations(observed_base.iloc[0:2, 0:2].copy())" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:ds-review-ab-tests]", | |
"language": "python", | |
"name": "conda-env-ds-review-ab-tests-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment