drorata/Chi squared example.ipynb

## Chi squared example.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook is a fork of the data and steps executed in [this nice blog post](http://hamelg.blogspot.de/2015/11/python-for-data-analysis-part-25-chi.html).\n",
    "In particular, the $\\chi^2$ statistic and the $p$-value are computed both \"manually\" and directly with `scipy.stats.chi2_contingency`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy import  stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def comp_expected(df):\n",
    "    \"\"\"\n",
    "    Build the expected matrix for a table of observed counts.\n",
    "\n",
    "    Each cell is (row total * column total) / grand total; the result\n",
    "    keeps the index and columns of `df`.\n",
    "    \"\"\"\n",
    "    row_totals = df.sum(axis=1)\n",
    "    col_totals = df.sum(axis=0)\n",
    "    grand_total = df.sum().sum()\n",
    "    expected = np.outer(row_totals, col_totals) / grand_total\n",
    "    return pd.DataFrame(expected, index=df.index, columns=df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chi_squared_stat(observed, expected):\n",
    "    \"\"\"Pearson's chi-squared statistic: the sum of (observed - expected)^2 / expected over all cells.\"\"\"\n",
    "    return (((observed - expected) ** 2) / expected).sum().sum()\n",
    "\n",
    "\n",
    "def p_value(observed, chi_squared_stat):\n",
    "    \"\"\"\n",
    "    Upper-tail probability of `chi_squared_stat` under a chi-squared distribution\n",
    "    with (rows - 1) * (columns - 1) degrees of freedom, taken from `observed.shape`.\n",
    "    \"\"\"\n",
    "    dof = (observed.shape[0] - 1) * (observed.shape[1] - 1)\n",
    "    # Use the survival function rather than 1 - cdf: for a large statistic the cdf\n",
    "    # rounds to 1.0 and the difference collapses to exactly 0.0.\n",
    "    return stats.chi2.sf(chi_squared_stat, dof)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare_computations(observed, correction=True):\n",
    "    \"\"\"\n",
    "    Print the manually computed chi-squared statistic and p-value next to the\n",
    "    output of `stats.chi2_contingency`, and report whether they agree.\n",
    "\n",
    "    `correction` is forwarded to `stats.chi2_contingency`. With SciPy's default\n",
    "    (True), Yates' continuity correction is applied when the table has one degree\n",
    "    of freedom (e.g. 2x2), which the manual formula does not do; pass False to\n",
    "    compare against the uncorrected statistic.\n",
    "    \"\"\"\n",
    "    print(observed)\n",
    "    chi2_stat = chi_squared_stat(observed, comp_expected(observed))\n",
    "    # Reuse the statistic instead of recomputing it (and the expected matrix).\n",
    "    pval = p_value(observed, chi2_stat)\n",
    "    chi2_contingency = stats.chi2_contingency(observed, correction=correction)\n",
    "    print('chi2_stat = {}\\npval = {}\\nchi2_contingency output = {}'.format(\n",
    "        chi2_stat,\n",
    "        pval,\n",
    "        chi2_contingency\n",
    "    ))\n",
    "    print(\"chi2_stat identical - {}\".format(np.isclose(chi2_stat, chi2_contingency[0])))\n",
    "    print(\"pval      identical - {}\".format(np.isclose(pval, chi2_contingency[1])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The observed data here has the form of the result of an A/B/C/D/E test.\n",
    "The null hypothesis is that the results are independent of the test group.\n",
    "The objective is to check whether the null hypothesis can be rejected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Result 1</th>\n",
       "      <th>Result 2</th>\n",
       "      <th>Result 3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Control</th>\n",
       "      <td>21</td>\n",
       "      <td>7</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VAR1</th>\n",
       "      <td>65</td>\n",
       "      <td>25</td>\n",
       "      <td>64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VAR2</th>\n",
       "      <td>107</td>\n",
       "      <td>50</td>\n",
       "      <td>94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VAR3</th>\n",
       "      <td>15</td>\n",
       "      <td>8</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>VAR4</th>\n",
       "      <td>189</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Result 1  Result 2  Result 3\n",
       "Control        21         7        32\n",
       "VAR1           65        25        64\n",
       "VAR2          107        50        94\n",
       "VAR3           15         8        15\n",
       "VAR4          189         6         2"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Observed counts, one row per test group and one column per result\n",
    "observed_base = pd.DataFrame(\n",
    "    data=[\n",
    "        [21, 7, 32],\n",
    "        [65, 25, 64],\n",
    "        [107, 50, 94],\n",
    "        [15, 8, 15],\n",
    "        [189, 6, 2],\n",
    "    ],\n",
    "    index=[\"Control\", \"VAR1\", \"VAR2\", \"VAR3\", \"VAR4\"],\n",
    "    columns=[\"Result 1\", \"Result 2\", \"Result 3\"],\n",
    ")\n",
    "observed_base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Result 1  Result 2  Result 3\n",
      "Control        21         7        32\n",
      "VAR1           65        25        64\n",
      "VAR2          107        50        94\n",
      "VAR3           15         8        15\n",
      "VAR4          189         6         2\n",
      "chi2_stat = 180.49674748377237\n",
      "pval = 0.0\n",
      "chi2_contingency output = (180.4967474837724, 8.09675550680417e-35, 8, array([[ 34.02857143,   8.22857143,  17.74285714],\n",
      "       [ 87.34      ,  21.12      ,  45.54      ],\n",
      "       [142.35285714,  34.42285714,  74.22428571],\n",
      "       [ 21.55142857,   5.21142857,  11.23714286],\n",
      "       [111.72714286,  27.01714286,  58.25571429]]))\n",
      "chi2_stat identical - True\n",
      "pval      identical - True\n"
     ]
    }
   ],
   "source": [
    "# Full 5x3 table of observed data\n",
    "compare_computations(observed_base.copy())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
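    "Next, we take subsets of the observed data and run the same procedure.\n",
    "For the $2\\times 2$ subset, the \"manually\" generated numbers no longer align with those obtained by `stats.chi2_contingency`.\n",
    "The reason is that `stats.chi2_contingency` applies Yates' continuity correction by default (`correction=True`) whenever the table has one degree of freedom, i.e. for a $2\\times 2$ table; calling it with `correction=False` reproduces the manual values."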
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Result 1  Result 2  Result 3\n",
      "Control        21         7        32\n",
      "VAR1           65        25        64\n",
      "chi2_stat = 2.4949588283163875\n",
      "pval = 0.28722786769299047\n",
      "chi2_contingency output = (2.4949588283163875, 0.2872278676929904, 2, array([[24.11214953,  8.97196262, 26.91588785],\n",
      "       [61.88785047, 23.02803738, 69.08411215]]))\n",
      "chi2_stat identical - True\n",
      "pval      identical - True\n"
     ]
    }
   ],
   "source": [
    "# First two rows (Control, VAR1) with all three result columns: a 2x3 table\n",
    "compare_computations(observed_base.head(2).copy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Result 1  Result 2\n",
      "Control        21         7\n",
      "VAR1           65        25\n",
      "VAR2          107        50\n",
      "VAR3           15         8\n",
      "VAR4          189         6\n",
      "chi2_stat = 56.703292391691704\n",
      "pval = 1.4278023208191826e-11\n",
      "chi2_contingency output = (56.703292391691704, 1.4278052288246689e-11, 4, array([[ 22.54766734,   5.45233266],\n",
      "       [ 72.47464503,  17.52535497],\n",
      "       [126.42799189,  30.57200811],\n",
      "       [ 18.52129817,   4.47870183],\n",
      "       [157.02839757,  37.97160243]]))\n",
      "chi2_stat identical - True\n",
      "pval      identical - True\n"
     ]
    }
   ],
   "source": [
    "# All five rows restricted to the first two result columns: a 5x2 table\n",
    "compare_computations(observed_base.iloc[:, 0:2].copy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Result 1  Result 2\n",
      "Control        21         7\n",
      "VAR1           65        25\n",
      "chi2_stat = 0.08337370801033595\n",
      "pval = 0.7727764799468594\n",
      "chi2_contingency output = (0.00205882013658177, 0.9638090266779162, 1, array([[20.40677966,  7.59322034],\n",
      "       [65.59322034, 24.40677966]]))\n",
      "chi2_stat identical - False\n",
      "pval      identical - False\n"
     ]
    }
   ],
   "source": [
    "# 2x2 subset of the observed data (one degree of freedom).\n",
    "# NOTE: for one degree of freedom stats.chi2_contingency applies Yates' continuity\n",
    "# correction by default, which the manual formula does not, so the values differ.\n",
    "compare_computations(observed_base.iloc[0:2, 0:2].copy())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:ds-review-ab-tests]",
   "language": "python",
   "name": "conda-env-ds-review-ab-tests-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This notebook is a fork of the data and steps executed in [this nice blog post](http://hamelg.blogspot.de/2015/11/python-for-data-analysis-part-25-chi.html).\n",
	"In particular, the $\\chi^2$ statistic and the $p$-value are computed both \"manually\" and directly with `scipy.stats.chi2_contingency`."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from scipy import stats"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def comp_expected(df):\n",
	"    \"\"\"\n",
	"    Build the expected matrix for a table of observed counts.\n",
	"\n",
	"    Each cell is (row total * column total) / grand total; the result\n",
	"    keeps the index and columns of `df`.\n",
	"    \"\"\"\n",
	"    row_totals = df.sum(axis=1)\n",
	"    col_totals = df.sum(axis=0)\n",
	"    grand_total = df.sum().sum()\n",
	"    expected = np.outer(row_totals, col_totals) / grand_total\n",
	"    return pd.DataFrame(expected, index=df.index, columns=df.columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"def chi_squared_stat(observed, expected):\n",
	"    \"\"\"Pearson's chi-squared statistic: the sum of (observed - expected)^2 / expected over all cells.\"\"\"\n",
	"    return (((observed - expected) ** 2) / expected).sum().sum()\n",
	"\n",
	"\n",
	"def p_value(observed, chi_squared_stat):\n",
	"    \"\"\"\n",
	"    Upper-tail probability of `chi_squared_stat` under a chi-squared distribution\n",
	"    with (rows - 1) * (columns - 1) degrees of freedom, taken from `observed.shape`.\n",
	"    \"\"\"\n",
	"    dof = (observed.shape[0] - 1) * (observed.shape[1] - 1)\n",
	"    # Use the survival function rather than 1 - cdf: for a large statistic the cdf\n",
	"    # rounds to 1.0 and the difference collapses to exactly 0.0.\n",
	"    return stats.chi2.sf(chi_squared_stat, dof)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def compare_computations(observed, correction=True):\n",
	"    \"\"\"\n",
	"    Print the manually computed chi-squared statistic and p-value next to the\n",
	"    output of `stats.chi2_contingency`, and report whether they agree.\n",
	"\n",
	"    `correction` is forwarded to `stats.chi2_contingency`. With SciPy's default\n",
	"    (True), Yates' continuity correction is applied when the table has one degree\n",
	"    of freedom (e.g. 2x2), which the manual formula does not do; pass False to\n",
	"    compare against the uncorrected statistic.\n",
	"    \"\"\"\n",
	"    print(observed)\n",
	"    chi2_stat = chi_squared_stat(observed, comp_expected(observed))\n",
	"    # Reuse the statistic instead of recomputing it (and the expected matrix).\n",
	"    pval = p_value(observed, chi2_stat)\n",
	"    chi2_contingency = stats.chi2_contingency(observed, correction=correction)\n",
	"    print('chi2_stat = {}\\npval = {}\\nchi2_contingency output = {}'.format(\n",
	"        chi2_stat,\n",
	"        pval,\n",
	"        chi2_contingency\n",
	"    ))\n",
	"    print(\"chi2_stat identical - {}\".format(np.isclose(chi2_stat, chi2_contingency[0])))\n",
	"    print(\"pval identical - {}\".format(np.isclose(pval, chi2_contingency[1])))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The observed data here has the form of the result of an A/B/C/D/E test.\n",
	"The null hypothesis is that the results are independent of the test group.\n",
	"The objective is to check whether the null hypothesis can be rejected."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Result 1</th>\n",
	" <th>Result 2</th>\n",
	" <th>Result 3</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>Control</th>\n",
	" <td>21</td>\n",
	" <td>7</td>\n",
	" <td>32</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>VAR1</th>\n",
	" <td>65</td>\n",
	" <td>25</td>\n",
	" <td>64</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>VAR2</th>\n",
	" <td>107</td>\n",
	" <td>50</td>\n",
	" <td>94</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>VAR3</th>\n",
	" <td>15</td>\n",
	" <td>8</td>\n",
	" <td>15</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>VAR4</th>\n",
	" <td>189</td>\n",
	" <td>6</td>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Result 1 Result 2 Result 3\n",
	"Control 21 7 32\n",
	"VAR1 65 25 64\n",
	"VAR2 107 50 94\n",
	"VAR3 15 8 15\n",
	"VAR4 189 6 2"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Observed counts, one row per test group and one column per result\n",
	"observed_base = pd.DataFrame(\n",
	"    data=[\n",
	"        [21, 7, 32],\n",
	"        [65, 25, 64],\n",
	"        [107, 50, 94],\n",
	"        [15, 8, 15],\n",
	"        [189, 6, 2],\n",
	"    ],\n",
	"    index=[\"Control\", \"VAR1\", \"VAR2\", \"VAR3\", \"VAR4\"],\n",
	"    columns=[\"Result 1\", \"Result 2\", \"Result 3\"],\n",
	")\n",
	"observed_base"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Result 1 Result 2 Result 3\n",
	"Control 21 7 32\n",
	"VAR1 65 25 64\n",
	"VAR2 107 50 94\n",
	"VAR3 15 8 15\n",
	"VAR4 189 6 2\n",
	"chi2_stat = 180.49674748377237\n",
	"pval = 0.0\n",
	"chi2_contingency output = (180.4967474837724, 8.09675550680417e-35, 8, array([[ 34.02857143, 8.22857143, 17.74285714],\n",
	" [ 87.34 , 21.12 , 45.54 ],\n",
	" [142.35285714, 34.42285714, 74.22428571],\n",
	" [ 21.55142857, 5.21142857, 11.23714286],\n",
	" [111.72714286, 27.01714286, 58.25571429]]))\n",
	"chi2_stat identical - True\n",
	"pval identical - True\n"
	]
	}
	],
	"source": [
	"# Full 5x3 table of observed data\n",
	"compare_computations(observed_base.copy())"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
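	"Next, we take subsets of the observed data and run the same procedure.\n",
	"For the $2\\times 2$ subset, the \"manually\" generated numbers no longer align with those obtained by `stats.chi2_contingency`.\n",
	"The reason is that `stats.chi2_contingency` applies Yates' continuity correction by default (`correction=True`) whenever the table has one degree of freedom, i.e. for a $2\\times 2$ table; calling it with `correction=False` reproduces the manual values."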
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Result 1 Result 2 Result 3\n",
	"Control 21 7 32\n",
	"VAR1 65 25 64\n",
	"chi2_stat = 2.4949588283163875\n",
	"pval = 0.28722786769299047\n",
	"chi2_contingency output = (2.4949588283163875, 0.2872278676929904, 2, array([[24.11214953, 8.97196262, 26.91588785],\n",
	" [61.88785047, 23.02803738, 69.08411215]]))\n",
	"chi2_stat identical - True\n",
	"pval identical - True\n"
	]
	}
	],
	"source": [
	"# First two rows (Control, VAR1) with all three result columns: a 2x3 table\n",
	"compare_computations(observed_base.head(2).copy())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Result 1 Result 2\n",
	"Control 21 7\n",
	"VAR1 65 25\n",
	"VAR2 107 50\n",
	"VAR3 15 8\n",
	"VAR4 189 6\n",
	"chi2_stat = 56.703292391691704\n",
	"pval = 1.4278023208191826e-11\n",
	"chi2_contingency output = (56.703292391691704, 1.4278052288246689e-11, 4, array([[ 22.54766734, 5.45233266],\n",
	" [ 72.47464503, 17.52535497],\n",
	" [126.42799189, 30.57200811],\n",
	" [ 18.52129817, 4.47870183],\n",
	" [157.02839757, 37.97160243]]))\n",
	"chi2_stat identical - True\n",
	"pval identical - True\n"
	]
	}
	],
	"source": [
	"# All five rows restricted to the first two result columns: a 5x2 table\n",
	"compare_computations(observed_base.iloc[:, 0:2].copy())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Result 1 Result 2\n",
	"Control 21 7\n",
	"VAR1 65 25\n",
	"chi2_stat = 0.08337370801033595\n",
	"pval = 0.7727764799468594\n",
	"chi2_contingency output = (0.00205882013658177, 0.9638090266779162, 1, array([[20.40677966, 7.59322034],\n",
	" [65.59322034, 24.40677966]]))\n",
	"chi2_stat identical - False\n",
	"pval identical - False\n"
	]
	}
	],
	"source": [
	"# 2x2 subset of the observed data (one degree of freedom).\n",
	"# NOTE: for one degree of freedom stats.chi2_contingency applies Yates' continuity\n",
	"# correction by default, which the manual formula does not, so the values differ.\n",
	"compare_computations(observed_base.iloc[0:2, 0:2].copy())"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda env:ds-review-ab-tests]",
	"language": "python",
	"name": "conda-env-ds-review-ab-tests-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}