mintaow/Medium_Workbook_Sample_Size_Calculation(The Naive Formulation).ipynb

## Medium_Workbook_Sample_Size_Calculation(The Naive Formulation).ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Sample Size Estimation Example:\n",
        "## Analyzing Internet Download Speed Difference between South Shore and Lincoln Park in Chicago\n"
      ],
      "metadata": {
        "id": "7lXtrcnx8JZS"
      }
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "VBw8SBou4kRm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Define Functions"
      ],
      "metadata": {
        "id": "TA7P3fVXzIOa"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from numpy.random import normal\n",
        "from scipy import stats\n",
        "from scipy.stats import norm\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "def generate_data(mu, std, sample_size):\n",
        "    '''\n",
        "    generate_data(...) simulates observations of a normally distributed metric.\n",
        "\n",
        "    Input:\n",
        "        mu: float, the mean of the normal distribution\n",
        "        std: float, the standard deviation of the normal distribution\n",
        "        sample_size: int, the number of observations to draw\n",
        "    Output:\n",
        "        numpy array with sample_size random draws\n",
        "    '''\n",
        "    # numpy.random.normal reads the global random state, so a preceding\n",
        "    # np.random.seed(...) call makes the draws reproducible\n",
        "    synthetic_draws = normal(mu, std, sample_size)\n",
        "    return synthetic_draws\n",
        "\n",
        "def get_sample_size(mu_1, std_1, mde_perc_lift, alpha=0.05):\n",
        "    '''\n",
        "    get_sample_size(...) takes in the two sample statistics (mu_1, std_1) of the control group \n",
        "    and two manually-determined inputs (alpha, mde_perc_lift), and returns the minimum required sample size per group, as demonstrated in Figure 6:\n",
        "\n",
        "        n = ceil( 2 * z_(1-alpha)^2 * std_1^2 / mde^2 ),  where mde = mu_1 * mde_perc_lift\n",
        "    \n",
        "    Note that this version of get_sample_size(...) doesn't take the statistical power (1-beta) into account. \n",
        "    It also assumes the standard deviations are the same in the two groups. This assumption is easy to relax by replacing 2*variance with variance_1+variance_2.\n",
        "    \n",
        "    Input:\n",
        "        mu_1: float, the sample mean of the control group (group 1)'s metric\n",
        "        std_1: float, the standard deviation of the control group (group 1)'s metric\n",
        "        mde_perc_lift: float, the minimal detectable effect, or expected lift, as a fraction of mu_1 (e.g. 0.01 for 1%), manually set by the analyst using domain knowledge \n",
        "        alpha: float, the significance level, set to 0.05 by default (assuming a 5% significance level)\n",
        "    Output:\n",
        "        n: int, the minimum required sample size per group (based on single-tail hypothesis testing), determined by the formula in Figure 6. \n",
        "    Raises:\n",
        "        ValueError: if alpha is not strictly between 0 and 1, if std_1 is negative, or if the absolute MDE (mu_1*mde_perc_lift) is zero\n",
        "    '''\n",
        "    if not 0 < alpha < 1:\n",
        "        raise ValueError('alpha must be strictly between 0 and 1, got {0}'.format(alpha))\n",
        "    if std_1 < 0:\n",
        "        raise ValueError('std_1 must be non-negative, got {0}'.format(std_1))\n",
        "    # Multiply mu_1 by the expected percentage MDE to get the MDE in absolute value form\n",
        "    mde = mu_1*mde_perc_lift\n",
        "    if mde == 0:\n",
        "        # A zero effect can never be detected: the formula below would divide by zero\n",
        "        raise ValueError('mu_1*mde_perc_lift must be non-zero')\n",
        "    # Critical value of the one-sided z-test at significance level alpha\n",
        "    z_alpha = norm.ppf(1-alpha)\n",
        "    # np.ceil returns a float; cast it so that n really is an int, as documented\n",
        "    n = int(np.ceil(\n",
        "            2*pow(z_alpha,2)*pow(std_1,2) # Numerator\n",
        "        / pow(mde,2) # Denominator\n",
        "        ))\n",
        "    \n",
        "    print(('In order to detect a change of {0} between groups with the SD of {1},'.format(mde, std_1)))\n",
        "    print(('with significance {0}, we need in each group at least {1:d} subjects.'.format(alpha, n)))\n",
        "    return n\n",
        "    \n",
        "def t_test(data1, data2, alpha=0.05):\n",
        "    '''\n",
        "    t_test(...) runs Welch's t-test on two independent samples and prints the test statistic,\n",
        "    the p-value and the decision at the significance level alpha.\n",
        "\n",
        "    Input:\n",
        "        data1: array-like, the observations of group 1\n",
        "        data2: array-like, the observations of group 2\n",
        "        alpha: float, the significance level used for the decision, set to 0.05 by default\n",
        "    Output:\n",
        "        None, the results are printed\n",
        "    '''\n",
        "    # Use Welch's t-test, assuming the variances for the two groups are not equal\n",
        "    t,p = stats.ttest_ind(data1,data2, equal_var = False)\n",
        "    # Express alpha as a percentage for the messages (0.05 -> '5%')\n",
        "    level = '{0:g}%'.format(alpha*100)\n",
        "    print(\"------------ H0: There is no statistically significant difference in the population mean. ------------\")\n",
        "    print(f\"Independent t-test Statistics: t={round(t,3)}, p={round(p,3)}\")\n",
        "    if p <= alpha:\n",
        "        print(f'------------ We reject the null hypothesis at the {level} significance level ------------')\n",
        "    else:\n",
        "        print(f'------------ We fail to reject the null hypothesis at the {level} significance level ------------')"
      ],
      "metadata": {
        "id": "skUqa6wL4kgL"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Generate Simulated Data to Represent Current Sample (N=10)"
      ],
      "metadata": {
        "id": "P52xkXHAzLVK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Pilot study: only 10 households have been surveyed in each neighbourhood\n",
        "sample_size = 10\n",
        "\n",
        "# South Shore download speeds (synthetic data): Normal(500, 20)\n",
        "np.random.seed(0)\n",
        "group1 = generate_data(500, 20, sample_size)\n",
        "\n",
        "# Lincoln Park download speeds (synthetic data): Normal(515, 20)\n",
        "# NOTE: re-seeding with the same value replays the same random draws,\n",
        "# so group2 is exactly group1 shifted by +15\n",
        "np.random.seed(0)\n",
        "group2 = generate_data(515, 20, sample_size)"
      ],
      "metadata": {
        "id": "9IiTN4AhC95S"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Assemble the pilot data (N=10 per group) into one long table: one row per household\n",
        "df = pd.DataFrame({\n",
        "    'household_id': list(range(1001, 1001 + 2*sample_size)),\n",
        "    'download_speed(mbps)': np.round(np.concatenate([group1, group2]), 2),\n",
        "    'group': ['south_shore']*sample_size + ['lincoln_park']*sample_size,\n",
        "})\n",
        "# Display only: the indexed copy is shown, df itself keeps household_id as a column\n",
        "df.set_index('household_id')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 708
        },
        "id": "LB8QmfWVVRpo",
        "outputId": "2750f8e1-407d-435f-eff6-5d24da4a5462"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "              download_speed(mbps)         group\n",
              "household_id                                    \n",
              "1001                        535.28   south_shore\n",
              "1002                        508.00   south_shore\n",
              "1003                        519.57   south_shore\n",
              "1004                        544.82   south_shore\n",
              "1005                        537.35   south_shore\n",
              "1006                        480.45   south_shore\n",
              "1007                        519.00   south_shore\n",
              "1008                        496.97   south_shore\n",
              "1009                        497.94   south_shore\n",
              "1010                        508.21   south_shore\n",
              "1011                        550.28  lincoln_park\n",
              "1012                        523.00  lincoln_park\n",
              "1013                        534.57  lincoln_park\n",
              "1014                        559.82  lincoln_park\n",
              "1015                        552.35  lincoln_park\n",
              "1016                        495.45  lincoln_park\n",
              "1017                        534.00  lincoln_park\n",
              "1018                        511.97  lincoln_park\n",
              "1019                        512.94  lincoln_park\n",
              "1020                        523.21  lincoln_park"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-8ff78925-3c86-4495-8d24-3c8216f64826\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>download_speed(mbps)</th>\n",
              "      <th>group</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>household_id</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>1001</th>\n",
              "      <td>535.28</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1002</th>\n",
              "      <td>508.00</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1003</th>\n",
              "      <td>519.57</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1004</th>\n",
              "      <td>544.82</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1005</th>\n",
              "      <td>537.35</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1006</th>\n",
              "      <td>480.45</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1007</th>\n",
              "      <td>519.00</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1008</th>\n",
              "      <td>496.97</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1009</th>\n",
              "      <td>497.94</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1010</th>\n",
              "      <td>508.21</td>\n",
              "      <td>south_shore</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1011</th>\n",
              "      <td>550.28</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1012</th>\n",
              "      <td>523.00</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1013</th>\n",
              "      <td>534.57</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1014</th>\n",
              "      <td>559.82</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1015</th>\n",
              "      <td>552.35</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1016</th>\n",
              "      <td>495.45</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1017</th>\n",
              "      <td>534.00</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1018</th>\n",
              "      <td>511.97</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1019</th>\n",
              "      <td>512.94</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1020</th>\n",
              "      <td>523.21</td>\n",
              "      <td>lincoln_park</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ff78925-3c86-4495-8d24-3c8216f64826')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-8ff78925-3c86-4495-8d24-3c8216f64826 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-8ff78925-3c86-4495-8d24-3c8216f64826');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# T-test on Current Sample (N=10): \n",
        "## We fail to detect a significant difference (likely because the sample is too small)"
      ],
      "metadata": {
        "id": "ld-x9QJgzXBX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Split the pilot table back into one Series of download speeds per neighbourhood\n",
        "group1_n10 = df.loc[df['group'].eq('south_shore'), 'download_speed(mbps)'] # South Shore\n",
        "group2_n10 = df.loc[df['group'].eq('lincoln_park'), 'download_speed(mbps)'] # Lincoln Park\n",
        "\n",
        "# Check the sample statistics (N=10)\n",
        "# NOTE: np.std uses ddof=0 by default, i.e. the population formula\n",
        "summary_rows = (\n",
        "    (\"sample mean for group 1: \", np.mean, group1_n10),\n",
        "    (\"sample mean for group 2: \", np.mean, group2_n10),\n",
        "    (\"sample standard deviation for group 1: \", np.std, group1_n10),\n",
        "    (\"sample standard deviation for group 2: \", np.std, group2_n10),\n",
        ")\n",
        "for label, statistic, speeds in summary_rows:\n",
        "    print(label, round(statistic(speeds), 2))\n",
        "print(\"-------------------------\")\n",
        "\n",
        "# t-test the difference in average download speed per household (South Shore vs. Lincoln Park)\n",
        "t_test(group1_n10, group2_n10)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S_3zu97hYN39",
        "outputId": "8a0976a1-f673-429d-cf33-d5d0ca758f00"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "sample mean for group 1:  514.76\n",
            "sample mean for group 2:  529.76\n",
            "sample standard deviation for group 1:  19.34\n",
            "sample standard deviation for group 2:  19.34\n",
            "-------------------------\n",
            "------------ H0: There is no statistically significant difference in the population mean. ------------\n",
            "Independent t-test Statistics: t=-1.645, p=0.117\n",
            "------------ We fail to reject the null hypothesis at the 5% significance level ------------\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Compute the Minimum Required Sample Size `N` (N=77)"
      ],
      "metadata": {
        "id": "9mzR3yZTzdGK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# ==============================\n",
        "# ----------- Test -------------\n",
        "# ==============================\n",
        "# Take the control-group statistics from the pilot sample itself (rounded to 2 decimals,\n",
        "# i.e. 514.76 and 19.34 for the seeded data) instead of re-typing the printed values by hand\n",
        "n = get_sample_size(\n",
        "    mu_1 = round(np.mean(group1_n10), 2), # sample average of households' download speeds in South Shore\n",
        "    std_1 = round(np.std(group1_n10), 2), # standard deviation of households' download speeds in South Shore\n",
        "    mde_perc_lift = 0.01,  # Professor expects there exists at least 1% difference in download speeds\n",
        "    alpha = 0.05  # 5% significance level\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8ETQe7AQE9Fc",
        "outputId": "7835f195-0ff6-4666-d2b9-a65ff70dbb3a"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "In order to detect a change of 5.1476 between groups with the SD of 19.34,\n",
            "with significance 0.05, we need in each group at least 77 subjects.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Collect More Data (or Generate Simulated Data) (N=77)"
      ],
      "metadata": {
        "id": "-9XsmDkazvmm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# We should survey more households to collect more data,\n",
        "# because we need at least 77 observations per group to detect a 1% or larger difference in the metric, if such a difference exists.\n",
        "sample_size_new = 77 # the value returned by get_sample_size(...) above\n",
        "\n",
        "# The 10 pilot households are kept, so only the missing observations are generated.\n",
        "# Adding sample_size_new draws on top of the pilot data would give N=87 per group, not N=77.\n",
        "n_additional = sample_size_new - len(group1_n10)\n",
        "\n",
        "group1_n77 = group1_n10.tolist() + generate_data(mu = 500, std = 20, sample_size = n_additional).tolist()\n",
        "group2_n77 = group2_n10.tolist() + generate_data(mu = 515, std = 20, sample_size = n_additional).tolist()"
      ],
      "metadata": {
        "id": "MN5meO8Dz0YI"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# T-test on Larger Sample (N=77): \n",
        "## We now detect a statistically significant difference"
      ],
      "metadata": {
        "id": "eufyeXRAzoVX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Welch's t-test on the enlarged samples: South Shore (first) vs. Lincoln Park (second)\n",
        "t_test(group1_n77, group2_n77)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a9qeXeNy4kbN",
        "outputId": "dd5fb67e-9171-4347-9067-d11aef78c07e"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "------------ H0: There is no statistically significant difference in the population mean. ------------\n",
            "Independent t-test Statistics: t=-6.693, p=0.0\n",
            "------------ We reject the null hypothesis at the 5% significance level ------------\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Check the sample statistics for the new sample (N=77)\n",
        "# NOTE: np.std uses ddof=0 by default, i.e. the population formula\n",
        "summary_rows = (\n",
        "    (\"sample mean for group 1: \", np.mean, group1_n77),\n",
        "    (\"sample mean for group 2: \", np.mean, group2_n77),\n",
        "    (\"sample standard deviation for group 1: \", np.std, group1_n77),\n",
        "    (\"sample standard deviation for group 2: \", np.std, group2_n77),\n",
        ")\n",
        "for label, statistic, speeds in summary_rows:\n",
        "    print(label, round(statistic(speeds), 2))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YUei0k6u4kY0",
        "outputId": "8d815fcf-80ab-4dee-ef82-49ce23b8d0fb"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "sample mean for group 1:  500.18\n",
            "sample mean for group 2:  520.9\n",
            "sample standard deviation for group 1:  20.68\n",
            "sample standard deviation for group 2:  19.91\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "UVAB7Bef4kPH"
      },
      "execution_count": 8,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"toc_visible": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"source": [
	"# Sample Size Estimation Example:\n",
	"## Analyzing Internet Download Speed Difference between South Shore and Lincoln Park in Chicago\n"
	],
	"metadata": {
	"id": "7lXtrcnx8JZS"
	}
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "VBw8SBou4kRm"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Define Functions"
	],
	"metadata": {
	"id": "TA7P3fVXzIOa"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"from numpy.random import normal\n",
	"from scipy import stats\n",
	"from scipy.stats import norm\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"def generate_data(mu, std, sample_size):\n",
	" '''\n",
	" generate_data(...) generates the synthetic data from the normal distribution \n",
	" with the mean as mu and the standard deviation as std\n",
	" '''\n",
	" return normal(loc = mu, scale = std, size = sample_size) \n",
	"def get_sample_size(mu_1, std_1, mde_perc_lift, alpha=0.05):\n",
	" '''\n",
	" get_sample_size(...) takes in the two sample statistics (mu_1, std_1) of the control group \n",
	" and two manually-determined inputs (alpha, mde_perc_lift), returns the minimum required sample size, as demonstrated in Figure 6.\n",
	" \n",
	" Note that this version of the get_sample_size(...) doesn't evaluate the statistical power (1-beta). \n",
	" Meanwhile, it assumes the standard deviations are the same in the two groups. However, it is easy to release this assumption by replacing 2*variance with variance_1+variance_2\n",
	" \n",
	" Input:\n",
	" mu_1: float, the sample mean of the control group (group 1)'s metric\n",
	" std_1: float, the standard deviation of the control group (group 1)'s metric\n",
	" mde_perc_lift: float, the minimal detectable effect, or expected lift, in percentage form, manually set by the analyst using domain knowledge \n",
	" alpha: float, the significance level and set as 0.05 by default （assuming 5% significance level) \n",
	" Output:\n",
	" n: int, the minimum required sample size (based on the single-tail hypothesis testing), determined by the formula in Figure 6. \n",
	" '''\n",
	" # Multiply mu_1 and the expected percentage MDE, we get the MDE in absolute value form\n",
	" mde = mu_1*mde_perc_lift\n",
	" # \n",
	" n = np.ceil(\n",
	" 2*pow(norm.ppf(1-alpha),2)*pow(std_1,2) # Numerator \n",
	" / pow(mde,2) # Denominator\n",
	" )\n",
	" \n",
	" print(('In order to detect a change of {0} between groups with the SD of {1},'.format(mde, std_1)))\n",
	" print(('with significance {0}, we need in each group at least {1:d} subjects.'.format(alpha, int(n))))\n",
	" return n\n",
	" \n",
	"def t_test(data1, data2): \n",
	" # Use Welch's t-test, assuming the variances for the two groups are not equal\n",
	" t,p = stats.ttest_ind(data1,data2, equal_var = False)\n",
	" print(\"------------ H0: There is no statistically significant difference in the population mean. ------------\")\n",
	" print(f\"Independent t-test Statistics: t={round(t,3)}, p={round(p,3)}\")\n",
	" if p <= 0.05:\n",
	" print('------------ We reject the null hypothesis at the 5% significance level ------------')\n",
	" else:\n",
	" print('------------ We fail to reject the null hypothesis at the 5% significance level ------------')"
	],
	"metadata": {
	"id": "skUqa6wL4kgL"
	},
	"execution_count": 1,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Generate Simulated Data to Represent Current Sample (N=10)"
	],
	"metadata": {
	"id": "P52xkXHAzLVK"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# In the beginning, we only have 10 observations for each group\n",
	"sample_size = 10\n",
	"\n",
	"# create random samples \n",
	"np.random.seed(0)\n",
	"group1 = generate_data(mu = 500, std = 20, sample_size = sample_size) # synthetic data representing South Shore download speeds\n",
	"np.random.seed(0)\n",
	"group2 = generate_data(mu = 515, std = 20, sample_size = sample_size) # synthetic data representing Lincoln Park download speeds"
	],
	"metadata": {
	"id": "9IiTN4AhC95S"
	},
	"execution_count": 2,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Take a look at the data (N=10)\n",
	"df = pd.DataFrame()\n",
	"df['household_id'] = [i+1001 for i in range(sample_size*2)]\n",
	"df['download_speed(mbps)'] = list(np.round(group1,2))+list(np.round(group2,2))\n",
	"df['group'] = ['south_shore']*sample_size+['lincoln_park']*sample_size\n",
	"df.set_index('household_id')"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 708
	},
	"id": "LB8QmfWVVRpo",
	"outputId": "2750f8e1-407d-435f-eff6-5d24da4a5462"
	},
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	" download_speed(mbps) group\n",
	"household_id \n",
	"1001 535.28 south_shore\n",
	"1002 508.00 south_shore\n",
	"1003 519.57 south_shore\n",
	"1004 544.82 south_shore\n",
	"1005 537.35 south_shore\n",
	"1006 480.45 south_shore\n",
	"1007 519.00 south_shore\n",
	"1008 496.97 south_shore\n",
	"1009 497.94 south_shore\n",
	"1010 508.21 south_shore\n",
	"1011 550.28 lincoln_park\n",
	"1012 523.00 lincoln_park\n",
	"1013 534.57 lincoln_park\n",
	"1014 559.82 lincoln_park\n",
	"1015 552.35 lincoln_park\n",
	"1016 495.45 lincoln_park\n",
	"1017 534.00 lincoln_park\n",
	"1018 511.97 lincoln_park\n",
	"1019 512.94 lincoln_park\n",
	"1020 523.21 lincoln_park"
	],
	"text/html": [
	"\n",
	" <div id=\"df-8ff78925-3c86-4495-8d24-3c8216f64826\">\n",
	" <div class=\"colab-df-container\">\n",
	" <div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>download_speed(mbps)</th>\n",
	" <th>group</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>household_id</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>1001</th>\n",
	" <td>535.28</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1002</th>\n",
	" <td>508.00</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1003</th>\n",
	" <td>519.57</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1004</th>\n",
	" <td>544.82</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1005</th>\n",
	" <td>537.35</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1006</th>\n",
	" <td>480.45</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1007</th>\n",
	" <td>519.00</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1008</th>\n",
	" <td>496.97</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1009</th>\n",
	" <td>497.94</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1010</th>\n",
	" <td>508.21</td>\n",
	" <td>south_shore</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1011</th>\n",
	" <td>550.28</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1012</th>\n",
	" <td>523.00</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1013</th>\n",
	" <td>534.57</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1014</th>\n",
	" <td>559.82</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1015</th>\n",
	" <td>552.35</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1016</th>\n",
	" <td>495.45</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1017</th>\n",
	" <td>534.00</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1018</th>\n",
	" <td>511.97</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1019</th>\n",
	" <td>512.94</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1020</th>\n",
	" <td>523.21</td>\n",
	" <td>lincoln_park</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>\n",
	" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ff78925-3c86-4495-8d24-3c8216f64826')\"\n",
	" title=\"Convert this dataframe to an interactive table.\"\n",
	" style=\"display:none;\">\n",
	" \n",
	" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
	" width=\"24px\">\n",
	" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
	" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
	" </svg>\n",
	" </button>\n",
	" \n",
	" <style>\n",
	" .colab-df-container {\n",
	" display:flex;\n",
	" flex-wrap:wrap;\n",
	" gap: 12px;\n",
	" }\n",
	"\n",
	" .colab-df-convert {\n",
	" background-color: #E8F0FE;\n",
	" border: none;\n",
	" border-radius: 50%;\n",
	" cursor: pointer;\n",
	" display: none;\n",
	" fill: #1967D2;\n",
	" height: 32px;\n",
	" padding: 0 0 0 0;\n",
	" width: 32px;\n",
	" }\n",
	"\n",
	" .colab-df-convert:hover {\n",
	" background-color: #E2EBFA;\n",
	" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
	" fill: #174EA6;\n",
	" }\n",
	"\n",
	" [theme=dark] .colab-df-convert {\n",
	" background-color: #3B4455;\n",
	" fill: #D2E3FC;\n",
	" }\n",
	"\n",
	" [theme=dark] .colab-df-convert:hover {\n",
	" background-color: #434B5C;\n",
	" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
	" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
	" fill: #FFFFFF;\n",
	" }\n",
	" </style>\n",
	"\n",
	" <script>\n",
	" const buttonEl =\n",
	" document.querySelector('#df-8ff78925-3c86-4495-8d24-3c8216f64826 button.colab-df-convert');\n",
	" buttonEl.style.display =\n",
	" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
	"\n",
	" async function convertToInteractive(key) {\n",
	" const element = document.querySelector('#df-8ff78925-3c86-4495-8d24-3c8216f64826');\n",
	" const dataTable =\n",
	" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
	" [key], {});\n",
	" if (!dataTable) return;\n",
	"\n",
	" const docLinkHtml = 'Like what you see? Visit the ' +\n",
	" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
	" + ' to learn more about interactive tables.';\n",
	" element.innerHTML = '';\n",
	" dataTable['output_type'] = 'display_data';\n",
	" await google.colab.output.renderOutput(dataTable, element);\n",
	" const docLink = document.createElement('div');\n",
	" docLink.innerHTML = docLinkHtml;\n",
	" element.appendChild(docLink);\n",
	" }\n",
	" </script>\n",
	" </div>\n",
	" </div>\n",
	" "
	]
	},
	"metadata": {},
	"execution_count": 3
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# T-test on Current Sample (N=10): \n",
	"## We fail to detect a significant difference (likely due to insufficient data)"
	],
	"metadata": {
	"id": "ld-x9QJgzXBX"
	}
	},
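	{
	"cell_type": "code",
	"source": [
	"group1_n10 = df.loc[df.group == 'south_shore','download_speed(mbps)'] # South Shore households' download speeds synthetic data\n",
	"group2_n10 = df.loc[df.group == 'lincoln_park','download_speed(mbps)'] # Lincoln Park households' download speeds synthetic data\n",
	"\n",
	"# Check the sample statistics (N=10)\n",
	"# NOTE: np.std defaults to ddof=0 (it divides by N, the population formula), so the values printed below as\n",
	"# 'sample standard deviation' are not the unbiased N-1 estimates; pass ddof=1 to np.std to get those.\n",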
	"print(\"sample mean for group 1: \", round(np.mean(group1_n10),2))\n",
	"print(\"sample mean for group 2: \", round(np.mean(group2_n10),2))\n",
	"print(\"sample standard deviation for group 1: \", round(np.std(group1_n10),2))\n",
	"print(\"sample standard deviation for group 2: \", round(np.std(group2_n10),2))\n",
	"print(\"-------------------------\")\n",
	"\n",
	"# t-test the difference in average download speed per household between South Shore and Lincoln Park\n",
	"t_test(\n",
	" data1 = group1_n10, # South Shore households' download speeds synthetic data\n",
	" data2 = group2_n10 # Lincoln Park households' download speeds synthetic data\n",
	")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "S_3zu97hYN39",
	"outputId": "8a0976a1-f673-429d-cf33-d5d0ca758f00"
	},
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"sample mean for group 1: 514.76\n",
	"sample mean for group 2: 529.76\n",
	"sample standard deviation for group 1: 19.34\n",
	"sample standard deviation for group 2: 19.34\n",
	"-------------------------\n",
	"------------ H0: There is no statistically significant difference in the population mean. ------------\n",
	"Independent t-test Statistics: t=-1.645, p=0.117\n",
	"------------ We fail to reject the null hypothesis at the 5% significance level ------------\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Compute the Minimum Required Sample Size `N` (N=77)"
	],
	"metadata": {
	"id": "9mzR3yZTzdGK"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# ==============================\n",
	"# ----------- Test -------------\n",
	"# ==============================\n",
	"n = get_sample_size(\n",
	" mu_1 = 514.76, # sample average of households' download speeds in South Shore\n",
	" std_1 = 19.34, # sample standard deviation of households' download speeds in South Shore\n",
	" mde_perc_lift = 0.01, # Professor expects there exists at least 1% difference in download speeds\n",
	" alpha = 0.05 # 5% significance level \n",
	")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "8ETQe7AQE9Fc",
	"outputId": "7835f195-0ff6-4666-d2b9-a65ff70dbb3a"
	},
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"In order to detect a change of 5.1476 between groups with the SD of 19.34,\n",
	"with significance 0.05, we need in each group at least 77 subjects.\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Collect More Data (or Generate Simulated Data) (N=77)"
	],
	"metadata": {
	"id": "-9XsmDkazvmm"
	}
	},
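	{
	"cell_type": "code",
	"source": [
	"# We should survey more households to collect more data, because we need at least 77 observations\n",
	"# per group to detect a difference of 1% or more in the metric, if such a difference exists.\n",
	"# NOTE: the 77 new draws below are appended to the original 10 observations, so each group ends up with 87 values.\n",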
	"sample_size_new = 77\n",
	"\n",
	"group1_n77 = group1_n10.tolist() + generate_data(mu = 500, std = 20, sample_size = sample_size_new).tolist()\n",
	"group2_n77 = group2_n10.tolist() + generate_data(mu = 515, std = 20, sample_size = sample_size_new).tolist()"
	],
	"metadata": {
	"id": "MN5meO8Dz0YI"
	},
	"execution_count": 6,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# T-test on Larger Sample (N=77): \n",
	"## We manage to detect a significant difference"
	],
	"metadata": {
	"id": "eufyeXRAzoVX"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# perform the test\n",
	"t_test(\n",
	" data1 = group1_n77, # South Shore households' download speeds synthetic data\n",
	" data2 = group2_n77 # Lincoln Park households' download speeds synthetic data\n",
	")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "a9qeXeNy4kbN",
	"outputId": "dd5fb67e-9171-4347-9067-d11aef78c07e"
	},
	"execution_count": 7,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"------------ H0: There is no statistically significant difference in the population mean. ------------\n",
	"Independent t-test Statistics: t=-6.693, p=0.0\n",
	"------------ We reject the null hypothesis at the 5% significance level ------------\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Check the sample statistics for the new sample (N=77)\n",
	"print(\"sample mean for group 1: \", round(np.mean(group1_n77),2))\n",
	"print(\"sample mean for group 2: \", round(np.mean(group2_n77),2))\n",
	"print(\"sample standard deviation for group 1: \", round(np.std(group1_n77),2))\n",
	"print(\"sample standard deviation for group 2: \", round(np.std(group2_n77),2))"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "YUei0k6u4kY0",
	"outputId": "8d815fcf-80ab-4dee-ef82-49ce23b8d0fb"
	},
	"execution_count": 8,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"sample mean for group 1: 500.18\n",
	"sample mean for group 2: 520.9\n",
	"sample standard deviation for group 1: 20.68\n",
	"sample standard deviation for group 2: 19.91\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "UVAB7Bef4kPH"
	},
	"execution_count": 8,
	"outputs": []
	}
	]
	}