Last active
August 7, 2023 18:18
-
-
Save mintaow/30506b7c2ae54da5d13623a3e4595beb to your computer and use it in GitHub Desktop.
Example with Code: Offline Sample Size Estimation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"toc_visible": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Sample Size Estimation Example:\n", | |
"## Analyzing Internet Download Speed Difference between South Shore and Lincoln Park in Chicago\n" | |
], | |
"metadata": { | |
"id": "7lXtrcnx8JZS" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "VBw8SBou4kRm" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Define Functions" | |
], | |
"metadata": { | |
"id": "TA7P3fVXzIOa" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from numpy.random import normal\n", | |
"from scipy import stats\n", | |
"from scipy.stats import norm\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"def generate_data(mu, std, sample_size):\n", | |
" '''\n", | |
" generate_data(...) generates the synthetic data from the normal distribution \n", | |
" with the mean as mu and the standard deviation as std\n", | |
" '''\n", | |
" return normal(loc = mu, scale = std, size = sample_size) \n", | |
"def get_sample_size(mu_1, std_1, mde_perc_lift, alpha=0.05):\n", | |
" '''\n", | |
" get_sample_size(...) takes in the two sample statistics (mu_1, std_1) of the control group \n", | |
" and two manually-determined inputs (alpha, mde_perc_lift), returns the minimum required sample size, as demonstrated in Figure 6.\n", | |
" \n", | |
" Note that this version of the get_sample_size(...) doesn't evaluate the statistical power (1-beta). \n", | |
" It also assumes the standard deviations are the same in the two groups. This assumption is easy to relax by replacing 2*variance with variance_1+variance_2.\n", | |
" \n", | |
" Input:\n", | |
" mu_1: float, the sample mean of the control group (group 1)'s metric\n", | |
" std_1: float, the standard deviation of the control group (group 1)'s metric\n", | |
" mde_perc_lift: float, the minimal detectable effect, or expected lift, expressed as a fraction of mu_1 (e.g. 0.01 for a 1% lift), manually set by the analyst using domain knowledge \n", | |
" alpha: float, the significance level and set as 0.05 by default (assuming 5% significance level) \n", | |
" Output:\n", | |
" n: float, the minimum required sample size per group, rounded up to a whole number by np.ceil (based on a one-tailed hypothesis test), determined by the formula in Figure 6. \n", | |
" '''\n", | |
" # Multiply mu_1 by the expected relative lift to get the MDE in absolute terms\n", | |
" mde = mu_1*mde_perc_lift\n", | |
" # Per-group sample size: 2 * z_(1-alpha)^2 * std_1^2 / mde^2, rounded up to the next whole number\n", | |
" n = np.ceil(\n", | |
" 2*pow(norm.ppf(1-alpha),2)*pow(std_1,2) # Numerator \n", | |
" / pow(mde,2) # Denominator\n", | |
" )\n", | |
" \n", | |
" print(('In order to detect a change of {0} between groups with the SD of {1},'.format(mde, std_1)))\n", | |
" print(('with significance {0}, we need in each group at least {1:d} subjects.'.format(alpha, int(n))))\n", | |
" return n\n", | |
" \n", | |
"def t_test(data1, data2): \n", | |
" # Use Welch's t-test, assuming the variances for the two groups are not equal\n", | |
" t,p = stats.ttest_ind(data1,data2, equal_var = False)\n", | |
" print(\"------------ H0: There is no statistically significant difference in the population mean. ------------\")\n", | |
" print(f\"Independent t-test Statistics: t={round(t,3)}, p={round(p,3)}\")\n", | |
" if p <= 0.05:\n", | |
" print('------------ We reject the null hypothesis at the 5% significance level ------------')\n", | |
" else:\n", | |
" print('------------ We fail to reject the null hypothesis at the 5% significance level ------------')" | |
], | |
"metadata": { | |
"id": "skUqa6wL4kgL" | |
}, | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Generate Simulated Data to Represent Current Sample (N=10)" | |
], | |
"metadata": { | |
"id": "P52xkXHAzLVK" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# In the beginning, we only have 10 observations for each group\n", | |
"sample_size = 10\n", | |
"\n", | |
"# create random samples \n", | |
"np.random.seed(0)\n", | |
"group1 = generate_data(mu = 500, std = 20, sample_size = sample_size) # synthetic data representing South Shore download speeds\n", | |
"np.random.seed(0)\n", | |
"group2 = generate_data(mu = 515, std = 20, sample_size = sample_size) # synthetic data representing Lincoln Park download speeds" | |
], | |
"metadata": { | |
"id": "9IiTN4AhC95S" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Take a look at the data (N=10)\n", | |
"df = pd.DataFrame()\n", | |
"df['household_id'] = [i+1001 for i in range(sample_size*2)]\n", | |
"df['download_speed(mbps)'] = list(np.round(group1,2))+list(np.round(group2,2))\n", | |
"df['group'] = ['south_shore']*sample_size+['lincoln_park']*sample_size\n", | |
"df.set_index('household_id')" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 708 | |
}, | |
"id": "LB8QmfWVVRpo", | |
"outputId": "2750f8e1-407d-435f-eff6-5d24da4a5462" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
" download_speed(mbps) group\n", | |
"household_id \n", | |
"1001 535.28 south_shore\n", | |
"1002 508.00 south_shore\n", | |
"1003 519.57 south_shore\n", | |
"1004 544.82 south_shore\n", | |
"1005 537.35 south_shore\n", | |
"1006 480.45 south_shore\n", | |
"1007 519.00 south_shore\n", | |
"1008 496.97 south_shore\n", | |
"1009 497.94 south_shore\n", | |
"1010 508.21 south_shore\n", | |
"1011 550.28 lincoln_park\n", | |
"1012 523.00 lincoln_park\n", | |
"1013 534.57 lincoln_park\n", | |
"1014 559.82 lincoln_park\n", | |
"1015 552.35 lincoln_park\n", | |
"1016 495.45 lincoln_park\n", | |
"1017 534.00 lincoln_park\n", | |
"1018 511.97 lincoln_park\n", | |
"1019 512.94 lincoln_park\n", | |
"1020 523.21 lincoln_park" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-8ff78925-3c86-4495-8d24-3c8216f64826\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>download_speed(mbps)</th>\n", | |
" <th>group</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>household_id</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1001</th>\n", | |
" <td>535.28</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1002</th>\n", | |
" <td>508.00</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1003</th>\n", | |
" <td>519.57</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1004</th>\n", | |
" <td>544.82</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1005</th>\n", | |
" <td>537.35</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1006</th>\n", | |
" <td>480.45</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1007</th>\n", | |
" <td>519.00</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1008</th>\n", | |
" <td>496.97</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1009</th>\n", | |
" <td>497.94</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1010</th>\n", | |
" <td>508.21</td>\n", | |
" <td>south_shore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1011</th>\n", | |
" <td>550.28</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1012</th>\n", | |
" <td>523.00</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1013</th>\n", | |
" <td>534.57</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1014</th>\n", | |
" <td>559.82</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1015</th>\n", | |
" <td>552.35</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1016</th>\n", | |
" <td>495.45</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1017</th>\n", | |
" <td>534.00</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1018</th>\n", | |
" <td>511.97</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1019</th>\n", | |
" <td>512.94</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1020</th>\n", | |
" <td>523.21</td>\n", | |
" <td>lincoln_park</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ff78925-3c86-4495-8d24-3c8216f64826')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-8ff78925-3c86-4495-8d24-3c8216f64826 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-8ff78925-3c86-4495-8d24-3c8216f64826');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 3 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# T-test on Current Sample (N=10): \n", | |
"## We fail to detect significant differences (likely due to the lack of data)" | |
], | |
"metadata": { | |
"id": "ld-x9QJgzXBX" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"group1_n10 = df.loc[df.group == 'south_shore','download_speed(mbps)'] # South Shore households' download speeds synthetic data\n", | |
"group2_n10 = df.loc[df.group == 'lincoln_park','download_speed(mbps)'] # Lincoln Park households' download speeds synthetic data\n", | |
"\n", | |
"# Check the sample statistics (N=10)\n", | |
"print(\"sample mean for group 1: \", round(np.mean(group1_n10),2))\n", | |
"print(\"sample mean for group 2: \", round(np.mean(group2_n10),2))\n", | |
"print(\"sample standard deviation for group 1: \", round(np.std(group1_n10),2))\n", | |
"print(\"sample standard deviation for group 2: \", round(np.std(group2_n10),2))\n", | |
"print(\"-------------------------\")\n", | |
"\n", | |
"# t-test the difference in average download speed per household between South Shore and Lincoln Park\n", | |
"t_test(\n", | |
" data1 = group1_n10, # South Shore households' download speeds synthetic data\n", | |
" data2 = group2_n10 # Lincoln Park households' download speeds synthetic data\n", | |
")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "S_3zu97hYN39", | |
"outputId": "8a0976a1-f673-429d-cf33-d5d0ca758f00" | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"sample mean for group 1: 514.76\n", | |
"sample mean for group 2: 529.76\n", | |
"sample standard deviation for group 1: 19.34\n", | |
"sample standard deviation for group 2: 19.34\n", | |
"-------------------------\n", | |
"------------ H0: There is no statistically significant difference in the population mean. ------------\n", | |
"Independent t-test Statistics: t=-1.645, p=0.117\n", | |
"------------ We fail to reject the null hypothesis at the 5% significance level ------------\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Compute the Minimum Required Sample Size `N` (N=77)" | |
], | |
"metadata": { | |
"id": "9mzR3yZTzdGK" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# ==============================\n", | |
"# ----------- Test -------------\n", | |
"# ==============================\n", | |
"n = get_sample_size(\n", | |
" mu_1 = 514.76, # sample average of households' download speeds in South Shore\n", | |
" std_1 = 19.34, # sample standard deviation of households' download speeds in South Shore\n", | |
" mde_perc_lift = 0.01, # Professor expects there exists at least 1% difference in download speeds\n", | |
" alpha = 0.05 # 5% significance level \n", | |
")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "8ETQe7AQE9Fc", | |
"outputId": "7835f195-0ff6-4666-d2b9-a65ff70dbb3a" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"In order to detect a change of 5.1476 between groups with the SD of 19.34,\n", | |
"with significance 0.05, we need in each group at least 77 subjects.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Collect More Data (or Generate Simulated Data) (N=77)" | |
], | |
"metadata": { | |
"id": "-9XsmDkazvmm" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# we should survey more households to collect more data \n", | |
"# because we need at least 77 observations per group to detect a difference of 1% or more in the metric, if such a difference exists.\n", | |
"sample_size_new = 77\n", | |
"\n", | |
"group1_n77 = group1_n10.tolist() + generate_data(mu = 500, std = 20, sample_size = sample_size_new).tolist()\n", | |
"group2_n77 = group2_n10.tolist() + generate_data(mu = 515, std = 20, sample_size = sample_size_new).tolist()" | |
], | |
"metadata": { | |
"id": "MN5meO8Dz0YI" | |
}, | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# T-test on Larger Sample (10 original + 77 new observations per group): \n", | |
"## We manage to detect a significant difference" | |
], | |
"metadata": { | |
"id": "eufyeXRAzoVX" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# perform the test\n", | |
"t_test(\n", | |
" data1 = group1_n77, # South Shore households' download speeds synthetic data\n", | |
" data2 = group2_n77 # Lincoln Park households' download speeds synthetic data\n", | |
")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "a9qeXeNy4kbN", | |
"outputId": "dd5fb67e-9171-4347-9067-d11aef78c07e" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"------------ H0: There is no statistically significant difference in the population mean. ------------\n", | |
"Independent t-test Statistics: t=-6.693, p=0.0\n", | |
"------------ We reject the null hypothesis at the 5% significance level ------------\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Check the sample statistics for the enlarged sample (10 original + 77 new observations per group)\n", | |
"print(\"sample mean for group 1: \", round(np.mean(group1_n77),2))\n", | |
"print(\"sample mean for group 2: \", round(np.mean(group2_n77),2))\n", | |
"print(\"sample standard deviation for group 1: \", round(np.std(group1_n77),2))\n", | |
"print(\"sample standard deviation for group 2: \", round(np.std(group2_n77),2))" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YUei0k6u4kY0", | |
"outputId": "8d815fcf-80ab-4dee-ef82-49ce23b8d0fb" | |
}, | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"sample mean for group 1: 500.18\n", | |
"sample mean for group 2: 520.9\n", | |
"sample standard deviation for group 1: 20.68\n", | |
"sample standard deviation for group 2: 19.91\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "UVAB7Bef4kPH" | |
}, | |
"execution_count": 8, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment