mintaow/Medium_Workbook_Sample_Size_Calculation (The Standard Formulation).ipynb

## Medium_Workbook_Sample_Size_Calculation (The Standard Formulation).ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "from scipy.stats import norm\n",
        "\n",
        "def get_sample_size_with_power(mu_1, std_1, mde, is_absolute_mde, alpha=0.05, beta=0.2):\n",
        "    '''\n",
        "    Compute the minimum sample size required in EACH group of a two-group test.\n",
        "\n",
        "    Formula (the one the original notebook refers to as Figure 6):\n",
        "        n = 2 * (z_(1-alpha/2) + z_(1-beta))^2 * std_1^2 / dmin^2,  with z_q = norm.ppf(q)\n",
        "    The alpha/2 term means the significance level is applied two-sided.\n",
        "    Both groups are assumed to share the standard deviation std_1; to relax this,\n",
        "    replace 2*variance with variance_1+variance_2.\n",
        "\n",
        "    Input:\n",
        "        mu_1: float, the sample mean of the control group (group 1)'s metric;\n",
        "            only used to turn a relative MDE into an absolute one\n",
        "        std_1: float, the standard deviation of the control group (group 1)'s metric\n",
        "        mde: float, the minimal detectable effect (MDE)\n",
        "        is_absolute_mde: bool, True means mde is an absolute difference,\n",
        "            False means it is relative to mu_1 (absolute effect = mu_1*mde)\n",
        "        alpha: float, the significance level, 0.05 by default\n",
        "        beta: float, the type II error rate, 0.2 by default (80% statistical power)\n",
        "    Output:\n",
        "        n: the minimum required sample size per group, as the whole-number\n",
        "            float produced by np.ceil\n",
        "    Raises:\n",
        "        ValueError: if the absolute effect size works out to zero\n",
        "    '''\n",
        "    # Always work with the absolute effect size; a relative MDE is scaled by the baseline mean.\n",
        "    dmin = mde if is_absolute_mde else mu_1*mde\n",
        "    if dmin == 0:\n",
        "        raise ValueError('The minimal detectable effect must be non-zero.')\n",
        "\n",
        "    stat_power = 1-beta\n",
        "\n",
        "    # Standard-normal quantiles for the two-sided significance level and the power.\n",
        "    z_alpha = norm.ppf(1-alpha/2)\n",
        "    z_power = norm.ppf(stat_power)\n",
        "\n",
        "    # Calculate the minimum required sample size (numerator / denominator of the formula above).\n",
        "    n = np.ceil(2*pow(z_alpha+z_power, 2)*pow(std_1, 2) / pow(dmin, 2))\n",
        "\n",
        "    # Report the absolute effect actually used in the calculation.\n",
        "    print(('In order to detect a change of {0:f} between groups with the SD of {1},'.format(dmin, std_1)))\n",
        "    print(('with significance {0} and statistical power {1}, we need in each group at least {2:d} subjects.'.format(alpha, stat_power, int(n))))\n",
        "    return n\n"
      ],
      "metadata": {
        "id": "skUqa6wL4kgL"
      },
      "execution_count": 36,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Sanity check of get_sample_size_with_power on a proportion-style metric:\n",
        "# base_cvr is presumably a baseline conversion rate; the MDE is given in absolute terms.\n",
        "base_cvr = 0.2\n",
        "base_std = np.sqrt(base_cvr*(1-base_cvr))  # sqrt(p*(1-p)), the SD of a 0/1 metric with mean p\n",
        "N = get_sample_size_with_power(\n",
        "    mu_1=base_cvr,\n",
        "    std_1=base_std,\n",
        "    mde=0.01,\n",
        "    is_absolute_mde=True,\n",
        "    alpha=0.05,\n",
        "    beta=0.2,\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "w6WJEpNsK1R1",
        "outputId": "abf92601-83b5-4d88-8297-926f18cc2663"
      },
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "In order to detect a change of 0.010000 between groups with the SD of 0.4,\n",
            "with significance 0.05 and statistical power 0.8, we need in each group at least 25117 subjects.\n"
          ]
        }
      ]
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"toc_visible": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "code",
	"source": [
	"from scipy.stats import norm\n",
	"\n",
	"def get_sample_size_with_power(mu_1, std_1, mde, is_absolute_mde, alpha=0.05, beta=0.2):\n",
	"    '''\n",
	"    Compute the minimum sample size required in EACH group of a two-group test.\n",
	"\n",
	"    Formula (the one the original notebook refers to as Figure 6):\n",
	"        n = 2 * (z_(1-alpha/2) + z_(1-beta))^2 * std_1^2 / dmin^2,  with z_q = norm.ppf(q)\n",
	"    The alpha/2 term means the significance level is applied two-sided.\n",
	"    Both groups are assumed to share the standard deviation std_1; to relax this,\n",
	"    replace 2*variance with variance_1+variance_2.\n",
	"\n",
	"    Input:\n",
	"        mu_1: float, the sample mean of the control group (group 1)'s metric;\n",
	"            only used to turn a relative MDE into an absolute one\n",
	"        std_1: float, the standard deviation of the control group (group 1)'s metric\n",
	"        mde: float, the minimal detectable effect (MDE)\n",
	"        is_absolute_mde: bool, True means mde is an absolute difference,\n",
	"            False means it is relative to mu_1 (absolute effect = mu_1*mde)\n",
	"        alpha: float, the significance level, 0.05 by default\n",
	"        beta: float, the type II error rate, 0.2 by default (80% statistical power)\n",
	"    Output:\n",
	"        n: the minimum required sample size per group, as the whole-number\n",
	"            float produced by np.ceil\n",
	"    Raises:\n",
	"        ValueError: if the absolute effect size works out to zero\n",
	"    '''\n",
	"    # Local import: this cell only imports scipy, so numpy is brought in here.\n",
	"    import numpy as np\n",
	"\n",
	"    # Always work with the absolute effect size; a relative MDE is scaled by the baseline mean.\n",
	"    dmin = mde if is_absolute_mde else mu_1*mde\n",
	"    if dmin == 0:\n",
	"        raise ValueError('The minimal detectable effect must be non-zero.')\n",
	"\n",
	"    stat_power = 1-beta\n",
	"\n",
	"    # Standard-normal quantiles for the two-sided significance level and the power.\n",
	"    z_alpha = norm.ppf(1-alpha/2)\n",
	"    z_power = norm.ppf(stat_power)\n",
	"\n",
	"    # Calculate the minimum required sample size (numerator / denominator of the formula above).\n",
	"    n = np.ceil(2*pow(z_alpha+z_power, 2)*pow(std_1, 2) / pow(dmin, 2))\n",
	"\n",
	"    # Report the absolute effect actually used in the calculation.\n",
	"    print(('In order to detect a change of {0:f} between groups with the SD of {1},'.format(dmin, std_1)))\n",
	"    print(('with significance {0} and statistical power {1}, we need in each group at least {2:d} subjects.'.format(alpha, stat_power, int(n))))\n",
	"    return n\n"
	],
	"metadata": {
	"id": "skUqa6wL4kgL"
	},
	"execution_count": 36,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Sanity check of get_sample_size_with_power on a proportion-style metric:\n",
	"# base_cvr is presumably a baseline conversion rate; the MDE is given in absolute terms.\n",
	"import numpy as np  # np.sqrt is used below and nothing else in this notebook imports numpy\n",
	"\n",
	"base_cvr = 0.2\n",
	"base_std = np.sqrt(base_cvr*(1-base_cvr))  # sqrt(p*(1-p)), the SD of a 0/1 metric with mean p\n",
	"N = get_sample_size_with_power(\n",
	"    mu_1=base_cvr,\n",
	"    std_1=base_std,\n",
	"    mde=0.01,\n",
	"    is_absolute_mde=True,\n",
	"    alpha=0.05,\n",
	"    beta=0.2,\n",
	")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "w6WJEpNsK1R1",
	"outputId": "abf92601-83b5-4d88-8297-926f18cc2663"
	},
	"execution_count": 37,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"In order to detect a change of 0.010000 between groups with the SD of 0.4,\n",
	"with significance 0.05 and statistical power 0.8, we need in each group at least 25117 subjects.\n"
	]
	}
	]
	}
	]
	}