tonicanada/20220409_understanding_clt_and_ttest_part6.ipynb

## 20220409_understanding_clt_and_ttest_part6.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_histograms_sample_mean_diff(m, n_min, n_max, \n",
    "                                     n_step, df, variable):\n",
    "    \"\"\"\n",
    "    Plot \"(n_max-n_min)/n_step\" histograms for the \n",
    "    difference between the means of 2 random samples of\n",
    "    different sizes. On top of histogram will also \n",
    "    plot the corresponding distribution curve.\n",
    "    \n",
    "    Parameters\n",
    "    ----------\n",
    "    m: int\n",
    "        Number of samples to take.\n",
    "    m_min: int\n",
    "        Minimun sample 1 size\n",
    "    n_max: int\n",
    "        Maximum sample 1 size\n",
    "    n_step: int\n",
    "        Difference between sample sizes\n",
    "    df: pandas dataframe\n",
    "        Dataframe where each column is a dataset\n",
    "    variable: str\n",
    "         Pandas dataframe column (in our example can be one \n",
    "         of following: 'normal', 'uniform', 'binomial')\n",
    "    \"\"\"\n",
    "    \n",
    "    diff_mean_dict = {}\n",
    "    std_sample1_dict = {}\n",
    "    std_sample2_dict = {}\n",
    "    \n",
    "    params_dict =  {\n",
    "        \"mean\": df[variable].mean(),\n",
    "        \"std\": df[variable].std(),\n",
    "        \"median\": df[variable].median(),\n",
    "    }\n",
    "\n",
    "    for n in range(n_min, n_max, n_step):\n",
    "        for x in range(m):\n",
    "            #We are going to generate the difference between the \n",
    "            #mean of 2 samples. First sample with size = n, \n",
    "            #second sample with size round(n/4)\n",
    "            \n",
    "            n1 = n\n",
    "            n2 = round(n/4)\n",
    "            df_sample_1 = df.sample(n=n1)[variable]\n",
    "            df_sample_2 = df.sample(n=n2)[variable]\n",
    "            if (n in diff_mean_dict.keys()):\n",
    "                diff_mean_dict[n].append(df_sample_1.mean()-df_sample_2.mean())\n",
    "                std_sample1_dict[n].append(df_sample_1.std())\n",
    "                std_sample2_dict[n].append(df_sample_2.std())\n",
    "            else:\n",
    "                diff_mean_dict[n] = [df_sample_1.mean() - df_sample_2.mean()]\n",
    "                std_sample1_dict[n] = [df_sample_1.std()]\n",
    "                std_sample2_dict[n] = [df_sample_2.std()]\n",
    "\n",
    "        df_diff_means = pd.DataFrame.from_dict(diff_mean_dict)\n",
    "        df_sample1_std = pd.DataFrame.from_dict(std_sample1_dict)\n",
    "        df_sample2_std = pd.DataFrame.from_dict(std_sample2_dict)\n",
    "\n",
    "    fig, ax = plt.subplots(1, \n",
    "                           int(np.ceil((n_max-n_min)/n_step)), \n",
    "                           sharex='col', \n",
    "                           sharey='row', \n",
    "                           figsize=(20, 3))\n",
    "    \n",
    "    fig.suptitle(f\"\"\"Histograms for {m} (difference between 2 samples) mean:\n",
    "                sample_1 size between {n_min} and {n_max}\n",
    "                sample_2 size between {round(n_min/4)} and {round(n_max/4)}\n",
    "                for variable with {variable} distribution\"\"\", \n",
    "                 fontsize=16, \n",
    "                 y = 1.45)\n",
    "\n",
    "    for i in range(len(df_diff_means.columns)):\n",
    "        df_diff_means.hist(column=df_diff_means.columns[i], \n",
    "                           ax=ax[i], \n",
    "                           alpha=0.5, \n",
    "                           color='red', \n",
    "                           bins = 30)\n",
    "        ax[i].set_xlim((-1.5*params_dict['std'],\n",
    "                        1.5*params_dict['std']))\n",
    "        ax[i].tick_params(axis='both', which='major', labelsize=12)\n",
    "        ax[i].set_title(f\"n1 = {df_diff_means.columns[i]} \\n n2 = {round(df_diff_means.columns[i]/4)}\",\n",
    "                       fontsize=16)\n",
    "        \n",
    "        #Here we compute the standard error for this kind of test\n",
    "        n1 = df_sample1_std.columns[i]\n",
    "        n2 = round(n1/4)\n",
    "        s1 = df_sample1_std[df_sample1_std.columns[i]].mean()   \n",
    "        s2 = df_sample2_std[df_sample2_std.columns[i]].mean()\n",
    "        sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2)/(n1+n2-2))\n",
    "        std_error = sp*np.sqrt((1/n1) + (1/n2))\n",
    "        \n",
    "        #Here we plot the distribution curve\n",
    "        x = np.linspace(-6*params_dict['std'], \n",
    "                        6*params_dict['std'], \n",
    "                        1000)\n",
    "        y_normcurve_2 = norm.pdf(x, 0, std_error)\n",
    "        ax[i].plot(x, y_normcurve_2, 'b--', linewidth=2)\n",
    "\n",
    "#Here we exetute the following example:\n",
    "plot_histograms_sample_mean_diff(300, 10, 100, 20, df, 'normal')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 68,
	"metadata": {},
	"outputs": [],
	"source": [
	"def plot_histograms_sample_mean_diff(m, n_min, n_max, \n",
	" n_step, df, variable):\n",
	" \"\"\"\n",
	" Plot \"(n_max-n_min)/n_step\" histograms for the \n",
	" difference between the means of 2 random samples of\n",
	" different sizes. On top of histogram will also \n",
	" plot the corresponding distribution curve.\n",
	" \n",
	" Parameters\n",
	" ----------\n",
	" m: int\n",
	" Number of samples to take.\n",
	" m_min: int\n",
	" Minimun sample 1 size\n",
	" n_max: int\n",
	" Maximum sample 1 size\n",
	" n_step: int\n",
	" Difference between sample sizes\n",
	" df: pandas dataframe\n",
	" Dataframe where each column is a dataset\n",
	" variable: str\n",
	" Pandas dataframe column (in our example can be one \n",
	" of following: 'normal', 'uniform', 'binomial')\n",
	" \"\"\"\n",
	" \n",
	" diff_mean_dict = {}\n",
	" std_sample1_dict = {}\n",
	" std_sample2_dict = {}\n",
	" \n",
	" params_dict = {\n",
	" \"mean\": df[variable].mean(),\n",
	" \"std\": df[variable].std(),\n",
	" \"median\": df[variable].median(),\n",
	" }\n",
	"\n",
	" for n in range(n_min, n_max, n_step):\n",
	" for x in range(m):\n",
	" #We are going to generate the difference between the \n",
	" #mean of 2 samples. First sample with size = n, \n",
	" #second sample with size round(n/4)\n",
	" \n",
	" n1 = n\n",
	" n2 = round(n/4)\n",
	" df_sample_1 = df.sample(n=n1)[variable]\n",
	" df_sample_2 = df.sample(n=n2)[variable]\n",
	" if (n in diff_mean_dict.keys()):\n",
	" diff_mean_dict[n].append(df_sample_1.mean()-df_sample_2.mean())\n",
	" std_sample1_dict[n].append(df_sample_1.std())\n",
	" std_sample2_dict[n].append(df_sample_2.std())\n",
	" else:\n",
	" diff_mean_dict[n] = [df_sample_1.mean() - df_sample_2.mean()]\n",
	" std_sample1_dict[n] = [df_sample_1.std()]\n",
	" std_sample2_dict[n] = [df_sample_2.std()]\n",
	"\n",
	" df_diff_means = pd.DataFrame.from_dict(diff_mean_dict)\n",
	" df_sample1_std = pd.DataFrame.from_dict(std_sample1_dict)\n",
	" df_sample2_std = pd.DataFrame.from_dict(std_sample2_dict)\n",
	"\n",
	" fig, ax = plt.subplots(1, \n",
	" int(np.ceil((n_max-n_min)/n_step)), \n",
	" sharex='col', \n",
	" sharey='row', \n",
	" figsize=(20, 3))\n",
	" \n",
	" fig.suptitle(f\"\"\"Histograms for {m} (difference between 2 samples) mean:\n",
	" sample_1 size between {n_min} and {n_max}\n",
	" sample_2 size between {round(n_min/4)} and {round(n_max/4)}\n",
	" for variable with {variable} distribution\"\"\", \n",
	" fontsize=16, \n",
	" y = 1.45)\n",
	"\n",
	" for i in range(len(df_diff_means.columns)):\n",
	" df_diff_means.hist(column=df_diff_means.columns[i], \n",
	" ax=ax[i], \n",
	" alpha=0.5, \n",
	" color='red', \n",
	" bins = 30)\n",
	" ax[i].set_xlim((-1.5*params_dict['std'],\n",
	" 1.5*params_dict['std']))\n",
	" ax[i].tick_params(axis='both', which='major', labelsize=12)\n",
	" ax[i].set_title(f\"n1 = {df_diff_means.columns[i]} \\n n2 = {round(df_diff_means.columns[i]/4)}\",\n",
	" fontsize=16)\n",
	" \n",
	" #Here we compute the standard error for this kind of test\n",
	" n1 = df_sample1_std.columns[i]\n",
	" n2 = round(n1/4)\n",
	" s1 = df_sample1_std[df_sample1_std.columns[i]].mean() \n",
	" s2 = df_sample2_std[df_sample2_std.columns[i]].mean()\n",
	" sp = np.sqrt(((n1-1)s12 + (n2-1)s2**2)/(n1+n2-2))\n",
	" std_error = sp*np.sqrt((1/n1) + (1/n2))\n",
	" \n",
	" #Here we plot the distribution curve\n",
	" x = np.linspace(-6*params_dict['std'], \n",
	" 6*params_dict['std'], \n",
	" 1000)\n",
	" y_normcurve_2 = norm.pdf(x, 0, std_error)\n",
	" ax[i].plot(x, y_normcurve_2, 'b--', linewidth=2)\n",
	"\n",
	"#Here we exetute the following example:\n",
	"plot_histograms_sample_mean_diff(300, 10, 100, 20, df, 'normal')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}