Last active
April 16, 2022 00:03
-
-
Save tonicanada/d8005b0f87097e24bbf24bfff817bee5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def plot_histograms_sample_mean_diff(m, n_min, n_max, \n", | |
" n_step, df, variable):\n", | |
" \"\"\"\n", | |
" Plot \"(n_max-n_min)/n_step\" histograms for the \n", | |
" difference between the means of 2 random samples of\n", | |
" different sizes. On top of histogram will also \n", | |
" plot the corresponding distribution curve.\n", | |
" \n", | |
" Parameters\n", | |
" ----------\n", | |
" m: int\n", | |
" Number of samples to take.\n", | |
" m_min: int\n", | |
" Minimun sample 1 size\n", | |
" n_max: int\n", | |
" Maximum sample 1 size\n", | |
" n_step: int\n", | |
" Difference between sample sizes\n", | |
" df: pandas dataframe\n", | |
" Dataframe where each column is a dataset\n", | |
" variable: str\n", | |
" Pandas dataframe column (in our example can be one \n", | |
" of following: 'normal', 'uniform', 'binomial')\n", | |
" \"\"\"\n", | |
" \n", | |
" diff_mean_dict = {}\n", | |
" std_sample1_dict = {}\n", | |
" std_sample2_dict = {}\n", | |
" \n", | |
" params_dict = {\n", | |
" \"mean\": df[variable].mean(),\n", | |
" \"std\": df[variable].std(),\n", | |
" \"median\": df[variable].median(),\n", | |
" }\n", | |
"\n", | |
" for n in range(n_min, n_max, n_step):\n", | |
" for x in range(m):\n", | |
" #We are going to generate the difference between the \n", | |
" #mean of 2 samples. First sample with size = n, \n", | |
" #second sample with size round(n/4)\n", | |
" \n", | |
" n1 = n\n", | |
" n2 = round(n/4)\n", | |
" df_sample_1 = df.sample(n=n1)[variable]\n", | |
" df_sample_2 = df.sample(n=n2)[variable]\n", | |
" if (n in diff_mean_dict.keys()):\n", | |
" diff_mean_dict[n].append(df_sample_1.mean()-df_sample_2.mean())\n", | |
" std_sample1_dict[n].append(df_sample_1.std())\n", | |
" std_sample2_dict[n].append(df_sample_2.std())\n", | |
" else:\n", | |
" diff_mean_dict[n] = [df_sample_1.mean() - df_sample_2.mean()]\n", | |
" std_sample1_dict[n] = [df_sample_1.std()]\n", | |
" std_sample2_dict[n] = [df_sample_2.std()]\n", | |
"\n", | |
" df_diff_means = pd.DataFrame.from_dict(diff_mean_dict)\n", | |
" df_sample1_std = pd.DataFrame.from_dict(std_sample1_dict)\n", | |
" df_sample2_std = pd.DataFrame.from_dict(std_sample2_dict)\n", | |
"\n", | |
" fig, ax = plt.subplots(1, \n", | |
" int(np.ceil((n_max-n_min)/n_step)), \n", | |
" sharex='col', \n", | |
" sharey='row', \n", | |
" figsize=(20, 3))\n", | |
" \n", | |
" fig.suptitle(f\"\"\"Histograms for {m} (difference between 2 samples) mean:\n", | |
" sample_1 size between {n_min} and {n_max}\n", | |
" sample_2 size between {round(n_min/4)} and {round(n_max/4)}\n", | |
" for variable with {variable} distribution\"\"\", \n", | |
" fontsize=16, \n", | |
" y = 1.45)\n", | |
"\n", | |
" for i in range(len(df_diff_means.columns)):\n", | |
" df_diff_means.hist(column=df_diff_means.columns[i], \n", | |
" ax=ax[i], \n", | |
" alpha=0.5, \n", | |
" color='red', \n", | |
" bins = 30)\n", | |
" ax[i].set_xlim((-1.5*params_dict['std'],\n", | |
" 1.5*params_dict['std']))\n", | |
" ax[i].tick_params(axis='both', which='major', labelsize=12)\n", | |
" ax[i].set_title(f\"n1 = {df_diff_means.columns[i]} \\n n2 = {round(df_diff_means.columns[i]/4)}\",\n", | |
" fontsize=16)\n", | |
" \n", | |
" #Here we compute the standard error for this kind of test\n", | |
" n1 = df_sample1_std.columns[i]\n", | |
" n2 = round(n1/4)\n", | |
" s1 = df_sample1_std[df_sample1_std.columns[i]].mean() \n", | |
" s2 = df_sample2_std[df_sample2_std.columns[i]].mean()\n", | |
" sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2)/(n1+n2-2))\n", | |
" std_error = sp*np.sqrt((1/n1) + (1/n2))\n", | |
" \n", | |
" #Here we plot the distribution curve\n", | |
" x = np.linspace(-6*params_dict['std'], \n", | |
" 6*params_dict['std'], \n", | |
" 1000)\n", | |
" y_normcurve_2 = norm.pdf(x, 0, std_error)\n", | |
" ax[i].plot(x, y_normcurve_2, 'b--', linewidth=2)\n", | |
"\n", | |
"#Here we exetute the following example:\n", | |
"plot_histograms_sample_mean_diff(300, 10, 100, 20, df, 'normal')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment