Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tonicanada/d8005b0f87097e24bbf24bfff817bee5 to your computer and use it in GitHub Desktop.
Save tonicanada/d8005b0f87097e24bbf24bfff817bee5 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"def plot_histograms_sample_mean_diff(m, n_min, n_max, \n",
" n_step, df, variable):\n",
" \"\"\"\n",
" Plot \"(n_max-n_min)/n_step\" histograms for the \n",
" difference between the means of 2 random samples of\n",
" different sizes. On top of histogram will also \n",
" plot the corresponding distribution curve.\n",
" \n",
" Parameters\n",
" ----------\n",
" m: int\n",
" Number of samples to take.\n",
" m_min: int\n",
" Minimun sample 1 size\n",
" n_max: int\n",
" Maximum sample 1 size\n",
" n_step: int\n",
" Difference between sample sizes\n",
" df: pandas dataframe\n",
" Dataframe where each column is a dataset\n",
" variable: str\n",
" Pandas dataframe column (in our example can be one \n",
" of following: 'normal', 'uniform', 'binomial')\n",
" \"\"\"\n",
" \n",
" diff_mean_dict = {}\n",
" std_sample1_dict = {}\n",
" std_sample2_dict = {}\n",
" \n",
" params_dict = {\n",
" \"mean\": df[variable].mean(),\n",
" \"std\": df[variable].std(),\n",
" \"median\": df[variable].median(),\n",
" }\n",
"\n",
" for n in range(n_min, n_max, n_step):\n",
" for x in range(m):\n",
" #We are going to generate the difference between the \n",
" #mean of 2 samples. First sample with size = n, \n",
" #second sample with size round(n/4)\n",
" \n",
" n1 = n\n",
" n2 = round(n/4)\n",
" df_sample_1 = df.sample(n=n1)[variable]\n",
" df_sample_2 = df.sample(n=n2)[variable]\n",
" if (n in diff_mean_dict.keys()):\n",
" diff_mean_dict[n].append(df_sample_1.mean()-df_sample_2.mean())\n",
" std_sample1_dict[n].append(df_sample_1.std())\n",
" std_sample2_dict[n].append(df_sample_2.std())\n",
" else:\n",
" diff_mean_dict[n] = [df_sample_1.mean() - df_sample_2.mean()]\n",
" std_sample1_dict[n] = [df_sample_1.std()]\n",
" std_sample2_dict[n] = [df_sample_2.std()]\n",
"\n",
" df_diff_means = pd.DataFrame.from_dict(diff_mean_dict)\n",
" df_sample1_std = pd.DataFrame.from_dict(std_sample1_dict)\n",
" df_sample2_std = pd.DataFrame.from_dict(std_sample2_dict)\n",
"\n",
" fig, ax = plt.subplots(1, \n",
" int(np.ceil((n_max-n_min)/n_step)), \n",
" sharex='col', \n",
" sharey='row', \n",
" figsize=(20, 3))\n",
" \n",
" fig.suptitle(f\"\"\"Histograms for {m} (difference between 2 samples) mean:\n",
" sample_1 size between {n_min} and {n_max}\n",
" sample_2 size between {round(n_min/4)} and {round(n_max/4)}\n",
" for variable with {variable} distribution\"\"\", \n",
" fontsize=16, \n",
" y = 1.45)\n",
"\n",
" for i in range(len(df_diff_means.columns)):\n",
" df_diff_means.hist(column=df_diff_means.columns[i], \n",
" ax=ax[i], \n",
" alpha=0.5, \n",
" color='red', \n",
" bins = 30)\n",
" ax[i].set_xlim((-1.5*params_dict['std'],\n",
" 1.5*params_dict['std']))\n",
" ax[i].tick_params(axis='both', which='major', labelsize=12)\n",
" ax[i].set_title(f\"n1 = {df_diff_means.columns[i]} \\n n2 = {round(df_diff_means.columns[i]/4)}\",\n",
" fontsize=16)\n",
" \n",
" #Here we compute the standard error for this kind of test\n",
" n1 = df_sample1_std.columns[i]\n",
" n2 = round(n1/4)\n",
" s1 = df_sample1_std[df_sample1_std.columns[i]].mean() \n",
" s2 = df_sample2_std[df_sample2_std.columns[i]].mean()\n",
" sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2)/(n1+n2-2))\n",
" std_error = sp*np.sqrt((1/n1) + (1/n2))\n",
" \n",
" #Here we plot the distribution curve\n",
" x = np.linspace(-6*params_dict['std'], \n",
" 6*params_dict['std'], \n",
" 1000)\n",
" y_normcurve_2 = norm.pdf(x, 0, std_error)\n",
" ax[i].plot(x, y_normcurve_2, 'b--', linewidth=2)\n",
"\n",
"#Here we exetute the following example:\n",
"plot_histograms_sample_mean_diff(300, 10, 100, 20, df, 'normal')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment