Last active
April 11, 2022 01:27
-
-
Save tonicanada/3b038fcfacb3e56623c32af9529691f9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"from scipy.stats import norm, t\n", | |
"import json\n", | |
"import pprint" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Generating 3 arrays with uniform, normal and binomial distributions\n", | |
"# We'll set random seed to obtain the same results if we reexecute \n", | |
"# the code.\n", | |
"np.random.seed(42)\n", | |
"uniform_distribution = np.random.uniform(4,5.5, 1600)\n", | |
"normal_distribution = np.random.normal(size = 1600, \n", | |
" loc = 5, \n", | |
" scale = 0.05)\n", | |
"binomial_distribution = np.random.binomial(15,0.05,1600)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame(uniform_distribution, columns=['uniform'])\n", | |
"df['normal'] = pd.DataFrame(normal_distribution, columns=['normal'])\n", | |
"df['binomial'] = pd.DataFrame(binomial_distribution, columns=['binomial'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>uniform</th>\n", | |
" <th>normal</th>\n", | |
" <th>binomial</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>4.561810</td>\n", | |
" <td>4.970916</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5.426071</td>\n", | |
" <td>4.949262</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5.097991</td>\n", | |
" <td>4.967536</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4.897988</td>\n", | |
" <td>4.938803</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4.234028</td>\n", | |
" <td>5.001704</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" uniform normal binomial\n", | |
"0 4.561810 4.970916 0\n", | |
"1 5.426071 4.949262 1\n", | |
"2 5.097991 4.967536 0\n", | |
"3 4.897988 4.938803 1\n", | |
"4 4.234028 5.001704 0" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"#Plotting the histograms for the 3 variables\n", | |
"df.normal.hist();\n", | |
"plt.title('Random Normal Histogram \\n (\\u03BC = 5, \\u03C3 = 0.05)');" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"df.uniform.hist();\n", | |
"plt.title('Random Uniform Distribution Histogram \\n (a = 4.0, b = 5.5)');" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"df.binomial.hist()\n", | |
"plt.title('Random Binomial Distribution Histogram \\n (n = 15, p = 0.05)');" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Storing the mean, median and standard deviation of the population\n", | |
"parameters = {\n", | |
" \"uniform_dist\": {\n", | |
" \"mean\": df.uniform.mean(),\n", | |
" \"median\": df.uniform.median(),\n", | |
" \"std\": df.uniform.std()\n", | |
" },\n", | |
" \"normal_dist\": {\n", | |
" \"mean\": df.normal.mean(),\n", | |
" \"median\": df.normal.median(),\n", | |
" \"std\": df.normal.std()\n", | |
" },\n", | |
" \"binomial\": {\n", | |
" \"mean\": df.binomial.mean(),\n", | |
" \"median\": df.binomial.median(),\n", | |
" \"std\": df.binomial.std()\n", | |
" }\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\n", | |
" \"uniform_dist\": {\n", | |
" \"mean\": 4.747806528172976,\n", | |
" \"median\": 4.759842615679644,\n", | |
" \"std\": 0.43976063753401085\n", | |
" },\n", | |
" \"normal_dist\": {\n", | |
" \"mean\": 5.001599754267347,\n", | |
" \"median\": 5.000861698922361,\n", | |
" \"std\": 0.04901432417153434\n", | |
" },\n", | |
" \"binomial\": {\n", | |
" \"mean\": 0.715625,\n", | |
" \"median\": 1.0,\n", | |
" \"std\": 0.822452811574647\n", | |
" }\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"print(json.dumps(parameters, indent=4))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Building a function that computes a statistic (mean, median, std) for each of m random samples of size n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_sample_parameter_from_variable(m, n, df, parameter, \n", | |
" variable, random_state=42):\n", | |
" \"\"\"\n", | |
" Draws m random samples of size n from the specified\n", | |
" dataframe column and computes the selected statistic\n", | |
" (mean, median, std) of each sample. Returns a list with\n", | |
" the m sample statistics.\n", | |
" \n", | |
" Parameters\n", | |
" ----------\n", | |
" m: int\n", | |
" Number of samples to take.\n", | |
" n: int\n", | |
" Sample size\n", | |
" df: pandas dataframe\n", | |
" Dataframe where each column is a dataset\n", | |
" parameter: str\n", | |
" Can be one of following: 'mean', 'median', 'std'\n", | |
" variable: str\n", | |
" Pandas dataframe column \n", | |
" (in our example can be one of following: 'normal', \n", | |
" 'uniform', 'binomial')\n", | |
" random_state: int, optional (default 42)\n", | |
" Seed used for the first sample; it is incremented by one for each subsequent sample\n", | |
" \"\"\"\n", | |
" result = []\n", | |
" for x in range(m):\n", | |
" df_sample = df[variable].sample(n=n, \n", | |
" random_state = random_state)\n", | |
" if parameter == 'mean':\n", | |
" result.append(df_sample.mean())\n", | |
" elif parameter == 'median':\n", | |
" result.append(df_sample.median())\n", | |
" elif parameter == 'std':\n", | |
" result.append(df_sample.std())\n", | |
" random_state+=1\n", | |
" return list(result)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Example of usage: let's say we want to get a list of the means of 20 random samples (each one of size 35) drawn from the binomial distribution variable." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[0.8571428571428571,\n", | |
" 0.4857142857142857,\n", | |
" 0.6571428571428571,\n", | |
" 0.6857142857142857,\n", | |
" 0.7714285714285715,\n", | |
" 0.7714285714285715,\n", | |
" 0.7142857142857143,\n", | |
" 0.5714285714285714,\n", | |
" 0.7142857142857143,\n", | |
" 0.7428571428571429,\n", | |
" 0.5714285714285714,\n", | |
" 0.8571428571428571,\n", | |
" 0.9714285714285714,\n", | |
" 0.7142857142857143,\n", | |
" 0.8571428571428571,\n", | |
" 0.6857142857142857,\n", | |
" 0.8,\n", | |
" 0.8,\n", | |
" 0.7714285714285715,\n", | |
" 0.8571428571428571]\n" | |
] | |
} | |
], | |
"source": [ | |
"sample_mean_from_binomial = get_sample_parameter_from_variable(20, \n", | |
" 35, \n", | |
" df, \n", | |
" 'mean',\n", | |
" 'binomial')\n", | |
"pprint.pprint(sample_mean_from_binomial)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment