Last active
December 8, 2024 04:19
-
-
Save sachinsdate/534d921c937cc730e2039cf2396b80f2 to your computer and use it in GitHub Desktop.
Experiments with simple random sampling, systematic sampling, and stratified random sampling. Download automobiles data from https://gist.github.com/sachinsdate/254be0cf9a631ffd943c0746e103cd85
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import statsmodels.api as sm | |
| import scipy.stats as stats | |
| from sklearn.model_selection import train_test_split | |
| ############################################################################### | |
| # Read the autos dataset into a Dataframe and carve out the subset of interest | |
| ############################################################################### | |
| df_auto = pd.read_csv(filepath_or_buffer='automobiles_for_statistical_sampling.csv', header=0) | |
| df_auto = df_auto[['curb_weight', 'engine_size', 'horsepower', 'city_mpg']] | |
| df_auto = df_auto.dropna() | |
| print(df_auto) | |
| ############################################################################### | |
| # Inspect the dataset using pairplots and histogram plots | |
| ############################################################################### | |
| sns.pairplot(df_auto) | |
| ############################################################################### | |
| # Draw a simple random sample of size 30 | |
| ############################################################################### | |
| srs_sample_auto = df_auto.sample(n=30, random_state=np.random.randint(0, 100)) | |
| ############################################################################### | |
| # Build and train an OLS linear regression model of horsepower on engine size | |
| ############################################################################### | |
| olsr_model_auto = sm.OLS(endog=srs_sample_auto['horsepower'], exog=srs_sample_auto[['engine_size']]) | |
| olsr_results_auto= olsr_model_auto.fit() | |
| print(olsr_results_auto.summary()) | |
| ########################################################################################################## | |
| # Compare the performance of simple random sampling and systematic sampling on samples of different sizes: | |
| ########################################################################################################## | |
| # 1. Sample the engine size column of the automobiles dataset 100 times to generate 100 random samples, each of size 20. | |
| # 2. In each of the 100 samples, calculate various sample statistics such as the the mean, min, max, percentiles etc. | |
| # 3. Calculate the mean of each sample statistic. | |
| # 4. Repeat the above steps by varying the sample size from 20 to 100 in steps of 5. | |
| # 5. Reconduct the above experiment using systematic sampling instead of simple random sampling. | |
| dataset = df_auto | |
| column_name = 'engine_size' | |
| # Vary sample size from 20 to 250 in steps of 5 | |
| sample_sizes = range(20, 100, 5) | |
| sampling_methods = ['Simple Random Sampling', 'Systematic Sampling'] | |
| results = {} | |
| for method in sampling_methods: | |
| results[method] = {} | |
| results[method]['N'] = 0 | |
| results[method]['Mean'] = 0 | |
| results[method]['Min'] = 0 | |
| results[method]['Max'] = 0 | |
| results[method]['10th'] = 0 | |
| results[method]['25th'] = 0 | |
| results[method]['50th'] = 0 | |
| results[method]['75th'] = 0 | |
| results[method]['90th'] = 0 | |
| results[method]['SD'] = 0 | |
| num_iterations = 100 | |
| pop_params = { | |
| 'N' : len(dataset), | |
| 'Mean' : dataset[column_name].mean(), | |
| 'Max' : dataset[column_name].max(), | |
| 'Min' : dataset[column_name].min(), | |
| '10th' : np.percentile(dataset[column_name], 10), | |
| '25th' : np.percentile(dataset[column_name], 25), | |
| '50th' : np.percentile(dataset[column_name], 50), | |
| '75th' : np.percentile(dataset[column_name], 75), | |
| '90th' : np.percentile(dataset[column_name], 90), | |
| 'SD' : dataset[column_name].std() | |
| } | |
| results_df = pd.DataFrame(columns=['N', 'Method', 'Mean', 'SD', 'Min', '10th', '25th', '50th', '75th', '90th', 'Max']) | |
| for sample_size in sample_sizes: | |
| for i in range(num_iterations): | |
| # Simple Random Sampling | |
| method = 'Simple Random Sampling' | |
| srs_sample = dataset.sample(n=sample_size, random_state=np.random.randint(0, num_iterations)) | |
| # Calculate the sample statistics | |
| results[method]['N'] = sample_size | |
| results[method]['Mean'] = results[method]['Mean'] + srs_sample[column_name].mean() | |
| results[method]['Min'] = results[method]['Min'] + srs_sample[column_name].min() | |
| results[method]['Max'] = results[method]['Max'] + srs_sample[column_name].max() | |
| results[method]['10th'] = results[method]['10th'] + np.percentile(srs_sample[column_name], 10) | |
| results[method]['25th'] = results[method]['25th'] + np.percentile(srs_sample[column_name], 25) | |
| results[method]['50th'] = results[method]['50th'] + np.percentile(srs_sample[column_name], 50) | |
| results[method]['75th'] = results[method]['75th'] + np.percentile(srs_sample[column_name], 75) | |
| results[method]['90th'] = results[method]['90th'] + np.percentile(srs_sample[column_name], 90) | |
| results[method]['SD'] = results[method]['SD'] + srs_sample[column_name].std() | |
| # Systematic Sampling | |
| method = 'Systematic Sampling' | |
| k = len(df_auto) // sample_size | |
| random_start = np.random.randint(0, k) | |
| sysrs_indices = np.arange(random_start, len(df_auto), k)[:sample_size] | |
| sysrs_sample = dataset.iloc[sysrs_indices] | |
| # Calculate the sample statistics | |
| results[method]['N'] = sample_size | |
| results[method]['Mean'] = results[method]['Mean'] + sysrs_sample[column_name].mean() | |
| results[method]['Min'] = results[method]['Min'] + sysrs_sample[column_name].min() | |
| results[method]['Max'] = results[method]['Max'] + sysrs_sample[column_name].max() | |
| results[method]['10th'] = results[method]['10th'] + np.percentile(sysrs_sample[column_name], 10) | |
| results[method]['25th'] = results[method]['25th'] + np.percentile(sysrs_sample[column_name], 25) | |
| results[method]['50th'] = results[method]['50th'] + np.percentile(sysrs_sample[column_name], 50) | |
| results[method]['75th'] = results[method]['75th'] + np.percentile(sysrs_sample[column_name], 75) | |
| results[method]['90th'] = results[method]['90th'] + np.percentile(sysrs_sample[column_name], 90) | |
| results[method]['SD'] = results[method]['SD'] + sysrs_sample[column_name].std() | |
| # Calculate the means of all te sample statistics | |
| for method in sampling_methods: | |
| results[method]['Mean'] = results[method]['Mean']/num_iterations | |
| results[method]['Min'] = results[method]['Min']/num_iterations | |
| results[method]['Max'] = results[method]['Max']/num_iterations | |
| results[method]['10th'] = results[method]['10th']/num_iterations | |
| results[method]['25th'] = results[method]['25th']/num_iterations | |
| results[method]['50th'] = results[method]['50th']/num_iterations | |
| results[method]['75th'] = results[method]['75th']/num_iterations | |
| results[method]['90th'] = results[method]['90th']/num_iterations | |
| results[method]['SD'] = results[method]['SD']/num_iterations | |
| # Store all the means in the Dataframe | |
| results_df.loc[len(results_df.index)] = [results[method]['N'], method, results[method]['Mean'], results[method]['SD'], results[method]['Min'], results[method]['10th'], results[method]['25th'], results[method]['50th'], results[method]['75th'], results[method]['90th'], results[method]['Max']] | |
| # Display the first 10 rows of results | |
| print(results_df.head(10)) | |
| ######################################################################################### | |
| # Plot all the sample statistics versus sample size for each of the two sampling methods | |
| ######################################################################################### | |
| # Create a 3 x 3 grid | |
| fig, axes = plt.subplots(3, 3, figsize=(14, 14)) | |
| # List of statistics to plot | |
| stats = [ | |
| 'Mean', | |
| 'Max', | |
| 'Min', | |
| 'SD', | |
| '10th', | |
| '25th', | |
| '50th', | |
| '75th', | |
| '90th' | |
| ] | |
| # Plot each statistic | |
| for i, stat in enumerate(stats): | |
| # Calculate row and column for subplot grid | |
| row = i // 3 | |
| col = i % 3 | |
| sns.lineplot(data=results_df[results_df['Method'] == 'Simple Random Sampling'], | |
| x='N', y=stat, ax=axes[row, col], marker='o', label='Simple Random Sampling') | |
| axes[row, col].axhline(y=pop_params[stat], color='r', linestyle='--') | |
| sns.lineplot(data=results_df[results_df['Method'] == 'Systematic Sampling'], | |
| x='N', y=stat, ax=axes[row, col], marker='o', label='Systematic Sampling') | |
| axes[row, col].set_title(f'{stat} percentile value vs. sample size', fontsize=6) | |
| axes[row, col].set_xlabel('Sample Size', fontsize=6) | |
| axes[row, col].set_ylabel(stat + ' percentile value', fontsize=6) | |
| axes[row, col].legend(fontsize=6) | |
| #plt.tight_layout() | |
| plt.show() | |
| ########################################################################################## | |
| # Carve out the subset of interest to study teh performance of stratified random sampling | |
| ########################################################################################## | |
| df_auto_subset = df_auto[['city_mpg', 'curb_weight', 'horsepower']].dropna() | |
| # Bin the horsepower and curb weight columns | |
| df_auto_subset['hp_binned'] = pd.cut(x=df_auto_subset['horsepower'], bins=3, | |
| labels=['low_bhp', 'medium_bhp', 'high_bhp'], include_lowest=True) | |
| df_auto_subset['weight_binned'] = pd.cut(x=df_auto_subset['curb_weight'], bins=3, | |
| labels=['light_wt', 'medium_wt', 'heavy_wt'], include_lowest=True) | |
| # Add a column to store the strata labels | |
| df_auto_subset['strata'] = df_auto_subset['weight_binned'].astype('str') + '_' + df_auto_subset['hp_binned'].astype('str') | |
| # Print the distribution of combinations | |
| print(df_auto_subset.groupby(['hp_binned', 'weight_binned']).count()['city_mpg']) | |
| ########################################################################################## | |
| # Using MSE, compare the goodness-of-fit of a linear model | |
| # fitted on a sample random sample versus one fitted on a stratified random sample | |
| ########################################################################################## | |
| df_mse = pd.DataFrame(columns=['Method', 'n', 'mse']) | |
| # Vary the sample size from 15 to 100 | |
| for n in range(15, int(len(df_auto_subset)/2), 5): | |
| avg_mse = 0 | |
| num_iter = 100 | |
| for i in range(num_iter): | |
| # Create train-test pair using simple random sampling | |
| srs_sample_train, srs_sample_test = train_test_split(df_auto_subset, train_size=n, stratify=None, random_state=np.random.randint(0, 100)) | |
| # Fit a linear model on the training data | |
| olsr_model_auto = sm.OLS(endog=srs_sample_train['city_mpg'], exog=srs_sample_train[['curb_weight', 'horsepower']]) | |
| olsr_results_auto= olsr_model_auto.fit() | |
| #print(olsr_results_auto.summary()) | |
| # Get the model's predictions on the test data | |
| olsr_prediction_results = olsr_results_auto.get_prediction(exog=srs_sample_test[['curb_weight', 'horsepower']]) | |
| # Calculate and accumulate the MSE for averaging | |
| avg_mse = avg_mse + np.sum(pow(srs_sample_test['city_mpg']-olsr_prediction_results.summary_frame()['mean'],2))/len(srs_sample_test) | |
| # Store the average MSE | |
| df_mse.loc[len(df_mse.index)] = ['Simple Random Sampling', n, avg_mse/num_iter] | |
| # Vary the sample size from 15 to 100 | |
| for n in range(15, int(len(df_auto_subset)/2), 5): | |
| avg_mse = 0 | |
| num_iter = 100 | |
| for i in range(num_iter): | |
| # Create train-test pair using simple random sampling | |
| stratified_sample_train, stratified_sample_test = train_test_split(df_auto_subset, train_size=n, stratify=df_auto_subset['strata'], random_state=np.random.randint(0, 100)) | |
| # Fit a linear model on the training data | |
| olsr_model_auto = sm.OLS(endog=stratified_sample_train['city_mpg'], exog=stratified_sample_train[['curb_weight', 'horsepower']]) | |
| olsr_results_auto= olsr_model_auto.fit() | |
| #print(olsr_results_auto.summary()) | |
| # Get the model's predictions on the test data | |
| olsr_prediction_results = olsr_results_auto.get_prediction(exog=stratified_sample_test[['curb_weight', 'horsepower']]) | |
| # Calculate and accumulate the MSE for averaging | |
| avg_mse = avg_mse + np.sum(pow(stratified_sample_test['city_mpg']-olsr_prediction_results.summary_frame()['mean'],2))/len(stratified_sample_test) | |
| # Store the average MSE | |
| df_mse.loc[len(df_mse.index)] = ['Stratified Random Sampling', n, avg_mse/num_iter] | |
| ##################################################################################################### | |
| # Plot the MSE values versus sample size for models trained on simple and stratified sampling methods | |
| ##################################################################################################### | |
| # Create a 4 x 2 grid | |
| fig, axes = plt.subplots(figsize=(8, 6)) | |
| stat = 'mse' | |
| sns.lineplot(data=df_mse[df_mse['Method'] == 'Simple Random Sampling'], | |
| x='n', y=stat, marker='o', label='Simple Random Sampling') | |
| sns.lineplot(data=df_mse[df_mse['Method'] == 'Stratified Random Sampling'], | |
| x='n', y=stat, marker='o', label='Stratified Random Sampling') | |
| axes.set_title(f'{stat} of linear model vs. sample size', fontsize=16) | |
| axes.set_xlabel('Sample Size', fontsize=14) | |
| axes.set_ylabel(stat, fontsize=14) | |
| axes.legend(fontsize=14) | |
| plt.tight_layout() | |
| plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment