Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active December 8, 2024 04:19
Show Gist options
  • Select an option

  • Save sachinsdate/534d921c937cc730e2039cf2396b80f2 to your computer and use it in GitHub Desktop.

Select an option

Save sachinsdate/534d921c937cc730e2039cf2396b80f2 to your computer and use it in GitHub Desktop.
Experiments with simple random sampling, systematic sampling, and stratified random sampling. Download automobiles data from https://gist.github.com/sachinsdate/254be0cf9a631ffd943c0746e103cd85
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split
###############################################################################
# Read the autos dataset into a Dataframe and carve out the subset of interest
###############################################################################
df_auto = pd.read_csv(filepath_or_buffer='automobiles_for_statistical_sampling.csv', header=0)
df_auto = df_auto[['curb_weight', 'engine_size', 'horsepower', 'city_mpg']]
df_auto = df_auto.dropna()
print(df_auto)
###############################################################################
# Inspect the dataset using pairplots and histogram plots
###############################################################################
sns.pairplot(df_auto)
###############################################################################
# Draw a simple random sample of size 30
###############################################################################
srs_sample_auto = df_auto.sample(n=30, random_state=np.random.randint(0, 100))
###############################################################################
# Build and train an OLS linear regression model of horsepower on engine size
###############################################################################
olsr_model_auto = sm.OLS(endog=srs_sample_auto['horsepower'], exog=srs_sample_auto[['engine_size']])
olsr_results_auto= olsr_model_auto.fit()
print(olsr_results_auto.summary())
##########################################################################################################
# Compare the performance of simple random sampling and systematic sampling on samples of different sizes:
##########################################################################################################
# 1. Sample the engine size column of the automobiles dataset 100 times to generate 100 random samples, each of size 20.
# 2. In each of the 100 samples, calculate various sample statistics such as the the mean, min, max, percentiles etc.
# 3. Calculate the mean of each sample statistic.
# 4. Repeat the above steps by varying the sample size from 20 to 100 in steps of 5.
# 5. Reconduct the above experiment using systematic sampling instead of simple random sampling.
dataset = df_auto
column_name = 'engine_size'
# Vary sample size from 20 to 250 in steps of 5
sample_sizes = range(20, 100, 5)
sampling_methods = ['Simple Random Sampling', 'Systematic Sampling']
results = {}
for method in sampling_methods:
results[method] = {}
results[method]['N'] = 0
results[method]['Mean'] = 0
results[method]['Min'] = 0
results[method]['Max'] = 0
results[method]['10th'] = 0
results[method]['25th'] = 0
results[method]['50th'] = 0
results[method]['75th'] = 0
results[method]['90th'] = 0
results[method]['SD'] = 0
num_iterations = 100
pop_params = {
'N' : len(dataset),
'Mean' : dataset[column_name].mean(),
'Max' : dataset[column_name].max(),
'Min' : dataset[column_name].min(),
'10th' : np.percentile(dataset[column_name], 10),
'25th' : np.percentile(dataset[column_name], 25),
'50th' : np.percentile(dataset[column_name], 50),
'75th' : np.percentile(dataset[column_name], 75),
'90th' : np.percentile(dataset[column_name], 90),
'SD' : dataset[column_name].std()
}
results_df = pd.DataFrame(columns=['N', 'Method', 'Mean', 'SD', 'Min', '10th', '25th', '50th', '75th', '90th', 'Max'])
for sample_size in sample_sizes:
for i in range(num_iterations):
# Simple Random Sampling
method = 'Simple Random Sampling'
srs_sample = dataset.sample(n=sample_size, random_state=np.random.randint(0, num_iterations))
# Calculate the sample statistics
results[method]['N'] = sample_size
results[method]['Mean'] = results[method]['Mean'] + srs_sample[column_name].mean()
results[method]['Min'] = results[method]['Min'] + srs_sample[column_name].min()
results[method]['Max'] = results[method]['Max'] + srs_sample[column_name].max()
results[method]['10th'] = results[method]['10th'] + np.percentile(srs_sample[column_name], 10)
results[method]['25th'] = results[method]['25th'] + np.percentile(srs_sample[column_name], 25)
results[method]['50th'] = results[method]['50th'] + np.percentile(srs_sample[column_name], 50)
results[method]['75th'] = results[method]['75th'] + np.percentile(srs_sample[column_name], 75)
results[method]['90th'] = results[method]['90th'] + np.percentile(srs_sample[column_name], 90)
results[method]['SD'] = results[method]['SD'] + srs_sample[column_name].std()
# Systematic Sampling
method = 'Systematic Sampling'
k = len(df_auto) // sample_size
random_start = np.random.randint(0, k)
sysrs_indices = np.arange(random_start, len(df_auto), k)[:sample_size]
sysrs_sample = dataset.iloc[sysrs_indices]
# Calculate the sample statistics
results[method]['N'] = sample_size
results[method]['Mean'] = results[method]['Mean'] + sysrs_sample[column_name].mean()
results[method]['Min'] = results[method]['Min'] + sysrs_sample[column_name].min()
results[method]['Max'] = results[method]['Max'] + sysrs_sample[column_name].max()
results[method]['10th'] = results[method]['10th'] + np.percentile(sysrs_sample[column_name], 10)
results[method]['25th'] = results[method]['25th'] + np.percentile(sysrs_sample[column_name], 25)
results[method]['50th'] = results[method]['50th'] + np.percentile(sysrs_sample[column_name], 50)
results[method]['75th'] = results[method]['75th'] + np.percentile(sysrs_sample[column_name], 75)
results[method]['90th'] = results[method]['90th'] + np.percentile(sysrs_sample[column_name], 90)
results[method]['SD'] = results[method]['SD'] + sysrs_sample[column_name].std()
# Calculate the means of all te sample statistics
for method in sampling_methods:
results[method]['Mean'] = results[method]['Mean']/num_iterations
results[method]['Min'] = results[method]['Min']/num_iterations
results[method]['Max'] = results[method]['Max']/num_iterations
results[method]['10th'] = results[method]['10th']/num_iterations
results[method]['25th'] = results[method]['25th']/num_iterations
results[method]['50th'] = results[method]['50th']/num_iterations
results[method]['75th'] = results[method]['75th']/num_iterations
results[method]['90th'] = results[method]['90th']/num_iterations
results[method]['SD'] = results[method]['SD']/num_iterations
# Store all the means in the Dataframe
results_df.loc[len(results_df.index)] = [results[method]['N'], method, results[method]['Mean'], results[method]['SD'], results[method]['Min'], results[method]['10th'], results[method]['25th'], results[method]['50th'], results[method]['75th'], results[method]['90th'], results[method]['Max']]
# Display the first 10 rows of results
print(results_df.head(10))
#########################################################################################
# Plot all the sample statistics versus sample size for each of the two sampling methods
#########################################################################################
# Create a 3 x 3 grid
fig, axes = plt.subplots(3, 3, figsize=(14, 14))
# List of statistics to plot
stats = [
'Mean',
'Max',
'Min',
'SD',
'10th',
'25th',
'50th',
'75th',
'90th'
]
# Plot each statistic
for i, stat in enumerate(stats):
# Calculate row and column for subplot grid
row = i // 3
col = i % 3
sns.lineplot(data=results_df[results_df['Method'] == 'Simple Random Sampling'],
x='N', y=stat, ax=axes[row, col], marker='o', label='Simple Random Sampling')
axes[row, col].axhline(y=pop_params[stat], color='r', linestyle='--')
sns.lineplot(data=results_df[results_df['Method'] == 'Systematic Sampling'],
x='N', y=stat, ax=axes[row, col], marker='o', label='Systematic Sampling')
axes[row, col].set_title(f'{stat} percentile value vs. sample size', fontsize=6)
axes[row, col].set_xlabel('Sample Size', fontsize=6)
axes[row, col].set_ylabel(stat + ' percentile value', fontsize=6)
axes[row, col].legend(fontsize=6)
#plt.tight_layout()
plt.show()
##########################################################################################
# Carve out the subset of interest to study teh performance of stratified random sampling
##########################################################################################
df_auto_subset = df_auto[['city_mpg', 'curb_weight', 'horsepower']].dropna()
# Bin the horsepower and curb weight columns
df_auto_subset['hp_binned'] = pd.cut(x=df_auto_subset['horsepower'], bins=3,
labels=['low_bhp', 'medium_bhp', 'high_bhp'], include_lowest=True)
df_auto_subset['weight_binned'] = pd.cut(x=df_auto_subset['curb_weight'], bins=3,
labels=['light_wt', 'medium_wt', 'heavy_wt'], include_lowest=True)
# Add a column to store the strata labels
df_auto_subset['strata'] = df_auto_subset['weight_binned'].astype('str') + '_' + df_auto_subset['hp_binned'].astype('str')
# Print the distribution of combinations
print(df_auto_subset.groupby(['hp_binned', 'weight_binned']).count()['city_mpg'])
##########################################################################################
# Using MSE, compare the goodness-of-fit of a linear model
# fitted on a sample random sample versus one fitted on a stratified random sample
##########################################################################################
df_mse = pd.DataFrame(columns=['Method', 'n', 'mse'])
# Vary the sample size from 15 to 100
for n in range(15, int(len(df_auto_subset)/2), 5):
avg_mse = 0
num_iter = 100
for i in range(num_iter):
# Create train-test pair using simple random sampling
srs_sample_train, srs_sample_test = train_test_split(df_auto_subset, train_size=n, stratify=None, random_state=np.random.randint(0, 100))
# Fit a linear model on the training data
olsr_model_auto = sm.OLS(endog=srs_sample_train['city_mpg'], exog=srs_sample_train[['curb_weight', 'horsepower']])
olsr_results_auto= olsr_model_auto.fit()
#print(olsr_results_auto.summary())
# Get the model's predictions on the test data
olsr_prediction_results = olsr_results_auto.get_prediction(exog=srs_sample_test[['curb_weight', 'horsepower']])
# Calculate and accumulate the MSE for averaging
avg_mse = avg_mse + np.sum(pow(srs_sample_test['city_mpg']-olsr_prediction_results.summary_frame()['mean'],2))/len(srs_sample_test)
# Store the average MSE
df_mse.loc[len(df_mse.index)] = ['Simple Random Sampling', n, avg_mse/num_iter]
# Vary the sample size from 15 to 100
for n in range(15, int(len(df_auto_subset)/2), 5):
avg_mse = 0
num_iter = 100
for i in range(num_iter):
# Create train-test pair using simple random sampling
stratified_sample_train, stratified_sample_test = train_test_split(df_auto_subset, train_size=n, stratify=df_auto_subset['strata'], random_state=np.random.randint(0, 100))
# Fit a linear model on the training data
olsr_model_auto = sm.OLS(endog=stratified_sample_train['city_mpg'], exog=stratified_sample_train[['curb_weight', 'horsepower']])
olsr_results_auto= olsr_model_auto.fit()
#print(olsr_results_auto.summary())
# Get the model's predictions on the test data
olsr_prediction_results = olsr_results_auto.get_prediction(exog=stratified_sample_test[['curb_weight', 'horsepower']])
# Calculate and accumulate the MSE for averaging
avg_mse = avg_mse + np.sum(pow(stratified_sample_test['city_mpg']-olsr_prediction_results.summary_frame()['mean'],2))/len(stratified_sample_test)
# Store the average MSE
df_mse.loc[len(df_mse.index)] = ['Stratified Random Sampling', n, avg_mse/num_iter]
#####################################################################################################
# Plot the MSE values versus sample size for models trained on simple and stratified sampling methods
#####################################################################################################
# Create a 4 x 2 grid
fig, axes = plt.subplots(figsize=(8, 6))
stat = 'mse'
sns.lineplot(data=df_mse[df_mse['Method'] == 'Simple Random Sampling'],
x='n', y=stat, marker='o', label='Simple Random Sampling')
sns.lineplot(data=df_mse[df_mse['Method'] == 'Stratified Random Sampling'],
x='n', y=stat, marker='o', label='Stratified Random Sampling')
axes.set_title(f'{stat} of linear model vs. sample size', fontsize=16)
axes.set_xlabel('Sample Size', fontsize=14)
axes.set_ylabel(stat, fontsize=14)
axes.legend(fontsize=14)
plt.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment