jkclem/simulation_study1.py

## simulation_study1.py
# for linear algebra and random numbers
import numpy as np
# for linear regression
import statsmodels.api as sm
# for visualization
import matplotlib.pyplot as plt
# for generating combinations of explanatory variables for model selection based on AIC
from itertools import combinations

# set a random seed for reproducibility
np.random.seed(123)

# set the number of simulations to run
num_sims = 500

# set n and k and make a n by k+1 matrix of standard normal random numbers
n = 100
k = 6

# set the number of predictors, including the intercept that are significant
k_significant = 3

# create a column vector of zeros, with k + 1 betas, where the first value is the true intercept
beta = np.zeros(shape=(k + 1, 1))
# set the first k_significant betas to be non-zero values, using iid N(3, 1) random variables
beta[:k_significant] = np.random.normal(loc=5.0, scale=1.0, size=(k_significant, 1))

# create an array of zeros to hold our UNBIASED estimates of the standard deviation of our error term
unbiased_sigma_estimates = np.zeros(shape=num_sims)
# create an array of zeros to hold our BIASED estimates of the standard deviation of our error term (from AIC)
aic_sigma_estimates = np.zeros(shape=num_sims)
# create an array of zeros to hold our BIASED estimates of the standard deviation of our error term (from BIC)
bic_sigma_estimates = np.zeros(shape=num_sims)

# perform num_sims simulations
for i in range(num_sims):

    ###
    # This block generates the random sample
    ###

    # create the temporary design n by k + 1 design matrix
    temp_X = np.random.normal(loc=0.0, scale=1.0, size=(n, k + 1))

    # set the first column of the design matrix to 1.0 to include an intercept
    temp_X[:,0] = 1.0

    # create temporary y values using the design matrix and beta vector, and add standard normal random noise
    temp_y = np.matmul(temp_X, beta) + np.random.normal(loc=0.0, scale=1.0, size=(n, 1))


    ###
    # This block estimates the standard deviation using the True OLS model
    ###

    # fit an OLS linear model to the temporary data
    temp_true_mod = sm.OLS(temp_y, temp_X[:,0:k_significant]).fit()

    # estimate the standard deviation of the error term
    temp_unbiased_sigma_estimate = np.sqrt(np.sum(temp_true_mod.resid **2) /
                                         (n - temp_true_mod.params.shape[0]))
    # add the unbiased estimate to the array at the ith slot
    unbiased_sigma_estimates[i] = temp_unbiased_sigma_estimate


    ###
    # This block estimates the standard deviation using the OLS model with best subset selection of variables
    # using AIC
    ###

    # select the model with the lowest information criterion
    temp_best_AIC_mod = best_information_criterion_selection(temp_y, temp_X, criterion='AIC')
    # estimate the standard deviation of the error term
    temp_aic_sigma_estimate = np.sqrt(np.sum(temp_best_AIC_mod.resid ** 2) / temp_best_AIC_mod.df_resid)
    # add the unbiased estimate to the array at the ith slot
    aic_sigma_estimates[i] = temp_aic_sigma_estimate


    ###
    # This block estimates the standard deviation using the OLS model with best subset selection of variables
    # using BIC
    ###

    # select the model with the lowest information criterion
    temp_best_BIC_mod = best_information_criterion_selection(temp_y, temp_X, criterion='BIC')
    # estimate the standard deviation of the error term
    temp_bic_sigma_estimate = np.sqrt(np.sum(temp_best_BIC_mod.resid ** 2) / temp_best_BIC_mod.df_resid)
    # add the unbiased estimate to the array at the ith slot
    bic_sigma_estimates[i] = temp_bic_sigma_estimate
	# for linear algebra and random numbers
	import numpy as np
	# for linear regression
	import statsmodels.api as sm
	# for visualization
	import matplotlib.pyplot as plt
	# for generating combinations of explanatory variables for model selection based on AIC
	from itertools import combinations

	# set a random seed for reproducibility
	np.random.seed(123)

	# set the number of simulations to run
	num_sims = 500

	# set n and k and make a n by k+1 matrix of standard normal random numbers
	n = 100
	k = 6

	# set the number of predictors, including the intercept that are significant
	k_significant = 3

	# create a column vector of zeros, with k + 1 betas, where the first value is the true intercept
	beta = np.zeros(shape=(k + 1, 1))
	# set the first k_significant betas to be non-zero values, using iid N(3, 1) random variables
	beta[:k_significant] = np.random.normal(loc=5.0, scale=1.0, size=(k_significant, 1))

	# create an array of zeros to hold our UNBIASED estimates of the standard deviation of our error term
	unbiased_sigma_estimates = np.zeros(shape=num_sims)
	# create an array of zeros to hold our BIASED estimates of the standard deviation of our error term (from AIC)
	aic_sigma_estimates = np.zeros(shape=num_sims)
	# create an array of zeros to hold our BIASED estimates of the standard deviation of our error term (from BIC)
	bic_sigma_estimates = np.zeros(shape=num_sims)

	# perform num_sims simulations
	for i in range(num_sims):

	###
	# This block generates the random sample
	###

	# create the temporary design n by k + 1 design matrix
	temp_X = np.random.normal(loc=0.0, scale=1.0, size=(n, k + 1))

	# set the first column of the design matrix to 1.0 to include an intercept
	temp_X[:,0] = 1.0

	# create temporary y values using the design matrix and beta vector, and add standard normal random noise
	temp_y = np.matmul(temp_X, beta) + np.random.normal(loc=0.0, scale=1.0, size=(n, 1))


	###
	# This block estimates the standard deviation using the True OLS model
	###

	# fit an OLS linear model to the temporary data
	temp_true_mod = sm.OLS(temp_y, temp_X[:,0:k_significant]).fit()

	# estimate the standard deviation of the error term
	temp_unbiased_sigma_estimate = np.sqrt(np.sum(temp_true_mod.resid **2) /
	(n - temp_true_mod.params.shape[0]))
	# add the unbiased estimate to the array at the ith slot
	unbiased_sigma_estimates[i] = temp_unbiased_sigma_estimate


	###
	# This block estimates the standard deviation using the OLS model with best subset selection of variables
	# using AIC
	###

	# select the model with the lowest information criterion
	temp_best_AIC_mod = best_information_criterion_selection(temp_y, temp_X, criterion='AIC')
	# estimate the standard deviation of the error term
	temp_aic_sigma_estimate = np.sqrt(np.sum(temp_best_AIC_mod.resid ** 2) / temp_best_AIC_mod.df_resid)
	# add the unbiased estimate to the array at the ith slot
	aic_sigma_estimates[i] = temp_aic_sigma_estimate


	###
	# This block estimates the standard deviation using the OLS model with best subset selection of variables
	# using BIC
	###

	# select the model with the lowest information criterion
	temp_best_BIC_mod = best_information_criterion_selection(temp_y, temp_X, criterion='BIC')
	# estimate the standard deviation of the error term
	temp_bic_sigma_estimate = np.sqrt(np.sum(temp_best_BIC_mod.resid ** 2) / temp_best_BIC_mod.df_resid)
	# add the unbiased estimate to the array at the ith slot
	bic_sigma_estimates[i] = temp_bic_sigma_estimate