sachinsdate/gls.py

## gls.py
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices
from matplotlib import pyplot as plt
import numpy as np


#Load the US Census Bureau data into a Dataframe
df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0)

#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation
reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ'

#Build and train the OLS regression model and print the training summary
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

#Also refit the OLS model using White's Heteroskedasticity Consistent Estimator of the
# variances of the estimated coefficients
olsr_hc0_model_results = olsr_model.fit(cov_type='HC0')
print(olsr_hc0_model_results.summary())

#Carve out the y and X matrices
y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe')

#Get the predicted values of y from the OLS fitted model (without the White's HC estimator)
y_cap = olsr_model_results.predict(X_train)

#Plot the model's residuals against the predicted values of y to confirm the presence of
# heteroskedasticity
plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level')
plt.ylabel('Residual error')
plt.scatter(y_cap, olsr_model_results.resid)
plt.show()

#We will now build GLS model for this data. To do so, we need to estimate the Sigma matrix,
# namely the matrix of correlations between the error terms of the Linear model.

#We will estimate Sigma by chopping y_cap into intervals of size 1. For each interval,
# we will collect the residual errors from the OLSR model and compute their sample variance. Next,
# we'll regress these sample variances on the centers of the intervals of y_cap. For e.g. the
# center of the interval (5, 6] is 5.5. We will use this fitted regression model to estimate the
# sample variance of the residuals of the OLSR model that we fitted earlier. And we'll use these
# predicted residual sample variances as the estimates of the corresponding error term's population
# variances.

#Create a range object for intervalizing (binning) y_cap
interval_range=range(round(min(y_cap)), round(max(y_cap)))

#Convert it to a list
ycap_intervals=list(interval_range)

#Intervalize y_cap
intervalized_ycap=pd.cut(x=y_cap, bins=ycap_intervals)

#Print it out to see how it looks like. We see that the intervalizing has created 37 unique
# intervals across 3219 y values
print(intervalized_ycap)

#Use a dictionary to extract the set of unique intervals of y_cap. The keys of the dict are the
# intervals while the values will be list of residuals corresponding to those intervals. We'll
# fill in the values soon.
distinct_intervals_of_ycap={}
for intr in list(intervalized_ycap):
    distinct_intervals_of_ycap[intr] = []

#Create a DataFrame consisting of fitted (predicted) y values from the OLSR model, the interval
# that each fitted value lies in, and the residual from the fitted OLSR model corresponding to
# each fitted y value
series_dict={'Y_CAP':y_cap, 'INTR':intervalized_ycap, 'RESID':olsr_model_results.resid}
df_intervalized_ycap = pd.DataFrame(data=series_dict)

#Now iterate through the rows of this DataFrame, and for each row, collect the residual,
# and using the interval as the key, add the residual into the approrpiate slot
# in distinct_intervals_of_ycap dict
for index, row in df_intervalized_ycap.iterrows():
    resid = row['RESID']
    interval = row['INTR']
    distinct_intervals_of_ycap[interval].append(resid)

#Next, construct two arrays of size=length of data set. The fist array will store the midpoints
# of the intervals of fitted y values. The second array will store the sample variance of the
# residuals corresponding to all the fitted y values that lie within the interval.
midpoints_of_ycap_intervals = []
sample_variance_of_intervalized_residuals = []
for key in distinct_intervals_of_ycap.keys():
    if pd.isnull(key):
        continue
    resids = distinct_intervals_of_ycap[key]
    #Skip intervals with <= 1 residual
    if len(resids) > 1:
        sample_variance_of_intervalized_residuals.append(np.var(a=resids, ddof=1))
        midpoints_of_ycap_intervals.append(key.mid)

#Plot the sample variances of intervalized residuals against the midpoints of the intervalized
# fitted y to get a sense of the relationship between them
plt.xlabel('Midpoints of intervalized fitted y values')
plt.ylabel('Variance of intervalized residuals')
plt.scatter(x=midpoints_of_ycap_intervals, y=sample_variance_of_intervalized_residuals)
plt.show()

#From looking at the plot, we suspect an exponential relationship between the variance and
# ycap. An exp relation is commonly seen when the errors are heteroskedastic.

#We will build a log-linear model between the intervalized variances and the intervalized fitted
# y. Specifically, we'll regress log(sample_variance_of_intervalized_residuals) on
# midpoints_of_ycap_intervals and a constant

log_sample_variance_of_intervalized_residuals = np.log(sample_variance_of_intervalized_residuals)

#Construct a DataFrame consisting of log_sample_variance_of_intervalized_residuals and
# sample_variance_of_intervalized_residuals
df_ycap_mids_resid_vars = pd.DataFrame(data={
    'LOG_INTERVALIZED_RESID_VARIANCE':log_sample_variance_of_intervalized_residuals,
    'MIDPOINT_INTERVALIZED_YCAP':midpoints_of_ycap_intervals})
ols_resid_variance_model = smf.ols(
    'LOG_INTERVALIZED_RESID_VARIANCE ~ MIDPOINT_INTERVALIZED_YCAP',  data=df_ycap_mids_resid_vars)
ols_resid_variance_model_results = ols_resid_variance_model.fit()

#Use this model to predict the sample variances of all residuals in the original OLSR model in
# which we regressed Percent_Households_Below_Poverty_Level on Median_Age, Homeowner_Vacancy_Rate,
# and Percent_Pop_25_And_Over_With_College_Or_Higher_Educ
df_ycap_from_ols_model = pd.DataFrame(data={'MIDPOINT_INTERVALIZED_YCAP':y_cap})
log_resid_sample_variance_estimate = ols_resid_variance_model_results.predict(
    df_ycap_from_ols_model['MIDPOINT_INTERVALIZED_YCAP'])
resid_sample_variance_estimate = np.exp(log_resid_sample_variance_estimate)

#Let's visually inspect the estimated sample variance of residuals by plotting them against the
# fitted values from the original OLSR model
plt.xlabel('Fitted y values')
plt.ylabel('Estimated sample variance of residuals')
plt.scatter(x=y_cap, y=resid_sample_variance_estimate)
plt.show()

#The plot shows the exponential relationship that we are expecting

#Let's construct the Sigma matrix of size n x n where n=length of data set. We'll begin with
# creating an Identity matrix of size n x n and fill in its diagonal with the estimated sample
# variances of residuals
sigma_matrix = np.identity(len(y_cap))
np.fill_diagonal(sigma_matrix, np.exp(resid_sample_variance_estimate))

#Build and train the GLS model using this Sigma matrix
gls_model = sm.GLS(endog=y_train, exog=X_train, sigma=sigma_matrix)
gls_model_results = gls_model.fit()
print(gls_model_results.summary())

#Let's compare the parameter estimates from the OLS model, the OLS model with White's correction
# for heteroskedastic errors, and the GLS model
data={'OLS_MODEL':olsr_model_results.params, 'OLS_HC0_MODEL':olsr_hc0_model_results.params,
      'GLS_MODEL':gls_model_results.params}
print(pd.DataFrame(data=data))

#Compare the standard errors of the parameter estimates from the OLS model, the OLS model with
# White's correction for heteroskedastic errors, and the GLS model
data={'OLS_MODEL':olsr_model_results.bse, 'OLS_HC0_MODEL':olsr_hc0_model_results.bse,
      'GLS_MODEL':gls_model_results.bse}
print(pd.DataFrame(data=data))
	import pandas as pd
	import statsmodels.formula.api as smf
	import statsmodels.api as sm
	from patsy import dmatrices
	from matplotlib import pyplot as plt
	import numpy as np


	#Load the US Census Bureau data into a Dataframe
	df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0)

	#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation
	reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ'

	#Build and train the OLS regression model and print the training summary
	olsr_model = smf.ols(formula=reg_expr, data=df)
	olsr_model_results = olsr_model.fit()
	print(olsr_model_results.summary())

	#Also refit the OLS model using White's Heteroskedasticity Consistent Estimator of the
	# variances of the estimated coefficients
	olsr_hc0_model_results = olsr_model.fit(cov_type='HC0')
	print(olsr_hc0_model_results.summary())

	#Carve out the y and X matrices
	y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe')

	#Get the predicted values of y from the OLS fitted model (without the White's HC estimator)
	y_cap = olsr_model_results.predict(X_train)

	#Plot the model's residuals against the predicted values of y to confirm the presence of
	# heteroskedasticity
	plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level')
	plt.ylabel('Residual error')
	plt.scatter(y_cap, olsr_model_results.resid)
	plt.show()

	#We will now build GLS model for this data. To do so, we need to estimate the Sigma matrix,
	# namely the matrix of correlations between the error terms of the Linear model.

	#We will estimate Sigma by chopping y_cap into intervals of size 1. For each interval,
	# we will collect the residual errors from the OLSR model and compute their sample variance. Next,
	# we'll regress these sample variances on the centers of the intervals of y_cap. For e.g. the
	# center of the interval (5, 6] is 5.5. We will use this fitted regression model to estimate the
	# sample variance of the residuals of the OLSR model that we fitted earlier. And we'll use these
	# predicted residual sample variances as the estimates of the corresponding error term's population
	# variances.

	#Create a range object for intervalizing (binning) y_cap
	interval_range=range(round(min(y_cap)), round(max(y_cap)))

	#Convert it to a list
	ycap_intervals=list(interval_range)

	#Intervalize y_cap
	intervalized_ycap=pd.cut(x=y_cap, bins=ycap_intervals)

	#Print it out to see how it looks like. We see that the intervalizing has created 37 unique
	# intervals across 3219 y values
	print(intervalized_ycap)

	#Use a dictionary to extract the set of unique intervals of y_cap. The keys of the dict are the
	# intervals while the values will be list of residuals corresponding to those intervals. We'll
	# fill in the values soon.
	distinct_intervals_of_ycap={}
	for intr in list(intervalized_ycap):
	distinct_intervals_of_ycap[intr] = []

	#Create a DataFrame consisting of fitted (predicted) y values from the OLSR model, the interval
	# that each fitted value lies in, and the residual from the fitted OLSR model corresponding to
	# each fitted y value
	series_dict={'Y_CAP':y_cap, 'INTR':intervalized_ycap, 'RESID':olsr_model_results.resid}
	df_intervalized_ycap = pd.DataFrame(data=series_dict)

	#Now iterate through the rows of this DataFrame, and for each row, collect the residual,
	# and using the interval as the key, add the residual into the approrpiate slot
	# in distinct_intervals_of_ycap dict
	for index, row in df_intervalized_ycap.iterrows():
	resid = row['RESID']
	interval = row['INTR']
	distinct_intervals_of_ycap[interval].append(resid)

	#Next, construct two arrays of size=length of data set. The fist array will store the midpoints
	# of the intervals of fitted y values. The second array will store the sample variance of the
	# residuals corresponding to all the fitted y values that lie within the interval.
	midpoints_of_ycap_intervals = []
	sample_variance_of_intervalized_residuals = []
	for key in distinct_intervals_of_ycap.keys():
	if pd.isnull(key):
	continue
	resids = distinct_intervals_of_ycap[key]
	#Skip intervals with <= 1 residual
	if len(resids) > 1:
	sample_variance_of_intervalized_residuals.append(np.var(a=resids, ddof=1))
	midpoints_of_ycap_intervals.append(key.mid)

	#Plot the sample variances of intervalized residuals against the midpoints of the intervalized
	# fitted y to get a sense of the relationship between them
	plt.xlabel('Midpoints of intervalized fitted y values')
	plt.ylabel('Variance of intervalized residuals')
	plt.scatter(x=midpoints_of_ycap_intervals, y=sample_variance_of_intervalized_residuals)
	plt.show()

	#From looking at the plot, we suspect an exponential relationship between the variance and
	# ycap. An exp relation is commonly seen when the errors are heteroskedastic.

	#We will build a log-linear model between the intervalized variances and the intervalized fitted
	# y. Specifically, we'll regress log(sample_variance_of_intervalized_residuals) on
	# midpoints_of_ycap_intervals and a constant

	log_sample_variance_of_intervalized_residuals = np.log(sample_variance_of_intervalized_residuals)

	#Construct a DataFrame consisting of log_sample_variance_of_intervalized_residuals and
	# sample_variance_of_intervalized_residuals
	df_ycap_mids_resid_vars = pd.DataFrame(data={
	'LOG_INTERVALIZED_RESID_VARIANCE':log_sample_variance_of_intervalized_residuals,
	'MIDPOINT_INTERVALIZED_YCAP':midpoints_of_ycap_intervals})
	ols_resid_variance_model = smf.ols(
	'LOG_INTERVALIZED_RESID_VARIANCE ~ MIDPOINT_INTERVALIZED_YCAP', data=df_ycap_mids_resid_vars)
	ols_resid_variance_model_results = ols_resid_variance_model.fit()

	#Use this model to predict the sample variances of all residuals in the original OLSR model in
	# which we regressed Percent_Households_Below_Poverty_Level on Median_Age, Homeowner_Vacancy_Rate,
	# and Percent_Pop_25_And_Over_With_College_Or_Higher_Educ
	df_ycap_from_ols_model = pd.DataFrame(data={'MIDPOINT_INTERVALIZED_YCAP':y_cap})
	log_resid_sample_variance_estimate = ols_resid_variance_model_results.predict(
	df_ycap_from_ols_model['MIDPOINT_INTERVALIZED_YCAP'])
	resid_sample_variance_estimate = np.exp(log_resid_sample_variance_estimate)

	#Let's visually inspect the estimated sample variance of residuals by plotting them against the
	# fitted values from the original OLSR model
	plt.xlabel('Fitted y values')
	plt.ylabel('Estimated sample variance of residuals')
	plt.scatter(x=y_cap, y=resid_sample_variance_estimate)
	plt.show()

	#The plot shows the exponential relationship that we are expecting

	#Let's construct the Sigma matrix of size n x n where n=length of data set. We'll begin with
	# creating an Identity matrix of size n x n and fill in its diagonal with the estimated sample
	# variances of residuals
	sigma_matrix = np.identity(len(y_cap))
	np.fill_diagonal(sigma_matrix, np.exp(resid_sample_variance_estimate))

	#Build and train the GLS model using this Sigma matrix
	gls_model = sm.GLS(endog=y_train, exog=X_train, sigma=sigma_matrix)
	gls_model_results = gls_model.fit()
	print(gls_model_results.summary())

	#Let's compare the parameter estimates from the OLS model, the OLS model with White's correction
	# for heteroskedastic errors, and the GLS model
	data={'OLS_MODEL':olsr_model_results.params, 'OLS_HC0_MODEL':olsr_hc0_model_results.params,
	'GLS_MODEL':gls_model_results.params}
	print(pd.DataFrame(data=data))

	#Compare the standard errors of the parameter estimates from the OLS model, the OLS model with
	# White's correction for heteroskedastic errors, and the GLS model
	data={'OLS_MODEL':olsr_model_results.bse, 'OLS_HC0_MODEL':olsr_hc0_model_results.bse,
	'GLS_MODEL':gls_model_results.bse}
	print(pd.DataFrame(data=data))