Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created October 30, 2022 17:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachinsdate/0c20f63a1770e3cb2b4a7fc4dce844bd to your computer and use it in GitHub Desktop.
Save sachinsdate/0c20f63a1770e3cb2b4a7fc4dce844bd to your computer and use it in GitHub Desktop.
A tutorial on fitting a linear model using the Generalized Least Squares estimator
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from patsy import dmatrices
from matplotlib import pyplot as plt
import numpy as np
#Load the US Census Bureau data into a Dataframe
df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0)
#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation
reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ'
#Build and train the OLS regression model and print the training summary
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
#Also refit the OLS model using White's Heteroskedasticity Consistent Estimator of the
# variances of the estimated coefficients
olsr_hc0_model_results = olsr_model.fit(cov_type='HC0')
print(olsr_hc0_model_results.summary())
#Carve out the y and X matrices
y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe')
#Get the predicted values of y from the OLS fitted model (without the White's HC estimator)
y_cap = olsr_model_results.predict(X_train)
#Plot the model's residuals against the predicted values of y to confirm the presence of
# heteroskedasticity
plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level')
plt.ylabel('Residual error')
plt.scatter(y_cap, olsr_model_results.resid)
plt.show()
#We will now build GLS model for this data. To do so, we need to estimate the Sigma matrix,
# namely the matrix of correlations between the error terms of the Linear model.
#We will estimate Sigma by chopping y_cap into intervals of size 1. For each interval,
# we will collect the residual errors from the OLSR model and compute their sample variance. Next,
# we'll regress these sample variances on the centers of the intervals of y_cap. For e.g. the
# center of the interval (5, 6] is 5.5. We will use this fitted regression model to estimate the
# sample variance of the residuals of the OLSR model that we fitted earlier. And we'll use these
# predicted residual sample variances as the estimates of the corresponding error term's population
# variances.
#Create a range object for intervalizing (binning) y_cap
interval_range=range(round(min(y_cap)), round(max(y_cap)))
#Convert it to a list
ycap_intervals=list(interval_range)
#Intervalize y_cap
intervalized_ycap=pd.cut(x=y_cap, bins=ycap_intervals)
#Print it out to see how it looks like. We see that the intervalizing has created 37 unique
# intervals across 3219 y values
print(intervalized_ycap)
#Use a dictionary to extract the set of unique intervals of y_cap. The keys of the dict are the
# intervals while the values will be list of residuals corresponding to those intervals. We'll
# fill in the values soon.
distinct_intervals_of_ycap={}
for intr in list(intervalized_ycap):
distinct_intervals_of_ycap[intr] = []
#Create a DataFrame consisting of fitted (predicted) y values from the OLSR model, the interval
# that each fitted value lies in, and the residual from the fitted OLSR model corresponding to
# each fitted y value
series_dict={'Y_CAP':y_cap, 'INTR':intervalized_ycap, 'RESID':olsr_model_results.resid}
df_intervalized_ycap = pd.DataFrame(data=series_dict)
#Now iterate through the rows of this DataFrame, and for each row, collect the residual,
# and using the interval as the key, add the residual into the approrpiate slot
# in distinct_intervals_of_ycap dict
for index, row in df_intervalized_ycap.iterrows():
resid = row['RESID']
interval = row['INTR']
distinct_intervals_of_ycap[interval].append(resid)
#Next, construct two arrays of size=length of data set. The fist array will store the midpoints
# of the intervals of fitted y values. The second array will store the sample variance of the
# residuals corresponding to all the fitted y values that lie within the interval.
midpoints_of_ycap_intervals = []
sample_variance_of_intervalized_residuals = []
for key in distinct_intervals_of_ycap.keys():
if pd.isnull(key):
continue
resids = distinct_intervals_of_ycap[key]
#Skip intervals with <= 1 residual
if len(resids) > 1:
sample_variance_of_intervalized_residuals.append(np.var(a=resids, ddof=1))
midpoints_of_ycap_intervals.append(key.mid)
#Plot the sample variances of intervalized residuals against the midpoints of the intervalized
# fitted y to get a sense of the relationship between them
plt.xlabel('Midpoints of intervalized fitted y values')
plt.ylabel('Variance of intervalized residuals')
plt.scatter(x=midpoints_of_ycap_intervals, y=sample_variance_of_intervalized_residuals)
plt.show()
#From looking at the plot, we suspect an exponential relationship between the variance and
# ycap. An exp relation is commonly seen when the errors are heteroskedastic.
#We will build a log-linear model between the intervalized variances and the intervalized fitted
# y. Specifically, we'll regress log(sample_variance_of_intervalized_residuals) on
# midpoints_of_ycap_intervals and a constant
log_sample_variance_of_intervalized_residuals = np.log(sample_variance_of_intervalized_residuals)
#Construct a DataFrame consisting of log_sample_variance_of_intervalized_residuals and
# sample_variance_of_intervalized_residuals
df_ycap_mids_resid_vars = pd.DataFrame(data={
'LOG_INTERVALIZED_RESID_VARIANCE':log_sample_variance_of_intervalized_residuals,
'MIDPOINT_INTERVALIZED_YCAP':midpoints_of_ycap_intervals})
ols_resid_variance_model = smf.ols(
'LOG_INTERVALIZED_RESID_VARIANCE ~ MIDPOINT_INTERVALIZED_YCAP', data=df_ycap_mids_resid_vars)
ols_resid_variance_model_results = ols_resid_variance_model.fit()
#Use this model to predict the sample variances of all residuals in the original OLSR model in
# which we regressed Percent_Households_Below_Poverty_Level on Median_Age, Homeowner_Vacancy_Rate,
# and Percent_Pop_25_And_Over_With_College_Or_Higher_Educ
df_ycap_from_ols_model = pd.DataFrame(data={'MIDPOINT_INTERVALIZED_YCAP':y_cap})
log_resid_sample_variance_estimate = ols_resid_variance_model_results.predict(
df_ycap_from_ols_model['MIDPOINT_INTERVALIZED_YCAP'])
resid_sample_variance_estimate = np.exp(log_resid_sample_variance_estimate)
#Let's visually inspect the estimated sample variance of residuals by plotting them against the
# fitted values from the original OLSR model
plt.xlabel('Fitted y values')
plt.ylabel('Estimated sample variance of residuals')
plt.scatter(x=y_cap, y=resid_sample_variance_estimate)
plt.show()
#The plot shows the exponential relationship that we are expecting
#Let's construct the Sigma matrix of size n x n where n=length of data set. We'll begin with
# creating an Identity matrix of size n x n and fill in its diagonal with the estimated sample
# variances of residuals
sigma_matrix = np.identity(len(y_cap))
np.fill_diagonal(sigma_matrix, np.exp(resid_sample_variance_estimate))
#Build and train the GLS model using this Sigma matrix
gls_model = sm.GLS(endog=y_train, exog=X_train, sigma=sigma_matrix)
gls_model_results = gls_model.fit()
print(gls_model_results.summary())
#Let's compare the parameter estimates from the OLS model, the OLS model with White's correction
# for heteroskedastic errors, and the GLS model
data={'OLS_MODEL':olsr_model_results.params, 'OLS_HC0_MODEL':olsr_hc0_model_results.params,
'GLS_MODEL':gls_model_results.params}
print(pd.DataFrame(data=data))
#Compare the standard errors of the parameter estimates from the OLS model, the OLS model with
# White's correction for heteroskedastic errors, and the GLS model
data={'OLS_MODEL':olsr_model_results.bse, 'OLS_HC0_MODEL':olsr_hc0_model_results.bse,
'GLS_MODEL':gls_model_results.bse}
print(pd.DataFrame(data=data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment