Created
October 30, 2022 17:02
-
-
Save sachinsdate/0c20f63a1770e3cb2b4a7fc4dce844bd to your computer and use it in GitHub Desktop.
A tutorial on fitting a linear model using the Generalized Least Squares estimator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import statsmodels.formula.api as smf | |
import statsmodels.api as sm | |
from patsy import dmatrices | |
from matplotlib import pyplot as plt | |
import numpy as np | |
#Load the US Census Bureau data into a Dataframe | |
df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0) | |
#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation | |
reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ' | |
#Build and train the OLS regression model and print the training summary | |
olsr_model = smf.ols(formula=reg_expr, data=df) | |
olsr_model_results = olsr_model.fit() | |
print(olsr_model_results.summary()) | |
#Also refit the OLS model using White's Heteroskedasticity Consistent Estimator of the | |
# variances of the estimated coefficients | |
olsr_hc0_model_results = olsr_model.fit(cov_type='HC0') | |
print(olsr_hc0_model_results.summary()) | |
#Carve out the y and X matrices | |
y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe') | |
#Get the predicted values of y from the OLS fitted model (without the White's HC estimator) | |
y_cap = olsr_model_results.predict(X_train) | |
#Plot the model's residuals against the predicted values of y to confirm the presence of | |
# heteroskedasticity | |
plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level') | |
plt.ylabel('Residual error') | |
plt.scatter(y_cap, olsr_model_results.resid) | |
plt.show() | |
#We will now build GLS model for this data. To do so, we need to estimate the Sigma matrix, | |
# namely the matrix of correlations between the error terms of the Linear model. | |
#We will estimate Sigma by chopping y_cap into intervals of size 1. For each interval, | |
# we will collect the residual errors from the OLSR model and compute their sample variance. Next, | |
# we'll regress these sample variances on the centers of the intervals of y_cap. For e.g. the | |
# center of the interval (5, 6] is 5.5. We will use this fitted regression model to estimate the | |
# sample variance of the residuals of the OLSR model that we fitted earlier. And we'll use these | |
# predicted residual sample variances as the estimates of the corresponding error term's population | |
# variances. | |
#Create a range object for intervalizing (binning) y_cap | |
interval_range=range(round(min(y_cap)), round(max(y_cap))) | |
#Convert it to a list | |
ycap_intervals=list(interval_range) | |
#Intervalize y_cap | |
intervalized_ycap=pd.cut(x=y_cap, bins=ycap_intervals) | |
#Print it out to see how it looks like. We see that the intervalizing has created 37 unique | |
# intervals across 3219 y values | |
print(intervalized_ycap) | |
#Use a dictionary to extract the set of unique intervals of y_cap. The keys of the dict are the | |
# intervals while the values will be list of residuals corresponding to those intervals. We'll | |
# fill in the values soon. | |
distinct_intervals_of_ycap={} | |
for intr in list(intervalized_ycap): | |
distinct_intervals_of_ycap[intr] = [] | |
#Create a DataFrame consisting of fitted (predicted) y values from the OLSR model, the interval | |
# that each fitted value lies in, and the residual from the fitted OLSR model corresponding to | |
# each fitted y value | |
series_dict={'Y_CAP':y_cap, 'INTR':intervalized_ycap, 'RESID':olsr_model_results.resid} | |
df_intervalized_ycap = pd.DataFrame(data=series_dict) | |
#Now iterate through the rows of this DataFrame, and for each row, collect the residual, | |
# and using the interval as the key, add the residual into the approrpiate slot | |
# in distinct_intervals_of_ycap dict | |
for index, row in df_intervalized_ycap.iterrows(): | |
resid = row['RESID'] | |
interval = row['INTR'] | |
distinct_intervals_of_ycap[interval].append(resid) | |
#Next, construct two arrays of size=length of data set. The fist array will store the midpoints | |
# of the intervals of fitted y values. The second array will store the sample variance of the | |
# residuals corresponding to all the fitted y values that lie within the interval. | |
midpoints_of_ycap_intervals = [] | |
sample_variance_of_intervalized_residuals = [] | |
for key in distinct_intervals_of_ycap.keys(): | |
if pd.isnull(key): | |
continue | |
resids = distinct_intervals_of_ycap[key] | |
#Skip intervals with <= 1 residual | |
if len(resids) > 1: | |
sample_variance_of_intervalized_residuals.append(np.var(a=resids, ddof=1)) | |
midpoints_of_ycap_intervals.append(key.mid) | |
#Plot the sample variances of intervalized residuals against the midpoints of the intervalized | |
# fitted y to get a sense of the relationship between them | |
plt.xlabel('Midpoints of intervalized fitted y values') | |
plt.ylabel('Variance of intervalized residuals') | |
plt.scatter(x=midpoints_of_ycap_intervals, y=sample_variance_of_intervalized_residuals) | |
plt.show() | |
#From looking at the plot, we suspect an exponential relationship between the variance and | |
# ycap. An exp relation is commonly seen when the errors are heteroskedastic. | |
#We will build a log-linear model between the intervalized variances and the intervalized fitted | |
# y. Specifically, we'll regress log(sample_variance_of_intervalized_residuals) on | |
# midpoints_of_ycap_intervals and a constant | |
log_sample_variance_of_intervalized_residuals = np.log(sample_variance_of_intervalized_residuals) | |
#Construct a DataFrame consisting of log_sample_variance_of_intervalized_residuals and | |
# sample_variance_of_intervalized_residuals | |
df_ycap_mids_resid_vars = pd.DataFrame(data={ | |
'LOG_INTERVALIZED_RESID_VARIANCE':log_sample_variance_of_intervalized_residuals, | |
'MIDPOINT_INTERVALIZED_YCAP':midpoints_of_ycap_intervals}) | |
ols_resid_variance_model = smf.ols( | |
'LOG_INTERVALIZED_RESID_VARIANCE ~ MIDPOINT_INTERVALIZED_YCAP', data=df_ycap_mids_resid_vars) | |
ols_resid_variance_model_results = ols_resid_variance_model.fit() | |
#Use this model to predict the sample variances of all residuals in the original OLSR model in | |
# which we regressed Percent_Households_Below_Poverty_Level on Median_Age, Homeowner_Vacancy_Rate, | |
# and Percent_Pop_25_And_Over_With_College_Or_Higher_Educ | |
df_ycap_from_ols_model = pd.DataFrame(data={'MIDPOINT_INTERVALIZED_YCAP':y_cap}) | |
log_resid_sample_variance_estimate = ols_resid_variance_model_results.predict( | |
df_ycap_from_ols_model['MIDPOINT_INTERVALIZED_YCAP']) | |
resid_sample_variance_estimate = np.exp(log_resid_sample_variance_estimate) | |
#Let's visually inspect the estimated sample variance of residuals by plotting them against the | |
# fitted values from the original OLSR model | |
plt.xlabel('Fitted y values') | |
plt.ylabel('Estimated sample variance of residuals') | |
plt.scatter(x=y_cap, y=resid_sample_variance_estimate) | |
plt.show() | |
#The plot shows the exponential relationship that we are expecting | |
#Let's construct the Sigma matrix of size n x n where n=length of data set. We'll begin with | |
# creating an Identity matrix of size n x n and fill in its diagonal with the estimated sample | |
# variances of residuals | |
sigma_matrix = np.identity(len(y_cap)) | |
np.fill_diagonal(sigma_matrix, np.exp(resid_sample_variance_estimate)) | |
#Build and train the GLS model using this Sigma matrix | |
gls_model = sm.GLS(endog=y_train, exog=X_train, sigma=sigma_matrix) | |
gls_model_results = gls_model.fit() | |
print(gls_model_results.summary()) | |
#Let's compare the parameter estimates from the OLS model, the OLS model with White's correction | |
# for heteroskedastic errors, and the GLS model | |
data={'OLS_MODEL':olsr_model_results.params, 'OLS_HC0_MODEL':olsr_hc0_model_results.params, | |
'GLS_MODEL':gls_model_results.params} | |
print(pd.DataFrame(data=data)) | |
#Compare the standard errors of the parameter estimates from the OLS model, the OLS model with | |
# White's correction for heteroskedastic errors, and the GLS model | |
data={'OLS_MODEL':olsr_model_results.bse, 'OLS_HC0_MODEL':olsr_hc0_model_results.bse, | |
'GLS_MODEL':gls_model_results.bse} | |
print(pd.DataFrame(data=data)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment