Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A comparison of heteroskedasticity consistent estimators
import pandas as pd
import statsmodels.formula.api as smf
from patsy import dmatrices
from matplotlib import pyplot as plt
#Load the US Census Bureau data into a Dataframe
df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0)
#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation
reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ'
#Build and train the model and print the training summary
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
#Carve out the y and X matrices
y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe')
#Get the predicted values of y from the fitted model
y_cap = olsr_model_results.predict(X_train)
#Plot the model's residuals against the predicted values of y
plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level')
plt.ylabel('Residual error')
plt.scatter(y_cap, olsr_model_results.resid)
plt.show()
#Ask statsmodels to use the HC estimators
olsr_model_results = olsr_model.fit(cov_type='HC0')
print(olsr_model_results.summary())
olsr_model_results = olsr_model.fit(cov_type='HC1')
print(olsr_model_results.summary())
olsr_model_results = olsr_model.fit(cov_type='HC2')
print(olsr_model_results.summary())
olsr_model_results = olsr_model.fit(cov_type='HC3')
print(olsr_model_results.summary())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment