sachinsdate/white_hc_matrix.py

## white_hc_matrix.py
import pandas as pd
import statsmodels.formula.api as smf
from patsy import dmatrices
from matplotlib import pyplot as plt


#Load the US Census Bureau data into a Dataframe
df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0)

#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation
reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ'

#Build and train the model and print the training summary
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

#Carve out the y and X matrices
y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe')

#Get the predicted values of y from the fitted model
y_cap = olsr_model_results.predict(X_train)

#Plot the model's residuals against the predicted values of y
plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level')
plt.ylabel('Residual error')
plt.scatter(y_cap, olsr_model_results.resid)
plt.show()


#Ask statsmodels to use the HC estimators
olsr_model_results = olsr_model.fit(cov_type='HC0')
print(olsr_model_results.summary())

olsr_model_results = olsr_model.fit(cov_type='HC1')
print(olsr_model_results.summary())

olsr_model_results = olsr_model.fit(cov_type='HC2')
print(olsr_model_results.summary())

olsr_model_results = olsr_model.fit(cov_type='HC3')
print(olsr_model_results.summary())
	import pandas as pd
	import statsmodels.formula.api as smf
	from patsy import dmatrices
	from matplotlib import pyplot as plt


	#Load the US Census Bureau data into a Dataframe
	df = pd.read_csv('us_census_bureau_acs_2015_2019_subset.csv', header=0)

	#Construct the model's equation in Patsy syntax. Statsmodels will automatically add the intercept and so we don't explicitly specify it in the model's equation
	reg_expr = 'Percent_Households_Below_Poverty_Level ~ Median_Age + Homeowner_Vacancy_Rate + Percent_Pop_25_And_Over_With_College_Or_Higher_Educ'

	#Build and train the model and print the training summary
	olsr_model = smf.ols(formula=reg_expr, data=df)
	olsr_model_results = olsr_model.fit()
	print(olsr_model_results.summary())

	#Carve out the y and X matrices
	y_train, X_train = dmatrices(reg_expr, df, return_type='dataframe')

	#Get the predicted values of y from the fitted model
	y_cap = olsr_model_results.predict(X_train)

	#Plot the model's residuals against the predicted values of y
	plt.xlabel('Predicted value of Percent_Households_Below_Poverty_Level')
	plt.ylabel('Residual error')
	plt.scatter(y_cap, olsr_model_results.resid)
	plt.show()


	#Ask statsmodels to use the HC estimators
	olsr_model_results = olsr_model.fit(cov_type='HC0')
	print(olsr_model_results.summary())

	olsr_model_results = olsr_model.fit(cov_type='HC1')
	print(olsr_model_results.summary())

	olsr_model_results = olsr_model.fit(cov_type='HC2')
	print(olsr_model_results.summary())

	olsr_model_results = olsr_model.fit(cov_type='HC3')
	print(olsr_model_results.summary())