# sachinsdate/instrumental_variables_regression.py

## instrumental_variables_regression.py
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS


# Load the Panel Study of Income Dynamics (PSID) data set into a DataFrame.
df = pd.read_csv('PSID1976.csv', header=0)

# Use the subset of rows where participation == 'yes'. The explicit .copy()
# gives df_1975 its own data, so adding the ln_wage column further down does
# not trigger pandas' SettingWithCopyWarning.
df_1975 = df.query("participation == 'yes'").copy()
print(df_1975)

# Confirm that meducation and feducation satisfy the relevance condition for
# education.
reg_expr = 'education ~ meducation + feducation'

# Build and train an OLS model that regresses education on meducation and
# feducation, then use the F-test reported in the summary to verify that the
# coefficients of meducation and feducation are jointly significant.
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# Build the dependent variable column: the natural log of wage.
df_1975['ln_wage'] = np.log(df_1975['wage'])

# Build the exog matrix. Statsmodels requires this matrix to contain all the
# endogenous and exogenous regressors, plus the constant. Here that is the
# constant and education.
exog = df_1975[['education']]
exog = add_constant(exog)

# Build the instruments matrix. Statsmodels requires this matrix to contain
# not only all the instruments but also the variables in exog that will NOT be
# instrumented. Here that is meducation and feducation plus the constant.
instruments = df_1975[['meducation', 'feducation']]
instruments = add_constant(instruments)

# Build and train the IV2SLS model.
iv2sls_model = IV2SLS(endog=df_1975['ln_wage'], exog=exog, instrument=instruments)
iv2sls_model_results = iv2sls_model.fit()

# Print the training summary.
print(iv2sls_model_results.summary())

# Compare the performance of 2SLS with a plain OLS of ln(wage) on education.
# NOTE: reg_expr, olsr_model and olsr_model_results are deliberately rebound
# here, replacing the relevance-check model fitted above.
reg_expr = 'ln_wage ~ education'
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
	import pandas as pd
	import numpy as np
	import statsmodels.formula.api as smf
	from statsmodels.api import add_constant
	from statsmodels.sandbox.regression.gmm import IV2SLS


	# Load the Panel Study of Income Dynamics (PSID) data set into a DataFrame.
	df = pd.read_csv('PSID1976.csv', header=0)

	# Use the subset of rows where participation == 'yes'. The explicit .copy()
	# gives df_1975 its own data, so adding the ln_wage column further down does
	# not trigger pandas' SettingWithCopyWarning.
	df_1975 = df.query("participation == 'yes'").copy()
	print(df_1975)

	# Confirm that meducation and feducation satisfy the relevance condition for
	# education.
	reg_expr = 'education ~ meducation + feducation'

	# Build and train an OLS model that regresses education on meducation and
	# feducation, then use the F-test reported in the summary to verify that the
	# coefficients of meducation and feducation are jointly significant.
	olsr_model = smf.ols(formula=reg_expr, data=df_1975)
	olsr_model_results = olsr_model.fit()
	print(olsr_model_results.summary())

	# Build the dependent variable column: the natural log of wage.
	df_1975['ln_wage'] = np.log(df_1975['wage'])

	# Build the exog matrix. Statsmodels requires this matrix to contain all the
	# endogenous and exogenous regressors, plus the constant. Here that is the
	# constant and education.
	exog = df_1975[['education']]
	exog = add_constant(exog)

	# Build the instruments matrix. Statsmodels requires this matrix to contain
	# not only all the instruments but also the variables in exog that will NOT be
	# instrumented. Here that is meducation and feducation plus the constant.
	instruments = df_1975[['meducation', 'feducation']]
	instruments = add_constant(instruments)

	# Build and train the IV2SLS model.
	iv2sls_model = IV2SLS(endog=df_1975['ln_wage'], exog=exog, instrument=instruments)
	iv2sls_model_results = iv2sls_model.fit()

	# Print the training summary.
	print(iv2sls_model_results.summary())

	# Compare the performance of 2SLS with a plain OLS of ln(wage) on education.
	# NOTE: reg_expr, olsr_model and olsr_model_results are deliberately rebound
	# here, replacing the relevance-check model fitted above.
	reg_expr = 'ln_wage ~ education'
	olsr_model = smf.ols(formula=reg_expr, data=df_1975)
	olsr_model_results = olsr_model.fit()
	print(olsr_model_results.summary())