A tutorial on instrumental variables regression using the IV2SLS class of statsmodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS

# Load the Panel Study of Income Dynamics (PSID) into a DataFrame
df = pd.read_csv('PSID1976.csv', header=0)

# Use the subset of the dataset where participation == 'yes'.
# Take an explicit copy: a new column (ln_wage) is added to this subset below,
# and assigning into a filtered selection of df would otherwise trigger
# pandas' SettingWithCopyWarning.
df_1975 = df.query("participation == 'yes'").copy()
print(df_1975)

# Let's confirm that meducation and feducation satisfy the relevance condition
# for education
reg_expr = 'education ~ meducation + feducation'

# Build and train an OLS model that regresses education on meducation and
# feducation, and verify using the F-test reported in the summary that the
# coefficients of meducation and feducation are jointly significant
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# Build the dependent variable column
df_1975['ln_wage'] = np.log(df_1975['wage'])

# Build out the exog matrix. Statsmodels requires this matrix to contain all
# the endogenous and exogenous variables, plus the constant.
exog = df_1975[['education']]
exog = add_constant(exog)

# Build out the instruments matrix. Statsmodels requires this matrix to contain
# not only all the instruments but also the variables in exog that will NOT be
# instrumented (here, only the constant)
instruments = df_1975[['meducation', 'feducation']]
instruments = add_constant(instruments)

# Build and train the IV2SLS model
iv2sls_model = IV2SLS(endog=df_1975['ln_wage'], exog=exog, instrument=instruments)
iv2sls_model_results = iv2sls_model.fit()

# Print the training summary
print(iv2sls_model_results.summary())

# Compare the performance of 2SLS with OLS of ln(wage) on education
reg_expr = 'ln_wage ~ education'
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment