Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active August 30, 2022 14:30
Show Gist options
  • Save sachinsdate/3fd5748706a371ec5174d9b6d8e04bc2 to your computer and use it in GitHub Desktop.
Save sachinsdate/3fd5748706a371ec5174d9b6d8e04bc2 to your computer and use it in GitHub Desktop.
A tutorial on instrumental variables regression using the IV2SLS class of statsmodels
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS
# Load the Panel Study of Income Dynamics (PSID) data set into a DataFrame.
df = pd.read_csv('PSID1976.csv', header=0)

# Use the subset of the data set where participation == 'yes'.
# Take an explicit copy: a new column (ln_wage) is added to this frame further
# down, and assigning into a filtered selection of df can raise pandas'
# SettingWithCopyWarning. The copy makes df_1975 an independent DataFrame.
df_1975 = df.query("participation == 'yes'").copy()
print(df_1975)

# Confirm that meducation and feducation satisfy the relevance condition for
# education: build and train an OLS model that regresses education on
# meducation and feducation, then verify from the printed summary (F-test)
# that the coefficients of meducation and feducation are jointly significant.
reg_expr = 'education ~ meducation + feducation'
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# Build the dependent variable column: the natural log of wage.
df_1975['ln_wage'] = np.log(df_1975['wage'])

# Build out the exog matrix. Statsmodels requires this matrix to contain all
# the endogenous and exogenous variables, plus the constant.
exog = add_constant(df_1975[['education']])

# Build out the instruments matrix. Statsmodels requires this matrix to
# contain not only all the instruments but also the variables in exog that
# will NOT be instrumented (here, only the constant).
instruments = add_constant(df_1975[['meducation', 'feducation']])

# Build and train the IV2SLS model.
iv2sls_model = IV2SLS(
    endog=df_1975['ln_wage'],
    exog=exog,
    instrument=instruments,
)
iv2sls_model_results = iv2sls_model.fit()

# Print the training summary.
print(iv2sls_model_results.summary())

# Compare the performance of 2SLS with a plain OLS regression of ln(wage) on
# education. NOTE: reg_expr, olsr_model and olsr_model_results are rebound
# here, so the first-stage relevance model fitted above is no longer reachable
# through these names after this point.
reg_expr = 'ln_wage ~ education'
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment