Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created November 14, 2021 22:35
Show Gist options
  • Save BioSciEconomist/18d8f7f5eacbf7107c5a72b036d37ea9 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/18d8f7f5eacbf7107c5a72b036d37ea9 to your computer and use it in GitHub Desktop.
Example calculations of marginal effects for LOGIT vs. LPM, VIFs, and robust SEs
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex basic econometrics.py
# | DATE: 11/14/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example calculate ME for LOGIT vs LPM VIFs and robust SEs
# *----------------------------------------------------------------
# see also: ex basic econometrics.R https://gist.github.com/BioSciEconomist/b674d323b867b0da4483dfe97ed117ea
import pandas as pd
import numpy as np
import statsmodels.api as sm # import statsmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation
# read lalonde data (local copy of the classic Lalonde job-training dataset)
df = pd.read_csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")
df['ID'] = range(len(df)) # create ID; derived from the row count instead of hard-coding 614
df.head(100) # check
df.columns # check
df.info() # check
df.describe() # check
tmp = df[['re74','re75']] # check additional fields (pre-treatment earnings)
tmp.describe()
# basic regression lpm
# LPM: OLS on the binary outcome; coefficients are direct marginal effects
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit()
results.summary2()
# logit model
# same specification fit by maximum likelihood; coefficients are log-odds
model = smf.logit(formula= 'treat ~ age + nodegree + re74 + re75', data=df).fit()
model.summary()
#
# marginal effects using stats models functions
#
# at='overall' = average marginal effect across all observations;
# at='mean'    = marginal effect evaluated at the regressor means
print(model.get_margeff(at ='overall').summary()) # get marginal effects
print(model.get_margeff(at ='mean').summary()) # get marginal effects
# home grown marginal effects (using equivalent glm model)
# NOTE: the link must be an *instance* (links.logit()); passing the class
# itself is deprecated and rejected by recent statsmodels releases
model = smf.glm('treat ~ age + nodegree + re74 + re75', data=df,
                family=sm.families.Binomial(link=sm.genmod.families.links.logit()))
result = model.fit()
result.summary()
# regressor values at which the hand-rolled marginal effects are evaluated
# NOTE(review): rounding the means makes the 'at the mean' comparison only
# approximate -- drop round() for an exact match with get_margeff(at='mean')
mu1 = round(df.age.mean())
mu2 = round(df.nodegree.mean())
mu3 = round(df.re74.mean())
mu4 = round(df.re75.mean())
def mfx(result, mu1, mu2, mu3, mu4, par):
    """Marginal effect of one regressor from a fitted logit model.

    Evaluated at the point (mu1, mu2, mu3, mu4) using the logistic density:
        dP/dx_j = p * (1 - p) * b_j,  where p = 1 / (1 + exp(-XB))

    result: fitted model object exposing .params ordered as
            [intercept, b1, b2, b3, b4] (e.g. a statsmodels logit/glm fit)
    mu1..mu4: values (typically means) of the 1st..4th regressors
    par: index from 0 of the model parameter you want to convert to a
         marginal effect (0 is the intercept, regressors start at 1)
    note: this easily extends to more variables but does not handle
          predictors with multiple categories (unless they are dummy coded)
    """
    # positional indexing that is robust to result.params being a pandas
    # Series (integer [] on a Series is deprecated) or a plain array
    b = np.asarray(result.params)
    xb = b[0] + mu1 * b[1] + mu2 * b[2] + mu3 * b[3] + mu4 * b[4]
    # logistic pdf written as p*(1-p): algebraically identical to
    # exp(XB)/(1+exp(XB))**2 but avoids inf/inf -> NaN overflow for large XB
    p = 1.0 / (1.0 + np.exp(-xb))
    return p * (1.0 - p) * b[par]
# calculate marginal effets for each variable (matches at the mean ME using smf)
# par indexes result.params: 0 is the intercept, so regressors start at 1
# NOTE(review): the mu values were rounded above, so the match against
# get_margeff(at='mean') is approximate -- confirm against exact means
mfx(result,mu1,mu2,mu3,mu4,1) # age
mfx(result,mu1,mu2,mu3,mu4,2) # nodegree
mfx(result,mu1,mu2,mu3,mu4,3) # re74
mfx(result,mu1,mu2,mu3,mu4,4) # re75
#
# VIFs
#
def variance_inflation_factor(exog, exog_idx):
    """Variance inflation factor for one column of a design matrix.

    exog : ndarray, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog

    Returns VIF_j = 1 / (1 - R^2_j), where R^2_j is the R-squared from
    regressing column j on all of the remaining columns.
    """
    target = exog[:, exog_idx]
    # every column except the one under examination
    others = np.delete(exog, exog_idx, axis=1)
    aux_rsq = sm.OLS(target, others).fit().rsquared
    return 1.0 / (1.0 - aux_rsq)
# VIFs for the continuous/dummy regressors; the constant column is required
# so each auxiliary regression has an intercept (matches R's car::vif setup)
tmp = df[['age','nodegree','re74','re75']]
X = sm.add_constant(tmp)
pd.Series([variance_inflation_factor(X.values, i)
for i in range(X.shape[1])],
index=X.columns)
# compare to R vif function from car library
# library(car)
# df = read.csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")
# vif(lm(treat ~ age + nodegree + re74 + re75, data=df))
# age nodegree re74 re75
# 1.128 1.049 1.654 1.447
# in order to include categoricals we have to recode
# one dummy column per race level; a reference level is dropped below to
# avoid perfect collinearity with the constant
tmp = pd.get_dummies(df, columns=['race'])
tmp = tmp[['age','race_black','race_hispan','nodegree','re74','re75']]
X = sm.add_constant(tmp)
pd.Series([variance_inflation_factor(X.values, i)
for i in range(X.shape[1])],
index=X.columns)
#
# robust SEs
#
# basic regression lpm (default homoskedastic standard errors)
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit()
results.summary2()
# refit with HC1 heteroskedasticity-robust (White) standard errors;
# HC1 is the small-sample-corrected variant (Stata's default robust SE)
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit(cov_type = 'HC1')
results.summary2()
# similar results to R below:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.369393334 0.055311541 6.68 0.000000000055 ***
#age -0.001014507 0.001802206 -0.56 0.57
#nodegree 0.053119443 0.036936833 1.44 0.15
#re74 -0.000016601 0.000003369 -4.93 0.000001071947 ***
#re75 0.000000845 0.000006455 0.13 0.90
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment