Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created November 14, 2021 22:35
Show Gist options
  • Save BioSciEconomist/18d8f7f5eacbf7107c5a72b036d37ea9 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/18d8f7f5eacbf7107c5a72b036d37ea9 to your computer and use it in GitHub Desktop.
Example calculations of marginal effects for LOGIT vs. LPM, VIFs, and robust SEs
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex basic econometrics.py
# | DATE: 11/14/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: example calculate ME for LOGIT vs LPM VIFs and robust SEs
# *----------------------------------------------------------------
# see also: ex basic econometrics.R https://gist.github.com/BioSciEconomist/b674d323b867b0da4483dfe97ed117ea
import pandas as pd
import numpy as np
import statsmodels.api as sm # import statsmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation
# read lalonde data (local copy of the classic Lalonde job-training dataset)
df = pd.read_csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")
df['ID'] = range(len(df)) # create ID; derived from the row count instead of hard-coding 614
df.head(100) # check
df.columns # check
df.info() # check
df.describe() # check
tmp = df[['re74','re75']] # check additional fields (pre-treatment earnings)
tmp.describe()
# basic regression lpm
# LPM: OLS on the binary outcome; coefficients are direct marginal effects
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit()
results.summary2()
# logit model
# same specification fit by maximum likelihood; coefficients are log-odds
model = smf.logit(formula= 'treat ~ age + nodegree + re74 + re75', data=df).fit()
model.summary()
#
# marginal effects using stats models functions
#
# at='overall' = average marginal effect across all observations;
# at='mean'    = marginal effect evaluated at the regressor means
print(model.get_margeff(at ='overall').summary()) # get marginal effects
print(model.get_margeff(at ='mean').summary()) # get marginal effects
# home grown marginal effects (using equivalent glm model)
# NOTE: the link must be an *instance* (links.logit()); passing the class
# itself is deprecated and rejected by recent statsmodels releases
model = smf.glm('treat ~ age + nodegree + re74 + re75', data=df,
                family=sm.families.Binomial(link=sm.genmod.families.links.logit()))
result = model.fit()
result.summary()
# regressor values at which the hand-rolled marginal effects are evaluated
# NOTE(review): rounding the means makes the 'at the mean' comparison only
# approximate -- drop round() for an exact match with get_margeff(at='mean')
mu1 = round(df.age.mean())
mu2 = round(df.nodegree.mean())
mu3 = round(df.re74.mean())
mu4 = round(df.re75.mean())
def mfx(result, mu1, mu2, mu3, mu4, par):
    """Marginal effect of one regressor from a fitted logit model.

    Evaluated at the point (mu1, mu2, mu3, mu4) using the logistic density:
        dP/dx_j = p * (1 - p) * b_j,  where p = 1 / (1 + exp(-XB))

    result: fitted model object exposing .params ordered as
            [intercept, b1, b2, b3, b4] (e.g. a statsmodels logit/glm fit)
    mu1..mu4: values (typically means) of the 1st..4th regressors
    par: index from 0 of the model parameter you want to convert to a
         marginal effect (0 is the intercept, regressors start at 1)
    note: this easily extends to more variables but does not handle
          predictors with multiple categories (unless they are dummy coded)
    """
    # positional indexing that is robust to result.params being a pandas
    # Series (integer [] on a Series is deprecated) or a plain array
    b = np.asarray(result.params)
    xb = b[0] + mu1 * b[1] + mu2 * b[2] + mu3 * b[3] + mu4 * b[4]
    # logistic pdf written as p*(1-p): algebraically identical to
    # exp(XB)/(1+exp(XB))**2 but avoids inf/inf -> NaN overflow for large XB
    p = 1.0 / (1.0 + np.exp(-xb))
    return p * (1.0 - p) * b[par]
# calculate marginal effets for each variable (matches at the mean ME using smf)
# par indexes result.params: 0 is the intercept, so regressors start at 1
# NOTE(review): the mu values were rounded above, so the match against
# get_margeff(at='mean') is approximate -- confirm against exact means
mfx(result,mu1,mu2,mu3,mu4,1) # age
mfx(result,mu1,mu2,mu3,mu4,2) # nodegree
mfx(result,mu1,mu2,mu3,mu4,3) # re74
mfx(result,mu1,mu2,mu3,mu4,4) # re75
#
# VIFs
#
def variance_inflation_factor(exog, exog_idx):
    """Variance inflation factor for one column of a design matrix.

    exog : ndarray, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog

    Returns VIF_j = 1 / (1 - R^2_j), where R^2_j is the R-squared from
    regressing column j on all of the remaining columns.
    """
    target = exog[:, exog_idx]
    # every column except the one under examination
    others = np.delete(exog, exog_idx, axis=1)
    aux_rsq = sm.OLS(target, others).fit().rsquared
    return 1.0 / (1.0 - aux_rsq)
# VIFs for the continuous/dummy regressors; the constant column is required
# so each auxiliary regression has an intercept (matches R's car::vif setup)
tmp = df[['age','nodegree','re74','re75']]
X = sm.add_constant(tmp)
pd.Series([variance_inflation_factor(X.values, i)
for i in range(X.shape[1])],
index=X.columns)
# compare to R vif function from car library
# library(car)
# df = read.csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")
# vif(lm(treat ~ age + nodegree + re74 + re75, data=df))
# age nodegree re74 re75
# 1.128 1.049 1.654 1.447
# in order to include categoricals we have to recode
# one dummy column per race level; a reference level is dropped below to
# avoid perfect collinearity with the constant
tmp = pd.get_dummies(df, columns=['race'])
tmp = tmp[['age','race_black','race_hispan','nodegree','re74','re75']]
X = sm.add_constant(tmp)
pd.Series([variance_inflation_factor(X.values, i)
for i in range(X.shape[1])],
index=X.columns)
#
# robust SEs
#
# basic regression lpm (default homoskedastic standard errors)
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit()
results.summary2()
# refit with HC1 heteroskedasticity-robust (White) standard errors;
# HC1 is the small-sample-corrected variant (Stata's default robust SE)
results = smf.ols('treat ~ age + nodegree + re74 + re75', data=df).fit(cov_type = 'HC1')
results.summary2()
# similar results to R below:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.369393334 0.055311541 6.68 0.000000000055 ***
#age -0.001014507 0.001802206 -0.56 0.57
#nodegree 0.053119443 0.036936833 1.44 0.15
#re74 -0.000016601 0.000003369 -4.93 0.000001071947 ***
#re75 0.000000845 0.000006455 0.13 0.90
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment