Skip to content

Instantly share code, notes, and snippets.

@apoorvalal
Last active June 23, 2023 15:37
Show Gist options
  • Save apoorvalal/db934b9b9b8ec849c5aac914a6a2ca57 to your computer and use it in GitHub Desktop.
Save apoorvalal/db934b9b9b8ec849c5aac914a6a2ca57 to your computer and use it in GitHub Desktop.
split-sample estimation using a single regression + delta-method for IV
rm(list = ls())
library(LalRUtils); libreq(fixest, data.table, AER, car, sandwich)
# stata auto data; download from https://github.com/apoorvalal/LalRUtils/blob/master/data/auto.RData
data(auto); setDT(auto)
auto = auto[!is.na(rep78)][, id := .I] # drop missings and create ID
reg1 = feols(price ~ mpg + weight, data = auto, se = 'hc1')
reg2 = feols(weight ~ 1 | 0 | length ~ rep78, data = auto, se = 'hc1', subset = auto[, foreign == "Foreign"])
# %% stack
reg1samp = copy(auto); reg2samp = copy(auto[reg2$obs_selection$subset])
reg2samp[, samp2 := 1]
# append them
auto2 = rbindlist(list(reg1samp, reg2samp), fill = T)
auto2[is.na(samp2), samp2 := 0]
# %% tricksy columns
auto2[, y := price * (1-samp2) + weight * samp2]
# rhs in reg1
samp1v = c('mpg', 'weight') # can be done cleverly using labels(terms(reg1$fml))
# generate samp1 specific columns
auto2[, paste0(samp1v, "_1") := lapply(.SD, function(x) x * (1-samp2)), .SDcols = samp1v]
# generate samp2 specific columns
samp2v = c('length', 'rep78') # trickier for IV formula reg2$fml_all$iv
auto2[, paste0(samp2v, "_2") := lapply(.SD, function(x) x * samp2), .SDcols = samp2v]
# %% IV + OLS at once - deltaMethod for AER::ivreg works
reg3 = AER::ivreg(y ~ length_2 + mpg_1 + weight_1 + samp2 | rep78_2 + mpg_1 + weight_1 + samp2,
data = auto2)
vc = vcovCL(reg3, ~id)
car::deltaMethod(reg3, "mpg_1 + length_2", vc)
# %% and feols does too (thanks grant)
reg3 = feols(y ~ mpg_1 + weight_1 + samp2 | 0 | length_2 ~ rep78_2, auto2, cluster = ~id)
etable(list(reg1, reg2, reg3))
# fixest works (use 'fit_' prefix from vcov)
deltaMethod(reg3, "mpg_1 + fit_length_2", vcov(reg3))
# linalg-y solution (via doug rivers)
vcovget = function(m, ...){
X = model.matrix(m, ...)
n = nrow(X); k = ncol(X)
# white matrix
e = resid(m)/ sqrt(n / (n - k))
# bread
A = crossprod(X)
# meat
B = wcrossprod(X, w = e^2)
# bread meat bread
V = solve(A) %*% B %*% solve(A)
return(list(V = V, A = A, B = B, e = e, X = X))
}
# %%
f1 = feols(price ~ mpg + weight, data = auto, se = 'hc1')
f2 = feols(weight ~ 1 | 0 | length ~ rep78, data = auto, subset = ~ foreign == "Foreign", se = 'hc1')
r1 = vcovget(f1)
r2 = vcovget(f2, type ="iv.rhs2")
# this hardcodes the fact that m2 has a subset of obs - can be generalised
B12 = wcrossprod(r1$X[f2$obs_selection$subset, ], r2$X,
w = r1$e[f2$obs_selection$subset] * r2$e)
V12 = solve(r1$A) %*% B12 %*% solve(r2$A)
V = rbind(cbind(r1$V, V12), cbind(t(V12), r2$V))
list(Estimate = coef(f1)['mpg'] + coef(f2)['fit_length'],
se = sqrt(V[2, 2] + 2 * V[2, 5] + V[5, 5])
)
@apoorvalal
Copy link
Author

ah, so that's what set is for. Thanks, @MichaelChirico .

Thanks as usual for the ultra-clean implementation, @grantmcdermott

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment