apoorvalal/OaxacaBlinderATTEstimation.R

## OaxacaBlinderATTEstimation.R
rm(list = ls())
libreq(data.table, fixest, rio, ggplot2, ebal)


# %%
cps3 = import("cps3re74.dta") |> setDT()
cps3 = cps3 |> na.omit()
setnames(cps3, c("re78", "treat"), c("y", "d"))
xs = c("age", "age2", "ed", "black", "hisp", "married", "nodeg", "re74", "re75")

# %%
OB_ATT = function(df, xs){
  X1bar = df[d == 1, lapply(.SD, mean), .SDcols = xs] |> as.numeric()
  Xdemeaned = sweep(df[, ..xs], 2, X1bar)
  colnames(Xdemeaned) = paste0(colnames(Xdemeaned), "_dem")
  regdf = cbind(df[, .(y, d)], df[, ..xs], Xdemeaned)
  f = as.formula(paste0("y ~ d + ", paste(xs, collapse = "+"), "+",
    "d:(", paste(colnames(Xdemeaned), collapse = "+"), ")"))
  feols(f, data = regdf, vcov = 'hc1')
}

OB_ATT(cps3, xs)

# %%
data(lalonde.psid); setDT(lalonde.psid)
setnames(lalonde.psid, c("re78", "treat"), c("y", "d"))
xs = setdiff(colnames(lalonde.psid), c("y", 'd'))

OB_ATT(lalonde.psid, xs)

# %%
ebest = function(df, xs){
  ebobj = ebalance( df$d, df[, ..xs] |> as.matrix() )
  df[d == 1, mean(y)] - weighted.mean(df[d == 0, y], ebobj$w )
}

ebest(lalonde.psid, xs)

ebest(cps3, c("age", "ed", "black", "hisp", "married", "nodeg", "re74", "re75"))

# %% ####################################################
Kline_OB = function(y, xs, df){
  require(data.table)
  setDT(df)
  # outcome model in treatment
  omod = feols(formula_stitcher(y, xs), data = df[d == 0], vcov = "hc1")
  b = omod$coefficients; V0 = vcov(omod)
  # estimate effect
  df$Y0 = predict(omod, df)
  df[, dif := y - Y0]
  M1 = df[d == 1, mean(dif)]
  # standard error
  X = cbind(1, df[, ..xs]) |> as.matrix()
  N1 = sum(df$d)
  D = df$d |> as.matrix()
  V1 = df[d == 1, .(var(y), .N)] |> as.numeric()
  V1 = V1[1]/V1[2]
  Vdif = V1 + (t(D)  %*% X %*% V0  %*% t(X)  %*% (D / (N1^2)) )
  SE = sqrt(Vdif) |> as.numeric()
  # return
  c(est = M1, se = SE) |> round(3)
}

Kline_OB('y', c("age", "age2", "ed", "black", "hisp", "married", "nodeg", "re74", "re75"), cps3)

# %%
##      ## ######## ####  ######   ##     ## ########  ######
##  ##  ## ##        ##  ##    ##  ##     ##    ##    ##    ##
##  ##  ## ##        ##  ##        ##     ##    ##    ##
##  ##  ## ######    ##  ##   #### #########    ##     ######
##  ##  ## ##        ##  ##    ##  ##     ##    ##          ##
##  ##  ## ##        ##  ##    ##  ##     ##    ##    ##    ##
 ###  ###  ######## ####  ######   ##     ##    ##     ######


cps3$p = glm(formula_stitcher("d", xs), data = cps3, family = binomial())$fitted.values

# %%
π = mean(cps3$d); N1 = sum(cps3$d); N0 = nrow(cps3) - N1
cps3[d== 0, wlogit := p/(1-p) * (1-π)/π]
# %% OB weights

N = nrow(cps3)
X = cbind(1, cps3[, ..xs]) |> as.matrix()
D = cps3$d       |> as.matrix()
Y = cps3$y       |> as.matrix()
# initialize zero
S = matrix(0, nrow = N, ncol = N)
# replace diag with 1 - d
diag(S) = 1 - cps3$d
H = X %*% solve(t(X) %*% S %*% X)  %*% t(X) %*% S
obW = (1/N1) * t(D) %*% H |> as.numeric()

# %% # counterfactual mean
as.numeric(obW %*% Y)
cps3[d ==1, mean(y)] - as.numeric(obW %*% Y)
# %%
cps3[, wob := obW * N1]
with(cps3, plot(wlogit, wob))

# %% balance
cps3[d == 1 , mean(age)]
cps3[d == 0, weighted.mean(age, wlogit)]
cps3[d == 0, weighted.mean(age, wob)]

# %% OB estimate
	rm(list = ls())
	libreq(data.table, fixest, rio, ggplot2, ebal)



	# %%
	cps3 = import("cps3re74.dta") \|> setDT()
	cps3 = cps3 \|> na.omit()
	setnames(cps3, c("re78", "treat"), c("y", "d"))
	xs = c("age", "age2", "ed", "black", "hisp", "married", "nodeg", "re74", "re75")

	# %%
	OB_ATT = function(df, xs){
	X1bar = df[d == 1, lapply(.SD, mean), .SDcols = xs] \|> as.numeric()
	Xdemeaned = sweep(df[, ..xs], 2, X1bar)
	colnames(Xdemeaned) = paste0(colnames(Xdemeaned), "_dem")
	regdf = cbind(df[, .(y, d)], df[, ..xs], Xdemeaned)
	f = as.formula(paste0("y ~ d + ", paste(xs, collapse = "+"), "+",
	"d:(", paste(colnames(Xdemeaned), collapse = "+"), ")"))
	feols(f, data = regdf, vcov = 'hc1')
	}

	OB_ATT(cps3, xs)

	# %%
	data(lalonde.psid); setDT(lalonde.psid)
	setnames(lalonde.psid, c("re78", "treat"), c("y", "d"))
	xs = setdiff(colnames(lalonde.psid), c("y", 'd'))

	OB_ATT(lalonde.psid, xs)

	# %%
	ebest = function(df, xs){
	ebobj = ebalance( df$d, df[, ..xs] \|> as.matrix() )
	df[d == 1, mean(y)] - weighted.mean(df[d == 0, y], ebobj$w )
	}

	ebest(lalonde.psid, xs)

	ebest(cps3, c("age", "ed", "black", "hisp", "married", "nodeg", "re74", "re75"))

	# %% ####################################################
	Kline_OB = function(y, xs, df){
	require(data.table)
	setDT(df)
	# outcome model in treatment
	omod = feols(formula_stitcher(y, xs), data = df[d == 0], vcov = "hc1")
	b = omod$coefficients; V0 = vcov(omod)
	# estimate effect
	df$Y0 = predict(omod, df)
	df[, dif := y - Y0]
	M1 = df[d == 1, mean(dif)]
	# standard error
	X = cbind(1, df[, ..xs]) \|> as.matrix()
	N1 = sum(df$d)
	D = df$d \|> as.matrix()
	V1 = df[d == 1, .(var(y), .N)] \|> as.numeric()
	V1 = V1[1]/V1[2]
	Vdif = V1 + (t(D) %% X %% V0 %% t(X) %% (D / (N1^2)) )
	SE = sqrt(Vdif) \|> as.numeric()
	# return
	c(est = M1, se = SE) \|> round(3)
	}

	Kline_OB('y', c("age", "age2", "ed", "black", "hisp", "married", "nodeg", "re74", "re75"), cps3)

	# %%
	## ## ######## #### ###### ## ## ######## ######
	## ## ## ## ## ## ## ## ## ## ## ##
	## ## ## ## ## ## ## ## ## ##
	## ## ## ###### ## ## #### ######### ## ######
	## ## ## ## ## ## ## ## ## ## ##
	## ## ## ## ## ## ## ## ## ## ## ##
	### ### ######## #### ###### ## ## ## ######


	cps3$p = glm(formula_stitcher("d", xs), data = cps3, family = binomial())$fitted.values

	# %%
	π = mean(cps3$d); N1 = sum(cps3$d); N0 = nrow(cps3) - N1
	cps3[d== 0, wlogit := p/(1-p) * (1-π)/π]
	# %% OB weights

	N = nrow(cps3)
	X = cbind(1, cps3[, ..xs]) \|> as.matrix()
	D = cps3$d \|> as.matrix()
	Y = cps3$y \|> as.matrix()
	# initialize zero
	S = matrix(0, nrow = N, ncol = N)
	# replace diag with 1 - d
	diag(S) = 1 - cps3$d
	H = X %% solve(t(X) %% S %% X) %% t(X) %*% S
	obW = (1/N1) * t(D) %*% H \|> as.numeric()

	# %% # counterfactual mean
	as.numeric(obW %*% Y)
	cps3[d ==1, mean(y)] - as.numeric(obW %*% Y)
	# %%
	cps3[, wob := obW * N1]
	with(cps3, plot(wlogit, wob))

	# %% balance
	cps3[d == 1 , mean(age)]
	cps3[d == 0, weighted.mean(age, wlogit)]
	cps3[d == 0, weighted.mean(age, wob)]

	# %% OB estimate