khakieconomics/double_ml_hierarchical_bayes.R

## double_ml_hierarchical_bayes.R
library(rstanarm); library(tidyverse)
options(mc.cores = parallel::detectCores())
set.seed(42)
data("radon")
head(treatment_sample)

# Some levels have no variance in the outcomes, making likelihood estimates impossible
# Adding a tiny bit of noise fixes the problem
radon$log_uranium <- rnorm(nrow(radon), radon$log_uranium, 0.05)

# Split the data
sample_split <- sample(1:nrow(radon), round(nrow(radon)/2))
outcome_sample <- radon[sample_split,]
treatment_sample <- radon[-sample_split,]

# Fit the outcome model
summary(lm(log_uranium ~ log_radon + floor + county, data = treatment_sample))
outcome_model <- stan_lmer(log_uranium ~ log_radon + (1 | county),
                           data = treatment_sample,
                           cores=  4, iter = 1000,
                           prior = student_t(df = 5, location = 0, scale = 0.5),
                           prior_intercept = student_t(df = 7, location = 0, scale = 1),
                           prior_aux = student_t(df = 4, scale = 1))

# Fit the treatment model
treatment_model <- stan_glmer(floor ~ log_radon + (1 | county),
                              data = treatment_sample, cores = 4,
                              iter = 1000,
                              family = binomial(link = "logit"),
                              prior = student_t(df = 5, location = 0, scale = 0.5),
                              prior_intercept = student_t(df = 3, location = 0, scale = 2))

# Get posterior mean predictions for treatments and outcomes
treatment_predictions <- t(posterior_linpred(treatment_model, newdata = outcome_sample, transform = T))
hist(treatment_predictions)

outcome_predictions <- t(posterior_linpred(outcome_model,newdata = outcome_sample))
hist(outcome_predictions)

# This contains the residuals from the treatment model applied to the outcome dataset.
# Note there are 2000 columns---a separate vector of residuals for every draw from the posterior.
treatment_errors <- matrix(outcome_sample$floor, nrow(outcome_sample), ncol(treatment_predictions)) - treatment_predictions
outcome_errors <- matrix(outcome_sample$log_uranium, nrow(outcome_sample), ncol(treatment_predictions)) - outcome_predictions

# Get the theta estimates for each set of residuals
thetahat <- sapply(1:2000, function(i) coef(lm(outcome_errors[,i] ~ -1 + treatment_errors[,i])))

# And get our posterior interval
ATE_bayes <- mean(thetahat)
confint_bayes <- quantile(thetahat, c(0.025, 0.975))


# Let’s now do double-ML with Random Forest -------------------------------
library(randomForest)

# One-hot encode the categorical features
radon_onehot <- model.matrix(~ ., data = radon)[,-1]

# Perform a split
onehot_outcome <- radon_onehot[sample_split,]
onehot_treatment <- radon_onehot[-sample_split,]

# Fit models and construct residuals
treatment_model_rf <- randomForest(factor(floor) ~ . - log_uranium, data = onehot_treatment)
outcome_model_rf <- randomForest(log_uranium ~ . - floor, data = onehot_treatment, mtry = 15)
resid_outcome_rf <- onehot_outcome[,"log_uranium"] - predict(outcome_model_rf, newdata = onehot_outcome)
resid_treatment_rf <- onehot_outcome[,"floor"] - predict(treatment_model_rf, newdata = onehot_outcome, type ="prob")[,2]

# Get our treatment effect estimate
fit_rf <- lm(resid_outcome_rf ~ -1 + resid_treatment_rf)
se <- (1/mean(resid_treatment_rf^2))^2 * mean(resid_treatment_rf^2 * resid(fit_rf)^2)
rf_confint <- c(coef(fit_rf) - 1.96*se, coef(fit_rf) + 1.96*se)


# Now a Naive BART implementation -----------------------------------------
library(BART)

y_train <- radon_onehot[,"log_uranium"]
x_train <- radon_onehot[,-ncol(radon_onehot)]
bart_est <- wbart(x.train = x_train, y.train = y_train, sparse = T)


x_test_1 <- x_train
x_test_1[,"floor"] <- 1
x_test_0 <- x_train
x_test_0[,"floor"] <- 0

y_pred_1 <- predict(bart_est, newdata = x_test_1)
y_pred_0 <- predict(bart_est, newdata = x_test_0)

# Difference = posterior of prediction
Ind_TE <- y_pred_1 - y_pred_0
ATE <- rowMeans(Ind_TE)
hist(ATE)

ATE_mean <- mean(ATE)
ATE_confint <- quantile(ATE, c(0.025, 0.975))


# Visualize comparative effects -------------------------------------------

data_frame(model = c("Bayesian hierarchical",
                     "RF Double ML",
                     "BART"),
           mean = c(ATE_bayes, coef(fit_rf), ATE_mean),
           lower = c(confint_bayes[1], confint(fit_rf)[1], ATE_confint[1]),
           upper = c(confint_bayes[2], confint(fit_rf)[2], ATE_confint[2])) %>%
  ggplot(aes(x = model)) +
  geom_linerange(aes(ymin = lower, ymax = upper), colour = "green") +
  geom_point(aes(y = mean), size = 2) +
  theme_bw()

# So it seems the Bayesian Hierarchical estimate is way more precise. Why might this be the case?

error_scales <- data_frame(model = c("treatment model", "outcome model"),
                           `error scale bayes` = c(sd(treatment_errors), sd(outcome_errors)),
                           `error scale RF` = c(sd(resid_treatment_rf), sd(resid_outcome_rf)))

error_scales %>%
  gather(measure, value, -model) %>%
  ggplot(aes(x = model)) +
  geom_point(aes(y = value, colour = measure), size = 2)

# There we go: because the error for the hierarchical bayes model is so much smaller,
# we get a way narrower confidence band. This is a feature of the well-estimated random effects
# doing such a good job for prediction
	library(rstanarm); library(tidyverse)
	options(mc.cores = parallel::detectCores())
	set.seed(42)
	data("radon")
	head(treatment_sample)

	# Some levels have no variance in the outcomes, making likelihood estimates impossible
	# Adding a tiny bit of noise fixes the problem
	radon$log_uranium <- rnorm(nrow(radon), radon$log_uranium, 0.05)

	# Split the data
	sample_split <- sample(1:nrow(radon), round(nrow(radon)/2))
	outcome_sample <- radon[sample_split,]
	treatment_sample <- radon[-sample_split,]

	# Fit the outcome model
	summary(lm(log_uranium ~ log_radon + floor + county, data = treatment_sample))
	outcome_model <- stan_lmer(log_uranium ~ log_radon + (1 \| county),
	data = treatment_sample,
	cores= 4, iter = 1000,
	prior = student_t(df = 5, location = 0, scale = 0.5),
	prior_intercept = student_t(df = 7, location = 0, scale = 1),
	prior_aux = student_t(df = 4, scale = 1))

	# Fit the treatment model
	treatment_model <- stan_glmer(floor ~ log_radon + (1 \| county),
	data = treatment_sample, cores = 4,
	iter = 1000,
	family = binomial(link = "logit"),
	prior = student_t(df = 5, location = 0, scale = 0.5),
	prior_intercept = student_t(df = 3, location = 0, scale = 2))

	# Get posterior mean predictions for treatments and outcomes
	treatment_predictions <- t(posterior_linpred(treatment_model, newdata = outcome_sample, transform = T))
	hist(treatment_predictions)

	outcome_predictions <- t(posterior_linpred(outcome_model,newdata = outcome_sample))
	hist(outcome_predictions)

	# This contains the residuals from the treatment model applied to the outcome dataset.
	# Note there are 2000 columns---a separate vector of residuals for every draw from the posterior.
	treatment_errors <- matrix(outcome_sample$floor, nrow(outcome_sample), ncol(treatment_predictions)) - treatment_predictions
	outcome_errors <- matrix(outcome_sample$log_uranium, nrow(outcome_sample), ncol(treatment_predictions)) - outcome_predictions

	# Get the theta estimates for each set of residuals
	thetahat <- sapply(1:2000, function(i) coef(lm(outcome_errors[,i] ~ -1 + treatment_errors[,i])))

	# And get our posterior interval
	ATE_bayes <- mean(thetahat)
	confint_bayes <- quantile(thetahat, c(0.025, 0.975))




	# Let’s now do double-ML with Random Forest -------------------------------
	library(randomForest)

	# One-hot encode the categorical features
	radon_onehot <- model.matrix(~ ., data = radon)[,-1]

	# Perform a split
	onehot_outcome <- radon_onehot[sample_split,]
	onehot_treatment <- radon_onehot[-sample_split,]

	# Fit models and construct residuals
	treatment_model_rf <- randomForest(factor(floor) ~ . - log_uranium, data = onehot_treatment)
	outcome_model_rf <- randomForest(log_uranium ~ . - floor, data = onehot_treatment, mtry = 15)
	resid_outcome_rf <- onehot_outcome[,"log_uranium"] - predict(outcome_model_rf, newdata = onehot_outcome)
	resid_treatment_rf <- onehot_outcome[,"floor"] - predict(treatment_model_rf, newdata = onehot_outcome, type ="prob")[,2]

	# Get our treatment effect estimate
	fit_rf <- lm(resid_outcome_rf ~ -1 + resid_treatment_rf)
	se <- (1/mean(resid_treatment_rf^2))^2 * mean(resid_treatment_rf^2 * resid(fit_rf)^2)
	rf_confint <- c(coef(fit_rf) - 1.96se, coef(fit_rf) + 1.96se)


	# Now a Naive BART implementation -----------------------------------------
	library(BART)

	y_train <- radon_onehot[,"log_uranium"]
	x_train <- radon_onehot[,-ncol(radon_onehot)]
	bart_est <- wbart(x.train = x_train, y.train = y_train, sparse = T)


	x_test_1 <- x_train
	x_test_1[,"floor"] <- 1
	x_test_0 <- x_train
	x_test_0[,"floor"] <- 0

	y_pred_1 <- predict(bart_est, newdata = x_test_1)
	y_pred_0 <- predict(bart_est, newdata = x_test_0)

	# Difference = posterior of prediction
	Ind_TE <- y_pred_1 - y_pred_0
	ATE <- rowMeans(Ind_TE)
	hist(ATE)

	ATE_mean <- mean(ATE)
	ATE_confint <- quantile(ATE, c(0.025, 0.975))



	# Visualize comparative effects -------------------------------------------

	data_frame(model = c("Bayesian hierarchical",
	"RF Double ML",
	"BART"),
	mean = c(ATE_bayes, coef(fit_rf), ATE_mean),
	lower = c(confint_bayes[1], confint(fit_rf)[1], ATE_confint[1]),
	upper = c(confint_bayes[2], confint(fit_rf)[2], ATE_confint[2])) %>%
	ggplot(aes(x = model)) +
	geom_linerange(aes(ymin = lower, ymax = upper), colour = "green") +
	geom_point(aes(y = mean), size = 2) +
	theme_bw()

	# So it seems the Bayesian Hierarchical estimate is way more precise. Why might this be the case?

	error_scales <- data_frame(model = c("treatment model", "outcome model"),
	`error scale bayes` = c(sd(treatment_errors), sd(outcome_errors)),
	`error scale RF` = c(sd(resid_treatment_rf), sd(resid_outcome_rf)))

	error_scales %>%
	gather(measure, value, -model) %>%
	ggplot(aes(x = model)) +
	geom_point(aes(y = value, colour = measure), size = 2)

	# There we go: because the error for the hierarchical bayes model is so much smaller,
	# we get a way narrower confidence band. This is a feature of the well-estimated random effects
	# doing such a good job for prediction