Last active
February 17, 2019 15:35
-
-
Save khakieconomics/1faab8f2560b1726c74d082a79def29f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rstanarm); library(tidyverse) | |
options(mc.cores = parallel::detectCores()) | |
set.seed(42) | |
data("radon") | |
head(treatment_sample) | |
# Some levels have no variance in the outcomes, making likelihood estimates impossible | |
# Adding a tiny bit of noise fixes the problem | |
radon$log_uranium <- rnorm(nrow(radon), radon$log_uranium, 0.05) | |
# Split the data | |
sample_split <- sample(1:nrow(radon), round(nrow(radon)/2)) | |
outcome_sample <- radon[sample_split,] | |
treatment_sample <- radon[-sample_split,] | |
# Fit the outcome model | |
summary(lm(log_uranium ~ log_radon + floor + county, data = treatment_sample)) | |
outcome_model <- stan_lmer(log_uranium ~ log_radon + (1 | county), | |
data = treatment_sample, | |
cores= 4, iter = 1000, | |
prior = student_t(df = 5, location = 0, scale = 0.5), | |
prior_intercept = student_t(df = 7, location = 0, scale = 1), | |
prior_aux = student_t(df = 4, scale = 1)) | |
# Fit the treatment model | |
treatment_model <- stan_glmer(floor ~ log_radon + (1 | county), | |
data = treatment_sample, cores = 4, | |
iter = 1000, | |
family = binomial(link = "logit"), | |
prior = student_t(df = 5, location = 0, scale = 0.5), | |
prior_intercept = student_t(df = 3, location = 0, scale = 2)) | |
# Get posterior mean predictions for treatments and outcomes | |
treatment_predictions <- t(posterior_linpred(treatment_model, newdata = outcome_sample, transform = T)) | |
hist(treatment_predictions) | |
outcome_predictions <- t(posterior_linpred(outcome_model,newdata = outcome_sample)) | |
hist(outcome_predictions) | |
# This contains the residuals from the treatment model applied to the outcome dataset. | |
# Note there are 2000 columns---a separate vector of residuals for every draw from the posterior. | |
treatment_errors <- matrix(outcome_sample$floor, nrow(outcome_sample), ncol(treatment_predictions)) - treatment_predictions | |
outcome_errors <- matrix(outcome_sample$log_uranium, nrow(outcome_sample), ncol(treatment_predictions)) - outcome_predictions | |
# Get the theta estimates for each set of residuals | |
thetahat <- sapply(1:2000, function(i) coef(lm(outcome_errors[,i] ~ -1 + treatment_errors[,i]))) | |
# And get our posterior interval | |
ATE_bayes <- mean(thetahat) | |
confint_bayes <- quantile(thetahat, c(0.025, 0.975)) | |
# Let’s now do double-ML with Random Forest ------------------------------- | |
library(randomForest) | |
# One-hot encode the categorical features | |
radon_onehot <- model.matrix(~ ., data = radon)[,-1] | |
# Perform a split | |
onehot_outcome <- radon_onehot[sample_split,] | |
onehot_treatment <- radon_onehot[-sample_split,] | |
# Fit models and construct residuals | |
treatment_model_rf <- randomForest(factor(floor) ~ . - log_uranium, data = onehot_treatment) | |
outcome_model_rf <- randomForest(log_uranium ~ . - floor, data = onehot_treatment, mtry = 15) | |
resid_outcome_rf <- onehot_outcome[,"log_uranium"] - predict(outcome_model_rf, newdata = onehot_outcome) | |
resid_treatment_rf <- onehot_outcome[,"floor"] - predict(treatment_model_rf, newdata = onehot_outcome, type ="prob")[,2] | |
# Get our treatment effect estimate | |
fit_rf <- lm(resid_outcome_rf ~ -1 + resid_treatment_rf) | |
se <- (1/mean(resid_treatment_rf^2))^2 * mean(resid_treatment_rf^2 * resid(fit_rf)^2) | |
rf_confint <- c(coef(fit_rf) - 1.96*se, coef(fit_rf) + 1.96*se) | |
# Now a Naive BART implementation ----------------------------------------- | |
library(BART) | |
y_train <- radon_onehot[,"log_uranium"] | |
x_train <- radon_onehot[,-ncol(radon_onehot)] | |
bart_est <- wbart(x.train = x_train, y.train = y_train, sparse = T) | |
x_test_1 <- x_train | |
x_test_1[,"floor"] <- 1 | |
x_test_0 <- x_train | |
x_test_0[,"floor"] <- 0 | |
y_pred_1 <- predict(bart_est, newdata = x_test_1) | |
y_pred_0 <- predict(bart_est, newdata = x_test_0) | |
# Difference = posterior of prediction | |
Ind_TE <- y_pred_1 - y_pred_0 | |
ATE <- rowMeans(Ind_TE) | |
hist(ATE) | |
ATE_mean <- mean(ATE) | |
ATE_confint <- quantile(ATE, c(0.025, 0.975)) | |
# Visualize comparative effects ------------------------------------------- | |
data_frame(model = c("Bayesian hierarchical", | |
"RF Double ML", | |
"BART"), | |
mean = c(ATE_bayes, coef(fit_rf), ATE_mean), | |
lower = c(confint_bayes[1], confint(fit_rf)[1], ATE_confint[1]), | |
upper = c(confint_bayes[2], confint(fit_rf)[2], ATE_confint[2])) %>% | |
ggplot(aes(x = model)) + | |
geom_linerange(aes(ymin = lower, ymax = upper), colour = "green") + | |
geom_point(aes(y = mean), size = 2) + | |
theme_bw() | |
# So it seems the Bayesian Hierarchical estimate is way more precise. Why might this be the case? | |
error_scales <- data_frame(model = c("treatment model", "outcome model"), | |
`error scale bayes` = c(sd(treatment_errors), sd(outcome_errors)), | |
`error scale RF` = c(sd(resid_treatment_rf), sd(resid_outcome_rf))) | |
error_scales %>% | |
gather(measure, value, -model) %>% | |
ggplot(aes(x = model)) + | |
geom_point(aes(y = value, colour = measure), size = 2) | |
# There we go: because the error for the hierarchical bayes model is so much smaller, | |
# we get a way narrower confidence band. This is a feature of the well-estimated random effects | |
# doing such a good job for prediction |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment