Last active
February 9, 2023 22:00
-
-
Save bgall/f3eb58339909dba2e46f99e3aba21ebe to your computer and use it in GitHub Desktop.
Show bias in covariate adjustment and linear combination "issue"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
# Wrap data simulation in a function | |
sim_data <- function(N) { | |
########################################################################### | |
# parameters | |
########################################################################### | |
set.seed(123) | |
ages <- 16:30 | |
control_mean <- 0 | |
control_sd <- 1 | |
teffect_mean_mean <- 1 | |
teffect_mean_sd <- 1 | |
teffect_sd_rate <- 1 | |
########################################################################### | |
# Simulate data | |
########################################################################### | |
# Age, individual treatment effects, and row identifiers | |
dat <- tibble( | |
age = sample(ages, size = N, replace = T), | |
age_cat = cut( | |
age, | |
breaks = 3, | |
labels = c("low", "med", "high") | |
), | |
het_teffect_mean = rnorm(n = N, mean = teffect_mean_mean, sd = teffect_mean_sd), | |
het_teffect_sd = rexp(n = N, rate = teffect_sd_rate), | |
ids = 1:N | |
) | |
# Ensure we have obs in every age value (better example) | |
stopifnot(dat$age %>% unique() %>% length() == length(ages)) | |
# Block randomized treatment and outcome draws | |
treated <- dat %>% | |
group_by(age_cat) %>% | |
slice_sample(prop = 0.5) %>% | |
ungroup() | |
dat <- dat %>% | |
mutate( | |
treat = if_else(ids %in% treated$ids, 1, 0), | |
y = rnorm( | |
n = N, | |
control_mean + treat * het_teffect_mean, | |
sd = control_sd + treat * het_teffect_sd | |
) | |
) | |
return(dat) | |
} | |
########################################################################### | |
# Example simulation | |
########################################################################### | |
# Simulate N = 100 | |
dat <- sim_data(100) | |
table(dat$treat, dat$age_cat) # check blocking | |
########################################################################### | |
# Linear combination? | |
########################################################################### | |
# Regression: w/ constant | |
lm(y ~ treat + age + age_cat, data = dat) # factor level term drops | |
# Regression: no constant | |
lm(y ~ -1 + treat + age + age_cat, data = dat) # no dropped term | |
########################################################################### | |
# Show bias in adjusted ATE | |
########################################################################### | |
# Throw some of above code into function | |
get_bias <- function(data){ | |
# unadjusted estimate | |
unadj <- lm(y ~ treat, data = data) | |
# adjusted estimate | |
adj <- lm(y ~ treat + age + age_cat, data = data) | |
# bias | |
unadj$coefficients["treat"] - adj$coefficients["treat"] | |
} | |
########################################################################### | |
# Show bias for varying N | |
########################################################################### | |
# Expected bias magnitude declines with N (better balance) | |
get_bias(sim_data(50)) | |
get_bias(sim_data(100)) | |
get_bias(sim_data(500)) | |
get_bias(sim_data(1000)) | |
get_bias(sim_data(5000)) | |
get_bias(sim_data(500000)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment