bgall/covariate_adjustment_bias.R

## covariate_adjustment_bias.R
library(dplyr)


# Wrap data simulation in a function
sim_data <- function(N) {
  ###########################################################################
  # parameters
  ###########################################################################

  set.seed(123)
  ages <- 16:30
  control_mean <- 0
  control_sd <- 1
  teffect_mean_mean <- 1
  teffect_mean_sd <- 1
  teffect_sd_rate <- 1

  ###########################################################################
  # Simulate data
  ###########################################################################

  # Age, individual treatment effects, and row identifiers
  dat <- tibble(
    age = sample(ages, size = N, replace = T),
    age_cat = cut(
      age,
      breaks = 3,
      labels = c("low", "med", "high")
    ),
    het_teffect_mean = rnorm(n = N, mean = teffect_mean_mean, sd = teffect_mean_sd),
    het_teffect_sd = rexp(n = N, rate = teffect_sd_rate),
    ids = 1:N
  )

  # Ensure we have obs in every age value (better example)
  stopifnot(dat$age %>% unique() %>% length() == length(ages))

  # Block randomized treatment and outcome draws
  treated <- dat %>%
    group_by(age_cat) %>%
    slice_sample(prop = 0.5) %>%
    ungroup()

  dat <- dat %>%
    mutate(
      treat = if_else(ids %in% treated$ids, 1, 0),
      y = rnorm(
        n = N,
        control_mean + treat * het_teffect_mean,
        sd = control_sd + treat * het_teffect_sd
      )
    )

  return(dat)
}

###########################################################################
# Example simulation
###########################################################################

# Simulate N = 100
dat <- sim_data(100)
table(dat$treat, dat$age_cat) # check blocking

###########################################################################
# Linear combination?
###########################################################################

# Regression: w/ constant
lm(y ~ treat + age + age_cat, data = dat) # factor level term drops

# Regression: no constant
lm(y ~ -1 + treat + age + age_cat, data = dat) # no dropped term

###########################################################################
# Show bias in adjusted ATE
###########################################################################

# Throw some of above code into function
get_bias <- function(data){

  # unadjusted estimate
  unadj <- lm(y ~ treat, data = data)

  # adjusted estimate
  adj <- lm(y ~ treat + age + age_cat, data = data)

  # bias
  unadj$coefficients["treat"] - adj$coefficients["treat"]
}

###########################################################################
# Show bias for varying N
###########################################################################

# Expected bias magnitude declines with N (better balance)
get_bias(sim_data(50))
get_bias(sim_data(100))
get_bias(sim_data(500))
get_bias(sim_data(1000))
get_bias(sim_data(5000))
get_bias(sim_data(500000))
	library(dplyr)


	# Wrap data simulation in a function
	sim_data <- function(N) {
	###########################################################################
	# parameters
	###########################################################################

	set.seed(123)
	ages <- 16:30
	control_mean <- 0
	control_sd <- 1
	teffect_mean_mean <- 1
	teffect_mean_sd <- 1
	teffect_sd_rate <- 1

	###########################################################################
	# Simulate data
	###########################################################################

	# Age, individual treatment effects, and row identifiers
	dat <- tibble(
	age = sample(ages, size = N, replace = T),
	age_cat = cut(
	age,
	breaks = 3,
	labels = c("low", "med", "high")
	),
	het_teffect_mean = rnorm(n = N, mean = teffect_mean_mean, sd = teffect_mean_sd),
	het_teffect_sd = rexp(n = N, rate = teffect_sd_rate),
	ids = 1:N
	)

	# Ensure we have obs in every age value (better example)
	stopifnot(dat$age %>% unique() %>% length() == length(ages))

	# Block randomized treatment and outcome draws
	treated <- dat %>%
	group_by(age_cat) %>%
	slice_sample(prop = 0.5) %>%
	ungroup()

	dat <- dat %>%
	mutate(
	treat = if_else(ids %in% treated$ids, 1, 0),
	y = rnorm(
	n = N,
	control_mean + treat * het_teffect_mean,
	sd = control_sd + treat * het_teffect_sd
	)
	)

	return(dat)
	}

	###########################################################################
	# Example simulation
	###########################################################################

	# Simulate N = 100
	dat <- sim_data(100)
	table(dat$treat, dat$age_cat) # check blocking

	###########################################################################
	# Linear combination?
	###########################################################################

	# Regression: w/ constant
	lm(y ~ treat + age + age_cat, data = dat) # factor level term drops

	# Regression: no constant
	lm(y ~ -1 + treat + age + age_cat, data = dat) # no dropped term

	###########################################################################
	# Show bias in adjusted ATE
	###########################################################################

	# Throw some of above code into function
	get_bias <- function(data){

	# unadjusted estimate
	unadj <- lm(y ~ treat, data = data)

	# adjusted estimate
	adj <- lm(y ~ treat + age + age_cat, data = data)

	# bias
	unadj$coefficients["treat"] - adj$coefficients["treat"]
	}

	###########################################################################
	# Show bias for varying N
	###########################################################################

	# Expected bias magnitude declines with N (better balance)
	get_bias(sim_data(50))
	get_bias(sim_data(100))
	get_bias(sim_data(500))
	get_bias(sim_data(1000))
	get_bias(sim_data(5000))
	get_bias(sim_data(500000))