# Regression where predictors are correlated with past values of y
library('dplyr')
library('tidyr')
library('pbapply')
#' simulate()
#'
#' @param num_groups Number of groups
#' @param num_per_group Number of observations per group
#' @param true_coef The true `beta_1` in `log(Y_t) ~ beta_0 + beta_1*log(X_t)`
#' @param phi A function that takes a group's past values `Y_1, ..., Y_{t-1}` and returns `X_t`
#'
#' @return A dataframe with group_id, time, x, y
simulate <- function(num_groups, num_per_group, true_coef, phi) {
  # generate groups, with varying baselines:
  df_groups <- tibble(group_id = 1:num_groups) %>%
    mutate(.baseline = rnorm(n = num_groups, mean = 3))
  out <- vector(mode = 'list', length = num_per_group)
  for (t in 1:num_per_group) {
    dft <- df_groups %>%
      mutate(.white_noise = rnorm(n = n()), time = t)
    if (t > 1) {
      # x_t is a function (phi) of this group's past y values:
      dft <- dft %>%
        left_join(
          bind_rows(out) %>%
            group_by(group_id) %>%
            summarize(x = phi(y), .groups = 'drop'),
          by = 'group_id'
        )
    } else {
      # no past y values yet, so phi receives an empty vector:
      dft <- dft %>%
        group_by(group_id) %>%
        mutate(x = phi(numeric(0))) %>%
        ungroup()
    }
    out[[t]] <- dft %>%
      mutate(y = exp(.baseline + .white_noise + true_coef * coalesce(log(x), 0)))
  }
  return(bind_rows(out) %>% select(-starts_with(".")) %>% arrange(group_id, time))
}
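# A minimal smoke test of simulate() (a sketch, not part of the original analysis): with a constant
# phi and true_coef = 0, x should be 1 everywhere and the output should have one row per group/time.
df_smoke <- simulate(num_groups = 2, num_per_group = 3, true_coef = 0, phi = function(y) 1)
stopifnot(nrow(df_smoke) == 2 * 3, all(df_smoke$x == 1))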
# Simulation ----------------------------------------------------------------------------------------------------
UNBIASED_PHI <- function(y) exp(rnorm(1)) # ignores past y values entirely
BIASED_PHI <- function(y) {
  if (length(y) == 0) return(exp(2)) # note: this ends up being critical for HLM, see below
  return(mean(y) * runif(n = 1, min = .10, max = .20))
}
set.seed(2020-9-21) # date-style seed; the arithmetic evaluates to 1990
TRUE_COEF <- 0.0
df_sim <- simulate(num_groups = 5000, num_per_group = 10, true_coef = TRUE_COEF, phi = BIASED_PHI)
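# For contrast (a sketch; this unbiased run is not in the original analysis): with UNBIASED_PHI,
# x carries no information about past y values, so even a pooled regression recovers TRUE_COEF (~0):
df_sim_unbiased <- simulate(num_groups = 500, num_per_group = 10, true_coef = TRUE_COEF, phi = UNBIASED_PHI)
coef(lm(log(y) ~ log(x), data = df_sim_unbiased))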
# Individual Models -----------------------------------------------------------------------------------------------
library('ggplot2')
# running a bunch of individual linear models, we see that with BIASED_PHI the models under-estimate the true value
# (this is because x is proportional to y_lag_avg, and, within each group's sample (which is all that each
#  per-group lm sees), `cor(y, y_lag_avg) < 0`; see the empirical check after the plot below)
df_sim %>%
  group_by(group_id) %>%
  nest() %>%
  ungroup() %>%
  mutate(model = pblapply(data, function(.x) broom::tidy(lm(log(y) ~ log(x), data = .x)))) %>%
  select(-data) %>%
  unnest(cols = c(model)) %>%
  ggplot(aes(x = term, y = estimate)) +
  stat_summary(fun.data = mean_cl_boot) + # bootstrapped CIs; requires the Hmisc package
  geom_hline(yintercept = 0)
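# A quick empirical check of the `cor(y, y_lag_avg) < 0` claim above (a sketch; `y_lag_avg`, the
# running mean of each group's past y values, is a helper column not in the original analysis):
df_sim %>%
  group_by(group_id) %>%
  arrange(time, .by_group = TRUE) %>%
  mutate(y_lag_avg = lag(cummean(y))) %>%
  mutate(across(c(y, y_lag_avg), ~ .x - mean(.x, na.rm = TRUE))) %>% # de-mean within group
  ungroup() %>%
  summarize(cor = cor(y, y_lag_avg, use = 'complete.obs'))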
# Mixed-Effects Model -----------------------------------------------------------------------------------------------
library('lme4')
# a mixed-effects model can get this right!
# (important note: I had previously, mistakenly, concluded that the model would be biased, because I had implemented
# BIASED_PHI to return NA on the first timestep (i.e., before we have any `y` values). This caused lmer to drop the
# t=1 rows, which meant that `x` contained information about each group's average that the lmer model didn't have
# access to when calculating random-intercepts; this led to an upwardly biased coefficient on `x`.)
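# To reproduce that earlier mistake, one could use a variant like the following (a sketch;
# `BIASED_PHI_NA` is hypothetical and is not used in the simulations below):
BIASED_PHI_NA <- function(y) {
  if (length(y) == 0) return(NA_real_) # lmer will drop these t=1 rows
  mean(y) * runif(n = 1, min = .10, max = .20)
}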
df_lmer_sims <- bind_rows(pblapply(X = 1:250, FUN = function(i) {
  df_sim <- simulate(num_groups = 5000, num_per_group = 10, true_coef = TRUE_COEF, phi = BIASED_PHI)
  as.data.frame(summary(lmer(
    log(y) ~ log(x) + (1 | group_id),
    data = df_sim
  ))$coefficients) %>%
    tibble::rownames_to_column('term')
}))
df_lmer_sims %>%
  filter(term == 'log(x)') %>%
  ggplot(aes(x = Estimate, group = abs(`t value`) > 1.96, fill = abs(`t value`) > 1.96)) +
  geom_histogram()
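# A numeric summary to complement the histogram (a sketch, using the `df_lmer_sims` saved above):
# the mean estimate should be near TRUE_COEF, and the false-positive rate near the nominal 5%.
df_lmer_sims %>%
  filter(term == 'log(x)') %>%
  summarize(mean_estimate = mean(Estimate),
            prop_significant = mean(abs(`t value`) > 1.96))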