# Regression where predictors are correlated with past values of y
library('dplyr')
library('tidyr')
library('pbapply')
#' simulate()
#'
#' @param num_groups Number of groups
#' @param num_per_group Number of observations per group
#' @param true_coef The true `beta_1` in `log(Y_t) ~ beta_0 + beta_1*log(X_t)`
#' @param phi A function that takes a group's past values `Y_1, ..., Y_{t-1}` and returns `X_t`
#'
#' @return A dataframe with group_id, time, x, y
simulate <- function(num_groups, num_per_group, true_coef, phi) {
  # generate groups, with varying baselines:
  df_groups <- tibble(group_id = 1:num_groups) %>%
    mutate(.baseline = rnorm(n = num_groups, mean = 3))
  out <- vector(mode = 'list', length = num_per_group)
  for (t in 1:num_per_group) {
    dft <- df_groups %>%
      mutate(.white_noise = rnorm(n = n()), time = t)
    if (t > 1) {
      # x_t is a function (phi) of this group's past y values:
      dft <- dft %>%
        left_join(
          bind_rows(out) %>%
            group_by(group_id) %>%
            summarize(x = phi(y), .groups = 'drop'),
          by = 'group_id'
        )
    } else {
      # no past y values yet, so phi receives an empty vector:
      dft <- dft %>%
        group_by(group_id) %>%
        mutate(x = phi(numeric(0))) %>%
        ungroup()
    }
    out[[t]] <- dft %>%
      mutate(y = exp(.baseline + .white_noise + true_coef * coalesce(log(x), 0)))
  }
  return(bind_rows(out) %>% select(-starts_with(".")) %>% arrange(group_id, time))
}
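# A minimal smoke test of simulate() (a sketch, not part of the original analysis): with a constant
# phi and true_coef = 0, x should be 1 everywhere and the output should have one row per group/time.
df_smoke <- simulate(num_groups = 2, num_per_group = 3, true_coef = 0, phi = function(y) 1)
stopifnot(nrow(df_smoke) == 2 * 3, all(df_smoke$x == 1))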
# Simulation ----------------------------------------------------------------------------------------------------
UNBIASED_PHI <- function(y) exp(rnorm(1)) # ignores past y values entirely
BIASED_PHI <- function(y) {
  if (length(y) == 0) return(exp(2)) # note: this ends up being critical for HLM, see below
  return(mean(y) * runif(n = 1, min = .10, max = .20))
}
set.seed(2020-9-21) # date-style seed; the arithmetic evaluates to 1990
TRUE_COEF <- 0.0
df_sim <- simulate(num_groups = 5000, num_per_group = 10, true_coef = TRUE_COEF, phi = BIASED_PHI)
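# For contrast (a sketch; this unbiased run is not in the original analysis): with UNBIASED_PHI,
# x carries no information about past y values, so even a pooled regression recovers TRUE_COEF (~0):
df_sim_unbiased <- simulate(num_groups = 500, num_per_group = 10, true_coef = TRUE_COEF, phi = UNBIASED_PHI)
coef(lm(log(y) ~ log(x), data = df_sim_unbiased))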
# Individual Models -----------------------------------------------------------------------------------------------
library('ggplot2')
# running a bunch of individual linear models, we see that with BIASED_PHI the models under-estimate the true value
# (this is because x is proportional to y_lag_avg, and, within each group's sample (which is all that each
#  per-group lm sees), `cor(y, y_lag_avg) < 0`; see the empirical check after the plot below)
df_sim %>%
  group_by(group_id) %>%
  nest() %>%
  ungroup() %>%
  mutate(model = pblapply(data, function(.x) broom::tidy(lm(log(y) ~ log(x), data = .x)))) %>%
  select(-data) %>%
  unnest(cols = c(model)) %>%
  ggplot(aes(x = term, y = estimate)) +
  stat_summary(fun.data = mean_cl_boot) + # bootstrapped CIs; requires the Hmisc package
  geom_hline(yintercept = 0)
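# A quick empirical check of the `cor(y, y_lag_avg) < 0` claim above (a sketch; `y_lag_avg`, the
# running mean of each group's past y values, is a helper column not in the original analysis):
df_sim %>%
  group_by(group_id) %>%
  arrange(time, .by_group = TRUE) %>%
  mutate(y_lag_avg = lag(cummean(y))) %>%
  mutate(across(c(y, y_lag_avg), ~ .x - mean(.x, na.rm = TRUE))) %>% # de-mean within group
  ungroup() %>%
  summarize(cor = cor(y, y_lag_avg, use = 'complete.obs'))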
# Mixed-Effects Model -----------------------------------------------------------------------------------------------
library('lme4')
# a mixed-effects model can get this right!
# (important note: I had previously, mistakenly, concluded that the model would be biased, because I had implemented
# BIASED_PHI to return NA on the first timestep (i.e., before we have any `y` values). This caused lmer to drop the
# t=1 rows, which meant that `x` contained information about each group's average that the lmer model didn't have
# access to when calculating random-intercepts; this led to an upwardly biased coefficient on `x`.)
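# To reproduce that earlier mistake, one could use a variant like the following (a sketch;
# `BIASED_PHI_NA` is hypothetical and is not used in the simulations below):
BIASED_PHI_NA <- function(y) {
  if (length(y) == 0) return(NA_real_) # lmer will drop these t=1 rows
  mean(y) * runif(n = 1, min = .10, max = .20)
}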
df_lmer_sims <- bind_rows(pblapply(X = 1:250, FUN = function(i) {
  df_sim <- simulate(num_groups = 5000, num_per_group = 10, true_coef = TRUE_COEF, phi = BIASED_PHI)
  as.data.frame(summary(lmer(
    log(y) ~ log(x) + (1 | group_id),
    data = df_sim
  ))$coefficients) %>%
    tibble::rownames_to_column('term')
}))
df_lmer_sims %>%
  filter(term == 'log(x)') %>%
  ggplot(aes(x = Estimate, group = abs(`t value`) > 1.96, fill = abs(`t value`) > 1.96)) +
  geom_histogram()
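# A numeric summary to complement the histogram (a sketch, using the `df_lmer_sims` saved above):
# the mean estimate should be near TRUE_COEF, and the false-positive rate near the nominal 5%.
df_lmer_sims %>%
  filter(term == 'log(x)') %>%
  summarize(mean_estimate = mean(Estimate),
            prop_significant = mean(abs(`t value`) > 1.96))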