Created
January 1, 2023 20:23
-
-
Save nazlialagoz/a6fe4131b69edb9c8f0593984b88a17b to your computer and use it in GitHub Desktop.
Simulation study for the Medium article: Event Studies for Causal Inference: The Dos and Don'ts - A guide to avoiding the common pitfalls of event studies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simulation study for the Medium article: | |
# Event Studies for Causal Inference: The Dos and Don'ts - | |
# A guide to avoiding the common pitfalls of event studies | |
# This code simulates a panel dataset and then runs event studies | |
# Different scenarios are created to demonstrate pitfalls of event studies | |
# The simulaiton part of the code is adapted from Andrew Baker's awesome blog: | |
# https://andrewcbaker.netlify.app/2020/06/27/how-to-create-relative-time-indicators/ | |
# Also see a relevant package and blog by Sant'Anna & Callaway: | |
# https://bcallaway11.github.io/did/articles/pre-testing.html | |
rm(list = ls()) | |
library(data.table) | |
library(fastDummies) | |
library(tidyverse) | |
library(ggthemes) | |
library(fixest) | |
library(kableExtra) | |
theme_set(theme_clean() + theme(plot.background = element_blank())) | |
select <- dplyr::select | |
set.seed(20221228) | |
# Simulate data ----------------------------------------------------------- | |
# Create a function that takes in the number of units, the number of periods, | |
# a treatment effect per period (tau_cum), and | |
# cohort periods (in which periods a cohort is treated). | |
# If a cohort period is greater than the end observation period this means that | |
# this cohort is untreated during the observation period. | |
# Create a function to calculate the dependent variable throughout | |
calc_dep_var <- function(constant, unit_fe, period_fe, tau_cum, error){ | |
dep_var = constant + unit_fe + period_fe + tau_cum + error | |
} | |
constant = 80 # set the constant so that the outcome variable makes sense for the example | |
make_data <- function(num_unit, num_period,tau, cohort_periods){ | |
end_period = num_period-1 # periods start from 0 | |
# Fixed Effects ------------------------------------------------ | |
# unit fixed effects | |
unit <- tibble( | |
unit = 1:num_unit, | |
unit_fe = rnorm(num_unit, 0, .1), | |
# Assign units to different treatment cohorts | |
cohort_period = sample(cohort_periods, num_unit, replace = TRUE), | |
# generate treatment effect | |
mu = rnorm(num_unit, tau, 0.2)) | |
# period fixed effects | |
period <- tibble( | |
period = 0:end_period, | |
period_fe = rnorm(num_period, 0, .1)) | |
# Trend Break ------------------------------------------------------------- | |
# make main dataset | |
# full interaction of unit X period | |
tot_num_obs = num_unit*num_period | |
expand_grid(unit = 1:num_unit, period = 0:end_period) %>% | |
left_join(., unit) %>% | |
left_join(., period) %>% | |
# make error term and get treatment indicators and treatment effects | |
mutate(error = rnorm(tot_num_obs, 0, 0.5), | |
treat = ifelse(period >= cohort_period, 1, 0), | |
tau = ifelse(treat == 1, mu, 0)) %>% | |
# calculate cumulative treatment effects | |
group_by(unit) %>% | |
mutate(tau_cum = cumsum(tau)) %>% | |
ungroup() %>% | |
# calculate the dep variable | |
mutate(dep_var = calc_dep_var(constant, unit_fe, period_fe, tau_cum,error)) | |
} | |
# Make data | |
num_unit <- 10000 | |
num_period <- 5 | |
cohort_periods <- c(2,3,99) # Cohort period 99 is the untreated cohort | |
tau <- 1.00 # For each treated period the treatment effect accumulates by this number | |
data <- make_data(num_unit, num_period, tau, cohort_periods) | |
data <- as.data.table(data) | |
setnames(data, 'dep_var', 'hrs_listened') | |
setkeyv(data, c('cohort_period', 'unit', 'period')) | |
select_cols <- c('unit', 'period', 'cohort_period','treat','hrs_listened') | |
kable(head(data[, ..select_cols]), 'simple') | |
kable(summary(data[, ..select_cols]), 'simple') | |
nrow(data) | |
# We have 10000 units and 5 periods (incl. 0). This makes a total of 50000 observations. | |
avg_dv_period <- data[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')] | |
# The treatment effect changes overtime | |
avg_treat_period <- data[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')] | |
# Final plot showing the outcome over time | |
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") | |
ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) + | |
geom_bar(position="dodge", stat="identity") + coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Period", y = "Hours", title = 'Average music listening (hours)', | |
caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.') + | |
theme(legend.position = 'bottom', | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) + | |
geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+ | |
geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) + | |
geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+ | |
geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) + | |
guides(fill=guide_legend(title="Treatment cohort period")) | |
# Treatment effect | |
ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) + | |
geom_bar(position="dodge", stat="identity") + #coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)', | |
caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.') + | |
theme(legend.position = 'bottom', | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) + | |
guides(fill=guide_legend(title="Treatment cohort period")) | |
# Create relative time dummies to use in the regression | |
data <- data %>% | |
# make relative year indicator | |
mutate(rel_period = ifelse(cohort_period == 99,99,period - cohort_period)) | |
summary(data$rel_period) | |
data <- data %>% | |
dummy_cols(select_columns = "rel_period") | |
rel_per_dummies <- colnames(data)[grepl('rel_period_', colnames(data))] | |
# Change name w/ minuses to handle them more easily | |
rel_per_dummies_new<-gsub('-','min', rel_per_dummies) | |
setnames(data, rel_per_dummies, rel_per_dummies_new) | |
# Event studies in the simple situation ----------------------------------- | |
# Event study | |
covs <- setdiff(rel_per_dummies_new, c('rel_period_99','rel_period_min1')) | |
covs_collapse <- paste0(covs, collapse='+') | |
formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse)) | |
model <- feols(formula, | |
data = data, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) | |
# Create function to report simple event study results | |
tbl_res_rel_period <- function(model){ | |
res <- as.data.table(model$coeftable, keep.rownames = T) | |
res[, rn := gsub('rel_period_','', rn)] | |
res[, rn := as.numeric(gsub('min','-', rn))] | |
res <- res %>% | |
mutate_at(vars(-rn), funs(round(., 2))) | |
setnames(res, 'rn','Relative period') | |
setorder(res, by = 'Relative period') | |
return(res) | |
} | |
kable(tbl_res_rel_period(model), 'simple') | |
# Event study with cohort specific treatment effects | |
# Create dummies for the cohort-period | |
data <- data %>% | |
dummy_cols(select_columns = "cohort_period") | |
cohort_dummies <- c('cohort_period_2','cohort_period_3') | |
# Create interactions between relative period and cohort dummies | |
interact <- as.data.table(expand_grid(cohort_dummies, covs)) | |
# Ps. one can drop rel_period_min3 for cohort 2 and | |
# rel_period_2 for cohort 3 as these observations don't exist | |
# but I'll let the regression drop these | |
interact[, interaction := paste0(cohort_dummies,':',covs)] | |
interact_covs <- interact$interaction | |
interact_covs_collapse <- paste0(interact_covs,collapse = '+') | |
formula <- as.formula(paste0('hrs_listened ~ ',interact_covs_collapse)) | |
model <- feols(formula, | |
data = data, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) | |
# Two variables are naturally dropped because the most negative relative time period | |
# for cohort 2 is -2 and for the cohort period 3, | |
# we only have relative period up to 1. | |
# The results are consistent with the true effects. There are no statistically | |
# significant effects pre-treatment and there are positive treatment effects | |
# after the treatment | |
# Report period-cohort event study result | |
tbl_res_period_cohort <- function(model){ | |
res <- as.data.table(model$coeftable, keep.rownames = T) | |
res[, 'Cohort period' := as.numeric(str_extract(rn, "(?i)(?<=cohort_period_)\\d+"))] | |
res[, rn := gsub('cohort_period_3','', rn)] | |
res[, rn := gsub('cohort_period_2','', rn)] | |
res[, rn := gsub(':','', rn)] | |
res[, rn := gsub('rel_period_','', rn)] | |
res[, rn := as.numeric(gsub('min','-', rn))] | |
res <- res %>% | |
mutate_at(vars(-rn), funs(round(., 2))) | |
setnames(res, 'rn','Relative period') | |
setorderv(res, c('Cohort period','Relative period')) | |
setcolorder(res, 'Cohort period') | |
return(res) | |
} | |
kable(tbl_res_period_cohort(model), format = 'simple') | |
# 1. Anticipation --------------------------------------------------------- | |
tau_cum_anticipation = .5 # set how much anticipation is going to be there | |
data_anticip <- copy(data) # make a copy | |
summary(data_anticip[rel_period ==-1]$tau_cum) | |
# Include some anticipation: there is already a positive effect before the treatment implementation | |
data_anticip[rel_period ==-1, hrs_listened := calc_dep_var(constant,unit_fe,period_fe,tau_cum_anticipation,error)] | |
# Summarize the dependent variable over relative period | |
avg_dep_anticip <- data_anticip[rel_period != 99, .(mean_hrs_listened = mean(hrs_listened)), (rel_period)] | |
setorder(avg_dep_anticip, 'rel_period') | |
rel_periods <- sort(unique(avg_dep_anticip$rel_period)) | |
ggplot(avg_dep_anticip, aes(y=mean_hrs_listened, x=rel_period)) + | |
geom_bar(position="dodge", stat="identity", fill = 'deepskyblue') + coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Relative period", y = "Hours", title = 'Average music listening over relative time period', | |
caption = 'Only for the treated units') + | |
theme(legend.position = 'bottom', | |
legend.title = element_blank(), | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_x_continuous(breaks = min(rel_periods):max(rel_periods)) | |
# The dependent variable is already starting to pick up at rel.period = -1 | |
# Event study | |
formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse)) | |
model <- feols(formula, | |
data = data_anticip, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) | |
kable(tbl_res_rel_period(model) , 'simple') | |
# Now prev times are minus and the treatment effect in the post are not true even though I changed nothing | |
# The issue is that I still use the rel.period. -1 as the reference period | |
# Use release period -2 as the reference period instead | |
covs_anticip <- setdiff(c(covs,'rel_period_min1'),'rel_period_min2') | |
covs_anticip_collapse <- paste0(covs_anticip,collapse = '+') | |
formula <- as.formula(paste0('hrs_listened ~ ',covs_anticip_collapse)) | |
model <- feols(formula, | |
data = data_anticip, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) | |
kable(tbl_res_rel_period(model) , 'simple') | |
# Now, we can see that the treatment effects (TEs) are accurately estimated | |
# Plus, you can also see that there is no effect in rel. period -3 but there | |
# is already a positive treatment effect in the rel. period -1 | |
# 2. Heterogeneity ------------------------------------------------------- | |
# Until now, the dynamics of the TE between the treatment groups are the same | |
# They are affected by the treatment in the same fashion overtime | |
# However, what if the first treated group is affected much more positively? | |
data_hetero <- copy(data) | |
# calculate tau | |
data_hetero[cohort_period==2, tau := 1.5*tau] | |
# calculate tau_cum | |
setkeyv(data_hetero, c('unit', 'period')) # order | |
data_hetero[cohort_period==2, tau_cum := cumsum(tau), by = unit] | |
# calculate the dependent variable | |
data_hetero[cohort_period==2, hrs_listened := calc_dep_var(constant,unit_fe,period_fe,tau_cum,error)] | |
# Visualize | |
avg_dv_period <- data_hetero[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')] # Graph this | |
ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) + | |
geom_bar(position="dodge", stat="identity") + coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Period", y = "Hours", title = 'Average music listening (hours)', | |
caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.', | |
subtitle = 'Treatment effect heterogeneity across cohorts') + | |
theme(legend.position = 'bottom', | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) + | |
geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+ | |
geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) + | |
geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+ | |
geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) + | |
guides(fill=guide_legend(title="Treatment cohort period")) | |
# Treatment effect | |
avg_treat_period <- data_hetero[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')] | |
ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) + | |
geom_bar(position="dodge", stat="identity") + #coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)', | |
caption = 'Cohort 2 is the early treated and cohort 3 is the late treated group.', | |
subtitle = 'Treatment effect heterogeneity across cohorts') + | |
theme(legend.position = 'bottom', | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) + | |
guides(fill=guide_legend(title="Treatment cohort period")) | |
# Event study | |
formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse)) | |
model <- feols(formula, | |
data = data_hetero, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) | |
kable(tbl_res_rel_period(model), 'simple') | |
# Event study with cohort specific treatment effects | |
formula <- as.formula(paste0('hrs_listened ~ ',interact_covs_collapse)) | |
model <- feols(formula, | |
data = data_hetero, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) | |
kable(tbl_res_period_cohort(model), 'simple') | |
# 3. Under-identification ------------------------------------------------- | |
# Just drop the non treated group | |
# see what happens | |
data_under_id <- data[cohort_period != 99] | |
avg_dv_period <- data_under_id[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')] | |
ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) + | |
geom_bar(position="dodge", stat="identity") + coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Period", y = "Hours", title = 'Average music listening (hours)', | |
caption = 'Cohort 2 is the early treated and cohort 3 is the late treated group (cohort 99 is excluded).') + | |
theme(legend.position = 'bottom', | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) + | |
geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+ | |
geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) + | |
geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+ | |
geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) + | |
guides(fill=guide_legend(title="Treatment cohort period")) | |
# Graph true treat effect (it is the same as the original) | |
avg_treat_period <- data_under_id[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')] | |
ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) + | |
geom_bar(position="dodge", stat="identity") + #coord_cartesian(ylim=c(79,85))+ | |
labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)', | |
caption = 'Cohort 2 is the early treated, cohort 3 is the late treated group.', | |
subtitle = 'Treatment effect heterogeneity across cohorts') + | |
theme(legend.position = 'bottom', | |
axis.title = element_text(size = 14), | |
axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) + | |
guides(fill=guide_legend(title="Treatment cohort period")) | |
# Event study | |
# Just running a regular event study w/o changing anything | |
formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse)) | |
model <- feols(formula, | |
data = data_under_id, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) # --> crazy results. let's exclude the most negative relative time period dummy. | |
kable(tbl_res_rel_period(model), 'simple') | |
# Use the most negative and also 1 period before as the reference period | |
# The most negative relative time period is -3, exclude that in addition to the regular -1 | |
covs <- setdiff(rel_per_dummies_new, c('rel_period_99','rel_period_min1', 'rel_period_min3')) | |
covs_collapse <- paste0(covs, collapse='+') | |
formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse)) | |
model <- feols(formula, | |
data = data_under_id, panel.id = "unit", | |
fixef = c("unit", "period")) | |
summary(model) # --> no longer crazy results! Accurately estimating the TEs. | |
kable(tbl_res_rel_period(model), 'simple') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment