nazlialagoz/Event_Study_Medium_Article.R

## Event_Study_Medium_Article.R

# Simulation study for the Medium article:
# Event Studies for Causal Inference: The Dos and Don'ts -
# A guide to avoiding the common pitfalls of event studies

# This code simulates a panel dataset and then runs event studies
# Different scenarios are created to demonstrate pitfalls of event studies
# The simulaiton part of the code is adapted from Andrew Baker's awesome blog:
# https://andrewcbaker.netlify.app/2020/06/27/how-to-create-relative-time-indicators/
# Also see a relevant package and blog by Sant'Anna & Callaway:
# https://bcallaway11.github.io/did/articles/pre-testing.html

rm(list = ls())
library(data.table)
library(fastDummies)
library(tidyverse)
library(ggthemes)
library(fixest)
library(kableExtra)


theme_set(theme_clean() + theme(plot.background = element_blank()))
select <- dplyr::select
set.seed(20221228)

# Simulate data -----------------------------------------------------------
  # Create a function that takes in the number of units, the number of periods,
  # a treatment effect per period (tau_cum), and
  # cohort periods (in which periods a cohort is treated).

  # If a cohort period is greater than the end observation period this means that
  # this cohort is untreated during the observation period.

  # Create a function to calculate the dependent variable throughout
  calc_dep_var <- function(constant, unit_fe, period_fe, tau_cum, error){
    dep_var = constant + unit_fe + period_fe + tau_cum + error
  }

  constant = 80 # set the constant so that the outcome variable makes sense for the example

  make_data <- function(num_unit, num_period,tau, cohort_periods){
    end_period = num_period-1 # periods start from 0
    # Fixed Effects ------------------------------------------------
    # unit fixed effects
    unit <- tibble(
      unit = 1:num_unit,
      unit_fe = rnorm(num_unit, 0, .1),
      # Assign units to different treatment cohorts
      cohort_period = sample(cohort_periods, num_unit, replace = TRUE),
      # generate treatment effect
      mu = rnorm(num_unit, tau, 0.2))

    # period fixed effects
    period <- tibble(
      period = 0:end_period,
      period_fe = rnorm(num_period, 0, .1))

    # Trend Break -------------------------------------------------------------

    # make main dataset
    # full interaction of unit X period
    tot_num_obs = num_unit*num_period

    expand_grid(unit = 1:num_unit, period = 0:end_period) %>%
      left_join(., unit) %>%
      left_join(., period) %>%
      # make error term and get treatment indicators and treatment effects
      mutate(error = rnorm(tot_num_obs, 0, 0.5),
             treat = ifelse(period >= cohort_period, 1, 0),
             tau = ifelse(treat == 1, mu, 0)) %>%
      # calculate cumulative treatment effects
      group_by(unit) %>%
      mutate(tau_cum = cumsum(tau)) %>%
      ungroup() %>%
      # calculate the dep variable
      mutate(dep_var = calc_dep_var(constant, unit_fe, period_fe, tau_cum,error))
  }

  # Make data
  num_unit <- 10000
  num_period <- 5
  cohort_periods <- c(2,3,99) # Cohort period 99 is the untreated cohort
  tau <- 1.00 # For each treated period the treatment effect accumulates by this number
  data <- make_data(num_unit, num_period, tau, cohort_periods)
  data <- as.data.table(data)

  setnames(data, 'dep_var', 'hrs_listened')

  setkeyv(data, c('cohort_period', 'unit', 'period'))
  select_cols <- c('unit', 'period', 'cohort_period','treat','hrs_listened')

  kable(head(data[, ..select_cols]), 'simple')

  kable(summary(data[, ..select_cols]), 'simple')
  nrow(data)
  # We have 10000 units and 5 periods (incl. 0). This makes a total of 50000 observations.

  avg_dv_period <- data[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')]

  # The treatment effect changes overtime
  avg_treat_period <- data[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')]


  # Final plot showing the outcome over time
  cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

  ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) +
    geom_bar(position="dodge", stat="identity") +  coord_cartesian(ylim=c(79,85))+
    labs(x = "Period", y = "Hours", title = 'Average music listening (hours)',
         caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.') +
    theme(legend.position = 'bottom',
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
    geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+
    geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) +
    geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+
    geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) +
    guides(fill=guide_legend(title="Treatment cohort period"))

  # Treatment effect
  ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) +
    geom_bar(position="dodge", stat="identity") +  #coord_cartesian(ylim=c(79,85))+
    labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)',
         caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.') +
    theme(legend.position = 'bottom',
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
    guides(fill=guide_legend(title="Treatment cohort period"))


  # Create relative time dummies to use in the regression
  data <- data %>%
    # make relative year indicator
    mutate(rel_period = ifelse(cohort_period == 99,99,period - cohort_period))
  summary(data$rel_period)

  data <- data %>%
    dummy_cols(select_columns = "rel_period")

  rel_per_dummies <- colnames(data)[grepl('rel_period_', colnames(data))]
  # Change name w/ minuses to handle them more easily
  rel_per_dummies_new<-gsub('-','min', rel_per_dummies)
  setnames(data, rel_per_dummies, rel_per_dummies_new)

# Event studies in the simple situation -----------------------------------
  # Event study
  covs <- setdiff(rel_per_dummies_new, c('rel_period_99','rel_period_min1'))
  covs_collapse <- paste0(covs, collapse='+')
  formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
  model <- feols(formula,
               data = data, panel.id = "unit",
               fixef = c("unit", "period"))
  summary(model)

  # Create function to report simple event study results
  tbl_res_rel_period <- function(model){
    res <- as.data.table(model$coeftable, keep.rownames = T)
    res[, rn := gsub('rel_period_','', rn)]
    res[, rn := as.numeric(gsub('min','-', rn))]
    res <- res %>%
      mutate_at(vars(-rn), funs(round(., 2)))
    setnames(res, 'rn','Relative period')
    setorder(res, by = 'Relative period')
    return(res)
  }

  kable(tbl_res_rel_period(model), 'simple')

  # Event study with cohort specific treatment effects
  # Create dummies for the cohort-period
  data <- data %>%
    dummy_cols(select_columns = "cohort_period")
  cohort_dummies <- c('cohort_period_2','cohort_period_3')
  # Create interactions between relative period and cohort dummies
  interact <- as.data.table(expand_grid(cohort_dummies, covs))
  # Ps. one can drop rel_period_min3 for cohort 2 and
  # rel_period_2 for cohort 3 as these observations don't exist
  # but I'll let the regression drop these
  interact[, interaction := paste0(cohort_dummies,':',covs)]
  interact_covs <- interact$interaction
  interact_covs_collapse <- paste0(interact_covs,collapse = '+')

  formula <- as.formula(paste0('hrs_listened ~ ',interact_covs_collapse))
  model <- feols(formula,
               data = data, panel.id = "unit",
               fixef = c("unit", "period"))
  summary(model)
  # Two variables are naturally dropped because the most negative relative time period
  # for cohort 2 is -2 and for the cohort period 3,
  # we only have relative period up to 1.

  # The results are consistent with the true effects. There are no statistically
  # significant effects pre-treatment and there are positive treatment effects
  # after the treatment


  # Report period-cohort event study result
  tbl_res_period_cohort <- function(model){
    res <- as.data.table(model$coeftable, keep.rownames = T)

    res[, 'Cohort period' := as.numeric(str_extract(rn, "(?i)(?<=cohort_period_)\\d+"))]
    res[, rn := gsub('cohort_period_3','', rn)]
    res[, rn := gsub('cohort_period_2','', rn)]
    res[, rn := gsub(':','', rn)]

    res[, rn := gsub('rel_period_','', rn)]
    res[, rn := as.numeric(gsub('min','-', rn))]
    res <- res %>%
      mutate_at(vars(-rn), funs(round(., 2)))
    setnames(res, 'rn','Relative period')
    setorderv(res, c('Cohort period','Relative period'))
    setcolorder(res, 'Cohort period')
    return(res)
  }

  kable(tbl_res_period_cohort(model), format = 'simple')

# 1. Anticipation ---------------------------------------------------------
  tau_cum_anticipation = .5 # set how much anticipation is going to be there
  data_anticip <- copy(data) # make a copy
  summary(data_anticip[rel_period ==-1]$tau_cum)
  # Include some anticipation: there is already a positive effect before the treatment implementation
  data_anticip[rel_period ==-1, hrs_listened := calc_dep_var(constant,unit_fe,period_fe,tau_cum_anticipation,error)]

  # Summarize the dependent variable over relative period
  avg_dep_anticip <- data_anticip[rel_period != 99, .(mean_hrs_listened = mean(hrs_listened)), (rel_period)]
  setorder(avg_dep_anticip, 'rel_period')

  rel_periods <- sort(unique(avg_dep_anticip$rel_period))
  ggplot(avg_dep_anticip, aes(y=mean_hrs_listened, x=rel_period)) +
    geom_bar(position="dodge", stat="identity", fill = 'deepskyblue') +  coord_cartesian(ylim=c(79,85))+
    labs(x = "Relative period", y = "Hours", title = 'Average music listening over relative time period',
         caption = 'Only for the treated units') +
    theme(legend.position = 'bottom',
          legend.title = element_blank(),
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_x_continuous(breaks = min(rel_periods):max(rel_periods))

  # The dependent variable is already starting to pick up at rel.period = -1

  # Event study
  formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
  model <- feols(formula,
                 data = data_anticip, panel.id = "unit",
                 fixef = c("unit", "period"))
  summary(model)
  kable(tbl_res_rel_period(model) , 'simple')

  # Now prev times are minus and the treatment effect in the post are not true even though I changed nothing
  # The issue is that I still use the rel.period. -1 as the reference period

  # Use release period -2 as the reference period instead
  covs_anticip <- setdiff(c(covs,'rel_period_min1'),'rel_period_min2')
  covs_anticip_collapse <- paste0(covs_anticip,collapse = '+')

  formula <- as.formula(paste0('hrs_listened ~ ',covs_anticip_collapse))
  model <- feols(formula,
               data = data_anticip, panel.id = "unit",
               fixef = c("unit", "period"))
  summary(model)
  kable(tbl_res_rel_period(model) , 'simple')
  # Now, we can see that the treatment effects (TEs) are accurately estimated
  # Plus, you can also see that there is no effect in rel. period -3 but there
  # is already a positive treatment effect in the rel. period -1


# 2.  Heterogeneity -------------------------------------------------------
  # Until now, the dynamics of the TE between the treatment groups are the same
  # They are affected by the treatment in the same fashion overtime
  # However, what if the first treated group is affected much more positively?
  data_hetero <- copy(data)

  # calculate tau
  data_hetero[cohort_period==2, tau := 1.5*tau]
  # calculate tau_cum
  setkeyv(data_hetero, c('unit', 'period')) # order
  data_hetero[cohort_period==2, tau_cum := cumsum(tau), by = unit]
  # calculate the dependent variable
  data_hetero[cohort_period==2, hrs_listened := calc_dep_var(constant,unit_fe,period_fe,tau_cum,error)]

  # Visualize
  avg_dv_period <- data_hetero[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')] # Graph this

  ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) +
    geom_bar(position="dodge", stat="identity") +  coord_cartesian(ylim=c(79,85))+
    labs(x = "Period", y = "Hours", title = 'Average music listening (hours)',
         caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.',
         subtitle = 'Treatment effect heterogeneity across cohorts') +
    theme(legend.position = 'bottom',
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
    geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+
    geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) +
    geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+
    geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) +
    guides(fill=guide_legend(title="Treatment cohort period"))

  # Treatment effect
  avg_treat_period <- data_hetero[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')]
  ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) +
    geom_bar(position="dodge", stat="identity") +  #coord_cartesian(ylim=c(79,85))+
    labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)',
         caption = 'Cohort 2 is the early treated and cohort 3 is the late treated group.',
         subtitle = 'Treatment effect heterogeneity across cohorts') +
    theme(legend.position = 'bottom',
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
    guides(fill=guide_legend(title="Treatment cohort period"))

  # Event study
  formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
  model <- feols(formula,
               data = data_hetero, panel.id = "unit",
               fixef = c("unit", "period"))
  summary(model)

  kable(tbl_res_rel_period(model), 'simple')

  # Event study with cohort specific treatment effects
  formula <- as.formula(paste0('hrs_listened ~ ',interact_covs_collapse))
  model <- feols(formula,
               data = data_hetero, panel.id = "unit",
               fixef = c("unit", "period"))
  summary(model)

  kable(tbl_res_period_cohort(model), 'simple')

# 3. Under-identification -------------------------------------------------
  # Just drop the non treated group
  # see what happens
  data_under_id <- data[cohort_period != 99]

  avg_dv_period <- data_under_id[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')]

  ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) +
    geom_bar(position="dodge", stat="identity") +  coord_cartesian(ylim=c(79,85))+
    labs(x = "Period", y = "Hours", title = 'Average music listening (hours)',
         caption = 'Cohort 2 is the early treated and cohort 3 is the late treated group (cohort 99 is excluded).') +
    theme(legend.position = 'bottom',
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
    geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+
    geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) +
    geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+
    geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) +
    guides(fill=guide_legend(title="Treatment cohort period"))

  # Graph true treat effect (it is the same as the original)
  avg_treat_period <- data_under_id[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')]
  ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) +
    geom_bar(position="dodge", stat="identity") +  #coord_cartesian(ylim=c(79,85))+
    labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)',
         caption = 'Cohort 2 is the early treated, cohort 3 is the late treated group.',
         subtitle = 'Treatment effect heterogeneity across cohorts') +
    theme(legend.position = 'bottom',
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
    guides(fill=guide_legend(title="Treatment cohort period"))

  # Event study
  # Just running a regular event study w/o changing anything
  formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
  model <- feols(formula,
                 data = data_under_id, panel.id = "unit",
                 fixef = c("unit", "period"))
  summary(model) # --> crazy results. let's exclude the most negative relative time period dummy.

  kable(tbl_res_rel_period(model), 'simple')

  # Use the most negative and also 1 period before as the reference period
  # The most negative relative time period is -3, exclude that in addition to the regular -1
  covs <- setdiff(rel_per_dummies_new, c('rel_period_99','rel_period_min1', 'rel_period_min3'))
  covs_collapse <- paste0(covs, collapse='+')
  formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))

  model <- feols(formula,
                 data = data_under_id, panel.id = "unit",
                 fixef = c("unit", "period"))
  summary(model) # --> no longer crazy results! Accurately estimating the TEs.

  kable(tbl_res_rel_period(model), 'simple')

	# Simulation study for the Medium article:
	# Event Studies for Causal Inference: The Dos and Don'ts -
	# A guide to avoiding the common pitfalls of event studies

	# This code simulates a panel dataset and then runs event studies
	# Different scenarios are created to demonstrate pitfalls of event studies
	# The simulaiton part of the code is adapted from Andrew Baker's awesome blog:
	# https://andrewcbaker.netlify.app/2020/06/27/how-to-create-relative-time-indicators/
	# Also see a relevant package and blog by Sant'Anna & Callaway:
	# https://bcallaway11.github.io/did/articles/pre-testing.html

	rm(list = ls())
	library(data.table)
	library(fastDummies)
	library(tidyverse)
	library(ggthemes)
	library(fixest)
	library(kableExtra)


	theme_set(theme_clean() + theme(plot.background = element_blank()))
	select <- dplyr::select
	set.seed(20221228)

	# Simulate data -----------------------------------------------------------
	# Create a function that takes in the number of units, the number of periods,
	# a treatment effect per period (tau_cum), and
	# cohort periods (in which periods a cohort is treated).

	# If a cohort period is greater than the end observation period this means that
	# this cohort is untreated during the observation period.

	# Create a function to calculate the dependent variable throughout
	calc_dep_var <- function(constant, unit_fe, period_fe, tau_cum, error){
	dep_var = constant + unit_fe + period_fe + tau_cum + error
	}

	constant = 80 # set the constant so that the outcome variable makes sense for the example

	make_data <- function(num_unit, num_period,tau, cohort_periods){
	end_period = num_period-1 # periods start from 0
	# Fixed Effects ------------------------------------------------
	# unit fixed effects
	unit <- tibble(
	unit = 1:num_unit,
	unit_fe = rnorm(num_unit, 0, .1),
	# Assign units to different treatment cohorts
	cohort_period = sample(cohort_periods, num_unit, replace = TRUE),
	# generate treatment effect
	mu = rnorm(num_unit, tau, 0.2))

	# period fixed effects
	period <- tibble(
	period = 0:end_period,
	period_fe = rnorm(num_period, 0, .1))

	# Trend Break -------------------------------------------------------------

	# make main dataset
	# full interaction of unit X period
	tot_num_obs = num_unit*num_period

	expand_grid(unit = 1:num_unit, period = 0:end_period) %>%
	left_join(., unit) %>%
	left_join(., period) %>%
	# make error term and get treatment indicators and treatment effects
	mutate(error = rnorm(tot_num_obs, 0, 0.5),
	treat = ifelse(period >= cohort_period, 1, 0),
	tau = ifelse(treat == 1, mu, 0)) %>%
	# calculate cumulative treatment effects
	group_by(unit) %>%
	mutate(tau_cum = cumsum(tau)) %>%
	ungroup() %>%
	# calculate the dep variable
	mutate(dep_var = calc_dep_var(constant, unit_fe, period_fe, tau_cum,error))
	}

	# Make data
	num_unit <- 10000
	num_period <- 5
	cohort_periods <- c(2,3,99) # Cohort period 99 is the untreated cohort
	tau <- 1.00 # For each treated period the treatment effect accumulates by this number
	data <- make_data(num_unit, num_period, tau, cohort_periods)
	data <- as.data.table(data)

	setnames(data, 'dep_var', 'hrs_listened')

	setkeyv(data, c('cohort_period', 'unit', 'period'))
	select_cols <- c('unit', 'period', 'cohort_period','treat','hrs_listened')

	kable(head(data[, ..select_cols]), 'simple')

	kable(summary(data[, ..select_cols]), 'simple')
	nrow(data)
	# We have 10000 units and 5 periods (incl. 0). This makes a total of 50000 observations.

	avg_dv_period <- data[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')]

	# The treatment effect changes overtime
	avg_treat_period <- data[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')]


	# Final plot showing the outcome over time
	cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

	ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) +
	geom_bar(position="dodge", stat="identity") + coord_cartesian(ylim=c(79,85))+
	labs(x = "Period", y = "Hours", title = 'Average music listening (hours)',
	caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.') +
	theme(legend.position = 'bottom',
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
	geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+
	geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) +
	geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+
	geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) +
	guides(fill=guide_legend(title="Treatment cohort period"))

	# Treatment effect
	ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) +
	geom_bar(position="dodge", stat="identity") + #coord_cartesian(ylim=c(79,85))+
	labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)',
	caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.') +
	theme(legend.position = 'bottom',
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
	guides(fill=guide_legend(title="Treatment cohort period"))


	# Create relative time dummies to use in the regression
	data <- data %>%
	# make relative year indicator
	mutate(rel_period = ifelse(cohort_period == 99,99,period - cohort_period))
	summary(data$rel_period)

	data <- data %>%
	dummy_cols(select_columns = "rel_period")

	rel_per_dummies <- colnames(data)[grepl('rel_period_', colnames(data))]
	# Change name w/ minuses to handle them more easily
	rel_per_dummies_new<-gsub('-','min', rel_per_dummies)
	setnames(data, rel_per_dummies, rel_per_dummies_new)

	# Event studies in the simple situation -----------------------------------
	# Event study
	covs <- setdiff(rel_per_dummies_new, c('rel_period_99','rel_period_min1'))
	covs_collapse <- paste0(covs, collapse='+')
	formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
	model <- feols(formula,
	data = data, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model)

	# Create function to report simple event study results
	tbl_res_rel_period <- function(model){
	res <- as.data.table(model$coeftable, keep.rownames = T)
	res[, rn := gsub('rel_period_','', rn)]
	res[, rn := as.numeric(gsub('min','-', rn))]
	res <- res %>%
	mutate_at(vars(-rn), funs(round(., 2)))
	setnames(res, 'rn','Relative period')
	setorder(res, by = 'Relative period')
	return(res)
	}

	kable(tbl_res_rel_period(model), 'simple')

	# Event study with cohort specific treatment effects
	# Create dummies for the cohort-period
	data <- data %>%
	dummy_cols(select_columns = "cohort_period")
	cohort_dummies <- c('cohort_period_2','cohort_period_3')
	# Create interactions between relative period and cohort dummies
	interact <- as.data.table(expand_grid(cohort_dummies, covs))
	# Ps. one can drop rel_period_min3 for cohort 2 and
	# rel_period_2 for cohort 3 as these observations don't exist
	# but I'll let the regression drop these
	interact[, interaction := paste0(cohort_dummies,':',covs)]
	interact_covs <- interact$interaction
	interact_covs_collapse <- paste0(interact_covs,collapse = '+')

	formula <- as.formula(paste0('hrs_listened ~ ',interact_covs_collapse))
	model <- feols(formula,
	data = data, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model)
	# Two variables are naturally dropped because the most negative relative time period
	# for cohort 2 is -2 and for the cohort period 3,
	# we only have relative period up to 1.

	# The results are consistent with the true effects. There are no statistically
	# significant effects pre-treatment and there are positive treatment effects
	# after the treatment


	# Report period-cohort event study result
	tbl_res_period_cohort <- function(model){
	res <- as.data.table(model$coeftable, keep.rownames = T)

	res[, 'Cohort period' := as.numeric(str_extract(rn, "(?i)(?<=cohort_period_)\\d+"))]
	res[, rn := gsub('cohort_period_3','', rn)]
	res[, rn := gsub('cohort_period_2','', rn)]
	res[, rn := gsub(':','', rn)]

	res[, rn := gsub('rel_period_','', rn)]
	res[, rn := as.numeric(gsub('min','-', rn))]
	res <- res %>%
	mutate_at(vars(-rn), funs(round(., 2)))
	setnames(res, 'rn','Relative period')
	setorderv(res, c('Cohort period','Relative period'))
	setcolorder(res, 'Cohort period')
	return(res)
	}

	kable(tbl_res_period_cohort(model), format = 'simple')

	# 1. Anticipation ---------------------------------------------------------
	tau_cum_anticipation = .5 # set how much anticipation is going to be there
	data_anticip <- copy(data) # make a copy
	summary(data_anticip[rel_period ==-1]$tau_cum)
	# Include some anticipation: there is already a positive effect before the treatment implementation
	data_anticip[rel_period ==-1, hrs_listened := calc_dep_var(constant,unit_fe,period_fe,tau_cum_anticipation,error)]

	# Summarize the dependent variable over relative period
	avg_dep_anticip <- data_anticip[rel_period != 99, .(mean_hrs_listened = mean(hrs_listened)), (rel_period)]
	setorder(avg_dep_anticip, 'rel_period')

	rel_periods <- sort(unique(avg_dep_anticip$rel_period))
	ggplot(avg_dep_anticip, aes(y=mean_hrs_listened, x=rel_period)) +
	geom_bar(position="dodge", stat="identity", fill = 'deepskyblue') + coord_cartesian(ylim=c(79,85))+
	labs(x = "Relative period", y = "Hours", title = 'Average music listening over relative time period',
	caption = 'Only for the treated units') +
	theme(legend.position = 'bottom',
	legend.title = element_blank(),
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_x_continuous(breaks = min(rel_periods):max(rel_periods))

	# The dependent variable is already starting to pick up at rel.period = -1

	# Event study
	formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
	model <- feols(formula,
	data = data_anticip, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model)
	kable(tbl_res_rel_period(model) , 'simple')

	# Now prev times are minus and the treatment effect in the post are not true even though I changed nothing
	# The issue is that I still use the rel.period. -1 as the reference period

	# Use release period -2 as the reference period instead
	covs_anticip <- setdiff(c(covs,'rel_period_min1'),'rel_period_min2')
	covs_anticip_collapse <- paste0(covs_anticip,collapse = '+')

	formula <- as.formula(paste0('hrs_listened ~ ',covs_anticip_collapse))
	model <- feols(formula,
	data = data_anticip, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model)
	kable(tbl_res_rel_period(model) , 'simple')
	# Now, we can see that the treatment effects (TEs) are accurately estimated
	# Plus, you can also see that there is no effect in rel. period -3 but there
	# is already a positive treatment effect in the rel. period -1


	# 2. Heterogeneity -------------------------------------------------------
	# Until now, the dynamics of the TE between the treatment groups are the same
	# They are affected by the treatment in the same fashion overtime
	# However, what if the first treated group is affected much more positively?
	data_hetero <- copy(data)

	# calculate tau
	data_hetero[cohort_period==2, tau := 1.5*tau]
	# calculate tau_cum
	setkeyv(data_hetero, c('unit', 'period')) # order
	data_hetero[cohort_period==2, tau_cum := cumsum(tau), by = unit]
	# calculate the dependent variable
	data_hetero[cohort_period==2, hrs_listened := calc_dep_var(constant,unit_fe,period_fe,tau_cum,error)]

	# Visualize
	avg_dv_period <- data_hetero[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')] # Graph this

	ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) +
	geom_bar(position="dodge", stat="identity") + coord_cartesian(ylim=c(79,85))+
	labs(x = "Period", y = "Hours", title = 'Average music listening (hours)',
	caption = 'Cohort 2 is the early treated, cohort 3 is the late treated and cohort 99 is the never treated group.',
	subtitle = 'Treatment effect heterogeneity across cohorts') +
	theme(legend.position = 'bottom',
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
	geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+
	geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) +
	geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+
	geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) +
	guides(fill=guide_legend(title="Treatment cohort period"))

	# Treatment effect
	avg_treat_period <- data_hetero[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')]
	ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) +
	geom_bar(position="dodge", stat="identity") + #coord_cartesian(ylim=c(79,85))+
	labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)',
	caption = 'Cohort 2 is the early treated and cohort 3 is the late treated group.',
	subtitle = 'Treatment effect heterogeneity across cohorts') +
	theme(legend.position = 'bottom',
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
	guides(fill=guide_legend(title="Treatment cohort period"))

	# Event study
	formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
	model <- feols(formula,
	data = data_hetero, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model)

	kable(tbl_res_rel_period(model), 'simple')

	# Event study with cohort specific treatment effects
	formula <- as.formula(paste0('hrs_listened ~ ',interact_covs_collapse))
	model <- feols(formula,
	data = data_hetero, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model)

	kable(tbl_res_period_cohort(model), 'simple')

	# 3. Under-identification -------------------------------------------------
	# Just drop the non treated group
	# see what happens
	data_under_id <- data[cohort_period != 99]

	avg_dv_period <- data_under_id[, .(mean_hrs_listened = mean(hrs_listened)), by = c('cohort_period','period')]

	ggplot(avg_dv_period, aes(fill=factor(cohort_period), y=mean_hrs_listened, x=period)) +
	geom_bar(position="dodge", stat="identity") + coord_cartesian(ylim=c(79,85))+
	labs(x = "Period", y = "Hours", title = 'Average music listening (hours)',
	caption = 'Cohort 2 is the early treated and cohort 3 is the late treated group (cohort 99 is excluded).') +
	theme(legend.position = 'bottom',
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
	geom_vline(xintercept = 1.5, color = '#999999', lty = 5)+
	geom_vline(xintercept = 2.5, color = '#E69F00', lty = 5) +
	geom_text(label = 'Cohort period 2 is treated',aes(1.4,83), color = '#999999', angle = 90)+
	geom_text(label = 'Cohort period 3 is treated',aes(2.4,83), color = '#E69F00', angle = 90) +
	guides(fill=guide_legend(title="Treatment cohort period"))

	# Graph true treat effect (it is the same as the original)
	avg_treat_period <- data_under_id[treat == 1, .(mean_treat_effect = mean(tau_cum)), by = c('cohort_period','period')]
	ggplot(avg_treat_period, aes(fill=factor(cohort_period), y=mean_treat_effect, x=period)) +
	geom_bar(position="dodge", stat="identity") + #coord_cartesian(ylim=c(79,85))+
	labs(x = "Period", y = "Hours", title = 'True treatment effect (hrs)',
	caption = 'Cohort 2 is the early treated, cohort 3 is the late treated group.',
	subtitle = 'Treatment effect heterogeneity across cohorts') +
	theme(legend.position = 'bottom',
	axis.title = element_text(size = 14),
	axis.text = element_text(size = 12)) + scale_fill_manual(values=cbPalette) +
	guides(fill=guide_legend(title="Treatment cohort period"))

	# Event study
	# Just running a regular event study w/o changing anything
	formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))
	model <- feols(formula,
	data = data_under_id, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model) # --> crazy results. let's exclude the most negative relative time period dummy.

	kable(tbl_res_rel_period(model), 'simple')

	# Use the most negative and also 1 period before as the reference period
	# The most negative relative time period is -3, exclude that in addition to the regular -1
	covs <- setdiff(rel_per_dummies_new, c('rel_period_99','rel_period_min1', 'rel_period_min3'))
	covs_collapse <- paste0(covs, collapse='+')
	formula <- as.formula(paste0('hrs_listened ~ ',covs_collapse))

	model <- feols(formula,
	data = data_under_id, panel.id = "unit",
	fixef = c("unit", "period"))
	summary(model) # --> no longer crazy results! Accurately estimating the TEs.

	kable(tbl_res_rel_period(model), 'simple')