Skip to content

Instantly share code, notes, and snippets.

@srosh2000
Created September 1, 2023 08:46
Show Gist options
  • Save srosh2000/13be23a4a37e16d21d2842e704b2c60d to your computer and use it in GitHub Desktop.
Save srosh2000/13be23a4a37e16d21d2842e704b2c60d to your computer and use it in GitHub Desktop.
Staggered DiD example
# Load packages
library(data.table)
library(dplyr)
library(tidyr)
library(zoo)
library(did)
library(lubridate)
library(googledrive)
library(ggplot2)
#### --- Load data ---###
# Download data from google drive
# drive_deauth() switches googledrive to unauthenticated requests, so no
# Google login is requested; this only works for files readable without one.
drive_deauth()
file_id <- "1e-VogI4zQPi_xbSydVLH7PrgkOlWpczl"
# Download to an explicit local path so that the read below does not depend on
# the name the file happens to have on Google Drive.
data_path <- "GoodAma_sample.csv"
drive_download(as_id(file_id), path = data_path, overwrite = TRUE)
GoodAma_sample <- fread(data_path)
# SETTING UP THE DATA FOR ESTIMATION
# Get the minimum and maximum review dates (ignoring missing dates, which
# would otherwise turn both bounds into NA and break seq() below)
min_date <- min(GoodAma_sample$year_month, na.rm = TRUE)
max_date <- max(GoodAma_sample$year_month, na.rm = TRUE)
# Create a dataframe with one row per month in the review date range
date_range <- seq(min_date, max_date, by = "month")
date_df <- data.frame(date = date_range)
# Number the months consecutively (1 = earliest review month); these integers
# are the time periods used for estimation
date_df$period <- seq_len(nrow(date_df))
# Convert question time to a year-month variable stored as a Date
GoodAma_sample$question_ym <- as.Date(as.yearmon(GoodAma_sample$cleaned_question_time))
# Merge the review dates with the date dataframe to get the review period.
# all.x = TRUE keeps rows whose date has no match (their period becomes NA).
merged_df <- merge(GoodAma_sample, date_df, by.x = "year_month", by.y = "date", all.x = TRUE)
# Merge the question dates with the date dataframe to get the question period.
# Both lookups add a column named `period`, so merge() suffixes them with
# .x (review, from the first merge) and .y (question, from this merge).
merged_df <- merge(merged_df, date_df, by.x = "question_ym", by.y = "date", all.x = TRUE)
# Rename the merged columns for review and question time
names(merged_df)[names(merged_df) == "period.x"] <- "period_review"
names(merged_df)[names(merged_df) == "period.y"] <- "period_question"
# Rows with goodr == 0 form the control (never-treated) group
is_control <- merged_df$goodr == 0
# Set the first-treatment period to zero for the control group: att_gt()
# expects gname == 0 for never-treated units.
merged_df$period_question <- ifelse(is_control, 0, merged_df$period_question)
# NOTE: period_review (the time variable passed as tname) is intentionally
# left untouched for control rows. Overwriting it with 0 would collapse every
# control observation into a single period 0, leaving no control observations
# in the periods where treated units are observed.
# Change Amazon's ASIN to identify never treated group (otherwise `goodr` was the treatment group indicator)
# This step is crucial because otherwise the package cannot identify the control group as both Goodreads and Amazon books are matched with the same ASIN
# which() drops NA comparisons so the subscripted assignment cannot fail on them
control_rows <- which(is_control)
merged_df$asin[control_rows] <- paste0(merged_df$asin[control_rows], "1")
# Create treatment dummy: 1 for non-control rows reviewed in or after the
# question period, 0 otherwise (always 0 for the control group)
merged_df$treat <- ifelse(is_control, 0, ifelse(merged_df$period_review >= merged_df$period_question, 1, 0))
# --- ESTIMATION USING NOT-YET-TREATED UNITS AS CONTROL --- #
# Restrict the sample to rows with goodr == 1; the comparison group is then
# requested from att_gt() via control_group = "notyettreated".
goodreads_df <- merged_df[merged_df$goodr == 1, ]
# Hand att_gt() numeric identifiers: cast the review period to numeric and
# recode each ASIN as the index of its factor level.
goodreads_df$period_review <- as.numeric(goodreads_df$period_review)
goodreads_df$asin <- as.numeric(as.factor(goodreads_df$asin))
# Group-time average treatment effects on `rating`, with groups defined by the
# period of the question (period_question) and time by the review period.
staggered_notyettreated <- att_gt(
  data = goodreads_df,
  yname = "rating",
  idname = "asin",
  tname = "period_review",
  gname = "period_question",
  control_group = "notyettreated",
  allow_unbalanced_panel = TRUE
)
summary(staggered_notyettreated)
# Plot of the raw group-time effects is left disabled (open question from the
# author whether it is needed):
# ggdid(staggered_notyettreated)
# Aggregate the group-time effects by time relative to treatment
# (type = "dynamic"), dropping missing group-time estimates.
staggered_notyettreated_aggregate <- aggte(
  staggered_notyettreated,
  type = "dynamic",
  na.rm = TRUE
)
summary(staggered_notyettreated_aggregate)
# Plot of the aggregated effects with custom axis labels
staggered_notyettreated_plot <- ggdid(staggered_notyettreated_aggregate) +
  labs(
    x = "Time Relative to Q&A Adoption (in 30-day bins)",
    y = "ATT"
  )
print(staggered_notyettreated_plot)
# --- ESTIMATION USING NEVER-TREATED UNITS AS CONTROL --- #
# Keep only the columns carried into this estimation sample
stag_did <- merged_df[, c("asin", "rating", "period_review", "period_question","treat","fiction")]
# Hand att_gt() numeric identifiers: cast the review period to numeric and
# recode each ASIN as the index of its factor level.
stag_did$period_review <- as.numeric(stag_did$period_review)
stag_did$asin <- as.numeric(as.factor(stag_did$asin))
# Group-time average treatment effects on `rating`, using the never-treated
# comparison group and clustering on asin.
staggered_nevertreated <- att_gt(
  data = stag_did,
  yname = "rating",
  idname = "asin",
  tname = "period_review",
  gname = "period_question",
  control_group = "nevertreated",
  clustervars = "asin",
  allow_unbalanced_panel = TRUE
)
summary(staggered_nevertreated)
# Aggregate the group-time effects by time relative to treatment
# (type = "dynamic"), dropping missing group-time estimates.
staggered_nevertreated_aggregated <- aggte(
  staggered_nevertreated,
  type = "dynamic",
  na.rm = TRUE
)
summary(staggered_nevertreated_aggregated)
# Plot of the aggregated effects with custom axis labels
staggered_nevertreated_aggregated_plot <- ggdid(staggered_nevertreated_aggregated) +
  labs(
    x = "Time Relative to Q&A Adoption (in 30-day bins)",
    y = "ATT"
  )
print(staggered_nevertreated_aggregated_plot)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment