# srosh2000/staggered_DiD_example.R

## staggered_DiD_example.R
# Load packages
library(data.table)
library(dplyr)
library(tidyr)
library(zoo)
library(did)
library(lubridate)
library(googledrive)
library(ggplot2)

#### --- Load data --- ####
# Download the sample data from Google Drive and read it into a data.table.

# Switch googledrive to unauthenticated requests; this only works for files
# that are shared publicly (presumably the case for this sample file).
drive_deauth()

# Google Drive ID of the sample CSV.
file_id <- "1e-VogI4zQPi_xbSydVLH7PrgkOlWpczl"

# Local file name for the download. Passing it explicitly to drive_download()
# guarantees that fread() reads exactly the file that was just downloaded,
# independent of the name the file carries on Drive.
data_path <- "GoodAma_sample.csv"

drive_download(as_id(file_id), path = data_path, overwrite = TRUE)

GoodAma_sample <- fread(data_path)

# SETTING UP THE DATA FOR ESTIMATION
#
# Builds `merged_df` from `GoodAma_sample` by adding
#   period_review   - running index (1, 2, ...) of the review month `year_month`
#   period_question - running index of the question month; 0 for control rows
#   treat           - 1 for treated-platform reviews written in or after the
#                     question month, 0 otherwise
# and by giving control rows their own unit identifiers.

# Get the minimum and maximum review dates
min_date <- min(GoodAma_sample$year_month)
max_date <- max(GoodAma_sample$year_month)

# One row per calendar month in the observed review range, numbered 1..n
date_range <- seq(min_date, max_date, by = "month")
date_df <- data.frame(date = date_range, period = seq_along(date_range))

# Truncate the question time to the first day of its month so it can be
# matched against `date_df$date`
# NOTE(review): assumes `year_month` is also stored as first-of-month dates.
GoodAma_sample$question_ym <- as.Date(as.yearmon(GoodAma_sample$cleaned_question_time))

# Look up the period index of each review month
merged_df <- merge(GoodAma_sample, date_df, by.x = "year_month", by.y = "date", all.x = TRUE)
names(merged_df)[names(merged_df) == "period"] <- "period_review"

# Look up the period index of each question month. Rows with a missing
# question time, or a question month outside the review date range, find no
# match and keep NA here.
merged_df <- merge(merged_df, date_df, by.x = "question_ym", by.y = "date", all.x = TRUE)
names(merged_df)[names(merged_df) == "period"] <- "period_question"

# Control rows (goodr == 0) are never treated; the did package expects the
# group variable to be 0 for such units.
merged_df$period_question <- ifelse(merged_df$goodr == 0, 0, merged_df$period_question)
# `period_review` is intentionally NOT reset for control rows: it is the time
# variable of the panel, and overwriting it with 0 would put every control
# observation into a single period, leaving the never-treated comparison
# without any usable time variation.

# Give control rows their own unit identifiers (otherwise `goodr` would be the
# only thing telling the two platforms apart).
# This step is crucial because both platforms' books are matched on the same
# ASIN, so the package could not identify the control units otherwise. A
# non-alphanumeric suffix is used so that a modified identifier can never
# coincide with another book's original ASIN.
is_control <- which(merged_df$goodr == 0)
merged_df$asin[is_control] <- paste0(merged_df$asin[is_control], "_amazon")

# Treatment dummy: always 0 for control rows; for treated-platform rows it is
# 1 once the review period has reached the question period.
merged_df$treat <- ifelse(
  merged_df$goodr == 0,
  0,
  ifelse(merged_df$period_review >= merged_df$period_question, 1, 0)
)
# --- ESTIMATION USING NOT-YET-TREATED UNITS AS CONTROL --- #

# Subset to the Goodreads rows (goodr == 1); units that have not yet been
# treated in a given period serve as the comparison group.
goodreads_df <- merged_df[merged_df$goodr == 1, ]

# att_gt() needs numeric time and id columns: recode the ASIN into integer
# codes via a factor.
goodreads_df$period_review <- as.numeric(goodreads_df$period_review)
goodreads_df$asin <- as.numeric(as.factor(goodreads_df$asin))

# att_gt() and aggte() rely on a multiplier bootstrap by default, so fix the
# RNG state to make standard errors and confidence bands reproducible. The
# seed value itself is arbitrary.
set.seed(1234)

# Group-time average treatment effects on `rating`, with groups defined by the
# question period and time by the review period.
staggered_notyettreated <- att_gt(
  yname = "rating",
  tname = "period_review",
  idname = "asin",
  gname = "period_question",
  control_group = "notyettreated",
  data = goodreads_df,
  allow_unbalanced_panel = TRUE
)

summary(staggered_notyettreated)

# Optional: ggdid(staggered_notyettreated) would plot every group-time effect.

# Aggregate the group-time average treatment effects into an event study
# (effects by time relative to treatment).
staggered_notyettreated_aggregate <- aggte(
  staggered_notyettreated,
  type = "dynamic",
  na.rm = TRUE
)
summary(staggered_notyettreated_aggregate)

# NOTE(review): the axis label says "30-day bins", but the period index is
# built from calendar months during data setup - confirm the wording.
staggered_notyettreated_plot <- ggdid(staggered_notyettreated_aggregate) +
  labs(x = "Time Relative to Q&A Adoption (in 30-day bins)", y = "ATT")
print(staggered_notyettreated_plot)

# --- ESTIMATION USING NEVER-TREATED UNITS AS CONTROL --- #

# Keep only the columns used below; both platforms stay in the data so that
# the rows with period_question == 0 can act as the never-treated group.
stag_did <- merged_df[, c("asin", "rating", "period_review", "period_question", "treat", "fiction")]

# att_gt() needs numeric time and id columns: recode the ASIN into integer
# codes via a factor.
stag_did$period_review <- as.numeric(stag_did$period_review)
stag_did$asin <- as.numeric(as.factor(stag_did$asin))

# att_gt() and aggte() rely on a multiplier bootstrap by default, so fix the
# RNG state to make standard errors and confidence bands reproducible. The
# seed value itself is arbitrary.
set.seed(1234)

# Group-time average treatment effects on `rating`, clustered at the unit
# level, using never-treated units as the comparison group.
staggered_nevertreated <- att_gt(
  yname = "rating",
  tname = "period_review",
  idname = "asin",
  gname = "period_question",
  data = stag_did,
  control_group = "nevertreated",
  allow_unbalanced_panel = TRUE,
  clustervars = "asin"
)

summary(staggered_nevertreated)

# Aggregate the group-time effects into an event study (effects by time
# relative to treatment).
staggered_nevertreated_aggregated <- aggte(
  staggered_nevertreated,
  type = "dynamic",
  na.rm = TRUE
)
summary(staggered_nevertreated_aggregated)

# NOTE(review): the axis label says "30-day bins", but the period index is
# built from calendar months during data setup - confirm the wording.
staggered_nevertreated_aggregated_plot <- ggdid(staggered_nevertreated_aggregated) +
  labs(x = "Time Relative to Q&A Adoption (in 30-day bins)", y = "ATT")
print(staggered_nevertreated_aggregated_plot)
	# End of script.
	# A tab-indented second copy of the complete script above (package loading,
	# download, data setup and both estimations) used to follow here. It was
	# identical apart from whitespace, so it only repeated the download and the
	# estimations and overwrote every result; it has been removed.