srosh2000/canonical_DiD_example.R

## canonical_DiD_example.R
# Load packages
library(googledrive)
library(data.table)
library(fixest)
library(broom)
library(ggplot2)
library(dplyr)
library(zoo)

# download data from google drive

drive_deauth()

file_id_gr<- "1voW5dWUiplIzWHa-KnJx0lh9lz7h1Wzf"
file_id_am<- "1E0HG5ZCSF939rTE7gRPHUP6QFS43JXDm"

drive_download(as_id(file_id_gr), overwrite = TRUE)
drive_download(as_id(file_id_am), overwrite = TRUE)

# load data
goodreads_subset<- fread("gr_did.csv")
amazon_subset<- fread("am_did.csv")

### --- DATA PREPARATION ---###
# create dummy "qa" which takes 0 if before may 21st, 2014 and 1 for after
cutoff_date<- as.Date('2014-05-21')
amazon_subset$qa<- ifelse(amazon_subset$unixReviewTime < cutoff_date, 0, 1)
amazon_subset$goodr <- 0
amazon_subset$goodr_after<- 0

# now for goodreads
goodreads_subset$goodr <- 1
goodreads_subset$qa <- ifelse(goodreads_subset$date < cutoff_date, 0, 1)
goodreads_subset$goodr_after<- goodreads_subset$qa * goodreads_subset$goodr

# add time fixed effects to both datasets

# check which dataset has the oldest observation
amazon_subset %>% arrange(unixReviewTime) %>% head() # oldest: 1996-11-14
goodreads_subset %>% arrange(date) %>% head() # oldest: 2007-01-06

first_date<- as.Date('1996-11-14')
# compute time fixed effect as number of months since first review
amazon_subset$t<- (as.yearmon(amazon_subset$unixReviewTime) - as.yearmon(first_date))*12
goodreads_subset$t<- (as.yearmon(goodreads_subset$date) - as.yearmon(first_date))*12


# merge both datasets
GoodAma <- select (amazon_subset,-c( reviewerID))
goodreads_subset <- goodreads_subset[ , c("asin","t", "rating", "date", "year_month", "goodr", "qa", "goodr_after")]
GoodAma <- dplyr::rename(GoodAma, rating = overall)
GoodAma <- dplyr::rename(GoodAma, date = unixReviewTime)

goodreads_subset$date<- as.Date(goodreads_subset$date) # resolve error of unmatched date class attribute
goodreads_subset$year_month<- as.Date(as.yearmon(goodreads_subset$year_month)) # same as above to match classes of year_month
GoodAma$date<- as.Date(GoodAma$date)
GoodAma$year_month<- as.Date(as.yearmon(GoodAma$year_month))

GoodAma <- rbind(GoodAma, goodreads_subset, fill = TRUE) # FINAL DATASET

means<- GoodAma %>%
  group_by(goodr, qa) %>%
  summarize(rating=mean(rating)) %>%
  ungroup()

print(means)

# add labels
library(dplyr)

# Modify the labels using mutate and case_when
means_modified <- means %>%
  mutate(
    goodr_label = case_when(
      goodr == 0 ~ "Amazon",
      goodr == 1 ~ "Goodreads"
    ),
    qa_label = case_when(
      qa == 0 ~ "Before",
      qa == 1 ~ "After"
    )
  )

# Print the modified table
print(means_modified)


# compute treatment effect
# before-after difference for untreated, has time effect only
bef_aft_untreated <- filter(means, goodr == 0, qa == 1)$rating - filter(means, goodr == 0, qa == 0)$rating

# before-after for treated, has time and treatment effect
bef_aft_treated<- filter(means, goodr == 1, qa == 1)$rating - filter(means, goodr == 1, qa == 0)$rating

# difference-in-difference: take time +treatment effect, and remove the time effect
did<- bef_aft_treated - bef_aft_untreated

print(paste("Diff in Diff Estimate: " , did))
### --- ANALYSIS --- ###

# Check if parallel trends assumption holds by visualising
temp<- GoodAma %>% filter(goodr == 1) %>% group_by(t) %>% summarize(mean_goodreads = mean(rating), n_obs = n()) %>% filter(n_obs > 100)
temp2<- GoodAma %>% filter(goodr == 0) %>% group_by(t) %>% summarize(mean_amazon = mean(rating), n_obs = n()) %>% filter(n_obs >100)
plot_data<- temp %>% left_join(temp2, by = "t")

plot_avg_rating<- ggplot(plot_data, (aes(x = t)))+
  geom_line(aes(y = mean_goodreads), color = "red")+
  geom_line(aes(y = mean_amazon), color = "blue")+
  geom_vline(xintercept = 210, linetype = "longdash")+
  ylab("Mean Rating")+
  xlab("Number of periods") +
  theme_bw()+
  annotate("text", x = 175, y = 4.3, label = "Amazon", size = 5.5, color = "blue")+
  annotate("text", x = 164, y = 3.88, label = "Goodreads", size = 5.5, color = "red") +
  annotate("text", x = 225, y = 4.12, label = "QA launch", size = 5)+
  theme(plot.title = element_text(hjust = 0.5))

plot_avg_rating

# Simple linear regression
model_1<- lm(rating ~ goodr*qa, data = GoodAma)
tidy(model_1, conf.int = TRUE)


# time fixed effects only model

model_2<- feols(rating ~ goodr*qa
                  |
                  year_month,
                data = GoodAma)

tidy(model_2, conf.int = TRUE)


# both time and book fixed effects model

model_3<- feols(rating ~ goodr*qa
                |
                year_month + asin,
                data = GoodAma)
tidy(model_3, conf.int = TRUE)

# time and book fixed effects with clustered SE
model_4<- feols(rating ~ goodr*qa
                |
                year_month + asin,
                cluster = c("asin"),
                data = GoodAma)
tidy(model_4, conf.int = TRUE)

# Get all results in summary table
install.packages("modelsummary")
library(modelsummary)

models <- list(
  "(1)" = model_1,
  "(2)" = model_2,
  "(3)" = model_3,
  "(4)" = model_4)

msummary(models)

# add variable names to the output
cm = c('goodr'    = 'Treated (1 = Goodreads)',
       'qa'    = 'After (1 = post QA)',
       'goodr:qa'    = 'Treatment Effect (treated x after)')

# include table notes and caption
notetable <- c(
  "Notes: Model 1 is the simple regression. Model 2 includes time fixed effects. Model 3 includes time and book fixed effects. Model 4 includes time and book fixed effects and also clustered standard errors by books.
  Standard errors are in parentheses.",
  "***Significant at the 1 percent level.",
  "**Significant at the 5 percent level.",
  "*Significant at the 10 percent level.")

titletable <- 'Table 1— All DiD Estimation Results'
# get summary of results
msummary(models,
         output = "table1.md",
         stars = c('*' = .1, '**' = 0.05, '***' = .01),
         estimate = "{estimate}",
         statistic = "[{std.error}]{stars}",
         coef_map = cm,
         gof_omit = 'AIC|BIC|RMSE|Within|Log.Lik.|Std.Errors',
         notes = notetable,
         title = titletable)
	# Load packages
	library(googledrive)
	library(data.table)
	library(fixest)
	library(broom)
	library(ggplot2)
	library(dplyr)
	library(zoo)

	# download data from google drive

	drive_deauth()

	file_id_gr<- "1voW5dWUiplIzWHa-KnJx0lh9lz7h1Wzf"
	file_id_am<- "1E0HG5ZCSF939rTE7gRPHUP6QFS43JXDm"

	drive_download(as_id(file_id_gr), overwrite = TRUE)
	drive_download(as_id(file_id_am), overwrite = TRUE)

	# load data
	goodreads_subset<- fread("gr_did.csv")
	amazon_subset<- fread("am_did.csv")

	### --- DATA PREPARATION ---###
	# create dummy "qa" which takes 0 if before may 21st, 2014 and 1 for after
	cutoff_date<- as.Date('2014-05-21')
	amazon_subset$qa<- ifelse(amazon_subset$unixReviewTime < cutoff_date, 0, 1)
	amazon_subset$goodr <- 0
	amazon_subset$goodr_after<- 0

	# now for goodreads
	goodreads_subset$goodr <- 1
	goodreads_subset$qa <- ifelse(goodreads_subset$date < cutoff_date, 0, 1)
	goodreads_subset$goodr_after<- goodreads_subset$qa * goodreads_subset$goodr

	# add time fixed effects to both datasets

	# check which dataset has the oldest observation
	amazon_subset %>% arrange(unixReviewTime) %>% head() # oldest: 1996-11-14
	goodreads_subset %>% arrange(date) %>% head() # oldest: 2007-01-06

	first_date<- as.Date('1996-11-14')
	# compute time fixed effect as number of months since first review
	amazon_subset$t<- (as.yearmon(amazon_subset$unixReviewTime) - as.yearmon(first_date))*12
	goodreads_subset$t<- (as.yearmon(goodreads_subset$date) - as.yearmon(first_date))*12


	# merge both datasets
	GoodAma <- select (amazon_subset,-c( reviewerID))
	goodreads_subset <- goodreads_subset[ , c("asin","t", "rating", "date", "year_month", "goodr", "qa", "goodr_after")]
	GoodAma <- dplyr::rename(GoodAma, rating = overall)
	GoodAma <- dplyr::rename(GoodAma, date = unixReviewTime)

	goodreads_subset$date<- as.Date(goodreads_subset$date) # resolve error of unmatched date class attribute
	goodreads_subset$year_month<- as.Date(as.yearmon(goodreads_subset$year_month)) # same as above to match classes of year_month
	GoodAma$date<- as.Date(GoodAma$date)
	GoodAma$year_month<- as.Date(as.yearmon(GoodAma$year_month))

	GoodAma <- rbind(GoodAma, goodreads_subset, fill = TRUE) # FINAL DATASET

	means<- GoodAma %>%
	group_by(goodr, qa) %>%
	summarize(rating=mean(rating)) %>%
	ungroup()

	print(means)

	# add labels
	library(dplyr)

	# Modify the labels using mutate and case_when
	means_modified <- means %>%
	mutate(
	goodr_label = case_when(
	goodr == 0 ~ "Amazon",
	goodr == 1 ~ "Goodreads"
	),
	qa_label = case_when(
	qa == 0 ~ "Before",
	qa == 1 ~ "After"
	)
	)

	# Print the modified table
	print(means_modified)



	# compute treatment effect
	# before-after difference for untreated, has time effect only
	bef_aft_untreated <- filter(means, goodr == 0, qa == 1)$rating - filter(means, goodr == 0, qa == 0)$rating

	# before-after for treated, has time and treatment effect
	bef_aft_treated<- filter(means, goodr == 1, qa == 1)$rating - filter(means, goodr == 1, qa == 0)$rating

	# difference-in-difference: take time +treatment effect, and remove the time effect
	did<- bef_aft_treated - bef_aft_untreated

	print(paste("Diff in Diff Estimate: " , did))
	### --- ANALYSIS --- ###

	# Check if parallel trends assumption holds by visualising
	temp<- GoodAma %>% filter(goodr == 1) %>% group_by(t) %>% summarize(mean_goodreads = mean(rating), n_obs = n()) %>% filter(n_obs > 100)
	temp2<- GoodAma %>% filter(goodr == 0) %>% group_by(t) %>% summarize(mean_amazon = mean(rating), n_obs = n()) %>% filter(n_obs >100)
	plot_data<- temp %>% left_join(temp2, by = "t")

	plot_avg_rating<- ggplot(plot_data, (aes(x = t)))+
	geom_line(aes(y = mean_goodreads), color = "red")+
	geom_line(aes(y = mean_amazon), color = "blue")+
	geom_vline(xintercept = 210, linetype = "longdash")+
	ylab("Mean Rating")+
	xlab("Number of periods") +
	theme_bw()+
	annotate("text", x = 175, y = 4.3, label = "Amazon", size = 5.5, color = "blue")+
	annotate("text", x = 164, y = 3.88, label = "Goodreads", size = 5.5, color = "red") +
	annotate("text", x = 225, y = 4.12, label = "QA launch", size = 5)+
	theme(plot.title = element_text(hjust = 0.5))

	plot_avg_rating

	# Simple linear regression
	model_1<- lm(rating ~ goodr*qa, data = GoodAma)
	tidy(model_1, conf.int = TRUE)



	# time fixed effects only model

	model_2<- feols(rating ~ goodr*qa
	\|
	year_month,
	data = GoodAma)

	tidy(model_2, conf.int = TRUE)


	# both time and book fixed effects model

	model_3<- feols(rating ~ goodr*qa
	\|
	year_month + asin,
	data = GoodAma)
	tidy(model_3, conf.int = TRUE)

	# time and book fixed effects with clustered SE
	model_4<- feols(rating ~ goodr*qa
	\|
	year_month + asin,
	cluster = c("asin"),
	data = GoodAma)
	tidy(model_4, conf.int = TRUE)

	# Get all results in summary table
	install.packages("modelsummary")
	library(modelsummary)

	models <- list(
	"(1)" = model_1,
	"(2)" = model_2,
	"(3)" = model_3,
	"(4)" = model_4)

	msummary(models)

	# add variable names to the output
	cm = c('goodr' = 'Treated (1 = Goodreads)',
	'qa' = 'After (1 = post QA)',
	'goodr:qa' = 'Treatment Effect (treated x after)')

	# include table notes and caption
	notetable <- c(
	"Notes: Model 1 is the simple regression. Model 2 includes time fixed effects. Model 3 includes time and book fixed effects. Model 4 includes time and book fixed effects and also clustered standard errors by books.
	Standard errors are in parentheses.",
	"***Significant at the 1 percent level.",
	"**Significant at the 5 percent level.",
	"*Significant at the 10 percent level.")

	titletable <- 'Table 1— All DiD Estimation Results'
	# get summary of results
	msummary(models,
	output = "table1.md",
	stars = c('' = .1, '' = 0.05, '**' = .01),
	estimate = "{estimate}",
	statistic = "[{std.error}]{stars}",
	coef_map = cm,
	gof_omit = 'AIC\|BIC\|RMSE\|Within\|Log.Lik.\|Std.Errors',
	notes = notetable,
	title = titletable)