Last active
June 12, 2023 11:11
-
-
Save srosh2000/f52600b76999e88f0fe316e8f23b419e to your computer and use it in GitHub Desktop.
2 x 2 difference-in-difference estimation example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load packages | |
library(googledrive) | |
library(data.table) | |
library(fixest) | |
library(broom) | |
library(ggplot2) | |
library(dplyr) | |
library(zoo) | |
# download data from google drive | |
drive_deauth() | |
file_id_gr<- "1voW5dWUiplIzWHa-KnJx0lh9lz7h1Wzf" | |
file_id_am<- "1E0HG5ZCSF939rTE7gRPHUP6QFS43JXDm" | |
drive_download(as_id(file_id_gr), overwrite = TRUE) | |
drive_download(as_id(file_id_am), overwrite = TRUE) | |
# load data | |
goodreads_subset<- fread("gr_did.csv") | |
amazon_subset<- fread("am_did.csv") | |
### --- DATA PREPARATION ---### | |
# create dummy "qa" which takes 0 if before may 21st, 2014 and 1 for after | |
cutoff_date<- as.Date('2014-05-21') | |
amazon_subset$qa<- ifelse(amazon_subset$unixReviewTime < cutoff_date, 0, 1) | |
amazon_subset$goodr <- 0 | |
amazon_subset$goodr_after<- 0 | |
# now for goodreads | |
goodreads_subset$goodr <- 1 | |
goodreads_subset$qa <- ifelse(goodreads_subset$date < cutoff_date, 0, 1) | |
goodreads_subset$goodr_after<- goodreads_subset$qa * goodreads_subset$goodr | |
# add time fixed effects to both datasets | |
# check which dataset has the oldest observation | |
amazon_subset %>% arrange(unixReviewTime) %>% head() # oldest: 1996-11-14 | |
goodreads_subset %>% arrange(date) %>% head() # oldest: 2007-01-06 | |
first_date<- as.Date('1996-11-14') | |
# compute time fixed effect as number of months since first review | |
amazon_subset$t<- (as.yearmon(amazon_subset$unixReviewTime) - as.yearmon(first_date))*12 | |
goodreads_subset$t<- (as.yearmon(goodreads_subset$date) - as.yearmon(first_date))*12 | |
# merge both datasets | |
GoodAma <- select (amazon_subset,-c( reviewerID)) | |
goodreads_subset <- goodreads_subset[ , c("asin","t", "rating", "date", "year_month", "goodr", "qa", "goodr_after")] | |
GoodAma <- dplyr::rename(GoodAma, rating = overall) | |
GoodAma <- dplyr::rename(GoodAma, date = unixReviewTime) | |
goodreads_subset$date<- as.Date(goodreads_subset$date) # resolve error of unmatched date class attribute | |
goodreads_subset$year_month<- as.Date(as.yearmon(goodreads_subset$year_month)) # same as above to match classes of year_month | |
GoodAma$date<- as.Date(GoodAma$date) | |
GoodAma$year_month<- as.Date(as.yearmon(GoodAma$year_month)) | |
GoodAma <- rbind(GoodAma, goodreads_subset, fill = TRUE) # FINAL DATASET | |
means<- GoodAma %>% | |
group_by(goodr, qa) %>% | |
summarize(rating=mean(rating)) %>% | |
ungroup() | |
print(means) | |
# add labels | |
library(dplyr) | |
# Modify the labels using mutate and case_when | |
means_modified <- means %>% | |
mutate( | |
goodr_label = case_when( | |
goodr == 0 ~ "Amazon", | |
goodr == 1 ~ "Goodreads" | |
), | |
qa_label = case_when( | |
qa == 0 ~ "Before", | |
qa == 1 ~ "After" | |
) | |
) | |
# Print the modified table | |
print(means_modified) | |
# compute treatment effect | |
# before-after difference for untreated, has time effect only | |
bef_aft_untreated <- filter(means, goodr == 0, qa == 1)$rating - filter(means, goodr == 0, qa == 0)$rating | |
# before-after for treated, has time and treatment effect | |
bef_aft_treated<- filter(means, goodr == 1, qa == 1)$rating - filter(means, goodr == 1, qa == 0)$rating | |
# difference-in-difference: take time +treatment effect, and remove the time effect | |
did<- bef_aft_treated - bef_aft_untreated | |
print(paste("Diff in Diff Estimate: " , did)) | |
### --- ANALYSIS --- ### | |
# Check if parallel trends assumption holds by visualising | |
temp<- GoodAma %>% filter(goodr == 1) %>% group_by(t) %>% summarize(mean_goodreads = mean(rating), n_obs = n()) %>% filter(n_obs > 100) | |
temp2<- GoodAma %>% filter(goodr == 0) %>% group_by(t) %>% summarize(mean_amazon = mean(rating), n_obs = n()) %>% filter(n_obs >100) | |
plot_data<- temp %>% left_join(temp2, by = "t") | |
plot_avg_rating<- ggplot(plot_data, (aes(x = t)))+ | |
geom_line(aes(y = mean_goodreads), color = "red")+ | |
geom_line(aes(y = mean_amazon), color = "blue")+ | |
geom_vline(xintercept = 210, linetype = "longdash")+ | |
ylab("Mean Rating")+ | |
xlab("Number of periods") + | |
theme_bw()+ | |
annotate("text", x = 175, y = 4.3, label = "Amazon", size = 5.5, color = "blue")+ | |
annotate("text", x = 164, y = 3.88, label = "Goodreads", size = 5.5, color = "red") + | |
annotate("text", x = 225, y = 4.12, label = "QA launch", size = 5)+ | |
theme(plot.title = element_text(hjust = 0.5)) | |
plot_avg_rating | |
# Simple linear regression | |
model_1<- lm(rating ~ goodr*qa, data = GoodAma) | |
tidy(model_1, conf.int = TRUE) | |
# time fixed effects only model | |
model_2<- feols(rating ~ goodr*qa | |
| | |
year_month, | |
data = GoodAma) | |
tidy(model_2, conf.int = TRUE) | |
# both time and book fixed effects model | |
model_3<- feols(rating ~ goodr*qa | |
| | |
year_month + asin, | |
data = GoodAma) | |
tidy(model_3, conf.int = TRUE) | |
# time and book fixed effects with clustered SE | |
model_4<- feols(rating ~ goodr*qa | |
| | |
year_month + asin, | |
cluster = c("asin"), | |
data = GoodAma) | |
tidy(model_4, conf.int = TRUE) | |
# Get all results in summary table | |
install.packages("modelsummary") | |
library(modelsummary) | |
models <- list( | |
"(1)" = model_1, | |
"(2)" = model_2, | |
"(3)" = model_3, | |
"(4)" = model_4) | |
msummary(models) | |
# add variable names to the output | |
cm = c('goodr' = 'Treated (1 = Goodreads)', | |
'qa' = 'After (1 = post QA)', | |
'goodr:qa' = 'Treatment Effect (treated x after)') | |
# include table notes and caption | |
notetable <- c( | |
"Notes: Model 1 is the simple regression. Model 2 includes time fixed effects. Model 3 includes time and book fixed effects. Model 4 includes time and book fixed effects and also clustered standard errors by books. | |
Standard errors are in parentheses.", | |
"***Significant at the 1 percent level.", | |
"**Significant at the 5 percent level.", | |
"*Significant at the 10 percent level.") | |
titletable <- 'Table 1— All DiD Estimation Results' | |
# get summary of results | |
msummary(models, | |
output = "table1.md", | |
stars = c('*' = .1, '**' = 0.05, '***' = .01), | |
estimate = "{estimate}", | |
statistic = "[{std.error}]{stars}", | |
coef_map = cm, | |
gof_omit = 'AIC|BIC|RMSE|Within|Log.Lik.|Std.Errors', | |
notes = notetable, | |
title = titletable) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment