sdeture/gist:0346e40fae0ca4308a3cf600a893250e Secret

## gistfile1.txt
###Replication of Allen Larson and Sloan 2013###


#IMPORTANT
#Run all the libraries and functions up to line 204 in "libraries and functions for ALS replication"

##load in data (FUNDA universe 1950-present; CRSP universe 1950-present; and merged link table)
FUNDA <- read_csv("~/Documents/Research/Specific Accruals/FUNDA_f.csv")
CRSP <- read_csv("~/Documents/Research/Specific Accruals/CRSP_f.csv")
LINK <- read_csv("~/Documents/Research/Specific Accruals/linktable.csv")

#Convert to data.table format for faster computing
CRSP<-data.table(CRSP)
FUNDA<-data.table(FUNDA)
LINK<-data.table(LINK)

#make a copy of data
FUNDA_original <- FUNDA
CRSP_original <- CRSP
LINK_original <- LINK

#set this variable sets the number of clusters for the main cluster analysis
k<-3

#Make a smaller version of data to test code, if desired
#FUNDA<-FUNDA_original[is.element(tic, c("MSFT","AAPL"))]
#CRSP<-CRSP_original[is.element(TICKER, c("MSFT","AAPL"))]

#remove some unneeded variables
#do not use sich because the data is not there before the 80s
FUNDA<-FUNDA[, .(gvkey, datadate, fyear, fyr, cusip, at, dlc, txp, che,lct,act,oibdp,sic,fic,popsrc, sale, emp,
                 prcc_f, csho, dltt, pstkl, txditc, xad, epspx, adjex_f)]
CRSP<-CRSP[, .(RET, PERMNO, date, CUSIP, EXCHCD)]
LINK<- LINK[, .(LPERMNO, LINKTYPE, gvkey, LINKDT, LINKENDDT)]


##create annual return var
##gives return over this month and past 12 months
CRSP[, RET:=as.numeric(RET)]
CRSP[,num_mo:=.N,PERMNO]
CRSP<-CRSP[num_mo>12]
CRSP[, ann_return := rollapplyr((1+RET), 12, prod), by = PERMNO]
CRSP[, ann_return := lagged(ann_return, 11), by = PERMNO]

##create MOM6 var
##this gives return over this month and previous 5]
CRSP[,num_mo:=.N,PERMNO]
CRSP<-CRSP[num_mo>12]
CRSP[, MOM6 := rollapplyr((1+RET), 6, prod), by = PERMNO]
CRSP[, MOM6 := lagged(MOM6, 17), by = PERMNO]

##create MOM36 var
##this gives return over this month and previous 35]
CRSP[,num_mo:=.N,PERMNO]
CRSP<-CRSP[num_mo>36]
CRSP[, MOM36 := rollapplyr((1+RET), 36, prod), by = PERMNO]
CRSP[, MOM36 := lagged(MOM36, 47), by = PERMNO]


# prepare to merge CRSP and FUNDA by creating a match_date variable to match on (note that
# the FUNDA matchdate is 16months ahead of the CRSP matchdate...this is because each CRSP
# row has return for past 12 months.  For my analysis, I want returns for the 12 months
# start 4 months after FUNDA fiscal yearend.  Fiscal Year end is defined in the third line,
# fyear+fyr).  Take fiscal year end, add 16 months to it, and take return over previous
# 12 months from that date.
CRSP[,data_year := sub("^(\\d{4}).*$", "\\1", date)]
CRSP[,match_date := as.Date(as.character(date), "%Y%m%d")]
CRSP[,match_date := format(match_date, "%Y %m")]
FUNDA[,match_date := as.Date(paste(sprintf("%02d%02d", fyear, fyr), rep("01",length(fyear)), sep=""), "%Y%m%d")]
month(FUNDA$match_date)=month(FUNDA$match_date)+16
FUNDA[,match_date := format(match_date, "%Y %m")]

# Get Permno into FUNDA by merging with LINK
LINK<-LINK[is.element(LINKTYPE, c('LU', 'LC', 'LD', 'LF', 'LN', 'LO', 'LS', 'LX'))]
FUNDA <- merge(FUNDA, LINK, by="gvkey", all.x = T, allow.cartesian = T)
FUNDA <- FUNDA[(LINKDT <= datadate) | (LINKDT=="B")]
FUNDA <- FUNDA[(datadate <= LINKENDDT) | (LINKDT=="E")]
FUNDA[, PERMNO:=LPERMNO]
FUNDA<-unique(FUNDA, by = c("datadate", "PERMNO"))
FUNDA<-FUNDA[, c("LINKDT","LINKENDDT", "LINKTYPE", "LPERMNO"):=NULL]

#save a copy
FUNDA_01<-FUNDA

#merge CRSP and COMPUSTAT data
FUNDA <- merge(FUNDA, CRSP,by=c("PERMNO", "match_date"),all.x = T)
FUNDA<-unique(FUNDA, by = c("match_date", "PERMNO"))

#save a copy
FUNDA1<-FUNDA

##define ACC variable
FUNDA[,avg_at:=(at+lagged(at,1))/2,by=gvkey]
FUNDA[dlc == NA, dlc := 0]
FUNDA[txp == NA, txp := 0]
FUNDA[,ACC:= (delta(act,1)-delta(che,1)-delta(lct,1)+delta(dlc,1)+delta(txp,1)),by=gvkey]
FUNDA[,ACC:= ACC/avg_at, by=gvkey]
FUNDA[,ann_return:=ann_return-1]

#save a copy
FUNDA1_0<-FUNDA

##define INC and CF variables
FUNDA[,INC:=oibdp/avg_at]
FUNDA[,CF:=INC-ACC]

#Winsorize
FUNDA[, ACC:=winsorize(ACC)]
FUNDA[, INC:=winsorize(INC)]
FUNDA[, CF:=winsorize(CF)]

#define leads and lags of INC and CF variables
FUNDA[,lead_INC:=lead(INC,1), by=gvkey]
FUNDA[,lead_CF:=lead(CF,1), by=gvkey]
FUNDA[,lead_ACC:=lead(ACC,1), by=gvkey]

#save a copy of data before moving forward
FUNDA_retro<-FUNDA

#restrict sample to just their sample
FUNDA<-FUNDA[!(6000<sic & sic<6999)]
FUNDA<-FUNDA[fic=="USA"]
FUNDA<-FUNDA[popsrc=="D"]
FUNDA<-FUNDA[(1961<fyear & fyear<2009)]
FUNDA2<-FUNDA ##save data
FUNDA<-FUNDA[!(EXCHCD==0)]
FUNDA3<-FUNDA ##save data
FUNDA[, change:=length(unique(fyr)), gvkey] #number of unique fyear months for each firm


FUNDA<-FUNDA[!(is.na(ACC+CF+ann_return+INC+lead_INC+lead_ACC+lead_CF))]


##NOTE: Each section table code outputs latex code to generate that table

###TABLE 1###

table1<-data.table(FUNDA[,.(ACC,CF,INC,ann_return)])
pearson.correlation.matrix <- cor(FUNDA[,.(ACC, lead_ACC, CF, lead_CF, INC, lead_INC, ann_return)],method="pearson")
spearman.correlation.matrix <- cor(FUNDA[,.(ACC, lead_ACC, CF, lead_CF, INC, lead_INC, ann_return)],method="spearman")
correlation.matrix<-upper.tri(pearson.correlation.matrix)*pearson.correlation.matrix + lower.tri(spearman.correlation.matrix)*spearman.correlation.matrix
stargazer(table1, summary.stat = c("mean", "sd", "p25", "median", "p75"), title="Univariate Statistics")
stargazer(correlation.matrix, title="Pair-wise correlations--Pearson (above) and Spearman (below) diagonal")


###TABLE 2###

#make quantiles of ACC CF INC and leads using user defined function (in libraries and functions file)
FUNDA[, quint_INC:=rank_data(INC,5), fyear]
FUNDA[, quint_leadINC:=rank_data(lead_INC,5), fyear]
FUNDA[, quint_CF:=rank_data(CF,5), fyear]
FUNDA[, quint_leadCF:=rank_data(lead_CF,5), fyear]
FUNDA[, quint_ACC:=rank_data(ACC,5), fyear]
FUNDA[, quint_leadACC:=rank_data(lead_ACC,5), fyear]

##make a function that creates each panel of table 2
##input variables quintXXX and quintXXXlead for x and y
##input variables XXX and XXXlead for xx and yy
##input title of table and notes as ttt and nnn
tab2<-function(x, y, ttt,nnn) {

  #make a basic two way table
  table2a<-table(x, y)
  table2a<-round(100*table2a/sum(table2a),2)

  #make a list containing the two tway table above redone for every year
  A<-list()
  for (i in min(FUNDA$fyear):max(FUNDA$fyear)) {
    name <- paste('year:',i,sep='')
    A[[name]]<-as.matrix(table(x[FUNDA$fyear==i],y[FUNDA$fyear==i]))
  }

  A<-lapply(A, function(z) 100*z/sum(z))

  #find element wise sd of that list
  sd_thing<-round(apply(array(unlist(A), c(5, 5, length(A))), c(1,2), sd),2)

  ##make a table of 1 (if significantly different from 4 at 1% level t test) and 0 otherwise
  stars<-(abs((table2a-4)/(sd_thing/sqrt(length(A))))>2.7)+0
  stars<-replace(as.vector(stars), stars==1, "*")
  stars<-replace(as.vector(stars), stars==0, "")
  stars<-matrix(stars, nrow=5, ncol=5)
  bold<-(table2a>4)+0
  boldpre<-replace(as.vector(bold), bold==1, "||")
  boldpre<-replace(as.vector(boldpre), bold==0, "")
  boldpre<-matrix(boldpre, nrow=5, ncol=5)
  boldpost<-replace(as.vector(bold), bold==1, "||")
  boldpost<-replace(as.vector(boldpost), bold==0, "")
  boldpost<-matrix(boldpost, nrow=5, ncol=5)

  ##add percentage marks and stars to the two way table, then make latex code
  table2af<-matrix(paste(boldpre, table2a, "%", boldpost, stars,sep=""), nrow=5, ncol=5)
  stargazer(table2af, title=ttt, colnames=T,rownames=T)
}

#panel a
tab2(FUNDA$quint_INC,FUNDA$quint_leadINC, "Distribution of firm-years across consecutive annual income quintiles","")
#panel b
tab2(FUNDA$quint_CF,FUNDA$quint_leadCF, "Distribution of firm-years across consecutive annual cash flow quintiles")
#panel c
tab2(FUNDA$quint_ACC,FUNDA$quint_leadACC, "Distribution of firm-years across consecutive annual accrual quintiles")


###TABLE 3###

##PANEL A##

#define more variables, and save a copy of the data
FUNDA[,lagged_CF:=lagged(CF,1), by=gvkey]

FUNDA[,sales_growth:=(sale-lagged(sale,1))/lagged(sale,1),  by=gvkey]
FUNDA[sales_growth==Inf, sales_growth:=NA]
FUNDA[,sales_growth:=wins99(sales_growth), by=fyear]

FUNDA[,emp_growth:=(emp-lagged(emp,1))/lagged(emp,1),  by=gvkey]
FUNDA[emp_growth==Inf, emp_growth:=NA]
FUNDA[,emp_growth:=wins99(emp_growth), by=fyear]

FUNDAt3<-FUNDA

#define two digit SIC...note that we have to remove sic groups that have
#too few observations to run the model to avoid errors in the lapply stage
FUNDA[,sic_group := sub("^(\\d{2}).*$", "\\1", sic)]
FUNDA[,sic_size:=.N,sic_group]

#make list of large enough sic groups, then run the model on each group.
#the output is a LIST of models, stored as tab3_lms
vec_of_sic<-unique(FUNDA[sic_size>30]$sic_group)
tab3_lms <- lapply(1:length(vec_of_sic),
                 function(x)
                   lm(ACC~
                        sales_growth+emp_growth+
                        lagged_CF+CF+lead_CF,
                      FUNDA[sic_group==vec_of_sic[x]], na.action=na.omit))
##produce a LIST of reduced models
tab3_lms_mddgrowth <- lapply(1:length(vec_of_sic),
                   function(x)
                     lm(ACC~
                          sales_growth+emp_growth,
                        FUNDA[sic_group==vec_of_sic[x]], na.action=na.omit))
##produce a LIST of reduced models
tab3_lms_mddmatch <- lapply(1:length(vec_of_sic),
                   function(x)
                     lm(ACC~
                          lagged_CF+CF+lead_CF,
                        FUNDA[sic_group==vec_of_sic[x]], na.action=na.omit))
#create partial R^2 for each model
partial_R_growth<-lapply(1:length(tab3_lms),
                         function(x)
                           rsq.partial(tab3_lms[[x]], tab3_lms_mddgrowth[[x]],type = "sse")$partial.rsq)
partial_R_match<-lapply(1:length(tab3_lms),
                        function(x)
                          rsq.partial(tab3_lms[[x]], tab3_lms_mddmatch[[x]],type = "sse")$partial.rsq)

#pull coefficients, partial Rsquares, Rsquare, and TStats from each model
tab3_summaries<-lapply(1:length(tab3_lms), function(x)
  append(
  append(
    append(
      append(tab3_lms[[x]]$coefficients,
             partial_R_growth[[x]]),
              partial_R_match[[x]]),
               summary(tab3_lms[[x]])$r.square),
                summary(tab3_lms[[x]])$coefficients[,3]
    ))

##find mean and percentiles,accross industries, of the summary stats produced above
mean_of_lm<-round(apply(array(unlist(tab3_summaries), c(15, 1, length(tab3_summaries))), c(1,2), mean),3)
q25_of_lm<-round(apply(array(unlist(tab3_summaries), c(15, 1, length(tab3_summaries))), c(1,2), function(x) quantile(x,.25)),3)
q50_of_lm<-round(apply(array(unlist(tab3_summaries), c(15, 1, length(tab3_summaries))), c(1,2), function(x) quantile(x,.50)),3)
q75_of_lm<-round(apply(array(unlist(tab3_summaries), c(15, 1, length(tab3_summaries))), c(1,2), function(x) quantile(x,.75)),3)

##bind the results into a table.  Note that the first 9 elements of the above vectors relate to
##coefficients and R^2,and the last 6 elements of each vector relate to TStats
tab3<-cbind(mean_of_lm[1:9], append(mean_of_lm[10:15], c("", "","")), q25_of_lm[1:9], q50_of_lm[1:9],q75_of_lm[1:9])
rownames(tab3) <- c("Intercept", "Sales Growth","Employee Growth", "Lagged Cash Flow", "Cash Flow", "Lead Cash Flow", "Partial R^2 Growth", "Partial R^2 Match", "R^2")
rownames(tab3, do.NULL = F)
colnames(tab3)<- c("Mean", "T-Statistic", "25th Percentile", "Median","75th Percentile")

##make the latex code for the table3 panel A
stargazer(tab3, title="Distribution of coefficients for the modified Dechow and Dichev (MDD) Model")

##PANEL B##

#Calculate fitted values for MDD model
silent<-lapply(1:length(vec_of_sic),
       function(x)
         FUNDA[sic_group==vec_of_sic[x],
               MDDGOOD:=tab3_lms[[x]]$coefficients[1]+
                 sales_growth*tab3_lms[[x]]$coefficients[2]+
                 emp_growth*tab3_lms[[x]]$coefficients[3]+
                 lagged_CF*tab3_lms[[x]]$coefficients[4]+
                 CF*tab3_lms[[x]]$coefficients[5]+
                 lead_CF*tab3_lms[[x]]$coefficients[6]
                 ])
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDGROWTH:=
                         sales_growth*tab3_lms[[x]]$coefficients[2]+
                         emp_growth*tab3_lms[[x]]$coefficients[3]
                       ])
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDMATCH:=
                         lagged_CF*tab3_lms[[x]]$coefficients[4]+
                         CF*tab3_lms[[x]]$coefficients[5]+
                         lead_CF*tab3_lms[[x]]$coefficients[6]
                       ])
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDERROR:=ACC-(tab3_lms[[x]]$coefficients[1]+
                         sales_growth*tab3_lms[[x]]$coefficients[2]+
                         emp_growth*tab3_lms[[x]]$coefficients[3]+
                         lagged_CF*tab3_lms[[x]]$coefficients[4]+
                         CF*tab3_lms[[x]]$coefficients[5]+
                         lead_CF*tab3_lms[[x]]$coefficients[6])
                       ])

#Create lagged values for MDD fitted values
FUNDA[,lagged_MDDGOOD:=lagged(MDDGOOD,1), by=gvkey]
FUNDA[,lagged_MDDGROWTH:=lagged(MDDGROWTH,1), by=gvkey]
FUNDA[,lagged_MDDMATCH:=lagged(MDDMATCH,1), by=gvkey]
FUNDA[,lagged_MDDERROR:=lagged(MDDERROR,1), by=gvkey]
FUNDA[,lead_MDDGOOD:=lead(MDDGOOD,1), by=gvkey]
FUNDA[,lead_MDDGROWTH:=lead(MDDGROWTH,1), by=gvkey]
FUNDA[,lead_MDDMATCH:=lead(MDDMATCH,1), by=gvkey]
FUNDA[,lead_MDDERROR:=lead(MDDERROR,1), by=gvkey]

#compute correlations and pvalues
CPvalues<-corr.test(FUNDA[,.(MDDGOOD,MDDGROWTH,MDDMATCH,MDDERROR,lagged_MDDGOOD,lagged_MDDGROWTH,lagged_MDDMATCH),
                          lagged_MDDERROR])

#organize into table, then round, then add parenthesis for pvalues, then add row labels
tab3b<-rbind(CPvalues$r[2,6], CPvalues$p[2,6], CPvalues$r[3,7], CPvalues$p[3,7],
             CPvalues$r[4,8], CPvalues$p[4,8], CPvalues$r[5,1], CPvalues$p[5,1])
tab3b<-round(tab3b, 3)
auto_correlation<-paste(rep(c("","("),4), tab3b, rep(c("",")"),4),sep="")
tab3b<-as.data.frame(auto_correlation,nrow=16)
rownames(tab3b, do.NULL = F)
rownames(tab3b) <- c("MDDGOOD", "", "MDDGROWTH"," " ,"MDDMATCH", "   ","MDDERROR","    ")

#make latex code for table 3b
stargazer(as.matrix(tab3b), title="Autocorrelations of good accruals and accrual estimation error")


###TABLE 4###

##Panel A##

#Define variables
FUNDA[,lagged_ACC:=lagged(ACC,1), by=gvkey]

##function inputs a number of clusters, and outputs a table 4 panel A
make.tab4<-function(x) {
  m1 <- flexmix(ACC ~ lagged_ACC, data = FUNDA[(!is.na(ACC))&(!is.na(lagged_ACC))], k = x)
  tab4<-cbind( t(as.matrix(parameters(m1))),
               table(clusters(m1))/sum(table(clusters(m1))), append(AIC(m1),rep(NA,x-1)))
  colnames(tab4, do.NULL = F)
  colnames(tab4) <- c("alpha", "beta","" ,"cluster size", "AIC")
  return(round(tab4[,c(1,2,4,5)],3))
}

#bind together outputs from above to make a pretty table, then add row names, then make latex code for the table
tab4<-rbind(rep("",4), make.tab4(1), rep("",4), make.tab4(2), rep("",4), make.tab4(3))
row.names(tab4)<-c("One Cluster","Cluster 1","Two Clusters","Cluster 1","Cluster 2","Three Clusters","Cluster 1", "Cluster 2", "Cluster 3")
stargazer(tab4, title="Latent class mixture models foraccrual autoregressions: cluster analysis")

##Panel B##
#Make a version of FUNDA that includes mixture classes for use in tables 4 panel b
FUNDAt4<-FUNDA[(!is.na(ACC))&(!is.na(lagged_ACC))]
m2<-flexmix(ACC ~ lagged_ACC, data = FUNDAt4, k = 2)
m3<-flexmix(ACC ~ lagged_ACC, data = FUNDAt4, k = 3)
m4<-flexmix(ACC ~ lagged_ACC, data = FUNDAt4, k = 4)
FUNDAt4[,m2cluster:=clusters(m2)]
FUNDAt4[,m3cluster:=clusters(m3)]
FUNDAt4[,m4cluster:=clusters(m4)]

##this function inputs a number, k, the number of clusters...and outputs a list of k linear models:
##the main model evaluated within each cluster (have to hand set k=3, done at beggining of this file)
make.tab4b<- function(k) { return(
                            lapply(1:k,
                             function(x)
                               lm1<-lm(ACC~sales_growth+emp_growth+lagged_CF+CF+lead_CF,
                                       FUNDAt4[eval(parse(text=paste("m",k,"cluster", sep="")))==x], na.action=na.omit))
) }

##this function inputs a number, k, the number of clusters...and outputs a list of k linear models:
##the reduced growth model evaluated within each cluster (have to hand set k=3, done at beggining of this file)
make.tab4bGROWTH<- function(k) { return(
  lapply(1:k,
         function(x)
           lm1<-lm(ACC~sales_growth+emp_growth,
                   FUNDAt4[eval(parse(text=paste("m",k,"cluster", sep="")))==x], na.action=na.omit))
) }

##this function inputs a number, k, the number of clusters...and outputs a list of k linear models:
##the reduced match model evaluated within each cluster (have to hand set k=3, done at beggining of this file)
make.tab4bMATCH<- function(k) { return(
  lapply(1:k,
         function(x)
           lm1<-lm(ACC~lagged_CF+CF+lead_CF,
                   FUNDAt4[eval(parse(text=paste("m",k,"cluster", sep="")))==x], na.action=na.omit))
) }


##THIS MAIN FUNCTION inputs the number of clusters, and calls the previous function to make models, then organizes the results of that
##model into table 4b.  This if the function you should use to replicate table4b if you come back to this project.
make.tab4b.final <- function(num_cluster) {
  tab4_lms <-
    make.tab4b(num_cluster) #calls a previous function to creates linear models
  tab4_lms_mddgrowth<-make.tab4bGROWTH(num_cluster) #calls a previous function to creates reduced version linear models
  tab4_lms_mddmatch<-make.tab4bMATCH(num_cluster) #calls a previous function to creates reduced version linear models
  partial_R_growth<-lapply(1:length(tab4_lms), #calculates partial Rsq for each linear model
                           function(x)
                             rsq.partial(tab4_lms[[x]], tab4_lms_mddgrowth[[x]],type = "sse")$partial.rsq)
  partial_R_match<-lapply(1:length(tab4_lms), #calculates partial Rsq for each linear model
                          function(x)
                            rsq.partial(tab4_lms[[x]], tab4_lms_mddmatch[[x]],type = "sse")$partial.rsq)

  tab4b_summaries <-
    lapply(1:length(tab4_lms), function(x)
      #pull summary stats from each linear model in the above list, so we can put this info into a table
      append(
        append(
          append(
            append(tab4_lms[[x]]$coefficients,
                   partial_R_growth[[x]]),
            partial_R_match[[x]]
          ),
          summary(tab4_lms[[x]])$r.square
        ),
        coeftest.cluster(data.frame(FUNDAt4[eval(parse(text=paste("m",k,"cluster", sep="")))==x]), tab4_lms[[x]], cluster1="gvkey",cluster2 = "fyear")[,3]
      ))
  table4bmain <- t(matrix(unlist(tab4b_summaries), c(15, length(tab4b_summaries)))) ##organize summary into matrix
  table4bmain<-round(table4bmain,3)
  table4b<-table4bmain[,1:9] #keep just coefficient and r2 info from matrix
  colnames(table4b, do.NULL=FALSE) ##add columns and make coefficient/r2 matrix pretty **
  colnames(table4b) <-
    c("intcpt",
      "SGR",
      "EMPGR",
      "LagCF",
      "CF",
      "LeadCF",
      "R^2 Growth",
      "R^2 Match",
      "R^2")
  rownames(table4b) <-
    as.vector(unlist(lapply(1:num_cluster, function(x)
      paste("Cluster", x))))

  table4bt<-table4bmain[,10:15] #keep just t-stat info from main matrix
  table4bt<-matrix(paste("(",table4bt,")",sep=""),ncol=6)
  colnames(table4bt) <-
    c("intcpt",
      "SGR",
      "EMPGR",
      "LagCF",
      "CF",
      "LeadCF")
  rownames(table4bt) <-
    as.vector(unlist(lapply(1:num_cluster, function(x)
      paste("Cluster", x))))


  table4b<-cbind(Cluster=row.names(table4b), table4b) #organize into table
  table4bt<-cbind(Cluster=row.names(table4bt), table4bt)
  table4b<-rbindlist(list(data.table(table4b),data.table(table4bt)), fill=T)[order(Cluster)]
  table4b[is.na(table4b)]<-""
  table4b[2*(1:num_cluster)]$Cluster<-""
  table4b<-as.matrix(table4b)
  return(table4b)
}

stargazer(make.tab4b.final(3), summary= FALSE,title="Regressions of accruals on growth and cash flow variables by accrual cluster",column.sep.width="2pt" )


####TABLE 5####
#step one, create a version of MDDMATCH/GOOD/ACCRUALS/ERROR without leadCF in the model
tab5_lms <- lapply(1:length(vec_of_sic),
                   function(x)
                     lm(ACC~
                          sales_growth+emp_growth+
                          lagged_CF+CF,
                        FUNDA[sic_group==vec_of_sic[x]], na.action=na.omit))
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDGOOD5:=tab5_lms[[x]]$coefficients[1]+
                         sales_growth*tab5_lms[[x]]$coefficients[2]+
                         emp_growth*tab5_lms[[x]]$coefficients[3]+
                         lagged_CF*tab5_lms[[x]]$coefficients[4]+
                         CF*tab5_lms[[x]]$coefficients[5]
                       ])
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDGROWTH5:=
                         sales_growth*tab5_lms[[x]]$coefficients[2]+
                         emp_growth*tab5_lms[[x]]$coefficients[3]
                       ])
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDMATCH5:=
                         lagged_CF*tab5_lms[[x]]$coefficients[4]+
                         CF*tab5_lms[[x]]$coefficients[5]
                       ])
silent<-lapply(1:length(vec_of_sic),
               function(x)
                 FUNDA[sic_group==vec_of_sic[x],
                       MDDERROR5:=ACC-(tab5_lms[[x]]$coefficients[1]+
                                        sales_growth*tab5_lms[[x]]$coefficients[2]+
                                        emp_growth*tab5_lms[[x]]$coefficients[3]+
                                        lagged_CF*tab5_lms[[x]]$coefficients[4]+
                                        CF*tab5_lms[[x]]$coefficients[5])
                       ])

#step two, create winsorized variables if they havent been used already MMDMATCH5/MDDGROWTH5 etc
#are the versions of MDD components calculated without leadCF

FUNDA[,MDDMATCH:=winsorize(MDDMATCH)]
FUNDA[,MDDGROWTH:=winsorize(MDDGROWTH)]
FUNDA[,MDDGOOD:=winsorize(MDDGOOD)]
FUNDA[,MDDERROR:=winsorize(MDDERROR)]

FUNDA[,MDDMATCH5:=winsorize(MDDMATCH5)]
FUNDA[,MDDGROWTH5:=winsorize(MDDGROWTH5)]
FUNDA[,MDDGOOD5:=winsorize(MDDGOOD5)]
FUNDA[,MDDERROR5:=winsorize(MDDERROR5)]

#step three, define models (one version to get r square, one version to get GOW t-values)
tab5lm1r<-lm(lead_INC~CF+ACC, FUNDA)
tab5lm1<-coeftest.cluster(data.frame(FUNDA), lm(lead_INC~CF+ACC, FUNDA), cluster1="gvkey",cluster2="fyear")
tab5lm2r<-lm(lead_INC~CF+MDDGOOD+MDDERROR, FUNDA)
tab5lm2<-coeftest.cluster(data.frame(FUNDA), lm(lead_INC~CF+MDDGOOD+MDDERROR, FUNDA), cluster1="gvkey",cluster2="fyear")
tab5lm3r<-lm(lead_INC~CF+MDDGROWTH+MDDMATCH+MDDERROR, FUNDA)
tab5lm3<-coeftest.cluster(data.frame(FUNDA), lm(lead_INC~CF+MDDGROWTH+MDDMATCH+MDDERROR, FUNDA), cluster1="gvkey",cluster2="fyear")
tab5lm4r<-lm(lead_INC~CF+MDDGOOD5+MDDERROR5, FUNDA)
tab5lm4<-coeftest.cluster(data.frame(FUNDA), lm(lead_INC~CF+MDDGOOD5+MDDERROR5, FUNDA), cluster1="gvkey",cluster2="fyear")
tab5lm5r<-lm(lead_INC~CF+MDDGROWTH5+MDDMATCH5+MDDERROR5, FUNDA)
tab5lm5<-coeftest.cluster(data.frame(FUNDA), lm(lead_INC~CF+MDDGROWTH5+MDDMATCH5+MDDERROR5, FUNDA), cluster1="gvkey",cluster2="fyear")

#step four, put it all into a table
tab5<-smartbind(tab5lm1[,1], round(tab5lm1[,3],2), tab5lm2[,1], round(tab5lm2[,3],2), tab5lm3[,1], round(tab5lm3[,3],2),tab5lm4[,1], round(tab5lm4[,3],2),tab5lm5[,1], round(tab5lm5[,3],2))
tab5[7:10,4:7]<-tab5[7:10,8:11]
tab5<-tab5[,1:7]
adj_r_sq<-c(summary(tab5lm1r)$adj.r.squared,NA,summary(tab5lm2r)$adj.r.squared, NA, summary(tab5lm3r)$adj.r.squared, NA, summary(tab5lm4r)$adj.r.squared, NA, summary(tab5lm5r)$adj.r.squared, NA)
tab5<-cbind(tab5,adj_r_sq)
tab5<-round(tab5,3)
parenth<-rbind(rep(FALSE,8), !is.na(tab5[2,]),rep(FALSE,8), !is.na(tab5[4,]),rep(FALSE,8), !is.na(tab5[6,]), rep(FALSE,8), !is.na(tab5[8,]), rep(FALSE,8), !is.na(tab5[10,]))
parenthpre<-replace(as.vector(parenth), parenth, "(")
parenthpre<-matrix(replace(as.vector(parenthpre), !parenth, ""),nrow=10)
parenthpost<-replace(as.vector(parenth), parenth, ")")
parenthpost<-matrix(replace(as.vector(parenthpost), !parenth, ""),nrow=10)
tab5[is.na(tab5)]<-""
tab5<-matrix(paste(parenthpre,as.matrix(tab5),parenthpost,sep=""),nrow=10)
tab5<-data.table(tab5)
tab5<-tab5[,c(1,2,3,4,6,7,5,8)]
tab5<-rbind(c("Panel A"), tab5[1:2], c("Panel B"), tab5[3:6], c("Panel C"), tab5[7:10],fill=T)
tab5[is.na(tab5)]<-""
colnames(tab5)<-c("","intcpt","CF","ACC","good","growth","match","error","Adj R Squared")
stargazer(as.matrix(tab5), summary=FALSE,title="Pooled cross-sectional regressions of next year's income on cash flows, accruals and accrual components from the MDD Model.",column.sep.width="2pt" )


#####TABLE 6#####

#step one, make some new variables
FUNDA[,size := log(prcc_f*csho)]
FUNDA[,mk2bk := at/(prcc_f*csho+dlc+dltt+pstkl+txditc)]

#step two, define models (one version to get r square, one version to get GOW t-values)
tab6lm1r<-lm(ann_return~CF+ACC+size+mk2bk+MOM6+MOM36, FUNDA)
tab6lm1<-coeftest.cluster(data.frame(FUNDA), tab6lm1r, cluster1="gvkey",cluster2="fyear")
tab6lm2r<-lm(ann_return~CF+MDDGOOD+MDDERROR+size+mk2bk+MOM6+MOM36, FUNDA)
tab6lm2<-coeftest.cluster(data.frame(FUNDA), tab6lm2r, cluster1="gvkey",cluster2="fyear")
tab6lm3r<-lm(ann_return~CF+MDDGROWTH+MDDMATCH+MDDERROR+size+mk2bk+MOM6+MOM36, FUNDA)
tab6lm3<-coeftest.cluster(data.frame(FUNDA), tab6lm3r, cluster1="gvkey",cluster2="fyear")
tab6lm4r<-lm(ann_return~CF+MDDGOOD5+MDDERROR5+size+mk2bk+MOM6+MOM36, FUNDA)
tab6lm4<-coeftest.cluster(data.frame(FUNDA), tab6lm4r, cluster1="gvkey",cluster2="fyear")
tab6lm5r<-lm(ann_return~CF+MDDGROWTH5+MDDMATCH5+MDDERROR5+size+mk2bk+MOM6+MOM36, FUNDA)
tab6lm5<-coeftest.cluster(data.frame(FUNDA), tab6lm5r, cluster1="gvkey",cluster2="fyear")

#step four, put it all into a table
tab6<-smartbind(tab6lm1[,1], round(tab6lm1[,3],2), tab6lm2[,1], round(tab6lm2[,3],2), tab6lm3[,1], round(tab6lm3[,3],2),tab6lm4[,1], round(tab6lm4[,3],2),tab6lm5[,1], round(tab6lm5[,3],2))
tab6[7:10,8:11]<-tab6[7:10,12:15]
tab6<-tab6[,1:11]
adj_r_sq<-c(summary(tab6lm1r)$adj.r.squared,NA,summary(tab6lm2r)$adj.r.squared, NA, summary(tab6lm3r)$adj.r.squared, NA, summary(tab6lm4r)$adj.r.squared, NA, summary(tab6lm5r)$adj.r.squared, NA)
tab6<-cbind(tab6,adj_r_sq)
tab6<-round(tab6,3)
parenth<-rbind(rep(FALSE,8), !is.na(tab6[2,]),rep(FALSE,8), !is.na(tab6[4,]),rep(FALSE,8), !is.na(tab6[6,]),
               rep(FALSE,8), !is.na(tab6[8,]), rep(FALSE,8), !is.na(tab6[10,]))
parenthpre<-replace(as.vector(parenth), parenth, "(")
parenthpre<-matrix(replace(as.vector(parenthpre), !parenth, ""),nrow=10)
parenthpost<-replace(as.vector(parenth), parenth, ")")
parenthpost<-matrix(replace(as.vector(parenthpost), !parenth, ""),nrow=10)
tab6[is.na(tab6)]<-""
tab6<-matrix(paste(parenthpre,as.matrix(tab6),parenthpost,sep=""),nrow=10)
tab6<-data.table(tab6)
tab6<-tab6[,c(1,2,3,8,10,11,9,5,4,6,7,12)]
tab6<-rbind(c("Panel A"), tab6[1:2], c("Panel B"), tab6[3:6], c("Panel C"), tab6[7:10],fill=TRUE)
tab6[is.na(tab6)]<-""
colnames(tab6)<-c("","intcpt","CF","ACC","good","growth","match","error","BM","size","M6","M36","Adj R Squared")
stargazer(as.matrix(tab6), summary=FALSE,title="Pooled cross-sectional regressions of next year's stock returns on cash flows, accruals and accrual components from the MDD Model.",column.sep.width="2pt" )


#merge cluster data in with rest of FUNDA, then save a version--Only necessary for extension purposes
FUNDA<-merge(FUNDA, FUNDAt4[,.(m3cluster, match_date, gvkey)], by=c("gvkey","match_date"),all.x=TRUE)
write.csv(FUNDA, file = "/Users/skylardeture/documents/Research/Specific Accruals/end_of_replication_data_8_26_2017.csv")