Last active
September 4, 2017 23:07
-
-
Save sdeture/a72458637e342047fa95c128256f868c to your computer and use it in GitHub Desktop.
Libraries and Functions for First Year Summer Paper (ALS Replication and Spectral Analysis Extension)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##packages for specific accruals, all in one place | |
library(readr) | |
require(sandwich) | |
require(lmtest) | |
library("data.table", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("zoo", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library(lubridate) | |
library("stargazer", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("gmodels", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("DescTools", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("rsq", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("psych", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("flexmix", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("gtools", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("tabplot", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("ggplot2", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("hexbin", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("MASS", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("sqldf", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("RH2", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("GGally", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("roll", lib.loc="/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
##define lag functions | |
lagged<-function(X, n) { | |
return( | |
append(rep(NA,n),head(X,length(X)-n)) | |
) | |
} | |
lead<-function(X, n) { | |
return( | |
append(tail(X,length(X)-n),rep(NA,n)) | |
) | |
} | |
##define delta functions | |
delta<-function(X,n) { | |
return( | |
X-lagged(X,n) | |
) | |
} | |
#inputs a vector, outputs vector winsorized at -1 and 1 | |
winsorize<-function(x) { | |
x[x>1]<-1 | |
x[x<(-1)]<-(-1) | |
return(x) | |
} | |
#inputs a vector, winsorizes at 1 and 99% | |
wins99<-function(x) { | |
x99<-quantile(x,.99,na.rm=T) | |
x01<-quantile(x,.01,na.rm=T) | |
x[x>x99]<-x99 | |
x[x<x01]<-x01 | |
return(x) | |
} | |
# R function for computing two-way cluster-robust standard errors. | |
# The code below was adapted by Ian Gow on May 16, 2011 using code supplied | |
# via Mitchell Petersen's website by Mahmood Arai, Jan 21, 2008. | |
# | |
# Apart from a little cleanup of the code, the main difference between this | |
# and the earlier code is in the handling of missing values. Look at the file | |
# cluster.test.R to see example usage. Note that care should be taken to | |
# do subsetting outside of the call to lm or glm, as it is difficult to recon- | |
# struct subsetting of this kind from the fitted model. However, the code | |
# does handle transformations of variables in the model (e.g., logs). Please | |
# report any bugs, suggestions, or errors to iandgow@gmail.com. The output has | |
# been tested fairly extensively against output of Mitchell Petersen's | |
# cluster2.ado commmand (hence implicitly against the Matlab and SAS code posted | |
# elsewhere here), but I have not tested the code against code for non-linear | |
# models, such as logit2.ado. | |
# See: Thompson (2006), Cameron, Gelbach and Miller (2006) and Petersen (2010). | |
# and Gow, Ormazabal, and Taylor (2010) for more discussion of this code | |
# and two-way cluster-robust standard errors. | |
# The arguments of the function are data, fitted model, cluster1 and cluster2 | |
# You need to install packages `sandwich' by Thomas Lumley and Achim Zeileis and | |
# `lmtest' by Torsten Hothorn, Achim Zeileis, Giovanni Millo and David Mitchell. | |
# (For example, type install.packages("sandwich") on the R console.) | |
coeftest.cluster <- function(data, fm, cluster1, cluster2=NULL) { | |
require(sandwich) | |
require(lmtest) | |
# Calculation shared by covariance estimates | |
est.fun <- estfun(fm) | |
# est.fun <- sweep(fm$model,MARGIN=2,fm$residuals,`*`) | |
# Need to identify observations used in the regression (i.e., | |
# non-missing) values, as the cluster vectors come from the full | |
# data set and may not be in the regression model. | |
# I use complete.cases following a suggestion from | |
# Francois Cocquemas <francois.cocquemas@gmail.com> | |
inc.obs <- complete.cases(data[,names(fm$model)]) | |
# inc.obs <- !is.na(est.fun[,1]) | |
# est.fun <- est.fun[inc.obs,] | |
# Shared data for degrees-of-freedom corrections | |
N <- dim(fm$model)[1] | |
NROW <- NROW(est.fun) | |
K <- fm$rank | |
# Calculate the sandwich covariance estimate | |
cov <- function(cluster) { | |
cluster <- factor(cluster) | |
# Calculate the "meat" of the sandwich estimators | |
u <- apply(est.fun, 2, function(x) tapply(x, cluster, sum)) | |
meat <- crossprod(u)/N | |
# Calculations for degrees-of-freedom corrections, followed | |
# by calculation of the variance-covariance estimate. | |
# NOTE: NROW/N is a kluge to address the fact that sandwich uses the | |
# wrong number of rows (includes rows omitted from the regression). | |
M <- length(levels(cluster)) | |
dfc <- M/(M-1) * (N-1)/(N-K) | |
dfc * NROW/N * sandwich(fm, meat=meat) | |
} | |
# Calculate the covariance matrix estimate for the first cluster. | |
cluster1 <- data[inc.obs,cluster1] | |
cov1 <- cov(cluster1) | |
if(is.null(cluster2)) { | |
# If only one cluster supplied, return single cluster | |
# results | |
return(coeftest(fm, cov1)) | |
} else { | |
# Otherwise do the calculations for the second cluster | |
# and the "intersection" cluster. | |
cluster2 <- data[inc.obs,cluster2] | |
cluster12 <- paste(cluster1,cluster2, sep="") | |
# Calculate the covariance matrices for cluster2, the "intersection" | |
# cluster, then then put all the pieces together. | |
cov2 <- cov(cluster2) | |
cov12 <- cov(cluster12) | |
covMCL <- (cov1 + cov2 - cov12) | |
# Return the output of coeftest using two-way cluster-robust | |
# standard errors. | |
return(coeftest(fm, covMCL)) | |
} | |
} | |
##end GOW function | |
#this function assigns observations to n equal size groups (quantiles) | |
rank_data<-function(X, n) { | |
cut(X, | |
breaks=quantile(X, probs=seq(0, 1, by=(1/n)), na.rm=T), | |
include.lowest= TRUE, labels=1:n)} | |
#extract p-value from an lm | |
lmp <- function (modelobject) { | |
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ") | |
f <- summary(modelobject)$fstatistic | |
p <- pf(f[1],f[2],f[3],lower.tail=F) | |
attributes(p) <- NULL | |
return(p) | |
} | |
##compute R^2 from time series model predictions | |
RS <- function(x, xhat) { | |
return(sum((na.omit(xhat) - mean(na.omit( | |
x | |
))) ^ 2) / sum((na.omit(x) - mean(na.omit( | |
x | |
))) ^ 2)) | |
} | |
pers<- function(Z) | |
{lm = lm(eps_adj~lag_eps_adj, data= as.data.frame(Z)); | |
return(lm$coefficients[2])} | |
write.csv(FUNDA, file = "/Users/skylardeture/documents/Specific Accruals/ALSdata1.csv") | |
##imput ivs and dvs as matrices, use a 10 observation rolling window | |
rolling_reg <- function(ivs, dv) { | |
lm<-roll_lm(ivs,dv,10) | |
coef<-lm$coefficients | |
return(lapply(seq_len(ncol(coef)), function(i) coef[,i])) | |
} | |
rolling_regR <- function(ivs, dv) { | |
lm<-roll_lm(ivs,dv,10) | |
coef<-lm$r.squared | |
return(lapply(seq_len(ncol(coef)), function(i) coef[,i])) | |
} | |
######################################################################## | |
# CREDIT: IAN GOW | |
# Small program to fetch and organize Fama-French industry data. | |
# The idea is to make a table that could be used for SQL merges. | |
# CREDIT: IAN GOW | |
######################################################################## | |
# The URL for the data. | |
ff.url <- paste("http://mba.tuck.dartmouth.edu", | |
"pages/faculty/ken.french/ftp", | |
"Industry_Definitions.zip", sep="/") | |
# Download the data and unzip it | |
f <- tempfile() | |
download.file(ff.url, f) | |
file.list <- unzip(f,list=TRUE) | |
trim <- function(string) { | |
# Remove leading and trailing spaces from a string | |
ifelse(grepl("^\\s*$", string, perl=TRUE),"", | |
gsub("^\\s*(.*?)\\s*$","\\1",string,perl=TRUE)) | |
} | |
# Function to do the heavy lifting | |
extract_ff_ind_data <- function (file) { | |
# Read in the data in a plain form | |
ff_ind <- as.vector(read.delim(unzip(f, files=file), header=FALSE, | |
stringsAsFactors=FALSE)) | |
# The first 10 characters of each line are the industry data, but only the first | |
# row of the data for the SIC codes in an industry are filled in; | |
# so fill in the rest. | |
ind_num <- trim(substr(ff_ind[,1],1,10)) | |
for (i in 2:length(ind_num)) { | |
if (ind_num[i]=="") ind_num[i] <- ind_num[i-1] | |
} | |
# The rest of each line is either detail on an industry or details about the | |
# range of SIC codes that fit in each industry with a label for each group | |
# of SIC codes. | |
sic_detail <- trim(substr(ff_ind[,1],11,100)) | |
# If the line doesn't start with a number, it's an industry description | |
is.desc <- grepl("^\\D",sic_detail,perl=TRUE) | |
# Pull out information from rows about industries | |
regex.ind <- "^(\\d+)\\s+(\\w+).*$" | |
ind_num <- gsub(regex.ind,"\\1",ind_num,perl=TRUE) | |
ind_abbrev <- gsub(regex.ind,"\\2",ind_num[is.desc],perl=TRUE) | |
ind_list <- data.frame(ind_num=ind_num[is.desc],ind_abbrev, | |
ind_desc=sic_detail[is.desc]) | |
# Pull out information rows about ranges of SIC codes | |
regex.sic <- "^(\\d+)-(\\d+)\\s*(.*)$" | |
ind_num <- ind_num[!is.desc] | |
sic_detail <- sic_detail[!is.desc] | |
sic_low <- as.integer(gsub(regex.sic,"\\1",sic_detail,perl=TRUE)) | |
sic_high <- as.integer(gsub(regex.sic,"\\2",sic_detail,perl=TRUE)) | |
sic_desc <- gsub(regex.sic,"\\3",sic_detail,perl=TRUE) | |
sic_list <- data.frame(ind_num, sic_low, sic_high, sic_desc) | |
return(merge(ind_list,sic_list,by="ind_num",all=TRUE)) | |
} | |
# Extract the data of interest | |
ind_48_table <- extract_ff_ind_data("Siccodes48.txt") | |
# ind_49_table <- extract_ff_ind_data("Siccodes49.txt") | |
# ind_12_table <- extract_ff_ind_data("Siccodes12.txt") | |
# ind_17_table <- extract_ff_ind_data("Siccodes17.txt") | |
# Load the data into my database | |
library(RPostgreSQL) | |
drv <- dbDriver("PostgreSQL") | |
pg <- dbConnect(drv, dbname = "crsp") | |
rs <- dbWriteTable(pg,c("ff","ind_48"),ind_48_table,overwrite=TRUE) | |
##merge in Fama French industries to FUNDA data | |
library("sqldf", lib.loc = "/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
library("RH2", lib.loc = "/Library/Frameworks/R.framework/Versions/3.3/Resources/library") | |
detach("package:RPostgreSQL", unload=TRUE) | |
x <- (FUNDA[, .(sic)]) | |
y = sqldf(" SELECT * | |
FROM x d1 JOIN ind_48_table d2 | |
ON d1.sic<=d2.sic_high | |
AND d1.sic>=d2.sic_low | |
") | |
y <- unique(data.table(y), by = c("sic")) | |
y$sic <- y$sic | |
FUNDA <- merge(FUNDA, y, by = c("sic"), all.x = T) | |
##see average frequency and period by Fama French industry | |
View(FUNDA_abv[, .(mean(max_freq, na.rm = T), sd(max_freq, na.rm = | |
T), .N), ind_desc][order(V1)][N > 1]) | |
View(FUNDA_abv[, .(1 / (mean(max_freq, na.rm = T)), sd(max_freq, na.rm = | |
T), .N), ind_desc][order(V1)][N > 1]) | |
######################################################################## | |
# End of Code for Obtaining Fama French Industry Info | |
# CREDIT: IAN GOW | |
######################################################################## |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment