Skip to content

Instantly share code, notes, and snippets.

@craha22
Created May 23, 2018 18:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save craha22/3909ab4ce285086c0981b11f996fd033 to your computer and use it in GitHub Desktop.
Save craha22/3909ab4ce285086c0981b11f996fd033 to your computer and use it in GitHub Desktop.
Code to accompany Lazy Prices blog post
# Code for obtaining annual reports and computing similarity measures
# Used for: https://craha.wordpress.com/2018/05/21/an-implementation-of-lazy-prices/
library(edgarWebR)
library(dplyr)
#library(parallel) - I used mclapply instead of lapply below to speed things up when running more stocks
library(text2vec)
library(stringr)
stock_list <- c("MMM", "AXP", "AAPL", "CAT")
#this may take a while, many html requests.
filings_list <- lapply(stock_list, function(x) {
#Search for company's filings- count param supports intervals of 20
filings <- company_filings(x, type = "10-", count = 20)
#isolate 10K - 10Q
filings <- filings[filings$type == "10-K" | filings$type == "10-Q",]
#get filing information of each item
filing_infos <- purrr::map_df(filings$href, filing_information)
#join for some info
filings <- bind_cols(filings, filing_infos)
#look at each report in search results for company
filings_reports <- lapply(filings$href, function(x) {
docs <- filing_documents(x)
doc <- docs[docs$description == 'Complete submission text file', ]
#this will prevent an error if the stack gets too big when parsing the report and fails
parsed_docs <- tryCatch({parse_submission(doc$href)}, error = function(e) {return(NULL)})
if(is.null(parsed_docs)) {return(-1)}
doc <- parse_filing(parsed_docs[which(stringr::str_detect(parsed_docs$TYPE, "10-")),'TEXT'])
#MDA
MDA <- doc[grepl("management", doc$item.name, ignore.case = TRUE), ]
#RF
RF <- doc[grepl("risk factor", doc$item.name, ignore.case = TRUE), ]
#Legal
LEGAL <- doc[grepl("legal", doc$item.name, ignore.case = TRUE), ]
#QnQ
QNQ <- doc[grepl("Quant", doc$item.name, ignore.case = TRUE), ]
#Control
CONTROL <- doc[grepl("Control", doc$item.name, ignore.case = TRUE), ]
#other
OTHER <- doc[grepl("Other", doc$item.name, ignore.case = TRUE), ]
#strip "Tables" or highly numeric paragraphs
MDA <- MDA[(stringr::str_count(MDA$text, pattern = '[0-9]')/nchar(MDA$text)) < .10,1]
RF <- RF[(stringr::str_count(RF$text, pattern = '[0-9]')/nchar(RF$text)) < .10,1]
LEGAL <- LEGAL[(stringr::str_count(LEGAL$text, pattern = '[0-9]')/nchar(LEGAL$text)) < .10,1]
QNQ <- QNQ[(stringr::str_count(QNQ$text, pattern = '[0-9]')/nchar(QNQ$text)) < .10,1]
CONTROL <- CONTROL[(stringr::str_count(CONTROL$text, pattern = '[0-9]')/nchar(CONTROL$text)) < .10,1]
OTHER <- OTHER[(stringr::str_count(OTHER$text, pattern = '[0-9]')/nchar(OTHER$text)) < .10,1]
sections <- list(MDA, RF, LEGAL, QNQ, CONTROL, OTHER)
names(sections) <- c("MDA", "RF", "LEGAL", "QNQ", "CONTROL", "OTHER")
return(sections)
})
names(filings_reports) <- filings$period_date
return(filings_reports)
})
names(filings_list) <- stock_list
#Remove any companies that failed
filings_list <- filings_list[lapply(filings_list, length) > 1]
prep_fun <- function(x) {
x %>%
# make text lower case
str_to_lower %>%
# remove non-alphanumeric symbols
str_replace_all("[^[:alnum:]]", " ") %>%
# collapse multiple spaces
str_replace_all("\\s+", " ")
}
similarities <- lapply(filings_list_mc[1:4], function(x){
n_reports <- nrow(summary(x))
get_sims <- c('MDA', 'RF', 'LEGAL', 'QNQ', 'CONTROL', 'OTHER')
all_sims <- list()
for (i in 1:(n_reports-5)){
this_period <- x[[i]]
last_year <- x[[(i+4)]]
sims <- lapply(get_sims, function(d){
#Get text
texta <- prep_fun(paste(this_period[d], collapse = ' '))
textb <- prep_fun(paste(last_year[d], collapse = ' '))
#tokenize
it1 <- itoken(texta, progressbar = FALSE)
it2 <- itoken(textb, progressbar = FALSE)
#build vocab
v <- create_vocabulary(c(it1, it2))
vectorizer <- vocab_vectorizer(v)
#D-TF vectors
dtm1 <- create_dtm(it1, vectorizer)
dtm2 <- create_dtm(it2, vectorizer)
#Get the sims
jaccard <- as.numeric(sim2(dtm1, dtm2, method = "jaccard", norm = "none"))
cosine <- as.numeric(sim2(dtm1, dtm2, method = "cosine", norm = "l2"))
sim_vec <- c(jaccard, cosine)
names(sim_vec) <- c("jaccard", "cosine")
return(sim_vec)
})
names(sims) <- get_sims
sims <- do.call('rbind', sims)
all_sims[[i]] <- sims
}
names(all_sims) <- names(x)[1:length(all_sims)]
return(all_sims)
})
names(similarities) <- names(filings_list)
#Jaccard Similarity
jaccard_sims <- lapply(similarities, function(x) {
#x is a stock, list of dates
jaccard_stock <- lapply(x, function(d) {
jc <- d[,1]
return(jc)
})
names(jaccard_stock) <- names(x)
jaccard_stock <- do.call("rbind", jaccard_stock)
return(jaccard_stock)
})
#add columns of the stocks and the dates to the similarity matrix
label_col <- unlist(lapply( names(similarities), function(x){return(rep(x, as.numeric(summary(jaccard_sims)[x,][1])/6))}))
jaccard_sims <- do.call("rbind", jaccard_sims)
jaccard_sims <- cbind(label_col, jaccard_sims, lubridate::year(rownames(jaccard_sims)),lubridate::month(rownames(jaccard_sims)),lubridate::day(rownames(jaccard_sims)) )
rownames(jaccard_sims) <- NULL
jaccard_sims <- as.data.frame(jaccard_sims, row.names = NULL)
colnames(jaccard_sims)[(length(jaccard_sims)-2):length(jaccard_sims)] <- c('y','m','d')
cosine_sims <- lapply(similarities, function(x) {
#x is a stock, list of dates
cos_stock <- lapply(x, function(d) {
cs <- d[,2]
return(cs)
})
names(cos_stock) <- names(x)
cos_stock <- do.call("rbind", cos_stock)
return(cos_stock)
})
#add columns of the stocks and the dates to the similarity matrix
cosine_sims <- do.call("rbind", cosine_sims)
cosine_sims <- cbind(label_col, cosine_sims, lubridate::year(rownames(cosine_sims)),lubridate::month(rownames(cosine_sims)),lubridate::day(rownames(cosine_sims)) )
rownames(cosine_sims) <- NULL
cosine_sims <- as.data.frame(cosine_sims, row.names = NULL)
colnames(cosine_sims)[(length(cosine_sims)-2):length(cosine_sims)] <- c('y','m','d')
#now you have similarity matricies of 10K-10Q items for you to rank, join to returns, and build what you need
#If you'd like my code for that, please reach out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment