Created
October 15, 2018 16:54
-
-
Save yanlesin/b9820084f18a76eae47666ded8987557 to your computer and use it in GitHub Desktop.
Parsing list of securities for 13F Report
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(pdftools) | |
library(stringr) | |
library(dplyr) | |
library(openxlsx) | |
download.file("https://www.sec.gov/divisions/investment/13f/13flist2018q2.pdf",'13flist2018q2.pdf',mode='wb') | |
file <- "13flist2018q2.pdf" | |
text <- pdf_text(file) | |
pages <- length(text) | |
# Look for line separator \r\n or \n or \r depending on operating system ---- | |
CR <- str_locate(text[1],"\r")[1] | |
LF <- str_locate(text[1],"\n")[1] | |
if (!is.na(CR)) { | |
if (str_sub(text[1], CR + 1, CR + 1) == "\n") { | |
line_separator <-"\r\n" | |
} else { | |
line_separator <-"\r" | |
} | |
} else { | |
if (!is.na(LF)) { | |
line_separator <- "\n" | |
} else { | |
stop("Undefined line separator") | |
} | |
} | |
# Create dataframe with all relevant pages split by each string ---- | |
df_strings <- data.frame(Page=integer(), | |
Line=integer(), | |
CUSIP=character(), | |
Star=character(), | |
ISSUER_NAME=character(), | |
ISSUER_DESCRIPTION=character(), | |
STATUS=character(), | |
stringsAsFactors = FALSE) | |
# Initialize Progress Bar ---- | |
progress_bar <- txtProgressBar(min = 3, max = pages) | |
# Main Parsing Loop ---- | |
for (i in 3:pages) | |
{ | |
df_strings_temp <- df_strings[0,] | |
strings_on_page <- str_split(text[i],line_separator, simplify = FALSE) | |
page_length <- length(strings_on_page[[1]]) | |
for (j in 1:page_length) | |
{ | |
df_strings_temp <- cbind(Page=i, | |
Line=j, | |
CUSIP=substr(strings_on_page[[1]][j],1,11), | |
Star=substr(strings_on_page[[1]][j],13,13), | |
ISSUER_NAME=substr(strings_on_page[[1]][j],17,52), | |
ISSUER_DESCRIPTION=substr(strings_on_page[[1]][j],53,71), | |
STATUS=substr(strings_on_page[[1]][j],72,78) | |
) | |
df_strings <- rbind(df_strings,df_strings_temp,stringsAsFactors=FALSE) | |
} | |
info <- sprintf("Processing page %d of PDF document", round(i)) | |
setTxtProgressBar(progress_bar,i,title = info) | |
} | |
close(progress_bar) | |
# Filter out irrelevant strings from dataframe ---- | |
# CUSIP, Run Date, Run Time, and empty string at the end of each page are irrelevant lines | |
#trim <- function (x) gsub("^\\s+|\\s+$", "", x) | |
df_strings_filtered <- filter(df_strings, !str_detect(CUSIP, "CUSIP")) %>% | |
filter(!str_detect(CUSIP, "Run Date")) %>% | |
filter(!str_detect(CUSIP, "Run Time")) %>% | |
filter(!str_detect(ISSUER_DESCRIPTION, "Total C")) %>% | |
filter(str_detect(CUSIP, "")) %>% | |
filter(STATUS!='DELETED') %>% | |
mutate(ISSUER_NAME=str_trim(ISSUER_NAME)) | |
# Make valid CUSIP ---- | |
List_13F_2018_Q2 <- mutate(df_strings_filtered,CUSIP_NEW=str_replace_all(CUSIP," ", "")) %>% | |
select(-CUSIP) | |
# Save results to XLSX format ---- | |
#Sys.setenv("R_ZIPCMD" = "C:/RTools/bin/zip.exe") | |
write.xlsx(List_13F_2018_Q2,"13flist2018q2.xlsx",asTable = TRUE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment