Skip to content

Instantly share code, notes, and snippets.

@yanlesin
Created October 15, 2018 16:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yanlesin/b9820084f18a76eae47666ded8987557 to your computer and use it in GitHub Desktop.
Save yanlesin/b9820084f18a76eae47666ded8987557 to your computer and use it in GitHub Desktop.
Parsing list of securities for 13F Report
library(pdftools)
library(stringr)
library(dplyr)
library(openxlsx)
download.file("https://www.sec.gov/divisions/investment/13f/13flist2018q2.pdf",'13flist2018q2.pdf',mode='wb')
file <- "13flist2018q2.pdf"
text <- pdf_text(file)
pages <- length(text)
# Look for line separator \r\n or \n or \r depending on operating system ----
CR <- str_locate(text[1],"\r")[1]
LF <- str_locate(text[1],"\n")[1]
if (!is.na(CR)) {
if (str_sub(text[1], CR + 1, CR + 1) == "\n") {
line_separator <-"\r\n"
} else {
line_separator <-"\r"
}
} else {
if (!is.na(LF)) {
line_separator <- "\n"
} else {
stop("Undefined line separator")
}
}
# Create dataframe with all relevant pages split by each string ----
df_strings <- data.frame(Page=integer(),
Line=integer(),
CUSIP=character(),
Star=character(),
ISSUER_NAME=character(),
ISSUER_DESCRIPTION=character(),
STATUS=character(),
stringsAsFactors = FALSE)
# Initialize Progress Bar ----
progress_bar <- txtProgressBar(min = 3, max = pages)
# Main Parsing Loop ----
for (i in 3:pages)
{
df_strings_temp <- df_strings[0,]
strings_on_page <- str_split(text[i],line_separator, simplify = FALSE)
page_length <- length(strings_on_page[[1]])
for (j in 1:page_length)
{
df_strings_temp <- cbind(Page=i,
Line=j,
CUSIP=substr(strings_on_page[[1]][j],1,11),
Star=substr(strings_on_page[[1]][j],13,13),
ISSUER_NAME=substr(strings_on_page[[1]][j],17,52),
ISSUER_DESCRIPTION=substr(strings_on_page[[1]][j],53,71),
STATUS=substr(strings_on_page[[1]][j],72,78)
)
df_strings <- rbind(df_strings,df_strings_temp,stringsAsFactors=FALSE)
}
info <- sprintf("Processing page %d of PDF document", round(i))
setTxtProgressBar(progress_bar,i,title = info)
}
close(progress_bar)
# Filter out irrelevant strings from dataframe ----
# CUSIP, Run Date, Run Time, and empty string at the end of each page are irrelevant lines
#trim <- function (x) gsub("^\\s+|\\s+$", "", x)
df_strings_filtered <- filter(df_strings, !str_detect(CUSIP, "CUSIP")) %>%
filter(!str_detect(CUSIP, "Run Date")) %>%
filter(!str_detect(CUSIP, "Run Time")) %>%
filter(!str_detect(ISSUER_DESCRIPTION, "Total C")) %>%
filter(str_detect(CUSIP, "")) %>%
filter(STATUS!='DELETED') %>%
mutate(ISSUER_NAME=str_trim(ISSUER_NAME))
# Make valid CUSIP ----
List_13F_2018_Q2 <- mutate(df_strings_filtered,CUSIP_NEW=str_replace_all(CUSIP," ", "")) %>%
select(-CUSIP)
# Save results to XLSX format ----
#Sys.setenv("R_ZIPCMD" = "C:/RTools/bin/zip.exe")
write.xlsx(List_13F_2018_Q2,"13flist2018q2.xlsx",asTable = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment