Skip to content

Instantly share code, notes, and snippets.

@anstosa
Created September 6, 2017 05:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anstosa/1b3cefade6615a39922798a4099a903b to your computer and use it in GitHub Desktop.
Save anstosa/1b3cefade6615a39922798a4099a903b to your computer and use it in GitHub Desktop.
SEC embedded XML document parser in R
# Load XML library
# If not already installed, install with
# sudo Rscript -e 'install.packages("XML", repos = "http://cran.us.r-project.org")'
library(XML)
# Select input file
FILENAME <- "./0001086364-17-000005.txt"
# Read raw file into memory, one element per line
book <- readLines(FILENAME)
# Store each XML document in this list. Every entry is itself a list holding
# the lines found between one "<XML>" marker line and the next "</XML>" line.
chapters <- list()
# Scan the raw lines with a two-state machine: outside or inside a document
isInChapter <- FALSE
currentChapter <- 0
currentIndex <- 1
# seq_along() yields an empty sequence for an empty file, whereas
# 1:length(book) would iterate over c(1, 0) and fail on a missing line
for (currentLine in seq_along(book)) {
  line <- book[currentLine]
  # Markers are matched against the whole line exactly, so a marker with
  # surrounding whitespace is not recognised
  if (!isInChapter && line == "<XML>") {
    # This is the start of a new embedded document
    # Set the state accordingly and reset the chapter line counter
    isInChapter <- TRUE
    currentIndex <- 0
    currentChapter <- currentChapter + 1
    chapters[[currentChapter]] <- list()
  } else if (isInChapter && line == "</XML>") {
    # This is the end of the current embedded document
    # Set the state accordingly; the marker line itself is not stored
    isInChapter <- FALSE
  } else if (isInChapter) {
    # We're still in a chapter
    # Append the current line to the end of the active chapter
    currentIndex <- currentIndex + 1
    chapters[[currentChapter]][[currentIndex]] <- line
  }
}
# Loop through each document searching for the target document(s) and collect
# one list(name, value) entry per row of every "informationTable" found
data <- list()
# seq_along() is used so that an empty `chapters` list is simply skipped
# instead of failing on chapters[[1]]
for (currentChapter in seq_along(chapters)) {
  # Join the stored lines into a single string and state explicitly that the
  # argument is XML content rather than a file name
  xmlText <- paste(unlist(chapters[[currentChapter]]), collapse = "\n")
  document <- xmlParse(xmlText, asText = TRUE)
  root <- xmlRoot(document)
  if (xmlName(root) != "informationTable") {
    # Not the document type we are looking for
    next
  }
  rows <- xmlChildren(root)
  for (currentIndex in seq_along(rows)) {
    row <- rows[[currentIndex]]
    fields <- xmlChildren(row)
    # Fields are taken by position: 1st child is used as the name and 4th
    # child as the value
    # NOTE(review): presumably nameOfIssuer and value of a 13F information
    # table -- confirm against the filing schema
    name <- xmlValue(fields[[1]])
    value <- xmlValue(fields[[4]])
    # Append rather than index by currentIndex so that rows from a second
    # information table do not overwrite those from the first
    data[[length(data) + 1]] <- list(name, value)
  }
}
# Report how many rows were extracted, then show a short sample
cat("Total Rows: ", length(data), "\n")
print("Example Data:")
# Print at most the first 10 rows (value, then name, tab-separated).
# Capping at length(data) avoids a subscript error when fewer than 10 rows
# were extracted, including the case of no rows at all.
for (row in seq_len(min(10, length(data)))) {
  cat(data[[row]][[2]], "\t", data[[row]][[1]], "\n")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment