Created
October 4, 2012 15:19
-
-
Save abelsonlive/3834327 to your computer and use it in GitHub Desktop.
PDF to Text, R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# helper function: get the number of words in each string of a vector.
# Words are separated by runs of space, tab, newline, comma, or point.
#
# x: vector coerced with as.character(); one count is produced per element.
# Returns an integer vector of word counts.
nwords <- function(x) {
  tokens <- strsplit(as.character(x), "[ \t\n,\\.]+")
  # strsplit() yields a leading "" when a string starts with a separator;
  # count only non-empty tokens so that " a b" is 2 words and "..." is 0.
  vapply(tokens, function(tok) sum(nzchar(tok)), integer(1))
}
# sanitize file name for terminal usage: prefix every whitespace character
# and shell-special character with a backslash.
#
# str: character vector of file names.
# Returns a character vector of the same length with the characters escaped.
sanitize <- function(str) {
  # Original set (# $ % & ~ _ ^ \ { } whitespace ( )) extended with quotes,
  # backtick, ; | < > * ? ! [ ] so names such as "John's paper.pdf" or
  # "a;b.pdf" can no longer break or alter the command line.
  special <- "([#$%&~_\\^\\\\{}\\s\\(\\)'\"`;|<>*?!\\[\\]])"
  gsub(special, "\\\\\\1", str, perl = TRUE)
}
# get a list of all PDF files in the current directory.
# The pattern is escaped and anchored so that only names ENDING in ".pdf"
# qualify (a bare ".pdf" regex would also match e.g. "xpdfy.txt").
fi <- list.files()
fi2 <- fi[grepl("\\.pdf$", fi, ignore.case = TRUE)]

# encoding the converted text is marked with - you have to know what
# pdftotext emits on your system
txt_encoding <- "latin1"

# Convert one PDF to text with the external `pdftotext` tool and compute
# its word count, character count, and character count without whitespace.
#
# f: file name of the PDF.
# Returns a one-row data.frame, or NULL (with a warning) when the
# conversion fails.
pdf_stats <- function(f) {
  print(paste("Parsing", f))

  # Convert into a temporary file instead of "<name>.txt" next to the PDF,
  # so an existing text file with that name is never overwritten or deleted.
  filetxt <- tempfile(fileext = ".txt")
  # remove converted text file, even if reading or counting fails
  on.exit(unlink(filetxt), add = TRUE)

  # shQuote() protects both paths from the shell
  status <- system2("pdftotext", args = c(shQuote(f), shQuote(filetxt)))
  if (status != 0L || !file.exists(filetxt)) {
    warning("pdftotext failed for ", f, call. = FALSE)
    return(NULL)
  }

  # read content of converted txt file
  text <- readLines(filetxt, warn = FALSE)
  Encoding(text) <- txt_encoding

  # Do something with the content - here: get word and character count
  text2 <- paste(text, collapse = "\n") # collapse lines into one long string
  data.frame(
    filename = f,
    wc = nwords(text2),
    cs = nchar(text2),
    cs.nospace = nchar(gsub("\\s", "", text2)),
    stringsAsFactors = FALSE
  )
}

## Parse files and do something with it ...
# Collect one row per successfully converted PDF and bind them once,
# rather than growing the data frame inside a loop.
rows <- Filter(Negate(is.null), lapply(fi2, pdf_stats))
res <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()
print(res)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment