# abelsonlive/gist:3834327

## gistfile1.r
# helper function: count the words in each element of a character vector.
# Words are separated by runs of spaces, tabs, newlines, commas, or periods.
#
# x: vector that is coerced with as.character(); one count is produced per element.
# Returns an integer vector with the same length as x (integer(0) for empty input).
nwords <- function(x){
	tokens <- strsplit(as.character(x), "[ \t\n,\\.]+")
	# strsplit() yields a leading "" when a string starts with a separator;
	# count only non-empty tokens so that artefact is not reported as a word.
	vapply(tokens, function(tok) sum(nzchar(tok)), integer(1))
}

# sanitize file name for terminal usage: prefix every shell-special character
# (whitespace, quotes, parentheses, brackets, braces and the metacharacters
# # $ % & ~ _ ^ \ ` ; | < > * ? !) with a backslash so the name can be pasted
# unquoted into a command line.
#
# str: character vector of file names.
# Returns a character vector of the same length with those characters escaped.
sanitize <- function(str) {
	# Quotes, backticks, `;`, `|`, redirections, globs and `!` are escaped too:
	# left bare, a name such as "Tom's paper.pdf" breaks the command line or
	# lets the shell interpret part of the file name.
	gsub("([#$%&~_\\^\\\\{}\\s\\(\\)'\"`;|<>*?!\\[\\]])", '\\\\\\1', str, perl = TRUE)
}

# get a list of all files in the current directory and keep the PDFs
fi <- list.files()
# escape the dot and anchor at the end so only names ending in ".pdf" match
# (the unanchored ".pdf" also matched names such as "my_pdf_notes.txt")
fi2 <- fi[grepl("\\.pdf$", fi)]


## Parse files and do something with it ...
# One single-row data frame per successfully parsed file; they are bound
# together after the loop instead of growing `res` with rbind() every iteration.
rows <- vector("list", length(fi2))
for (i in seq_along(fi2)) {
	f <- fi2[i]
	print(paste("Parsing", f))

	# pdftotext writes the text next to the PDF, using the same base name
	f2 <- sanitize(f)
	status <- system(paste0("pdftotext ", f2), wait = TRUE)

	# name of the converted txt file: replace only the trailing extension
	filetxt <- sub("\\.pdf$", ".txt", f)
	if (status != 0 || !file.exists(filetxt)) {
		# report the failed conversion and carry on with the remaining files
		warning("pdftotext failed for ", f, "; skipping", call. = FALSE)
		next
	}
	text <- readLines(filetxt, warn=FALSE)

	# adjust encoding of text - you have to know it
	Encoding(text) <- "latin1"

	# Do something with the content - here: get word and character count of all pdfs in the current directory
	text2 <- paste(text, collapse="\n")	# collapse lines into one long string

	rows[[i]] <- data.frame(filename=f, wc=nwords(text2), cs=nchar(text2), cs.nospace=nchar(gsub("\\s", "", text2)))

	# remove converted text file
	file.remove(filetxt)
}

# drop the slots of skipped files, then build the result table in one step
rows <- Filter(Negate(is.null), rows)
res <- if (length(rows) > 0) do.call(rbind, rows) else data.frame() # keeps records of the calculations

print(res)
	# helper function: count the words in each element of a character vector.
	# Words are separated by runs of spaces, tabs, newlines, commas, or periods.
	#
	# x: vector that is coerced with as.character(); one count is produced per element.
	# Returns an integer vector with the same length as x (integer(0) for empty input).
	nwords <- function(x){
		tokens <- strsplit(as.character(x), "[ \t\n,\\.]+")
		# strsplit() yields a leading "" when a string starts with a separator;
		# count only non-empty tokens so that artefact is not reported as a word.
		vapply(tokens, function(tok) sum(nzchar(tok)), integer(1))
	}

	# sanitize file name for terminal usage: prefix every shell-special character
	# (whitespace, quotes, parentheses, brackets, braces and the metacharacters
	# # $ % & ~ _ ^ \ ` ; | < > * ? !) with a backslash so the name can be pasted
	# unquoted into a command line.
	#
	# str: character vector of file names.
	# Returns a character vector of the same length with those characters escaped.
	sanitize <- function(str) {
		# Quotes, backticks, `;`, `|`, redirections, globs and `!` are escaped too:
		# left bare, a name such as "Tom's paper.pdf" breaks the command line or
		# lets the shell interpret part of the file name.
		gsub("([#$%&~_\\^\\\\{}\\s\\(\\)'\"`;|<>*?!\\[\\]])", '\\\\\\1', str, perl = TRUE)
	}

	# NOTE(review): this section repeats the script that starts at the top of the
	# file, so every PDF is converted and reported a second time - consider
	# keeping only one copy.

	# get a list of all files in the current directory and keep the PDFs
	fi <- list.files()
	# escape the dot and anchor at the end so only names ending in ".pdf" match
	# (the unanchored ".pdf" also matched names such as "my_pdf_notes.txt")
	fi2 <- fi[grepl("\\.pdf$", fi)]


	## Parse files and do something with it ...
	# One single-row data frame per successfully parsed file; they are bound
	# together after the loop instead of growing `res` with rbind() every iteration.
	rows <- vector("list", length(fi2))
	for (i in seq_along(fi2)) {
		f <- fi2[i]
		print(paste("Parsing", f))

		# pdftotext writes the text next to the PDF, using the same base name
		f2 <- sanitize(f)
		status <- system(paste0("pdftotext ", f2), wait = TRUE)

		# name of the converted txt file: replace only the trailing extension
		filetxt <- sub("\\.pdf$", ".txt", f)
		if (status != 0 || !file.exists(filetxt)) {
			# report the failed conversion and carry on with the remaining files
			warning("pdftotext failed for ", f, "; skipping", call. = FALSE)
			next
		}
		text <- readLines(filetxt, warn=FALSE)

		# adjust encoding of text - you have to know it
		Encoding(text) <- "latin1"

		# Do something with the content - here: get word and character count of all pdfs in the current directory
		text2 <- paste(text, collapse="\n") # collapse lines into one long string

		rows[[i]] <- data.frame(filename=f, wc=nwords(text2), cs=nchar(text2), cs.nospace=nchar(gsub("\\s", "", text2)))

		# remove converted text file
		file.remove(filetxt)
	}

	# drop the slots of skipped files, then build the result table in one step
	rows <- Filter(Negate(is.null), rows)
	res <- if (length(rows) > 0) do.call(rbind, rows) else data.frame() # keeps records of the calculations

	print(res)