leeper/word_count.R

## word_count.R
# Copyright (c) 2018 Thomas J. Leeper
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

#' @title Word Count a PDF
#' @description Obtain a Word Count from a PDF
#' @param document A file path specifying a PDF document.
#' @param pages Optionally, an integer vector specifying a subset of pages to count from. Negative values serve as negative subsets.
#' @param count_numbers A logical specifying whether to count numbers as words.
#' @param count_captions A logical specifying whether to count lines beginning with \dQuote{Table} or \dQuote{Figure} in word count.
#' @param count_equations A logical specifying whether to count lines ending with \dQuote{([Number])} in word count.
#' @param split_hyphenated A logical specifying whether to split hyphenated words or expressions as separate words.
#' @param split_urls A logical specifying whether to split URLs into multiple words when counting.
#' @param verbose A logical specifying whether to be verbose. If \code{TRUE}, the page and word counts are printed to the console and the result is is returned invisibly. If \code{FALSE}, the result is visible.
#' @return A data frame with two columns, one specifying page and the other specifying word count for that page.
#' @details This is useful for obtaining a word count for a LaTeX-compiled PDF. Counting words in the tex source is a likely undercount (due to missing citations, cross-references, and parenthetical citations). Counting words from the PDF is likely over count (due to hyphenation issues, URLs, ligatures, tables and figures, and various other things). This function tries to obtain a word from the PDF while accounting for some of the sources of overcounting.
#'
#' It is often desirable to have word counts excluding tables and figures. A solution on TeX StackExchange (\url{https://tex.stackexchange.com/a/352394/30039}) provides guidance on how to exclude tables and figures (or any arbitrary LaTeX environment) from a compiled document, which may be useful before attempting to word count the PDF.
#'
#' @author Thomas J. Leeper <thosjleeper@gmail.com>
#' @examples
#' # "R-intro.pdf" manual
#' rintro <- file.path(Sys.getenv("R_HOME"), "doc", "manual", "R-intro.pdf")
#'
#' # Online service at http://www.montereylanguages.com/pdf-word-count-online-free-tool.html
#' # claims the word count to be 36,530 words
#'
#' # Microsoft Word (PDF conversion) word count is 36,869 words
#'
#' word_count(rintro)      # all pages (105 pages, 37870 words)
#' word_count(rintro, 1:3) # pages 1-3
#' word_count(rintro, -1)  # skip first page
#'
#' @import pdftools
#' @import dplyr
#' @import tidytext
#' @export
word_count <-
function(
  document,
  pages = NULL,
  count_numbers = TRUE,
  count_captions = FALSE,
  count_equations = FALSE,
  split_hyphenated = FALSE,
  split_urls = FALSE,
  verbose = getOption("verbose", FALSE)
) {

    # import
    char <- pdftools::pdf_text(document)

    # handle URLs
    ## unnest_tokens() splits URLs by default into multiple tokens
    if (!isTRUE(split_urls)) {
        # borrowed from: https://stackoverflow.com/a/8234912/2338862
        url_regex <- "((([A-Za-z]{3,9}:(?:\\/\\/)?)(?:[-;:&=+$,\\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=+$,\\w]+@)[A-Za-z0-9.-]+)((?:\\/[\\+~%\\/.\\w-_]*)?\\??(?:[-\\\\+=&;%@.\\w_]*)#?(?:[\\w]*))?)"
        char <- gsub(url_regex, "URL", char, perl = TRUE)
    }

    # cleanup hypenations across line breaks
    char <- gsub("-\n", "", char)
    # handle hyphenated words
    ## unnest_tokens() splits URLs by default into multiple tokens
    if (!isTRUE(split_hyphenated)) {
        char <- gsub("(?<=.)-(?=.)", "", char, perl = TRUE)
    }

    # cleanup hypenations across line breaks
    char <- gsub("-\n", "", char)

    all_pages <- seq_len(length(char))

    # subset pages
    to_count <- rep(FALSE, length(char))
    if (!is.null(pages)) {
        ## inclusions
        pos <- pages[pages > 0]
        if (length(pos)) {
            to_count[pos] <- TRUE
        } else {
            to_count[] <- TRUE
        }
        ## exclusions
        neg <- pages[pages < 0]
        if (length(neg)) {
            to_count[abs(neg)] <- FALSE
        }
        ## subset
        char <- char[to_count]
        pages <- pages[to_count]
    } else {
        pages <- all_pages
    }

    # tidy lines
    txt_df <- data.frame(page = pages, text = char, stringsAsFactors = FALSE)
    tidy_lines <- tidytext::unnest_tokens(txt_df, line, text, token = "lines")

    # remove likely figure/title captions
    if (!isTRUE(count_captions)) {
        tidy_lines <- tidy_lines[!grepl("^([Ff]igure)|([Tt]able) [[:digit:]]+ ?[.:,] ?", tidy_lines$line), ]
    }

    # remove likely equations
    if (!isTRUE(count_equations)) {
        tidy_lines <- tidy_lines[!grepl(" +\\([[:digit:]]+\\)$", tidy_lines$line), ]
    }

    # tidy words
    tidy_words <- tidytext::unnest_tokens(tidy_lines, word, line, drop = FALSE)

    # handle numbers
    if (!isTRUE(count_numbers)) {
        suppressWarnings(tidy_words$number <- as.numeric(tidy_words$word))
        tidy_words <- tidy_words[is.na(tidy_words$number), ]
        tidy_words$number <- NULL
    }

    # count and, if verbose, message() the count
    if (isTRUE(verbose)) {
        message(sprintf("Document with %d %s and %d %s",
                        nrow(txt_df),
                        ngettext(nrow(txt_df), "page", "pages"),
                        nrow(tidy_words),
                        ngettext(nrow(tidy_words), "word", "words")))
    }

    # construct page-level data frame of counts to return
    out <- dplyr::ungroup(dplyr::summarize(dplyr::group_by(tidy_words, page), words = n()))

    if (isTRUE(verbose)) {
        invisible(out)
    } else {
        out
    }
}
	# Copyright (c) 2018 Thomas J. Leeper
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	#' @title Word Count a PDF
	#' @description Obtain a Word Count from a PDF
	#' @param document A file path specifying a PDF document.
	#' @param pages Optionally, an integer vector specifying a subset of pages to count from. Negative values serve as negative subsets.
	#' @param count_numbers A logical specifying whether to count numbers as words.
	#' @param count_captions A logical specifying whether to count lines beginning with \dQuote{Table} or \dQuote{Figure} in word count.
	#' @param count_equations A logical specifying whether to count lines ending with \dQuote{([Number])} in word count.
	#' @param split_hyphenated A logical specifying whether to split hyphenated words or expressions as separate words.
	#' @param split_urls A logical specifying whether to split URLs into multiple words when counting.
	#' @param verbose A logical specifying whether to be verbose. If \code{TRUE}, the page and word counts are printed to the console and the result is is returned invisibly. If \code{FALSE}, the result is visible.
	#' @return A data frame with two columns, one specifying page and the other specifying word count for that page.
	#' @details This is useful for obtaining a word count for a LaTeX-compiled PDF. Counting words in the tex source is a likely undercount (due to missing citations, cross-references, and parenthetical citations). Counting words from the PDF is likely over count (due to hyphenation issues, URLs, ligatures, tables and figures, and various other things). This function tries to obtain a word from the PDF while accounting for some of the sources of overcounting.
	#'
	#' It is often desirable to have word counts excluding tables and figures. A solution on TeX StackExchange (\url{https://tex.stackexchange.com/a/352394/30039}) provides guidance on how to exclude tables and figures (or any arbitrary LaTeX environment) from a compiled document, which may be useful before attempting to word count the PDF.
	#'
	#' @author Thomas J. Leeper <thosjleeper@gmail.com>
	#' @examples
	#' # "R-intro.pdf" manual
	#' rintro <- file.path(Sys.getenv("R_HOME"), "doc", "manual", "R-intro.pdf")
	#'
	#' # Online service at http://www.montereylanguages.com/pdf-word-count-online-free-tool.html
	#' # claims the word count to be 36,530 words
	#'
	#' # Microsoft Word (PDF conversion) word count is 36,869 words
	#'
	#' word_count(rintro) # all pages (105 pages, 37870 words)
	#' word_count(rintro, 1:3) # pages 1-3
	#' word_count(rintro, -1) # skip first page
	#'
	#' @import pdftools
	#' @import dplyr
	#' @import tidytext
	#' @export
	word_count <-
	function(
	document,
	pages = NULL,
	count_numbers = TRUE,
	count_captions = FALSE,
	count_equations = FALSE,
	split_hyphenated = FALSE,
	split_urls = FALSE,
	verbose = getOption("verbose", FALSE)
	) {

	# import
	char <- pdftools::pdf_text(document)

	# handle URLs
	## unnest_tokens() splits URLs by default into multiple tokens
	if (!isTRUE(split_urls)) {
	# borrowed from: https://stackoverflow.com/a/8234912/2338862
	url_regex <- "((([A-Za-z]{3,9}:(?:\\/\\/)?)(?:[-;:&=+$,\\w]+@)?[A-Za-z0-9.-]+\|(?:www.\|[-;:&=+$,\\w]+@)[A-Za-z0-9.-]+)((?:\\/[\\+~%\\/.\\w-_])?\\??(?:[-\\\\+=&;%@.\\w_])#?(?:[\\w]*))?)"
	char <- gsub(url_regex, "URL", char, perl = TRUE)
	}

	# cleanup hypenations across line breaks
	char <- gsub("-\n", "", char)
	# handle hyphenated words
	## unnest_tokens() splits URLs by default into multiple tokens
	if (!isTRUE(split_hyphenated)) {
	char <- gsub("(?<=.)-(?=.)", "", char, perl = TRUE)
	}

	# cleanup hypenations across line breaks
	char <- gsub("-\n", "", char)

	all_pages <- seq_len(length(char))

	# subset pages
	to_count <- rep(FALSE, length(char))
	if (!is.null(pages)) {
	## inclusions
	pos <- pages[pages > 0]
	if (length(pos)) {
	to_count[pos] <- TRUE
	} else {
	to_count[] <- TRUE
	}
	## exclusions
	neg <- pages[pages < 0]
	if (length(neg)) {
	to_count[abs(neg)] <- FALSE
	}
	## subset
	char <- char[to_count]
	pages <- pages[to_count]
	} else {
	pages <- all_pages
	}

	# tidy lines
	txt_df <- data.frame(page = pages, text = char, stringsAsFactors = FALSE)
	tidy_lines <- tidytext::unnest_tokens(txt_df, line, text, token = "lines")

	# remove likely figure/title captions
	if (!isTRUE(count_captions)) {
	tidy_lines <- tidy_lines[!grepl("^([Ff]igure)\|([Tt]able) [[:digit:]]+ ?[.:,] ?", tidy_lines$line), ]
	}

	# remove likely equations
	if (!isTRUE(count_equations)) {
	tidy_lines <- tidy_lines[!grepl(" +\\([[:digit:]]+\\)$", tidy_lines$line), ]
	}

	# tidy words
	tidy_words <- tidytext::unnest_tokens(tidy_lines, word, line, drop = FALSE)

	# handle numbers
	if (!isTRUE(count_numbers)) {
	suppressWarnings(tidy_words$number <- as.numeric(tidy_words$word))
	tidy_words <- tidy_words[is.na(tidy_words$number), ]
	tidy_words$number <- NULL
	}

	# count and, if verbose, message() the count
	if (isTRUE(verbose)) {
	message(sprintf("Document with %d %s and %d %s",
	nrow(txt_df),
	ngettext(nrow(txt_df), "page", "pages"),
	nrow(tidy_words),
	ngettext(nrow(tidy_words), "word", "words")))
	}

	# construct page-level data frame of counts to return
	out <- dplyr::ungroup(dplyr::summarize(dplyr::group_by(tidy_words, page), words = n()))

	if (isTRUE(verbose)) {
	invisible(out)
	} else {
	out
	}
	}