# equaeghe/charcount.py Secret

## charcount.py
# run as `python charcount.py [-v] <pdf file name>`
# (the optional `-v` additionally prints the text extracted from each page)

"""
    A script to count relevant characters in a class of conference paper pdfs

    This script assumes the pdf is generated with LaTeX from a conference
    template that includes the introduction as the first numbered section
    and a reference list generated using Bib(La)TeX.
    (Papers from UAI 2021 for example typically satisfy these criteria.)

    It prints the number of non-whitespace characters on each page, starting
    from the page with the introduction and ending with the last page before
    the reference entry first referenced, followed by the total character
    count over all counted pages.
    (This page range is just an example. You'll likely want better control
    over the range.)

    Pages with an atypically low number of characters may correspond to those
    with a relatively large fraction of graphical content. Pages with an
    atypically high number of characters may indicate text extraction errors
    (pypdf does a decent job, but is not perfect).
"""

import pypdf
import sys
import re

# Count the non-whitespace characters on each page from the introduction up to
# (but not including) the page holding the first "cite*" named destination,
# printing a per-page count and the total over the counted pages.

if len(sys.argv) < 2:
    # Without an argument, argv[-1] would be this script itself.
    sys.exit("usage: python charcount.py [-v] <pdf file name>")

pdf = pypdf.PdfReader(sys.argv[-1])  # argv[-1] should be <pdf file name>
verbose = sys.argv[-2] == "-v"  # optional flag directly before the file name

# Read the destinations once instead of once per lookup.
destinations = pdf.named_destinations

if "section.1" not in destinations:
    sys.exit('no "section.1" named destination found; cannot locate the introduction')
intro_destination = destinations["section.1"]
page_to_start_counting = pdf.get_destination_page_number(intro_destination)

# First destination (in the reader's iteration order) whose name starts with
# "cite"; None when the pdf has no such destination.
first_ref_destination = next(
    (
        destination
        for name, destination in destinations.items()
        if name.startswith("cite")
    ),
    None,
)
if first_ref_destination is None:
    sys.exit('no "cite..." named destination found; cannot locate the reference list')
page_to_end_counting = pdf.get_destination_page_number(first_ref_destination)

# NOTE(review): pypdf documents the page number as optional; guard so that
# range() below does not fail with an obscure TypeError -- confirm against the
# installed pypdf version.
if page_to_start_counting is None or page_to_end_counting is None:
    sys.exit("could not resolve the page number of a named destination")

whitespace = re.compile(r"\s+", flags=re.UNICODE)

total_characters = 0
for k in range(page_to_start_counting, page_to_end_counting):
    text = pdf.pages[k].extract_text()
    if verbose:
        print("\n", text, "\n")  # useful for investigating text extraction quality
    # remove whitespace from string of characters before counting
    characters = len(whitespace.sub("", text))
    print(f"page {k+1}: {characters}")  # k is 0-based; report 1-based page numbers
    total_characters += characters
print(f"all counted pages: {total_characters}")
	# run as `python charcount.py [-v] <pdf file name>`
	# (the optional `-v` additionally prints the text extracted from each page)

	"""
	A script to count relevant characters in a class of conference paper pdfs

	This script assumes the pdf is generated with LaTeX from a conference
	template that includes the introduction as the first numbered section
	and a reference list generated using Bib(La)TeX.
	(Papers from UAI 2021 for example typically satisfy these criteria.)

	It prints the number of non-whitespace characters on each page, starting
	from the page with the introduction and ending with the last page before
	the reference entry first referenced, followed by the total character
	count over all counted pages.
	(This page range is just an example. You'll likely want better control
	over the range.)

	Pages with an atypically low number of characters may correspond to those
	with a relatively large fraction of graphical content. Pages with an
	atypically high number of characters may indicate text extraction errors
	(pypdf does a decent job, but is not perfect).
	"""

	import pypdf
	import sys
	import re

	# Count the non-whitespace characters on each page from the introduction up
	# to (but not including) the page holding the first "cite*" named
	# destination, printing a per-page count and the total over the counted pages.
	# The nesting of the loops and conditionals, lost in this copy, is restored.

	if len(sys.argv) < 2:
	    # Without an argument, argv[-1] would be this script itself.
	    sys.exit("usage: python charcount.py [-v] <pdf file name>")

	pdf = pypdf.PdfReader(sys.argv[-1])  # argv[-1] should be <pdf file name>
	verbose = sys.argv[-2] == "-v"  # optional flag directly before the file name

	# Read the destinations once instead of once per lookup.
	destinations = pdf.named_destinations

	if "section.1" not in destinations:
	    sys.exit('no "section.1" named destination found; cannot locate the introduction')
	intro_destination = destinations["section.1"]
	page_to_start_counting = pdf.get_destination_page_number(intro_destination)

	# First destination (in the reader's iteration order) whose name starts with
	# "cite"; None when the pdf has no such destination.
	first_ref_destination = next(
	    (
	        destination
	        for name, destination in destinations.items()
	        if name.startswith("cite")
	    ),
	    None,
	)
	if first_ref_destination is None:
	    sys.exit('no "cite..." named destination found; cannot locate the reference list')
	page_to_end_counting = pdf.get_destination_page_number(first_ref_destination)

	# NOTE(review): pypdf documents the page number as optional; guard so that
	# range() below does not fail with an obscure TypeError -- confirm against
	# the installed pypdf version.
	if page_to_start_counting is None or page_to_end_counting is None:
	    sys.exit("could not resolve the page number of a named destination")

	whitespace = re.compile(r"\s+", flags=re.UNICODE)

	total_characters = 0
	for k in range(page_to_start_counting, page_to_end_counting):
	    text = pdf.pages[k].extract_text()
	    if verbose:
	        print("\n", text, "\n")  # useful for investigating text extraction quality
	    # remove whitespace from string of characters before counting
	    characters = len(whitespace.sub("", text))
	    print(f"page {k+1}: {characters}")  # k is 0-based; report 1-based page numbers
	    total_characters += characters
	print(f"all counted pages: {total_characters}")