Skip to content

Instantly share code, notes, and snippets.

@mitya57
Last active October 20, 2015 10:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mitya57/f616f32514e1f13da495 to your computer and use it in GitHub Desktop.
Save mitya57/f616f32514e1f13da495 to your computer and use it in GitHub Desktop.
Statistical information about PDFs
# Get statistical information about PDFs
# Author: 2015 Dmitry Shachnev <mitya57@gmail.com>
# Required packages (in Debian/Ubuntu):
# - gir1.2-poppler-0.18
# - python3-gi or python-gi
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import GLib, Poppler
# Unit-conversion factors.
CMS_IN_INCH = 2.54  # centimeters per inch
CMS_IN_PT = CMS_IN_INCH / 72  # centimeters per point (the division implies 72 points per inch)
# Divisors used by get_quires_count() to turn a measurement into quires.
SQUARE_CMS_IN_QUIRE = 3000  # square centimeters of area per quire
# NOTE(review): not referenced anywhere in the visible code -- presumably a
# minimum character count per page; confirm it is still needed.
CHARS_IN_PAGE_THRESHOLD = 1000
CHARS_IN_QUIRE = 25000  # characters of text per quire
class PdfInfo(object):
    """Plain container for the statistics gathered about one PDF document.

    Every field has a class-level default; get_info() overwrites them on the
    instance it returns.
    """

    # Integer counters: pages, text characters and images.
    pages_count = characters_count = images_count = 0

    # Floating-point measurements: estimated quires, total page area and
    # total image area (both areas in square centimeters).
    quires_count = total_area = images_area = 0.0
def _PopplerDocument_get_pages(document):
    """Lazily yield each page of ``document``, in index order.

    The page count is read once, when iteration starts; pages are then
    fetched one at a time with ``document.get_page(index)``.
    """
    index = 0
    page_total = document.get_n_pages()
    while index < page_total:
        yield document.get_page(index)
        index += 1
# Attach the generator to Poppler.Document so that the functions below can
# call document.get_pages() as if it were a regular method.
Poppler.Document.get_pages = _PopplerDocument_get_pages
def get_area(document):
    """Return the combined area of all pages of the document, in square
    centimeters, rounded to four decimal places."""
    area_sum = 0
    for pdf_page in document.get_pages():
        # Each side is converted from points to centimeters and rounded to
        # four decimals before the two sides are multiplied.
        side_a_cm, side_b_cm = (
            round(side_pt * CMS_IN_PT, 4) for side_pt in pdf_page.get_size()
        )
        area_sum += side_a_cm * side_b_cm
    return round(area_sum, 4)
def get_quires_count(document, use_square=False):
    """Return the approximate number of quires needed to print the document.

    With ``use_square`` set, the estimate is the total page area divided by
    ``SQUARE_CMS_IN_QUIRE``.  Otherwise it is the sum of a text part
    (character count divided by ``CHARS_IN_QUIRE``) and an image part (total
    image area divided by ``SQUARE_CMS_IN_QUIRE``).

    The result is always a float, on both Python 2 and Python 3.
    """
    if use_square:
        return float(get_area(document)) / SQUARE_CMS_IN_QUIRE
    images_area = get_images_info(document)[1]
    characters_count = get_characters_count(document)
    # float() forces true division.  Both operands of the text part are
    # integers, so under Python 2 a plain "/" would truncate the result
    # (e.g. 24456 / 25000 == 0) and silently drop the text contribution.
    quires_from_text = float(characters_count) / CHARS_IN_QUIRE
    quires_from_images = float(images_area) / SQUARE_CMS_IN_QUIRE
    return quires_from_text + quires_from_images
def get_characters_count(document):
    """Return the number of text characters in the document.

    The text of every page (``page.get_text()``) is measured and the lengths
    are added up.  Characters are counted, not bytes: when the bindings hand
    the text back as a UTF-8 byte string it is decoded before measuring.
    Pages that yield no text contribute nothing.
    """
    total = 0
    for page in document.get_pages():
        text = page.get_text()
        if not text:
            # Empty string -- or None, should the binding ever return it.
            continue
        if isinstance(text, bytes):
            # Python 2 bindings return UTF-8 encoded ``str``; len() on that
            # would count bytes, inflating the result for non-ASCII text.
            text = text.decode('utf-8', 'replace')
        total += len(text)
    return total
def get_images_info(document):
    """Return a ``(count, area)`` tuple: how many images the document
    contains and their combined area in square centimeters."""
    found = 0
    area_sqcm = 0
    for pdf_page in document.get_pages():
        for mapping in pdf_page.get_image_mapping():
            # The mapping's ``area`` rectangle gives the image corners;
            # each side is converted to centimeters before multiplying.
            box = mapping.area
            width_cm = (box.x2 - box.x1) * CMS_IN_PT
            height_cm = (box.y2 - box.y1) * CMS_IN_PT
            area_sqcm += width_cm * height_cm
            found += 1
    return found, area_sqcm
def get_info(document):
    """Collect all statistics about the document into a new PdfInfo object
    and return it."""
    summary = PdfInfo()
    summary.pages_count = document.get_n_pages()
    summary.characters_count = get_characters_count(document)
    summary.quires_count = get_quires_count(document)
    # get_images_info() reports the image count and the image area together.
    image_total, image_area = get_images_info(document)
    summary.images_count = image_total
    summary.images_area = image_area
    summary.total_area = get_area(document)
    return summary
def document_from_file(filename):
    """Open the PDF stored at ``filename`` and return the Poppler document.

    ``filename`` may be absolute or relative to the current working
    directory.  The document is opened without a password.  Errors reported
    by GLib or Poppler (missing file, damaged PDF, ...) propagate to the
    caller unchanged.
    """
    # Local import keeps this function self-contained; the module itself
    # does not otherwise need ``os``.
    import os.path

    # g_filename_to_uri() rejects relative paths, so resolve the name first.
    # The second argument is the (absent) hostname.
    uri = GLib.filename_to_uri(os.path.abspath(filename), None)
    # The second argument is the document password; None means no password.
    return Poppler.Document.new_from_file(uri, None)
@mitya57
Copy link
Author

mitya57 commented Oct 20, 2015

Пример использования:

>>> import pdfinfo
>>> document = pdfinfo.document_from_file('/home/dmitry/work/referat/work.pdf')
>>> info = pdfinfo.get_info(document)
>>> info.pages_count
13
>>> info.characters_count
24456
>>> info.images_count
1
>>> info.total_area    # в см²
8108.1
>>> info.images_area   # в см²
155.5154197530864
>>> info.quires_count  # печатные листы
1.0300784732510289

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment