Skip to content

Instantly share code, notes, and snippets.

@turicas
Created April 26, 2023 00:22
Show Gist options
  • Save turicas/22219672d78e9fb3dfd3a66bf2e0923c to your computer and use it in GitHub Desktop.
Save turicas/22219672d78e9fb3dfd3a66bf2e0923c to your computer and use it in GitHub Desktop.
Plot PDF text/rect objects using rows + Pillow
# pip install pillow cached-property pdfminer.six https://github.com/turicas/rows/archive/develop.zip
import argparse
from rows.plugins.plugin_pdf import (
RectObject,
TextObject,
PDFMinerBackend,
group_objects,
YGroupsAlgorithm,
plot_objects,
split_object_lines,
)
from PIL.ImageShow import register, DisplayViewer
def build_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting ``--width``, ``--height``,
        ``--page-number`` and the positional ``pdf_filename``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Plot the text/rect objects of one PDF page, first as extracted "
            "and then with text objects split into lines."
        ),
    )
    # Without `type=`, argparse would hand these to plot_objects as strings.
    # NOTE(review): presumably image dimensions in pixels - confirm against
    # rows' plot_objects.
    parser.add_argument("--width", type=int, help="width passed to plot_objects")
    parser.add_argument("--height", type=int, help="height passed to plot_objects")
    parser.add_argument(
        "--page-number",
        type=int,
        default=1,
        help="1-based number of the page to plot (default: 1)",
    )
    parser.add_argument("pdf_filename", help="path of the PDF file to read")
    return parser


def select_page(pages, page_number):
    """Return the ``page_number``-th item (1-based) of ``pages``.

    Iteration stops as soon as the page is found, so the remaining pages of a
    lazy iterable are not consumed.

    Args:
        pages: iterable yielding one item per page.
        page_number: 1-based position of the wanted page.

    Returns:
        The matching page, or ``None`` when ``page_number`` is below 1 or
        beyond the last page.
    """
    for counter, page in enumerate(pages, start=1):
        if counter == page_number:
            return page
    return None


def split_text_lines(objects):
    """Return a new list where every TextObject is replaced by its split lines.

    Objects that are not ``TextObject`` instances are kept as they are, in
    their original position.
    """
    result = []
    for obj in objects:
        if isinstance(obj, TextObject):
            result.extend(split_object_lines(obj))
        else:
            result.append(obj)
    return result


def main(argv=None):
    """Plot the selected PDF page twice: as extracted, then with lines split.

    Args:
        argv: list of command-line arguments; ``None`` makes argparse read
            ``sys.argv``.
    """
    register(DisplayViewer(), 0)  # Use `display` command on Image.show()

    parser = build_parser()
    args = parser.parse_args(argv)

    doc = PDFMinerBackend(args.pdf_filename)
    selected_page = select_page(doc.objects(), args.page_number)
    # Compare with None explicitly: a page without objects may be an empty
    # (falsy) container and is still a valid selection.
    if selected_page is None:
        parser.error(
            "page {} not found in {!r}".format(args.page_number, args.pdf_filename)
        )

    # First image: the objects exactly as extracted from the page.
    img1 = plot_objects(selected_page, width=args.width, height=args.height)
    img1.show()

    # Second image: same page, with each text object split into its lines.
    page_split = split_text_lines(selected_page)
    img2 = plot_objects(page_split, width=args.width, height=args.height)
    img2.show()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment