Skip to content

Instantly share code, notes, and snippets.

@josemarcosrf
Last active October 10, 2023 13:25
Show Gist options
  • Save josemarcosrf/5e40936fa4fc6c1490c41cc36e4dd263 to your computer and use it in GitHub Desktop.
Save josemarcosrf/5e40936fa4fc6c1490c41cc36e4dd263 to your computer and use it in GitHub Desktop.
Quick-and-dirty implementation of text and bounding-box extraction from PDFs, using pdfminer.six==20191110
from collections import defaultdict
import fire
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTChar
from pdfminer.layout import LTTextBox
from pdfminer.layout import LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from rich import print as rprint
def get_char_bboxes(pdf_path: str):
    """Collect every laid-out character of a PDF with its bounding box.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``defaultdict(list)`` mapping the zero-based page index to a list
        of ``(bbox, text)`` tuples, one per ``LTChar`` found inside the
        page's ``LTTextBox`` objects, in the order the layout yields them.
        ``bbox`` is the character's ``bbox`` attribute and ``text`` is the
        result of its ``get_text()``. A page on which no such character is
        found gets no entry.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    per_page_tboxes = defaultdict(list)

    # The page iterator reads from the file lazily, so the whole loop has to
    # run while the file is open; ``with`` guarantees the handle is closed
    # afterwards, even if parsing fails half-way through.
    with open(pdf_path, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for i, page in enumerate(PDFPage.get_pages(fp)):
            print("Processing next page...")
            interpreter.process_page(page)
            layout = device.get_result()

            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                for tline in lobj:
                    for tchar in tline:
                        # Keep only real glyphs; other line children carry
                        # no bounding box of their own.
                        if isinstance(tchar, LTChar):
                            per_page_tboxes[i].append(
                                (tchar.bbox, tchar.get_text())
                            )

    return per_page_tboxes
def _enclosing_box(boxes):
    """Return the smallest ``(x0, y0, x1, y1)`` box containing all *boxes*."""
    x0s, y0s, x1s, y1s = zip(*boxes)
    return (min(x0s), min(y0s), max(x1s), max(y1s))


def _words_with_boxes(tbox_list):
    """Split one line of ``(bbox, char)`` tuples into ``(word, bbox)`` tuples.

    Words are separated by single-space characters; the spaces themselves
    are discarded and do not contribute to any bounding box.
    """
    result = []
    chars = []
    boxes = []
    for coords, c in tbox_list:
        if c != " ":
            chars.append(c)
            boxes.append(coords)
        elif boxes:
            result.append(("".join(chars), _enclosing_box(boxes)))
            chars, boxes = [], []
    # A line rarely ends with a space, so the word still being accumulated
    # when the characters run out must be emitted too.
    if boxes:
        result.append(("".join(chars), _enclosing_box(boxes)))
    return result


def textract(pdf_path: str):
    """Print the words of a PDF, line by line, with their bounding boxes.

    Characters of each page are grouped into lines by their exact ``y0``
    coordinate (lines keep the order in which their first character was
    seen), each line is split into words on space characters, and for every
    line a header plus a list of ``(word, (x0, y0, x1, y1))`` tuples is
    printed with ``rich``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. The result is only printed.
    """
    per_page_bboxes = get_char_bboxes(pdf_path)

    for p_idx, page_bboxes in per_page_bboxes.items():
        # 1. Group by vertical position
        groups = defaultdict(list)
        for tbox in page_bboxes:
            coords, _ = tbox
            groups[coords[1]].append(tbox)

        # 2. Split by space and get the enclosing box
        for i, tbox_list in enumerate(groups.values()):
            tboxes = _words_with_boxes(tbox_list)
            rprint(f"PAGE {p_idx} - LINE: {i}")
            rprint(tboxes)
if __name__ == "__main__":
    # Pinned dependencies this script was written against:
    #   fire==0.5.0
    #   pdfminer.six==20191110
    #   rich==13.6.0
    #
    # Expose `textract` on the command line as the `pdf` sub-command.
    commands = {"pdf": textract}
    fire.Fire(commands)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment