Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save obikastanya/311f24513d414ca29a662ba7fa871e16 to your computer and use it in GitHub Desktop.

Select an option

Save obikastanya/311f24513d414ca29a662ba7fa871e16 to your computer and use it in GitHub Desktop.
Extract text from scanned PDF files using Tesseract OCR
import pytesseract
from pypdfium2 import PdfDocument
def parse_pdf(
    path_or_io: str | bytes,
    pill_scale: int = 2,
    lang: str = "eng",
    page_sep: str = "\n\n",
    config: str = "",
) -> str:
    """Extract text from a PDF by running Tesseract OCR on each rendered page.

    Each page is rendered to a PIL image via pypdfium2, handed to
    ``pytesseract.image_to_string``, and the per-page texts are joined with
    ``page_sep``. Progress is reported on stdout.

    Args:
        path_or_io: Input passed unchanged to ``PdfDocument``.
        pill_scale: Value passed as ``scale`` to ``page.render``.
        lang: Tesseract language code(s). Used with the built-in default
            options and also with a custom ``config``, unless that config
            already contains its own ``-l`` flag.
        page_sep: Separator inserted between the texts of consecutive pages.
        config: Tesseract command-line options. When empty, a built-in
            default is used.

    Returns:
        The OCR text of all pages joined by ``page_sep`` (an empty string for
        a document without pages).
    """
    if not config:
        config = (
            f"-l {lang} --oem 1 --psm 6 "
            "-c preserve_interword_spaces=1 "
            "-c tessedit_do_invert=0 "
            "-c tosp_min_sane_kn_sp=2.8"
        )
    elif "-l" not in config.split():
        # A custom config used to make `lang` a silent no-op; honour it
        # unless the caller selected a language in the config explicitly.
        config = f"-l {lang} {config}"

    pdf = PdfDocument(path_or_io)
    pages = []
    try:
        # Inside the try so the document is closed even if this fails.
        total_pages = len(pdf)
        for page_idx in range(total_pages):
            print(f"Processing page {page_idx + 1}/{total_pages}...")
            page = pdf.get_page(page_idx)
            try:
                page_img = page.render(scale=pill_scale).to_pil()
            finally:
                # Release the page handle even when rendering raises.
                page.close()
            try:
                pages.append(
                    pytesseract.image_to_string(page_img, config=config)
                )
            finally:
                try:
                    page_img.close()
                except AttributeError:
                    # Best effort: tolerate image objects without close().
                    pass
                del page_img  # Drop the reference before the next render.
    finally:
        pdf.close()
    print("OCR complete.")
    return page_sep.join(pages)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment