Skip to content

Instantly share code, notes, and snippets.

@nalf3in
Last active October 17, 2020 19:35
Show Gist options
  • Save nalf3in/9d3dd574ea88a11542c77518879a8da5 to your computer and use it in GitHub Desktop.
Save nalf3in/9d3dd574ea88a11542c77518879a8da5 to your computer and use it in GitHub Desktop.
Script to convert an unsearchable PDF into a searchable one using Tesseract (I should be studying instead of making this)
#TESSERACT NEEDS TO BE INSTALLED FOR THIS TO WORK
# %%
#Imports
import os
import fitz
import pytesseract
# %%
# Working directories for the intermediate page images and per-page PDFs.
# Both paths keep their trailing slash because later cells build file names
# by plain string concatenation.
IMG_OUT_FOLDER = "tmp/imgs/"
PDF_OUT_FOLDER = "tmp/pdfs/"
for folder in (IMG_OUT_FOLDER, PDF_OUT_FOLDER):
    # exist_ok=True makes re-running this cell harmless
    os.makedirs(folder, exist_ok=True)
# %%
# Render every page of the source PDF to its own PNG image
# (img0.png, img1.png, ... inside IMG_OUT_FOLDER).
PDF_FILE = "7-Infrastructure d'éxécution"  # input file name, without the ".pdf" extension
doc = fitz.open(f"{PDF_FILE}.pdf")
NUMBER_OF_PAGES = len(doc)  # reused by the OCR and merge cells below
for page_index in range(NUMBER_OF_PAGES):
    png_path = f"{IMG_OUT_FOLDER}img{page_index}.png"
    # load the page, rasterise it with the default settings, write it out
    doc.loadPage(page_index).getPixmap().writePNG(png_path)
# %%
# Run Tesseract OCR on each page image and store the result as a
# one-page searchable PDF (pdf0.pdf, pdf1.pdf, ... inside PDF_OUT_FOLDER).
#
# The hard-coded executable path only makes sense on Windows. Everywhere
# else, leave pytesseract's own default in place so the `tesseract` binary
# is looked up on PATH instead of failing on a non-existent path.
if os.name == "nt":
    pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract'
for i in range(NUMBER_OF_PAGES):
    pdf = pytesseract.image_to_pdf_or_hocr(f'{IMG_OUT_FOLDER}img{i}.png', extension='pdf')
    # the file is only written, never read back, so plain 'wb' is enough
    with open(f'{PDF_OUT_FOLDER}pdf{i}.pdf', 'wb') as f:
        f.write(pdf)  # pdf type is bytes by default
# %%
# Merge the per-page PDFs generated by Tesseract, in page order, and write
# the final searchable PDF next to the input as "<PDF_FILE>_converted.pdf".
#
# Start from a new, empty document and append every page PDF exactly once.
# (Opening pdf0.pdf and then looping over range(NUMBER_OF_PAGES-1) inserted
# page 0 twice and never inserted the last page; it also did not produce an
# OCR'd result for single-page inputs.)
merged = fitz.open()  # no arguments -> new empty PDF
for i in range(NUMBER_OF_PAGES):
    page_pdf = fitz.open(f"{PDF_OUT_FOLDER}pdf{i}.pdf")
    merged.insertPDF(page_pdf)
    page_pdf.close()  # release the file handle once its page has been copied
merged.save(f"{PDF_FILE}_converted.pdf")
merged.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment