# nalf3in/pdf_to_searchabled_pdf.py

## pdf_to_searchabled_pdf.py
#TESSERACT NEEDS TO BE INSTALLED FOR THIS TO WORK
# %%
#Imports
import os
import fitz
import pytesseract

# %%
# Working folders: one for the rendered page images, one for the per-page
# PDFs produced by the OCR step.
IMG_OUT_FOLDER = "tmp/imgs/"
PDF_OUT_FOLDER = "tmp/pdfs/"

# Create both folders (including missing parents); exist_ok keeps this from
# failing when they are already there.
for _folder in (IMG_OUT_FOLDER, PDF_OUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)


# %%
# Render each page of the input PDF to its own PNG file.
# PDF_FILE is the input path without the ".pdf" extension; it is reused
# later to name the converted output.
PDF_FILE = "7-Infrastructure d'éxécution"
doc = fitz.open(f"{PDF_FILE}.pdf")
NUMBER_OF_PAGES = len(doc)

# NOTE(review): loadPage/getPixmap/writePNG are the camelCase PyMuPDF names;
# newer releases presumably expect snake_case equivalents - confirm against
# the installed version.
for page_index in range(NUMBER_OF_PAGES):
    # One image per page, named img<index>.png, rendered with getPixmap()'s
    # default arguments.
    png_path = f"{IMG_OUT_FOLDER}img{page_index}.png"
    doc.loadPage(page_index).getPixmap().writePNG(png_path)


# %%
# Run OCR on each page image and store the result as a one-page PDF.
# The Tesseract executable location is hard-coded to a Windows install path.
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract'
for page_index in range(NUMBER_OF_PAGES):
    image_path = f'{IMG_OUT_FOLDER}img{page_index}.png'
    page_pdf_path = f'{PDF_OUT_FOLDER}pdf{page_index}.pdf'
    # With extension='pdf' the call returns the PDF content as bytes.
    ocr_bytes = pytesseract.image_to_pdf_or_hocr(image_path, extension='pdf')
    with open(page_pdf_path, 'w+b') as page_pdf_file:
        page_pdf_file.write(ocr_bytes)


# %%
# Merge the per-page PDFs generated by Tesseract and write the final PDF.
# A single-page input is handled too: pdf0.pdf is then saved as the result
# without any insertions.
if NUMBER_OF_PAGES > 0:
    # pdf0.pdf is the base document; every other page is appended to it.
    doc = fitz.open(f"{PDF_OUT_FOLDER}pdf0.pdf")

    # Start at 1 because page 0 is already the base document, and run up to
    # NUMBER_OF_PAGES - 1 inclusive so the last page is not dropped.
    for i in range(1, NUMBER_OF_PAGES):
        page_pdf = fitz.open(f"{PDF_OUT_FOLDER}pdf{i}.pdf")
        doc.insertPDF(page_pdf)
        # Release the per-page handle once its content has been inserted.
        page_pdf.close()

    doc.save(f"{PDF_FILE}_converted.pdf")
	# TESSERACT NEEDS TO BE INSTALLED FOR THIS TO WORK
# NOTE(review): the lines above this section already contain the same
# pipeline; this copy re-runs it and overwrites the same output files.
# Consider deleting one of the two copies.
# %%
# Imports
import os
import fitz
import pytesseract

# %%
# Create the working folders if necessary
IMG_OUT_FOLDER = "tmp/imgs/"
PDF_OUT_FOLDER = "tmp/pdfs/"
os.makedirs(IMG_OUT_FOLDER, exist_ok=True)
os.makedirs(PDF_OUT_FOLDER, exist_ok=True)


# %%
# Convert the PDF to one PNG image per page
PDF_FILE = "7-Infrastructure d'éxécution"
doc = fitz.open(f"{PDF_FILE}.pdf")

NUMBER_OF_PAGES = len(doc)

# NOTE(review): loadPage/getPixmap/writePNG/insertPDF are the camelCase
# PyMuPDF names; newer releases presumably expect snake_case equivalents -
# confirm against the installed version.
for i in range(NUMBER_OF_PAGES):
    page = doc.loadPage(i)  # i is the zero-based page number
    pix = page.getPixmap()
    output = f"{IMG_OUT_FOLDER}img{i}.png"
    pix.writePNG(output)


# %%
# Convert the images to searchable one-page PDFs
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract'
for i in range(NUMBER_OF_PAGES):
    pdf = pytesseract.image_to_pdf_or_hocr(f'{IMG_OUT_FOLDER}img{i}.png', extension='pdf')
    with open(f'{PDF_OUT_FOLDER}pdf{i}.pdf', 'w+b') as f:
        f.write(pdf)  # pdf type is bytes by default


# %%
# Merge the PDFs generated by Tesseract and write the final PDF.
# A single-page input is handled too: pdf0.pdf is then saved unchanged.
if NUMBER_OF_PAGES > 0:
    # pdf0.pdf is the base document; every other page is appended to it.
    doc = fitz.open(f"{PDF_OUT_FOLDER}pdf0.pdf")

    # Start at 1 because page 0 is already the base document, and run up to
    # NUMBER_OF_PAGES - 1 inclusive so the last page is not dropped.
    for i in range(1, NUMBER_OF_PAGES):
        page_pdf = fitz.open(f"{PDF_OUT_FOLDER}pdf{i}.pdf")
        doc.insertPDF(page_pdf)
        # Release the per-page handle once its content has been inserted.
        page_pdf.close()

    doc.save(f"{PDF_FILE}_converted.pdf")