Skip to content

Instantly share code, notes, and snippets.

@satish860
Created December 21, 2023 15:40
Show Gist options
  • Save satish860/1f8a5125f76f0482f1956fc2db6ded52 to your computer and use it in GitHub Desktop.
Save satish860/1f8a5125f76f0482f1956fc2db6ded52 to your computer and use it in GitHub Desktop.
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from reportlab.pdfgen import canvas
from PyPDF2 import PdfReader, PdfWriter
import io
import numpy as np
import cv2
from tqdm import tqdm
# Lazily created PaddleOCR engine, shared by every call to process_page().
_ocr_engine = None


def _get_ocr_engine():
    """Return the OCR engine to use, creating a default one on first use."""
    global _ocr_engine
    # Honour a module-level ``model`` if the surrounding file defines one.
    configured = globals().get("model")
    if configured is not None:
        return configured
    if _ocr_engine is None:
        # use_angle_cls=True is required for the ``cls=True`` flag passed to
        # ``ocr()`` below to have any effect.
        # NOTE(review): language is assumed to be English -- adjust if needed.
        _ocr_engine = PaddleOCR(use_angle_cls=True, lang="en")
    return _ocr_engine


def process_page(page_num, pil_image, input_pdf_path):
    """OCR one rendered page and return a PDF page carrying the text.

    Args:
        page_num: Zero-based index of the page inside ``input_pdf_path``.
        pil_image: PIL image of that page (any mode; converted to RGB).
        input_pdf_path: Path of the source PDF; it is re-opened on every call
            to fetch the original page.

    Returns:
        A PyPDF2 page object: a freshly drawn text layer with the original
        page's content merged onto it.
    """
    if pil_image.mode != 'RGB':
        pil_image = pil_image.convert('RGB')
    # PaddleOCR works on OpenCV-style arrays, which are BGR rather than RGB.
    image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    img_height, img_width = image.shape[:2]

    ocr_results = _get_ocr_engine().ocr(image, cls=True)

    # The text layer must use the original page's size (in PDF points), not
    # the canvas default, and OCR pixel coordinates must be scaled to points.
    # NOTE(review): assumes an unrotated page whose media box starts at the
    # origin -- confirm for rotated or cropped inputs.
    original_pdf = PdfReader(input_pdf_path)
    original_page = original_pdf.pages[page_num]
    page_width = float(original_page.mediabox.width)
    page_height = float(original_page.mediabox.height)
    scale_x = page_width / img_width
    scale_y = page_height / img_height

    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=(page_width, page_height))
    # PaddleOCR yields one entry per image; the entry is None when no text
    # was detected, so guard before iterating.
    for line in ocr_results or []:
        if not line:
            continue
        for bbox, (text, _) in line:
            # Box corners run clockwise from the top-left: tl, tr, br, bl.
            top_left, bottom_left = bbox[0], bbox[3]
            # Approximate the glyph size by the box height, in points.
            font_size = max(
                1.0, (float(bottom_left[1]) - float(top_left[1])) * scale_y
            )
            can.setFont("Helvetica", font_size)
            # drawString positions the text baseline, so anchor at the bottom
            # of the box; PDF y grows upwards while image y grows downwards.
            can.drawString(
                float(bottom_left[0]) * scale_x,
                page_height - float(bottom_left[1]) * scale_y,
                text,
            )
    # Close the page explicitly so exactly one page exists even when nothing
    # was drawn.
    can.showPage()
    can.save()

    # Re-read the generated bytes as a PDF and merge the original page into it.
    packet.seek(0)
    overlay_page = PdfReader(packet).pages[0]
    overlay_page.merge_page(original_page)
    return overlay_page
def ocr_pdf_to_text_pdf(input_pdf_path, output_pdf_path):
    """Build an OCR-processed copy of a PDF.

    Every page of ``input_pdf_path`` is rendered to an image, passed through
    ``process_page`` and appended to a new document, which is then written to
    ``output_pdf_path``. A tqdm progress bar tracks the pages.
    """
    rendered_pages = convert_from_path(input_pdf_path)
    writer = PdfWriter()

    progress = tqdm(enumerate(rendered_pages), total=len(rendered_pages))
    for index, rendered in progress:
        writer.add_page(process_page(index, rendered, input_pdf_path))

    # Persist the assembled document.
    with open(output_pdf_path, "wb") as sink:
        writer.write(sink)
# Example run: OCR the scanned volume and write the result under a new name.
input_pdf_path, output_pdf_path = 'SOC VOL. 3.pdf', 'OCR_SOC VOL. 3.pdf'
ocr_pdf_to_text_pdf(
    input_pdf_path=input_pdf_path,
    output_pdf_path=output_pdf_path,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment