Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import io
from PIL import Image
from fpdf import FPDF
from wand.image import Image as wi
import cv2
import numpy as np
import tesserocr as tr
import os
# Render every page of abc.pdf to an image, outline the text lines that
# Tesseract detects on each page, and bundle the annotated pages into
# processed.pdf. Intermediate result<N>.png files are written to the current
# directory and removed again once the output PDF has been saved.
api = tr.PyTessBaseAPI()
try:
    # Rasterise the PDF at 300 DPI and keep one JPEG blob per page. The Wand
    # images are closed as soon as their bytes have been extracted.
    page_blobs = []
    with wi(filename="abc.pdf", resolution=300) as source_pdf:
        with source_pdf.convert('jpeg') as jpeg_pages:
            for page in jpeg_pages.sequence:
                with wi(image=page) as single_page:
                    page_blobs.append(single_page.make_blob('jpeg'))

    # Remember exactly which files this run creates so that only those are
    # embedded and deleted, and so that page order is preserved.
    result_files = []
    for page_number, blob in enumerate(page_blobs):
        # Normalise to RGB so greyscale pages can be handled the same way.
        page_image = Image.open(io.BytesIO(blob)).convert('RGB')
        api.SetImage(page_image)
        boxes = api.GetComponentImages(tr.RIL.TEXTLINE, True)

        # PIL arrays are RGB while cv2.imwrite expects BGR; convert before
        # drawing so the saved page keeps its original colours.
        canvas = cv2.cvtColor(np.array(page_image), cv2.COLOR_RGB2BGR)
        for _, box, _, _ in boxes:
            x, y, w, h = box['x'], box['y'], box['w'], box['h']
            cv2.rectangle(canvas, (x, y), (x + w, y + h), color=(0, 255, 0))

        fname = 'result' + str(page_number) + '.png'
        cv2.imwrite(fname, canvas)
        result_files.append(fname)

    # One PDF page per annotated image, in the source document's page order.
    out_pdf = FPDF()
    for fname in result_files:
        out_pdf.add_page()
        out_pdf.image(fname, 3, 3, 204)
    out_pdf.output('processed.pdf')

    # Clean up the intermediates only after the PDF has been written.
    for fname in result_files:
        os.remove(fname)
except Exception as e:
    # Top-level boundary of the script: report the failure instead of a
    # traceback, matching the original behaviour.
    print(e)
finally:
    # Always release the Tesseract engine, even when processing failed.
    api.End()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment