Skip to content

Instantly share code, notes, and snippets.

@Ahanmr
Created March 30, 2020 12:26
Show Gist options
  • Save Ahanmr/813a1ba3a507f50b444df7b6c61e58ba to your computer and use it in GitHub Desktop.
Save Ahanmr/813a1ba3a507f50b444df7b6c61e58ba to your computer and use it in GitHub Desktop.
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
# Step 1: render every page of the PDF into an in-memory JPEG blob.
# Wand images wrap native ImageMagick resources, so each one is opened in a
# `with` block and released as soon as its blob has been extracted.
imageBlobs = []
with wi(filename="sample2.pdf", resolution=300) as pdf:
    with pdf.convert('jpeg') as pdfImage:
        for img in pdfImage.sequence:
            # Wrap the single frame in its own Image so it can be serialized.
            with wi(image=img) as imgPage:
                imageBlobs.append(imgPage.make_blob('jpeg'))

# Step 2: run Tesseract OCR over each blob, collecting one string per blob
# in the same order as the frames of `pdfImage.sequence`.
recognized_text = []
for imgBlob in imageBlobs:
    # PIL reads from a file-like object; close the image once OCR is done.
    with Image.open(io.BytesIO(imgBlob)) as im:
        text = pytesseract.image_to_string(im, lang='eng')
    recognized_text.append(text)

# Step 3: emit the list of recognized strings.
print(recognized_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment