Skip to content

Instantly share code, notes, and snippets.

@yinleon
Created January 3, 2022 21:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yinleon/9e0bd70806b90395e35b87ab37366c9d to your computer and use it in GitHub Desktop.
Save yinleon/9e0bd70806b90395e35b87ab37366c9d to your computer and use it in GitHub Desktop.
Tesseract OCR in Python
numpy
tqdm
pdf2image
opencv-python
pytesseract
Pillow
import os
from PIL import Image
import pytesseract
import cv2
from pdf2image import convert_from_path
import numpy as np
from tqdm import tqdm
# Change these variables to the paths of your input and output files.
fn_pdf = ''  # path of the PDF to read; handed to convert_from_path
fn_out = ''  # path of the text file the recognised text is written to
def orc_flow(img):
    """
    Preprocess a PIL image and return the text Tesseract recognises in it.

    The image is reduced to grayscale, binarised with Otsu's method,
    passed through a median filter of aperture size 3, and then handed to
    ``pytesseract.image_to_string``.

    Args:
        img: A PIL image. It is converted to RGB first, so grayscale or
            palette images are accepted as well as RGB ones.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # Normalise to 3-channel RGB so that single-channel or palette images
    # do not break the colour conversion below.
    rgb = np.array(img.convert('RGB'))
    # One direct RGB -> gray conversion; no need to go through BGR first.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    # cv2.threshold returns (retval, image); [1] keeps the image. With
    # THRESH_OTSU the threshold is picked automatically, so the 0 passed
    # here is only a placeholder.
    binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )[1]
    # Median filter to suppress isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)
    return pytesseract.image_to_string(denoised)
# Convert the input PDF into a list of PIL images.
images = convert_from_path(fn_pdf)

# Run OCR on each image in order; tqdm displays a progress bar.
data = [orc_flow(img) for img in tqdm(images)]

# Write the recognised text, one entry after another, each followed by a
# newline. The encoding is set explicitly because OCR output may contain
# characters the platform's default encoding cannot represent.
with open(fn_out, 'w', encoding='utf-8') as f:
    f.writelines(txt + '\n' for txt in data)
@yinleon
Copy link
Author

yinleon commented Jan 3, 2022

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment