Created
June 12, 2020 13:43
-
-
Save aso2101/43fe9dc07be904bf0cf0fdd3a4ed2fc2 to your computer and use it in GitHub Desktop.
Python script for OCR using Tesseract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Uses Tesseract (via tesserocr) to recognize the text in every
# image file in a directory. I use this as follows:
#
# 1. split the PDF into JPG images in a directory
#    called "IMAGES" (e.g., pdftoppm -jpeg input.pdf IMAGES/output)
# 2. run this script (python OSD.py), which will
#    produce a text file for each image in "IMAGES".
# 3. concatenate the text files with tail
# 4. if desired, [this step is truncated in the original notes]
# Note that I have explicitly set the language to "kan"
# (Kannada), because this gives better results than
# having Tesseract "guess."
import glob | |
import io | |
import os | |
from PIL import Image | |
from tesserocr import PyTessBaseAPI | |
# Collect every JPG page image under "IMAGES". glob.glob() makes no ordering
# promise, so the list is sorted to process the pages in a stable order.
image_list = sorted(glob.glob("IMAGES/*.jpg"))

# One Tesseract API instance is reused for all pages; the language is fixed
# to "kan" (Kannada) instead of letting Tesseract guess.
with PyTessBaseAPI(lang="kan") as api:
    for img in image_list:
        # The text file sits next to its image: IMAGES/page.jpg -> IMAGES/page.txt
        output = os.path.splitext(img)[0] + ".txt"
        api.SetImageFile(img)
        # Run recognition before touching the output file, so a failure here
        # does not leave an empty or truncated text file behind.
        text = api.GetUTF8Text()
        # Write with "w" rather than "a": each image maps to exactly one text
        # file, and appending would duplicate the page text on every re-run.
        with io.open(output, "w", encoding="utf8") as f:
            f.write(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment