# aso2101/OSD.py

## OSD.py
# Uses Tesseract (tesserocr) to recognize text in the image
#   files in a directory. I use this as follows:
#
#   1. split the PDF into JPG images in a directory
#        called "IMAGES" (e.g., pdftoppm -jpeg input.pdf IMAGES/output)
#   2. run this script (python OSD.py), which will
#        produce a text file for each image in "IMAGES".
#   3. concatenate the text files with tail
#   4. if desired, [this step was left unfinished in the original notes]

# Note that I have explicitly set the language to "kan"
#   (Kannada), because this gives better results than
#   having Tesseract "guess."

import glob
import io
import os
from PIL import Image
from tesserocr import PyTessBaseAPI

# Collect every page image up front, sorted by path so that pages are
# processed in a deterministic order regardless of how glob returns them.
image_list = sorted(glob.glob("IMAGES/*.jpg"))

# A single Tesseract API instance is reused for all pages. The language is
# pinned to Kannada ("kan") instead of letting Tesseract guess.
with PyTessBaseAPI(lang="kan") as api:
    for img in image_list:
        # Each image gets a sibling text file: IMAGES/foo.jpg -> IMAGES/foo.txt
        output = os.path.splitext(img)[0] + ".txt"

        # Run recognition before touching the output file, so a failure here
        # cannot leave a truncated or empty result behind.
        api.SetImageFile(img)
        text = api.GetUTF8Text()

        # Write mode (not append): re-running the script replaces a page's
        # text rather than duplicating it at the end of the existing file.
        with io.open(output, "w", encoding="utf8") as f:
            f.write(text)