Last active August 31, 2015 00:51
Recursive batch OCR script (Requires Tesseract-OCR or Cuneiform).
import json
import os
import sys
from optparse import OptionParser

from PIL import Image
from pyocr import pyocr
from pyocr.builders import TextBuilder
def get_opt():
    """Parse the command-line options for the batch OCR run.

    Recognised options:
      -d / --directory  target directory (required)
      -l / --lang       OCR language, defaults to "jpn"
      -c / --config     extra config argument, stored as ``options.args``
      -o / --output     output file (declared but not used by this script)

    Returns:
        The optparse ``Values`` object holding the parsed options.

    Raises:
        SystemExit: when no target directory was supplied.
    """
    parser = OptionParser()
    parser.add_option("-d", "--directory", action="store", dest="directory",
                      help="[Required] Target directory")
    parser.add_option("-l", "--lang", action="store", dest="lang",
                      help="Language", default="jpn")
    parser.add_option("-c", "--config", action="store", dest="args",
                      help="Other config args (e.g. digits, only works with Tesseract)")
    parser.add_option("-o", "--output", action="store", dest="output_file",
                      help="Output file (Not supported at this time.)")
    # Positional arguments are not used by this script.
    options, _ = parser.parse_args()

    # NOTE(review): the original condition and its body were truncated in the
    # source; rejecting a missing directory with a usage error is the
    # reconstruction -- confirm against the original script.
    if not options.directory:
        parser.error("a target directory is required (-d/--directory)")

    # Single parenthesised argument keeps this valid on Python 2 and 3.
    print("[DEBUG] Entering directory %s ..." % options.directory)
    return options
def select_tool():
    """Pick the OCR backend to use.

    Takes the first entry of ``pyocr.get_available_tools()`` and reports its
    name on stdout.

    Returns:
        The selected pyocr tool module.

    Raises:
        SystemExit: when pyocr reports no available OCR tool.
    """
    tools = pyocr.get_available_tools()
    if not tools:
        print("[EMERG] There are no OCR tools available.")
        # Without this exit the code fell through to tools[0] and died with
        # an IndexError right after printing the message.
        sys.exit(1)

    tool = tools[0]
    print("[DEBUG] Using %s for OCR..." % tool.get_name())
    return tool
def process(matches, options):
    """Run OCR on each image and print one JSON object per file.

    Args:
        matches: iterable of image file paths.
        options: parsed options; ``options.lang`` is passed to the OCR tool
            and ``options.args`` (if set) becomes the builder's Tesseract
            config list.

    Each output line is a JSON object with the keys ``filename`` and
    ``text``, emitted with ``ensure_ascii=False``.
    """
    tool = select_tool()
    builder = TextBuilder()
    # Only override the builder's configs when -c/--config was given;
    # otherwise the list would become [None].
    if options.args:
        builder.tesseract_configs = [options.args]

    for f in matches:
        # NOTE(review): the image argument was missing in the truncated
        # source; Image.open(f) is the reconstruction based on the PIL import.
        text = tool.image_to_string(Image.open(f), lang=options.lang,
                                    builder=builder)
        print(json.dumps({"filename": f, "text": text}, ensure_ascii=False))
if __name__ == "__main__":
    # Script entry point: collect every PNG/JPEG under the target directory
    # (recursively) and hand the list to process().
    options = get_opt()
    matches = []
    # NOTE(review): the walk root and the append below were lost in the
    # truncated source; options.directory / os.path.join(root, f) are the
    # reconstruction -- confirm against the original script.
    for root, dirnames, filenames in os.walk(options.directory):
        for f in filenames:
            if f.endswith(('.png', '.jpg', '.jpeg')):
                matches.append(os.path.join(root, f))
    print("[DEBUG] Processing %d images..." % len(matches))
    process(matches, options)
tellts commented Jan 22, 2015

Sorry for my bad English. Which operating systems does this script run on, and which file formats does it handle?

I'm sorry that I didn't notice your comment for quite a while.

This script should run on any operating system supporting Python 2.7 and Tesseract OCR.
To use this on Ubuntu 14.04, you should install tesseract-ocr using APT, then PIL and pyocr using pip.

PIL supports many image types, but this script only processes PNG and JPEG files (those ending in .png, .jpg, or .jpeg).

