# chris-x86-64/ocr.py

## ocr.py
import os
import sys
import json
from optparse import OptionParser
from PIL import Image
from pyocr import pyocr
from pyocr.builders import TextBuilder

def get_opt():
	"""Parse the command-line options for the OCR run.

	Options are read from ``sys.argv`` via ``optparse``. ``--directory`` is
	mandatory: when it is missing (or empty) the usage text is printed and
	the process exits with status 1.

	Returns:
		The ``optparse`` values object with the attributes ``directory``,
		``lang`` (defaults to ``"jpn"``), ``args`` (``None`` unless
		``--config`` is given) and ``output_file`` (``None`` unless
		``--output`` is given).

	Raises:
		SystemExit: with code 1 when no target directory was supplied.
	"""
	parser = OptionParser()
	parser.add_option("-d", "--directory", action="store", dest="directory", help="[Required] Target directory")
	parser.add_option("-l", "--lang", action="store", dest="lang", help="Language", default="jpn")
	parser.add_option("-c", "--config", action="store", dest="args", help="Other config args (e.g. digits, only works with Tesseract)")
	parser.add_option("-o", "--output", action="store", dest="output_file", help="Output file (Not supported at this time.)")
	# Positional arguments are parsed but not used by this function.
	options, _positional = parser.parse_args()
	if not options.directory:
		parser.print_help()
		sys.exit(1)
	# Single-argument print(...) behaves the same on Python 2 and Python 3,
	# whereas the bare print statement is a SyntaxError on Python 3.
	print("[DEBUG] Entering directory %s ..." % options.directory)
	return options

def select_tool():
	"""Return the OCR tool to use.

	The first entry of ``pyocr.get_available_tools()`` is chosen and its
	name is reported on stdout.

	Raises:
		SystemExit: with code 1 when pyocr reports no available tools.
	"""
	tools = pyocr.get_available_tools()
	if not tools:
		# Single-argument print(...) works on both Python 2 and Python 3.
		print("[EMERG] There are no OCR tools available.")
		sys.exit(1)
	tool = tools[0]
	print("[DEBUG] Using %s for OCR..." % (tool.get_name()))
	return tool

def process(matches, options):
	"""Run OCR on each image path and print one JSON object per image.

	Args:
		matches: Iterable of image file paths to recognise.
		options: Parsed options object. ``options.lang`` is passed to the
			OCR tool as the language; ``options.args``, when set, replaces
			the builder's ``tesseract_configs``.

	Each result is printed as ``{"filename": ..., "text": ...}`` encoded
	with ``ensure_ascii=False``.
	"""
	tool = select_tool()
	builder = TextBuilder()
	# options.args is None when -c/--config is not given. Assigning [None]
	# would discard the builder's default configs and leave a None entry in
	# the list (presumably forwarded to the Tesseract command line -- verify
	# against the pyocr version in use), so only override when a value exists.
	if options.args:
		builder.tesseract_configs = [options.args]
	for path in matches:
		text = tool.image_to_string(Image.open(path), lang=options.lang, builder=builder)
		# Single-argument print(...) works on both Python 2 and Python 3.
		print(json.dumps({"filename": path, "text": text}, ensure_ascii=False))


if __name__ == "__main__":
	# Script entry point: collect every image under the target directory
	# (recursively) and hand the list to process().
	#
	# NOTE: a second, mis-indented copy of this whole module used to follow
	# the process() call inside this block. It made the file a SyntaxError
	# (a "def" with no indented body) and duplicated the definitions above
	# verbatim, so it has been removed.
	options = get_opt()
	image_suffixes = ('.png', '.jpg', '.jpeg')
	matches = []
	for root, _dirnames, filenames in os.walk(options.directory):
		for name in filenames:
			# Compare case-insensitively so files such as "IMG_0001.JPG"
			# are not silently skipped.
			if name.lower().endswith(image_suffixes):
				matches.append(os.path.join(root, name))

	# Single-argument print(...) works on both Python 2 and Python 3.
	print("[DEBUG] Processing %d images..." % len(matches))
	process(matches, options)