albcunha/OCR_IMAGE.PY

## formula_exif.py
# This file contains the OCR/EXIF logic and is meant to be used only from the ocr_image.py file.

from PIL import Image
import sys
import os
from pathlib import Path
import pyocr
import pyocr.builders

import piexif
import time
# Select the OCR backend and language once, at import time; adiciona_exif
# reads the module-level ``tool`` and ``lang`` chosen here.
tools = pyocr.get_available_tools()
if not tools:
    print("No OCR tool found")
    sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]
print("Will use tool '%s'" % (tool.get_name()))
# Ex: Will use tool 'libtesseract'

langs = tool.get_available_languages()
print("Available languages: %s" % ", ".join(langs))
if not langs:
    # Without this guard ``langs[0]`` below fails with a bare IndexError,
    # which says nothing about the missing language data.
    print("No OCR language found for tool '%s'" % (tool.get_name()))
    sys.exit(1)
# The first language reported by the tool is used for every image.
lang = langs[0]
print("Will use lang '%s'" % (lang))


def _has_description(im):
    """Return True when the image's EXIF already holds tag 270 (ImageDescription)."""
    try:
        info_exif = im._getexif()
        # _getexif() yields None when the file carries no EXIF block at all.
        return bool(info_exif) and 270 in info_exif
    except Exception:
        # Best-effort probe: an unreadable EXIF block (or an image type
        # without _getexif) is treated as "no description stored yet".
        return False


def adiciona_exif(arquivo):
    """OCR an image and store the recognised text as its EXIF ImageDescription.

    The file is left untouched, and its path returned, when any of these hold:

    * the path contains the substring ``'thumb'``;
    * the file is smaller than 8000 bytes;
    * a file named ``<arquivo>.jpg`` already exists;
    * the path ends in ``.jpg`` (case-insensitive) and its EXIF already has
      tag 270 (ImageDescription).

    Otherwise the image is OCR'd with the module-level ``tool`` and ``lang``.
    A path ending in ``.jpg`` is re-saved in place with the new EXIF block;
    any other path (``.jpeg`` and ``.png`` included) is converted to mode
    ``"L"`` and written next to the original as ``<arquivo>.jpg``.

    NOTE: the EXIF block that is written contains only the ImageDescription
    tag; other EXIF tags of the source image are not carried over.

    Args:
        arquivo: Path of the image file, as a string.

    Returns:
        ``arquivo`` when the file was skipped or processed successfully,
        ``None`` when processing failed (the failure is reported on stdout).
    """
    try:
        if 'thumb' in arquivo:
            return arquivo
        if os.path.getsize(arquivo) < 8000:  # 8 kB
            return arquivo
        if Path(arquivo + '.jpg').is_file():
            return arquivo

        is_jpg = arquivo.lower().endswith('.jpg')
        # The context manager guarantees the image handle is released even
        # when OCR or saving fails.
        with Image.open(arquivo) as im:
            # Checks whether the description tag (270) was already written;
            # if so, there is nothing left to do for this file.
            if is_jpg and _has_description(im):
                return arquivo

            txt = tool.image_to_string(
                im,
                lang=lang,
                builder=pyocr.builders.TextBuilder()
            )
            # Re-expresses the UTF-8 bytes of the text as a latin-1 string;
            # presumably so piexif stores those bytes unchanged -- TODO confirm.
            description = txt.encode('utf8').decode('latin1')
            exif_dict = {"0th": {piexif.ImageIFD.ImageDescription: description}}
            exif_bytes = piexif.dump(exif_dict)

            if is_jpg:
                im.save(arquivo, exif=exif_bytes)
            else:
                im.convert("L").save(arquivo + '.jpg', exif=exif_bytes)
        return arquivo
    except Exception as erro:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
        # propagate; the cause is printed to make failures diagnosable.
        print('#### ERRO!!!! Arquivo não foi processado:', arquivo, erro)
        return None

## OCR_IMAGE.PY
# This script needs an extra file called formula_exif!
# It OCRs each image and saves the recognised text content in the image's EXIF data.
# This script is used in Jupyter notebooks; that's why the multiprocessing logic needs to be in a different file.
# author: Alberto Cartaxo
# 11/01/2018

import glob
import multiprocessing
from tqdm import tqdm
import formula_exif # where the magic happens

# Will use tool 'Tesseract (sh)'
# Available languages: eng, osd, por
# Will use lang 'por'
# Results returned by the workers (a file path, or None for a failed file).
finished_files = []

# Root folder that is searched for images
path_files = r'F:\EVIDENCIAS'

# The guard keeps worker processes from re-running the scan and re-creating
# the pool when the start method re-imports the main module (spawn).
if __name__ == '__main__':
    # Creates a list of files
    image_files = []
    for extension in ('jpg', 'jpeg', 'png'):
        image_files += glob.glob(path_files + r'\**\*.' + extension, recursive=True)
    print('Total of images to OCR:', len(image_files))

    # The context manager tears the pool down once every result was consumed.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.imap_unordered(formula_exif.adiciona_exif, image_files)
        for item in tqdm(results, total=len(image_files)):
            finished_files.append(item)
    print('## ENCERRADO')
	# This file contains the OCR/EXIF logic and is meant to be used only from the ocr_image.py file.

	from PIL import Image
	import sys
	import os
	from pathlib import Path
	import pyocr
	import pyocr.builders

	import piexif
	import time
	# Select the OCR backend and language once, at import time; adiciona_exif
	# reads the module-level ``tool`` and ``lang`` chosen here.
	tools = pyocr.get_available_tools()
	if not tools:
	    print("No OCR tool found")
	    sys.exit(1)
	# The tools are returned in the recommended order of usage
	tool = tools[0]
	print("Will use tool '%s'" % (tool.get_name()))
	# Ex: Will use tool 'libtesseract'

	langs = tool.get_available_languages()
	print("Available languages: %s" % ", ".join(langs))
	if not langs:
	    # Without this guard ``langs[0]`` below fails with a bare IndexError,
	    # which says nothing about the missing language data.
	    print("No OCR language found for tool '%s'" % (tool.get_name()))
	    sys.exit(1)
	# The first language reported by the tool is used for every image.
	lang = langs[0]
	print("Will use lang '%s'" % (lang))


	def _has_description(im):
	    """Return True when the image's EXIF already holds tag 270 (ImageDescription)."""
	    try:
	        info_exif = im._getexif()
	        # _getexif() yields None when the file carries no EXIF block at all.
	        return bool(info_exif) and 270 in info_exif
	    except Exception:
	        # Best-effort probe: an unreadable EXIF block (or an image type
	        # without _getexif) is treated as "no description stored yet".
	        return False


	def adiciona_exif(arquivo):
	    """OCR an image and store the recognised text as its EXIF ImageDescription.

	    The file is left untouched, and its path returned, when any of these hold:

	    * the path contains the substring ``'thumb'``;
	    * the file is smaller than 8000 bytes;
	    * a file named ``<arquivo>.jpg`` already exists;
	    * the path ends in ``.jpg`` (case-insensitive) and its EXIF already has
	      tag 270 (ImageDescription).

	    Otherwise the image is OCR'd with the module-level ``tool`` and ``lang``.
	    A path ending in ``.jpg`` is re-saved in place with the new EXIF block;
	    any other path (``.jpeg`` and ``.png`` included) is converted to mode
	    ``"L"`` and written next to the original as ``<arquivo>.jpg``.

	    NOTE: the EXIF block that is written contains only the ImageDescription
	    tag; other EXIF tags of the source image are not carried over.

	    Args:
	        arquivo: Path of the image file, as a string.

	    Returns:
	        ``arquivo`` when the file was skipped or processed successfully,
	        ``None`` when processing failed (the failure is reported on stdout).
	    """
	    try:
	        if 'thumb' in arquivo:
	            return arquivo
	        if os.path.getsize(arquivo) < 8000:  # 8 kB
	            return arquivo
	        if Path(arquivo + '.jpg').is_file():
	            return arquivo

	        is_jpg = arquivo.lower().endswith('.jpg')
	        # The context manager guarantees the image handle is released even
	        # when OCR or saving fails.
	        with Image.open(arquivo) as im:
	            # Checks whether the description tag (270) was already written;
	            # if so, there is nothing left to do for this file.
	            if is_jpg and _has_description(im):
	                return arquivo

	            txt = tool.image_to_string(
	                im,
	                lang=lang,
	                builder=pyocr.builders.TextBuilder()
	            )
	            # Re-expresses the UTF-8 bytes of the text as a latin-1 string;
	            # presumably so piexif stores those bytes unchanged -- TODO confirm.
	            description = txt.encode('utf8').decode('latin1')
	            exif_dict = {"0th": {piexif.ImageIFD.ImageDescription: description}}
	            exif_bytes = piexif.dump(exif_dict)

	            if is_jpg:
	                im.save(arquivo, exif=exif_bytes)
	            else:
	                im.convert("L").save(arquivo + '.jpg', exif=exif_bytes)
	        return arquivo
	    except Exception as erro:
	        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
	        # propagate; the cause is printed to make failures diagnosable.
	        print('#### ERRO!!!! Arquivo não foi processado:', arquivo, erro)
	        return None
	# This script needs an extra file called formula_exif!
	# It OCRs each image and saves the recognised text content in the image's EXIF data.
	# This script is used in Jupyter notebooks; that's why the multiprocessing logic needs to be in a different file.
	# author: Alberto Cartaxo
	# 11/01/2018

	import glob
	import multiprocessing
	from tqdm import tqdm
	import formula_exif # where the magic happens

	# Will use tool 'Tesseract (sh)'
	# Available languages: eng, osd, por
	# Will use lang 'por'
	# Results returned by the workers (a file path, or None for a failed file).
	finished_files = []

	# Root folder that is searched for images
	path_files = r'F:\EVIDENCIAS'

	# The guard keeps worker processes from re-running the scan and re-creating
	# the pool when the start method re-imports the main module (spawn).
	if __name__ == '__main__':
	    # Creates a list of files. The '**' path component is what lets
	    # recursive=True descend into sub-folders at any depth.
	    image_files = []
	    for extension in ('jpg', 'jpeg', 'png'):
	        image_files += glob.glob(path_files + r'\**\*.' + extension, recursive=True)
	    print('Total of images to OCR:', len(image_files))

	    # The context manager tears the pool down once every result was consumed.
	    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
	        results = pool.imap_unordered(formula_exif.adiciona_exif, image_files)
	        for item in tqdm(results, total=len(image_files)):
	            finished_files.append(item)
	    print('## ENCERRADO')