Last active
November 2, 2018 18:48
-
-
Save albcunha/67379eac046a7528dffdf86e9058d3a9 to your computer and use it in GitHub Desktop.
OCRs image and adds its content to EXIF field called 'description'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This file holds the OCR/EXIF logic and is meant to be driven from the ocr_image.py script.
from PIL import Image | |
import sys | |
import os | |
from pathlib import Path | |
import pyocr | |
import pyocr.builders | |
import piexif | |
import time | |
# Module-level OCR setup: pick the OCR tool and the language that
# adiciona_exif() uses for every image. Runs once per importing process.
tools = pyocr.get_available_tools()
if not tools:
    print("No OCR tool found")
    sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]
print("Will use tool '%s'" % (tool.get_name()))
# Ex: Will use tool 'libtesseract'

langs = tool.get_available_languages()
if not langs:
    # Without this guard, langs[0] below fails with a bare IndexError that
    # gives no hint that the OCR tool has no language data available.
    print("No OCR language found")
    sys.exit(1)
print("Available languages: %s" % ", ".join(langs))
# The first language reported by the tool is the one used for OCR.
lang = langs[0]
print("Will use lang '%s'" % (lang))
def _tem_descricao(im):
    """Return True when the image's EXIF already has tag 270 (ImageDescription)."""
    try:
        info_exif = im._getexif()
    except Exception:
        # Missing or unreadable EXIF is treated as "no description yet",
        # so the caller goes on to OCR the image.
        return False
    # _getexif() may return None or an empty mapping when there is no EXIF.
    return bool(info_exif) and 270 in info_exif


def adiciona_exif(arquivo):
    """OCR an image and store the recognised text in its EXIF description.

    Files are skipped (and returned untouched) when the path contains
    'thumb', when the file is smaller than 8000 bytes, when a sibling
    ``<arquivo>.jpg`` already exists, or when a ``.jpg`` file already has
    an EXIF ImageDescription (tag 270).

    ``.jpg`` files are re-saved in place with the new EXIF block; any other
    file is converted to grayscale and written next to the original as
    ``<arquivo>.jpg``.

    Args:
        arquivo: Path of the image file, as a string.

    Returns:
        ``arquivo`` when the file was processed or skipped, ``None`` when
        processing failed (an error line is printed in that case).
    """
    try:
        if 'thumb' in arquivo:
            return arquivo
        if os.path.getsize(arquivo) < 8000:  # 8kb
            return arquivo
        # A sibling "<arquivo>.jpg" is what the non-JPG branch below writes,
        # so its presence means this file was already handled.
        if Path(arquivo + '.jpg').is_file():
            return arquivo

        is_jpg = arquivo.lower().endswith('.jpg')
        # The context manager makes sure the image is released even when
        # OCR or saving fails.
        with Image.open(arquivo) as im:
            # Check whether the description EXIF tag (270) was already
            # written. If it was, stop here.
            if is_jpg and _tem_descricao(im):
                return arquivo

            txt = tool.image_to_string(
                im,
                lang=lang,
                builder=pyocr.builders.TextBuilder()
            )
            # The UTF-8 bytes are re-read as latin-1 text before being handed
            # to piexif -- presumably because piexif encodes str values as
            # latin-1; TODO confirm against the piexif documentation.
            zeroth_ifd = {
                piexif.ImageIFD.ImageDescription:
                    txt.encode('utf8').decode('latin1'),
            }
            # NOTE(review): the EXIF block built here contains only
            # ImageDescription; other tags present in the source file are
            # presumably not carried over on save -- confirm.
            exif_bytes = piexif.dump({"0th": zeroth_ifd})

            if is_jpg:
                im.save(arquivo, exif=exif_bytes)
            else:
                im.convert("L").save(arquivo + '.jpg', exif=exif_bytes)
        return arquivo
    except Exception as exc:
        # Best effort: one bad file must not abort the whole batch, but the
        # reason is reported so the failure can be diagnosed.
        print('#### ERRO!!!! Arquivo não foi processado:', arquivo, repr(exc))
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script needs a companion module called formula_exif.
# It OCRs each image and saves the recognised text in the image's EXIF data.
# It is meant to be used from Jupyter notebooks, which is why the multiprocessing worker logic needs to live in a separate file.
# author: Alberto Cartaxo | |
# 11/01/2018 | |
import glob | |
import multiprocessing | |
from tqdm import tqdm | |
import formula_exif # where the magic happens | |
# Will use tool 'Tesseract (sh)' | |
# Available languages: eng, osd, por | |
# Will use lang 'por' | |
# Driver: collect every image under path_files and OCR them in parallel.
# The guard keeps worker processes that re-import this module (the "spawn"
# start method, used on Windows) from re-running the glob and starting
# pools of their own.
if __name__ == '__main__':
    finished_files = []

    # Creates a list of files
    path_files = r'F:\EVIDENCIAS'
    image_files = glob.glob(path_files + r'\**\*.jpg', recursive=True)
    image_files += glob.glob(path_files + r'\**\*.jpeg', recursive=True)
    image_files += glob.glob(path_files + r'\**\*.png', recursive=True)
    print('Total of images to OCR:', len(image_files))

    # One worker per CPU; the context manager shuts the pool down once
    # every result has been consumed by the loop below.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.imap_unordered(formula_exif.adiciona_exif, image_files)
        # imap_unordered yields results as workers finish, which lets tqdm
        # show live progress.
        for item in tqdm(results, total=len(image_files)):
            finished_files.append(item)
    print('## ENCERRADO')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment