Skip to content

Instantly share code, notes, and snippets.

@albcunha
Last active November 2, 2018 18:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save albcunha/67379eac046a7528dffdf86e9058d3a9 to your computer and use it in GitHub Desktop.
Save albcunha/67379eac046a7528dffdf86e9058d3a9 to your computer and use it in GitHub Desktop.
OCRs image and adds its content to EXIF field called 'description'
# This file contains the OCR/EXIF logic; it is meant to be used only from the ocr_image.py file.
from PIL import Image
import sys
import os
from pathlib import Path
import pyocr
import pyocr.builders
import piexif
import time
# Module-level OCR setup: pick the OCR backend and language once at import
# time so adiciona_exif() can reuse the `tool` and `lang` globals.
tools = pyocr.get_available_tools()
if not tools:
    print("No OCR tool found")
    sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]
print("Will use tool '%s'" % (tool.get_name()))
# Ex: Will use tool 'libtesseract'
langs = tool.get_available_languages()
print("Available languages: %s" % ", ".join(langs))
if not langs:
    # Mirror the "no tool" handling above instead of failing with a bare
    # IndexError on langs[0] when the backend reports no languages.
    print("No OCR language found")
    sys.exit(1)
# NOTE(review): this simply takes the first language reported by the tool;
# select one explicitly if a specific language is required - confirm.
lang = langs[0]
print("Will use lang '%s'" % (lang))
def _has_description(im):
    """Return True if *im* already carries an EXIF ImageDescription (tag 270)."""
    try:
        info_exif = im._getexif()
    except Exception:
        # Unreadable EXIF is treated as "no description yet" so the file is
        # still OCR'd (same best-effort behaviour as before).
        return False
    # _getexif() gives None when the file has no EXIF block at all.
    return bool(info_exif) and 270 in info_exif


def _build_exif_bytes(im, txt):
    """Serialize EXIF for *im* with ImageDescription set to *txt*.

    Tags already present in the image are kept when they can be parsed and
    re-serialized; otherwise a description-only EXIF block is produced.
    """
    # The utf8 -> latin1 round trip turns the text into a str whose code
    # points are the UTF-8 byte values, so non-latin1 OCR output can still be
    # stored in the ASCII-typed tag.
    description = txt.encode('utf8').decode('latin1')
    existing = im.info.get('exif')
    if existing:
        try:
            exif_dict = piexif.load(existing)
            exif_dict.setdefault("0th", {})[piexif.ImageIFD.ImageDescription] = description
            return piexif.dump(exif_dict)
        except Exception:
            # Existing EXIF could not be parsed or dumped; deliberately fall
            # back to writing only the description.
            pass
    return piexif.dump({"0th": {piexif.ImageIFD.ImageDescription: description}})


def adiciona_exif(arquivo):
    """OCR the image at *arquivo* and store the text in its EXIF description.

    Files are skipped (and their path returned unchanged) when the path
    contains 'thumb', the file is smaller than 8000 bytes, a sibling
    ``<arquivo>.jpg`` already exists, or a ``.jpg`` file already has an EXIF
    ImageDescription (tag 270).

    ``.jpg`` files are re-saved in place with the new EXIF; any other file is
    converted to grayscale and saved next to the original as
    ``<arquivo>.jpg``.

    Uses the module-level ``tool`` and ``lang`` selected at import time.

    Args:
        arquivo: Path of the image file, as a string.

    Returns:
        ``arquivo`` when the file was processed or skipped, ``None`` when
        processing failed (the error is printed, not raised).
    """
    try:
        if 'thumb' in arquivo:
            return arquivo
        if os.path.getsize(arquivo) < 8000:  # 8 kB
            return arquivo
        # A "<arquivo>.jpg" sibling means a previous run already converted it.
        if Path(arquivo + '.jpg').is_file():
            return arquivo
        # NOTE(review): only the ".jpg" suffix is treated as JPEG, so ".jpeg"
        # files take the conversion branch below - confirm this is intended.
        is_jpg = arquivo.lower().endswith('.jpg')
        # The context manager guarantees the file handle is released.
        with Image.open(arquivo) as im:
            if is_jpg and _has_description(im):
                return arquivo
            txt = tool.image_to_string(
                im,
                lang=lang,
                builder=pyocr.builders.TextBuilder()
            )
            exif_bytes = _build_exif_bytes(im, txt)
            if is_jpg:
                im.save(arquivo, exif=exif_bytes)
            else:
                im.convert("L").save(arquivo + '.jpg', exif=exif_bytes)
        return arquivo
    except Exception as erro:
        # Best effort: one bad file must not stop the batch, but report why.
        print('#### ERRO!!!! Arquivo não foi processado:', arquivo, '-', erro)
        return None
# this script needs an extra file called formula_exif!
# It OCRs each image and saves the recognized text in the image's EXIF data.
# This script is used in Jupyter notebooks; that's why the multiprocessing logic needs to be in a different file.
# author: Alberto Cartaxo
# 11/01/2018
import glob
import multiprocessing
from tqdm import tqdm
import formula_exif # where the magic happens
# Will use tool 'Tesseract (sh)'
# Available languages: eng, osd, por
# Will use lang 'por'
# Entry-point guard: on spawn-based platforms (e.g. Windows) multiprocessing
# re-imports the main module in every worker, so the file scan and pool
# creation must not run at import time.
if __name__ == '__main__':
    # Collects whatever adiciona_exif returns for each image, in completion order.
    finished_files = []
    # Creates a list of files
    path_files = r'F:\EVIDENCIAS'
    image_files = []
    for extension in ('jpg', 'jpeg', 'png'):
        image_files += glob.glob(path_files + r'\**\*.' + extension, recursive=True)
    print('Total of images to OCR:', len(image_files))
    # The context manager shuts the worker processes down once the loop ends.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        # imap_unordered yields results as workers finish, keeping the
        # progress bar responsive.
        for item in tqdm(pool.imap_unordered(formula_exif.adiciona_exif, image_files), total=len(image_files)):
            finished_files.append(item)
    print('## ENCERRADO')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment