Last active
November 8, 2021 23:29
-
-
Save RhetTbull/4bb88eaa2a3b8e91837660d4f63b22ed to your computer and use it in GitHub Desktop.
Extract text from images in MacOS Photos.app using tesseract OCR and update Photo description with extracted text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Extract text from images in Photos.app using tesseract and | |
update Photo description with extracted text """ | |
# NOTE: this does not currently work well because the images need more
# pre-processing before they are passed to OCR
# see: https://towardsdatascience.com/optical-character-recognition-ocr-with-less-than-12-lines-of-code-using-python-48404218cccb
import datetime | |
import pathlib | |
import osxphotos | |
import pytesseract | |
import tinydb | |
from PIL import Image | |
import photoscript | |
# Counters reported in the end-of-run summary: photos visited by the main
# loop, and photos whose description received extracted text.
count = text_count = 0

# Open the Photos library with osxphotos' default arguments and remember
# where it lives on disk.
photosdb = osxphotos.PhotosDB()
library = pathlib.Path(photosdb.library_path)

# The tracking database is a dot-prefixed file stored in the directory that
# contains the library, named after the library's stem.
db_name = f"{library.parent}/.{library.stem}.photos_text_db"

print(f"Processing Photos library '{library}'")
print(f"Loading photos_text database {db_name}")

# TinyDB handle plus the query builder used to look records up by uuid.
db = tinydb.TinyDB(db_name)
query = tinydb.Query()

# photoscript handle on the Photos library; not referenced again in the
# visible part of the script, kept so start-up behaviour is unchanged.
photoapp = photoscript.PhotosLibrary()
# Visit every photo in the library, run OCR on the ones that have not been
# recorded in the tracking database yet, and append any recognised text to
# the photo's description in Photos.
for photo in photosdb.photos():
    count += 1

    # A record in the tracking database means this photo was already handled
    # (by an earlier run or earlier in this one), so leave it alone.
    if db.search(query.uuid == photo.uuid):
        print(
            f"Skipping already processed photo {photo.original_filename}, {photo.uuid}"
        )
        continue

    # Without a path there is no image file to read; record the photo as
    # missing so it is skipped on subsequent runs as well.
    if not photo.path:
        print(f"Skipping missing photo {photo.original_filename}, {photo.uuid}")
        db.insert(
            {
                "uuid": photo.uuid,
                "date": datetime.datetime.now().isoformat(),
                "text": None,
                "previous_text": photo.description,
                "missing": True,
            }
        )
        continue

    print(f"Looking for text in photo {photo.original_filename}, {photo.uuid}")
    try:
        # The context manager closes the image file as soon as OCR is done
        # instead of leaving one open handle behind per photo.
        with Image.open(photo.path) as image:
            raw_text = pytesseract.image_to_string(image)
        # Collapse every run of whitespace (including newlines) to one space;
        # split() also discards leading and trailing whitespace.
        text = " ".join(raw_text.split())
    except Exception as err:
        # Best effort: an unreadable image or an OCR failure must not abort
        # the whole run, but it is reported rather than silently dropped.
        # KeyboardInterrupt/SystemExit are deliberately not caught, so Ctrl-C
        # still stops the script.
        print(
            f"Error extracting text from photo {photo.original_filename}, "
            f"{photo.uuid}: {err}"
        )
        text = None

    # Always define the previous description so the record written below never
    # reuses a value left over from an earlier photo. For photos without text
    # this is the description reported by osxphotos; when text is found it is
    # replaced by the value read through photoscript just before the update.
    previous_descr = photo.description

    if text:
        text_count += 1
        print(f"Found text in photo: {text}")
        photo2 = photoscript.Photo(photo.uuid)
        previous_descr = photo2.description
        # Append to an existing description rather than overwriting it.
        new_descr = f"{previous_descr} {text}" if previous_descr else text
        photo2.description = new_descr
        print(f"Updated description to: {new_descr}")

    # Record every examined photo, with or without text, so that later runs
    # skip it.
    db.insert(
        {
            "uuid": photo.uuid,
            "date": datetime.datetime.now().isoformat(),
            "text": text,
            "previous_text": previous_descr,
            "missing": False,
        }
    )
# End-of-run report: `count` is the number of photos the loop visited and
# `text_count` the number whose description was updated with OCR text.
print("Done")
if not count:
    print("No photos to process")
else:
    print(f"Processed {count} photos, updated text in {text_count}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment