Skip to content

Instantly share code, notes, and snippets.

@RhetTbull
Last active November 8, 2021 23:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RhetTbull/4bb88eaa2a3b8e91837660d4f63b22ed to your computer and use it in GitHub Desktop.
Save RhetTbull/4bb88eaa2a3b8e91837660d4f63b22ed to your computer and use it in GitHub Desktop.
Extract text from images in MacOS Photos.app using tesseract OCR and update Photo description with extracted text
""" Extract text from images in Photos.app using tesseract and
update Photo description with extracted text """
# NOTE: OCR results are currently poor because the images need more pre-processing before being passed to tesseract
# see: https://towardsdatascience.com/optical-character-recognition-ocr-with-less-than-12-lines-of-code-using-python-48404218cccb
import datetime
import pathlib
import osxphotos
import pytesseract
import tinydb
from PIL import Image
import photoscript
# Open the Photos library and derive the path of the tinydb file used to
# remember which photos were already handled (a dot-file placed in the
# library's parent directory).
photosdb = osxphotos.PhotosDB()
library = pathlib.Path(photosdb.library_path)
db_name = f"{library.parent}/.{library.stem}.photos_text_db"
print(f"Processing Photos library '{library}'")
print(f"Loading photos_text database {db_name}")
db = tinydb.TinyDB(db_name)
query = tinydb.Query()
# NOTE(review): photoapp is never used below; it is kept because constructing
# it may have side effects in Photos.app -- confirm before removing.
photoapp = photoscript.PhotosLibrary()

count = 0  # photos seen, including skipped ones
text_count = 0  # photos whose description was updated with OCR text

for photo in photosdb.photos():
    count += 1

    # Skip anything already recorded in the tracking database.
    if db.contains(query.uuid == photo.uuid):
        print(
            f"Skipping already processed photo {photo.original_filename}, {photo.uuid}"
        )
        continue

    # No file path: record the photo as missing so it is not retried.
    if not photo.path:
        print(f"Skipping missing photo {photo.original_filename}, {photo.uuid}")
        db.insert(
            {
                "uuid": photo.uuid,
                "date": datetime.datetime.now().isoformat(),
                "text": None,
                "previous_text": photo.description,
                "missing": True,
            }
        )
        continue

    print(f"Looking for text in photo {photo.original_filename}, {photo.uuid}")
    try:
        # Use a context manager so the image file is closed after OCR instead
        # of leaking one open handle per photo.
        with Image.open(photo.path) as image:
            text = pytesseract.image_to_string(image)
        # Collapse every run of whitespace (newlines included) to one space.
        text = " ".join(text.split())
    except Exception as err:
        # Best effort: an unreadable image or OCR failure skips this photo,
        # but Ctrl-C / SystemExit are no longer swallowed and the reason is
        # reported instead of being hidden.
        print(
            f"Error extracting text from photo {photo.original_filename}, "
            f"{photo.uuid}: {err}"
        )
        text = None

    # Only photos with detected text are updated and recorded; photos without
    # text are left untracked and will be scanned again on a later run.
    if not text:
        continue

    text_count += 1
    print(f"Found text in photo: {text}")
    photo2 = photoscript.Photo(photo.uuid)
    descr = photo2.description
    # Append to an existing description rather than overwriting it.
    new_descr = descr + " " + text if descr else text
    photo2.description = new_descr
    print(f"Updated description to: {new_descr}")
    db.insert(
        {
            "uuid": photo.uuid,
            "date": datetime.datetime.now().isoformat(),
            "text": text,
            "previous_text": descr,
            "missing": False,
        }
    )

print("Done")
if count:
    print(f"Processed {count} photos, updated text in {text_count}")
else:
    print("No photos to process")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment