Skip to content

Instantly share code, notes, and snippets.

@RhetTbull
Last active April 26, 2024 13:58
Show Gist options
  • Star 43 You must be signed in to star a gist
  • Fork 8 You must be signed in to fork a gist
  • Save RhetTbull/1c34fc07c95733642cffcd1ac587fc4c to your computer and use it in GitHub Desktop.
Save RhetTbull/1c34fc07c95733642cffcd1ac587fc4c to your computer and use it in GitHub Desktop.
Use Apple's Vision framework from Python to detect text in images
""" Use Apple's Vision Framework via PyObjC to detect text in images
To use:
python3 -m pip install pyobjc-core pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer
"""
import pathlib
import Quartz
import Vision
from Cocoa import NSURL
from Foundation import NSDictionary
# needed to capture system-level stderr
from wurlitzer import pipes
def image_to_text(img_path, lang="eng"):
    """Run Apple Vision text recognition on an image file.

    Args:
        img_path: location of the image on disk; a ``str`` or any object
            whose ``str()`` is the file path (e.g. ``pathlib.Path``).
        lang: kept for backward compatibility. It is not passed to the
            recognition request by this function.

    Returns:
        The list populated by the completion handler built with
        ``make_request_handler``.

    Raises:
        ValueError: if no image could be loaded from ``img_path``.
    """
    # NSURL.fileURLWithPath_ wants a string, so normalise path-like input.
    input_url = NSURL.fileURLWithPath_(str(img_path))

    with pipes():
        # capture stdout and stderr from system calls
        # otherwise, Quartz.CIImage.imageWithContentsOfURL_
        # prints to stderr something like:
        # 2020-09-20 20:55:25.538 python[73042:5650492] Creating client/daemon connection: B8FE995E-3F27-47F4-9FA8-559C615FD774
        # 2020-09-20 20:55:25.652 python[73042:5650492] Got the query meta data reply for: com.apple.MobileAsset.RawCamera.Camera, response: 0
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)

    if input_image is None:
        # Fail with a readable message instead of handing a nil image to Vision.
        raise ValueError(f"Could not load image: {img_path}")

    vision_options = NSDictionary.dictionaryWithDictionary_({})
    vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        input_image, vision_options
    )

    results = []
    handler = make_request_handler(results)
    vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(
        handler
    )
    # NOTE(review): the return value is not inspected here; the completion
    # handler receives an ``error`` argument and reports it. Confirm that every
    # failure mode reaches the handler.
    vision_handler.performRequests_error_([vision_request], None)

    return results
def make_request_handler(results):
    """Build a completion handler that collects recognized text.

    Args:
        results: list the returned handler appends to. Each appended entry is
            a two-item list ``[text, confidence]`` taken from the top
            candidate of one observation.

    Returns:
        A callable ``handler(request, error)``.

    Raises:
        ValueError: if ``results`` is not a list.
    """
    if not isinstance(results, list):
        raise ValueError("results must be a list")

    def handler(request, error):
        """Append the best candidate of every observation to ``results``."""
        if error:
            print(f"Error! {error}")
            return

        observations = request.results()
        if observations is None:
            # Nothing was produced for this request; leave ``results`` as is.
            return

        for text_observation in observations:
            candidates = text_observation.topCandidates_(1)
            if not candidates:
                # An observation without candidates has no text to record.
                continue
            recognized_text = candidates[0]
            results.append([recognized_text.string(), recognized_text.confidence()])

    return handler
def main():
    """Command-line entry point: print the text found in the image at argv[1].

    Exits with a message when the argument is missing or does not name an
    existing file.
    """
    import pathlib
    import sys

    if len(sys.argv) < 2:
        # Give a usage hint rather than an IndexError traceback.
        prog = sys.argv[0] if sys.argv else "script"
        sys.exit(f"Usage: {prog} IMAGE_PATH")

    img_path = pathlib.Path(sys.argv[1])
    if not img_path.is_file():
        sys.exit("Invalid image path")

    # image_to_text is given an absolute path as a plain string.
    results = image_to_text(str(img_path.resolve()))
    print(results)


if __name__ == "__main__":
    main()
@psungho
Copy link

psungho commented Jul 19, 2023

This doesn't seem very friendly: I keep getting messages like "Object ID x,0 ref repaired", where x is a number.

@psungho
Copy link

psungho commented Jul 20, 2023

I guess in theory you could use NSThreads instead? @RhetTbull

I'm not sure how much of a performance improvement it would bring. I'm a relatively new Obj-C coder (in fact, I'm learning it for a project I have). What I want to do is OCR a bunch of PDFs concurrently -- maybe there is some alternative solution?

@nevinpuri
Copy link

I had to run `pip install pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer` to make it work on an M2 Mac.

@bert9946
Copy link

I'm not familiar with Objective-C. How do I load an image from a numpy array instead of from an image file on disk?

@RhetTbull
Copy link
Author

@bert9946 I'm not very familiar with numpy but this might work:

"""Create a CIImage from a numpy array"""

import io
import sys

import numpy as np
from AppKit import NSBitmapImageRep, NSImage
from Foundation import NSData
from PIL import Image
from Quartz import CIImage


def createNSImageFromNumpyArray(numpy_array):
    """Convert a numpy image array into an NSImage.

    The array is encoded in memory with Pillow and decoded again by AppKit.

    Args:
        numpy_array: array accepted by ``PIL.Image.fromarray``.

    Returns:
        An NSImage sized to the bitmap's pixel dimensions, holding a single
        bitmap representation.

    Raises:
        ValueError: if AppKit cannot decode the encoded image data.
    """
    image = Image.fromarray(numpy_array)
    buffer = io.BytesIO()
    # PNG is lossless and keeps an alpha channel; JPEG would add compression
    # artifacts and cannot be written from RGBA data.
    image.save(buffer, "PNG")
    encoded = buffer.getvalue()  # fetch once; used for both bytes and length
    nsdata = NSData.dataWithBytes_length_(encoded, len(encoded))
    rep = NSBitmapImageRep.imageRepWithData_(nsdata)
    if rep is None:
        raise ValueError("Could not create a bitmap representation from the array")
    nsimage = NSImage.alloc().initWithSize_((rep.pixelsWide(), rep.pixelsHigh()))
    nsimage.addRepresentation_(rep)
    return nsimage


def convertNSImageToCIImage(nsimage):
    """Return a CIImage built from the TIFF data of ``nsimage``.

    The image is serialised with ``TIFFRepresentation``, wrapped in an
    ``NSBitmapImageRep``, and that bitmap is used to initialise the CIImage.
    """
    tiff_data = nsimage.TIFFRepresentation()
    bitmap_rep = NSBitmapImageRep.alloc().initWithData_(tiff_data)
    return CIImage.alloc().initWithBitmapImageRep_(bitmap_rep)


if __name__ == "__main__":
    # Demo: load the image named on the command line, pass it through numpy,
    # and show each intermediate object.
    source_image = Image.open(sys.argv[1])
    print(source_image.format, source_image.size, source_image.mode)

    ns_image = createNSImageFromNumpyArray(np.asarray(source_image))
    print(ns_image)

    ci_image = convertNSImageToCIImage(ns_image)
    print(ci_image)

@bert9946
Copy link

@RhetTbull It works! This is very helpful. Thanks a lot.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment