-
-
Save RhetTbull/1c34fc07c95733642cffcd1ac587fc4c to your computer and use it in GitHub Desktop.
""" Use Apple's Vision Framework via PyObjC to detect text in images

To use:

python3 -m pip install pyobjc-core pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer
"""
import pathlib | |
import Quartz | |
import Vision | |
from Cocoa import NSURL | |
from Foundation import NSDictionary | |
# needed to capture system-level stderr | |
from wurlitzer import pipes | |
def image_to_text(img_path, lang="eng"):
    """Detect text in an image file using Apple's Vision framework.

    Args:
        img_path: Path to the image file, as a string.
        lang: Currently unused by this function; kept so existing callers
            that pass it keep working.

    Returns:
        The list populated by the completion handler built by
        ``make_request_handler`` while the Vision request runs.

    Raises:
        ValueError: If the file could not be loaded as an image.
        RuntimeError: If the Vision framework reports that the request failed.
    """
    input_url = NSURL.fileURLWithPath_(img_path)

    with pipes() as (out, err):
        # capture stdout and stderr from system calls
        # otherwise, Quartz.CIImage.imageWithContentsOfURL_
        # prints to stderr something like:
        # 2020-09-20 20:55:25.538 python[73042:5650492] Creating client/daemon connection: B8FE995E-3F27-47F4-9FA8-559C615FD774
        # 2020-09-20 20:55:25.652 python[73042:5650492] Got the query meta data reply for: com.apple.MobileAsset.RawCamera.Camera, response: 0
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)

    # Fail early with a clear message instead of handing a nil image to Vision.
    if input_image is None:
        raise ValueError(f"Could not load image: {img_path}")

    vision_options = NSDictionary.dictionaryWithDictionary_({})
    vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        input_image, vision_options
    )

    results = []
    handler = make_request_handler(results)
    vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(handler)

    # PyObjC returns the NSError** out-parameter alongside the BOOL result,
    # so the call yields a (success, error) pair rather than a bare error.
    success, error = vision_handler.performRequests_error_([vision_request], None)
    if not success:
        raise RuntimeError(f"Vision text recognition failed: {error}")

    return results
def make_request_handler(results):
    """Build a completion handler that collects recognized text.

    Args:
        results: List that the returned handler appends to. Each appended
            item is a ``[string, confidence]`` pair taken from the top
            candidate of one text observation.

    Returns:
        A ``handler(request, error)`` callable suitable for use as a Vision
        request completion handler.

    Raises:
        ValueError: If ``results`` is not a list.
    """
    if not isinstance(results, list):
        raise ValueError("results must be a list")

    def handler(request, error):
        if error:
            print(f"Error! {error}")
            return

        # ``results`` on a request is nullable; treat a missing value as
        # "nothing recognized" rather than failing on iteration.
        observations = request.results() or []
        for text_observation in observations:
            candidates = text_observation.topCandidates_(1)
            # An observation may come back with no candidates; skip it
            # instead of raising IndexError.
            if not candidates:
                continue
            recognized_text = candidates[0]
            results.append([recognized_text.string(), recognized_text.confidence()])

    return handler
def main():
    """Command-line entry point: print the text detected in an image.

    Expects the image path as the first command-line argument. Exits with an
    error message when the argument is missing or does not name an existing
    file; otherwise prints the list returned by ``image_to_text``.
    """
    import sys
    import pathlib

    # Without this check a missing argument surfaces as a bare IndexError.
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "script"
        sys.exit(f"Usage: {prog} IMAGE_PATH")

    img_path = pathlib.Path(sys.argv[1])
    if not img_path.is_file():
        sys.exit("Invalid image path")

    # image_to_text receives the resolved path as a plain string.
    img_path = str(img_path.resolve())
    results = image_to_text(img_path)
    print(results)


if __name__ == "__main__":
    main()
@psungho I'm not sure how well the pyobjc stuff works with python's threads. I would try multiprocessing (spawn multiple separate python processes each running the vision framework).
It doesn't really seem to be friendly. I keep getting messages like "Object ID x,0 ref repaired", where x is a number.
I guess in theory you could use NSThreads instead? @RhetTbull
I'm not sure how much of a performance improvement it will bring. I'm a relatively new Obj-C coder (in fact, I'm learning it for a project I have). What I want to do is OCR a bunch of PDFs concurrently -- maybe there is some alternate solution?
I had to run pip install pyobjc-framework-Quartz pyobjc-framework-Vision wurlitzer
to make it work on m2 mac.
I'm not familiar with Objective-C. How can I load an image from a numpy array instead of an image from disk?
@bert9946 I'm not very familiar with numpy but this might work:
"""Create a CIImage from a numpy array"""
import io
import sys
import numpy as np
from AppKit import NSBitmapImageRep, NSImage
from Foundation import NSData
from PIL import Image
from Quartz import CIImage
def createNSImageFromNumpyArray(numpy_array):
    """Create an NSImage from a numpy array.

    The array is converted to a PIL image, encoded in memory, and the encoded
    bytes are wrapped in an NSBitmapImageRep that is added to the returned
    NSImage. The NSImage is sized to the representation's pixel dimensions.

    Args:
        numpy_array: Image data accepted by ``PIL.Image.fromarray``.

    Returns:
        An NSImage holding a single bitmap representation of the array.
    """
    image = Image.fromarray(numpy_array)

    # Encode as PNG rather than JPEG: PNG is lossless and can store images
    # with an alpha channel, which the JPEG writer rejects.
    buffer = io.BytesIO()
    image.save(buffer, "PNG")
    encoded = buffer.getvalue()

    nsdata = NSData.dataWithBytes_length_(encoded, len(encoded))
    rep = NSBitmapImageRep.imageRepWithData_(nsdata)
    nsimage = NSImage.alloc().initWithSize_((rep.pixelsWide(), rep.pixelsHigh()))
    nsimage.addRepresentation_(rep)
    return nsimage
def convertNSImageToCIImage(nsimage):
    """Convert an NSImage to a CIImage.

    The NSImage's TIFF representation is loaded into an NSBitmapImageRep,
    which is then used to initialize the CIImage that is returned.
    """
    tiff_data = nsimage.TIFFRepresentation()
    bitmap_rep = NSBitmapImageRep.alloc().initWithData_(tiff_data)
    return CIImage.alloc().initWithBitmapImageRep_(bitmap_rep)
if __name__ == "__main__":
    # Demo: open the image named on the command line, round-trip it through
    # a numpy array into an NSImage, then convert that NSImage to a CIImage,
    # printing each intermediate object along the way.
    source_path = sys.argv[1]
    pil_image = Image.open(source_path)
    print(pil_image.format, pil_image.size, pil_image.mode)

    ns_image = createNSImageFromNumpyArray(np.asarray(pil_image))
    print(ns_image)

    ci_image = convertNSImageToCIImage(ns_image)
    print(ci_image)
@RhetTbull It works! This is very helpful. Thanks a lot.
Can this be used in Linux systems or is it Mac specific?
@emilanovix this uses Apple macOS APIs thus it is macOS only. There are plenty of OCR packages that will run on Linux but this is specific to macOS.
Here's a cleaned-up version that includes bounding boxes https://gist.github.com/jonashaag/95e8b75ed44cc5b93cbc5d4599e3803a
I recently found this and found it quite useful.
I was planning on OCRing about 10000 PDFs with Apple's API. Your code works well. However, I'm a bit stuck on how to multithread/parallel-process it; concurrent.futures does not seem to work. Is there any suggestion you would make for this?