Skip to content

Instantly share code, notes, and snippets.

@companje
Created April 25, 2023 11:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save companje/80b253a51cfd8f5b02e5a1e962cfbff9 to your computer and use it in GitHub Desktop.
Save companje/80b253a51cfd8f5b02e5a1e962cfbff9 to your computer and use it in GitHub Desktop.
ocr mac
#!/Applications/Xcode.app/Contents/Developer/usr/bin/python3
import config,json,os,subprocess,ocr2json
from tqdm import tqdm
from joblib import Parallel, delayed
import Quartz,Vision
from Cocoa import NSURL
from Foundation import NSDictionary
from wurlitzer import pipes # needed to capture system-level stderr
def ocr(image_filename):
input_url = NSURL.fileURLWithPath_(image_filename)
with pipes() as (out, err):
input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)
(width,height) = input_image.extent().size
vision_options = NSDictionary.dictionaryWithDictionary_({})
vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
input_image, vision_options
)
request = Vision.VNRecognizeTextRequest.alloc().init().autorelease()
request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) #VNRequestTextRecognitionLevelFast
request.setRecognitionLanguages_(["nl-NL"])
error = vision_handler.performRequests_error_([request], None)
results = []
for item in request.results():
bbox = item.boundingBox()
w, h = bbox.size.width, bbox.size.height
x, y = bbox.origin.x, bbox.origin.y
results.append({
"x":int(x*width),
"y":int(height - y*height - h*height),
"w":int(w*width),
"h":int(h*height),
"conf":item.confidence(),
"text":item.text()
})
return results
def do_ocr(image_file_path, progress):
file_id = config.get_id_from_path(image_file_path)
json_file_path = config.make_path(base_folder=config.DEEDS_OCR_JSON_FOLDER, id=file_id, suffix=".json")
if not os.path.exists(json_file_path):
data = ocr(str(image_file_path))
if data and len(data)>0:
json.dump(data, open(json_file_path,"w"), indent=2)
print(json_file_path,str(int(progress*1000)/10.)+"%")
def run():
print("run ocr_per_akte")
if not os.path.exists(config.DEEDS_OCR_JSON_FOLDER):
os.mkdir(config.DEEDS_OCR_JSON_FOLDER)
file_paths = config.get_deeds_image_file_paths()
results = Parallel(n_jobs=1, prefer="threads")(
delayed(do_ocr)(image_file_path,i/len(file_paths))
for i,image_file_path in enumerate(file_paths)
)
if __name__ == '__main__':
run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment