@companje
Last active February 13, 2024 08:19
Mac OCR for crops
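This gist bundles four standalone scripts: one that crops a fixed region out of every JPEG, one that fuzzy-corrects recognized text against a dictionary, one that runs OCR with the macOS Vision framework, and one that filters and merges the per-image OCR boxes into one text string per scan plus a frequency count. The likely run order is crop → OCR → merge → fuzzy-correct.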
#!/usr/bin/env python3
# Crop the top-left corner (crop_width x crop_height pixels) out of every JPEG.
import os, glob
from PIL import Image
from joblib import Parallel, delayed

crop_width = 200
crop_height = 100
output_scale = .33  # unused in this script, kept from the original
input_folder = "crops_small/"
output_folder = "crops_nummers_small/"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)
def do_crop(input_filepath, output_folder, progress):
    # Print progress as a percentage with one decimal.
    print(str(int(progress*1000)/10.)+"%")
    filename = input_filepath.strip()
    basename = os.path.basename(filename)
    output_filename = output_folder + basename
    img = Image.open(input_filepath)
    img = img.crop((0, 0, crop_width, crop_height))  # top-left corner
    img.save(output_filename)
file_paths = glob.glob(input_folder+"*.jpg", recursive=False)
print("total", len(file_paths))

results = Parallel(n_jobs=1, prefer="threads")(
    delayed(do_crop)(image_file_path, output_folder, i/len(file_paths))
    for i, image_file_path in enumerate(file_paths)
)
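The crop script runs with n_jobs=1. A minimal sketch (my own variation, not part of the gist) of the same joblib pattern using all CPU cores and a tqdm progress bar instead of the manual percentage print:

# Sketch only: assumes do_crop, file_paths and output_folder from the script above.
# n_jobs=-1 uses all cores; prefer="threads" keeps the work in threads rather than
# separate worker processes.
from joblib import Parallel, delayed
from tqdm import tqdm

Parallel(n_jobs=-1, prefer="threads")(
    delayed(do_crop)(path, output_folder, i/len(file_paths))
    for i, path in enumerate(tqdm(file_paths))
)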
#!/usr/bin/env python3
import csv,json
from tqdm import tqdm
from rapidfuzz import fuzz
import rapidfuzz.process as fuzzy
def fuzzy_extract(input_str, compare_strs):  # returns (result, match_pct, idx)
    return fuzzy.extractOne(input_str, compare_strs, scorer=fuzz.ratio)
header = ["filename","text","corrected"]
corrected = csv.DictWriter(open("corrected.csv", 'w', encoding='utf8'), header)
corrected.writeheader()
count = json.load(open("count_fixed.json"))
dictionary = count.keys()
for row in tqdm(list(csv.DictReader(open("per-scan.csv")))):
    result, score, _ = fuzzy_extract(row['text'], dictionary)
    row['corrected'] = result
    corrected.writerow(row)
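A minimal illustration (my own example, not from the gist) of what extractOne returns: the best-matching candidate, its fuzz.ratio score (0 to 100), and the candidate's index in the iterable:

from rapidfuzz import fuzz
import rapidfuzz.process as fuzzy

candidates = ["Amsterdam", "Rotterdam", "Utrecht"]
print(fuzzy.extractOne("Amsterdm", candidates, scorer=fuzz.ratio))
# -> ('Amsterdam', ~94.1, 0)  (score is approximate)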
#!/usr/bin/env python3
# OCR every crop with the macOS Vision framework (via pyobjc) and write one CSV per image.
import os, glob, csv
from joblib import Parallel, delayed
import Quartz, Vision
from Cocoa import NSURL
from Foundation import NSDictionary
from wurlitzer import pipes  # needed to capture system-level stderr
def ocr(image_filename):
    input_url = NSURL.fileURLWithPath_(image_filename)
    with pipes() as (out, err):  # silence Vision's system-level stderr output
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)
        (width, height) = input_image.extent().size
        vision_options = NSDictionary.dictionaryWithDictionary_({})
        vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
            input_image, vision_options
        )
        request = Vision.VNRecognizeTextRequest.alloc().init().autorelease()
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)  # or VNRequestTextRecognitionLevelFast
        request.setRecognitionLanguages_(["nl-NL"])
        error = vision_handler.performRequests_error_([request], None)
        results = []
        for item in request.results():
            # Vision reports normalized coordinates with the origin at the bottom-left;
            # convert to top-left pixel coordinates.
            bbox = item.boundingBox()
            w, h = bbox.size.width, bbox.size.height
            x, y = bbox.origin.x, bbox.origin.y
            results.append({
                "x": int(x*width),
                "y": int(height - y*height - h*height),
                "w": int(w*width),
                "h": int(h*height),
                "conf": item.confidence(),
                "text": item.text()
            })
    return results
def do_ocr(input_filepath, output_folder, progress):
    output_filepath = output_folder + os.path.basename(input_filepath) + ".csv"
    if not os.path.exists(output_filepath):  # skip files that were already processed
        data = ocr(input_filepath)
        if data and len(data) > 0:
            with open(output_filepath, 'w', encoding='utf8') as file:
                writer = csv.DictWriter(file, data[0].keys())
                writer.writeheader()
                writer.writerows(data)
    print(output_filepath, str(int(progress*1000)/10.)+"%")
input_folder = "crops_small/"
output_folder = "ocr/"
if not os.path.exists(output_folder):
os.makedirs(output_folder)
file_paths = glob.glob(input_folder+"*.jpg", recursive=False)
results = Parallel(n_jobs=1, prefer="threads")(
delayed(do_ocr)(image_file_path,output_folder, i/len(file_paths))
for i,image_file_path in enumerate(file_paths)
)
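A minimal usage sketch (my own example; "example.jpg" is a hypothetical file) showing what a single call to ocr() returns: one dict per recognized text box, with pixel coordinates, confidence and text:

for box in ocr("example.jpg"):  # hypothetical input file
    print(box["text"], round(box["conf"], 2), box["x"], box["y"], box["w"], box["h"])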
#!/usr/bin/env python3
from tqdm import tqdm
import glob,csv,re,os,json
from collections import defaultdict
input_folder = "ocr/"
counter = defaultdict(int)
file_paths = glob.glob(input_folder+"*.csv", recursive=False)
file_paths.sort()
header = ["filename","text","x","y","w","h","conf"]
boxes = csv.DictWriter(open("boxes.csv", 'w', encoding='utf8'), header)
boxes.writeheader()
header = ["filename","text"]
scans = csv.DictWriter(open("per-scan.csv", 'w', encoding='utf8'), header)
scans.writeheader()
for file_path in tqdm(file_paths):  #[:10000]):
    filename = os.path.basename(file_path)
    filename = filename.replace(".csv", "")
    words = []
    for row in csv.DictReader(open(file_path)):
        if int(row['x']) < 200 and int(row['w']) < 300:
            continue
        if int(row['x']) > 650:
            continue
        if int(row['h']) <= 10:  # too small: date, quantity, etc.
            continue
        if re.findall("aantal|betrek|betrok|datum", row['text'], re.IGNORECASE):
            continue
        words.append(row['text'])
        row['filename'] = filename
        boxes.writerow(row)
    ##########
    text = " ".join(words).strip()
    text = re.sub(r"\.$", "", text)  # strip a trailing period
    scans.writerow({
        "filename": filename,
        "text": text
    })
    if text:
        counter[text] += 1
counter = dict(sorted(counter.items(), key=lambda x:x[1], reverse=True))
json.dump(counter, open("count.json","w"), indent=2, ensure_ascii=False)
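The correction script earlier in this gist reads count_fixed.json, presumably a manually cleaned-up copy of the count.json written here. A minimal sketch (my own addition, not from the gist) for inspecting the most frequent recognized strings before cleaning them up:

import json

counts = json.load(open("count.json"))
for text, n in list(counts.items())[:20]:  # the counter was saved sorted by frequency
    print(n, text)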