# peterc/screencastocr.py

## screencastocr.py
# Scan screencast videos for (sensitive) text

# MIT licensed – Copyright (c) 2022 Peter Cooper – @cooperx86

# Quite a bit of the code comes from
# https://github.com/RhetTbull/osxphotos/blob/master/osxphotos/text_detection.py
# which is itself MIT licensed and copyright (c) 2019-2021 Rhet Turnbull

import tempfile
import subprocess
import os
import sys
import glob
import objc
import Quartz
from Cocoa import NSURL
import Vision
from PIL import Image
import imagehash

def detect_text(img_path, orientation=None):
    """Run Vision text recognition on one image file and return its text.

    Args:
        img_path: Path of the image file to scan.
        orientation: Optional image orientation (1-8, the
            CGImagePropertyOrientation / EXIF numbering). When given it is
            forwarded to the Vision request handler; when None the handler
            is created without orientation information.

    Returns:
        Every recognized string joined by single spaces. An empty string
        is returned when nothing was recognized or the image could not be
        loaded.

    Raises:
        ValueError: If ``orientation`` is given but is not between 1 and 8.
    """
    # Validate up front so a bad argument fails before any Objective-C
    # object is created.
    if orientation is not None and not 1 <= orientation <= 8:
        raise ValueError("orientation must be between 1 and 8")

    with objc.autorelease_pool():
        input_url = NSURL.fileURLWithPath_(img_path)
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)
        if input_image is None:
            # Missing or undecodable file: report it and skip recognition
            # instead of handing a nil image to Vision.
            print(f"Error: could not load image {img_path}", file=sys.stderr)
            return ''

        if orientation is None:
            vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(input_image, None)
        else:
            vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_orientation_options_(input_image, orientation, None)

        # The completion handler appends each recognized string to `results`.
        results = []
        handler = make_request_handler(results)
        vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(handler)
        #vision_request.setRecognitionLevel_(0)  # 0 is accurate, 1 is fast

        # The call returns a pair; element 1 is the error object, if any.
        error = vision_handler.performRequests_error_([vision_request], None)
        if error[1]:
            # Diagnostics go to stderr so they never mix with the word list
            # the script prints on stdout.
            print(f"Error: {error}", file=sys.stderr)

        # NOTE(review): explicit dealloc is kept from the original code;
        # presumably a workaround for Vision objects not being released
        # promptly -- confirm before removing.
        vision_request.dealloc()
        vision_handler.dealloc()
        return ' '.join(results)

def make_request_handler(results):
    """Build a Vision completion handler that collects recognized text.

    Args:
        results: List that the returned handler appends each recognized
            string to (mutated in place).

    Returns:
        A callable ``handler(request, error)``. When ``error`` is truthy it
        reports the error on stderr and adds nothing; otherwise it appends
        the top candidate string of every observation in
        ``request.results()`` to ``results``.
    """
    def handler(request, error):
        if error:
            # Keep diagnostics off stdout, which carries the script's output.
            print(f"Error: {error}", file=sys.stderr)
            return

        # results() may be None when the request produced nothing.
        for text_observation in request.results() or ():
            candidates = text_observation.topCandidates_(1)
            if not candidates:
                # An observation without candidates has no text to record.
                continue
            results.append(candidates[0].string())
            #results.append([candidates[0].string(), candidates[0].confidence()])

    return handler

# Pipeline: extract frames from the video with ffmpeg (4 per second) into a
# temporary directory, OCR every frame whose average hash differs from the
# previous frame's, and print each distinct word found.

# Input video: first command-line argument, falling back to "in.mp4".
input_video = sys.argv[1] if len(sys.argv) > 1 else "in.mp4"

previous_hash = None
words = set()

# The context manager removes the extracted frames even when ffmpeg or the
# OCR step raises.
with tempfile.TemporaryDirectory() as temp_dir_name:
    output_pattern = os.path.join(temp_dir_name, "%06d.png")
    ffmpeg_options = ("ffmpeg", "-i", input_video, "-vf", "fps=4", output_pattern)

    print("Extracting frames", file=sys.stderr)

    if subprocess.run(ffmpeg_options, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode != 0:
        raise RuntimeError("Error while extracting frames")

    # glob already yields full paths; zero-padded names sort in frame order.
    for image_filename in sorted(glob.glob(os.path.join(temp_dir_name, "*.png"))):
        print(f"Processing {os.path.basename(image_filename)}", file=sys.stderr)
        # Close the image file as soon as its hash has been computed.
        with Image.open(image_filename) as frame:
            image_hash = imagehash.average_hash(frame)
        if image_hash == previous_hash:
            # Same hash as the previous frame: skip the expensive OCR pass.
            print("Skipping duplicate frame", file=sys.stderr)
            continue
        previous_hash = image_hash
        words.update(detect_text(image_filename).split())

# Sorted so the output is reproducible between runs.
print("\n".join(sorted(words)))
	# Scan screencast videos for (sensitive) text

	# MIT licensed – Copyright (c) 2022 Peter Cooper – @cooperx86

	# Quite a bit of the code comes from
	# https://github.com/RhetTbull/osxphotos/blob/master/osxphotos/text_detection.py
	# which is itself MIT licensed and copyright (c) 2019-2021 Rhet Turnbull

	import tempfile
	import subprocess
	import os
	import sys
	import glob
	import objc
	import Quartz
	from Cocoa import NSURL
	import Vision
	from PIL import Image
	import imagehash

	def detect_text(img_path, orientation=None):
		"""Run Vision text recognition on one image file and return its text.

		Args:
			img_path: Path of the image file to scan.
			orientation: Optional image orientation (1-8, the
				CGImagePropertyOrientation / EXIF numbering). When given it
				is forwarded to the Vision request handler; when None the
				handler is created without orientation information.

		Returns:
			Every recognized string joined by single spaces. An empty string
			is returned when nothing was recognized or the image could not
			be loaded.

		Raises:
			ValueError: If ``orientation`` is given but is not between 1 and 8.
		"""
		# Validate up front so a bad argument fails before any Objective-C
		# object is created.
		if orientation is not None and not 1 <= orientation <= 8:
			raise ValueError("orientation must be between 1 and 8")

		with objc.autorelease_pool():
			input_url = NSURL.fileURLWithPath_(img_path)
			input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)
			if input_image is None:
				# Missing or undecodable file: report it and skip recognition
				# instead of handing a nil image to Vision.
				print(f"Error: could not load image {img_path}", file=sys.stderr)
				return ''

			if orientation is None:
				vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(input_image, None)
			else:
				vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_orientation_options_(input_image, orientation, None)

			# The completion handler appends each recognized string to `results`.
			results = []
			handler = make_request_handler(results)
			vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(handler)
			#vision_request.setRecognitionLevel_(0) # 0 is accurate, 1 is fast

			# The call returns a pair; element 1 is the error object, if any.
			error = vision_handler.performRequests_error_([vision_request], None)
			if error[1]:
				# Diagnostics go to stderr so they never mix with the word
				# list the script prints on stdout.
				print(f"Error: {error}", file=sys.stderr)

			# NOTE(review): explicit dealloc is kept from the original code;
			# presumably a workaround for Vision objects not being released
			# promptly -- confirm before removing.
			vision_request.dealloc()
			vision_handler.dealloc()
			return ' '.join(results)

	def make_request_handler(results):
		"""Build a Vision completion handler that collects recognized text.

		Args:
			results: List that the returned handler appends each recognized
				string to (mutated in place).

		Returns:
			A callable ``handler(request, error)``. When ``error`` is truthy
			it reports the error on stderr and adds nothing; otherwise it
			appends the top candidate string of every observation in
			``request.results()`` to ``results``.
		"""
		def handler(request, error):
			if error:
				# Keep diagnostics off stdout, which carries the script's output.
				print(f"Error: {error}", file=sys.stderr)
				return

			# results() may be None when the request produced nothing.
			for text_observation in request.results() or ():
				candidates = text_observation.topCandidates_(1)
				if not candidates:
					# An observation without candidates has no text to record.
					continue
				results.append(candidates[0].string())
				#results.append([candidates[0].string(), candidates[0].confidence()])

		return handler

	# Pipeline: extract frames from the video with ffmpeg (4 per second) into a
	# temporary directory, OCR every frame whose average hash differs from the
	# previous frame's, and print each distinct word found.

	# Input video: first command-line argument, falling back to "in.mp4".
	input_video = sys.argv[1] if len(sys.argv) > 1 else "in.mp4"

	previous_hash = None
	words = set()

	# The context manager removes the extracted frames even when ffmpeg or the
	# OCR step raises.
	with tempfile.TemporaryDirectory() as temp_dir_name:
		output_pattern = os.path.join(temp_dir_name, "%06d.png")
		ffmpeg_options = ("ffmpeg", "-i", input_video, "-vf", "fps=4", output_pattern)

		print("Extracting frames", file=sys.stderr)

		if subprocess.run(ffmpeg_options, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode != 0:
			raise RuntimeError("Error while extracting frames")

		# glob already yields full paths; zero-padded names sort in frame order.
		for image_filename in sorted(glob.glob(os.path.join(temp_dir_name, "*.png"))):
			print(f"Processing {os.path.basename(image_filename)}", file=sys.stderr)
			# Close the image file as soon as its hash has been computed.
			with Image.open(image_filename) as frame:
				image_hash = imagehash.average_hash(frame)
			if image_hash == previous_hash:
				# Same hash as the previous frame: skip the expensive OCR pass.
				print("Skipping duplicate frame", file=sys.stderr)
				continue
			previous_hash = image_hash
			words.update(detect_text(image_filename).split())

	# Sorted so the output is reproducible between runs.
	print("\n".join(sorted(words)))