# nuclearsecrecy/pdfexpert_batch_ocr.applescript

## pdfexpert_batch_ocr.applescript
# Applescript to batch OCR PDFs using PDF Expert.
# By Alex Wellerstein. Last updated June 18, 2024. No copyright asserted -- released for public domain use.
# Absolutely no warranties, guarantees, promises, ANYTHING provided. Use at your own risk.
#
# Will automatically save and close each PDF after OCR completes.
# Assumes PDF Expert is the default program to open PDFs!
# Does not have robust error handling. Held together with duct tape.
# Just a temporary solution until Readdle actually supports batch operations.
# Seems to work on OS 14.3.1, with PDF Expert 3.10.4.
# Has not been extensively tested to see what happens if you try to do other work while it is running; could foul up.
# When I use it, I just let it run for a while by itself, and don't touch anything while it is running.

# Prompt user for PDFs. With "multiple selections allowed" the result is a list,
# which is indexed item by item below.
set theFiles to choose file with prompt "Select all PDFs to run OCR on:" of type {"pdf"} with multiple selections allowed

# Initialise the script's progress reporting.
set totalFiles to count of theFiles
set progress total steps to totalFiles
set progress completed steps to 0
set progress description to "Processing files..."
set progress additional description to "Beginning."


# iterate over all of them
repeat with i from 1 to totalFiles
	delay 0.5 # these delays are just in case things are taking a moment
	set filePath to item i of theFiles
	set progress additional description to "Processing file " & i & " of " & totalFiles
	pdfexpert_ocr(POSIX path of filePath)
	# Count the file as completed only once its OCR pass has actually returned.
	set progress completed steps to i

	# will close PDF Expert to clear out RAM every 10 files -- change as necessary
	if i mod 10 is 0 then
		tell application "PDF Expert"
			quit
		end tell
		# Wait (bounded to roughly 10 seconds) for the quit to finish, so the next
		# `open` is less likely to hit an instance that is still shutting down.
		repeat 20 times
			if not (running of application "PDF Expert") then exit repeat
			delay 0.5
		end repeat
		delay 1
	end if
end repeat

log ("All done")

# Main handler for OCRing a single file with PDF Expert and saving it.
# Opens the PDF with the default application, drives PDF Expert's
# "Scan & OCR" > "Recognize Text" UI through System Events, waits for the
# progress sheet to go away, then saves the document and closes its window.
#   filePath -- POSIX path (text) of the PDF to process.
# Shows a dialog and raises error -128 if no PDF Expert window whose title
# matches the file name (minus ".pdf") can be found.
on pdfexpert_ocr(filePath)
	log ("starting to process " & filePath)
	do shell script "open " & quoted form of filePath # assumes PDF Expert is default PDF reader
	delay 0.5
	activate application "PDF Expert"
	delay 0.5
	# The document window is located by title: the file name without ".pdf".
	set fileTitle to getFilename(filePath, ".pdf")
	tell application "System Events"
		tell process "PDF Expert"

			# Find the right window. Poll for up to ~10 seconds instead of trying
			# just twice, since the app may still be launching (e.g. right after
			# the periodic quit) or the document may be slow to open.
			set win to false
			repeat 20 times
				try
					repeat with i from 1 to (count of windows)
						if (title of window i) is fileTitle then
							set win to window i
						end if
					end repeat
				end try # any error here (e.g. UI not ready yet) just means "not found yet"
				if win is not false then exit repeat
				delay 0.5
			end repeat

			# something went wrong, abort
			if (win is false) then
				display dialog "File '" & fileTitle & "' apparently failed to open in PDF Expert."
				error number -128
			end if

			# otherwise...

			# click "Recognize Text" from "Scan & OCR" menu
			click menu item "Recognize Text" of menu 1 of menu bar item "Scan & OCR" of menu bar 1

			# click "Recognize..." on side bar.
			# note that finding the right way to reference this was a pain.
			# I finally found it through "get entire contents of front window"
			tell button "Recognize..." of group 1 of group 1 of group 2 of UI element 1 of win
				click
				# If there is more than 1 page, then it asks how many pages to do.
				# We want to select "All" in that case. If there is only 1 page, this doesn't come up, and
				# starts automatically after the previous step.
				if exists radio button "All" of sheet 1 of win then
					tell radio button "All" of sheet 1 of win # all pages plz
						click
					end tell
					tell button "Apply" of sheet 1 of win #start OCR
						click
					end tell
				end if
			end tell

			# Wait a slight delay before we start checking if it is done.
			delay 1

			# This searches for the sign that the OCR has completed. This is finicky
			# and has been the source of most issues with this script.
			# Basically, we try to see if the progress indicator, which is kept on a modal "sheet",
			# has left us. Looking for it will occasionally raise an error of its own once it is
			# gone, so the whole check is wrapped in a `try` and any error is treated as "done".
			# Which probably isn't failproof.
			repeat
				try
					# These tiers are to avoid an error if something disappears along
					# the hierarchy (which apparently can happen).
					if not (exists win) then exit repeat
					if ((count of sheets of win) = 0) then exit repeat
					if not (exists progress indicator 1 of sheet 1 of win) then exit repeat
				on error
					# one of the weird failure modes
					exit repeat
				end try
				# Pause between checks rather than querying System Events in a tight loop.
				delay 0.5
			end repeat

			# Just half a sec to make sure it is done.
			delay 0.5

			# Then save file and close window
			click menu item "Save" of menu 1 of menu bar item "File" of menu bar 1
			delay 1
			click menu item "Close Window" of menu 1 of menu bar item "File" of menu bar 1
			delay 0.5
			log ("OCR run on " & filePath)
		end tell
	end tell
end pdfexpert_ocr

# Returns just the file's name, as reported by Finder.
#   theFile      -- POSIX path (text) of the file; it is coerced to an alias.
#   theExtension -- suffix to strip (e.g. ".pdf"); pass "" to keep the full name.
# The suffix is removed at most once, so "scan.pdf.pdf" yields "scan.pdf"
# rather than "scan". A name that is nothing but the suffix yields "".
on getFilename(theFile, theExtension)
	tell application "Finder" to set fName to name of (POSIX file theFile as alias)
	if theExtension is "" then return fName
	if fName does not end with theExtension then return fName
	set keepLength to (length of fName) - (length of theExtension)
	# Nothing is left once the suffix is removed.
	if keepLength < 1 then return ""
	return text 1 thru keepLength of fName
end getFilename

# Strips every leading and/or trailing repetition of a substring from a text.
#   theText             -- the text to trim.
#   theCharactersToTrim -- substring to remove; matched as a whole unit, not as a set of characters.
#   theTrimDirection    -- "beginning", "end" or "both"; any other value leaves theText unchanged.
# Returns the trimmed text, or "" when theText consists of nothing but the trim string.
on trimText(theText, theCharactersToTrim, theTrimDirection)
	set theTrimLength to length of theCharactersToTrim
	# An empty trim string has nothing to remove; return early rather than
	# depend on how `begins with ""` / `ends with ""` evaluate.
	if theTrimLength is 0 then return theText
	if theTrimDirection is in {"beginning", "both"} then
		repeat while theText begins with theCharactersToTrim
			try
				# `text m thru n` yields text directly; `characters ... as string`
				# would join the pieces using the current text item delimiters.
				set theText to text (theTrimLength + 1) thru -1 of theText
			on error
				-- text contains nothing but trim characters
				return ""
			end try
		end repeat
	end if
	if theTrimDirection is in {"end", "both"} then
		repeat while theText ends with theCharactersToTrim
			try
				set theText to text 1 thru -(theTrimLength + 1) of theText
			on error
				-- text contains nothing but trim characters
				return ""
			end try
		end repeat
	end if
	return theText
end trimText
	# Applescript to batch OCR PDFs using PDF Expert.
	# By Alex Wellerstein. Last updated June 18, 2024. No copyright asserted -- released for public domain use.
	# Absolutely no warranties, guarantees, promises, ANYTHING provided. Use at your own risk.
	#
	# Will automatically save and close each PDF after OCR completes.
	# Assumes PDF Expert is the default program to open PDFs!
	# Does not have robust error handling. Held together with duct tape.
	# Just a temporary solution until Readdle actually supports batch operations.
	# Seems to work on OS 14.3.1, with PDF Expert 3.10.4.
	# Has not been extensively tested to see what happens if you try to do other work while it is running; could foul up.
	# When I use it, I just let it run for a while by itself, and don't touch anything while it is running.

	# Prompt user for PDFs. With "multiple selections allowed" the result is a list,
	# which is indexed item by item below.
	set theFiles to choose file with prompt "Select all PDFs to run OCR on:" of type {"pdf"} with multiple selections allowed

	# Initialise the script's progress reporting.
	set totalFiles to count of theFiles
	set progress total steps to totalFiles
	set progress completed steps to 0
	set progress description to "Processing files..."
	set progress additional description to "Beginning."


	# iterate over all of them
	repeat with i from 1 to totalFiles
		delay 0.5 # these delays are just in case things are taking a moment
		set filePath to item i of theFiles
		set progress additional description to "Processing file " & i & " of " & totalFiles
		pdfexpert_ocr(POSIX path of filePath)
		# Count the file as completed only once its OCR pass has actually returned.
		set progress completed steps to i

		# will close PDF Expert to clear out RAM every 10 files -- change as necessary
		if i mod 10 is 0 then
			tell application "PDF Expert"
				quit
			end tell
			# Wait (bounded to roughly 10 seconds) for the quit to finish, so the next
			# `open` is less likely to hit an instance that is still shutting down.
			repeat 20 times
				if not (running of application "PDF Expert") then exit repeat
				delay 0.5
			end repeat
			delay 1
		end if
	end repeat

	log ("All done")

	# Main handler for OCRing a single file with PDF Expert and saving it.
	# Opens the PDF with the default application, drives PDF Expert's
	# "Scan & OCR" > "Recognize Text" UI through System Events, waits for the
	# progress sheet to go away, then saves the document and closes its window.
	#   filePath -- POSIX path (text) of the PDF to process.
	# Shows a dialog and raises error -128 if no PDF Expert window whose title
	# matches the file name (minus ".pdf") can be found.
	on pdfexpert_ocr(filePath)
		log ("starting to process " & filePath)
		do shell script "open " & quoted form of filePath # assumes PDF Expert is default PDF reader
		delay 0.5
		activate application "PDF Expert"
		delay 0.5
		# The document window is located by title: the file name without ".pdf".
		set fileTitle to getFilename(filePath, ".pdf")
		tell application "System Events"
			tell process "PDF Expert"

				# Find the right window. Poll for up to ~10 seconds instead of trying
				# just twice, since the app may still be launching (e.g. right after
				# the periodic quit) or the document may be slow to open.
				set win to false
				repeat 20 times
					try
						repeat with i from 1 to (count of windows)
							if (title of window i) is fileTitle then
								set win to window i
							end if
						end repeat
					end try # any error here (e.g. UI not ready yet) just means "not found yet"
					if win is not false then exit repeat
					delay 0.5
				end repeat

				# something went wrong, abort
				if (win is false) then
					display dialog "File '" & fileTitle & "' apparently failed to open in PDF Expert."
					error number -128
				end if

				# otherwise...

				# click "Recognize Text" from "Scan & OCR" menu
				click menu item "Recognize Text" of menu 1 of menu bar item "Scan & OCR" of menu bar 1

				# click "Recognize..." on side bar.
				# note that finding the right way to reference this was a pain.
				# I finally found it through "get entire contents of front window"
				tell button "Recognize..." of group 1 of group 1 of group 2 of UI element 1 of win
					click
					# If there is more than 1 page, then it asks how many pages to do.
					# We want to select "All" in that case. If there is only 1 page, this doesn't come up, and
					# starts automatically after the previous step.
					if exists radio button "All" of sheet 1 of win then
						tell radio button "All" of sheet 1 of win # all pages plz
							click
						end tell
						tell button "Apply" of sheet 1 of win #start OCR
							click
						end tell
					end if
				end tell

				# Wait a slight delay before we start checking if it is done.
				delay 1

				# This searches for the sign that the OCR has completed. This is finicky
				# and has been the source of most issues with this script.
				# Basically, we try to see if the progress indicator, which is kept on a modal "sheet",
				# has left us. Looking for it will occasionally raise an error of its own once it is
				# gone, so the whole check is wrapped in a `try` and any error is treated as "done".
				# Which probably isn't failproof.
				repeat
					try
						# These tiers are to avoid an error if something disappears along
						# the hierarchy (which apparently can happen).
						if not (exists win) then exit repeat
						if ((count of sheets of win) = 0) then exit repeat
						if not (exists progress indicator 1 of sheet 1 of win) then exit repeat
					on error
						# one of the weird failure modes
						exit repeat
					end try
					# Pause between checks rather than querying System Events in a tight loop.
					delay 0.5
				end repeat

				# Just half a sec to make sure it is done.
				delay 0.5

				# Then save file and close window
				click menu item "Save" of menu 1 of menu bar item "File" of menu bar 1
				delay 1
				click menu item "Close Window" of menu 1 of menu bar item "File" of menu bar 1
				delay 0.5
				log ("OCR run on " & filePath)
			end tell
		end tell
	end pdfexpert_ocr

	# Returns just the file's name, as reported by Finder.
	#   theFile      -- POSIX path (text) of the file; it is coerced to an alias.
	#   theExtension -- suffix to strip (e.g. ".pdf"); pass "" to keep the full name.
	# The suffix is removed at most once, so "scan.pdf.pdf" yields "scan.pdf"
	# rather than "scan". A name that is nothing but the suffix yields "".
	on getFilename(theFile, theExtension)
		tell application "Finder" to set fName to name of (POSIX file theFile as alias)
		if theExtension is "" then return fName
		if fName does not end with theExtension then return fName
		set keepLength to (length of fName) - (length of theExtension)
		# Nothing is left once the suffix is removed.
		if keepLength < 1 then return ""
		return text 1 thru keepLength of fName
	end getFilename

	# Strips every leading and/or trailing repetition of a substring from a text.
	#   theText             -- the text to trim.
	#   theCharactersToTrim -- substring to remove; matched as a whole unit, not as a set of characters.
	#   theTrimDirection    -- "beginning", "end" or "both"; any other value leaves theText unchanged.
	# Returns the trimmed text, or "" when theText consists of nothing but the trim string.
	on trimText(theText, theCharactersToTrim, theTrimDirection)
		set theTrimLength to length of theCharactersToTrim
		# An empty trim string has nothing to remove; return early rather than
		# depend on how `begins with ""` / `ends with ""` evaluate.
		if theTrimLength is 0 then return theText
		if theTrimDirection is in {"beginning", "both"} then
			repeat while theText begins with theCharactersToTrim
				try
					# `text m thru n` yields text directly; `characters ... as string`
					# would join the pieces using the current text item delimiters.
					set theText to text (theTrimLength + 1) thru -1 of theText
				on error
					-- text contains nothing but trim characters
					return ""
				end try
			end repeat
		end if
		if theTrimDirection is in {"end", "both"} then
			repeat while theText ends with theCharactersToTrim
				try
					set theText to text 1 thru -(theTrimLength + 1) of theText
				on error
					-- text contains nothing but trim characters
					return ""
				end try
			end repeat
		end if
		return theText
	end trimText