Skip to content

Instantly share code, notes, and snippets.

@tjluoma
Created March 28, 2020 03:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tjluoma/e175022bba39356259a376f0db65c830 to your computer and use it in GitHub Desktop.
Save tjluoma/e175022bba39356259a376f0db65c830 to your computer and use it in GitHub Desktop.
This script will bulk OCR all of the PDFs in a given directory. https://github.com/jbarlow83/OCRmyPDF is doing all of the hard work, TBH.
#!/usr/bin/env zsh -f
# Purpose: OCR all of the PDFs in a given directory
#
# From: Timothy J. Luoma
# Mail: luomat at gmail dot com
# Date: 2020-03-27
#
# NOTE(review): the shebang hands two words ('zsh' and '-f') to 'env'. That relies on
# the platform splitting shebang arguments (macOS does; Linux traditionally passes
# 'zsh -f' as one word) -- confirm before running this anywhere but macOS.
#
# ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️ !!! ⚠️⚠️⚠️
# !!! ⚠️⚠️⚠️ you *MUST* change this to the directory where all of your PDFs are that you want to OCR !!!
# Only files directly inside this directory whose names end in '.pdf' (any letter case)
# are processed. The script exits with an error if the directory does not exist.
DIR="$HOME/AllMyPDFs"
# This is the folder where PDFs will be stored after they are OCR'd
# You can change this if you want. The directory will be created if needed.
# A PDF is skipped when a file with the same name already exists in this folder.
OUT="$HOME/OCRedPDFs"
# don't change this
# (the name of this script without its directory or extension, via the zsh ':t' and ':r'
# modifiers; it prefixes every message and is part of the log file names)
NAME="$0:t:r"
# this is where error messages will be saved
# you could change/rename this if you wanted to
# (output from failed OCR runs is appended to this file; it is not cleared between runs)
ERROR_LOG="$HOME/Desktop/${NAME}.errors.log"
#########################################################################################################
###
### You should not need to change anything below this line
###
#########################################################################################################
# Settle on a PATH: use the built-in default unless the user keeps their own
# PATH definition in "$HOME/.path", in which case that file is sourced instead.
if [[ ! -e "$HOME/.path" ]]
then
	PATH="$HOME/scripts:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/bin"
else
	source "$HOME/.path"
fi

# 'ocrmypdf' does the real work, so give up (exit status 2) when zsh cannot
# find it anywhere in the PATH chosen above.
(( $+commands[ocrmypdf] )) || {
	echo "$NAME: 'ocrmypdf' is required but not found in $PATH" >>/dev/stderr
	echo "$NAME: The easiest way to install it is with 'brew'." >>/dev/stderr
	exit 2
}
# make sure the directory holding the PDFs to OCR exists. Unlike the output
# directory, it is never created here: if it is missing (or is not a directory)
# we explain how to fix the 'DIR=' setting and exit with status 2.
if [[ ! -d "$DIR" ]]
then
echo "$NAME: Ummmm. '$DIR' does not exist or is not a directory." >>/dev/stderr
echo "$NAME: You should change the line 'DIR=' in '$0' to point to the directory you want to use." >>/dev/stderr
exit 2
fi
# Make sure the output directory exists, creating it (and any missing parent
# directories) when it is absent.
if [[ ! -d "$OUT" ]]
then
	echo "$NAME: Creating '$OUT'..."
	mkdir -p "$OUT"

	# mkdir's own exit status is not consulted; whether the directory is
	# actually there afterwards is what decides success. Exit 2 if it is not.
	[[ -d "$OUT" ]] || {
		echo "$NAME: failed to create '$OUT' for some reason." >>/dev/stderr
		exit 2
	}
fi
# Work from inside the source directory so every PDF can be referred to by its
# bare name. Stop right away if that fails: carrying on would OCR the PDFs of
# whatever directory the script happened to be started in.
cd "$DIR" || {
	echo "$NAME: could not change into '$DIR'." >>/dev/stderr
	exit 2
}

# number of PDFs that ocrmypdf failed on
COUNT='0'

# when no PDF matches, let the glob below expand to nothing instead of
# raising zsh's 'no matches found' error
setopt NULL_GLOB

# Walk the PDFs with a glob instead of parsing 'ls' through 'read', which
# dropped backslashes and leading/trailing blanks from file names (so those
# files could never be found). The bracket pairs keep the match
# case-insensitive ('.pdf', '.PDF', '.Pdf', ...). The loop runs in the current
# shell, so COUNT is still set after 'done'.
for FILE in *.[pP][dD][fF]
do
	# if we have already OCR'd this PDF then there's no need to do it again
	if [[ -e "$OUT/$FILE" ]]
	then
		echo "$NAME: '$OUT/$FILE' already exists. Skipping..."
		continue
	fi

	echo "\n${NAME}: Starting OCR on '$FILE'..."

	# The './' prefix keeps a file name that starts with '-' from being read
	# as an option. stderr is captured in a per-file log so that it can be
	# folded into $ERROR_LOG if the run fails.
	# NOTE(review): after a successful run the per-file log is left behind in
	# $OUT -- confirm whether that is intended.
	ocrmypdf --output-type pdf --skip-text "./$FILE" "$OUT/$FILE" 2> "$OUT/$NAME.$FILE.errors.log"
	EXIT="$?"

	if [[ "$EXIT" == "0" ]]
	then
		echo "$NAME: Successfully completed OCR on '$FILE'...\n"
	else
		# put the filename in the error log so we know which file it is about
		echo "\n$NAME: errors from '$FILE':" >> "$ERROR_LOG"

		# add the error message to the Error Log
		cat "$OUT/$NAME.$FILE.errors.log" >> "$ERROR_LOG"

		# get rid of the individual error log (no longer needed)
		mv -vf "$OUT/$NAME.$FILE.errors.log" "$HOME/.Trash/"

		# tell the user we failed (on screen and in the Error Log)
		echo "$NAME: OCR on '$FILE' FAILED (\$EXIT = $EXIT)" | tee -a "$ERROR_LOG"

		# remove any file that was created by the failed process, if it exists,
		# so that the "already exists" check above does not skip it next time
		[[ -e "$OUT/$FILE" ]] && rm -f "$OUT/$FILE"

		# increment error counter
		((COUNT++))
	fi
done
# Report the outcome and exit. The exit status is 0 when nothing failed and
# otherwise the number of PDFs that failed, capped at 255.
if [[ "$COUNT" = "0" ]]
then
	echo "$NAME: Finished processing all files in '$DIR' with no errors."
	exit 0
fi

# at least one failure: use the singular or plural wording as appropriate
if [[ "$COUNT" = "1" ]]
then
	echo "$NAME: Finished processing all files in '$DIR' with 1 error. See '$ERROR_LOG'"
else
	echo "$NAME: Finished processing all files in '$DIR' with $COUNT errors. See '$ERROR_LOG'"
fi

# reveal error log in Finder
open -R "$ERROR_LOG"

# An exit status is a single byte, so a larger count would wrap around
# (256 failures would be reported as 0, i.e. success). Clamp it instead.
(( COUNT > 255 )) && exit 255
exit $COUNT
#EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment