Created
May 14, 2022 20:13
-
-
Save gordyt/40c4c9842e93f32af557ed6395e70db2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Prerequisites (all can be brew installed):
#   pdf2image qpdf tesseract

# Global flag read by convert(): "true" means each input PDF is overwritten
# in place; "false" means a sibling "<name>.ocr.<ext>" file is written.
# It is switched to true when -r is seen on the command line.
replace=false
#######################################
# Print the command-line help text and terminate the script.
# Arguments: none
# Outputs:   help text on stdout ($0 is expanded to the invoked script name)
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
  # Unquoted delimiter on purpose: $0 must be expanded inside the here-doc.
  cat << EOF
usage: $0 [-r] file1 ...
Convert each (pdf) file to one that has the hidden (searchable) text layer.
If the -r option is specified, each file will be REPLACED.
If the -r option is NOT specified, a new PDF will be created.
Example:
- original file: sample.pdf
- new file: sample.ocr.pdf
EOF
  exit 1
}
# With no command-line arguments there is nothing to convert:
# show the help text (usage exits with status 1).
if [ $# -eq 0 ]; then
  usage
fi
#######################################
# Add a hidden, searchable text layer to one PDF.
#
# The PDF is rasterised page by page with pdf2image, each page image is run
# through tesseract to produce a one-page searchable PDF, and qpdf then
# concatenates those pages into the final document.
#
# Globals:   replace (read) - "true" overwrites the input file in place;
#            anything else writes "<name>.ocr.<ext>" next to the input
#            ("<name>.ocr.pdf" when the input has no extension).
# Arguments: $1 - path to the input PDF (may contain directories and spaces)
# Outputs:   a progress line on stdout; error messages on stderr
# Returns:   0 on success; on any failure the whole script exits with
#            status 1 and the destination file is left untouched.
#######################################
convert() {
  local inFile="$1"
  local outFile tmpDir dir base page rc

  # Validate before announcing or creating anything.
  if ! [ -f "$inFile" ]; then
    >&2 echo "error: specified input pdf file '$inFile' not found"
    exit 1
  fi

  if [ "${replace:-false}" = true ]; then
    outFile="$inFile"
  else
    # Split off the directory so that dots in directory names (or a leading
    # "./") are never mistaken for the file extension.
    case "$inFile" in
      */*) dir="${inFile%/*}/" ;;
      *) dir="" ;;
    esac
    base="${inFile##*/}"
    case "$base" in
      ?*.*) outFile="${dir}${base%.*}.ocr.${base##*.}" ;;
      # No extension: never fall back to the input name, which would
      # silently overwrite the original without -r.
      *) outFile="${dir}${base}.ocr.pdf" ;;
    esac
  fi
  echo "Converting file $inFile to $outFile"

  # An explicit XXXXXX template is accepted by both BSD and GNU mktemp.
  tmpDir=$(mktemp -d "${TMPDIR:-/tmp}/pdfocr.XXXXXX") || {
    >&2 echo "error: could not create a temporary directory"
    exit 1
  }

  rc=0
  # Fixed work-file name: independent of directories in $inFile and
  # guaranteed not to match the JUNK* globs below.
  cp -- "$inFile" "$tmpDir/input.pdf" || rc=1

  if [ "$rc" -eq 0 ]; then
    # Subshell keeps the caller's working directory intact, so a relative
    # $outFile is still valid afterwards.
    (
      cd "$tmpDir" || exit 1
      pdf2image -zoom 3 input.pdf JUNK || exit 1
      for page in JUNK*.png; do
        # An unmatched glob stays literal: that means no pages were rendered.
        [ -e "$page" ] || exit 1
        tesseract "$page" "$page" -l eng pdf || exit 1
      done
    ) || rc=1
  fi

  if [ "$rc" -eq 0 ]; then
    # Assemble into the temp dir first so a failed merge cannot clobber the
    # original when -r is in effect.
    qpdf --empty --pages "$tmpDir"/JUNK*.pdf -- "$tmpDir/output.pdf" || rc=$?
    # qpdf exits with 3 for "warnings only"; the output is still written.
    if [ "$rc" -eq 3 ]; then
      rc=0
    fi
  fi

  if [ "$rc" -eq 0 ]; then
    cp -- "$tmpDir/output.pdf" "$outFile" || rc=1
  fi

  # Single cleanup point for every path that created the temp dir.
  rm -rf -- "$tmpDir"

  if [ "$rc" -ne 0 ]; then
    >&2 echo "error: failed to convert '$inFile'"
    exit 1
  fi
}
# Walk the arguments left to right. "-r" switches on in-place replacement
# for every file that follows it; any other argument is a PDF to convert.
converted=0
while [ $# -gt 0 ]; do
  case "$1" in
    -r)
      replace=true
      ;;
    *)
      convert "$1"
      converted=$((converted + 1))
      ;;
  esac
  shift
done

# Only options were given (e.g. just "-r"): nothing was converted, so show
# the help text instead of exiting silently.
if [ "$converted" -eq 0 ]; then
  usage
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment