Skip to content

Instantly share code, notes, and snippets.

@gordyt
Created May 14, 2022 20:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gordyt/40c4c9842e93f32af557ed6395e70db2 to your computer and use it in GitHub Desktop.
Save gordyt/40c4c9842e93f32af557ed6395e70db2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Adds a hidden (searchable) text layer to each PDF named on the command line.
#
# Prerequisites (all can be brew installed):
# pdf2image qpdf tesseract
#
# replace: when true, convert() writes its result over the input file instead
# of creating a sibling "*.ocr.*" file. It starts out false and is switched to
# true when the -r argument is encountered.
replace=false
# Print the command-line help text on stdout, then terminate the script with
# exit status 1. $0 is expanded so the help shows the name the script was
# invoked under.
usage() {
  printf '%s\n' \
    "usage: $0 [-r] file1 ..." \
    'Convert each (pdf) file to one that has the hidden (searchable) text layer.' \
    'If the -r option is specified, each file will be REPLACED.' \
    'If the -r option is NOT specified, a new PDF will be created.' \
    'Example:' \
    '- original file: sample.pdf' \
    '- new file: sample.ocr.pdf'
  exit 1
}
# Nothing to do without at least one argument: show the help text instead
# (usage ends the script with status 1).
[[ $# -gt 0 ]] || usage
#######################################
# Produce a searchable copy of one PDF: render its pages to PNG images with
# pdf2image, OCR every page into a single-page PDF with tesseract, and merge
# those pages with qpdf.
# Globals:
#   replace (read) - "true" writes the result over the input file; any other
#                    value (or unset) writes a sibling file with ".ocr"
#                    inserted before the extension (sample.pdf ->
#                    sample.ocr.pdf; a name without extension gets ".ocr"
#                    appended).
# Arguments:
#   $1 - path to the input PDF (relative or absolute).
# Outputs:
#   A progress line on stdout; diagnostics on stderr.
# Returns:
#   0 on success. Exits the script with status 1 if the input is missing or
#   any conversion step fails; the destination is left untouched in that case.
#######################################
convert() {
  local inFile="$1"
  local outFile dir base tmpDir status

  # Compare as a string instead of executing $replace: an unset/empty value
  # would otherwise expand to an empty command, which counts as success.
  if [ "${replace:-false}" = true ]; then
    outFile="$inFile"
  else
    # Work on the file name only, so a dot inside a directory component
    # (my.dir/scan) is never mistaken for the extension separator.
    case "$inFile" in
      */*) dir="${inFile%/*}/" ;;
      *) dir="" ;;
    esac
    base="${inFile##*/}"
    case "$base" in
      *.*) outFile="${dir}${base%.*}.ocr.${base##*.}" ;;
      # No extension: append the marker so the input is never overwritten.
      *) outFile="${dir}${base}.ocr" ;;
    esac
  fi

  echo "Converting file $inFile to $outFile"
  if ! [ -f "$inFile" ]; then
    >&2 echo "error: specified input pdf file '$inFile' not found"
    exit 1
  fi

  # An explicit template works with both BSD (macOS) and GNU mktemp.
  if ! tmpDir=$(mktemp -d "${TMPDIR:-/tmp}/pdfocr.XXXXXX"); then
    >&2 echo "error: unable to create a temporary directory"
    exit 1
  fi

  # All intermediate work happens in a subshell, so the caller's working
  # directory is never changed and every failure path falls through to the
  # cleanup below.
  status=0
  (
    # Copy under a fixed name: the input may carry directory components that
    # do not exist inside the scratch directory.
    cp -- "$inFile" "$tmpDir/input.pdf" || exit 1
    cd "$tmpDir" || exit 1

    # NOTE(review): pdf2image's exit-status convention is not visible here,
    # so success is judged by whether page images were produced.
    pdf2image -zoom 3 input.pdf JUNK

    pages=0
    for p in JUNK*.png; do
      [ -e "$p" ] || continue # glob matched nothing
      # Abort on an OCR failure rather than silently dropping a page.
      tesseract "$p" "$p" -l eng pdf || exit 1
      pages=$((pages + 1))
    done
    if [ "$pages" -eq 0 ]; then
      >&2 echo "error: no page images were produced for '$inFile'"
      exit 1
    fi

    # Merge into the scratch directory first; the destination is only
    # touched once a complete result exists.
    qpdf --empty --pages JUNK*.pdf -- merged.pdf
    rc=$?
    # qpdf exits with 3 when it succeeded but emitted warnings.
    [ "$rc" -eq 0 ] || [ "$rc" -eq 3 ] || exit 1
  ) || status=$?

  if [ "$status" -eq 0 ]; then
    cp -- "$tmpDir/merged.pdf" "$outFile" || status=1
  fi

  rm -rf -- "$tmpDir"

  if [ "$status" -ne 0 ]; then
    >&2 echo "error: failed to convert '$inFile'"
    exit 1
  fi
}
# Walk the arguments left to right, consuming each one. "-r" turns on replace
# mode for every file that follows it; anything else is treated as a file to
# convert.
until [[ $# -eq 0 ]]; do
  case "$1" in
    -r) replace=true ;;
    *) convert "$1" ;;
  esac
  shift
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment