Created
May 14, 2022 20:24
-
-
Save gordyt/9e59b2ade9cbc1271bbf6218303c2fd2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Prerequisites (all can be installed with brew):
#   pdf2image tesseract
# Print the command synopsis plus a short example to stdout, then
# terminate the script with exit status 1.
usage() {
  printf 'usage: %s file1 ...\n' "$0"
  printf '%s\n' \
    'Extract text from each (pdf) file and save it to a file with the same' \
    'name as the source file but with a .txt extension' \
    'Example:' \
    '- original file: sample.pdf' \
    '- new file: sample.txt'
  exit 1
}
# Without any file arguments there is nothing to do: show the help text.
case $# in
  0) usage ;;
esac
#######################################
# OCR a single PDF and write the recognised text next to the source file.
#
# The PDF is copied into a private temporary directory, rendered to
# JUNK*.png page images with pdf2image, each image is run through
# tesseract, and the per-page text files are concatenated (in glob order)
# into the output file.
#
# Output name: the last extension of the file name is replaced by .txt
# (sample.pdf -> sample.txt). Only the file name is inspected, so dots in
# directory names are left alone, and a name without an extension gets
# .txt appended instead of overwriting the source file.
#
# Arguments: $1 - path to the input pdf file
# Outputs:   progress message on stdout, diagnostics on stderr
# Returns:   0 on success, non-zero if the conversion failed.
#            Exits the script with status 1 if the input file does not
#            exist or no temporary directory can be created.
#######################################
convert() {
  local inFile=$1
  local dir='' base stem outFile tmpDir page
  local status=0
  local -a texts=()

  if [[ ! -f "$inFile" ]]; then
    >&2 echo "error: specified input pdf file '$inFile' not found"
    exit 1
  fi

  # Split into directory prefix and file name so that only the file name
  # takes part in the extension handling.
  base=${inFile##*/}
  if [[ "$inFile" == */* ]]; then
    dir=${inFile%/*}/
  fi
  stem=$base
  if [[ "$base" == ?*.* ]]; then
    stem=${base%.*}
  fi
  outFile=${dir}${stem}.txt

  # Never clobber the source (e.g. when the input itself ends in .txt).
  if [[ "$outFile" == "$inFile" ]]; then
    >&2 echo "error: output file would overwrite input file '$inFile'"
    return 1
  fi

  echo "OCR'ing text from $inFile and saving to $outFile"

  # An explicit template works with both the BSD and the GNU mktemp.
  tmpDir=$(mktemp -d "${TMPDIR:-/tmp}/pdf.XXXXXX") || {
    >&2 echo "error: could not create temporary directory"
    exit 1
  }

  # A fixed name inside the temp dir avoids problems with directory
  # components or leading dashes in the original path.
  if cp -- "$inFile" "$tmpDir/input.pdf"; then
    # Subshell: the directory change stays local and nothing is printed
    # the way pushd/popd would.
    (
      cd "$tmpDir" || exit 1
      # The exit status of pdf2image is not relied upon; success is judged
      # by whether text was produced below.
      pdf2image -zoom 3 input.pdf JUNK
      rc=0
      for page in JUNK*.png; do
        [[ -e "$page" ]] || continue  # unmatched glob stays literal
        tesseract "$page" "$page" -l eng txt || rc=1
      done
      exit "$rc"
    ) || status=1
  else
    status=1
  fi

  texts=("$tmpDir"/JUNK*.txt)
  if [[ -e "${texts[0]}" ]]; then
    cat -- "${texts[@]}" > "$outFile" || status=1
  else
    >&2 echo "error: no text could be extracted from '$inFile'"
    status=1
  fi

  rm -rf -- "$tmpDir"
  return "$status"
}
# Process the arguments front to back, consuming each positional
# parameter once it has been handed to convert.
until [[ $# -eq 0 ]]; do
  convert "$1"
  shift
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment