Last active
November 26, 2018 06:22
-
-
Save alexander-heimbuch/2aea20294fd326f768784f8fe11b1adf to your computer and use it in GitHub Desktop.
cli tool for inline pdf ocr on mac osx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# ocr.sh for Mac OSX 10.11
# Adds an OCR text layer to a PDF. Intermediate files are kept in a
# per-run scratch directory (~/.ocr-DATE_PID).
# needs pdftk, pdftoppm, pdftotext, tesseract-ocr, ghostscript (gs)
# Port from multithreaded OCR with tesseract 3.03/3.04 (Michael Luthardt <edv@dr-luthardt.de> 2016, https://dr-luthardt.de/linux.htm?tip=pdfx)
# PREREQUISITES (requires homebrew)
# 1 - tesseract: brew install tesseract --with-all-languages (all languages will take up to 1.2gb space, see https://github.com/tesseract-ocr/tesseract/wiki to install only selected ones)
# 2 - pdftk: https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk_server-2.02-mac_osx-10.11-setup.pkg
# 3 - pdftotext, pdftoppm: brew install poppler
# 4 - gs (used below to validate the input file): brew install ghostscript

# Scratch directory. The PID suffix keeps two runs started on the same day
# from sharing (and deleting) each other's page files.
WORKDIR="$HOME/.ocr-$(date +"%m_%d_%Y")_$$"

# check for valid PDF as first argument:
# ghostscript renders page 1 to the nullpage device; a non-zero exit status
# is treated as "not a PDF". Output goes to /dev/null (the former "nul" is
# the Windows device name and is just a regular file name on macOS).
if [ $# -lt 1 ] || ! gs -q -o /dev/null -sDEVICE=nullpage -dFirstPage=1 -dLastPage=1 "$1" &>/dev/null; then
  cat << eot
First argument is not a valid PDF or doesn't exist.
Usage: [path/to/]pdffile [lan] [-y]
See tesseract --list-langs for installed languages.
eot
  exit 1
fi
# check for parameters
# language as 2nd argument, if not, set default (eng+deu)
# it's your own task to provide lan.traineddata
# with option -y (as 2nd or 3rd argument) OCR is forced anyway

# _ocr_pick_lang CANDIDATE
# Prints CANDIDATE when it is usable as a tesseract language spec, otherwise
# prints the default "eng+deu".
# Accepted: any 3-character value (the historical rule) plus '+'-joined
# combinations and underscore variants such as "eng+fra" or "chi_sim".
# Everything else (empty string, "-y", ...) yields the default.
_ocr_pick_lang() {
  local candidate=$1
  # regex kept in a variable so it behaves the same on bash 3.2 and later
  local spec='^[a-z]{3}(_[a-z0-9]+)*(\+[a-z]{3}(_[a-z0-9]+)*)*$'
  if [ ${#candidate} -eq 3 ] || [[ $candidate =~ $spec ]]; then
    printf '%s\n' "$candidate"
  else
    printf '%s\n' "eng+deu"
  fi
}

FORCE=0 # as a precaution
# NOTE(review): LANG is also the locale environment variable, so this value
# leaks into child processes as their locale. The name is kept because the
# ocr() function reads $LANG; renaming needs a coordinated change.
LANG=$(_ocr_pick_lang "$2")
if [ "x$2" == "x-y" ] || [ "x$3" == "x-y" ]; then
  FORCE=1
fi
# Unless OCR is forced, look for an existing text layer: pdftotext dumps the
# PDF's text to stdout and grep counts the lines containing a letter.
# A non-zero count triggers a confirmation prompt; any answer other than a
# literal "y" ends the script with status 2.
if [ $FORCE -eq 0 ]; then
  unset -v ANS # as a precaution
  if [ 0 -ne $(pdftotext "$1" - | grep -cE '[[:alpha:]]' 2>/dev/null) ]; then
    echo -e "\nThis PDF contains a text layer."
    read -r -p "Proceed anyway? Old text will be removed. [N|y] " ANS
    case "x$ANS" in
      xy) ;;
      *)
        echo
        exit 2
        ;;
    esac
  fi
fi
# Prepare the scratch directory and split the input PDF into single pages.
FILE=$(basename "$1")
# find number of cpus, calculate maxjobs (half the CPUs, rounded up, min. 1)
NCPUS=$(sysctl -n hw.ncpu)
MAXJOBS=$(((NCPUS + 1) / 2))
[ "$MAXJOBS" -ge 1 ] || MAXJOBS=1
# Copy the input while the (possibly relative) path in $1 still resolves,
# then enter the scratch directory.
mkdir -p "$WORKDIR" || { echo "Cannot create work directory '$WORKDIR'." >&2; exit 1; }
if ! cp "$1" "$WORKDIR/"; then
  echo "Cannot copy '$1' to '$WORKDIR'." >&2
  rmdir "$WORKDIR" 2>/dev/null # only succeeds if the directory is still empty
  exit 1
fi
# Abort if cd fails: the rm below must never run in the caller's directory.
cd "$WORKDIR" || { echo "Cannot enter work directory '$WORKDIR'." >&2; exit 1; }
rm -f pg_* pg-* # as a precaution
tput bold; echo -e "\n${FILE}:"; tput sgr0
# The file "err" is the shared error flag: 0 = fine, anything else = failure.
echo 0 > err
# burst input PDF into pages (pg_NNNN.pdf plus doc_data.txt)
if pdftk "$FILE" burst; then
  echo "$(sed -n '/NumberOfPages/s/NumberOfPages: //p' doc_data.txt) pages to process ..."
else
  # Flag the failure so that the later steps, which check "err", skip the
  # OCR work and report the error.
  echo 1 > err
fi
# ocr [PAGE]
# OCRs one single-page PDF in the current directory.
#   PAGE  - page file name, e.g. pg_0001.pdf; defaults to the global $PAGE
# Globals: LANG (read) - tesseract language spec passed to -l
# Steps:   pdftoppm renders the page to <stem>.png at 200 dpi, then
#          tesseract writes the searchable <stem>.pdf (replacing PAGE).
# Files:   writes 1 to ./err when this page fails. The file is never
#          written on success, so a page that finishes fine cannot wipe
#          out a failure recorded by a job running in parallel.
# Returns: 0 on success, 1 if rendering or recognition failed.
ocr()
{
  local page=${1:-$PAGE}
  local stem=${page%.*}
  local status=0

  # tesseract is only attempted when the PNG could be rendered
  if ! { pdftoppm -thinlinemode shape -singlefile -png -r 200 "$page" "$stem" &>/dev/null \
    && tesseract -l "$LANG" "$stem.png" "$stem" pdf &>/dev/null; }; then
    status=1
  fi

  # hide the cursor while the progress line is (re)written
  tput civis
  if [ "$status" -eq 0 ]; then
    echo -en " ... $page done\r"
  else
    echo -en "\n ... $page something went wrong\n"
    echo 1 > err
  fi
  tput cnorm
  return "$status"
}
# Run ocr() for every burst page as a background job.
for PAGE in pg_*.pdf; do
  # stop handing out work once the error flag file is non-zero
  if [ "$(cat err)" -ne 0 ]; then
    break
  fi
  ocr "$PAGE" &
  # Throttle: when MAXJOBS jobs are running, wait for all of them before
  # starting more. (The count is left unquoted on purpose: wc may pad its
  # output with blanks.)
  active=$(jobs -p | wc -l)
  if [ $active -ge $MAXJOBS ]; then
    wait
  fi
done
# barrier for the jobs that are still running - important!
wait
# Leave the scratch directory so that a relative input path in $1 resolves
# again for the output file.
cd - &>/dev/null
# Exit status of the whole run: 0 = output created, 3 = OCR or merge failed.
STATUS=0
# concatenate pdf's to input_ocr.pdf, but only if no page reported an error
if [ "$(cat "$WORKDIR/err")" -eq 0 ]; then
  if pdftk "$WORKDIR"/pg_*.pdf cat output "${1%.*}_ocr.pdf"; then
    echo -e "\n\n\t... ${1%.*}_ocr.pdf created\n"
  else
    echo -e "\n\n\t... Error encountered. No output created.\n"
    STATUS=3
  fi
else
  echo -e "\n\\t... Errors encountered. No output created.\n"
  STATUS=3
fi
# Remove the scratch directory; ":?" aborts instead of running rm -rf with
# an empty path should WORKDIR ever be unset.
rm -rf -- "${WORKDIR:?}"
exit "$STATUS"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Batch use:
Copy ocr.sh into a directory on your PATH, dropping the .sh file ending.
Then run: ls *.pdf | xargs -I '{}' ocr {}