Skip to content

Instantly share code, notes, and snippets.

@stesie
Created February 16, 2017 12:02
Show Gist options
  • Save stesie/42dff3d14fbfac60524f381babb8f81d to your computer and use it in GitHub Desktop.
Save stesie/42dff3d14fbfac60524f381babb8f81d to your computer and use it in GitHub Desktop.
Shell script to scan pages, OCR them with tesseract, and create a PDF containing the down-sampled image with a text overlay
#!/usr/bin/env bash
# Abort the script as soon as a command exits with a non-zero status.
set -o errexit

## Scan configuration
# Extra options handed verbatim to scanimage (scan area origin and size;
# presumably millimetres, i.e. an A4 sheet -- confirm against scanimage docs).
FORMAT='-l 0 -t 0 -x 210 -y 297'
# Scan mode passed to scanimage via --mode.
MODE='color'
# Scan resolution passed to scanimage via --resolution; also used to
# compute the down-sampling factor of the final page image.
RESOLUTION='300'
# Language passed to tesseract via -l.
# NOTE(review): LANG is also the standard locale variable; if it is exported
# in the calling environment every child process sees LANG=deu -- consider a
# dedicated variable name.
LANG='deu'
#######################################
# OCR one scanned page and build a PDF consisting of a down-sampled page
# image with the recognised text laid over it.
# Globals:   LANG (read, tesseract language), RESOLUTION (read, scan dpi),
#            HOME (read, optional ~/.tesseract/user-words)
# Arguments: $1 - temp file prefix; "$1.tif" must hold the scanned page
# Outputs:   unfiltered tesseract messages on stdout; writes "$1.pdf" and
#            removes "$1", "$1.tif" and all intermediate files
# Returns:   non-zero if tesseract (or any later step under `set -e`) fails
#######################################
postprocess_scan() {
  local base="${1:?postprocess_scan: missing temp file prefix}"
  local -a tess_opts=()
  local tess_log

  if [[ -e "$HOME/.tesseract/user-words" ]]; then
    tess_opts+=(--user-words "$HOME/.tesseract/user-words")
  fi

  # tesseract unfortunately has no --quiet option, so capture its output
  # first: that way its own exit status is checked instead of being hidden
  # behind the filter's status.
  if ! tess_log=$(tesseract "${tess_opts[@]}" -l "$LANG" "$base.tif" "$base" pdf 2>&1); then
    printf '%s\n' "$tess_log" >&2
    return 1
  fi
  # grep exits with 1 when every line was filtered out; that is the normal
  # "nothing to report" case and must not abort the job under `set -e`.
  printf '%s' "$tess_log" | grep -v \
    -e "Tesseract Open Source OCR Engine" \
    -e "^Page" \
    -e "Warning in pixReadMemTiff" || [[ $? -eq 1 ]]

  # Next step: remove the image from the pdf.
  #  -> the image payload is expected in object 11 (drop the whole object)
  #  -> drop the XObject reference to it from the document
  # The remainder is split into numbered part files:
  #  -> 00 : pdf start part
  #  -> 01 : object 10 header (= document content)
  #  -> 02 : zlib deflated content of object 10
  #  -> 03 : everything beyond object 10 (including its "endobj" line)
  #  -> 04 : xref table
  #  -> 05 : trailer
  #  -> 06 : startxref pointer + %%EOF
  sed -e "/\/XObject << \/Im1 11 0 R >>/d" \
      -e "/^11 0 obj/,/^endobj/d" \
      "$base.pdf" \
    | csplit --quiet --prefix="$base" - \
        '/^10 0 obj/' '/^stream/+1' '/^endobj/' '/^xref/' '/^trailer/' '/^startxref/'

  # Remove the image reference from object 10: inflate its content stream
  # and drop every line starting with "q". An empty remainder (grep status 1)
  # is tolerated.
  {
    echo "stream"
    zlib-flate -uncompress < "${base}02" | grep -ve ^q || [[ $? -eq 1 ]]
    echo "endstream"
  } > "${base}02.patched"

  # Regenerate the object 10 header for the now uncompressed stream.
  # NOTE(review): the length is the size of the whole patched part, so it
  # also counts the "stream"/"endstream" keyword lines; presumably tolerated
  # by pdftk when it re-reads the file -- confirm before relying on it.
  cat > "${base}01.patched" <<EOF
10 0 obj
<<
/Length $(stat -c %s "${base}02.patched")
>>
EOF

  # Re-pack the pdf, leaving away the xref table (04) and startxref (06).
  {
    cat "${base}00" "${base}01.patched" "${base}02.patched" "${base}03" "${base}05"
    echo "%%EOF"
  } > "${base}-overlay.pdf"

  # Down-sample the scanned image to 150 dpi and convert it to pdf.
  convert "$base.tif" -resize "$(( 150 * 100 / RESOLUTION ))%" "$base.scaled.tif"
  tiff2pdf -zj "$base.scaled.tif" -o "$base.scaled.pdf"

  # Combine the text layer with the down-sampled image.
  pdftk "$base-overlay.pdf" background "$base.scaled.pdf" output "$base.pdf"

  # Clean up everything except the final "$base.pdf".
  rm -f -- "$base".{tif,scaled.tif,scaled.pdf} \
    "$base"{,00,01,01.patched,02,02.patched,03,04,05,06,-overlay.pdf}
}
# Interactive driver: scan one page per iteration, post-process each page in
# a background job while the next sheet is being scanned, then bundle all
# page PDFs into result.pdf.
PAGE=0
PARTS=()      # per-page PDF files, in scan order
SCAN_JOBS=()  # PIDs of the background post-processing jobs
userinput=""  # never inherit a stray value from the environment
while [[ "$userinput" != "q" ]]; do
  PAGEFILE=$(mktemp)
  PAGE=$((PAGE + 1))
  echo ""
  echo "Scanning page $PAGE ..."
  # $FORMAT is left unquoted on purpose: it carries several scanimage options.
  # shellcheck disable=SC2086
  scanimage --format tiff --mode "$MODE" $FORMAT --resolution "$RESOLUTION" -p > "$PAGEFILE".tif
  postprocess_scan "$PAGEFILE" &
  SCAN_JOBS+=("$!")
  PARTS+=("$PAGEFILE.pdf")
  echo ""
  echo ""
  echo "Scan complete. Insert next sheet and hit RET to scan another page."
  echo "Type 'q RET' to exit scanning"
  # End of input (Ctrl-D / closed stdin) is treated like 'q'; otherwise the
  # failing read would abort the script under `set -e` before the pages
  # scanned so far are bundled.
  read -r userinput || userinput="q"
done
echo ""
echo "Waiting for child jobs to complete ..."
# Wait for every job; a failure inside the `||` list does not trigger
# errexit, so all jobs are reaped before anything is reported.
failed_jobs=0
for job in "${SCAN_JOBS[@]}"; do
  wait "$job" || failed_jobs=$((failed_jobs + 1))
done
if (( failed_jobs > 0 )); then
  echo "Warning: $failed_jobs post-processing job(s) failed" >&2
fi
echo ""
echo "Child jobs finished, bundling PDFs now ..."
pdftk "${PARTS[@]}" cat output result.pdf
rm -- "${PARTS[@]}"
echo ""
echo "Final PDF document stored to : result.pdf"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment