stesie/scan2pdf+ocr.sh

## scan2pdf+ocr.sh
#!/usr/bin/env bash
# Stop at the first command that exits with a non-zero status.
set -o errexit

## SCAN settings
# NOTE(review): LANG is also the standard locale variable, so every child
# process sees LANG=deu as its locale as well -- confirm this is intended.
LANG="deu"                        # OCR language, handed to tesseract via -l
RESOLUTION="300"                  # scanimage --resolution; also drives the down-scaling
MODE="color"                      # scanimage --mode
FORMAT="-l 0 -t 0 -x 210 -y 297"  # scan-area options, expanded unquoted for scanimage

#######################################
# Turn one scanned page into a small, searchable single-page PDF.
#
# Runs tesseract on <base>.tif, strips the image out of the PDF tesseract
# wrote (keeping only the text layer), down-samples the scan and stamps the
# text layer on top of the down-sampled image with pdftk.
#
# Globals:
#   LANG        (read) language handed to tesseract via -l
#   RESOLUTION  (read) used to compute the resize percentage (150*100/RESOLUTION)
#   HOME        (read) $HOME/.tesseract/user-words is passed on if it exists
# Arguments:
#   $1 - base path of the page; <base>.tif must exist
# Outputs:
#   writes <base>.pdf; removes <base>, <base>.tif and all intermediate files
# Returns:
#   2 if no base path was given, tesseract's status if OCR fails, otherwise
#   the status of the last command. Failures of the intermediate steps only
#   abort the function when the caller runs with `set -e`.
#######################################
postprocess_scan() {
  if [[ $# -lt 1 || -z "$1" ]]; then
    echo "usage: postprocess_scan <base path of scanned page>" >&2
    return 2
  fi

  local tmpfile="$1"
  local -a ocr_opts=()
  local ocr_status stream_len scale

  if [[ -e "$HOME/.tesseract/user-words" ]]; then
    # An array keeps the path intact even if $HOME contains whitespace.
    ocr_opts+=(--user-words "$HOME/.tesseract/user-words")
  fi

  # tesseract unfortunately has no --quiet option, so its banner lines are
  # filtered out instead.  grep exits 1 when *every* line was filtered away;
  # that is the normal quiet case and must not abort the job under `set -e`.
  # Real grep errors (status 2) are still propagated.
  tesseract "${ocr_opts[@]}" -l "$LANG" "${tmpfile}.tif" "$tmpfile" pdf 2>&1 \
    | { grep -v \
          -e "Tesseract Open Source OCR Engine" \
          -e "^Page" \
          -e "Warning in pixReadMemTiff" \
        || [[ $? -eq 1 ]]; }
  ocr_status=${PIPESTATUS[0]}
  if (( ocr_status != 0 )); then
    echo "tesseract failed for ${tmpfile}.tif (exit status ${ocr_status})" >&2
    return "$ocr_status"
  fi

  # next step: remove image from pdf
  #
  # -> image payload is expected in object 11 (remove it)
  # -> remove XObject reference from document
  sed -e "/\/XObject << \/Im1 11 0 R >>/d" \
      -e "/^11 0 obj/,/^endobj/d" \
      "${tmpfile}.pdf" \
    | csplit --quiet --prefix="$tmpfile" - \
        '/^10 0 obj/' '/^stream/+1' '/^endobj/' '/^xref/' '/^trailer/' '/^startxref/'

  # csplit generated following parts now:
  # -> 00 : pdf start part
  # -> 01 : object 10 header (= document content)
  # -> 02 : zlib deflated content of object 10
  # -> 03 : everything beyond object 10 (including the "endobj" string of object 10)
  # -> 04 : xref table
  # -> 05 : trailer
  # -> 06 : startxref pointer + %%EOF

  # Remove the image reference (lines starting with "q") from object 10.
  # The inflated content is kept in its own file so that /Length can be
  # computed from the stream data alone, without the stream/endstream
  # keywords.  zlib-flate's own exit status is not checked (as before).
  zlib-flate -uncompress < "${tmpfile}02" \
    | { grep -v -e '^q' || [[ $? -eq 1 ]]; } > "${tmpfile}02.content"
  stream_len=$(( $(wc -c < "${tmpfile}02.content") ))

  {
    echo "stream"
    cat "${tmpfile}02.content"
    echo "endstream"
  } > "${tmpfile}02.patched"

  # regenerate object 10 header (content is stored uncompressed now)
  printf '10 0 obj\n<<\n  /Length %d\n>>\n' "$stream_len" > "${tmpfile}01.patched"

  # re-pack pdf, leaving away the xref table
  {
    cat "${tmpfile}"{00,01.patched,02.patched,03,05}
    echo "%%EOF"
  } > "${tmpfile}-overlay.pdf"

  # down-sample scanned image + convert to pdf
  scale=$(( 150 * 100 / RESOLUTION ))
  convert "${tmpfile}.tif" -resize "${scale}%" "${tmpfile}.scaled.tif"
  tiff2pdf -zj "${tmpfile}.scaled.tif" -o "${tmpfile}.scaled.pdf"

  # combine text layer with down-sampled image
  pdftk "${tmpfile}-overlay.pdf" background "${tmpfile}.scaled.pdf" output "${tmpfile}.pdf"

  # -f: a missing intermediate file must not turn a finished page into a failure
  rm -f -- "${tmpfile}".{tif,scaled.tif,scaled.pdf} \
    "${tmpfile}"{,00,01,01.patched,02,02.patched,02.content,03,04,05,06,-overlay.pdf}
}

# Interactive scan loop: scan one page per iteration, post-process each page
# in the background while the next one is being scanned, then bundle all
# per-page PDFs into result.pdf.
PAGE=0
PARTS=()      # per-page PDFs in scan order
JOB_PIDS=()   # PIDs of the background post-processing jobs
userinput=""  # do not inherit a stale value (e.g. "q") from the environment

while [[ "$userinput" != "q" ]]; do
  PAGEFILE=$(mktemp)
  PAGE=$((PAGE + 1))

  echo ""
  echo "Scanning page $PAGE ..."
  # FORMAT holds several scanimage options and is split on purpose.
  # shellcheck disable=SC2086
  scanimage --format tiff --mode "$MODE" $FORMAT --resolution "$RESOLUTION" -p > "$PAGEFILE".tif

  postprocess_scan "$PAGEFILE" &
  JOB_PIDS+=("$!")
  PARTS+=("$PAGEFILE.pdf")

  echo ""
  echo ""
  echo "Scan complete.  Insert next sheet and hit RET to scan another page."
  echo "Type 'q RET' to exit scanning"

  # End-of-file on stdin (e.g. Ctrl-D) is treated like 'q'; otherwise the
  # failing read would abort the script under `set -e` before bundling.
  read -r userinput || userinput=q
done

echo ""
echo "Waiting for child jobs to complete ..."
for job in "${JOB_PIDS[@]}"; do
  # `wait` inside a condition does not trip `set -e`; a failed page is
  # reported but bundling is still attempted, as before.
  if ! wait "$job"; then
    echo "Warning: post-processing job $job failed" >&2
  fi
done

echo ""
echo "Child jobs finished, bundling PDFs now ..."
pdftk "${PARTS[@]}" cat output result.pdf
rm -- "${PARTS[@]}"

echo ""
echo "Final PDF document stored to : result.pdf"
	#!/usr/bin/env bash
	# Stop at the first command that exits with a non-zero status.
	set -o errexit

	## SCAN settings
	# NOTE(review): LANG is also the standard locale variable, so every child
	# process sees LANG=deu as its locale as well -- confirm this is intended.
	LANG="deu"                        # OCR language, handed to tesseract via -l
	RESOLUTION="300"                  # scanimage --resolution; also drives the down-scaling
	MODE="color"                      # scanimage --mode
	FORMAT="-l 0 -t 0 -x 210 -y 297"  # scan-area options, expanded unquoted for scanimage

	#######################################
	# Turn one scanned page into a small, searchable single-page PDF.
	#
	# Runs tesseract on <base>.tif, strips the image out of the PDF tesseract
	# wrote (keeping only the text layer), down-samples the scan and stamps the
	# text layer on top of the down-sampled image with pdftk.
	#
	# Globals:
	#   LANG        (read) language handed to tesseract via -l
	#   RESOLUTION  (read) used to compute the resize percentage (150*100/RESOLUTION)
	#   HOME        (read) $HOME/.tesseract/user-words is passed on if it exists
	# Arguments:
	#   $1 - base path of the page; <base>.tif must exist
	# Outputs:
	#   writes <base>.pdf; removes <base>, <base>.tif and all intermediate files
	# Returns:
	#   2 if no base path was given, tesseract's status if OCR fails, otherwise
	#   the status of the last command. Failures of the intermediate steps only
	#   abort the function when the caller runs with `set -e`.
	#######################################
	postprocess_scan() {
	  if [[ $# -lt 1 || -z "$1" ]]; then
	    echo "usage: postprocess_scan <base path of scanned page>" >&2
	    return 2
	  fi

	  local tmpfile="$1"
	  local -a ocr_opts=()
	  local ocr_status stream_len scale

	  if [[ -e "$HOME/.tesseract/user-words" ]]; then
	    # An array keeps the path intact even if $HOME contains whitespace.
	    ocr_opts+=(--user-words "$HOME/.tesseract/user-words")
	  fi

	  # tesseract unfortunately has no --quiet option, so its banner lines are
	  # filtered out instead.  grep exits 1 when *every* line was filtered away;
	  # that is the normal quiet case and must not abort the job under `set -e`.
	  # Real grep errors (status 2) are still propagated.
	  tesseract "${ocr_opts[@]}" -l "$LANG" "${tmpfile}.tif" "$tmpfile" pdf 2>&1 \
	    | { grep -v \
	          -e "Tesseract Open Source OCR Engine" \
	          -e "^Page" \
	          -e "Warning in pixReadMemTiff" \
	        || [[ $? -eq 1 ]]; }
	  ocr_status=${PIPESTATUS[0]}
	  if (( ocr_status != 0 )); then
	    echo "tesseract failed for ${tmpfile}.tif (exit status ${ocr_status})" >&2
	    return "$ocr_status"
	  fi

	  # next step: remove image from pdf
	  #
	  # -> image payload is expected in object 11 (remove it)
	  # -> remove XObject reference from document
	  sed -e "/\/XObject << \/Im1 11 0 R >>/d" \
	      -e "/^11 0 obj/,/^endobj/d" \
	      "${tmpfile}.pdf" \
	    | csplit --quiet --prefix="$tmpfile" - \
	        '/^10 0 obj/' '/^stream/+1' '/^endobj/' '/^xref/' '/^trailer/' '/^startxref/'

	  # csplit generated following parts now:
	  # -> 00 : pdf start part
	  # -> 01 : object 10 header (= document content)
	  # -> 02 : zlib deflated content of object 10
	  # -> 03 : everything beyond object 10 (including the "endobj" string of object 10)
	  # -> 04 : xref table
	  # -> 05 : trailer
	  # -> 06 : startxref pointer + %%EOF

	  # Remove the image reference (lines starting with "q") from object 10.
	  # The inflated content is kept in its own file so that /Length can be
	  # computed from the stream data alone, without the stream/endstream
	  # keywords.  zlib-flate's own exit status is not checked (as before).
	  zlib-flate -uncompress < "${tmpfile}02" \
	    | { grep -v -e '^q' || [[ $? -eq 1 ]]; } > "${tmpfile}02.content"
	  stream_len=$(( $(wc -c < "${tmpfile}02.content") ))

	  {
	    echo "stream"
	    cat "${tmpfile}02.content"
	    echo "endstream"
	  } > "${tmpfile}02.patched"

	  # regenerate object 10 header (content is stored uncompressed now);
	  # printf avoids a here-document whose terminator could be broken by
	  # the indentation of this block
	  printf '10 0 obj\n<<\n  /Length %d\n>>\n' "$stream_len" > "${tmpfile}01.patched"

	  # re-pack pdf, leaving away the xref table
	  {
	    cat "${tmpfile}"{00,01.patched,02.patched,03,05}
	    echo "%%EOF"
	  } > "${tmpfile}-overlay.pdf"

	  # down-sample scanned image + convert to pdf
	  scale=$(( 150 * 100 / RESOLUTION ))
	  convert "${tmpfile}.tif" -resize "${scale}%" "${tmpfile}.scaled.tif"
	  tiff2pdf -zj "${tmpfile}.scaled.tif" -o "${tmpfile}.scaled.pdf"

	  # combine text layer with down-sampled image
	  pdftk "${tmpfile}-overlay.pdf" background "${tmpfile}.scaled.pdf" output "${tmpfile}.pdf"

	  # -f: a missing intermediate file must not turn a finished page into a failure
	  rm -f -- "${tmpfile}".{tif,scaled.tif,scaled.pdf} \
	    "${tmpfile}"{,00,01,01.patched,02,02.patched,02.content,03,04,05,06,-overlay.pdf}
	}

	# Interactive scan loop: scan one page per iteration, post-process each page
	# in the background while the next one is being scanned, then bundle all
	# per-page PDFs into result.pdf.
	PAGE=0
	PARTS=()      # per-page PDFs in scan order
	JOB_PIDS=()   # PIDs of the background post-processing jobs
	userinput=""  # do not inherit a stale value (e.g. "q") from earlier code or the environment

	while [[ "$userinput" != "q" ]]; do
	  PAGEFILE=$(mktemp)
	  PAGE=$((PAGE + 1))

	  echo ""
	  echo "Scanning page $PAGE ..."
	  # FORMAT holds several scanimage options and is split on purpose.
	  # shellcheck disable=SC2086
	  scanimage --format tiff --mode "$MODE" $FORMAT --resolution "$RESOLUTION" -p > "$PAGEFILE".tif

	  postprocess_scan "$PAGEFILE" &
	  JOB_PIDS+=("$!")
	  PARTS+=("$PAGEFILE.pdf")

	  echo ""
	  echo ""
	  echo "Scan complete. Insert next sheet and hit RET to scan another page."
	  echo "Type 'q RET' to exit scanning"

	  # End-of-file on stdin (e.g. Ctrl-D) is treated like 'q'; otherwise the
	  # failing read would abort the script under `set -e` before bundling.
	  read -r userinput || userinput=q
	done

	echo ""
	echo "Waiting for child jobs to complete ..."
	for job in "${JOB_PIDS[@]}"; do
	  # `wait` inside a condition does not trip `set -e`; a failed page is
	  # reported but bundling is still attempted, as before.
	  if ! wait "$job"; then
	    echo "Warning: post-processing job $job failed" >&2
	  fi
	done

	echo ""
	echo "Child jobs finished, bundling PDFs now ..."
	pdftk "${PARTS[@]}" cat output result.pdf
	rm -- "${PARTS[@]}"

	echo ""
	echo "Final PDF document stored to : result.pdf"