Last active
August 29, 2015 13:56
-
-
Save jnerin/8953933 to your computer and use it in GitHub Desktop.
Simple script that scans consecutive pages and OCRs each one, waiting for Enter (Intro) between pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Script to automatically scan consecutive pages and OCR them
#
# Dependencies:
#   scanimage from SANE
#   convert from imagemagick
#   tesseract for OCR
#
# But they are easy to replace
#

# Place to store the scanned images
DIRECTORY="$HOME/Pictures/autoscan"
# device argument to connect to the scanner
SCANNER="smfp:net;192.168.x.x"
# Scan resolution in dots per inch (handed to scanimage --resolution)
DPI=300
# OCR language (handed to tesseract -l)
TESSERACT_OCR_LANG="spa"
#######################################
# Finish the session: block until every background job of this shell
# (the OCR runs started with '&') has completed, then exit the script.
# Installed as the SIGINT handler, so Ctrl-C never abandons a
# half-finished OCR job.
# Arguments: none
# Outputs:   progress message on stdout
# Returns:   does not return; exits with the status of the final echo
#######################################
fin() {
  # printf instead of 'echo -n': prints the prompt without a newline portably.
  printf '%s' "Waiting for all the children... "
  wait  # no arguments: waits for all background children of this shell
  echo "done"
  exit
}
# On Ctrl-C, let the pending OCR jobs finish before exiting.
trap 'fin' INT

# Make the scan|convert pipeline report a scanimage failure too, not only
# the status of its last stage.
set -o pipefail

# Work inside the output directory. Create it on first use and abort if it
# is unusable, rather than scattering scans into the current directory.
mkdir -p -- "$DIRECTORY" || exit 1
cd -- "$DIRECTORY" || exit 1

while true; do
  # Timestamped name so consecutive pages do not overwrite each other.
  FILENAME="scan-$(date +%Y%m%d-%H%M%S).jpg"

  # Scan one A4 page as PNM and let ImageMagick re-encode it as JPEG.
  if scanimage --device-name="$SCANNER" -x 216mm -y 297mm \
      --resolution "$DPI" --page-format A4 --format=pnm --progress \
    | convert pnm:- "$FILENAME"; then
    echo "Scan done, change sheet. Doing OCR now... "
    # OCR runs in the background at low priority so the next page can be
    # scanned meanwhile; the output base is the image name, so the text
    # ends up next to the image.
    # NOTE(review): newer tesseract releases spell this option '--psm';
    # confirm against the installed version.
    nice tesseract "$FILENAME" "$FILENAME" -l "$TESSERACT_OCR_LANG" -psm 1 &
  else
    # Do not OCR a missing or truncated image.
    echo "Scan failed, skipping OCR for this page." >&2
  fi

  echo "Press Intro to scan another page or Ctrl-C to finish."
  # If stdin is closed (EOF), read fails immediately; finish cleanly instead
  # of rescanning in a tight loop.
  read -r || fin
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment