Skip to content

Instantly share code, notes, and snippets.

@jikamens
Last active June 26, 2023 15:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jikamens/9cc4973a595c82d47c941892d97d07d0 to your computer and use it in GitHub Desktop.
Save jikamens/9cc4973a595c82d47c941892d97d07d0 to your computer and use it in GitHub Desktop.
Demonstration of how to automatically scan and file bills
#!/bin/bash -e
# Name this script was invoked as; used in the usage text and to derive
# the temporary directory names. "$0" is quoted so a script path that
# contains whitespace is handled correctly.
WHOAMI=$(basename -- "$0")
# File in which the detected scanner device name is cached between runs.
DEV_FILE="$HOME/.scan-bill.dev"
# Set by --retry: reprocess the files already in the working directory
# instead of scanning again.
RETRY=false
# Set by --multiple: allow several bills with the same date and provider.
MULTIPLE=false
# Values passed to the scanner's --source option.
DUPLEX_SOURCE='Automatic Document Feeder(centrally aligned,Duplex)'
SIMPLEX_SOURCE='Automatic Document Feeder(centrally aligned)'
# Duplex scanning is the default; --simplex switches to SIMPLEX_SOURCE.
SOURCE="$DUPLEX_SOURCE"
# Becomes true once the first bill's pages are available for processing.
STARTED=false
USAGE="Usage: $WHOAMI [--retry] [--date M/D/Y] [--multiple] [--simplex]
[--pages-per-bill #]"
# Parse the command line into the global settings used by the rest of the
# script: RETRY, MMDDYY, MULTIPLE, SOURCE and PAGES_PER_BILL.
# Arguments: the script's command-line arguments.
# Exits 0 after printing the usage text for -h/--help. Exits 1 with a
# message on stderr for an unknown option, an option that is missing its
# value, a date that date(1) rejects, or a page count that is not a
# positive integer.
parse_args() {
  # Loop on the argument count rather than on "$1" being non-empty, so an
  # empty argument is reported instead of silently ending option parsing.
  while (($# > 0)); do
    case "$1" in
      -h | --help)
        echo "$USAGE"
        exit
        ;;
      # Try to process the files in /tmp/scan-bill again. Useful if something
      # failed and then you fixed the script to account for the failure and
      # don't want to rescan the document.
      --retry)
        RETRY=true
        shift
        ;;
      # Specify the bill date for bill being processed. If processing
      # multiple bills (see --pages-per-bill below), only specifies the bill
      # date for the first one.
      --date)
        if (($# < 2)); then
          echo "$1 requires an argument" 1>&2
          exit 1
        fi
        # Normalize whatever date(1) understands to MM/DD/YY.
        MMDDYY="$(date +%D --date "$2")" || exit 1
        shift 2
        ;;
      # Allow multiple bills for the same date for the same service provider.
      # Adds "-2", "-3", etc. suffixes to the file names of bills after the
      # first one.
      --multiple)
        MULTIPLE=true
        shift
        ;;
      # Tells the scanner to only scan the fronts of pages.
      --simplex)
        SOURCE="$SIMPLEX_SOURCE"
        shift
        ;;
      # Indicates that multiple bills are being scanned, and that each bill
      # has the specified number of pages. Note that this is *sides of a
      # page*, so e.g. if you're scanning duplex bills that have only one
      # sheet then the correct number to specify here is 2.
      # When you specify this, then you can scan a stack of bills -- all of
      # which are the same in terms of simplex/duplex and length -- and the
      # script processes them all sequentially. If one of them fails the
      # script aborts with its files in place, and then you can fix the
      # problem and rerun the script with --retry and it'll pick up where it
      # left off.
      --pages-per-bill)
        if (($# < 2)); then
          echo "$1 requires an argument" 1>&2
          exit 1
        fi
        # The value is later handed to head(1) as a line count, so reject
        # anything that is not a positive decimal integer up front.
        if [[ ! "$2" =~ ^[0-9]+$ ]] || ((10#$2 < 1)); then
          echo "$1 requires a positive integer, got: $2" 1>&2
          exit 1
        fi
        PAGES_PER_BILL="$2"
        shift 2
        ;;
      *)
        echo "Unrecognized argument: $1" 1>&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
# Staging directory in which scanned page images are collected.
TD_SCANNING="/tmp/${WHOAMI}-images"
# Working directory in which a single bill is OCRed and converted.
TD_WORKING="/tmp/${WHOAMI}"
# Process one bill, and keep processing further bills for as long as each
# one succeeds and --pages-per-bill was given (PAGES_PER_BILL is non-empty).
main() {
  while true; do
    do_one || break
    [ -n "$PAGES_PER_BILL" ] || break
  done
}
# Make the next bill's page images available in $TD_WORKING.
#
# If $TD_SCANNING holds no image* files yet, scan a fresh batch with
# scanadf, using the device cached in $DEV_FILE or, when there is no cache,
# the first device reported by "scanimage -L" (which is then cached).
# Afterwards the first $PAGES_PER_BILL images (or all of them when
# PAGES_PER_BILL is empty) are moved into $TD_WORKING.
#
# Globals:  TD_SCANNING, TD_WORKING, DEV_FILE, SOURCE, STARTED,
#           PAGES_PER_BILL (all read); DEV_FILE's file may be written.
# Side effect: leaves the shell's working directory set to $TD_SCANNING.
# Returns:  1 when there are no images left and a batch was already
#           started, or when scanadf fails (its partial output is removed).
# Exits 1 on any other failure.
do_scanadf() {
  local dev
  local -a pages

  mkdir -p -- "$TD_SCANNING" || exit 1
  cd -- "$TD_SCANNING" || exit 1

  if ! compgen -G 'image*' > /dev/null; then
    # No pages left over from an earlier scan.
    if $STARTED; then
      return 1
    fi
    if [ -f "$DEV_FILE" ]; then
      dev=$(cat -- "$DEV_FILE")
    else
      dev=$(scanimage -L |
        sed -E -n -e "s/^device \`(.*)' is a .*/\\1/p" |
        head -n 1)
      # Quoted throughout: device names may contain spaces.
      if [ -z "$dev" ]; then
        echo "Could not find scanner" 1>&2
        exit 1
      fi
      printf '%s\n' "$dev" > "$DEV_FILE"
    fi
    # NOTE(review): -y 279.4 is presumably the page height in mm (US
    # Letter) -- confirm against the scanner backend's options.
    if ! scanadf --device "$dev" --mode 'Black & White' \
      --resolution 300 --source "$SOURCE" -y 279.4 >| \
      scanadf.log 2>&1; then
      rm -f image*
      return 1
    fi
  fi

  # Glob expansion is sorted, so this is the same order ls would give.
  pages=(image*)
  if [ -n "$PAGES_PER_BILL" ]; then
    mapfile -t pages < <(printf '%s\n' "${pages[@]}" |
      head -n "$PAGES_PER_BILL")
  fi
  if ((${#pages[@]} == 0)) || [ ! -e "${pages[0]}" ]; then
    echo "No scanned pages available in $TD_SCANNING" 1>&2
    exit 1
  fi
  mv -- "${pages[@]}" "$TD_WORKING/." || exit 1
}
# Delete a scanned page image if OCR finds (almost) no text on it.
#
# Arguments: $1 - image file to inspect, relative to the current directory.
# Behaviour: runs tesseract on the image and counts the bytes of recognized
#            text; fewer than 65 bytes is treated as a blank page and the
#            image is removed, otherwise it is kept. A missing image is
#            reported on stderr and otherwise ignored.
# Exits 1 if tesseract fails or its text output cannot be read. Without
# that check an unreadable result would count as zero characters and the
# page would be deleted by mistake.
check_empty() {
  local image_file=$1
  local chars

  if [ ! -f "$image_file" ]; then
    echo "check_empty: $image_file does not exist" 1>&2
    return
  fi
  rm -f maybe-empty.txt
  tesseract --psm 6 "$image_file" maybe-empty >| maybe-empty.log 2>&1 || exit 1
  chars=$(wc -c < maybe-empty.txt) || exit 1
  if ((chars < 65)); then
    echo "$image_file is empty, removing" 1>&2
    rm -f -- "$image_file"
  else
    echo "$image_file has $chars characters in it, preserving" 1>&2
  fi
  rm -f maybe-empty.txt
}
# Print the statement date found in page1.txt (in the current directory)
# as MM/DD/YY on stdout.
#
# Two layouts are tried, both with approximate matching (tre-agrep -2):
#   1. "Statement Date: MM/DD/YY" or "Statement Date: MM/DD/YYYY"
#   2. "Statement Date: Mon D, 20YY", converted with date(1)
# Only the first matching line is used, so the result is always one date.
# Intermediate values are kept local so the caller's MMDDYY is never
# modified; the result is delivered on stdout only.
#
# Exits 1 with a message on stderr when no date is found. Callers that use
# command substitution must check its status, because that exit only
# terminates the substitution's subshell.
eastern_bank_statement_date() {
  local numeric_date english_date
  local -r numeric_pattern='Statement *Date:? *[0-9][0-9]/[0-9][0-9]/([0-9][0-9])?[0-9][0-9]'
  local -r english_pattern='Statement *Date:? *[A-Z][a-z][a-z][- ]*[1-9][0-9]*, *20[0-9][0-9]'

  # Drop an optional century so the result is always MM/DD/YY.
  numeric_date=$(tre-agrep -2 "$numeric_pattern" page1.txt |
    sed -E -n -e 's,.*([0-9][0-9]/[0-9][0-9]/)([0-9][0-9])?([0-9][0-9]).*,\1\3,p' |
    head -n 1)
  if [ -n "$numeric_date" ]; then
    echo "$numeric_date"
    return
  fi

  # Re-space the OCRed "Mon D, YYYY" pieces so date(1) can parse them.
  english_date=$(tre-agrep -2 "$english_pattern" page1.txt |
    sed -n -e 's/.*\([A-Z][a-z][a-z]\)[- ]*\([1-9][0-9]*,\)[- ]*\(20[0-9][0-9]\).*/\1 \2 \3/p' |
    head -n 1)
  if [ -n "$english_date" ]; then
    date --date "$english_date" +%m/%d/%y
    return
  fi

  echo "Failed to detect statement date" 1>&2
  exit 1
}
# Scan (or, with --retry, reuse), identify and file a single bill.
#
# Steps:
#   1. Unless RETRY is set, recreate $TD_WORKING and fill it via
#      do_scanadf. A first failure before any bill has been started drops
#      the cached scanner device ($DEV_FILE) and tries once more.
#   2. OCR the first page image into page1.txt and recognize the provider
#      from its text, which selects the destination sub-directory.
#   3. Unless a date was supplied (MMDDYY non-empty), extract the statement
#      date from the OCR text.
#   4. Convert all page images into bill.pdf and copy it to
#      $HOME/closed/finances/statements/<provider>/<YYYY>/<YYYYMMDD>.pdf,
#      adding "-2", "-3", ... when MULTIPLE is true and the name is taken.
#
# Globals:  RETRY, STARTED and MMDDYY are read and written (MMDDYY is
#           cleared at the end so a --date value applies to the first bill
#           only); TD_WORKING, DEV_FILE, MULTIPLE and HOME are read.
# Side effect: leaves the shell's working directory set to $TD_WORKING.
# Exits 1 on any failure, leaving the working files in place for --retry.
do_one() {
  local SUBDIR DIR FILE_BASENAME
  local month day year suffix target
  local -a pages
  local -r date_re='^([0-9]{1,2})/([0-9]{1,2})/([0-9]{2})$'

  if $RETRY; then
    # Only the first bill reuses existing files; later ones scan normally.
    RETRY=false
  else
    rm -rf -- "${TD_WORKING:?}"
    mkdir -- "$TD_WORKING" || exit 1
    if ! do_scanadf; then
      if $STARTED; then
        exit 1
      fi
      # Drop the cached device name before the second attempt.
      rm -f -- "$DEV_FILE"
      do_scanadf || exit 1
    fi
  fi
  cd -- "$TD_WORKING" || exit 1
  STARTED=true

  # Glob expansion is sorted, so pages[0] is the first page.
  pages=(image*)
  tesseract --psm 6 "${pages[0]}" page1 >| tesseract.log 2>&1 || exit 1

  # The "|| exit 1" after each date lookup matters: an exit inside $(...)
  # only ends the subshell, so the failure has to be propagated here.
  # NOTE(review): the check_empty calls presumably target sides that are
  # often blank for that statement type -- confirm.
  if tre-agrep -q -s -2 home-loan-account-number page1.txt; then
    [ -n "$MMDDYY" ] || MMDDYY=$(eastern_bank_statement_date) || exit 1
    SUBDIR=eastern_bank/home_loan
    check_empty image-0004
  elif tre-agrep -q -s -2 heloc-account-number page1.txt; then
    [ -n "$MMDDYY" ] || MMDDYY=$(eastern_bank_statement_date) || exit 1
    SUBDIR=eastern_bank/heloc
    check_empty image-0004
  elif tre-agrep -q -s -2 heat-loan-account-number page1.txt; then
    [ -n "$MMDDYY" ] || MMDDYY=$(eastern_bank_statement_date) || exit 1
    SUBDIR=eastern_bank/heat_loan
    check_empty image-0002
    check_empty image-0004
  elif grep -E -q -s -i 'JEWISH COMMUNITY DAY SCHOOL|Afterschool Invoice|jcdsboston|JCDS|57 Stanley' page1.txt; then
    if [ -z "$MMDDYY" ]; then
      # Use the latest date that appears anywhere on the first page.
      MMDDYY=$(perl -e 'use Date::Parse; use POSIX "strftime"; $t = 0; while (<>) { while (m,(\d\d?/\d\d?/\d\d\s*\d\s*\d),g) { ($s = $1) =~ s/\s+//g; $t2 = str2time($s); if ($t2 > $t) { $t = $t2; } } } print(strftime("%D", localtime($t))) if ($t);' < page1.txt)
      if [ -z "$MMDDYY" ]; then
        echo "Failed to detect statement date" 1>&2
        exit 1
      fi
    fi
    SUBDIR=jcds
  elif grep -q -s -i 'boston water' page1.txt; then
    echo "Detected Boston Water statement"
    if [ -z "$MMDDYY" ]; then
      MMDDYY=$(sed -E -n -e 's;.*([0-9][0-9]/[0-9][0-9]/[0-9][0-9]) *previous balance.*;\1;pi' page1.txt)
      if [ -z "$MMDDYY" ]; then
        echo "Failed to detect statement date" 1>&2
        exit 1
      fi
    fi
    SUBDIR=boston_water
  fi

  if [ -n "$MMDDYY" ] && [ -n "$SUBDIR" ]; then
    if [[ ! "$MMDDYY" =~ $date_re ]]; then
      echo "Unrecognized statement date: $MMDDYY" 1>&2
      exit 1
    fi
    # Force base 10 so "08" and "09" are not rejected as invalid octal.
    printf -v month '%02d' "$((10#${BASH_REMATCH[1]}))"
    printf -v day '%02d' "$((10#${BASH_REMATCH[2]}))"
    year=20${BASH_REMATCH[3]}
    DIR=$HOME/closed/finances/statements/$SUBDIR/$year
    FILE_BASENAME=$year$month$day
  fi

  # The PDF is built even for an unrecognized bill, as before.
  convert image* bill.pdf >| convert.log 2>&1 || exit 1

  if [ -n "$DIR" ] && [ -n "$FILE_BASENAME" ]; then
    suffix=""
    while true; do
      target="$DIR/$FILE_BASENAME$suffix.pdf"
      if [ -f "$target" ]; then
        if ! $MULTIPLE; then
          echo "$target already exists, aborting." 1>&2
          exit 1
        fi
        # Name is taken: try "-2", then "-3", and so on.
        if [ -z "$suffix" ]; then
          suffix=-2
        else
          suffix=-$((${suffix#*-} + 1))
        fi
        continue
      fi
      mkdir -p -- "$DIR" || exit 1
      cp -i -- bill.pdf "$target" || exit 1
      echo Saved as "$target"
      break
    done
  else
    echo Failed to determine bill type 1>&2
    exit 1
  fi
  # If specified on command, only applies to first bill.
  MMDDYY=""
}
# Entry point: process bills using the globals set up earlier in the script.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment