Skip to content

Instantly share code, notes, and snippets.

@RidaAyed
Created March 4, 2018 12:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RidaAyed/a19fb18634e4d9b4998c55d8e8dfc85b to your computer and use it in GitHub Desktop.
Save RidaAyed/a19fb18634e4d9b4998c55d8e8dfc85b to your computer and use it in GitHub Desktop.
#!/bin/bash
##########################################
## SCRIPT VERSION 1.0.5 ##
## AUTHOR: MARKUS (www.och-group.de) ##
## Requires apt-get install: ##
## libtiff-tools ##
## tesseract\* ##
## libtiff-dev ##
## pdftk ##
## imagemagick ##
##########################################
############################################################
# Run setup: names, working directory and (optionally) the scan itself.
#   no argument : create a scratch directory below /tmp and scan into it
#   $1          : absolute path of an existing folder with prepared *.tif
# Afterwards the current directory is $tmpdir; the rest of the script
# globs and deletes relative to it, so every cd is checked.
############################################################
# Timestamp used for the banners, the output file name and the PDF title.
DATETIME=$(date +%Y-%m-%d"_"%H-%M-%S)
#startdir=$(pwd)
# Directory that receives the final OCRed PDF.
startdir=/home/ra/pi
# 16 random alphanumeric characters that make the scratch directory unique.
RANDOMNUMBER=$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 16)
outname=$DATETIME.pdf
tmpdir=/tmp/scan-$RANDOMNUMBER
echo "####### $DATETIME #########"
# Either Scan or use prepared *.tif files in folder named in first parameter
if [[ -z "$1" ]]; then
  echo "####### TMPDIR $tmpdir ##########"
  echo "####### OUTNAME $outname ##########"
  # mkdir (without -p) refuses an already existing path, so a directory
  # planted by someone else under the same name is never reused.
  if ! mkdir -- "$tmpdir"; then
    echo "cannot create tmpdir $tmpdir"
    exit 1
  fi
  if ! cd -- "$tmpdir"; then
    echo "cannot change into tmpdir $tmpdir"
    exit 1
  fi
  echo "################## Scanning ###################"
  # Only stdout of scanimage is captured here.
  # NOTE(review): if scanimage reports "no SANE devices found" on stderr,
  # the check below never fires - confirm against the installed version.
  scanResult=$(scanimage --page-width 221.121 --page-height 876.695 \
    -l 0 -t 0 -x 221.121 -y 876.695 --ald=yes --overscan On --prepick=On \
    -b --format=tiff --mode Color --resolution 300 --source 'ADF Duplex' \
    --swcrop=yes --buffermode On --swdespeck 2 --swdeskew=yes --swskip 5% \
    -d 'fujitsu:ScanSnap iX500:10443')
  # POSIX character class instead of the GNU-only \s escape.
  regexScan="[[:space:]]+scanimage: no SANE devices found[[:space:]]+"
  if [[ " $scanResult " =~ $regexScan ]]; then
    echo "!!!!!!! No scanner found !!!!!!!"
    echo "SCANRESULT: $scanResult"
    exit 1
  fi
  echo "################## Scanned ####################"
else
  # use existing folder (absolute path)
  tmpdir="$1"
  echo "####### TMPDIR $tmpdir ##########"
  echo "####### OUTNAME $outname ##########"
  # Validate first, then enter: the original changed directory before
  # checking that the directory exists.
  if [[ ! -d "$tmpdir" ]]; then
    echo "tmpdir $tmpdir does not exist"
    exit 1
  fi
  if ! cd -- "$tmpdir"; then
    echo "cannot change into tmpdir $tmpdir"
    exit 1
  fi
fi
optimize_color() {
  ############################################################
  # Optimize Color of image
  ############################################################
  # call:                                                    #
  #   optimize_color <filename>                              #
  # result:                                                  #
  #   <filename> is level-adjusted in place; for a page      #
  #   classified as not colorful an additional two-color     #
  #   <basename>_dither.png is written next to it            #
  ############################################################
  local thresholdc=0.91
  local page=${1%.*}
  local probe is_colorful max_chroma

  # optimize Colors --> test for colors
  convert "$1" -level 20%,80%,2.0 "$1"

  ######## Other Color check variants - best is scale option, then breakup option
  # testing average colorfulness of an image in HSL (green channel is colorfulness) http://www.imagemagick.org/discourse-server/viewtopic.php?t=19580
  #testc1=`convert $1 -colorspace HSV -channel g -separate +channel -format "%[fx:mean]" info:`
  #testc2=`convert $1 -colorspace HSL -channel g -separate +channel -format "%[fx:mean]" info:`
  #echo " PAGE: ${1%.*} this pic is grey if close to 0:" $testc1 "and" $testc2
  # Two methods from here http://www.imagemagick.org/discourse-server/viewtopic.php?f=1&t=29781
  #testc3=`convert $1 -crop 50x50 -colorspace HCL -scale 1x1! -channel G -separate +channel -evaluate-sequence Max -format %[fx:mean] info: 2>/dev/null`
  #echo " PAGE: ${1%.*} breakup option says color value is" $testc3

  # One ImageMagick run yields both the 0/1 decision and the raw value
  # ("<flag> <maxima.g>"); the original ran the identical conversion twice.
  probe=$(convert "$1" -colorspace HCL -scale 2% \
    -format "%[fx:maxima.g+$thresholdc>1?1:0] %[fx:maxima.g]" info:)
  is_colorful=${probe%% *}
  max_chroma=${probe#* }
  echo " PAGE: $page scale option says color exists for %: $max_chroma"

  # String comparison: an empty probe (convert failed) falls through to the
  # "not colorful" branch instead of raising a test syntax error.
  if [[ "$is_colorful" == "1" ]]; then
    echo " PAGE: $page is colorful"
    ## OPTIMIZE COLORS http://www.imagemagick.org/Usage/color_mods/
    #convert $1 -level 20%,80%,2.5 ${1%.*}"_color.tif"
    ## Alternative Color optimization (for me it does not look as good as the first)
    #convert $infile -sigmoidal-contrast 10,50% ${inname}_color_sigmoidal.tif
  else
    echo " PAGE: $page is not colorful"
    ######### DITHER IS BEST FOR COLORED IMAGES! - tx is fine for text
    #convert $1 -compress Group4 -adaptive-resize 75% -density 200 -type bilevel TIFF:- | convert - ${1%.*}"_compressed.pdf"
    ##Text Optimization: Convert to lineart
    #convert $1 -negate -separate -lat 20x20+25% -negate -evaluate-sequence add ${1%.*}"_la.png"
    ##Text Optimization: dither to black / white picture
    convert "$1" +dither -colors 2 -colorspace gray -contrast-stretch 0 "${page}_dither.png"
    #Text Optimization: lots of Magic
    #convert -respect-parenthesis \
    #\( $1 -colorspace gray -type grayscale -contrast-stretch 0 \) \
    #\( -clone 0 -colorspace gray -negate -lat 15x15+10% -contrast-stretch 0 \) \
    #-compose copy_opacity -composite -fill white -opaque none +matte -deskew 40% +repage -sharpen 0x1 \
    #$1
  fi
}
optimize_crop() {
  ############################################################
  # Crop Image
  ############################################################
  # call:                                                    #
  #   optimize_crop <filename>                               #
  # result:                                                  #
  #   <filename>, trimmed in place with a 15% fuzz factor    #
  ############################################################
  ##################################### CROPPED 2 IS BETTER!
  # crop Borders if black 1
  #infile=$1
  #inname=${1%.*}
  #convert $infile +repage -scale x1! -bordercolor black -border 1 -fuzz 30% -trim ${inname}_tmp1.png
  #width=`convert ${inname}_tmp1.png -format "%w" info:`
  #offsets=`convert ${inname}_tmp1.png -format "%O" info:`
  #xoff=`echo $offsets | cut -d+ -f2`
  #convert $infile +repage -scale 1x! -bordercolor black -border 1 -fuzz 60% -trim ${inname}_tmp2.png
  #height=`convert ${inname}_tmp2.png -format "%h" info:`
  #offsets=`convert ${inname}_tmp2.png -format "%O" info:`
  #yoff=`echo $offsets | cut -d+ -f3`
  #convert $infile -crop ${width}x${height}+${xoff}+${yoff} +repage ${inname}_cropped_1.jpg
  # Crop Borders variant 2
  # Quoted so a path containing blanks or glob characters stays one argument.
  convert -fuzz 15% -trim "$1" "$1"
}
correct_orientation() {
  ########################################################
  # Orientation correction (rotate if 90,180,270 degree) #
  ########################################################
  # call:                                                #
  #   correct_orientation <filename>                     #
  # result:                                              #
  #   <filename>, turned by 180 degrees in place when    #
  #   tesseract reports 180 or 270                       #
  #   global $orientation = 0, 90, 180 or 270; the       #
  #   caller uses it for the remaining quarter turn      #
  #   <basename>_tesseract.info holds tesseract's output #
  ########################################################
  local page=${1%.*}
  local info_file="${page}_tesseract.info"
  # {1,3}: the reported value may have one to three digits. The original
  # {3} could never match "90" (or "0"), so the 90 branch was unreachable
  # and upright pages were logged as "Cannot find any orientation".
  # [[:space:]] replaces the GNU-only \s escape.
  local regexOrientation="[[:space:]]+Orientation in degrees: ([0-9]{1,3})[[:space:]]+"
  local file_content

  # Get info from tesseract without creating a pdf file
  #tesseract -psm 0 -l eng+deu $1 result_${1%.*} 1>${1%.*}"_tesseract.info" 2>&1
  tesseract -psm 0 -l deu "$1" "result_${page}" > "$info_file" 2>&1
  file_content=$(< "$info_file")

  # Deliberately global: read by the caller after this function returns.
  orientation=0
  if [[ " $file_content " =~ $regexOrientation ]]; then
    case "${BASH_REMATCH[1]}" in
      90)
        # 90 is readable from the right side
        #echo " PAGE: ${1%.*} Detected wrong orientation:" ${BASH_REMATCH[1]}
        ## Rotate picture
        #convert $1 -rotate 90 +repage $1;
        # The image is left untouched; only the value is recorded.
        orientation=90
        ;;
      180)
        # 180 is upside down
        echo " PAGE: $page Detected wrong orientation: ${BASH_REMATCH[1]}"
        # Rotate picture
        convert "$1" -rotate 180 "$1"
        orientation=180
        ;;
      270)
        # 270 is readable from left side
        echo " PAGE: $page Detected wrong orientation: ${BASH_REMATCH[1]}"
        # Rotate picture by 180 only; the caller adds the missing 90
        convert "$1" -rotate 180 +repage "$1"
        orientation=270
        ;;
      *)
        #echo " PAGE: ${1%.*} Detected correct orientation:" ${BASH_REMATCH[1]}
        ;;
    esac
  else
    echo " PAGE: $page Cannot find any orientation"
  fi
  ############################################################
  # END Orientation correction                               #
  ############################################################
}
correct_blank_page() {
  ############################################################
  # Test if it is a blank page
  ############################################################
  # call:                                                    #
  #   correct_blank_page <filename>                          #
  # result:                                                  #
  #   <filename> or deleted file                             #
  #   global BLANKPROSPECT = true|false (grey check result)  #
  # A page is deleted only when both checks agree:           #
  #   1. mean of the two-color grey image is above threshold #
  #   2. tesseract reports too few characters                #
  ############################################################
  # Threshold for deleting blank pages (Percentage)
  local threshold=0.99
  local page=${1%.*}
  local info_file="${page}_tesseract.info"
  # [[:space:]] replaces the GNU-only \s escape.
  local regexCharacters="[[:space:]]+Too few characters. Skipping this page[[:space:]]+"
  local probe is_blank mean file_content

  # Test percentage of lineart against threshold.
  # One ImageMagick run yields "<flag> <mean>"; the original ran the
  # identical conversion twice, once per value.
  probe=$(convert "$1" +dither -colors 2 -colorspace gray -contrast-stretch 0 \
    -format "%[fx:mean>$threshold?1:0] %[fx:mean]" info:)
  is_blank=${probe%% *}
  mean=${probe#* }
  echo " PAGE: $page is blank for %: $mean"

  # String comparison: an empty probe (convert failed) counts as
  # "not blank" instead of raising a test syntax error.
  if [[ "$is_blank" != "1" ]]; then
    BLANKPROSPECT=false
    return
  fi
  echo " PAGE: $page Blank Page (1/2 - GREY-CHECK): SEEMS TO BE A BLANK PAGE------"
  BLANKPROSPECT=true

  # If file does not exist - create it - else use it
  if [[ ! -f "$info_file" ]]; then
    #tesseract -psm 0 -l eng+deu $1 result_${1%.*} 1>${1%.*}"_tesseract.info" 2>&1
    tesseract -psm 0 -l deu "$1" "result_${page}" > "$info_file" 2>&1
  fi
  file_content=$(< "$info_file")
  if [[ " $file_content " =~ $regexCharacters ]]; then
    echo " PAGE: $page Blank Page (2/2 - CHARACTER-CHECK): SEEMS TO BE A BLANK PAGE------"
    if [[ "$BLANKPROSPECT" == "true" ]]; then
      # If both matches for blank page
      echo " PAGE: $page is a blank page - deleting $1------"
      rm -- "$1"
    fi
  fi
  ############################################################
}
# PIDs of the page workers that are still running (filled by addProcess).
declare -a pids
waitProcessing() {
  ############################################################
  # Blocks until every process recorded in pids has ended
  ############################################################
  # usage:
  #   waitProcessing
  ############################################################
  # return:
  #   when all pids are processed, this one will end too;
  #   pids is empty afterwards
  ############################################################
  local i
  while (( ${#pids[@]} != 0 )); do
    #echo "Waiting for pids: ${pids[@]}"
    # "${!pids[@]}" lists the indices directly - no eval needed.
    for i in "${!pids[@]}"; do
      # kill -0 sends no signal; it only tests whether the process exists.
      if ! kill -0 "${pids[$i]}" 2> /dev/null; then
        #echo "Done -- ${pids[$i]}"
        unset 'pids[i]'
      fi
    done
    pids=("${pids[@]}") # Expunge nulls created by unset.
    # Only pause when something is still running.
    if (( ${#pids[@]} != 0 )); then
      sleep 1
    fi
  done
  echo "---All Pages Done!"
}
addProcess() {
  ############################################################
  # remembers the pid of a background worker in the global
  # pids array so that it can be waited for later
  ############################################################
  # usage:
  #   addProcess <filename> $!
  ############################################################
  # return:
  #   nothing; logs the page name (filename without its
  #   extension) together with the pid
  ############################################################
  # Locals, so the caller's own $x / $pid are not overwritten.
  local x=${1%.*}
  local pid=$2
  # Append in place instead of re-splitting the whole array.
  pids+=("$pid")
  echo " PAGE: $x (PID $pid)"
  ############################################################
}
process() {
  ############################################################
  # parallelizable Process for working on each page
  ############################################################
  # usage:
  #   process <filename>
  ############################################################
  # return:
  #   <filename> (processed file or deleted file)
  #   <basename>.pdf for every page that survives
  #   a snapshot <basename>_<STEP>.tif after each step
  # reads the globals $orientation (set by correct_orientation),
  # $DATETIME (PDF title) and $i (only for the final log line)
  ############################################################
  local x=${1%.*}
  ############################################################
  echo " PAGE: $x - BEGIN"
  cp -- "$x.tif" "${x}_BEGIN.tif"

  echo " PAGE: $x - CROP"
  optimize_crop "$x.tif"
  if [[ ! -f "$x.tif" ]]; then
    # The page is gone: stop here. The original used `continue`, which is
    # only meaningful inside a loop and therefore did not leave the function.
    return 0
  fi
  cp -- "$x.tif" "${x}_CROPPED.tif"

  echo " PAGE: $x - COLOR CHECK"
  optimize_color "$x.tif"
  cp -- "$x.tif" "${x}_COLOR_OPTIMIZED.tif"

  echo " PAGE: $x - BLANK CHECK"
  correct_blank_page "$x.tif"
  if [[ ! -f "$x.tif" ]]; then
    # Blank page was deleted - nothing left to rotate or convert.
    return 0
  fi
  cp -- "$x.tif" "${x}_NOT_BLANK.tif"

  echo " PAGE: $x - ROTATION CHECK"
  correct_orientation "$x.tif"
  cp -- "$x.tif" "${x}_CORRECT_ORIENTATION.tif"

  echo " PAGE: $x - CREATE PDF PAGE and reorientate $orientation"
  if [[ -f "$x.tif" ]]; then
    #tiff2pdf -o "final_$x.pdf" -z -u m -p "A4" -F $x".tif"
    ## Create PDF with fit to A4 - even in landscape mode - does not work
    #convert -compress Group4 -density 300 -define pdf:fit-page=A4 $x".tif" $x"_single.pdf"
    #convert $x".tif" -resize 595x823^> -gravity center -background white $x"_singles.pdf"
    # convert file to A4 PDF
    tiff2pdf -p a4 -z -u m -t "Scan-$DATETIME" -f -o "$x.pdf" "$x.tif"
    # 90: rotate pdf +90; 270: image was already rotated 180 - now add 90
    if [[ "$orientation" == "90" || "$orientation" == "270" ]]; then
      # Replace the page only when pdftk succeeded; otherwise keep the
      # unrotated PDF rather than losing the page.
      if pdftk "$x.pdf" cat 1east output "${x}_o.pdf"; then
        rm -- "$x.pdf"
        mv -- "${x}_o.pdf" "$x.pdf"
      fi
    fi
  fi
  #echo "---PAGE: $i -PDFTK-----"
  #echo "---PAGE: $i -FLATTEN---"
  #pdftk tiff2pdf_$x.pdf cat output pdftk_$x.pdf flatten
  #pdftk pdftk_$x.pdf dump_data > pdftk_$x.info
  #echo "---PAGE: $i -NORMALIZE-"
  #convert -normalize -density 300 -depth 8 pdftk_$x.pdf $x.png
  # echo "---PAGE: $i -TESSERACT get info-"
  # #FOR: correct_orientation(): tesseract -psm 0 -l deu+eng $x.png result_$x 1>tesseract_$x.info 2>&1
  # tesseract -psm 1 -l deu+eng $x.png result_$x pdf quiet 1>/dev/null 2>&1
  # echo "---PAGE: $i -METADATA--"
  # pdftk result_$x.pdf dump_data > pdftk_$x.info2
  # pdftk result_$x.pdf update_info pdftk_$x.info output final_$x.pdf
  echo "---PAGE: $i -END------"
}
############################################################
# MAIN
#   1. copy every *.tif of the working directory to NNNN.tif
#      (natural order) and start one background worker per page
#   2. wait for all workers, merge the page PDFs, run OCR
#   3. copy the result, open it in ranger, remove the working dir
############################################################
echo "################### Preprocessing ####################"
# Collect the pages in version ("natural") order, so that out2.tif sorts
# before out10.tif, without word-splitting the output of ls.
scanned_pages=()
while IFS= read -r page; do
  # Skips the literal "*.tif" that the glob leaves behind when nothing matches.
  if [[ -f "$page" ]]; then
    scanned_pages+=("$page")
  fi
done < <(printf '%s\n' *.tif | sort -V)
if (( ${#scanned_pages[@]} == 0 )); then
  echo "!!!!!!! No *.tif pages found in $tmpdir !!!!!!!"
  exit 1
fi
i=1
for page in "${scanned_pages[@]}"; do
  # Create x as number with 4 digits counting up
  x=$(printf "%04d" "$i")
  cp -- "$page" "$x.tif"
  # Execute parallel worker for each scanned page
  process "$x.tif" &
  addProcess "$x.tif" $!
  # Next page
  i=$((i + 1))
done
# Wait until all pages are done
waitProcessing
echo "############ Combine all pdf to one ###########"
# The working directory is deleted at the end, so stop on any failure
# below: the scans in it are the only copy.
if ! pdftk *.pdf cat output "$outname"; then
  echo "!!!!!!! pdftk failed - keeping $tmpdir !!!!!!!"
  exit 1
fi
echo "############ OCR complete pdf #################"
## ocrmypdf with -l eng+deu ..then the umlauts work as well
if ! ocrmypdf "$outname" "$startdir/$outname" -l eng+deu; then
  echo "!!!!!!! ocrmypdf failed - keeping $tmpdir !!!!!!!"
  exit 1
fi
cp -- "$startdir/$outname" /home/ra/temp
#paperwork-shell import $startdir/$outname
#ranger --selectfile=$startdir/$outname
ranger --selectfile="/home/ra/temp/$outname"
echo "################ Cleaning Up ##################"
cd ..
# ${tmpdir:?} aborts instead of running rm -rf with an empty path.
# NOTE(review): this also deletes a folder that was passed in as $1.
rm -rf -- "${tmpdir:?}"
cd -- "$startdir"
# REMINDER for BARCODE FUNCTIONALITY
#convert -density 150 "$i[0]" -quality 100 -sharpen 0x1.0 "$i.jpg" # create a JPG to search for a possible barcode
#barcode=`zbarimg -q --raw "$i.jpg"` # search for the barcode and store it in a variable
#rm "$i.jpg" # delete the image again
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment