tariqadel/ocr.sh

## ocr.sh
#!/bin/sh
# This script turns ugly scanned images into plaintext.
#
# Usage: ocr <outputfile> <filestobeconverted>
#
# Every input image is converted to TIFF with ImageMagick (optionally cleaned
# up with unpaper first) and handed to the OCR program. The recognised text is
# appended to <outputfile> as one long line, followed by a "---" separator
# line. Intermediate files are written next to each input as <image>$EXT.*
# and are left in place for the user to delete.
#
# Exit status: 1 on bad usage, 2 if <outputfile> cannot be created.
#
# @version 1.00 (ultra alpha)

# OCR program path; we assume tesseract, though.
OCR=tesseract
# Imagemagick
CONVERT=convert
# Do unpaper. This operation is costly, but can improve OCR.
UNPAPER=unpaper
UNPAPEROPTS=" --overwrite "
# Set this to 1 if you want more accurate OCR.
DOUNPAPER=0
# EXT, for easy cleanup
EXT=".OCR"

# SUPER dangerous, make $EXT very unique! Set to 1 to enable
# Edit: functionality removed, the flag is kept but no longer read.
DOCLEANUP=0

USAGE="Usage: ocr <outputfile> <filestobeconverted>"
ERR2="Unable to create output file."

# Convert image $1 into "$2.tif", the input format used for the OCR step.
# With DOUNPAPER=1 the image takes a detour through PNM so unpaper can clean
# it first. Returns the status of the first failing step, 0 otherwise.
make_tif() {
   if [ "$DOUNPAPER" = 1 ]; then
      "$CONVERT" "$1" "$2.pnm" || return
      # UNPAPEROPTS is left unquoted on purpose so it splits into options.
      # shellcheck disable=SC2086
      "$UNPAPER" $UNPAPEROPTS "$2.pnm" "$2.unpapered.pnm" || return
      "$CONVERT" "$2.unpapered.pnm" "$2.tif"
   else
      "$CONVERT" "$1" "$2.tif"
   fi
}

# We need a minimum of two arguments: the output file and one image.
if [ "$#" -lt 2 ]; then
   printf '%s\n' "$USAGE" >&2
   exit 1
fi

OUTPUTFILE=$1
# Drop the output file from the argument list so "$@" holds only images.
shift

# Start from a fresh, empty output file. USERS BEWARE: an existing file is
# deleted! -f keeps rm quiet when there is nothing to delete yet; the status
# of touch alone decides whether the output file could be created.
rm -f -- "$OUTPUTFILE"
if ! touch -- "$OUTPUTFILE"; then
   printf '%s\n' "$ERR2" >&2
   exit 2
fi

for image in "$@"; do
   # Never feed the (just emptied) output file back in as an input.
   if [ "$image" = "$OUTPUTFILE" ]; then
      continue
   fi

   # Common prefix of every intermediate file belonging to this image.
   work=$image$EXT

   if make_tif "$image" "$work" && "$OCR" "$work.tif" "$work"; then
      # Make all new lines spaces. All lines end at the right of the page,
      # so the OCR program happily inserts a line break there which we
      # don't want. Now our document fits on one line, awesome.
      tr '\n' ' ' < "$work.txt" >> "$OUTPUTFILE"
   else
      # Do not append a stale or missing text file; just report the failure.
      printf 'ocr: could not process %s\n' "$image" >&2
   fi
   # One separator line per input, whether or not its OCR succeeded.
   printf '%s\n' "---" >> "$OUTPUTFILE"
done

# EXT already starts with a dot, so the cleanup glob is *$EXT.* (not *.$EXT.*).
printf '%s\n' "Finished ripping strings from images. Do rm *$EXT.* to cleanup."
	#!/bin/sh
	# This script turns ugly scanned images into plaintext.
	#
	# Usage: ocr <outputfile> <filestobeconverted>
	#
	# Every input image is converted to TIFF with ImageMagick (optionally
	# cleaned up with unpaper first) and handed to the OCR program. The
	# recognised text is appended to <outputfile> as one long line, followed
	# by a "---" separator line. Intermediate files are written next to each
	# input as <image>$EXT.* and are left in place for the user to delete.
	#
	# Exit status: 1 on bad usage, 2 if <outputfile> cannot be created.
	#
	# @version 1.00 (ultra alpha)

	# OCR program path; we assume tesseract, though.
	OCR=tesseract
	# Imagemagick
	CONVERT=convert
	# Do unpaper. This operation is costly, but can improve OCR.
	UNPAPER=unpaper
	UNPAPEROPTS=" --overwrite "
	# Set this to 1 if you want more accurate OCR.
	DOUNPAPER=0
	# EXT, for easy cleanup
	EXT=".OCR"

	# SUPER dangerous, make $EXT very unique! Set to 1 to enable
	# Edit: functionality removed, the flag is kept but no longer read.
	DOCLEANUP=0

	USAGE="Usage: ocr <outputfile> <filestobeconverted>"
	ERR2="Unable to create output file."

	# Convert image $1 into "$2.tif", the input format used for the OCR step.
	# With DOUNPAPER=1 the image takes a detour through PNM so unpaper can
	# clean it first. Returns the status of the first failing step, else 0.
	make_tif() {
	   if [ "$DOUNPAPER" = 1 ]; then
	      "$CONVERT" "$1" "$2.pnm" || return
	      # UNPAPEROPTS is left unquoted on purpose so it splits into options.
	      # shellcheck disable=SC2086
	      "$UNPAPER" $UNPAPEROPTS "$2.pnm" "$2.unpapered.pnm" || return
	      "$CONVERT" "$2.unpapered.pnm" "$2.tif"
	   else
	      "$CONVERT" "$1" "$2.tif"
	   fi
	}

	# We need a minimum of two arguments: the output file and one image.
	if [ "$#" -lt 2 ]; then
	   printf '%s\n' "$USAGE" >&2
	   exit 1
	fi

	OUTPUTFILE=$1
	# Drop the output file from the argument list so "$@" holds only images.
	shift

	# Start from a fresh, empty output file. USERS BEWARE: an existing file
	# is deleted! -f keeps rm quiet when there is nothing to delete yet; the
	# status of touch alone decides whether the output file could be created.
	rm -f -- "$OUTPUTFILE"
	if ! touch -- "$OUTPUTFILE"; then
	   printf '%s\n' "$ERR2" >&2
	   exit 2
	fi

	for image in "$@"; do
	   # Never feed the (just emptied) output file back in as an input.
	   if [ "$image" = "$OUTPUTFILE" ]; then
	      continue
	   fi

	   # Common prefix of every intermediate file belonging to this image.
	   work=$image$EXT

	   if make_tif "$image" "$work" && "$OCR" "$work.tif" "$work"; then
	      # Make all new lines spaces. All lines end at the right of the
	      # page, so the OCR program happily inserts a line break there
	      # which we don't want. Now our document fits on one line, awesome.
	      # (A real pipe/redirect is needed here; an escaped "\|" would be
	      # passed to the command as a literal file name.)
	      tr '\n' ' ' < "$work.txt" >> "$OUTPUTFILE"
	   else
	      # Do not append a stale or missing text file; report the failure.
	      printf 'ocr: could not process %s\n' "$image" >&2
	   fi
	   # One separator line per input, whether or not its OCR succeeded.
	   printf '%s\n' "---" >> "$OUTPUTFILE"
	done

	# EXT already starts with a dot, so the cleanup glob is *$EXT.*.
	printf '%s\n' "Finished ripping strings from images. Do rm *$EXT.* to cleanup."