public
Created

Script for converting images to text (OCR) for the Noisebridge Archivists group. Requires Tesseract (Ubuntu packages: tesseract-ocr and tesseract-ocr-eng).

  • Download Gist
img2txt.sh
Shell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
#!/bin/bash
#Converts images to text using tesseract (package tesseract-ocr & tesseract-ocr-eng)
 
# Print the one-line command synopsis to stdout.
function usage
{
  printf '%s\n' "img2txt -i <input directory> -o <output directory> --concat"
}
 
# Append every regular file in a directory to <dir>/concatenated.txt,
# in glob expansion order.
# Arguments: $1 - directory holding the per-image text files
# Outputs:   writes (appends) to <dir>/concatenated.txt
# Notes:     existing content of concatenated.txt is kept and appended to;
#            the aggregate file itself is never used as an input.
function concat
{
  local o=$1
  local target="$o/concatenated.txt"
  local f
  for f in "$o"/*
  do
    # Skip sub-directories and the literal pattern left by an empty glob.
    test -f "$f" || continue
    # On a re-run the glob also matches the aggregate; reading the file we
    # are appending to would make cat fail (or loop), so leave it out.
    if [ "$f" = "$target" ]; then
      continue
    fi
    cat -- "$f" >> "$target"
  done
}
 
# Run tesseract over every regular file in an input directory, using
# <output dir>/<file name> as the output base for each one.
# Arguments: $1 - input directory containing the images
#            $2 - output directory (created if it does not exist)
# Outputs:   progress messages on stdout; errors and warnings on stderr
# Returns:   1 if an argument is empty, the input is not a directory, or the
#            output directory cannot be created; 0 otherwise
function convert
{
  local i=$1
  local o=$2
  local f

  # An empty directory would turn "$i"/* into /* and walk the filesystem
  # root, so refuse it before touching anything.
  if [[ -z "$i" || -z "$o" ]]; then
    echo "convert: input and output directories are required" >&2
    return 1
  fi
  if [[ ! -d "$i" ]]; then
    echo "convert: input directory not found: $i" >&2
    return 1
  fi

  # -p so that re-running into an existing output directory is not an error.
  mkdir -p -- "$o" || return 1

  echo "Converting and placing files into $o"
  for f in "$i"/*
  do
    # Only regular files; this also skips the literal pattern of an empty glob.
    test -f "$f" || continue
    echo "processing file $f... $o/${f##*/}.txt"
    # tesseract's own console output is discarded; a failure is still reported
    # so a bad image does not go unnoticed.
    if ! tesseract "$f" "$o/${f##*/}" &> /dev/null; then
      echo "warning: tesseract failed on $f" >&2
    fi
  done
  echo "done."
}
# Command-line state: both directories are mandatory, concatenation is opt-in.
inputdir=
outputdir=
concatenate=

# Walk the arguments; -i/-o consume the following word as their value.
while [ $# -gt 0 ]; do
  case "$1" in
    -i | --input )
      # The option is useless without a value after it.
      if [ $# -lt 2 ]; then
        usage >&2
        exit 1
      fi
      shift
      inputdir=$1
      ;;
    -o | --output )
      if [ $# -lt 2 ]; then
        usage >&2
        exit 1
      fi
      shift
      outputdir=$1
      ;;
    -c | --concat )
      # Stored in the variable initialised above (the original set an
      # unrelated name, leaving the later test with an empty operand).
      concatenate=1
      ;;
    -h | --help )
      usage
      exit
      ;;
    * )
      usage >&2
      exit 1
      ;;
  esac
  shift
done

# Without both directories there is nothing sensible to convert.
if [ -z "$inputdir" ] || [ -z "$outputdir" ]; then
  usage >&2
  exit 1
fi

convert "$inputdir" "$outputdir" || exit 1
if [ "$concatenate" = 1 ]; then
  concat "$outputdir"
fi

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.