Skip to content

Instantly share code, notes, and snippets.

@rhoit
Created September 23, 2014 14:54
Show Gist options
  • Save rhoit/e22d573ba75f0b91a456 to your computer and use it in GitHub Desktop.
Save rhoit/e22d573ba75f0b91a456 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Tesseract training helper: works on <lang>2.<font>.exp0.tif inside ./run.

# Everything below reads and deletes files relative to ./run, so stop right
# away if the directory cannot be entered instead of running the cleanup and
# training steps in whatever directory the script was started from.
cd run || { echo "cannot cd to 'run'" >&2; exit 1; }

#tesseract npl.preeti.exp0.tif

# NOTE: for 3.0.4 version and above
export TESSDATA_PREFIX=/usr/share/tessdata

# NOTE: make sure tesseract is in your $PATH
# open the terminal at the folder where you have the tif image

# Language code and font name; together they form the base name shared by
# the .tif, .box and .tr files.
lang="urd"
font="nas"
file_name="${lang}2.$font.exp0"
# Quoted so the name is printed verbatim (no word splitting or globbing).
echo "$file_name"
# Start from a clean slate: remove the box/tr files and the unicharset left
# behind by a previous run.
rm -f "$file_name".{box,tr} unicharset

# Generate the initial box file from the training image. Because of the
# rm above, the else branch is only reached if the box file could not be
# removed.
if [[ ! -e "$file_name.box" ]]; then
  echo "# CREATE BOXFILE"
  tesseract "$file_name.tif" "$file_name" batch.nochop makebox
else
  # Report the file that was actually tested (.box, not .tif).
  echo "'$file_name.box' already exists"
fi

# NOTE(review): this exit makes everything after it unreachable as written.
# Presumably it is a manual pause so the box file can be corrected by hand
# before training, and is removed/commented out afterwards -- confirm.
exit
echo -e "\n# USE COWBOXER"
# Swap in the hand-corrected box file. Check for it first: deleting the
# generated box and then failing to copy the edited one would leave the
# training step below without any box file at all.
if [[ ! -e "$file_name.box_edited" ]]; then
  echo "'$file_name.box_edited' not found; save the corrected box file under that name first" >&2
  exit 1
fi
# cp -f overwrites the generated box in place, so no separate rm is needed.
cp -f -- "$file_name.box_edited" "$file_name.box"

# Produce the .tr file from the image and its box file, unless a .tr file
# is already present.
if [[ ! -e "$file_name.tr" ]]; then
  echo -e "\n# TRAIN BOX FILE"
  tesseract "$file_name".tif "$file_name" nobatch box.train #.stderr
else
  echo "'$file_name.tr' already exist"
fi
# NOTE: unicharset_extractor pulls the set of unique characters out of the
# box file. It is skipped when a file named 'unicharset' is already present
# in the current directory.
if [[ -e unicharset ]]; then
  echo "'unicharset' already exist"
else
  printf '\nUNICHARSET\n'
  unicharset_extractor "${file_name}.box"
fi
# <font-name> <italic> <bold> <fixed> <serif> <fraktur>
echo -e "\n# MAKING font_properties.txt"
# Take the font name from $font rather than repeating the literal, so this
# entry cannot drift from the font component of $file_name when the font is
# changed. Like the original echo -n, no trailing newline is written.
printf '%s 0 0 0 0 0' "$font" > font_properties.txt
# Show what was written; the bare echo supplies the missing line break.
cat font_properties.txt
echo
file font_properties.txt
# Clustering: run mftraining on the .tr file, passing the font properties
# (-F), the extracted unicharset (-U) and the name of the output
# unicharset (-O).
printf '\n# CLUSTERING\n'
mftraining \
  -F font_properties.txt \
  -U unicharset \
  -O "${lang}.unicharset" \
  "${file_name}.tr"

# cntraining runs on the same .tr file.
printf '\n# CN TRAIN\n'
cntraining "${file_name}.tr"
## NOTE: combine the files
# combine_tessdata is given the prefix "$lang." and collects the files that
# carry it. Per the Tesseract 3.x training guide, mftraining and cntraining
# write inttemp, pffmtable, shapetable and normproto without a prefix, so
# they have to be renamed first (the unicharset already gets its prefix from
# mftraining's -O option).
echo -e "\n# COMBINE FILE"
for part in inttemp normproto pffmtable shapetable; do
  # Skip anything that is absent, e.g. already renamed on an earlier run.
  if [[ -e "$part" ]]; then
    mv -f -- "$part" "$lang.$part"
  fi
done
# It will output the $lang.traineddata
combine_tessdata "$lang."
printf '\n# Recognition\n'
# Run recognition on the training image with output base name
# 'tessaract_output'.
# NOTE(review): this selects the stock 'eng' language, not the freshly
# built "$lang" data -- confirm that is intended.
tesseract "${file_name}.tif" tessaract_output -l eng

# To use newly trained data (eng1 in this example):
#  - move eng1.traineddata to the Tesseract-OCR\tessdata\ directory
#  - change directory to ..\Tesseract-OCR\ and run the recognition command
#      $ tesseract image.tif output -l lang
#    e.g.
#      $ tesseract eurotext.tif output -l eng1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment