Created
May 24, 2017 11:42
-
-
Save mylamour/e4f116e64d690c366715f67fefc8357f to your computer and use it in GitHub Desktop.
There are two files here showing how to train with Tesseract: one for common (classic) training and one for LSTM training.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Common (classic, non-LSTM) Tesseract training pipeline for chi_sim.
#
# Run from the directory that contains gen_img.sh. All intermediate files
# (.tif, .box, .tr, unicharset, font_properties, ...) are created in the
# current directory, and combine_tessdata expects them all side by side.
set -euo pipefail

export PATH=$PATH:/home/tesseract/training

# Step 1: generate the training images.
# gen_img.sh is the author's image generator, published as a separate gist.
chmod +x ./gen_img.sh
./gen_img.sh

# Collect the images once; abort early instead of feeding a literal "*.tif"
# to the tools when nothing was generated.
shopt -s nullglob
tifs=(*.tif)
shopt -u nullglob
if (( ${#tifs[@]} == 0 )); then
  printf 'error: no .tif training images found in %s\n' "$PWD" >&2
  exit 1
fi

# Steps 2 + 3: for every image, create its box file, then its .tr file.
# The output base is the image name minus ".tif" so that
#   - box.train finds the box file next to the image it belongs to, and
#   - several images of the same font do not overwrite each other.
# NOTE(review): images are assumed to follow the usual
# <lang>.<fontname>.exp<N>.tif naming; confirm against gen_img.sh.
for tif in "${tifs[@]}"; do
  base=${tif%.tif}
  tesseract --tessdata-dir /home/tessdata/ "$tif" "$base" \
    -l chi_sim batch.nochop makebox
  tesseract -l chi_sim "$tif" "$base" nobatch box.train
done

# Step 4: build the character set from all box files (writes ./unicharset).
# It keeps its default name here because mftraining reads it via -U below.
unicharset_extractor ./*.box

# Step 5: generate font_properties, one line per distinct font:
#   <fontname> <italic> <bold> <fixed> <serif> <fraktur>
# The font name is the second dot-separated field of the image name.
printf '%s\n' "${tifs[@]}" \
  | awk -F '.' '{ print $2, 0, 0, 0, 0, 0 }' \
  | LC_ALL=C sort -u > font_properties

# Step 6: generate the intermediate classifier files.
# shapetable
shapeclustering -F font_properties -U unicharset ./*.tr
# inttemp + pffmtable; -O writes the final chi_sim.unicharset
mftraining -F font_properties -U unicharset -O chi_sim.unicharset ./*.tr
# normproto
cntraining ./*.tr

# Step 7: give the remaining components the "chi_sim." prefix that
# combine_tessdata looks for. chi_sim.unicharset already exists (mftraining -O).
for component in inttemp normproto pffmtable shapetable; do
  mv -- "$component" "chi_sim.$component"
done

# Pack everything into chi_sim.traineddata. The .tr files and the chi_sim.*
# components must be in the same folder.
combine_tessdata chi_sim.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# LSTM training pipeline for chi_sim: extract the LSTM model from an existing
# traineddata file, render training/eval line data from fonts, fine-tune, and
# pack the result into a new traineddata file.
#
# Strict mode: stop at the first failing step so that training and packaging
# never run on missing or stale inputs.
set -euo pipefail

# Step 1 (manual): use `fc-list` to check which fonts you want, then adjust
# language-specific.sh in your training folder accordingly.

# Step 2: extract the LSTM model from an existing traineddata file.
# Skip this step if you train from scratch.
# NOTE(review): the model is written to ./chi_sim.lstm, but step 4 continues
# from ./trainlayer/chi_sim.lstm -- confirm which location is intended.
combine_tessdata -e \
  /usr/local/share/tessdata/chi_sim.traineddata \
  ./chi_sim.lstm

# Step 3: render line data from font files for LSTM training and evaluation.
# NOTE(review): the two runs use different langdata locations and write to
# /home/lstmtest/..., while step 4 reads from ./chitrain -- confirm the
# directory layout before running.

# Training data.
tesstrain.sh --fonts_dir /usr/share/fonts/fontfile/ \
  --lang chi_sim --linedata_only --noextract_font_properties \
  --langdata_dir /home/langdata \
  --tessdata_dir ./tessdata \
  --output_dir /home/lstmtest/tesseract-ocr/chieval

# Evaluation data, rendered with a single font.
tesstrain.sh --fonts_dir /usr/share/fonts --lang chi_sim --linedata_only \
  --langdata_dir ../langdata --tessdata_dir ./tessdata \
  --fontlist "STXinwei" \
  --output_dir /home/lstmtest/chitest

# Step 4: train. Here the top layers of the extracted model are replaced
# (--append_index / --net_spec) and training continues from it.
# If you train from scratch you also need ScrollView.jar running in another
# terminal.
lstmtraining -U ./chitrain/chi_sim.unicharset \
  --script_dir ../langdata \
  --debug_interval 0 \
  --continue_from ./trainlayer/chi_sim.lstm \
  --append_index 5 --net_spec '[Lfx256 O1c105]' \
  --model_output ./trainmodel \
  --train_listfile ./chitrain/chi_sim.training_files.txt \
  --max_iterations 5000

# Step 5: training leaves many model files behind (more as the iteration
# count grows). Combine the one you want with the dawg files into a new
# traineddata file.
# NOTE(review): "your_trained.lstm" is a placeholder -- replace it with the
# model produced by your own training run.
combine_tessdata -o ../chi_sim.traineddata \
  ../trainlayer/your_trained.lstm \
  ./chi_sim.lstm-number-dawg \
  ./chi_sim.lstm-punc-dawg \
  ./chi_sim.lstm-word-dawg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment