mylamour/comman_training.sh

## comman_training.sh
#!/usr/bin/env bash
#
# Classic (non-LSTM) Tesseract training run for the chi_sim language.
# Run it from the directory that should hold the images and every generated
# file: all steps below read and write relative to the current directory.
#
# Requires ./gen_img.sh, plus tesseract and the training tools
# (unicharset_extractor, shapeclustering, mftraining, cntraining,
# combine_tessdata) reachable through PATH.

# Each step consumes the files produced by the previous one, so stop at the
# first failure instead of carrying on with missing or stale data.
set -euo pipefail

export PATH="$PATH:/home/tesseract/training"

# for common training
# step 1 : gen img
chmod +x ./gen_img.sh
./gen_img.sh # gen_img is my script that generates the images; see my other gist

# Glob the images directly instead of parsing `ls`, so names containing
# whitespace stay intact, and bail out early when there is nothing to train on.
shopt -s nullglob
tif_files=(*.tif)
shopt -u nullglob
if (( ${#tif_files[@]} == 0 )); then
  printf 'no .tif images found in %s\n' "$PWD" >&2
  exit 1
fi

# step 2 : gen box file and gen tr file
font_names=()
for tifile in "${tif_files[@]}"; do
  # Output base = second dot-separated field of the image file name.
  tifname=${tifile#*.}
  tifname=${tifname%%.*}
  font_names+=("$tifname")
  tesseract --tessdata-dir /home/tessdata/ "$tifile" "$tifname" -l chi_sim batch.nochop makebox
  tesseract -l chi_sim "$tifile" "$tifname" nobatch box.train
done

# step 3 : gen tr -- already covered by the box.train call in the step 2 loop.

# step 4 : gen unicharset
# The extractor's output keeps its default name `unicharset`: shapeclustering
# and mftraining below both read it through -U, and mftraining writes the
# final chi_sim.unicharset itself through -O. Renaming it here would leave
# `-U unicharset` pointing at a file that no longer exists.
unicharset_extractor ./*.box

# step 5 : gen font_properties (one "<name> 0 0 0 0" line per image)
printf '%s 0 0 0 0\n' "${font_names[@]}" > font_properties

# step 6 : gen middle file
# gen shapetable
shapeclustering -F font_properties -U unicharset *.tr
# gen inttemp
mftraining -F font_properties -U unicharset -O chi_sim.unicharset *.tr

# gen normproto
cntraining *.tr

# step 7 : rename file for combine_tessdata
# chi_sim.unicharset already exists (written by mftraining -O), so only the
# remaining intermediate files need the language prefix.
for datafile in inttemp normproto pffmtable shapetable; do
  mv -- "$datafile" "chi_sim.$datafile"
done

combine_tessdata chi_sim. # please make sure your tr files and data files are in the same folder

## lstm_training.sh
#!/usr/bin/env bash
#
# LSTM training walkthrough for chi_sim. Every command's arguments are
# collected in an array first and then passed to the tool in one call.

# step 1
# Use fc-list to check which fonts you want, then adjust your language.sh
# (that file lives in your training folder).

# step 2 : extract an LSTM model from an existing traineddata file; if you
# have none, you can train from scratch instead.
extract_args=(
  -e
  /usr/local/share/tessdata/chi_sim.traineddata
  ./chi_sim.lstm
)
combine_tessdata "${extract_args[@]}"

# step 3 : use font files to generate the training and evaluation data for
# LSTM training.
# gen train_file
train_data_args=(
  --fonts_dir /usr/share/fonts/fontfile/
  --lang chi_sim
  --linedata_only
  --noextract_font_properties
  --langdata_dir /home/langdata
  --tessdata_dir ./tessdata
  --output_dir /home/lstmtest/tesseract-ocr/chieval
)
tesstrain.sh "${train_data_args[@]}"

# gen eval_file
eval_data_args=(
  --fonts_dir /usr/share/fonts
  --lang chi_sim
  --linedata_only
  --langdata_dir ../langdata
  --tessdata_dir ./tessdata
  --fontlist "STXinwei"
  --output_dir /home/lstmtest/chitest
)
tesstrain.sh "${eval_data_args[@]}"

# step 4 : training. When training from scratch you also need ScrollView.jar
# running in another terminal.
training_args=(
  -U ./chitrain/chi_sim.unicharset
  --script_dir ../langdata
  --debug_interval 0
  --continue_from ./trainlayer/chi_sim.lstm
  --append_index 5
  --net_spec '[Lfx256 O1c105]'
  --model_output ./trainmodel
  --train_listfile ./chitrain/chi_sim.training_files.txt
  --max_iterations 5000
)
lstmtraining "${training_args[@]}"

# step 5 : training leaves you with many lstm files, more so as the iteration
# count grows; combine one of them into a new traineddata file.
# NOTE(review): your_trained.lstm looks like a placeholder name -- confirm and
# substitute the file you actually want to package.
combine_args=(
  -o ../chi_sim.traineddata
  ../trainlayer/your_trained.lstm
  ./chi_sim.lstm-number-dawg
  ./chi_sim.lstm-punc-dawg
  ./chi_sim.lstm-word-dawg
)
combine_tessdata "${combine_args[@]}"
	#!/usr/bin/env bash
	#
	# Classic (non-LSTM) Tesseract training run for the chi_sim language.
	# Run it from the directory that should hold the images and every generated
	# file: all steps below read and write relative to the current directory.
	#
	# Requires ./gen_img.sh, plus tesseract and the training tools
	# (unicharset_extractor, shapeclustering, mftraining, cntraining,
	# combine_tessdata) reachable through PATH.

	# Each step consumes the files produced by the previous one, so stop at the
	# first failure instead of carrying on with missing or stale data.
	set -euo pipefail

	export PATH="$PATH:/home/tesseract/training"

	# for common training
	# step 1 : gen img
	chmod +x ./gen_img.sh
	./gen_img.sh # gen_img is my script that generates the images; see my other gist

	# Glob the images directly instead of parsing `ls`, so names containing
	# whitespace stay intact, and bail out early when there is nothing to train on.
	shopt -s nullglob
	tif_files=(*.tif)
	shopt -u nullglob
	if (( ${#tif_files[@]} == 0 )); then
		printf 'no .tif images found in %s\n' "$PWD" >&2
		exit 1
	fi

	# step 2 : gen box file and gen tr file
	font_names=()
	for tifile in "${tif_files[@]}"; do
		# Output base = second dot-separated field of the image file name.
		# (The previous `echo ... \| awk ...` had an escaped pipe, so awk never
		# ran and the whole command text ended up in the variable.)
		tifname=${tifile#*.}
		tifname=${tifname%%.*}
		font_names+=("$tifname")
		tesseract --tessdata-dir /home/tessdata/ "$tifile" "$tifname" -l chi_sim batch.nochop makebox
		tesseract -l chi_sim "$tifile" "$tifname" nobatch box.train
	done

	# step 3 : gen tr -- already covered by the box.train call in the step 2 loop.

	# step 4 : gen unicharset
	# The extractor's output keeps its default name `unicharset`: shapeclustering
	# and mftraining below both read it through -U, and mftraining writes the
	# final chi_sim.unicharset itself through -O. Renaming it here would leave
	# `-U unicharset` pointing at a file that no longer exists.
	unicharset_extractor ./*.box

	# step 5 : gen font_properties (one "<name> 0 0 0 0" line per image)
	printf '%s 0 0 0 0\n' "${font_names[@]}" > font_properties

	# step 6 : gen middle file
	# gen shapetable
	shapeclustering -F font_properties -U unicharset *.tr
	# gen inttemp
	mftraining -F font_properties -U unicharset -O chi_sim.unicharset *.tr

	# gen normproto
	cntraining *.tr

	# step 7 : rename file for combine_tessdata
	# chi_sim.unicharset already exists (written by mftraining -O), so only the
	# remaining intermediate files need the language prefix.
	for datafile in inttemp normproto pffmtable shapetable; do
		mv -- "$datafile" "chi_sim.$datafile"
	done

	combine_tessdata chi_sim. # please make sure your tr files and data files are in the same folder
	#!/usr/bin/env bash
	#
	# LSTM training walkthrough for chi_sim. Every command's arguments are
	# collected in an array first and then passed to the tool in one call.

	# step 1
	# Use fc-list to check which fonts you want, then adjust your language.sh
	# (that file lives in your training folder).

	# step 2 : extract an LSTM model from an existing traineddata file; if you
	# have none, you can train from scratch instead.
	extract_args=(
		-e
		/usr/local/share/tessdata/chi_sim.traineddata
		./chi_sim.lstm
	)
	combine_tessdata "${extract_args[@]}"

	# step 3 : use font files to generate the training and evaluation data for
	# LSTM training.
	# gen train_file
	train_data_args=(
		--fonts_dir /usr/share/fonts/fontfile/
		--lang chi_sim
		--linedata_only
		--noextract_font_properties
		--langdata_dir /home/langdata
		--tessdata_dir ./tessdata
		--output_dir /home/lstmtest/tesseract-ocr/chieval
	)
	tesstrain.sh "${train_data_args[@]}"

	# gen eval_file
	eval_data_args=(
		--fonts_dir /usr/share/fonts
		--lang chi_sim
		--linedata_only
		--langdata_dir ../langdata
		--tessdata_dir ./tessdata
		--fontlist "STXinwei"
		--output_dir /home/lstmtest/chitest
	)
	tesstrain.sh "${eval_data_args[@]}"

	# step 4 : training. When training from scratch you also need ScrollView.jar
	# running in another terminal.
	training_args=(
		-U ./chitrain/chi_sim.unicharset
		--script_dir ../langdata
		--debug_interval 0
		--continue_from ./trainlayer/chi_sim.lstm
		--append_index 5
		--net_spec '[Lfx256 O1c105]'
		--model_output ./trainmodel
		--train_listfile ./chitrain/chi_sim.training_files.txt
		--max_iterations 5000
	)
	lstmtraining "${training_args[@]}"

	# step 5 : training leaves you with many lstm files, more so as the iteration
	# count grows; combine one of them into a new traineddata file.
	# NOTE(review): your_trained.lstm looks like a placeholder name -- confirm and
	# substitute the file you actually want to package.
	combine_args=(
		-o ../chi_sim.traineddata
		../trainlayer/your_trained.lstm
		./chi_sim.lstm-number-dawg
		./chi_sim.lstm-punc-dawg
		./chi_sim.lstm-word-dawg
	)
	combine_tessdata "${combine_args[@]}"