# billju/tesseract_ocr_train.py

## tesseract_ocr_train.py
# 下載 https://tesseract-ocr.github.io/tessdoc/Downloads.html
# 權重 https://github.com/tesseract-ocr/tessdata_best
# .box檔案格式：字 6 394 45 410 0
import os
import shutil
from PIL import Image
from glob import glob
# Training script for a Tesseract legacy-engine model.
#
# Steps: enter the working folder, convert PNG inputs to JPEG, build .box/.tr
# files for every image in jpg/, run the training tools, combine the result
# into a traineddata file, then run one recognition pass as a smoke test.
lang = 'chi_tra'
font = 'ocrb'
dist = 'tessdata'

# Switch to the working folder. Strings are compared with `!=`: `is not` tests
# object identity, so the check never recognised that we were already inside
# the folder and kept descending into a nested one.
if os.path.basename(os.getcwd()) != dist:
    os.makedirs(dist, exist_ok=True)
    os.chdir(dist)
# Source images for training are read from the jpg/ subfolder.
os.makedirs('jpg', exist_ok=True)

# Convert PNG inputs. The JPEG is written into jpg/ (the folder the training
# loop reads) and named after the full stem of the PNG; writing it next to the
# PNG would let the cleanup below delete it before it is ever used.
for image in glob('*.png'):
    stem = os.path.splitext(image)[0]
    with Image.open(image) as png:
        png.convert('RGB').save(f'jpg/{stem}.jpg')

# Remove the JPEG copies left in the working folder by a previous run.
for image in glob('*.jpg'):
    os.remove(image)

# Sorted so the exp<N> numbering is the same on every run.
sources = sorted(glob('jpg/*.jpg'))
if not sources:
    # Without images the tools below would be called with empty arguments.
    raise SystemExit(f'No .jpg images found in {os.path.abspath("jpg")}')

# Copy each image (.jpg), generate its boxes (.box) and training data (.tr).
for i, image in enumerate(sources):
    train = f'{lang}.{font}.exp{i}'
    shutil.copy(image, f'{train}.jpg')
    os.system(f'tesseract -l {lang} {train}.jpg {train} batch.nochop makebox')
    os.system(f'tesseract -l {lang} {train}.jpg {train} nobatch box.train')
boxes = ' '.join(glob('*.box'))
trs = ' '.join(glob('*.tr'))

# Write the font properties: <fontname> <italic> <bold> <fixed> <serif> <fraktur>.
# The name has to be the font part of the lang.font.expN file names above.
with open('font_properties', 'w') as properties:
    properties.write(f'{font} 0 0 0 1 0')
os.system(f'unicharset_extractor --output_unicharset unicharset {boxes}')
os.system(f'mftraining -F font_properties -U unicharset -O {lang}.unicharset -D . {trs}')
os.system(f'cntraining -D . {trs}')

# Prefix the generated files with the language code for combine_tessdata.
# os.replace overwrites files left by an earlier run on every platform.
for component in ('inttemp', 'normproto', 'pffmtable', 'shapetable'):
    os.replace(component, f'{lang}.{component}')
os.system(f'combine_tessdata {lang}.')

# Engine: OCR_ENGINE_MODE (--oem)
# 0 = 'Legacy'
# 1 = 'LSTM'

# Mode: PAGE_SEG_MODE (--psm)
# 0  Orientation and script detection (OSD) only.
# 1  Automatic page segmentation with OSD.
# 2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
# 3  Fully automatic page segmentation, but no OSD. (Default)
# 4  Assume a single column of text of variable sizes.
# 5  Assume a single uniform block of vertically aligned text.
# 6  Assume a single uniform block of text.
# 7  Treat the image as a single text line.
# 8  Treat the image as a single word.
# 9  Treat the image as a single word in a circle.
# 10 Treat the image as a single character.
# 11 Sparse text. Find as much text as possible in no particular order.
# 12 Sparse text with OSD.
# 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

# Output format (appended after the language option): '' | 'hocr' | 'tsv'
# NOTE(review): the files built above come from the legacy training tools,
# while --oem 1 selects the LSTM engine - confirm which engine is intended.
os.system(f'tesseract {train}.jpg result -l {lang} --oem 1 --psm 3')
	# Download: https://tesseract-ocr.github.io/tessdoc/Downloads.html
	# Trained weights: https://github.com/tesseract-ocr/tessdata_best
	# .box file line format: <character> 6 394 45 410 0
	import os
	import shutil
	from PIL import Image
	from glob import glob

	# Training script for a Tesseract legacy-engine model.
	#
	# Steps: enter the working folder, convert PNG inputs to JPEG, build
	# .box/.tr files for every image in jpg/, run the training tools, combine
	# the result into a traineddata file, then run one recognition pass.
	lang = 'chi_tra'
	font = 'ocrb'
	dist = 'tessdata'

	# Switch to the working folder. Strings are compared with `!=`: `is not`
	# tests object identity, so the check never recognised that we were
	# already inside the folder and kept descending into a nested one.
	if os.path.basename(os.getcwd()) != dist:
	    os.makedirs(dist, exist_ok=True)
	    os.chdir(dist)
	# Source images for training are read from the jpg/ subfolder.
	os.makedirs('jpg', exist_ok=True)

	# Convert PNG inputs. The JPEG is written into jpg/ (the folder the
	# training loop reads) and named after the full stem of the PNG; writing
	# it next to the PNG would let the cleanup below delete it before use.
	for image in glob('*.png'):
	    stem = os.path.splitext(image)[0]
	    with Image.open(image) as png:
	        png.convert('RGB').save(f'jpg/{stem}.jpg')

	# Remove the JPEG copies left in the working folder by a previous run.
	for image in glob('*.jpg'):
	    os.remove(image)

	# Sorted so the exp<N> numbering is the same on every run.
	sources = sorted(glob('jpg/*.jpg'))
	if not sources:
	    # Without images the tools below would be called with empty arguments.
	    raise SystemExit(f'No .jpg images found in {os.path.abspath("jpg")}')

	# Copy each image (.jpg), generate its boxes (.box) and training data (.tr).
	for i, image in enumerate(sources):
	    train = f'{lang}.{font}.exp{i}'
	    shutil.copy(image, f'{train}.jpg')
	    os.system(f'tesseract -l {lang} {train}.jpg {train} batch.nochop makebox')
	    os.system(f'tesseract -l {lang} {train}.jpg {train} nobatch box.train')
	boxes = ' '.join(glob('*.box'))
	trs = ' '.join(glob('*.tr'))

	# Write the font properties: <fontname> <italic> <bold> <fixed> <serif> <fraktur>.
	# The name has to be the font part of the lang.font.expN file names above.
	with open('font_properties', 'w') as properties:
	    properties.write(f'{font} 0 0 0 1 0')
	os.system(f'unicharset_extractor --output_unicharset unicharset {boxes}')
	os.system(f'mftraining -F font_properties -U unicharset -O {lang}.unicharset -D . {trs}')
	os.system(f'cntraining -D . {trs}')

	# Prefix the generated files with the language code for combine_tessdata.
	# os.replace overwrites files left by an earlier run on every platform.
	for component in ('inttemp', 'normproto', 'pffmtable', 'shapetable'):
	    os.replace(component, f'{lang}.{component}')
	os.system(f'combine_tessdata {lang}.')

	# Engine: OCR_ENGINE_MODE (--oem)
	# 0 = 'Legacy'
	# 1 = 'LSTM'

	# Mode: PAGE_SEG_MODE (--psm)
	# 0 Orientation and script detection (OSD) only.
	# 1 Automatic page segmentation with OSD.
	# 2 Automatic page segmentation, but no OSD, or OCR. (not implemented)
	# 3 Fully automatic page segmentation, but no OSD. (Default)
	# 4 Assume a single column of text of variable sizes.
	# 5 Assume a single uniform block of vertically aligned text.
	# 6 Assume a single uniform block of text.
	# 7 Treat the image as a single text line.
	# 8 Treat the image as a single word.
	# 9 Treat the image as a single word in a circle.
	# 10 Treat the image as a single character.
	# 11 Sparse text. Find as much text as possible in no particular order.
	# 12 Sparse text with OSD.
	# 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

	# Output format (appended after the language option): '' | 'hocr' | 'tsv'
	# NOTE(review): the files built above come from the legacy training tools,
	# while --oem 1 selects the LSTM engine - confirm which engine is intended.
	os.system(f'tesseract {train}.jpg result -l {lang} --oem 1 --psm 3')