Skip to content

Instantly share code, notes, and snippets.

@billju
Last active December 9, 2021 10:33
Show Gist options
  • Save billju/c55b75e91c774e0bb837fc00253068cc to your computer and use it in GitHub Desktop.
Save billju/c55b75e91c774e0bb837fc00253068cc to your computer and use it in GitHub Desktop.
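# Download Tesseract: https://tesseract-ocr.github.io/tessdoc/Downloads.html
# Trained models (weights): https://github.com/tesseract-ocr/tessdata_best
# .box file format, one line per character: <char> <left> <bottom> <right> <top> <page>, e.g. 字 6 394 45 410 0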
import os
import shutil
from PIL import Image
from glob import glob
# Training configuration: Tesseract language code, font name used in the
# training file names, and the working directory for all generated files.
lang = 'chi_tra'
font = 'ocrb'
dist = 'tessdata'
# Switch into the working directory unless the script already runs inside it.
# The names must be compared by value (`!=`): an identity test (`is not`) is
# true for equal strings stored in different objects, which made every run
# create and enter a nested tessdata/tessdata directory.
if os.path.basename(os.getcwd()) != dist:
    # Also create the jpg/ subfolder that holds the source images.
    os.makedirs(os.path.join(dist, 'jpg'), exist_ok=True)
    os.chdir(dist)
# Convert every PNG in the working directory to an RGB JPEG inside jpg/,
# the folder the training-data step reads its source images from. Saving the
# JPEG next to the PNG would be useless: the cleanup that follows deletes
# every *.jpg found directly in the working directory.
os.makedirs('jpg', exist_ok=True)
for image in glob('*.png'):
    # splitext strips the '.png' suffix; the previous `image[:4]` kept only
    # the first four characters of the name, so files could overwrite each
    # other and short names kept part of their extension.
    stem = os.path.splitext(image)[0]
    # The context manager closes the source file once the copy is saved.
    with Image.open(image) as picture:
        picture.convert('RGB').save(os.path.join('jpg', stem + '.jpg'))
# Copy the images (.jpg), then generate the recognition boxes (.box) and the
# training data (.tr) for each of them.
# First remove the JPEG copies left in the working directory by an earlier run.
for stale in glob('*.jpg'):
    os.remove(stale)
# Sorted so that each source image receives the same exp<N> index on every
# run; glob() alone returns files in arbitrary directory order.
for i, image in enumerate(sorted(glob('jpg/*.jpg'))):
    # Base name shared by the copied image and the files generated from it.
    train = f'{lang}.{font}.exp{i}'
    shutil.copy(image, f'{train}.jpg')
    # Two tesseract passes over the copy: `makebox`, then `box.train`.
    os.system(f'tesseract -l {lang} {train}.jpg {train} batch.nochop makebox')
    os.system(f'tesseract -l {lang} {train}.jpg {train} nobatch box.train')
# Collect every box file and every training file in the working directory as
# space-separated command-line arguments (sorted for a stable order).
boxes = ' '.join(sorted(glob('*.box')))
trs = ' '.join(sorted(glob('*.tr')))
# Write the font properties: <font> <italic> <bold> <fixed> <serif> <fraktur>.
# The entry is keyed by the font name embedded in the training file names
# (<lang>.<font>.exp<N>), not by the language code.
with open('font_properties', 'w') as properties:
    properties.write(f'{font} 0 0 0 1 0\n')
os.system(f'unicharset_extractor --output_unicharset unicharset {boxes}')
os.system(f'mftraining -F font_properties -U unicharset -O {lang}.unicharset -D . {trs}')
os.system(f'cntraining -D . {trs}')
# Prefix the training outputs with the language code before they are combined.
# os.replace overwrites the results of an earlier run, whereas os.rename
# raises FileExistsError on Windows when the target already exists.
for artifact in ('inttemp', 'normproto', 'pffmtable', 'shapetable'):
    os.replace(artifact, f'{lang}.{artifact}')
os.system(f'combine_tessdata {lang}.')
# Engine: OCR_ENGINE_MODE (--oem)
#    0 = 'Legacy'
#    1 = 'LSTM'
# Mode: PAGE_SEG_MODE (--psm)
#    0  Orientation and script detection (OSD) only.
#    1  Automatic page segmentation with OSD.
#    2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
#    3  Fully automatic page segmentation, but no OSD. (Default)
#    4  Assume a single column of text of variable sizes.
#    5  Assume a single uniform block of vertically aligned text.
#    6  Assume a single uniform block of text.
#    7  Treat the image as a single text line.
#    8  Treat the image as a single word.
#    9  Treat the image as a single word in a circle.
#   10  Treat the image as a single character.
#   11  Sparse text. Find as much text as possible in no particular order.
#   12  Sparse text with OSD.
#   13  Raw line. Treat the image as a single text line, bypassing hacks
#       that are Tesseract-specific.
# Output format (appended after the language option): '' | 'hocr' | 'tsv'
# Run OCR on the last image prepared by the training loop (`train` keeps the
# value from its final iteration), using `result` as the output base name.
os.system(
    f'tesseract {train}.jpg result -l {lang} --oem 1 --psm 3'
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment