billju/tesseract_ocr訓練.js

## tesseract_ocr訓練.js
const puppeteer = require('puppeteer-core');
const fs = require('fs');
const path = require('path');
const glob = require('glob').sync;
const { execSync } = require('child_process');
const edgeExe = 'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe';
/**
 * Crawler for the national electronic bulletin board.
 * https://www.odbbs.gov.tw/odbbs/html/announce.jsp
 *
 * Submits a search whose start-date fields are set to 109 / 01 / 01, then walks the
 * result table page by page. For every row whose folder `odbbs/<docNo>` does not exist
 * yet, the folder is created, the row's attachment page is opened and every attachment
 * link is clicked so the browser downloads the files into that folder.
 *
 * The crawl stops after the row whose first cell equals the total shown in `#total2`,
 * or when a result page has no rows. The browser is closed on exit, including on errors.
 *
 * @returns {Promise<void>}
 */
async function fetch_odbbs() {
	const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
	try {
		const page = await browser.newPage();
		await page.goto('https://www.odbbs.gov.tw/odbbs/html/announce.jsp');
		await page.waitForSelector('[name=fryear]');
		// Set the start date of the query.
		await page.select('[name=fryear]', '109');
		await page.select('[name=frmonth]', '01');
		await page.select('[name=frdate]', '01');
		await page.click('input[value="查 詢 "]');
		// Fixed delay: nothing on the page is awaited as a "results loaded" signal.
		await page.waitForTimeout(3000);
		const total = (await page.$eval('#total2', (el) => el.textContent)).trim();
		let finished = false;
		while (!finished) {
			// Every row except the header row.
			const tb_list_trs = await page.$$('#tb_list tr:not(:first-child)');
			await page.waitForTimeout(500);
			// An empty page means there is nothing left to visit; stop instead of paging forever.
			if (tb_list_trs.length === 0) break;
			for (const tr of tb_list_trs) {
				// Read the row number before any navigation so that the last-row check
				// also applies to rows that are skipped because they were already downloaded.
				const index = (await tr.$eval('td:first-child', (el) => el.textContent)).trim();
				const isLastRow = index === total;
				const docNo = await tr.$eval('td:nth-child(3)', (el) => el.textContent);
				// One folder per document; it doubles as the "already downloaded" marker.
				const downloadPath = path.join(__dirname, 'odbbs/' + docNo);
				console.log(downloadPath);
				if (!fs.existsSync(downloadPath)) {
					fs.mkdirSync(downloadPath, { recursive: true });
					// NOTE(review): `page._client` is a private puppeteer member; confirm it
					// still exists in the installed puppeteer-core version.
					await page._client.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath });
					// Open the attachment page. Per the original author's note,
					// tr.click('td:last-child img') has no effect, so the image handle is clicked.
					const attachmentIcon = await tr.$('td:last-child img');
					await attachmentIcon.click();
					await page.waitForSelector('#att_download');
					await page.waitForTimeout(1000);
					// Click the download link of every attachment row.
					const att_trs = await page.$$('#att_download tr:not(:first-child)');
					for (const t of att_trs) {
						const downloadLink = await t.$('td:last-child a');
						await downloadLink.click();
						await page.waitForTimeout(2000);
					}
					// Go back to the result list.
					await page.click('#box_05 input[value="返 回 列 表 "]');
					await page.waitForTimeout(500);
				}
				if (isLastRow) {
					finished = true;
					break;
				}
			}
			if (finished) break;
			// Next result page.
			await page.click('img[title=下一頁]');
			await page.waitForTimeout(500);
		}
	} finally {
		await browser.close();
	}
}
/**
 * Turn the downloaded PDFs into a training data set.
 * https://mozilla.github.io/pdf.js/web/viewer.html
 *
 * Every PDF matched by `odbbs/**` + `/*.pdf` is loaded into the hosted pdf.js viewer.
 * For each page, the per-character rectangles of the text layer are written to
 * `odbbs/box/<pdf name>P<page>.box` and a screenshot of the page area is written to
 * `odbbs/jpg/<pdf name>P<page>.jpg`. Pages whose box text is shorter than 100
 * characters are skipped. The browser is closed on exit, including on errors.
 *
 * @returns {Promise<void>}
 */
async function pdf2image() {
	const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
	try {
		const page = await browser.newPage();
		// Size the viewport to an A4 sheet (210 x 297 scaled by 3.78) plus a small margin.
		await page.setViewport({ width: Math.trunc(210 * 3.78) + 20, height: Math.trunc(297 * 3.78) + 40 });
		await page.goto('https://mozilla.github.io/pdf.js/web/viewer.html');
		await page.waitForSelector('input[type=file]');
		const fileInput = await page.$('input[type=file]');
		for (const pdfFile of glob(path.join(__dirname, 'odbbs/**/*.pdf'))) {
			// Upload the PDF; awaited so a failed upload is reported instead of being lost.
			await fileInput.uploadFile(pdfFile);
			await page.waitForTimeout(2000);
			await page.select('#scaleSelect', 'page-actual');
			await page.waitForTimeout(500);
			// `max` of the page-number input is read as the page count (it arrives as a string).
			const totalPage = Number(await page.$eval('#pageNumber', (el) => el.max));
			const baseName = path.basename(pdfFile, '.pdf');
			for (let curPage = 1; curPage <= totalPage; curPage++) {
				// Runs inside the browser: collect one box line per character of the text layer.
				const { box, clip } = await page.$eval(`#viewer .page:nth-child(${curPage}) .textLayer`, (el) => {
					const { x, y, width, height } = el.getBoundingClientRect();
					const range = new Range();
					let box = '';
					for (const span of el.querySelectorAll('span[role=presentation]')) {
						// Exclude the binding margin (left edge) and the page number (bottom).
						if (parseFloat(span.style.left) < 70) continue;
						if (parseFloat(span.style.top) > 800) continue;
						const textNode = span.firstChild;
						// Empty spans have no child, and only text nodes can be addressed by character offset.
						if (!textNode || textNode.nodeType !== Node.TEXT_NODE) continue;
						for (let i = 0; i < textNode.length; i++) {
							const char = textNode.textContent.slice(i, i + 1);
							if (char === ' ') continue;
							range.setStart(textNode, i);
							range.setEnd(textNode, i + 1);
							const { top, left, right, bottom } = range.getBoundingClientRect();
							// Line layout: char left bottom right top page. Coordinates are relative to
							// the clipped page area with the y axis flipped (measured from its bottom edge).
							const rect = [left - x, height - bottom + y, right - x, height - top + y]
								.map((value) => Math.trunc(value))
								.join(' ');
							box += char + ' ' + rect + ' 0\n';
						}
					}
					return { box, clip: { x, y, width, height } };
				});
				// Only keep pages that carry enough text to be useful as training data.
				if (box.length >= 100) {
					const name = baseName + 'P' + curPage;
					const boxFile = path.join(__dirname, 'odbbs/box', name + '.box');
					const jpgFile = path.join(__dirname, 'odbbs/jpg', name + '.jpg');
					fs.writeFileSync(boxFile, box);
					await page.screenshot({ path: jpgFile, clip });
				}
				// Always advance the viewer, including for skipped pages, so the page on
				// screen stays in step with curPage.
				await page.click('button#next');
				await page.waitForTimeout(1000);
			}
		}
	} finally {
		await browser.close();
	}
}
/**
 * Train the OCR engine.
 * Official guide: https://tesseract-ocr.github.io/tessdoc/tess3/Training-Tesseract-3.03%E2%80%933.05.html
 * Engine download (version 4 recommended): https://github.com/UB-Mannheim/tesseract/wiki
 * Weights download (there are version issues; selecting "language data > chi_tra" in the
 * installer is recommended): https://github.com/tesseract-ocr/tessdata_best
 * Environment variable: C:\Program Files (x86)\Tesseract-OCR\
 *
 * Changes the process working directory to `odbbs/tessdata`, copies each image from
 * `../jpg` together with the box file of the same name from `../box`, runs the
 * Tesseract command-line training tools and finally copies the result, renamed to
 * `chi_tra_test.traineddata`, into the Tesseract installation folder.
 *
 * @throws {Error} when no `.tr` file is available, or when a tool or file operation fails.
 */
function trainTesseract() {
	process.chdir(path.join(__dirname, 'odbbs/tessdata'));
	console.log(process.cwd());
	const lang = 'chi_tra';
	const font = 'kaiu';
	// Copy the image (.jpg) and its box file (.box), then generate the training data (.tr).
	const jpgs = glob('../jpg/*.jpg');
	for (let i = 0; i < jpgs.length; i++) {
		const train = `${lang}.${font}.exp${i}`;
		if (fs.existsSync(train + '.tr')) continue;
		// Pair by file name rather than by list position, so that a missing box file
		// cannot shift every following image onto the wrong boxes.
		const boxSource = path.join('../box', path.basename(jpgs[i], '.jpg') + '.box');
		if (!fs.existsSync(boxSource)) {
			console.warn(`Skipping ${jpgs[i]}: no matching box file ${boxSource}`);
			continue;
		}
		fs.copyFileSync(jpgs[i], train + '.jpg');
		fs.copyFileSync(boxSource, train + '.box');
		// Alternative kept from the original: let tesseract generate the boxes itself with
		//   tesseract -l <lang> <train>.jpg <train> batch.nochop makebox
		execSync(`tesseract -l ${lang} ${train}.jpg ${train} box.train`);
	}
	const boxes = glob('*.box').join(' ');
	const trs = glob('*.tr').join(' ');
	if (trs === '') throw new Error('No .tr training files found in odbbs/tessdata; nothing to train on.');
	// Font properties line: <fontname> <italic> <bold> <fixed> <serif> <fraktur>.
	// The entry is keyed by the font part of the `lang.font.expN` training file names.
	fs.writeFileSync('font_properties', `${font} 0 0 0 1 0\n`);
	execSync(`unicharset_extractor --output_unicharset unicharset ${boxes}`);
	execSync(`mftraining -F font_properties -U unicharset -O ${lang}.unicharset -D . ${trs}`);
	execSync(`cntraining -D . ${trs}`);
	// Prefix the generated files with the language code before combining them.
	for (const component of ['inttemp', 'normproto', 'pffmtable', 'shapetable']) {
		fs.renameSync(component, `${lang}.${component}`);
	}
	// Combine the files into the trained weights.
	execSync(`combine_tessdata ${lang}.`);
	// Rename to avoid clashing with the existing language data, then copy it into the
	// installation folder (note: writing under Program Files may be blocked by permissions).
	const trained = `${lang}_test.traineddata`;
	fs.renameSync(`${lang}.traineddata`, trained);
	// copyFileSync needs a destination file path, not just the target directory.
	fs.copyFileSync(trained, path.join('C:/Program Files (x86)/Tesseract-OCR/tessdata/', trained));
}
/**
 * Test the recognition result.
 *
 * Changes the process working directory to `odbbs/tessdata`, runs tesseract with the
 * `chi_tra_test` language on the first `.jpg` found there and logs the image path and
 * the path of the produced text file.
 *
 * @throws {Error} when the folder contains no `.jpg`, or when the tesseract command fails.
 */
function testTesseract() {
	/**
	 * Engine OCR_ENGINE_MODE (--oem)
	 * 0 = 'Legacy'
	 * 1 = 'LSTM'
	 *
	 * Mode PAGE_SEG_MODE (--psm)
	 * 0  Orientation and script detection (OSD) only.
	 * 1  Automatic page segmentation with OSD.
	 * 2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
	 * 3  Fully automatic page segmentation, but no OSD. (Default)
	 * 4  Assume a single column of text of variable sizes.
	 * 5  Assume a single uniform block of vertically aligned text.
	 * 6  Assume a single uniform block of text.
	 * 7  Treat the image as a single text line.
	 * 8  Treat the image as a single word.
	 * 9  Treat the image as a single word in a circle.
	 * 10 Treat the image as a single character.
	 * 11 Sparse text. Find as much text as possible in no particular order.
	 * 12 Sparse text with OSD.
	 * 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
	 *
	 * Output format (appended after the language) '' | 'hocr' | 'tsv' | 'batch.nochop makebox'
	 */
	process.chdir(path.join(__dirname, 'odbbs/tessdata'));
	const lang = 'chi_tra_test';
	const testImage = glob('*.jpg')[0];
	// Without this guard the literal text "undefined" would be passed to tesseract as the image name.
	if (testImage === undefined) throw new Error('No .jpg image found in odbbs/tessdata to test with.');
	const textFile = 'result'; // tesseract appends .txt itself
	// NOTE(review): --oem 1 selects the LSTM engine; confirm this is intended for the chi_tra_test data.
	execSync(`tesseract ${testImage} ${textFile} -l ${lang} --oem 1 --psm 3`);
	console.log('odbbs/tessdata/' + testImage, 'odbbs/tessdata/' + textFile + '.txt');
}
/**
 * Create the (empty) working folders under odbbs/.
 * With `recursive: true`, mkdirSync does not fail when the folder already exists.
 */
for (const dir of ['jpg', 'box', 'tessdata']) {
	fs.mkdirSync(path.join(__dirname, 'odbbs', dir), { recursive: true });
}
/**
 * Run from the command line: node scripts/odbbs-crawler [argument]
 * where the argument is one of fetch | image | train | test.
 * Any other argument (or none) runs nothing, as before.
 */
{
	const commands = new Map([
		['fetch', fetch_odbbs],
		['image', pdf2image],
		['train', trainTesseract],
		['test', testTesseract],
	]);
	const run = commands.get(process.argv[2]);
	if (run) {
		// Route synchronous throws and asynchronous rejections through one handler
		// so that no promise is left floating.
		Promise.resolve()
			.then(() => run())
			.catch((err) => {
				console.error(err);
				process.exitCode = 1;
			});
	}
}
	const puppeteer = require('puppeteer-core');
	const fs = require('fs');
	const path = require('path');
	const glob = require('glob').sync;
	const { execSync } = require('child_process');
	const edgeExe = 'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe';
	/**
	 * Crawler for the national electronic bulletin board.
	 * https://www.odbbs.gov.tw/odbbs/html/announce.jsp
	 *
	 * Submits a search whose start-date fields are set to 109 / 01 / 01, then walks the
	 * result table page by page. For every row whose folder `odbbs/<docNo>` does not exist
	 * yet, the folder is created, the row's attachment page is opened and every attachment
	 * link is clicked so the browser downloads the files into that folder.
	 *
	 * The crawl stops after the row whose first cell equals the total shown in `#total2`,
	 * or when a result page has no rows. The browser is closed on exit, including on errors.
	 *
	 * @returns {Promise<void>}
	 */
	async function fetch_odbbs() {
		const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
		try {
			const page = await browser.newPage();
			await page.goto('https://www.odbbs.gov.tw/odbbs/html/announce.jsp');
			await page.waitForSelector('[name=fryear]');
			// Set the start date of the query.
			await page.select('[name=fryear]', '109');
			await page.select('[name=frmonth]', '01');
			await page.select('[name=frdate]', '01');
			// Button value written with the same spacing as the intact copy of this function.
			await page.click('input[value="查 詢 "]');
			// Fixed delay: nothing on the page is awaited as a "results loaded" signal.
			await page.waitForTimeout(3000);
			const total = (await page.$eval('#total2', (el) => el.textContent)).trim();
			let finished = false;
			while (!finished) {
				// Every row except the header row.
				const tb_list_trs = await page.$$('#tb_list tr:not(:first-child)');
				await page.waitForTimeout(500);
				// An empty page means there is nothing left to visit; stop instead of paging forever.
				if (tb_list_trs.length === 0) break;
				for (const tr of tb_list_trs) {
					// Read the row number before any navigation so that the last-row check
					// also applies to rows that are skipped because they were already downloaded.
					const index = (await tr.$eval('td:first-child', (el) => el.textContent)).trim();
					const isLastRow = index === total;
					const docNo = await tr.$eval('td:nth-child(3)', (el) => el.textContent);
					// One folder per document; it doubles as the "already downloaded" marker.
					const downloadPath = path.join(__dirname, 'odbbs/' + docNo);
					console.log(downloadPath);
					if (!fs.existsSync(downloadPath)) {
						fs.mkdirSync(downloadPath, { recursive: true });
						// NOTE(review): `page._client` is a private puppeteer member; confirm it
						// still exists in the installed puppeteer-core version.
						await page._client.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath });
						// Open the attachment page. Per the original author's note,
						// tr.click('td:last-child img') has no effect, so the image handle is clicked.
						const attachmentIcon = await tr.$('td:last-child img');
						await attachmentIcon.click();
						await page.waitForSelector('#att_download');
						await page.waitForTimeout(1000);
						// Click the download link of every attachment row.
						const att_trs = await page.$$('#att_download tr:not(:first-child)');
						for (const t of att_trs) {
							const downloadLink = await t.$('td:last-child a');
							await downloadLink.click();
							await page.waitForTimeout(2000);
						}
						// Go back to the result list.
						await page.click('#box_05 input[value="返 回 列 表 "]');
						await page.waitForTimeout(500);
					}
					if (isLastRow) {
						finished = true;
						break;
					}
				}
				if (finished) break;
				// Next result page.
				await page.click('img[title=下一頁]');
				await page.waitForTimeout(500);
			}
		} finally {
			await browser.close();
		}
	}
	/**
	 * Turn the downloaded PDFs into a training data set.
	 * https://mozilla.github.io/pdf.js/web/viewer.html
	 *
	 * Every PDF found at any depth below `odbbs/` is loaded into the hosted pdf.js viewer.
	 * For each page, the per-character rectangles of the text layer are written to
	 * `odbbs/box/<pdf name>P<page>.box` and a screenshot of the page area is written to
	 * `odbbs/jpg/<pdf name>P<page>.jpg`. Pages whose box text is shorter than 100
	 * characters are skipped. The browser is closed on exit, including on errors.
	 *
	 * @returns {Promise<void>}
	 */
	async function pdf2image() {
		const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
		try {
			const page = await browser.newPage();
			// Size the viewport to an A4 sheet (210 x 297 scaled by 3.78) plus a small margin.
			await page.setViewport({ width: Math.trunc(210 * 3.78) + 20, height: Math.trunc(297 * 3.78) + 40 });
			await page.goto('https://mozilla.github.io/pdf.js/web/viewer.html');
			await page.waitForSelector('input[type=file]');
			const fileInput = await page.$('input[type=file]');
			// The pattern must be `**/*.pdf`; `*/.pdf` only matches files literally named ".pdf".
			for (const pdfFile of glob(path.join(__dirname, 'odbbs/**/*.pdf'))) {
				// Upload the PDF; awaited so a failed upload is reported instead of being lost.
				await fileInput.uploadFile(pdfFile);
				await page.waitForTimeout(2000);
				await page.select('#scaleSelect', 'page-actual');
				await page.waitForTimeout(500);
				// `max` of the page-number input is read as the page count (it arrives as a string).
				const totalPage = Number(await page.$eval('#pageNumber', (el) => el.max));
				const baseName = path.basename(pdfFile, '.pdf');
				for (let curPage = 1; curPage <= totalPage; curPage++) {
					// Runs inside the browser: collect one box line per character of the text layer.
					const { box, clip } = await page.$eval(`#viewer .page:nth-child(${curPage}) .textLayer`, (el) => {
						const { x, y, width, height } = el.getBoundingClientRect();
						const range = new Range();
						let box = '';
						for (const span of el.querySelectorAll('span[role=presentation]')) {
							// Exclude the binding margin (left edge) and the page number (bottom).
							if (parseFloat(span.style.left) < 70) continue;
							if (parseFloat(span.style.top) > 800) continue;
							const textNode = span.firstChild;
							// Empty spans have no child, and only text nodes can be addressed by character offset.
							if (!textNode || textNode.nodeType !== Node.TEXT_NODE) continue;
							for (let i = 0; i < textNode.length; i++) {
								const char = textNode.textContent.slice(i, i + 1);
								if (char === ' ') continue;
								range.setStart(textNode, i);
								range.setEnd(textNode, i + 1);
								const { top, left, right, bottom } = range.getBoundingClientRect();
								// Line layout: char left bottom right top page. Coordinates are relative to
								// the clipped page area with the y axis flipped (measured from its bottom edge).
								const rect = [left - x, height - bottom + y, right - x, height - top + y]
									.map((value) => Math.trunc(value))
									.join(' ');
								box += char + ' ' + rect + ' 0\n';
							}
						}
						return { box, clip: { x, y, width, height } };
					});
					// Only keep pages that carry enough text to be useful as training data.
					if (box.length >= 100) {
						const name = baseName + 'P' + curPage;
						const boxFile = path.join(__dirname, 'odbbs/box', name + '.box');
						const jpgFile = path.join(__dirname, 'odbbs/jpg', name + '.jpg');
						fs.writeFileSync(boxFile, box);
						await page.screenshot({ path: jpgFile, clip });
					}
					// Always advance the viewer, including for skipped pages, so the page on
					// screen stays in step with curPage.
					await page.click('button#next');
					await page.waitForTimeout(1000);
				}
			}
		} finally {
			await browser.close();
		}
	}
	/**
	 * Train the OCR engine.
	 * Official guide: https://tesseract-ocr.github.io/tessdoc/tess3/Training-Tesseract-3.03%E2%80%933.05.html
	 * Engine download (version 4 recommended): https://github.com/UB-Mannheim/tesseract/wiki
	 * Weights download (there are version issues; selecting "language data > chi_tra" in the
	 * installer is recommended): https://github.com/tesseract-ocr/tessdata_best
	 * Environment variable: C:\Program Files (x86)\Tesseract-OCR\
	 *
	 * Changes the process working directory to `odbbs/tessdata`, copies each image from
	 * `../jpg` together with the box file of the same name from `../box`, runs the
	 * Tesseract command-line training tools and finally copies the result, renamed to
	 * `chi_tra_test.traineddata`, into the Tesseract installation folder.
	 *
	 * @throws {Error} when no `.tr` file is available, or when a tool or file operation fails.
	 */
	function trainTesseract() {
		process.chdir(path.join(__dirname, 'odbbs/tessdata'));
		console.log(process.cwd());
		const lang = 'chi_tra';
		const font = 'kaiu';
		// Copy the image (.jpg) and its box file (.box), then generate the training data (.tr).
		const jpgs = glob('../jpg/*.jpg');
		for (let i = 0; i < jpgs.length; i++) {
			const train = `${lang}.${font}.exp${i}`;
			if (fs.existsSync(train + '.tr')) continue;
			// Pair by file name rather than by list position, so that a missing box file
			// cannot shift every following image onto the wrong boxes.
			const boxSource = path.join('../box', path.basename(jpgs[i], '.jpg') + '.box');
			if (!fs.existsSync(boxSource)) {
				console.warn(`Skipping ${jpgs[i]}: no matching box file ${boxSource}`);
				continue;
			}
			fs.copyFileSync(jpgs[i], train + '.jpg');
			fs.copyFileSync(boxSource, train + '.box');
			// Alternative kept from the original: let tesseract generate the boxes itself with
			//   tesseract -l <lang> <train>.jpg <train> batch.nochop makebox
			execSync(`tesseract -l ${lang} ${train}.jpg ${train} box.train`);
		}
		const boxes = glob('*.box').join(' ');
		const trs = glob('*.tr').join(' ');
		if (trs === '') throw new Error('No .tr training files found in odbbs/tessdata; nothing to train on.');
		// Font properties line: <fontname> <italic> <bold> <fixed> <serif> <fraktur>.
		// The entry is keyed by the font part of the `lang.font.expN` training file names.
		fs.writeFileSync('font_properties', `${font} 0 0 0 1 0\n`);
		execSync(`unicharset_extractor --output_unicharset unicharset ${boxes}`);
		execSync(`mftraining -F font_properties -U unicharset -O ${lang}.unicharset -D . ${trs}`);
		execSync(`cntraining -D . ${trs}`);
		// Prefix the generated files with the language code before combining them.
		for (const component of ['inttemp', 'normproto', 'pffmtable', 'shapetable']) {
			fs.renameSync(component, `${lang}.${component}`);
		}
		// Combine the files into the trained weights.
		execSync(`combine_tessdata ${lang}.`);
		// Rename to avoid clashing with the existing language data, then copy it into the
		// installation folder (note: writing under Program Files may be blocked by permissions).
		const trained = `${lang}_test.traineddata`;
		fs.renameSync(`${lang}.traineddata`, trained);
		// copyFileSync needs a destination file path, not just the target directory.
		fs.copyFileSync(trained, path.join('C:/Program Files (x86)/Tesseract-OCR/tessdata/', trained));
	}
	/**
	 * Test the recognition result.
	 *
	 * Changes the process working directory to `odbbs/tessdata`, runs tesseract with the
	 * `chi_tra_test` language on the first `.jpg` found there and logs the image path and
	 * the path of the produced text file.
	 *
	 * @throws {Error} when the folder contains no `.jpg`, or when the tesseract command fails.
	 */
	function testTesseract() {
		/**
		 * Engine OCR_ENGINE_MODE (--oem)
		 * 0 = 'Legacy'
		 * 1 = 'LSTM'
		 *
		 * Mode PAGE_SEG_MODE (--psm)
		 * 0  Orientation and script detection (OSD) only.
		 * 1  Automatic page segmentation with OSD.
		 * 2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
		 * 3  Fully automatic page segmentation, but no OSD. (Default)
		 * 4  Assume a single column of text of variable sizes.
		 * 5  Assume a single uniform block of vertically aligned text.
		 * 6  Assume a single uniform block of text.
		 * 7  Treat the image as a single text line.
		 * 8  Treat the image as a single word.
		 * 9  Treat the image as a single word in a circle.
		 * 10 Treat the image as a single character.
		 * 11 Sparse text. Find as much text as possible in no particular order.
		 * 12 Sparse text with OSD.
		 * 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
		 *
		 * Output format (appended after the language) '' | 'hocr' | 'tsv' | 'batch.nochop makebox'
		 */
		process.chdir(path.join(__dirname, 'odbbs/tessdata'));
		const lang = 'chi_tra_test';
		const testImage = glob('*.jpg')[0];
		// Without this guard the literal text "undefined" would be passed to tesseract as the image name.
		if (testImage === undefined) throw new Error('No .jpg image found in odbbs/tessdata to test with.');
		const textFile = 'result'; // tesseract appends .txt itself
		// NOTE(review): --oem 1 selects the LSTM engine; confirm this is intended for the chi_tra_test data.
		execSync(`tesseract ${testImage} ${textFile} -l ${lang} --oem 1 --psm 3`);
		console.log('odbbs/tessdata/' + testImage, 'odbbs/tessdata/' + textFile + '.txt');
	}
	/**
	 * Create the (empty) working folders under odbbs/.
	 * With `recursive: true`, mkdirSync does not fail when the folder already exists.
	 */
	for (const dir of ['jpg', 'box', 'tessdata']) {
		fs.mkdirSync(path.join(__dirname, 'odbbs', dir), { recursive: true });
	}
	/**
	 * Run from the command line: node scripts/odbbs-crawler [argument]
	 * where the argument is one of fetch | image | train | test.
	 * Any other argument (or none) runs nothing, as before.
	 */
	{
		const commands = new Map([
			['fetch', fetch_odbbs],
			['image', pdf2image],
			['train', trainTesseract],
			['test', testTesseract],
		]);
		const run = commands.get(process.argv[2]);
		if (run) {
			// Route synchronous throws and asynchronous rejections through one handler
			// so that no promise is left floating.
			Promise.resolve()
				.then(() => run())
				.catch((err) => {
					console.error(err);
					process.exitCode = 1;
				});
		}
	}