Skip to content

Instantly share code, notes, and snippets.

@pladaria
Created April 8, 2023 11:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pladaria/c19c123a60199d7dfd19a81f9c61f9d2 to your computer and use it in GitHub Desktop.
Save pladaria/c19c123a60199d7dfd19a81f9c61f9d2 to your computer and use it in GitHub Desktop.
Tesseract OCR TSV to JSON converter
/**
 * Reads a tab-separated Tesseract report and writes it back out as a nested
 * JSON document shaped `pages -> blocks -> paragraphs -> lines -> words`.
 *
 * Columns are read positionally as: level, page, block, paragraph, line,
 * word, left, top, width, height, confidence, text. The first line of the
 * file is treated as a header and ignored. The `level` and `confidence`
 * columns are not used.
 *
 * The row type is inferred from what already exists in the tree: the first
 * row that names a not-yet-seen page/block/paragraph/line creates that node
 * (with its bounding box) and contributes nothing else; a row whose four
 * ancestors all exist is stored as a word, unless that word slot is already
 * taken.
 *
 * Bounding-box values are stored exactly as read from the file (strings).
 *
 * @param {string} tsvFilename - path of the TSV file to read (UTF-8).
 * @param {string} jsonFilename - path the JSON result is written to,
 *     pretty-printed with two-space indentation.
 * @returns {void}
 */
const tesseractTsvToJson = (tsvFilename, jsonFilename) => {
    console.log(`> tsvToJson ${tsvFilename} => ${jsonFilename}`);

    // Property holding the children of each tree level, outermost first.
    const childKeys = ['blocks', 'paragraphs', 'lines', 'words'];
    // Columns holding the 1-based page/block/paragraph/line/word numbers.
    const numberColumns = [1, 2, 3, 4, 5];

    // Drop the header row, keep the data rows in file order.
    const [, ...rows] = fs.readFileSync(tsvFilename, 'utf-8').trim().split('\n');

    /** @type {any} */
    const json = {pages: []};

    for (const row of rows) {
        const fields = row.trim().split('\t');

        // Convert the 1-based numbers to 0-based array positions
        // (a missing column yields NaN).
        const indices = numberColumns.map((column) => Number(fields[column]) - 1);
        const box = {
            top: fields[7],
            left: fields[6],
            width: fields[8],
            height: fields[9],
        };

        // Walk down through the ancestors that already exist.
        let siblings = json.pages;
        let depth = 0;
        while (depth < childKeys.length && siblings[indices[depth]]) {
            siblings = siblings[indices[depth]][childKeys[depth]];
            depth += 1;
        }

        if (depth < childKeys.length) {
            // First sighting of this page/block/paragraph/line: create it.
            siblings[indices[depth]] = {[childKeys[depth]]: [], ...box};
        } else if (!siblings[indices[depth]]) {
            // Every ancestor exists, so this row is a word.
            siblings[indices[depth]] = {text: fields[11] ?? '', ...box};
        }
    }

    fs.writeFileSync(jsonFilename, JSON.stringify(json, null, 2));
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment