Skip to content

Instantly share code, notes, and snippets.

@vsemozhetbyt
Last active April 26, 2017 20:53
Show Gist options
  • Save vsemozhetbyt/99132f861d8ab9505d7a9c1988e7b544 to your computer and use it in GitHub Desktop.
Save vsemozhetbyt/99132f861d8ab9505d7a9c1988e7b544 to your computer and use it in GitHub Desktop.
/******************************************************************************/
'use strict';
/******************************************************************************/
// The dictionary path is the first command-line argument after the script name.
const fpath = process.argv[2];
if (!fpath) {
  // Nothing to process: tell the user and stop with a failure exit code.
  console.error('Add the file path, please.');
  process.exit(1);
}
/******************************************************************************/
const fs = require('fs');
const rl = require('readline');
// Detect the input encoding before any stream is opened; a falsy result
// means detection failed, so the script stops with a failure exit code.
const inCoding = guessEncoding(fpath);
if (!inCoding) {
  process.exit(1);
}
// The output file is written in the same encoding as the input file.
const outCoding = inCoding;
// Matches the file extension at the very end of a path (captured in group 1),
// or the empty string at the end when the file name has no extension, so that
// a suffix can be inserted right before the extension via `'<suffix>$1'`.
// Path separators are excluded from the extension: otherwise a dot inside a
// directory name (e.g. `dir.v2/test`) would be mistaken for the start of the
// extension and the suffix would land inside the directory part of the path.
const toChangeFileNameRE = /(\.[^.\\/]+)?$/;
// Matches a byte order mark at the start of a string.
const bomRE = /^\uFEFF/;
// Line reader over the input dictionary, decoded with the detected encoding.
const rli = rl.createInterface({
  input: fs.createReadStream(fpath, inCoding),
});
// The output goes next to the input: `<name>.retagged<.ext>`.
const outFileName = fpath.replace(toChangeFileNameRE, '.retagged$1');
const outFile = fs.openSync(outFileName, 'w');
// Start the output with a byte order mark, written in the output encoding.
fs.writeSync(outFile, '\uFEFF', null, outCoding);
/******************************************************************************/
const r = String.raw;

// Line classifiers: a directive starts with `#`, a headword with any other
// non-whitespace character, and a card body line with whitespace.
const isDirective = /^#/;
const isHeadword = /^[^#\s]/;
const isBody = /^\s/;

// Accumulates the body lines of the card currently being read.
let cardBuf = '';

// Building blocks of the DSL tag pattern (\x5c is a backslash, \x5b and \x5d
// are the square brackets).
const startOfString = '^';
const notEscapeSymbol = r`[^\x5c]`;
const escapedEscapeSymbols = r`(?:${startOfString}|${notEscapeSymbol})(?:\x5c{2})+`;
// Group 1 is the tag name, group 2 the optional space-separated attribute.
const DSLTag = r`\x5b([^\x5d]+?)(?: ([^\x5d]+))*\x5d`;

// A tag counts as not escaped when it is preceded by the start of the string,
// by a non-backslash character, or by an even-length run of backslashes.
const allowedPrefixes = [startOfString, notEscapeSymbol, escapedEscapeSymbols].join('|');
const notEscapedDSLTagRE = new RegExp(`(?<=${allowedPrefixes})${DSLTag}`, 'g');

// Group 1 is the HTML tag name, group 2 the optional `title` attribute value.
const HTMLTagRE = /<([^>]+?)(?: title="([^\x22]+)")*>/g;
// DSL tag name -> HTML element name used as its stand-in. Only opening tags
// are listed here; the closing forms are derived right below.
const DSLtoHTMLmap = new Map([
  ['b', 'b'],
  ['i', 'i'],
  ['u', 'u'],
  ['sub', 'small'],
  ['sup', 'big'],
  ['p', 'a'],
  ['!trs', 'strike'],
  ['ex', 'tt'],
  ['com', 'code'],
  ['trn', 'strong'],
  ["'", 'nobr'],
  ['*', 's'],
  ['c', 'em'],
  ['lang', 'font'],
]);
// Add the `/tag` counterpart of every entry. The loop runs over a snapshot
// because the map grows while the closing forms are being added.
for (const [dslTag, htmlTag] of Array.from(DSLtoHTMLmap)) {
  DSLtoHTMLmap.set(`/${dslTag}`, `/${htmlTag}`);
}
// Reverse lookup: HTML element name -> DSL tag name, in the same order.
const HTMLtoDSLmap = new Map(
  Array.from(DSLtoHTMLmap, ([dslTag, htmlTag]) => [htmlTag, dslTag])
);
// Counter of headwords/cards whose text changed after the round trip.
let retagged = 0;
/******************************************************************************/
const { JSDOM } = require('jsdom');
// A single empty jsdom document is created once; `doc` is its `document`.
const { window: { document: doc } } = new JSDOM();
// Start from an empty <body>.
doc.body.innerHTML = '';
/******************************************************************************/
console.log('Processing dictionary...\n');
// The progress bar runs from 0 to the input file size in bytes.
const pbRead = pb(fs.statSync(fpath).size);
pbRead.start();
let lineNumber = 0;
// readline strips line terminators, so their size is added back for the
// progress accounting. The size is measured in the input encoding because
// "\n" takes two bytes in UTF-16 and one in UTF-8.
const eolByteLength = Buffer.byteLength('\n', inCoding);

/** Writes `text` to the output file in the output encoding. */
function writeOut(text) {
  fs.writeSync(outFile, text, null, outCoding);
}

/** Retags and writes the buffered card body, if any, then empties the buffer. */
function flushCard() {
  if (!cardBuf) return;
  writeOut(dsl2html2dsl(cardBuf));
  cardBuf = '';
}

rli.on('line', (rawLine) => {
  // Count the raw line (before the BOM is stripped) so the byte total matches the file.
  pbRead.stat += Buffer.byteLength(rawLine, inCoding) + eolByteLength;
  lineNumber += 1;
  // Only the very first line can carry the byte order mark.
  const line = lineNumber === 1 ? rawLine.replace(bomRE, '') : rawLine;

  if (isDirective.test(line)) {
    // Flush a pending card first so the output keeps the input order even
    // when a directive follows card body lines.
    flushCard();
    writeOut(`${line}\n`);
  } else if (isHeadword.test(line)) {
    // A new headword ends the previous card.
    flushCard();
    writeOut(`${dsl2html2dsl(line)}\n`);
  } else if (isBody.test(line)) {
    // Body lines are collected and processed together as one card.
    cardBuf += `${line}\n`;
  } else if (!line) {
    // An empty line stays with the current card if there is one; otherwise
    // it is copied to the output as is.
    if (cardBuf) cardBuf += '\n';
    else writeOut('\n');
  }
}).on('close', () => {
  // The last card has no following headword to trigger its flush.
  flushCard();
  pbRead.end();
  fs.closeSync(outFile);
  // Put the number of retagged items into the final file name.
  fs.renameSync(
    outFileName, outFileName.replace(toChangeFileNameRE, `_${retagged}$1`)
  );
  console.log(`Retagged headwords or cards: ${retagged}.`);
});
/******************************************************************************/
/**
 * Round-trips a piece of DSL text through the shared jsdom document:
 * unescaped DSL tags known to `DSLtoHTMLmap` become HTML tags, the result is
 * parsed and re-serialized via `innerHTML`, and the HTML tags known to
 * `HTMLtoDSLmap` are turned back into DSL tags.
 * NOTE(review): presumably the point is to let the HTML parser re-nest
 * improperly nested tags (the script is run as retag_misnested_dsl.js); confirm.
 *
 * Increments the module-level `retagged` counter when the trimmed text changed.
 *
 * @param {string} str - a headword line or a whole card body.
 * @returns {string} the processed text with the original leading and
 *   trailing whitespace put back.
 */
function dsl2html2dsl(str) {
  const [, startSpaces, endSpaces] = str.match(/^(\s*)[^]*?(\s*)$/);
  const trimmed = str.trim();

  // DSL tag -> HTML tag; the DSL attribute travels in a `title` attribute.
  const toHTMLTag = (match, tag, attr) => {
    if (!DSLtoHTMLmap.has(tag)) return match;
    const title = attr ? ` title='${attr}'` : '';
    return `<${DSLtoHTMLmap.get(tag)}${title}>`;
  };

  // HTML tag -> DSL tag; the `title` value becomes the DSL attribute again.
  const toDSLTag = (match, tag, attr) => {
    if (!HTMLtoDSLmap.has(tag)) return match;
    const dslAttr = attr ? ` ${attr}` : '';
    return `[${HTMLtoDSLmap.get(tag)}${dslAttr}]`;
  };

  const { body } = doc;
  // Insert the text as a text node and read it back as markup, which yields
  // the HTML-escaped form of the text.
  body.innerHTML = '';
  body.appendChild(doc.createTextNode(trimmed));
  const escapedStr = body.innerHTML;

  // Assigning to innerHTML parses the markup; reading it back serializes the tree.
  body.innerHTML = escapedStr.replace(notEscapedDSLTagRE, toHTMLTag);
  body.innerHTML = body.innerHTML.replace(HTMLTagRE, toDSLTag);

  // Only text is left at this point; textContent undoes the HTML escaping.
  const result = body.textContent;
  if (trimmed !== result) retagged++;
  return `${startSpaces}${result}${endSpaces}`;
}
/******************************************************************************/
/**
 * Guesses the text encoding of a file from its first two bytes.
 *
 * @param {string} path - path of the file to inspect.
 * @returns {?string} `'utf16le'` when the file starts with the bytes FF FE,
 *   `'utf8'` otherwise, or `null` when the file cannot be opened or read (the
 *   error message is printed to stderr).
 */
function guessEncoding(path) {
  const BOM_0 = 0xFF;
  const BOM_1 = 0xFE;
  try {
    const fd = fs.openSync(path, 'r');
    // Zero-filled, so a file shorter than two bytes never looks like a BOM.
    const bf = Buffer.alloc(2);
    try {
      fs.readSync(fd, bf, 0, 2, 0);
    } finally {
      // Release the descriptor even when the read fails.
      fs.closeSync(fd);
    }
    return bf[0] === BOM_0 && bf[1] === BOM_1 ? 'utf16le' : 'utf8';
  } catch (e) {
    console.error(`Error: ${e.message}.`);
    return null;
  }
}
/******************************************************************************/
/**
 * Creates a simple console progress bar drawn on the current stdout line.
 *
 * @param {number} [edge=0] - the value of `stat` that corresponds to 100%.
 * @returns {object} the progress bar:
 *   - `edge`: the 100% value;
 *   - `stat`: the current value, to be updated by the caller;
 *   - `start(freq)`: redraws the bar every `freq` milliseconds (500 by default);
 *   - `update(stat)`: redraws the bar once for `stat` (`this.stat` by default);
 *   - `end()`: stops the redrawing, draws the bar at 100% and ends the line;
 *   - `clear()`: stops the redrawing and erases the bar.
 */
function pb(edge = 0) {
  const DEFAULT_FREQ = 500;
  const HUNDRED_PERCENT = 100;
  const PB_LENGTH = 50;
  const PB_SCALE = HUNDRED_PERCENT / PB_LENGTH;

  // Moves the cursor to the start of the current line and erases the line.
  function clearLine() {
    rl.cursorTo(process.stdout, 0);
    rl.clearLine(process.stdout, 0);
  }

  return {
    edge,
    stat: 0,
    updater: null,
    start(freq = DEFAULT_FREQ) {
      // Drop a timer left by an earlier start() so it cannot keep running.
      clearInterval(this.updater);
      this.updater = setInterval(() => { this.update(); }, freq);
    },
    update(stat = this.stat) {
      // With nothing to process (edge of 0, e.g. an empty file) the work is
      // complete by definition; dividing by the edge would produce "NaN%".
      const rawPercent = this.edge > 0
        ? Math.ceil(stat / this.edge * HUNDRED_PERCENT)
        : HUNDRED_PERCENT;
      // Clamp to 0..100: an overshooting stat must not overflow the bar and
      // a negative one must not make String.prototype.repeat throw.
      const statPercent = Math.min(HUNDRED_PERCENT, Math.max(0, rawPercent));
      const barsNumber = Math.floor(statPercent / PB_SCALE);
      const padsNumber = PB_LENGTH - barsNumber;
      clearLine();
      process.stdout.write(
        `${'█'.repeat(barsNumber)}${' '.repeat(padsNumber)} ${statPercent}%`
      );
    },
    end() {
      clearInterval(this.updater);
      this.stat = this.edge;
      this.update();
      console.log('\n');
    },
    clear() {
      clearInterval(this.updater);
      clearLine();
    },
  };
}
/******************************************************************************/
@vsemozhetbyt
Copy link
Author

vsemozhetbyt commented Apr 24, 2016

1 . Требуется модуль jsdom версии 10+ и Node.js версии 6+. Установка модуля из командной строки (в папке скрипта или одной из его родительских папок):

npm install jsdom

Модуль основательный, грузится при установке со всеми зависимыми подмодулями ощутимое время. В начале работы скрипта тоже запускается с небольшой задержкой.

2 . Пока lookbehind assertions остаются под флагом, запускать скрипт нужно так (изменив по необходимости пути к скрипту и словарю):

node --harmony_regexp_lookbehind retag_misnested_dsl.js test.dsl

3 . Словарь может быть в кодировке UTF-8 или UTF-16 LE (кодировка распознаётся автоматически по BOM в начале файла).

Некоторые особенности реализации скрипта см. здесь.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment