Last active
April 26, 2017 20:53
-
-
Save vsemozhetbyt/99132f861d8ab9505d7a9c1988e7b544 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/******************************************************************************/ | |
'use strict'; | |
/******************************************************************************/ | |
// The dictionary path is the first user-supplied command-line argument
// (process.argv[0] is the node binary, process.argv[1] is this script).
const fpath = process.argv[2];

// Without a path there is nothing to process: report and stop.
if (!fpath) {
  console.error('Add the file path, please.');
  process.exit(1);
}
/******************************************************************************/ | |
const fs = require('fs'); | |
const rl = require('readline'); | |
// Input encoding is detected from the file's first bytes; the output file
// is written in the same encoding. A null result means the file could not
// be read (guessEncoding has already reported the error).
const inCoding = guessEncoding(fpath);
if (!inCoding) process.exit(1);
const outCoding = inCoding;

// Matches the optional extension at the very end of the path, or the empty
// string at the end when there is none. Path separators are excluded from
// the extension so that a dot in a directory name ('./dict', 'dir.v2/dict')
// is never mistaken for the start of the file extension.
const toChangeFileNameRE = /(\.[^.\\/]+)?$/;
const bomRE = /^\uFEFF/;

const rli = rl.createInterface({ input: fs.createReadStream(fpath, inCoding) });

// 'name.dsl' -> 'name.retagged.dsl'; the output always starts with a BOM.
const outFileName = fpath.replace(toChangeFileNameRE, '.retagged$1');
const outFile = fs.openSync(outFileName, 'w');
fs.writeSync(outFile, '\uFEFF', null, outCoding);
/******************************************************************************/ | |
// Raw-template shorthand: regexp escapes can be written without doubling
// the backslashes.
const r = String.raw;

// Line classifiers: a directive starts with '#', a headword with any other
// non-whitespace character, a card body line with whitespace.
const isDirective = /^#/;
const isHeadword = /^[^#\s]/;
const isBody = /^\s/;

// Body lines of the card currently being read.
let cardBuf = '';

// Pattern fragments (\x5c is a backslash, \x5b and \x5d are the square
// brackets).
const startOfString = '^';
const notEscapeSymbol = r`[^\x5c]`;
// A run of doubled backslashes that begins at the string start or right
// after a non-backslash character.
const escapedEscapeSymbols = [
  '(?:', startOfString, '|', notEscapeSymbol, ')',
  r`(?:\x5c{2})+`,
].join('');
// '[name]' or '[name attr]': group 1 is the tag name, group 2 the attribute.
const DSLTag = r`\x5b([^\x5d]+?)(?: ([^\x5d]+))*\x5d`;

// A DSL tag whose opening bracket is not escaped by a backslash: the
// lookbehind accepts the string start, a non-backslash character, or a run
// of doubled backslashes.
const notEscapedDSLTagRE = new RegExp(
  `(?<=${[startOfString, notEscapeSymbol, escapedEscapeSymbols].join('|')})${DSLTag}`,
  'g'
);

// '<name>' or '<name title="attr">': group 1 is the tag name, group 2 the
// title value.
const HTMLTagRE = /<([^>]+?)(?: title="([^\x22]+)")*>/g;
// DSL tag name -> stand-in HTML tag name. Each DSL tag gets its own HTML
// element so the mapping can be inverted without ambiguity.
const DSLtoHTMLmap = new Map([
  ['b', 'b'],
  ['i', 'i'],
  ['u', 'u'],
  ['sub', 'small'],
  ['sup', 'big'],
  ['p', 'a'],
  ['!trs', 'strike'],
  ['ex', 'tt'],
  ['com', 'code'],
  ['trn', 'strong'],
  ["'", 'nobr'],
  ['*', 's'],
  ['c', 'em'],
  ['lang', 'font'],
]);

// Register the closing form of every tag ('/b' -> '/b', '/sub' -> '/small').
// The entries are snapshotted first because the map grows inside the loop.
for (const [dslTag, htmlTag] of [...DSLtoHTMLmap]) {
  DSLtoHTMLmap.set(`/${dslTag}`, `/${htmlTag}`);
}

// Inverse lookup: stand-in HTML tag name -> DSL tag name.
const HTMLtoDSLmap = new Map();
for (const [dslTag, htmlTag] of DSLtoHTMLmap) {
  HTMLtoDSLmap.set(htmlTag, dslTag);
}

// Number of headwords or cards whose text changed during the round trip.
let retagged = 0;
let retagged = 0; | |
/******************************************************************************/ | |
const { JSDOM } = require('jsdom'); | |
// One jsdom document for the whole run; its <body> is used as a scratch
// area and starts out empty.
const { document: doc } = new JSDOM().window;
doc.body.innerHTML = '';
/******************************************************************************/ | |
console.log('Processing dictionary...\n');

// The progress bar measures bytes consumed against the input file size.
const pbRead = pb(fs.statSync(fpath).size);
pbRead.start();

// Size of one line feed in the input encoding (1 byte in utf8, 2 in
// utf16le). readline strips line terminators, so they are added back to the
// byte count here. A '\r' of a CRLF pair is still not counted, which only
// makes the bar lag slightly; end() forces it to 100 %.
const eolBytes = Buffer.byteLength('\n', inCoding);

let lineNumber = 0;

rli.on('line', (line) => {
  pbRead.stat += Buffer.byteLength(line, inCoding) + eolBytes;

  // The BOM, if any, sits at the start of the first line only; the output
  // file has already received its own BOM.
  if (++lineNumber === 1) {
    line = line.replace(bomRE, ''); // eslint-disable-line no-param-reassign
  }

  if (isDirective.test(line)) {
    // Directives are copied through untouched.
    fs.writeSync(outFile, `${line}\n`, null, outCoding);
  } else if (isHeadword.test(line)) {
    // A headword closes the previous card: flush it before the headword.
    if (cardBuf) {
      fs.writeSync(outFile, `${dsl2html2dsl(cardBuf)}`, null, outCoding);
      cardBuf = '';
    }
    fs.writeSync(outFile, `${dsl2html2dsl(line)}\n`, null, outCoding);
  } else if (isBody.test(line)) {
    // Body lines are accumulated so the card is processed as a whole.
    cardBuf += `${line}\n`;
  } else if (!line) {
    // An empty line inside a card stays part of the card; otherwise it is
    // written straight to the output.
    if (cardBuf) cardBuf += '\n';
    else fs.writeSync(outFile, '\n', null, outCoding);
  }
}).on('close', () => {
  // Flush the last card, which has no following headword to trigger it.
  if (cardBuf) {
    fs.writeSync(outFile, `${dsl2html2dsl(cardBuf)}`, null, outCoding);
  }
  pbRead.end();
  fs.closeSync(outFile);

  // Put the number of changed items into the final file name.
  fs.renameSync(
    outFileName, outFileName.replace(toChangeFileNameRE, `_${retagged}$1`)
  );
  console.log(`Retagged headwords or cards: ${retagged}.`);
});
/******************************************************************************/ | |
/**
 * Round-trips a DSL fragment through the shared jsdom document: known DSL
 * tags are rewritten as stand-in HTML tags, the markup is assigned to
 * `innerHTML`, read back, and the HTML tags are rewritten as DSL tags again.
 * NOTE(review): the point is presumably to let the HTML parser re-nest
 * misnested tags; that behavior comes from jsdom, not from this function.
 *
 * Increments the module-level `retagged` counter when the result differs
 * from the trimmed input.
 *
 * @param {string} str - A headword line or the accumulated body of a card.
 * @returns {string} The processed text with the original leading and
 *   trailing whitespace restored.
 */
function dsl2html2dsl(str) {
  const { body } = doc;
  const trimmed = str.trim();

  // Remember the surrounding whitespace: only the trimmed core goes through
  // the DOM.
  const padding = /^(\s*)[^]*?(\s*)$/.exec(str);
  const leadingPad = padding[1];
  const trailingPad = padding[2];

  // Storing the text as a text node makes innerHTML return it HTML-escaped.
  body.textContent = trimmed;

  // DSL -> HTML; the DSL attribute is carried in a title attribute.
  const dslTagToHtml = (match, tag, attr) => {
    if (!DSLtoHTMLmap.has(tag)) return match;
    const title = attr ? ` title='${attr}'` : '';
    return `<${DSLtoHTMLmap.get(tag)}${title}>`;
  };
  body.innerHTML = body.innerHTML.replace(notEscapedDSLTagRE, dslTagToHtml);

  // HTML -> DSL, applied to the markup as serialized back by the document.
  const htmlTagToDsl = (match, tag, attr) => {
    if (!HTMLtoDSLmap.has(tag)) return match;
    const suffix = attr ? ` ${attr}` : '';
    return `[${HTMLtoDSLmap.get(tag)}${suffix}]`;
  };
  body.innerHTML = body.innerHTML.replace(HTMLTagRE, htmlTagToDsl);

  // textContent decodes the entities introduced by the escaping above.
  const result = body.textContent;
  if (trimmed !== result) retagged += 1;

  return `${leadingPad}${result}${trailingPad}`;
}
/******************************************************************************/ | |
/**
 * Guesses the text encoding of a file from its first two bytes.
 *
 * @param {string} path - Path of the file to inspect.
 * @returns {?string} 'utf16le' when the file starts with the bytes FF FE
 *   (the UTF-16LE byte order mark), 'utf8' otherwise, or null when the file
 *   cannot be opened or read (the error is printed to stderr).
 */
function guessEncoding(path) {
  const BOM_0 = 0xFF;
  const BOM_1 = 0xFE;
  try {
    const fd = fs.openSync(path, 'r');
    // Zero-filled, so a file shorter than two bytes never matches the BOM.
    const bf = Buffer.alloc(2);
    try {
      fs.readSync(fd, bf, 0, 2, 0);
    } finally {
      // Release the descriptor even when the read fails.
      fs.closeSync(fd);
    }
    return bf[0] === BOM_0 && bf[1] === BOM_1 ? 'utf16le' : 'utf8';
  } catch (e) {
    console.error(`Error: ${e.message}.`);
    return null;
  }
}
/******************************************************************************/ | |
/**
 * Creates a console progress bar that redraws itself on one stdout line.
 *
 * The caller advances `stat` while working; the bar shows `stat` as a
 * percentage of `edge`.
 *
 * @param {number} [edge=0] - The value of `stat` that corresponds to 100 %.
 * @returns {{edge: number, stat: number, start: Function, update: Function,
 *   end: Function, clear: Function}} The progress bar object.
 */
function pb(edge = 0) {
  const DEFAULT_FREQ = 500; // redraw interval, milliseconds
  const HUNDRED_PERCENT = 100;
  const PB_LENGTH = 50; // bar width, characters
  const PB_SCALE = HUNDRED_PERCENT / PB_LENGTH; // percent per bar character

  // Moves the cursor to column 0 and wipes the current line.
  function clearLine() {
    rl.cursorTo(process.stdout, 0);
    rl.clearLine(process.stdout, 0);
  }

  return {
    edge,
    stat: 0,

    /** Starts redrawing the bar every `freq` milliseconds. */
    start(freq = DEFAULT_FREQ) {
      this.updater = setInterval(() => { this.update(); }, freq);
    },

    /** Redraws the bar for `stat` (the current `this.stat` by default). */
    update(stat = this.stat) {
      // With nothing to process (edge of 0, e.g. an empty file) the division
      // would give NaN or Infinity, so the bar is shown as complete.
      const ratio = this.edge > 0 ? stat / this.edge : 1;
      // Clamp to 0..100: String.prototype.repeat throws on a negative count.
      const statPercent = Math.min(
        HUNDRED_PERCENT,
        Math.max(0, Math.ceil(ratio * HUNDRED_PERCENT))
      );
      const barsNumber = Math.floor(statPercent / PB_SCALE);
      const padsNumber = PB_LENGTH - barsNumber;
      clearLine();
      process.stdout.write(
        `${'█'.repeat(barsNumber)}${' '.repeat(padsNumber)} ${statPercent}%`
      );
    },

    /** Stops the timer, draws the bar at 100 % and moves below it. */
    end() {
      clearInterval(this.updater);
      this.stat = this.edge;
      this.update();
      console.log('\n');
    },

    /** Stops the timer and erases the bar. */
    clear() {
      clearInterval(this.updater);
      clearLine();
    },
  };
}
/******************************************************************************/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
1. Требуется модуль jsdom версии 10+ и Node.js версии 6+. Установка модуля из командной строки (в папке скрипта или одной из его родительских папок): npm install jsdom
Модуль основательный, грузится при установке со всеми зависимыми подмодулями ощутимое время. В начале работы скрипта тоже запускается с небольшой задержкой.
2. Пока lookbehind assertions остаются под флагом, запускать скрипт нужно так (изменив по необходимости пути к скрипту и словарю):
node --harmony_regexp_lookbehind retag_misnested_dsl.js test.dsl
Некоторые особенности реализации скрипта см. здесь.