// Source: GitHub gist vsemozhetbyt/bfa76deac0d374b6b276 (last active February 24, 2016 03:39).
// GitHub notice: this file contains bidirectional Unicode text that may be interpreted or
// compiled differently than what appears below. To review, open the file in an editor that
// reveals hidden Unicode characters.
/******************************************************************************/
'use strict';

// Node.js modules.
const fs = require('fs');
const pth = require('path');
const readline = require('readline');

// NW.js window and the DOM elements the script drives.
const nwWin = nw.Window.get();
const nwDoc = window.document;
const eTocFile = nwDoc.querySelector('#tocFile');
const eOutputDir = nwDoc.querySelector('#outputDir');
const eSave = nwDoc.querySelector('#save');
const eInfo = nwDoc.querySelector('#info');
const eAudio = nwDoc.querySelector('#audio');
const eBrowser = nwDoc.querySelector('#browser');

// Paths persisted in config.json, and the paths / file descriptors of the
// current run.
const config = {};
const io = {};

// Matches the positions between groups of three digits counted from the end;
// replacing them with a space formats a number with thousands separators.
const formatNumberRE = /\B(?=(?:\d{3})+$)/g;
// One hour in milliseconds: the period of the speed measurement.
const hour = 1000 * 60 * 60;
// URLs read from the TOC file that are still waiting to be saved.
const toc = [];
// A page counts as loaded once any of these selectors matches.
const selectorsToCheck = ['#content-text #newsletter a[href]',
                          '#content-text .index-words a[href]'];
// Elements removed from the page before it is converted.
const selectorsToDelete = ['#content-text #social', '#content-text #newsletter'];
// The element whose text becomes the dictionary card body.
const selectorsToSave = ['#content-text'];
// Polling period (ms) used while waiting for the page content to appear.
const checkFreq = 100;
// Headwords collected for the card currently being built.
const headwordsBuffer = new Set();

let prevURL = '';
let currURL = '';
// Length of `toc` at the previous speed measurement.
let restMark;
let speedInfo = '?/h (?/min): ~? hours left, ~? days left.';
let saving = false;
let stop = false;
let exit = false;
/******************************************************************************/
nwWin.on('close', onExit);

try {
  Object.assign(config, JSON.parse( fs.readFileSync('config.json', 'utf8') ));
} catch(e) {
  // Deliberately best-effort: a missing or unreadable config.json only means
  // that there are no remembered paths to offer.
}
// Apply each remembered path on its own, so that one missing or malformed
// value neither skips the other nor ends up as the string "undefined".
if (typeof config.tocFile === 'string' && config.tocFile) {
  eTocFile.setAttribute('nwworkingdir', pth.dirname(config.tocFile));
}
if (typeof config.outputDir === 'string' && config.outputDir) {
  eOutputDir.setAttribute('nwworkingdir', config.outputDir);
}

eTocFile.addEventListener('change', checkDirs);
eOutputDir.addEventListener('change', checkDirs);
eSave.addEventListener('click', saveDic);

checkDirs();
/******************************************************************************/
/**
 * Copies the values of the two file inputs into their tooltips, `io` and
 * `config`. When both are set the Save button is enabled and the config is
 * written to config.json; otherwise the user is asked to pick them.
 */
function checkDirs() {
  const tocPath = eTocFile.value;
  eTocFile.title = tocPath;
  io.tocFile = tocPath;
  config.tocFile = tocPath;

  const outputPath = eOutputDir.value;
  eOutputDir.title = outputPath;
  io.outputDir = outputPath;
  config.outputDir = outputPath;

  if (!(io.tocFile && io.outputDir)) {
    eInfo.textContent = 'Select the TOC file and the output directory please.';
    eSave.disabled = true;
    return;
  }

  eInfo.textContent = '';
  eSave.disabled = false;
  fs.writeFileSync('config.json', JSON.stringify(config), 'utf8');
}
/******************************************************************************/
/**
 * Click handler of the button while it reads "Stop": raises the `stop` flag,
 * which getDoc() checks before loading the next URL.
 */
function onStop() {
  stop = true;
}
/******************************************************************************/
/**
 * Handler of the NW.js window 'close' event. When nothing is being saved the
 * window is closed at once. Otherwise the user is asked to confirm; on
 * confirmation the `stop` and `exit` flags are raised so that endSaving()
 * closes the window, and on refusal nothing happens.
 */
function onExit() {
  if (!saving) {
    nwWin.close(true);
    return;
  }
  if (confirm('Do you want to exit? Dictionary saving will be suspended.')) {
    stop = exit = true;
  }
}
/******************************************************************************/
/**
 * Recomputes the speed line shown after each saved page. It is run from an
 * interval whose period is `hour`, so the number of URLs removed from `toc`
 * since the previous call is the hourly rate. Updates `restMark` and
 * `speedInfo`, then plays the notification sound.
 */
function setSpeedInfo() {
  const donePerHour = restMark - toc.length;
  const donePerMin = Math.round(donePerHour / 60);
  restMark = toc.length;

  // With no progress in the last hour the division below would produce
  // Infinity (or NaN for an empty queue); show '?' instead, like the initial
  // value of speedInfo does.
  let hoursLeft = '?';
  let daysLeft = '?';
  if (donePerHour > 0) {
    hoursLeft = Math.round(restMark / donePerHour);
    daysLeft = Math.round(hoursLeft / 24);
  }

  speedInfo = `${donePerHour}/h (${donePerMin}/min): ~${hoursLeft} hours left, ~${daysLeft} days left.`;
  eAudio.play();
}
/******************************************************************************/
/**
 * Appends `str` as a new line to the info box. Once the box holds eleven or
 * more newline-terminated lines, everything before the last ten lines is
 * dropped and the box is scrolled to the bottom.
 *
 * @param {*} str - Text of the new line (converted to a string).
 */
function updateInfo(str) {
  const hasElevenLines = /(?:.*\n){11}/;
  const allButLastTen = /[^]+((?:^.*\n){10})/m;

  eInfo.textContent = `${eInfo.textContent}${str}\n`;

  const shown = eInfo.textContent;
  if (!hasElevenLines.test(shown)) {
    return;
  }
  eInfo.textContent = shown.replace(allButLastTen, '$1');
  eInfo.scrollTop = eInfo.scrollHeight;
}
/******************************************************************************/
/**
 * 'error' listener of the browser frame. Appends a record to the error log
 * unless the current URL equals the previous one (i.e. the same URL is being
 * retried), and always plays the notification sound.
 *
 * @param {Event} evt - The error event.
 */
function logError(evt) {
  if (currURL !== prevURL) {
    // JSON.stringify() of a DOM Event only yields {"isTrusted":true} because
    // the useful fields are not own enumerable properties, so pick them
    // explicitly. Undefined fields are omitted by JSON.stringify.
    const source = evt || {};
    const details = JSON.stringify({
      type: source.type,
      target: source.target ? source.target.src : undefined,
      message: source.message
    });
    fs.writeSync(
      io.errFile,
      `Iframe error (${new Date()}).\n${currURL}\n${details}\n`,
      null, 'utf8'
    );
  }
  eAudio.play();
}
/******************************************************************************/
/**
 * Escapes the characters \ [ ] { } @ ^ ~ < > # ( ) by putting a backslash in
 * front of each one, so that they are written to the dictionary as literal
 * text.
 *
 * @param {string} str - Raw text.
 * @returns {string} The text with every listed character backslash-escaped.
 */
function secureLow(str) {
  return str.replace(/[\\\[\]{}@^~<>#()]/g, '\\$&');
}
/******************************************************************************/
/**
 * Normalizes whitespace of a text that is about to be written to the
 * dictionary file.
 *
 * The text is always trimmed and runs of spaces/tabs are collapsed to one
 * space. Unless `isHeadword` is truthy, it is additionally treated as a
 * multi-line body: spaces/tabs at both ends of every line are removed, every
 * line after the first is prefixed with a tab, and each run of empty lines is
 * replaced by a single line holding `\ ` (backslash + space).
 *
 * @param {string} str - Text to normalize.
 * @param {boolean} [isHeadword] - Truthy for single-line headwords.
 * @returns {string} The normalized text.
 */
function secureHigh(str, isHeadword) {
  const collapsed = str.trim().replace(/[ \t]{2,}/g, ' ');
  if (isHeadword) {
    return collapsed;
  }
  return collapsed
    .replace(/^[ \t]+|[ \t]+$/gm, '')
    .replace(/\n/g, '\n\t')
    .replace(/(?:\n\t){2,}/g, '\n\t\\ \n\t');
}
/******************************************************************************/
/**
 * Click handler of the button while it reads "Save": starts (or resumes) a
 * saving run.
 *
 * Switches the UI into "saving" state, opens the dictionary, progress-log and
 * error-log files in the output directory (all in append mode), writes the
 * DSL header if the dictionary file is empty, reads the TOC file into `toc`,
 * drops everything up to and including the last URL recorded in the progress
 * log, and requests the first remaining URL.
 */
function saveDic() {
  saving = true;
  // A previous run may have ended through the Stop button; without this reset
  // getDoc() would end the new run immediately.
  stop = false;
  // Leftovers of a previous run must not be mixed with the freshly read TOC.
  toc.length = 0;

  eSave.removeEventListener('click', saveDic);
  eSave.textContent = 'Stop';
  eSave.addEventListener('click', onStop);
  eTocFile.disabled = true;
  eOutputDir.disabled = true;

  try {
    fs.accessSync(io.tocFile);
  } catch(e) {
    eInfo.textContent = `TOC file not available: '${e}.'`;
    // endSaving() restores the button and the inputs, clears `saving` and
    // plays the notification sound. Nothing below can work without the file.
    endSaving();
    return;
  }

  io.dicFile = fs.openSync(pth.join(io.outputDir, 'WordSpy.dic.dsl'), 'a');
  io.logFile = fs.openSync(pth.join(io.outputDir, 'WordSpy.dic.log'), 'a+');
  io.errFile = fs.openSync(pth.join(io.outputDir, 'WordSpy.dic.errors.log'), 'a');

  if (fs.fstatSync(io.dicFile).size === 0) {
    // New dictionary: byte order mark followed by the DSL header lines.
    const dslHeader = [
      '#NAME "Word Spy 2016 (Eng-Eng)"',
      '#INDEX_LANGUAGE "English"',
      '#CONTENTS_LANGUAGE "English"',
      ''
    ].join('\n');
    fs.writeSync(io.dicFile, '\uFEFF' + dslHeader, null, 'utf8');
  }

  // Starts the hourly speed measurement and requests the first queued URL.
  const startFetching = () => {
    restMark = toc.length;
    // Keep exactly one speed timer alive, however many runs are started.
    if (io.speedTimer) global.clearInterval(io.speedTimer);
    io.speedTimer = global.setInterval(setSpeedInfo, hour);
    io.speedTimer.unref();
    updateInfo(
      nwDoc.title = nwWin.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`
    );
    getDoc(toc.shift());
  };

  eInfo.textContent = 'Reading the TOC file...\n';

  const tocReader = readline.createInterface({
    input: fs.createReadStream(io.tocFile, 'utf8'),
    terminal: false,
    historySize: 0
  });

  tocReader.on('line', line => {
    line = line.trim();
    if (line) toc.push(line);
  }).on('close', () => {
    if (!toc.length) {
      eInfo.textContent = 'No URLs found.';
      endSaving();
      return;
    }

    eBrowser.addEventListener('load', checkDoc);
    eBrowser.addEventListener('error', logError);

    if (fs.fstatSync(io.logFile).size === 0) {
      startFetching();
      return;
    }

    // Resume: find the last URL recorded in the progress log and skip
    // everything up to and including it.
    const logReader = readline.createInterface({
      input: fs.createReadStream(null, {encoding: 'utf8', fd: io.logFile, autoClose: false}),
      terminal: false,
      historySize: 0
    });
    updateInfo('Reading the log file...');
    let lastLine;
    logReader.on('line', line => {
      line = line.trim();
      if (line) lastLine = line;
    }).on('close', () => {
      toc.splice(0, toc.indexOf(lastLine) + 1);
      startFetching();
    });
  });
}
/******************************************************************************/
/**
 * Loads `url` into the browser frame, remembering the previously requested
 * URL in `prevURL`. If a stop was requested, nothing is loaded and the run is
 * ended instead.
 *
 * @param {string} url - The page to load.
 */
function getDoc(url) {
  if (!stop) {
    prevURL = currURL;
    currURL = url;
    updateInfo(` ${url}`);
    eBrowser.src = url;
    return;
  }

  updateInfo(nwDoc.title = nwWin.title = 'Stop on demand.');
  endSaving();
}
/******************************************************************************/
/**
 * 'load' listener of the browser frame. Polls the loaded document every
 * `checkFreq` ms until one of `selectorsToCheck` matches, then hands the page
 * to processDoc(). After more than 50 unsuccessful checks it gives up: if the
 * frame still shows the requested URL the page is requested again, otherwise
 * the problem is written to the error log and the run is ended.
 */
function checkDoc() {
  const maxChecks = 50;
  // The selector list never changes, so build it once instead of on each tick.
  const readySelector = selectorsToCheck.join(', ');

  const iWin = eBrowser.contentWindow;
  const iDoc = iWin.document;
  const iLoc = iWin.location.href;
  let iter = 0;

  const checker = global.setInterval(() => {
    iter++;

    if (iDoc.querySelector(readySelector)) {
      global.clearInterval(checker);
      processDoc(iWin, iDoc, iLoc, iter);
      return;
    }
    if (iter <= maxChecks) {
      return;
    }

    global.clearInterval(checker);
    if (iLoc === currURL) {
      eAudio.play();
      updateInfo( nwDoc.title = nwWin.title = 'HTTP error. Retrying...' );
      getDoc(currURL);
    } else {
      fs.writeSync(
        io.errFile,
        `Something wrong (${new Date()}).\n${currURL}\n`,
        null, 'utf8'
      );
      updateInfo( nwDoc.title = nwWin.title = 'Something wrong...' );
      endSaving();
    }
  }, checkFreq);
}
/******************************************************************************/
/**
 * Converts the loaded page into one DSL card and appends it to the dictionary
 * file, records the page URL in the progress log and requests the next URL
 * (or ends the run when the queue is empty).
 *
 * The conversion edits the live DOM: unwanted elements are removed, special
 * characters in all text nodes are escaped, DSL tags are inserted as text
 * around the relevant elements, and finally `innerText` of the content
 * element is taken as the card body. The order of the passes matters because
 * every pass changes the text seen by the following ones.
 *
 * @param {Window} iWin - Window of the browser frame.
 * @param {Document} iDoc - Document of the browser frame.
 * @param {string} iLoc - URL of the loaded page.
 * @param {number} iter - Number of readiness checks the page needed.
 */
function processDoc(iWin, iDoc, iLoc, iter) {
  selectorsToDelete.forEach(s => {
    const e = iDoc.querySelector(s);
    if (e) e.parentNode.removeChild(e);
  });

  const elmToSave = iDoc.querySelector(selectorsToSave.join(', '));

  // All descendants of the content element matching `selector`, as an array.
  const all = selector => Array.from(elmToSave.querySelectorAll(selector));

  // Puts `open` before the first child and `close` after the last child.
  const wrap = (elm, open, close) => {
    elm.insertAdjacentHTML('afterbegin', open);
    elm.insertAdjacentHTML('beforeend', close);
  };

  // Wraps every element in `open`/`close` unless the element itself or one of
  // its ancestors was wrapped before (tracked by the marker attribute). The
  // element itself must be checked too: e.g. a span.italicized handled by the
  // selector pass is found again by the computed-style pass.
  const wrapOnce = (elms, markerAttr, open, close) => {
    elms.forEach(elm => {
      const alreadyWrapped = iDoc.evaluate(
        `./ancestor-or-self::*[@${markerAttr}]`, elm,
        null, XPathResult.FIRST_ORDERED_NODE_TYPE, null
      ).singleNodeValue;
      if (alreadyWrapped) return;
      wrap(elm, open, close);
      elm.setAttribute(markerAttr, '');
    });
  };

  // Queues a headword, skipping empty ones: an empty headword would put a
  // blank line into the headword list of the card.
  const addHeadword = (text, prefix) => {
    const headword = secureHigh(text, true);
    if (headword) headwordsBuffer.add((prefix || '') + headword);
  };

  // Computed font-weight is reported either as the keyword or as a number.
  const isBold = elm => {
    const weight = iWin.getComputedStyle(elm).fontWeight;
    return weight === 'bold' || Number.parseInt(weight, 10) >= 700;
  };

  // Escape the special characters in every text node before any markup is
  // added, so that only the markup inserted below stays unescaped.
  const textNodes = iDoc.evaluate(
    './/text()', elmToSave, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null
  );
  for (let i = 0; i < textNodes.snapshotLength; i++) {
    const t = textNodes.snapshotItem(i);
    t.textContent = secureLow(t.textContent);
  }

  // Headwords: the page title plus, for word pages, synonyms and inflections.
  const headerText = secureLow(iDoc.querySelector('#content-header').innerText);
  if (iLoc.startsWith('http://www.wordspy.com/index.php?word=')) {
    addHeadword(headerText);

    const synonyms = elmToSave.querySelector('#synonyms .word-meta-data');
    if (synonyms) {
      synonyms.innerText.split('·').forEach(synonym => { addHeadword(synonym); });
    }

    const inflections = elmToSave.querySelector('#inflections .word-meta-data');
    if (inflections) {
      // Temporarily hide the italicized abbreviations, presumably so that
      // innerText leaves them out of the inflected forms.
      const abbrvs = all('#inflections .italicized');
      abbrvs.forEach(abbrv => { abbrv.style.visibility = 'hidden'; });
      inflections.innerText.split('·').forEach(form => {
        addHeadword(form.trim().replace(/\s+\.$/, ''));
      });
      abbrvs.forEach(abbrv => { abbrv.style.visibility = 'visible'; });
    }
  } else {
    addHeadword(headerText, '\\# ');
  }

  // Line breaks around block-level elements.
  all('blockquote, div, h3, ol, p, table, ul').forEach(elm => {
    elm.insertAdjacentHTML('beforebegin', '<br><br>');
    elm.insertAdjacentHTML('afterend', '<br><br>');
  });
  all('hr').forEach(elm => {
    elm.insertAdjacentHTML('afterend', '<br><br>----------<br><br>');
  });
  all('iframe').forEach(elm => {
    elm.insertAdjacentHTML('afterend',
      '<br><br>\\[Embeded video or page. See on the site.\\]<br><br>');
  });

  // Quotation marks, list markers and literal pseudo-tags.
  all('blockquote').forEach(elm => {
    // An empty blockquote has no lastChild at all.
    const last = elm.lastChild;
    if (last && last.tagName === 'BR') elm.removeChild(last);
    wrap(elm, '“', '”');
  });
  all('ul').forEach(elm => {
    Array.from(elm.querySelectorAll('li')).forEach(li => {
      li.insertAdjacentHTML('afterbegin', '• ');
    });
  });
  all('ol').forEach(elm => {
    Array.from(elm.querySelectorAll('li')).forEach((li, i) => {
      li.insertAdjacentHTML('afterbegin', `${i + 1}. `);
    });
  });
  all('smirk, flame').forEach(elm => {
    const tag = elm.tagName.toLowerCase();
    wrap(elm, `\\<${tag}\\>`, `\\</${tag}\\>`);
  });
  all('span.add-separator').forEach(elm => {
    elm.insertAdjacentHTML('afterbegin', ' · ');
  });

  // Headings and citation years.
  all('h3').forEach(elm => { wrap(elm, '[b][c steelblue]', '[/c][/b]'); });
  all('div.word-citation-year').forEach(elm => { wrap(elm, '[b][c gray]', '[/c][/b]'); });

  // Italic and bold, first by tag/class and then by computed style.
  wrapOnce(all('i, em, span.italicized'), 'data-dsl-i', '[i]', '[/i]');
  wrapOnce(
    all('span').filter(elm => iWin.getComputedStyle(elm).fontStyle !== 'normal'),
    'data-dsl-i', '[i]', '[/i]'
  );
  wrapOnce(all('b, span.headword'), 'data-dsl-b', '[b]', '[/b]');
  wrapOnce(all('span').filter(isBold), 'data-dsl-b', '[b]', '[/b]');

  all('sup').forEach(elm => { wrap(elm, '[sup]', '[/sup]'); });

  // Links: bare URLs, references to other cards, and everything else.
  const urlPrefixRE = /http:\/\/(?:www\.)?/;
  all('a[href]').forEach(elm => {
    if (elm.href.replace(urlPrefixRE, '') ===
        elm.innerText.trim().replace(urlPrefixRE, '')) {
      wrap(elm, '[url]', '[/url]');
    } else if (/http:\/\/www\.wordspy\.com\/index\.php\?word=/.test(elm.href)) {
      wrap(elm, '[ref]', '[/ref]');
    } else if (/http:\/\/www\.wordspy\.com\/index\.php\?tag=/.test(elm.href)) {
      wrap(elm, '[ref]\\# ', '[/ref]');
    } else {
      if (elm.innerText.trim()) {
        wrap(elm, '[u]', '[/u]');
      }
      elm.insertAdjacentHTML('beforeend', ` ([url]${secureLow(elm.href)}[/url])`);
    }
  });
  all('img').forEach(elm => {
    elm.insertAdjacentHTML('afterend',
      `<br><br>[url]${secureLow(elm.src)}[/url]<br><br>`);
  });
  all('div.word-citation').forEach(elm => { wrap(elm, '[m2]', '[/m]'); });

  // One card: the headwords, one per line, then the body. secureHigh() puts a
  // tab in front of every body line except the first, so the first line gets
  // its tab here.
  // NOTE(review): this leading tab is reconstructed (the indentation of the
  // original template line was lost) - confirm against a known-good .dsl file.
  const headwords = Array.from(headwordsBuffer).join('\n');
  fs.writeSync(
    io.dicFile,
    `${headwords}\n\t${ secureHigh(elmToSave.innerText) }\n`,
    null, 'utf8'
  );
  fs.writeSync(io.logFile, `${iLoc}\n`, null, 'utf8');
  updateInfo( ` ${(iter * checkFreq / 1000).toFixed(1)} s. ${speedInfo}` );
  headwordsBuffer.clear();

  if (toc.length) {
    updateInfo(
      nwDoc.title = nwWin.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
    getDoc(toc.shift());
  } else {
    updateInfo( nwDoc.title = nwWin.title = 'Saving complete.' );
    endSaving();
  }
}
/******************************************************************************/
/**
 * Ends a saving run: closes the open output files, detaches the frame
 * listeners, turns the button back into "Save", re-enables the inputs, clears
 * the `saving` flag, blanks the frame and plays the notification sound.
 * If an exit was requested meanwhile, the window is closed.
 */
function endSaving() {
  const handles = ['dicFile', 'logFile', 'errFile'];
  handles.forEach(key => {
    if (io[key]) {
      fs.closeSync(io[key]);
    }
  });
  handles.forEach(key => {
    io[key] = null;
  });

  eBrowser.removeEventListener('load', checkDoc);
  eBrowser.removeEventListener('error', logError);

  eSave.removeEventListener('click', onStop);
  eSave.textContent = 'Save';
  eSave.addEventListener('click', saveDic);

  eTocFile.disabled = false;
  eOutputDir.disabled = false;
  saving = false;

  eBrowser.src = 'about:blank';
  eAudio.play();

  if (!exit) {
    return;
  }
  nwWin.close(true);
}
/******************************************************************************/
// GitHub page footer: "Sign up for free to join this conversation on GitHub.
// Already have an account? Sign in to comment"