Last active
December 9, 2017 16:01
-
-
Save vsemozhetbyt/bf247ba23378598420f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict';
/******************************************************************************/
console.log('Requiring modules...');
const fs = require('fs');
const path = require('path');
const readline = require('readline');
const exec = require('child_process').execFile;
const execSync = require('child_process').execFileSync;
const jsdom = require('jsdom');

// Input directory (first CLI argument, defaults to the script directory);
// it must contain the URL list UD.toc.txt.
const inDir = process.argv[2] || __dirname;
try {
  fs.accessSync(path.join(inDir, 'UD.toc.txt'));
} catch (e) {
  console.error('TOC file not found.');
  // Report the failure through the exit status instead of exiting with 0.
  process.exit(1);
}

// Line reader over the TOC file; its 'line'/'close' handlers are attached further down.
const rl = readline.createInterface({
  input: fs.createReadStream(path.join(inDir, 'UD.toc.txt'), {encoding: 'utf8'}),
  terminal: false,
  historySize: 0
});

// Output directory (second CLI argument, defaults to the script directory).
const outDir = process.argv[3] || __dirname;
// All three outputs are opened in append mode so that an interrupted run can be
// resumed; the log is opened 'a+' because it is also read back on start-up.
const dicFile = fs.openSync(path.join(outDir, 'UD.dic.dsl'), 'a');
const logFile = fs.openSync(path.join(outDir, 'UD.dic.log'), 'a+');
const errFile = fs.openSync(path.join(outDir, 'UD.dic.errors.log'), 'a');

// Matches the positions between groups of three digits (used to insert
// thousands separators into the "Rest: N." counter).
const formatNumberRE = /\B(?=(?:\d{3})+$)/g;
// One hour in milliseconds: the period of the speed statistics timer.
const hour = 1000 * 60 * 60;

// URLs that still have to be processed.
const toc = [];
// Headwords and card bodies collected for the term being processed (across all
// of its pages) until they are flushed to the dictionary file.
const headwordsBuffer = new Set();
const cardBodyBuffer = [];
// Previous and current request URLs; comparing them keeps a repeatedly failing
// URL from being written to the error log more than once.
let prevURL = '';
let currURL = '';
// TOC length at the moment of the previous speed measurement.
let restMark;
// Human-readable speed/ETA text appended to the per-term console line.
let speedInfo = '?/h (?/min): ~? hours left, ~? days left.';
// Set by the SIGINT handler; checked before each new request.
let terminate = false;
/******************************************************************************/
// On exit: release the three output descriptors, then play the audible alert.
process.on('exit', () => {
  for (const fd of [dicFile, logFile, errFile]) {
    fs.closeSync(fd);
  }
  playAlert();
});

// Ctrl+C only raises a flag, so the request in flight can finish; the flag is
// checked in getDoc() before the next request is started.
process.on('SIGINT', () => {
  terminate = true;
});
/******************************************************************************/
// A brand-new (still empty) dictionary file gets a BOM followed by the DSL
// header lines; a file that already has content is left untouched.
if (fs.fstatSync(dicFile).size === 0) {
  const dslHeader = [
    '#NAME "Urban Dictionary 2015 (Eng-Eng)"',
    '#INDEX_LANGUAGE "English"',
    '#CONTENTS_LANGUAGE "English"',
    '',
  ].join('\n');
  fs.writeSync(dicFile, `\uFEFF${dslHeader}`, null, 'utf8');
}
/******************************************************************************/
// Load the TOC, skip everything that the log marks as already done, and start
// the request chain.
console.log('Reading the TOC file...');
rl.on('line', line => {
  const url = line.trim();
  if (url) toc.push(url);
}).on('close', () => {
  if (!toc.length) {
    console.error('No URLs found.');
    process.exit(1);
  }

  // Starts the hourly statistics timer and requests the first pending URL.
  const startScraping = () => {
    if (!toc.length) {
      // The log already covers the whole TOC: there is no URL to request.
      console.log('Nothing left to process.');
      process.exit();
    }
    restMark = toc.length;
    // unref() keeps the timer from holding the process open on its own.
    setInterval(setSpeedInfo, hour).unref();
    console.log(process.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
    getDoc(toc.shift());
  };

  if (fs.fstatSync(logFile).size === 0) {
    startScraping();
    return;
  }

  // Resume: the last non-empty log line is the last page that was fully saved.
  const logReader = readline.createInterface({
    // autoClose: false keeps the descriptor open for later appends.
    input: fs.createReadStream(null, {encoding: 'utf8', fd: logFile, autoClose: false}),
    terminal: false,
    historySize: 0
  });
  let lastLine;
  console.log('Reading the log file...');
  logReader.on('line', line => {
    const entry = line.trim();
    if (entry) lastLine = entry;
  }).on('close', () => {
    // A log that holds only blank lines gives no resume point: keep the full TOC.
    if (lastLine !== undefined) {
      // Logged URLs may carry a "&page=N" suffix that the TOC entries lack.
      // An entry that is not found (-1) removes nothing.
      toc.splice(0, toc.indexOf(lastLine.replace(/&page=\d+$/, '')) + 1);
    }
    startScraping();
  });
});
/******************************************************************************/
/**
 * Plays the alert sound through ffplay and blocks until the player exits.
 *
 * The alert is best-effort: when the player cannot be started or fails, the
 * problem is reported on stderr and the caller continues normally.
 */
function playAlert() {
  try {
    execSync(
      'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
      ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
        'c:\\WINDOWS\\Media\\Windows Ringin.wav']
    );
  } catch (err) {
    // A missing player must not abort the scraper or the exit handler.
    console.error(`Alert sound failed: ${err.message}`);
  }
}
/******************************************************************************/
/**
 * Makes scraped text safe to write into the dictionary file.
 *
 * - Control characters U+0000-U+001F (except LF and CR) and U+2028 are replaced
 *   by their percent-encoded form surrounded by spaces.
 * - Runs of spaces are collapsed into a single space.
 * - Unless the text is a headword, every unbroken run of 256 or more
 *   non-whitespace UTF-16 units is cut to at most 250 units plus "[...]".
 *
 * @param {string} str - Raw text taken from the page.
 * @param {boolean} isHeadword - When true, long runs are left uncut.
 * @returns {string} The sanitised text.
 */
function secure(str, isHeadword) {
  const keptUnits = 250;
  let result = str
    .replace(/[\x00-\x09\x0b\x0c\x0e-\x1f\u2028]/g, ch => ` ${encodeURIComponent(ch)} `)
    .replace(/ +/g, ' ');
  if (!isHeadword) {
    result = result.replace(/\S{256,}/g, run => {
      // Never cut between the two halves of a surrogate pair: a lone high
      // surrogate would be written out as U+FFFD.
      const lastUnit = run.charCodeAt(keptUnits - 1);
      const endsOnHighSurrogate = lastUnit >= 0xD800 && lastUnit <= 0xDBFF;
      const cut = endsOnHighSurrogate ? keptUnits - 1 : keptUnits;
      return `${run.slice(0, cut)}[...]`;
    });
  }
  return result;
}
/******************************************************************************/
/**
 * Timer callback (scheduled once per hour): measures how many TOC entries were
 * consumed since the previous call, refreshes the `speedInfo` text and plays
 * the alert sound asynchronously.
 *
 * When nothing was processed during the period no estimate can be made, so the
 * remaining time is shown as "?" instead of "Infinity"/"NaN".
 */
function setSpeedInfo() {
  const donePerHour = restMark - toc.length;
  const donePerMin = Math.round(donePerHour / 60);
  restMark = toc.length;

  let hoursLeft = '?';
  let daysLeft = '?';
  if (donePerHour > 0) {
    hoursLeft = Math.round(restMark / donePerHour);
    daysLeft = Math.round(hoursLeft / 24);
  }

  speedInfo = `${donePerHour}/h (${donePerMin}/min): ~${hoursLeft} hours left, ~${daysLeft} days left.`;
  // Non-blocking variant of the alert so the timer callback returns at once.
  exec(
    'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
    ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
      'c:\\WINDOWS\\Media\\Windows Ringin.wav']
  );
}
/******************************************************************************/
/**
 * Requests one page and hands the result to processDoc().
 *
 * Before the request is issued the termination flag is honoured, and the
 * previous/current URL pair is shifted so that a retry of the same URL can be
 * recognised later.
 *
 * NOTE(review): jsdom.env() is the legacy callback API of jsdom; confirm that
 * the installed jsdom release still provides it.
 *
 * @param {string} url - Address of the page to load.
 */
function getDoc(url) {
  if (terminate) {
    console.log('Exit on demand.');
    process.exit();
  }

  [prevURL, currURL] = [currURL, url];
  console.log(` ${url}`);

  // A `proxy: 'http://127.0.0.1:8888'` option can be added here for debugging.
  const request = { url, done: processDoc };
  jsdom.env(request);
}
/******************************************************************************/
/**
 * jsdom `done` callback: turns one loaded page into dictionary cards.
 *
 * - On a jsdom error the URL is logged (once per URL) and requested again.
 * - On a term page with definitions, every definition is converted to a card
 *   and buffered; the next page of the same term is followed if there is one,
 *   otherwise the buffers are flushed to the dictionary file, the page URL is
 *   appended to the log and the next TOC entry is requested.
 * - On a term page without definitions, the URL is either skipped (the "shrug"
 *   placeholder is present) or requested again.
 * - Any other location is treated as fatal.
 *
 * @param {*} err - Error reported by jsdom, falsy on success.
 * @param {Window} window - jsdom window of the loaded page (unused on error).
 */
function processDoc(err, window) {
  // Term pages are accepted over both http and https.
  const definePageRE = /^https?:\/\/www\.urbandictionary\.com\/define\.php\?term=/;

  // Prints the number of remaining TOC entries and requests the next one;
  // exits when the TOC is exhausted.
  const nextTerm = () => {
    if (toc.length) {
      console.log(process.title =
        `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
      getDoc(toc.shift());
    } else {
      process.exit();
    }
  };

  // Converts the text of a meaning/example node into card-body text: trimmed
  // lines, collapsed blanks, DSL special characters escaped, every line break
  // followed by a tab, and runs of empty lines reduced to one "\ " line.
  const toCardText = node => secure(node.textContent, false)
    .trim()
    .replace(/^[ \t]+|[ \t]+$/gm, '')
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/[\\\[\]{}@^~<>#]/g, '\\$&')
    .replace(/\n/g, '\n\t')
    .replace(/(\n\t){2,}/g, '\n\t\\ \n\t');

  if (err) {
    playAlert();
    // getDoc() shifts currURL into prevURL, so a retry of the same URL is not
    // logged a second time.
    if (currURL !== prevURL) {
      fs.writeSync(errFile, `jsdom error (${new Date()}).\n${currURL}\n${err}\n`, null, 'utf8');
    }
    console.error(` ${err}`);
    console.error(process.title = 'jsdom error. Retrying...');
    getDoc(currURL);
    return;
  }

  const doc = window.document;
  const loc = window.location.href;
  const entries = doc.querySelectorAll('#content div.def-panel[data-defid]');

  if (!definePageRE.test(loc)) {
    fs.writeSync(errFile, `Something wrong (${new Date()}).\n${currURL}\n\n`, null, 'utf8');
    console.error('Something wrong...');
    process.exit(1);
    return;
  }

  if (!entries.length) {
    playAlert();
    const nodef = doc.querySelector('#content div.def-header span.word');
    if (nodef && nodef.textContent === '¯\\_(ツ)_/¯') {
      // The site's placeholder for a term that has no definitions: skip it.
      fs.writeSync(errFile, `No definitions (${new Date()}).\n${loc}\n\n`, null, 'utf8');
      console.error(' No definitions. Next URL...');
      nextTerm();
    } else {
      console.error(process.title = 'HTTP error. Retrying...');
      getDoc(currURL);
    }
    return;
  }

  Array.from(entries).forEach(entry => {
    const headwordRaw = secure(entry.querySelector('div.def-header a[href].word').textContent, true)
      .trim()
      .replace(/[ \t]{2,}/g, ' ');
    // Parentheses are escaped only in the headword line, not inside the card.
    const headword = headwordRaw.replace(/[\\\[\]{}@^~<>#()]/g, '\\$&');
    const headwordForCard = headwordRaw.replace(/[\\\[\]{}@^~<>#]/g, '\\$&');
    headwordsBuffer.add(headword.length <= 246 ? headword : `${headword.slice(0, 243)}...`);

    // textContent ignores <br>, so turn each one into a real line break unless
    // a neighbouring text node already supplies it.
    Array.from(entry.querySelectorAll('br')).forEach(br => {
      const ps = br.previousSibling;
      const ns = br.nextSibling;
      if (
        (!ps || !ps.textContent.endsWith('\n')) &&
        (!ns || !ns.textContent.startsWith('\n'))
      ) {
        br.parentNode.insertBefore(doc.createTextNode('\n'), br);
      }
    });

    const meanings = toCardText(entry.querySelector('div.meaning'));
    const examples = toCardText(entry.querySelector('div.example'));
    const up = entry.querySelector('div.def-footer div.thumbs a.up span.count').textContent.trim();
    const down = entry.querySelector('div.def-footer div.thumbs a.down span.count').textContent.trim();
    const time = entry.querySelector('div.contributor').lastChild.textContent
      .replace(/\n+/g, ' ').replace(/by anonymous\s+/g, '').trim();

    // Card lines after the first are written with explicit "\t" escapes (and
    // "\ " separator lines), matching the '\n\t' joins applied to the text above.
    // NOTE(review): confirm the exact leading whitespace against existing output.
    cardBodyBuffer.push([
      ` [b]${cardBodyBuffer.length + 1}. ${headwordForCard}[/b] [c green]↑${up}[/c] [c red]↓${down}[/c] [c silver]${time}[/c]`,
      '\t\\ ',
      `\t${meanings}`,
      '\t\\ ',
      `\t${examples ? `[m2][i]${examples}[/i][/m]\n\t` : ''}`,
    ].join('\n'));
  });

  const nextLink = doc.querySelector('#content div.pagination-centered li a[href][rel="next"]');
  if (nextLink) {
    // More pages for the same term: keep buffering.
    getDoc(nextLink.href);
    return;
  }

  // Last page of the term: headword lines first, then all cards.
  fs.writeSync(
    dicFile,
    `${Array.from(headwordsBuffer).join('\n')}\n${cardBodyBuffer.join('\n\t\\ \n')}\n`,
    null,
    'utf8'
  );
  // The logged URL is the resume point for the next run.
  fs.writeSync(logFile, `${loc}\n`, null, 'utf8');
  console.log(` Entries: ${cardBodyBuffer.length}. ${speedInfo}`);
  headwordsBuffer.clear();
  cardBodyBuffer.length = 0;
  nextTerm();
}
/******************************************************************************/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Еще раз спасибо за подробную статью на Хабре!
Чтобы заработало в 2017, нужно установить jsdom версии 9.10.0 и в 171-й строке вместо http: написать https:.