Skip to content

Instantly share code, notes, and snippets.

@vsemozhetbyt
Last active December 9, 2017 16:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsemozhetbyt/bf247ba23378598420f1 to your computer and use it in GitHub Desktop.
Save vsemozhetbyt/bf247ba23378598420f1 to your computer and use it in GitHub Desktop.
'use strict';
/******************************************************************************/
console.log('Requiring modules...');
const fs = require('fs');
const path = require('path');
const readline = require('readline');
const exec = require('child_process').execFile;
const execSync = require('child_process').execFileSync;
const jsdom = require('jsdom');
// Directory that holds the input TOC file: first CLI argument, or the script's
// own directory when the argument is absent.
const inDir = process.argv[2] || __dirname;
// Fail fast when the TOC file is not accessible. A missing input is a failure,
// so report it with a non-zero exit status.
try {
  fs.accessSync(path.join(inDir, 'UD.toc.txt'));
} catch (e) {
  console.error('TOC file not found.');
  process.exit(1);
}
// Line reader over the TOC file (one URL per line is pushed into `toc` below).
const rl = readline.createInterface({
  input: fs.createReadStream(path.join(inDir, 'UD.toc.txt'), {encoding: 'utf8'}),
  terminal: false,
  historySize: 0
});
// Directory for all output files: second CLI argument, or the script's own
// directory when the argument is absent.
const outDir = process.argv[3] || __dirname;
// Output descriptors. The dictionary and the error log are append-only; the
// progress log is opened with 'a+' because it is also read back on start-up.
const dicFile = fs.openSync(path.join(outDir, 'UD.dic.dsl'), 'a');
const logFile = fs.openSync(path.join(outDir, 'UD.dic.log'), 'a+');
const errFile = fs.openSync(path.join(outDir, 'UD.dic.errors.log'), 'a');
// Matches the empty positions in front of every trailing group of three
// digits; replacing them with ' ' groups thousands in the printed counters.
const formatNumberRE = /\B(?=(?:\d{3})+$)/g;
// One hour in milliseconds: the period of the speed statistics timer.
const hour = 1000 * 60 * 60;
// URLs still to be processed, in TOC order.
const toc = [];
// Headwords and card bodies collected for the term currently being processed;
// both are flushed to the dictionary file and cleared by processDoc().
const headwordsBuffer = new Set();
const cardBodyBuffer = [];
// The previously requested and the currently requested URL, maintained by
// getDoc(); equal values mean the current request is a retry.
let prevURL = '';
let currURL = '';
// Length of `toc` at the last speed measurement.
let restMark;
// Progress text printed after each finished term; refreshed by setSpeedInfo().
let speedInfo = '?/h (?/min): ~? hours left, ~? days left.';
// Set by the SIGINT handler and checked by getDoc() before each request.
let terminate = false;
/******************************************************************************/
// Whenever the process exits: release the three output descriptors, then
// sound the alert.
process.on('exit', function releaseFilesAndAlert() {
  [dicFile, logFile, errFile].forEach(fd => fs.closeSync(fd));
  playAlert();
});
// Ctrl+C does not stop the process right away; it only raises the flag that
// getDoc() checks before issuing the next request.
process.on('SIGINT', function requestStop() {
  terminate = true;
});
/******************************************************************************/
// A dictionary file that is still empty gets a byte order mark followed by the
// DSL header lines; a non-empty file is left untouched.
if (fs.fstatSync(dicFile).size === 0) {
  const byteOrderMark = '\uFEFF';
  const dslHeader =
////////////////////////////////////////////////////////////////////////////////
`#NAME "Urban Dictionary 2015 (Eng-Eng)"
#INDEX_LANGUAGE "English"
#CONTENTS_LANGUAGE "English"
`;
////////////////////////////////////////////////////////////////////////////////
  fs.writeSync(dicFile, byteOrderMark + dslHeader, null, 'utf8');
}
/******************************************************************************/
// Load every non-empty TOC line into `toc`; once the TOC is read, drop the URLs
// that the progress log reports as already done and start the scraping chain.
console.log('Reading the TOC file...');
rl.on('line', line => {
  const url = line.trim();
  if (url) toc.push(url);
}).on('close', () => {
  if (!toc.length) {
    // An empty TOC is a failure: report it with a non-zero exit status.
    console.error('No URLs found.');
    process.exit(1);
    return;
  }

  // Starts the hourly speed timer and requests the first pending URL.
  const start = () => {
    if (!toc.length) {
      // Everything listed in the TOC is already in the log. Requesting
      // `toc.shift()` here would pass `undefined` to getDoc() and loop on
      // jsdom errors forever.
      console.log('Nothing left to process.');
      process.exit();
      return;
    }
    restMark = toc.length;
    // unref(): the statistics timer alone must not keep the process alive.
    setInterval(setSpeedInfo, hour).unref();
    console.log(process.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
    getDoc(toc.shift());
  };

  if (fs.fstatSync(logFile).size === 0) {
    // Fresh run: nothing to skip.
    start();
    return;
  }

  // Resume: find the last non-empty line of the progress log.
  const logReader = readline.createInterface({
    input: fs.createReadStream(null, {encoding: 'utf8', fd: logFile, autoClose: false}),
    terminal: false,
    historySize: 0
  });
  let lastLine;
  console.log('Reading the log file...');
  logReader.on('line', line => {
    const entry = line.trim();
    if (entry) lastLine = entry;
  }).on('close', () => {
    // The log may contain only blank lines; then there is nothing to skip.
    if (lastLine !== undefined) {
      // Logged URLs may carry a trailing `&page=N`; the TOC lists the bare
      // term URL. An unknown URL gives indexOf() === -1 and removes nothing.
      toc.splice(0, toc.indexOf(lastLine.replace(/&page=\d+$/, '')) + 1);
    }
    start();
  });
});
/******************************************************************************/
/**
 * Plays the alert sound by running ffplay synchronously (the call returns
 * after the player has exited).
 *
 * The alert is a convenience only: when the player or the sound file is
 * missing, or the player exits with an error, the failure is reported on
 * stderr and otherwise ignored, so that it cannot abort the scraper from
 * inside its error handling or its 'exit' handler.
 */
function playAlert() {
  try {
    execSync(
      'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
      ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
        'c:\\WINDOWS\\Media\\Windows Ringin.wav']
    );
  } catch (err) {
    console.error(` Alert sound failed: ${err && err.message}`);
  }
}
/******************************************************************************/
/**
 * Neutralises characters that must not reach the dictionary file verbatim.
 *
 * Control characters (everything below U+0020 except LF and CR) and U+2028 are
 * replaced by their percent-encoded form surrounded by spaces, then runs of
 * spaces are collapsed to one. For non-headword text, any run of 256 or more
 * non-whitespace characters is cut to its first 250 characters plus '[...]'.
 *
 * @param {string} str - Text to clean.
 * @param {boolean} isHeadword - When truthy, long runs are left untouched.
 * @returns {string} The cleaned text.
 */
function secure(str, isHeadword) {
  const percentEncode = ch => ` ${encodeURIComponent(ch)} `;
  const withoutControls = str.replace(/[\x00-\x09\x0b\x0c\x0e-\x1f\u2028]/g, percentEncode);
  const singleSpaced = withoutControls.replace(/ +/g, ' ');
  if (isHeadword) {
    return singleSpaced;
  }
  return singleSpaced.replace(/\S{256,}/g, run => `${run.slice(0, 250)}[...]`);
}
/******************************************************************************/
/**
 * Timer callback (scheduled once per hour): measures how many TOC URLs were
 * finished since the previous call, refreshes the `speedInfo` progress text and
 * plays the alert sound without blocking.
 *
 * When nothing was finished during the period, no estimate can be derived
 * (the division would give Infinity or NaN), so the remaining time is shown as
 * '?', matching the initial value of `speedInfo`.
 */
function setSpeedInfo() {
  const donePerHour = restMark - toc.length;
  const donePerMin = Math.round(donePerHour / 60);
  // The current length becomes the baseline for the next measurement.
  restMark = toc.length;
  const hasProgress = donePerHour > 0;
  const hoursLeft = hasProgress ? Math.round(restMark / donePerHour) : '?';
  const daysLeft = hasProgress ? Math.round(hoursLeft / 24) : '?';
  speedInfo = `${donePerHour}/h (${donePerMin}/min): ~${hoursLeft} hours left, ~${daysLeft} days left.`;
  // Asynchronous on purpose: the hourly beep must not stall the scraping.
  exec(
    'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
    ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
      'c:\\WINDOWS\\Media\\Windows Ringin.wav']
  );
}
/******************************************************************************/
/**
 * Requests one page through jsdom and hands the result to processDoc().
 *
 * Before the request, the stop flag raised by SIGINT is honoured, and the
 * previous/current URL pair is shifted so that a retry of the same URL can be
 * recognised (both values are then equal).
 *
 * Author's notes kept from the original: a local proxy can be added with
 * `proxy: 'http://127.0.0.1:8888'` in the jsdom options, and the request can be
 * throttled by wrapping the jsdom call in a 1000 ms setTimeout.
 *
 * @param {string} url - Address of the page to load.
 */
function getDoc(url) {
  if (terminate) {
    console.log('Exit on demand.');
    process.exit();
  }
  [prevURL, currURL] = [currURL, url];
  console.log(` ${url}`);
  const request = { url: url, done: processDoc };
  jsdom.env(request);
}
/******************************************************************************/
/**
 * jsdom `done` callback: turns one loaded page into dictionary cards and
 * decides which URL to request next.
 *
 * - Load error: alert, log the URL (once per URL) and retry the same URL.
 * - Definition page with entries: append a card per entry to the buffers; if
 *   the page links to a next page, request it, otherwise flush the buffers to
 *   the dictionary file, record the URL in the progress log and move on.
 * - Definition page without entries: either the "no definitions" placeholder
 *   (logged, then skipped) or an assumed HTTP error (retried).
 * - Any other location: logged as a failure and the process exits with a
 *   non-zero status.
 *
 * @param {*} err - Truthy when jsdom could not load the page.
 * @param {object} window - jsdom window of the loaded page; used only when
 *   `err` is falsy.
 */
function processDoc(err, window) {
  if (err) {
    playAlert();
    // On a retry getDoc() makes prevURL equal to currURL, so a URL that keeps
    // failing is written to the error log only the first time.
    if (currURL !== prevURL) {
      fs.writeSync(errFile,
////////////////////////////////////////////////////////////////////////////////
`jsdom error (${new Date()}).
${currURL}
${err}
`
////////////////////////////////////////////////////////////////////////////////
      , null, 'utf8');
    }
    console.error(` ${err}`);
    console.error(process.title = 'jsdom error. Retrying...');
    getDoc(currURL);
    return;
  }

  const doc = window.document;
  const loc = window.location.href;
  // Accept both schemes: the final location is https:// when the site
  // redirects, and a check hard-coded to http:// would then reject every page.
  const isDefinePage = /^https?:\/\/www\.urbandictionary\.com\/define\.php\?term=/.test(loc);
  const entries = doc.querySelectorAll('#content div.def-panel[data-defid]');

  if (entries.length && isDefinePage) {
    Array.from(entries).forEach(entry => {
      const headwordRaw = secure(entry.querySelector('div.def-header a[href].word').textContent, true)
        .trim()
        .replace(/[ \t]{2,}/g, ' ');
      // The headword line additionally escapes parentheses; the copy shown
      // inside the card does not.
      const headword = headwordRaw.replace(/[\\\[\]{}@^~<>#()]/g, '\\$&');
      const headwordForCard = headwordRaw.replace(/[\\\[\]{}@^~<>#]/g, '\\$&');
      // Headwords longer than 246 characters are cut to 243 plus '...'.
      headwordsBuffer.add(headword.length <= 246 ? headword : headword.slice(0, 243) + '...');

      // textContent ignores <br>, so put a '\n' text node in front of every
      // <br> that has no newline right before or right after it.
      Array.from(entry.querySelectorAll('br')).forEach(br => {
        const ps = br.previousSibling;
        const ns = br.nextSibling;
        if (
          (!ps || !ps.textContent.endsWith('\n')) &&
          (!ns || !ns.textContent.startsWith('\n'))
        ) {
          br.parentNode.insertBefore(doc.createTextNode('\n'), br);
        }
      });

      const meanings = formatCardText(entry.querySelector('div.meaning').textContent);
      const examples = formatCardText(entry.querySelector('div.example').textContent);
      const up = entry.querySelector('div.def-footer div.thumbs a.up span.count').textContent.trim();
      const down = entry.querySelector('div.def-footer div.thumbs a.down span.count').textContent.trim();
      const time = entry.querySelector('div.contributor').lastChild.textContent
        .replace(/\n+/g, ' ').replace(/by anonymous\s+/g, '').trim();

      // NOTE(review): formatCardText() indents continuation lines with a tab,
      // while the literal `\\` lines below start at column 0 in this copy of
      // the file; confirm against real output that no leading tab was lost.
      cardBodyBuffer.push(
////////////////////////////////////////////////////////////////////////////////
` [b]${cardBodyBuffer.length + 1}. ${headwordForCard}[/b] [c green]↑${up}[/c] [c red]↓${down}[/c] [c silver]${time}[/c]
\\
${meanings}
\\
${examples? `[m2][i]${examples}[/i][/m]\n\t` : ''}`
////////////////////////////////////////////////////////////////////////////////
      );
    });

    const nextLink = doc.querySelector('#content div.pagination-centered li a[href][rel="next"]');
    if (nextLink) {
      // More pages for the same term: keep accumulating into the buffers.
      getDoc(nextLink.href);
      return;
    }

    // Last page of the term: flush all collected headwords and cards at once.
    fs.writeSync(dicFile,
////////////////////////////////////////////////////////////////////////////////
`${Array.from(headwordsBuffer).join('\n')}
${cardBodyBuffer.join('\n\t\\ \n')}
`
////////////////////////////////////////////////////////////////////////////////
    , null, 'utf8');
    fs.writeSync(logFile, `${loc}\n`, null, 'utf8');
    console.log(` Entries: ${cardBodyBuffer.length}. ${speedInfo}`);
    headwordsBuffer.clear();
    cardBodyBuffer.length = 0;
    proceedToNextTerm();
  } else if (isDefinePage) {
    playAlert();
    const nodef = doc.querySelector('#content div.def-header span.word');
    if (nodef && nodef.textContent === '¯\\_(ツ)_/¯') {
      // The shrug placeholder marks a term without definitions: log and skip.
      fs.writeSync(errFile, `No definitions (${new Date()}).\n${loc}\n\n`, null, 'utf8');
      console.error(' No definitions. Next URL...');
      proceedToNextTerm();
    } else {
      console.error(process.title = 'HTTP error. Retrying...');
      getDoc(currURL);
    }
  } else {
    fs.writeSync(errFile, `Something wrong (${new Date()}).\n${currURL}\n\n`, null, 'utf8');
    console.error('Something wrong...');
    // Unexpected location: stop, and report the failure in the exit status.
    process.exit(1);
  }
}

/**
 * Formats the raw text of a meaning or example for a DSL card body: cleans it
 * with secure(), trims every line, collapses space/tab runs, escapes DSL
 * control characters, indents continuation lines with a tab and replaces
 * blank-line runs with a single `\ ` spacer line.
 *
 * @param {string} rawText - textContent of the source element.
 * @returns {string} Text ready to be embedded in a card.
 */
function formatCardText(rawText) {
  return secure(rawText, false)
    .trim()
    .replace(/^[ \t]+|[ \t]+$/gm, '')
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/[\\\[\]{}@^~<>#]/g, '\\$&')
    .replace(/\n/g, '\n\t')
    .replace(/(\n\t){2,}/g, '\n\t\\ \n\t');
}

/**
 * Requests the next URL from the TOC, printing the remaining count first, or
 * exits the process when the TOC is exhausted.
 */
function proceedToNextTerm() {
  if (toc.length) {
    console.log(process.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
    getDoc(toc.shift());
  } else {
    process.exit();
  }
}
/******************************************************************************/
@radioaktive
Copy link

radioaktive commented Dec 9, 2017

Еще раз спасибо за подробную статью на Хабре!
Чтобы заработало в 2017 году, нужно установить jsdom версии 9.10.0 и в строке 171 вместо http: написать https:.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment