// Source: GitHub Gist vsemozhetbyt/e1ff91c8a18154375811 by @vsemozhetbyt,
// created February 21, 2016 13:31.
'use strict';
/******************************************************************************/
console.log('Requiring modules...');
const fs = require('fs');
const path = require('path');
const readline = require('readline');
const exec = require('child_process').execFile;
const execSync = require('child_process').execFileSync;
const jsdom = require('jsdom');
// Input directory: first CLI argument, falling back to this script's directory.
const inDir = process.argv[2] || __dirname;
// The table of contents is a text file; every non-empty line of it is queued
// as a page URL further below.
const tocPath = path.join(inDir, 'WordSpy.toc.txt');
try {
  fs.accessSync(tocPath);
} catch (e) {
  console.error('TOC file not found.');
  // A missing input is a failure, so report it through a non-zero exit status.
  process.exit(1);
}
const rl = readline.createInterface({
  input: fs.createReadStream(tocPath, 'utf8'),
  terminal: false,
  historySize: 0
});
// Output directory: second CLI argument, falling back to this script's directory.
const outDir = process.argv[3] || __dirname;
// Opened with 'a+' because the file is both read back (to resume an earlier
// run) and appended to.
const tagFile = fs.openSync(path.join(outDir, 'WordSpy.tags.txt'), 'a+');
// Append-only error log.
const errFile = fs.openSync(path.join(outDir, 'WordSpy.tags.errors.log'), 'a');
// Matches the boundaries between groups of three digits (counted from the end);
// replacing the matches with a space formats the remaining-pages counter.
const formatNumberRE = /\B(?=(?:\d{3})+$)/g;
// One hour in milliseconds: the period of the speed measurement.
const hour = 1000 * 60 * 60;
// Queue of URLs that still have to be processed.
const toc = [];
// Tag name -> number of occurrences seen so far.
const tags = {};
// A page counts as loaded as soon as any of these selectors matches.
const selectorsToCheck = [
  '#content-text #newsletter a[href]',
  '#content-text .index-words a[href]'
];
// Polling interval, in milliseconds, of the "is the page loaded" check.
const checkFrequency = 100;
// Elements removed from the page before tags are collected.
const selectorsToDelete = ['#content-text #social', '#content-text #newsletter'];
// Elements whose tag names are collected.
const selectorsToExtract = ['#content-text *'];
// Previous and current request URLs; a jsdom error is written to the error log
// only when the two differ, i.e. once per URL rather than once per retry.
let prevURL = '';
let currURL = '';
// Queue length at the previous speed measurement.
let restMark;
// Human-readable progress estimate, refreshed by setSpeedInfo().
let speedInfo = '?/h (?/min): ~? hours left, ~? days left.';
// Set by the SIGINT handler; getDoc() exits when it sees the flag.
let terminate = false;
/******************************************************************************/
// Process-level hooks.
// exit: release both output descriptors (tag file first, then the error log)
//       and sound the alert.
// SIGINT: only raise a flag; getDoc() checks it before starting the next page.
process
  .on('exit', () => {
    for (const fd of [tagFile, errFile]) {
      fs.closeSync(fd);
    }
    playAlert();
  })
  .on('SIGINT', () => {
    terminate = true;
  });
/******************************************************************************/
console.log('Reading the TOC file...');
// Queue every non-empty TOC line. Once the TOC is fully read, either start
// from scratch (empty tag file) or resume after the last URL already recorded
// in the tag file.
rl.on('line', line => {
  const url = line.trim();
  if (url) toc.push(url);
}).on('close', () => {
  if (!toc.length) {
    console.error('No URLs found.');
    // Nothing to do because of bad input: report it as a failure.
    process.exit(1);
  }
  if (fs.fstatSync(tagFile).size === 0) {
    // Fresh output file: start it with a UTF-8 byte order mark.
    fs.writeSync(tagFile, '\uFEFF', null, 'utf8');
    startProcessing();
  } else {
    resumeFromTagFile();
  }
});

/**
 * Starts the crawl with whatever is left in `toc`: records the starting queue
 * length, schedules the hourly speed measurement, prints the remaining count
 * and requests the first page. Exits instead when the queue is already empty
 * (for example, when a finished run is resumed), so that getDoc() is never
 * called without a URL.
 */
function startProcessing() {
  if (!toc.length) {
    console.log('Nothing left to process.');
    process.exit();
  }
  restMark = toc.length;
  // unref(): the timer alone must not keep the process alive.
  setInterval(setSpeedInfo, hour).unref();
  console.log(process.title =
    `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
  getDoc(toc.shift());
}

/**
 * Re-reads the existing tag file to restore the tag counters and to find the
 * last URL that was recorded, drops everything up to and including that URL
 * from `toc`, then starts processing the rest.
 */
function resumeFromTagFile() {
  const tagReader = readline.createInterface({
    input: fs.createReadStream(null, {encoding: 'utf8', fd: tagFile, autoClose: false}),
    terminal: false,
    historySize: 0
  });
  let lastURL;
  console.log('Reading the tag file...');
  tagReader.on('line', line => {
    const text = line.trim();
    if (!text) return;
    // URL lines mark a processed page; accept both http and https so that
    // https locations are not mistaken for a list of tag names.
    if (text.startsWith('http:') || text.startsWith('https:')) {
      lastURL = text;
    } else {
      text.split(', ').forEach(elm => {
        if (tags[elm]) {
          tags[elm]++;
        } else {
          tags[elm] = 1;
        }
      });
    }
  }).on('close', () => {
    // If the last URL is not in the TOC, indexOf() yields -1 and nothing is
    // skipped.
    if (lastURL) toc.splice(0, toc.indexOf(lastURL) + 1);
    startProcessing();
  });
}
/******************************************************************************/
/**
 * Plays the alert sound by running ffplay synchronously (the call returns
 * after the player process has exited).
 *
 * The alert is only a convenience: if the player cannot be started or fails,
 * the problem is reported on stderr and otherwise ignored, so that a missing
 * player never aborts a retry or throws from the process 'exit' handler.
 */
function playAlert() {
  try {
    execSync(
      'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
      ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
        'c:\\WINDOWS\\Media\\Windows Ringin.wav']
    );
  } catch (e) {
    console.error(`Alert sound failed: ${e && e.message}`);
  }
}
/******************************************************************************/
/**
 * Recomputes the progress estimate shown after every page.
 *
 * Called once per hour: the number of pages finished since the previous call
 * is the hourly rate, from which the remaining hours and days are derived.
 * When nothing was finished in the last hour no estimate is possible, so the
 * remaining time is shown as "?" instead of Infinity/NaN.
 * Also plays the alert sound asynchronously.
 */
function setSpeedInfo() {
  const donePerHour = restMark - toc.length;
  const donePerMin = Math.round(donePerHour / 60);
  restMark = toc.length;
  const hasProgress = donePerHour > 0;
  const hoursLeft = hasProgress ? Math.round(restMark / donePerHour) : '?';
  const daysLeft = hasProgress ? Math.round(hoursLeft / 24) : '?';
  speedInfo = `${donePerHour}/h (${donePerMin}/min): ~${hoursLeft} hours left, ~${daysLeft} days left.`;
  exec(
    'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
    ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
      'c:\\WINDOWS\\Media\\Windows Ringin.wav']
  );
}
/******************************************************************************/
/**
 * Requests one page through jsdom and hands the result to checkDoc().
 *
 * Exits first if termination was requested. Otherwise shifts the current URL
 * into `prevURL`, stores the new one in `currURL` and prints it.
 * External scripts are fetched and executed by jsdom.
 *
 * @param {string} url - address of the page to load
 */
function getDoc(url) {
  if (terminate) {
    console.log('Exit on demand.');
    process.exit();
  }

  prevURL = currURL;
  currURL = url;
  console.log(` ${url}`);

  const features = {
    FetchExternalResources: ['script'],
    ProcessExternalResources: ['script']
  };
  // A proxy option was left disabled by the author: proxy: 'http://127.0.0.1:8888'
  const request = { url: url, done: checkDoc, features: features };
  jsdom.env(request);
}
/******************************************************************************/
/**
 * jsdom completion callback: waits for the page content to appear and then
 * passes the document to processDoc().
 *
 * On a jsdom error the alert is played, the error is logged (once per URL)
 * and the same URL is requested again.
 *
 * Otherwise the document is polled every `checkFrequency` ms until one of
 * `selectorsToCheck` matches. After more than 50 unsuccessful polls the page
 * is given up on: if the final location equals the requested URL the request
 * is retried, otherwise the problem is logged and the process exits with a
 * failure status.
 *
 * The jsdom window is closed as soon as it is no longer needed; without that
 * every loaded page would stay in memory for the rest of the run.
 *
 * @param {*} err - error reported by jsdom, falsy on success
 * @param {object} window - jsdom window of the loaded page
 */
function checkDoc(err, window) {
  if (err) {
    // Release the window in case jsdom supplied one along with the error.
    if (window) window.close();
    playAlert();
    // getDoc() copies currURL into prevURL on a retry, so this logs the
    // error only for the first failure of a given URL.
    if (currURL !== prevURL) {
      fs.writeSync(errFile,
        `jsdom error (${new Date()}).\n${currURL}\n${err}\n`, null, 'utf8');
    }
    console.error(` ${err}`);
    console.error(process.title = 'jsdom error. Retrying...');
    getDoc(currURL);
    return;
  }

  const doc = window.document;
  const loc = window.location.href;
  const readySelector = selectorsToCheck.join(', ');
  let iter = 0;
  const checker = setInterval(() => {
    iter++;
    const ready = doc.querySelectorAll(readySelector).length > 0;
    // Keep polling until the content shows up or the attempts run out.
    if (!ready && iter <= 50) return;

    clearInterval(checker);
    console.log(` ${iter * checkFrequency} ms`);

    if (ready) {
      processDoc(doc, loc);
      // processDoc() reads the document synchronously, so it is safe to
      // dispose of the window now.
      window.close();
      return;
    }

    window.close();
    if (loc === currURL) {
      playAlert();
      console.error(process.title = 'HTTP error. Retrying...');
      getDoc(currURL);
    } else {
      fs.writeSync(errFile,
        `Something wrong (${new Date()}).\n${currURL}\n`, null, 'utf8');
      console.error('Something wrong...');
      process.exit(1);
    }
  }, checkFrequency);
}
/******************************************************************************/
/**
 * Collects the tag names of one loaded page.
 *
 * Removes the first match of every `selectorsToDelete` entry, counts the tag
 * name of every element matched by `selectorsToExtract` into `tags`, appends
 * the page location and its tag list to the tag file, and then either
 * requests the next URL or, when the queue is empty, appends the accumulated
 * counters as JSON and exits.
 *
 * @param {object} doc - document of the loaded page
 * @param {string} loc - final location of the page
 */
function processDoc(doc, loc) {
  for (const selector of selectorsToDelete) {
    const unwanted = doc.querySelector(selector);
    if (unwanted) {
      unwanted.parentNode.removeChild(unwanted);
    }
  }

  const elms = doc.querySelectorAll(selectorsToExtract.join(', '));
  const tagNames = Array.from(elms, node => node.tagName);
  for (const name of tagNames) {
    if (tags[name]) {
      tags[name]++;
    } else {
      tags[name] = 1;
    }
  }

  const record = `${loc}\n\t${tagNames.join(', ')}\n\n`;
  fs.writeSync(tagFile, record, null, 'utf8');
  console.log(process.title = ` Tags: ${elms.length}. ${speedInfo}`);

  if (!toc.length) {
    // Last page done: dump the totals and finish.
    fs.writeSync(tagFile, `${JSON.stringify(tags, null, '\t')}\n`, null, 'utf8');
    process.exit();
  } else {
    const rest = toc.length.toString().replace(formatNumberRE, ' ');
    console.log(process.title = `Rest: ${ rest }.`);
    getDoc(toc.shift());
  }
}
/******************************************************************************/
// (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")