// Instantly share code, notes, and snippets.
// vsemozhetbyt/WordSpy.extract_HTML_tags.js
// Created Feb 21, 2016
'use strict';
/******************************************************************************/
console.log('Requiring modules...');
const fs = require('fs');
const path = require('path');
const readline = require('readline');
const exec = require('child_process').execFile;
const execSync = require('child_process').execFileSync;
const jsdom = require('jsdom');

// Input directory (first CLI argument, defaults to the script directory);
// it must contain the list of URLs to process, one per line.
const inDir = process.argv[2] || __dirname;
const tocPath = path.join(inDir, 'WordSpy.toc.txt');
try {
  fs.accessSync(tocPath);
} catch (e) {
  console.error('TOC file not found.');
  // A missing input file is a failure, so report it through the exit status.
  process.exit(1);
}
const rl = readline.createInterface({
  input: fs.createReadStream(tocPath, 'utf8'),
  terminal: false,
  historySize: 0
});

// Output directory (second CLI argument, defaults to the script directory).
// Both files are opened in append mode so an interrupted run can be resumed.
const outDir = process.argv[3] || __dirname;
const tagFile = fs.openSync(path.join(outDir, 'WordSpy.tags.txt'), 'a+');
const errFile = fs.openSync(path.join(outDir, 'WordSpy.tags.errors.log'), 'a');

// Matches the positions inside a digit string where a thousands separator goes.
const formatNumberRE = /\B(?=(?:\d{3})+$)/g;
// Milliseconds per hour: the period of the speed-estimate timer.
const hour = 1000 * 60 * 60;
// URLs still to be processed.
const toc = [];
// Tag name -> number of occurrences seen so far.
const tags = {};
// A page counts as loaded once any of these selectors matches.
const selectorsToCheck = ['#content-text #newsletter a[href]',
                          '#content-text .index-words a[href]'];
// Polling interval, in milliseconds, used while waiting for those selectors.
const checkFrequency = 100;
// Elements removed from the page before tags are collected.
const selectorsToDelete = ['#content-text #social', '#content-text #newsletter'];
// Elements whose tag names are collected.
const selectorsToExtract = ['#content-text *'];

// Previously and currently requested URLs; used to log a failing URL only once.
let prevURL = '';
let currURL = '';
// Number of URLs that were left when the speed estimate was last refreshed.
let restMark;
let speedInfo = '?/h (?/min): ~? hours left, ~? days left.';
// Set by the SIGINT handler; checked before the next page is requested.
let terminate = false;
/******************************************************************************/
// On exit: release both output file descriptors, then play the alert sound.
process.on('exit', () => {
  fs.closeSync(tagFile);
  fs.closeSync(errFile);
  playAlert();
});

// Ctrl+C only raises a flag; getDoc() checks it before requesting the next
// page, so the page currently in progress is finished first.
process.on('SIGINT', () => {
  terminate = true;
});
/******************************************************************************/
console.log('Reading the TOC file...');

/**
 * Starts (or resumes) the crawl with whatever is left in `toc`: records the
 * starting mark for the speed estimate, arms the hourly estimate timer and
 * requests the first URL. Exits straight away when nothing is left, instead
 * of asking getDoc() to fetch `undefined`.
 */
function startCrawl() {
  if (!toc.length) {
    console.log('Nothing left to process.');
    process.exit();
  }
  restMark = toc.length;
  // unref(): the timer alone must not keep the process alive.
  setInterval(setSpeedInfo, hour).unref();
  console.log(process.title =
    `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
  getDoc(toc.shift());
}

/**
 * Re-reads the existing tag file to restore the tag counters and to find the
 * last URL that was written, drops everything up to and including that URL
 * from `toc`, then starts the crawl.
 */
function resumeFromTagFile() {
  const tagReader = readline.createInterface({
    // Read through the already open descriptor and keep it open for appending.
    input: fs.createReadStream(null, {encoding: 'utf8', fd: tagFile, autoClose: false}),
    terminal: false,
    historySize: 0
  });
  let lastURL;
  console.log('Reading the tag file...');
  tagReader.on('line', rawLine => {
    const line = rawLine.trim();
    if (!line) return;
    // URL lines may use either scheme; every other line is a tag list.
    if (/^https?:/.test(line)) {
      lastURL = line;
    } else {
      line.split(', ').forEach(tag => {
        tags[tag] = (tags[tag] || 0) + 1;
      });
    }
  }).on('close', () => {
    if (lastURL) {
      const lastIndex = toc.indexOf(lastURL);
      if (lastIndex === -1) {
        // Nothing can be skipped, so already counted pages will be counted again.
        console.error(`Last processed URL is not in the TOC: ${lastURL}`);
      }
      toc.splice(0, lastIndex + 1);
    }
    startCrawl();
  });
}

rl.on('line', rawLine => {
  const line = rawLine.trim();
  if (line) toc.push(line);
}).on('close', () => {
  if (!toc.length) {
    console.error('No URLs found.');
    process.exit(1);
  }
  if (fs.fstatSync(tagFile).size !== 0) {
    resumeFromTagFile();
  } else {
    // Fresh output file: start it with a byte order mark.
    fs.writeSync(tagFile, '\uFEFF', null, 'utf8');
    startCrawl();
  }
});
/******************************************************************************/
/**
 * Plays the alert sound through ffplay and blocks until playback has finished.
 *
 * The alert is best-effort: when the player cannot be started or exits with an
 * error, the failure is reported on stderr and swallowed, so a missing player
 * does not abort the crawl or throw from inside the exit handler.
 */
function playAlert() {
  try {
    execSync(
      'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
      ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
       'c:\\WINDOWS\\Media\\Windows Ringin.wav']
    );
  } catch (err) {
    console.error(`Alert sound failed: ${err.message}`);
  }
}
/******************************************************************************/
/**
 * Refreshes the `speedInfo` status text from the number of URLs taken off
 * `toc` since the previous call, then plays the alert sound without blocking.
 *
 * It is scheduled once per `hour`, so the difference is treated as a per-hour
 * rate. When nothing was processed in the period, no estimate can be made and
 * the time-left figures are shown as "?" rather than "Infinity" or "NaN".
 */
function setSpeedInfo() {
  const donePerHour = restMark - toc.length;
  const donePerMin = Math.round(donePerHour / 60);
  restMark = toc.length;

  let hoursLeft = '?';
  let daysLeft = '?';
  if (donePerHour > 0) {
    hoursLeft = Math.round(restMark / donePerHour);
    daysLeft = Math.round(hoursLeft / 24);
  }
  speedInfo = `${donePerHour}/h (${donePerMin}/min): ~${hoursLeft} hours left, ~${daysLeft} days left.`;

  // Asynchronous on purpose: the crawl keeps going while the sound plays.
  exec(
    'f:\\BAK\\prg\\mm\\ffmpeg\\bin\\ffplay.exe',
    ['-v', 'quiet', '-nodisp', '-autoexit', '-af', 'volume=1.0',
     'c:\\WINDOWS\\Media\\Windows Ringin.wav']
  );
}
/******************************************************************************/
/**
 * Requests one page through jsdom and passes the outcome to checkDoc().
 *
 * Before requesting, honours a pending Ctrl+C (`terminate`) by exiting, and
 * shifts `currURL` into `prevURL` so that checkDoc() can tell a first attempt
 * from a retry of the same URL.
 *
 * @param {string} url - Address of the page to load.
 */
function getDoc(url) {
  if (terminate) {
    console.log('Exit on demand.');
    process.exit();
  }
  prevURL = currURL;
  currURL = url;
  console.log(` ${url}`);
  jsdom.env({
    url,
    done: checkDoc,
    // Page scripts are fetched and executed.
    features: {
      FetchExternalResources: ["script"],
      ProcessExternalResources: ["script"]
    }
    // Optional proxy setting, currently disabled: proxy: 'http://127.0.0.1:8888'
  });
}
/******************************************************************************/
/**
 * jsdom `done` callback: decides whether the page is usable.
 *
 * - On a jsdom error: plays the alert, logs the failure to the error file
 *   (only on the first attempt for this URL) and retries the same URL.
 * - Otherwise polls the document every `checkFrequency` ms until one of
 *   `selectorsToCheck` matches, then hands the document to processDoc().
 * - If nothing matches after more than 50 polls: retries when the window is
 *   still at the requested URL; otherwise logs the URL and exits with status 1.
 *
 * The jsdom window is closed once it is no longer needed, so that windows do
 * not pile up over a long crawl.
 *
 * @param {?Error} err - Error reported by jsdom, if any.
 * @param {Object} window - jsdom window of the loaded page.
 */
function checkDoc(err, window) {
  if (err) {
    playAlert();
    // getDoc() copies currURL into prevURL, so on a retry the two are equal
    // and the same failure is not logged again.
    if (currURL !== prevURL) {
      fs.writeSync(errFile,
        `jsdom error (${new Date()}).\n${currURL}\n${err}\n`,
        null, 'utf8');
    }
    console.error(` ${err}`);
    console.error(process.title = 'jsdom error. Retrying...');
    // NOTE(review): presumably a window may still be supplied on error - confirm.
    if (window) window.close();
    getDoc(currURL);
    return;
  }

  const doc = window.document;
  const loc = window.location.href;
  const readySelector = selectorsToCheck.join(', ');
  let iter = 0;
  const checker = setInterval(() => {
    iter++;
    const ready = doc.querySelectorAll(readySelector).length > 0;
    if (!ready && iter <= 50) return; // keep polling

    clearInterval(checker);
    console.log(` ${iter * checkFrequency} ms`);

    if (ready) {
      // processDoc() reads the document synchronously; after it returns the
      // window is not used again.
      processDoc(doc, loc);
      window.close();
      return;
    }

    window.close();
    if (loc === currURL) {
      playAlert();
      console.error(process.title = 'HTTP error. Retrying...');
      getDoc(currURL);
    } else {
      fs.writeSync(errFile,
        `Something wrong (${new Date()}).\n${currURL}\n`,
        null, 'utf8');
      console.error('Something wrong...');
      process.exit(1);
    }
  }, checkFrequency);
}
/******************************************************************************/
/**
 * Collects the tag names of a loaded page.
 *
 * Removes the first element matching each of `selectorsToDelete`, gathers the
 * tag names of every element matching `selectorsToExtract`, adds them to the
 * `tags` counters and appends "<location>\n\t<tag list>\n\n" to the tag file.
 * Then requests the next URL, or, when `toc` is empty, appends the counters as
 * JSON and exits.
 *
 * @param {Object} doc - Document of the loaded page.
 * @param {string} loc - Location of the loaded page, written as the record header.
 */
function processDoc(doc, loc) {
  selectorsToDelete.forEach(selector => {
    const node = doc.querySelector(selector);
    if (node) node.parentNode.removeChild(node);
  });

  // One pass over the matched elements; the names feed both the counters and
  // the written record.
  const tagNames = Array.from(
    doc.querySelectorAll(selectorsToExtract.join(', ')),
    elm => elm.tagName
  );
  tagNames.forEach(name => {
    tags[name] = (tags[name] || 0) + 1;
  });

  fs.writeSync(tagFile, `${loc}\n\t${tagNames.join(', ')}\n\n`, null, 'utf8');
  console.log(process.title = ` Tags: ${tagNames.length}. ${speedInfo}`);

  if (toc.length) {
    console.log(process.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
    getDoc(toc.shift());
  } else {
    fs.writeSync(tagFile, `${JSON.stringify(tags, null, '\t')}\n`, null, 'utf8');
    process.exit();
  }
}
/******************************************************************************/
// Sign up for free
// to join this conversation on GitHub.
// Already have an account?
// Sign in to comment