Skip to content

Instantly share code, notes, and snippets.

@NeuronQ
Created February 5, 2019 23:54
Show Gist options
  • Save NeuronQ/0a60dfca054810db47841184460dcc14 to your computer and use it in GitHub Desktop.
// callbacks_async_scrape.js (tested with node 11.3)
const http = require('http');
const https = require('https');
// Fetch `url` and deliver the full response body (as a string) to `onSuccess`;
// request-level failures are routed to `onError`. Chooses the `https` module
// when the URL starts with "https", plain `http` otherwise. Each request's
// wall-clock time is reported via console.time/timeEnd.
const fetchUrl = (url, onSuccess, onError) => {
  console.time(`fetchUrl(${url})`);
  const client = url.indexOf('https') === 0 ? https : http;
  client
    .get(url, resp => {
      let html = '';
      resp.on('data', chunk => { html += chunk; });
      resp.on('end', () => {
        console.timeEnd(`fetchUrl(${url})`);
        onSuccess(html);
      });
    })
    .on('error', onError);
};
// Extract the value of every href="..." attribute in `html`,
// returned as an array of strings in document order.
const scrapeData = html => {
  const hrefPattern = /href="([^"]+)"/g;
  const links = [];
  let match = hrefPattern.exec(html);
  while (match !== null) {
    links.push(match[1]);
    match = hrefPattern.exec(html);
  }
  return links;
};
const urls = [
  "http://neverssl.com/",
  "https://www.ietf.org/wiki/rfc2616.txt".replace('/wiki/', '/rfc/'),
  "https://en.wikipedia.org/wiki/Asynchronous_I/O",
];
// Scrape results keyed by URL; filled in as each fetch completes.
const extractedData = {};
let nUrlsProcessed = 0;

// Print the collected data and stop the timer once every URL has been
// accounted for (success or failure).
const reportIfDone = () => {
  // BUG FIX: original checked `nUrlsProcessed <= 0`, which is never true
  // after an increment, so the report/timer never fired.
  if (nUrlsProcessed === urls.length) {
    console.log("> extracted data:", extractedData);
    console.timeEnd("elapsed");
  }
};

console.time("elapsed");
// BUG FIX: `const` is required — the original `for (url of urls)` created a
// single implicit global, so every callback saw the last URL and all results
// collapsed onto one key.
for (const url of urls) {
  fetchUrl(
    url,
    html => {
      nUrlsProcessed++;
      extractedData[url] = scrapeData(html);
      reportIfDone();
    },
    err => {
      // BUG FIX: failures were silently swallowed and never triggered the
      // completion check, so a single failed URL stalled the report forever.
      console.error(`> failed to fetch ${url}:`, err.message);
      nUrlsProcessed++;
      reportIfDone();
    }
  );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment