Skip to content

Instantly share code, notes, and snippets.

@NeuronQ
Created February 5, 2019 23:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NeuronQ/06054eebfd890ccf679ea22ab9948b65 to your computer and use it in GitHub Desktop.
Save NeuronQ/06054eebfd890ccf679ea22ab9948b65 to your computer and use it in GitHub Desktop.
// sync_scrape.js (tested with node 11.3)
const request = require("sync-request");
/**
 * Performs a blocking GET request for `url` and logs how long it took
 * under the console timer label `fetchUrl(<url>)`.
 *
 * @param {string} url - Address to request.
 * @returns {*} Whatever `getBody()` yields for the response. No encoding is
 *   requested, so this is presumably a Buffer — confirm against sync-request.
 * @throws Rethrows any error raised by `request(...)` or `getBody()`.
 */
const fetchUrl = url => {
  const timerLabel = `fetchUrl(${url})`;
  console.time(timerLabel);
  try {
    return request("GET", url).getBody();
  } finally {
    // Stop the timer on the failure path too; otherwise the label stays
    // active and the duration of the failed request is never reported.
    console.timeEnd(timerLabel);
  }
};
/**
 * Collects the value of every `href="..."` attribute found in `html`.
 *
 * Matching is purely textual (a regular expression, not an HTML parser):
 * only double-quoted, non-empty attribute values are captured.
 *
 * @param {*} html - Markup to scan; non-string values (e.g. a Buffer) are
 *   converted to a string first.
 * @returns {string[]} Captured href values in document order; empty when
 *   nothing matches.
 */
const scrapeData = html => {
  // Coerce once up front: RegExp.prototype.exec stringifies a non-string
  // argument on every call, so passing a Buffer straight into the loop
  // would re-decode the whole body once per match.
  const text = String(html);
  const hrefPattern = /href="([^"]+)"/g;
  const hrefs = [];
  let match;
  while ((match = hrefPattern.exec(text)) !== null) {
    hrefs.push(match[1]);
  }
  return hrefs;
};
// Pages to download and scan for links; main() walks them in the order listed.
const urls = [
"http://neverssl.com/",
"https://www.ietf.org/rfc/rfc2616.txt",
"https://en.wikipedia.org/wiki/Asynchronous_I/O"
];
// Results keyed by URL: main() stores the array returned by scrapeData for each page.
// NOTE(review): the name looks like a typo of "extractedData"; renaming it
// means updating every reference in main() at the same time.
const extactedData = {};
/**
 * Fetches every entry of `urls` in order, stores the hrefs scraped from each
 * response in `extactedData` keyed by URL, then logs the collected data and
 * the total time under the console timer label "elapsed".
 *
 * `fetchUrl` returns its result synchronously, so there is nothing to await;
 * the function stays `async` only so callers keep receiving a promise.
 *
 * @returns {Promise<void>} Resolves once the results have been logged;
 *   rejects with the original error if fetching or scraping throws.
 */
async function main() {
  console.time("elapsed");
  try {
    for (const url of urls) {
      extactedData[url] = scrapeData(fetchUrl(url));
    }
    console.log("> extracted data:", extactedData);
  } finally {
    // Report the elapsed time on the error path as well, so the timer is
    // never left running when a request fails part-way through the list.
    console.timeEnd("elapsed");
  }
}
// Run the script. The promise returned by main() must not be left floating:
// log any failure and signal it through a non-zero exit code.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment