Skip to content

Instantly share code, notes, and snippets.

@almost
Last active November 22, 2018 09:30
Show Gist options
  • Save almost/7f10568539b9a079cb8aca3a13d52dc8 to your computer and use it in GitHub Desktop.
Save almost/7f10568539b9a079cb8aca3a13d52dc8 to your computer and use it in GitHub Desktop.
// Solution https://gist.github.com/almost/9ee99b1a3e7fa240c596be3820c0b6b0
"use strict";
const url = require('url');
const rp = require("request-promise-native");
const getHrefs = require("get-hrefs");
// Intended cap on the number of requests in flight at any one time.
const MAX_CONCURRENT = 3;
// Intended cap on the total number of requests made over the whole crawl.
const MAX_COUNT = 5;
// Hostnames the crawler is allowed to follow links to (exact match, see isAllowedDomain).
const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
// Seed URLs from which the crawl is meant to start.
const START_URLS = ["http://almostobsolete.net/"];
/**
 * Fetch a page and extract the links found in its body.
 *
 * The page is requested with `rp` (request-promise-native) and the response
 * body is handed to `getHrefs` (get-hrefs), with the page's own URL supplied
 * as `baseUrl` (presumably so relative links are resolved against it).
 *
 * @param {string} currentUrl - Address of the page to download.
 * @returns {Promise<*>} Resolves with whatever `getHrefs` returns for the
 *   body (presumably an array of href strings); rejects if the request fails.
 */
async function getHrefsFromUrl(currentUrl) {
  const requestOptions = { url: currentUrl };
  const extractOptions = { baseUrl: currentUrl };

  const pageBody = await rp(requestOptions);
  return getHrefs(pageBody, extractOptions);
}
/**
 * Check whether a URL points at one of the hosts the crawler may visit.
 *
 * The comparison is an exact match of the parsed hostname against
 * ALLOW_DOMAINS, so subdomains that are not listed there are rejected.
 *
 * @param {string} currentUrl - URL to test.
 * @returns {boolean} `true` when the URL's hostname is in ALLOW_DOMAINS.
 */
function isAllowedDomain(currentUrl) {
  // url.parse() gives a null hostname for URLs without a host (for example
  // relative paths); null is never in the set, so such URLs are rejected.
  const { hostname } = url.parse(currentUrl);
  return ALLOW_DOMAINS.has(hostname);
}
// TODO
// Starting from START_URLS find links and crawl them.
// Only follow links to pages in the ALLOW_DOMAINS
// Do not make more than MAX_CONCURRENT requests at any one time
// Do not make more than MAX_COUNT requests overall
// Do not request the same url twice
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment