almost/crawler.js

## crawler.js
// Solution https://gist.github.com/almost/9ee99b1a3e7fa240c596be3820c0b6b0

"use strict";
const url = require('url');

const rp = require("request-promise-native");
const getHrefs = require("get-hrefs");

// Intended cap on how many requests may be in flight at any one time.
const MAX_CONCURRENT = 3;
// Intended cap on the total number of requests made over the whole crawl.
const MAX_COUNT = 5;
// Hostnames whose pages may be followed; checked by exact match against a URL's parsed hostname.
const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
// URLs the crawl is meant to start from.
const START_URLS = ["http://almostobsolete.net/"];

/**
 * Fetch a page and extract the links it contains.
 *
 * @param {string} currentUrl - Address of the page to request; also passed to
 *   `getHrefs` as its `baseUrl` option.
 * @returns {Promise<*>} Resolves with whatever `getHrefs` returns for the
 *   fetched body. Rejects if the request made through `rp` rejects.
 */
async function getHrefsFromUrl(currentUrl) {
  const requestOptions = { url: currentUrl };
  const extractOptions = { baseUrl: currentUrl };
  const html = await rp(requestOptions);
  return getHrefs(html, extractOptions);
}

/**
 * Report whether a URL points at a host the crawler is allowed to visit.
 *
 * @param {string} currentUrl - URL to check.
 * @returns {boolean} `true` when the URL's hostname is a member of
 *   `ALLOW_DOMAINS`, `false` otherwise (including URLs with no hostname,
 *   such as relative paths).
 */
function isAllowedDomain(currentUrl) {
  // Exact hostname match only: a subdomain is allowed only if it is listed itself.
  const { hostname } = url.parse(currentUrl);
  return ALLOW_DOMAINS.has(hostname);
}

// TODO
// Starting from START_URLS find links and crawl them.
// Only follow links to pages in the ALLOW_DOMAINS
// Do not make more than MAX_CONCURRENT requests at any one time
// Do not make more than MAX_COUNT requests overall
// Do not request the same url twice
	// Solution https://gist.github.com/almost/9ee99b1a3e7fa240c596be3820c0b6b0

	"use strict";
	const url = require('url');

	const rp = require("request-promise-native");
	const getHrefs = require("get-hrefs");

	// Intended cap on how many requests may be in flight at any one time.
	const MAX_CONCURRENT = 3;
	// Intended cap on the total number of requests made over the whole crawl.
	const MAX_COUNT = 5;
	// Hostnames whose pages may be followed; checked by exact match against a URL's parsed hostname.
	const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
	// URLs the crawl is meant to start from.
	const START_URLS = ["http://almostobsolete.net/"];

	/**
	 * Fetch a page and extract the links it contains.
	 *
	 * @param {string} currentUrl - Address of the page to request; also passed to
	 *   `getHrefs` as its `baseUrl` option.
	 * @returns {Promise<*>} Resolves with whatever `getHrefs` returns for the
	 *   fetched body. Rejects if the request made through `rp` rejects.
	 */
	async function getHrefsFromUrl(currentUrl) {
		const requestOptions = { url: currentUrl };
		const extractOptions = { baseUrl: currentUrl };
		const html = await rp(requestOptions);
		return getHrefs(html, extractOptions);
	}

	/**
	 * Report whether a URL points at a host the crawler is allowed to visit.
	 *
	 * @param {string} currentUrl - URL to check.
	 * @returns {boolean} `true` when the URL's hostname is a member of
	 *   `ALLOW_DOMAINS`, `false` otherwise (including URLs with no hostname,
	 *   such as relative paths).
	 */
	function isAllowedDomain(currentUrl) {
		// Exact hostname match only: a subdomain is allowed only if it is listed itself.
		const { hostname } = url.parse(currentUrl);
		return ALLOW_DOMAINS.has(hostname);
	}

	// TODO
	// Starting from START_URLS find links and crawl them.
	// Only follow links to pages in the ALLOW_DOMAINS
	// Do not make more than MAX_CONCURRENT requests at any one time
	// Do not make more than MAX_COUNT requests overall
	// Do not request the same url twice