Skip to content

Instantly share code, notes, and snippets.

@almost almost/crawler.js
Last active Nov 22, 2018

Embed
What would you like to do?
// Solution https://gist.github.com/almost/9ee99b1a3e7fa240c596be3820c0b6b0
"use strict";
const url = require('url');
const rp = require("request-promise-native");
const getHrefs = require("get-hrefs");
// Maximum number of requests allowed to be in flight at any one time.
const MAX_CONCURRENT = 3;
// Maximum number of requests the crawl may make in total.
const MAX_COUNT = 5;
// Hostnames the crawler is permitted to follow links to (exact hostname match).
const ALLOW_DOMAINS = new Set(["almostobsolete.net", "tomparslow.co.uk"]);
// Seed URLs from which the crawl begins.
const START_URLS = ["http://almostobsolete.net/"];
/**
 * Fetch a page and extract the links found in its body.
 *
 * The response body returned by `rp` is handed to `getHrefs`, with the
 * requested URL supplied as `baseUrl` (presumably so relative links are
 * resolved against the page they came from; see the get-hrefs docs).
 *
 * @param {string} currentUrl - URL of the page to download.
 * @returns {Promise<*>} Whatever `getHrefs` returns for the page body
 *   (expected to be a list of hrefs). Rejects if the request fails.
 */
async function getHrefsFromUrl(currentUrl) {
  const requestOptions = { url: currentUrl };
  const pageBody = await rp(requestOptions);

  const extractOptions = { baseUrl: currentUrl };
  return getHrefs(pageBody, extractOptions);
}
/**
 * Report whether a URL points at one of the hosts the crawler may visit.
 *
 * The hostname is extracted with `url.parse` and looked up in
 * `ALLOW_DOMAINS`; the match is exact, so subdomains of an allowed host
 * are not themselves allowed unless listed.
 *
 * @param {string} currentUrl - The URL to check.
 * @returns {boolean} `true` if the URL's hostname is in `ALLOW_DOMAINS`,
 *   otherwise `false` (including when the URL has no hostname).
 */
function isAllowedDomain(currentUrl) {
  // A URL without a host parses to a null hostname, which is never in the set.
  const { hostname } = url.parse(currentUrl);
  return ALLOW_DOMAINS.has(hostname);
}
// TODO
// Starting from START_URLS find links and crawl them.
// Only follow links to pages in the ALLOW_DOMAINS
// Do not make more than MAX_CONCURRENT requests at any one time
// Do not make more than MAX_COUNT requests overall
// Do not request the same url twice
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.