
@ranyefet
Created August 2, 2018 10:51
A simple web crawler
const got = require("got");
const cheerio = require("cheerio");
const url = require("url");
const URL = url.URL;

const INITIAL_URL = "https://bitcoin.org/";
const MAX_QUEUE_SIZE = 1000;
const MAX_RUNS = 10;

function isRelativePath(url) {
  return url.startsWith("/");
}
class Crawler {
  constructor(initialUrl) {
    this.initialUrl = new URL(initialUrl);
    this.queue = new Set();        // URLs waiting to be crawled
    this.indexed = new Set();      // URLs already crawled
    this.titlesIndex = new Map();  // word -> Set of URLs whose <title> contains it
    this.numRuns = 0;
  }

  parse(html) {
    return cheerio.load(html);
  }

  async fetch(url) {
    console.log(`Crawling ${url}...`);
    try {
      return await got(url);
    } catch (e) {
      console.error(`Error crawling ${url}: ${e.message}`);
      return null;
    }
  }

  // Find all links and filter out external links
  getInboundLinks(find) {
    return find("a").filter((idx, link) => {
      const href = link.attribs.href;
      if (!href) return false;
      return isRelativePath(href) || href.includes(this.initialUrl.hostname);
    });
  }
  addLink(url) {
    if (this.indexed.has(url)) return; // skip pages we have already crawled
    if (this.queue.size >= MAX_QUEUE_SIZE) {
      throw new Error("Queue reached maximum limit");
    }
    this.queue.add(url);
  }

  addLinksToQueue(links) {
    const { protocol, hostname } = this.initialUrl;
    links.each((idx, link) => {
      const href = link.attribs.href;
      if (isRelativePath(href)) {
        // Resolve relative paths against the initial URL's origin
        const url = `${protocol}//${hostname}${href}`;
        this.addLink(url);
        return;
      }
      this.addLink(href);
    });
  }
  indexPage(url, title) {
    if (title) {
      const words = title.split(" ");
      words.forEach(word => {
        if (this.titlesIndex.has(word)) {
          this.titlesIndex.get(word).add(url);
        } else {
          this.titlesIndex.set(word, new Set([url]));
        }
      });
    }
  }

  markAsIndexed(url) {
    this.queue.delete(url);
    this.indexed.add(url);
  }

  getNextUrl() {
    if (this.queue.size) {
      const it = this.queue.values();
      return it.next().value;
    }
    return null;
  }
  async crawlPage(url) {
    const response = await this.fetch(url);
    if (!response) {
      throw new Error(`Unable to fetch: ${url}`);
    }

    const find = this.parse(response.body);
    const title = find("title").text();
    console.log("Page Title", title);

    this.indexPage(url, title);

    const links = this.getInboundLinks(find);
    this.addLinksToQueue(links);
    console.log("Queue size", this.queue.size);

    this.markAsIndexed(url);
    this.numRuns++;

    const nextUrl = this.getNextUrl();
    if (nextUrl && this.numRuns < MAX_RUNS) {
      process.nextTick(() => this.crawlPage(nextUrl));
    } else {
      console.log("Titles Indexed", this.titlesIndex);
      console.log("All done");
    }
  }

  async start() {
    await this.crawlPage(this.initialUrl.toString());
  }
}
// Run the crawler
const crawler = new Crawler(INITIAL_URL);
crawler.start().catch(err => console.error(err));
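
For illustration, here is a minimal sketch of how the titlesIndex built above could be queried to find pages by a word in their title. searchTitles is a hypothetical helper, not part of the original gist, and because the crawl continues asynchronously via process.nextTick, it would need to run after crawling finishes (for example, from the "All done" branch) rather than right after start() resolves.

// Hypothetical helper (not in the original gist): look up pages whose <title>
// contained a given word. titlesIndex maps word -> Set of URLs.
function searchTitles(crawler, word) {
  const urls = crawler.titlesIndex.get(word);
  return urls ? Array.from(urls) : [];
}

// Example use, assuming the crawl has completed:
// console.log(searchTitles(crawler, "Bitcoin"));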