@t3db0t
Created November 29, 2017 23:11
Example using Supercrawler
var supercrawler = require("supercrawler");
// 1. Create a new instance of the Crawler object, providing configuration
// details. Note that configuration cannot be changed after the object is
// created.
var crawler = new supercrawler.Crawler({
  // By default, Supercrawler uses a simple FIFO queue, which doesn't support
  // retries or memory of crawl state. For any non-trivial crawl, you should
  // create a database. Provide your database config to the constructor of
  // DbUrlList.
  // urlList: new supercrawler.DbUrlList({
  //   db: {
  //     database: "crawler",
  //     username: "root",
  //     password: secrets.db.password,
  //     sequelizeOpts: {
  //       dialect: "mysql",
  //       host: "localhost"
  //     }
  //   }
  // }),
  // Time (ms) between requests
  interval: 1000,
  // Maximum number of requests at any one time.
  concurrentRequestsLimit: 5,
  // Time (ms) to cache the results of robots.txt queries.
  robotsCacheTime: 3600000,
  // User agent string to send with each request.
  userAgent: "Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)",
  // Custom options to be passed to request.
  // request: {
  //   headers: {
  //     'x-custom-header': 'example'
  //   }
  // }
});
crawler.on("crawlurl", function (url) {
console.log("Crawling " + url);
});
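// The listener below is an added sketch, not part of the original gist: it
// assumes Supercrawler also emits a "crawledurl" event with
// (url, errorCode, statusCode) arguments, which is useful for spotting
// failed fetches alongside the "crawlurl" log above.
crawler.on("crawledurl", function (url, errorCode, statusCode) {
  console.log("Crawled " + url + " (error=" + errorCode + ", status=" + statusCode + ")");
});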
// Get "Sitemaps:" directives from robots.txt
crawler.addHandler(supercrawler.handlers.robotsParser());
// Crawl sitemap files and extract their URLs.
crawler.addHandler(supercrawler.handlers.sitemapsParser());
// Pick up <a href> links from HTML documents
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser({
// Restrict discovered links to the following hostnames.
hostnames: ["sweetpricing.com"]
}));
// Custom content handler for HTML pages.
crawler.addHandler("text/html", function (context) {
var sizeKb = Buffer.byteLength(context.body) / 1024;
logger.info("Processed", context.url, "Size=", sizeKb, "KB");
});
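// Added sketch (not part of the original gist): besides logging, a handler is
// assumed to be able to return an array of URL strings to add to the crawl
// queue, which is how htmlLinkParser feeds newly discovered links back in.
// extractPaginationLinks below is a hypothetical helper you would implement.
// crawler.addHandler("text/html", function (context) {
//   return extractPaginationLinks(context.body.toString("utf8"));
// });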
console.log("Starting Crawler...");
crawler.getUrlList()
  .insertIfNotExists(new supercrawler.Url("https://sweetpricing.com"))
  .then(function () {
    return crawler.start();
  });