Skip to content

Instantly share code, notes, and snippets.

@donfanning
Forked from martinjacobs/sitecrawler.js
Created August 15, 2018 12:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save donfanning/ebe77089ddbbc72a1a8bab758a4228eb to your computer and use it in GitHub Desktop.
Save donfanning/ebe77089ddbbc72a1a8bab758a4228eb to your computer and use it in GitHub Desktop.
Site crawler
var phantom = require('phantom');
var Crawler = require("simplecrawler");
// Create the crawler for the target site and tune it: maxDepth caps how far
// from the start URL links are followed, interval is the delay between
// requests (milliseconds, per the simplecrawler documentation).
var mycrawler = Crawler.crawl("http://www.example.com/");
mycrawler.maxDepth = 3;
mycrawler.interval = 500;

// Static-asset extensions that should not be fetched. The pattern is anchored
// to the end of the path so that ".jsp" or ".json" resources are not mistaken
// for ".js", and a directory such as "/foo.css/page" is not skipped.
var IGNORED_EXTENSION = /\.(css|jpg|pdf|docx|js|png|ico)$/i;

/**
 * Fetch condition: decide whether a discovered URL should be queued.
 *
 * @param {Object} parsedURL - URL descriptor supplied by the crawler; only
 *     its `path` string is inspected here.
 * @returns {boolean} false for paths ending in an ignored asset extension,
 *     true otherwise.
 */
mycrawler.addFetchCondition(function(parsedURL) {
  // Drop any query string or fragment so "/app.js?v=3" is still recognised
  // as a script, while "/page?file=a.js" is judged by "/page" alone.
  var pathname = parsedURL.path.split(/[?#]/)[0];
  return !IGNORED_EXTENSION.test(pathname);
});
/**
 * After each resource is fetched, open it in PhantomJS and save a PNG
 * screenshot named "item<path with slashes replaced by underscores>.png".
 *
 * @param {Object} queueItem - crawler queue entry; `url` is opened in the
 *     browser and `path` is used to build the output file name.
 */
mycrawler.on("fetchcomplete", function(queueItem) {
  // Hold the crawl open while the screenshot is taken; the function returned
  // by wait() is called once this item is done.
  var crawlwait = this.wait();

  phantom.create(function(ph) {
    // Resume the crawler and shut down this PhantomJS process. Must only run
    // once all page work has finished, otherwise the render is cut short.
    function finish() {
      crawlwait();
      ph.exit();
    }

    ph.createPage(function(page) {
      page.open(String(queueItem.url), function(status) {
        console.log("opened " + queueItem.url, status);

        // A failed load has nothing useful to capture; clean up and move on.
        if (status !== "success") {
          console.log("skipped rendering " + queueItem.path);
          finish();
          return;
        }

        console.log("rendering " + queueItem.path);
        var outputFile = "item" + queueItem.path.replace(/\//g, "_") + '.png';

        // render() is asynchronous in the callback-style phantom bridge, so
        // cleanup waits for its completion callback instead of running
        // immediately after the call is issued.
        page.render(outputFile, function() {
          console.log("finished rendering " + queueItem.path);
          finish();
        });
      });
    });
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment