Skip to content

Instantly share code, notes, and snippets.

@RTLer
Created March 22, 2016 07:56
Show Gist options
  • Save RTLer/a9acf21f556a0de6b4e8 to your computer and use it in GitHub Desktop.
Save RTLer/a9acf21f556a0de6b4e8 to your computer and use it in GitHub Desktop.
node-spider site mapper
var Spider = require('node-spider');
// URLs that handleRequest has accepted and handed to the spider; written to
// ./files/results.json when the crawl finishes.
var list = [];
// URLs reported through the spider's `error` callback; written to ./files/errors.json.
var errors = [];
// One { url, title, links } record per processed page; written to ./files/map.json.
var map = [];
/**
 * Crawler instance shared by the whole script.
 *
 * The option meanings below follow the node-spider README; confirm against
 * the installed version of the library.
 */
var spider = new Spider({
    // Number of requests allowed in flight at once.
    concurrent: 10,
    // Pause between requests (0 = none).
    delay: 0,
    // Stream that receives the library's own log output.
    logs: process.stderr,
    // Let the library skip URLs it has already been asked to fetch.
    allowDuplicates: false,
    // Have the library catch exceptions thrown inside page handlers.
    catchErrors: true,
    // Remember which URL failed; the failing URLs are dumped in `done`.
    error: function(err, url) {
        errors.push(url);
    },
    // Called once the crawl is finished: dump the collected data as JSON.
    done: function() {
        var fs = require('fs');

        // Serialise `data` to `path`, reporting a failure instead of
        // claiming success when the write did not happen (for example
        // because the ./files directory does not exist).
        var writeJson = function(path, data) {
            fs.writeFile(path, JSON.stringify(data), function(err) {
                if (err) {
                    console.error('Could not write ' + path + ': ' + err.message);
                    return;
                }
                console.log("The file was saved!");
            });
        };

        writeJson("./files/results.json", list);
        writeJson("./files/errors.json", errors);
        writeJson("./files/map.json", map);
    },
    headers: { 'user-agent': 'node-spider' },
    encoding: 'utf8'
});
// Domain the crawl is restricted to; its sub-domains are in scope as well.
var ALLOWED_DOMAIN = 'site.com';

/**
 * Decide whether an absolute URL belongs to the site being mapped.
 *
 * Only the host name is inspected, so a URL that merely mentions the domain
 * in its path or query string (or a look-alike host such as "notsite.com")
 * is rejected, as are non-http(s) schemes like mailto: and javascript:.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} true when the URL is http(s) on ALLOWED_DOMAIN or a sub-domain.
 */
var isCrawlable = function(url) {
    var parsed;
    try {
        parsed = new URL(url);
    } catch (e) {
        // Unparseable input cannot be fetched, so it is simply out of scope.
        return false;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        return false;
    }
    var host = parsed.hostname;
    var suffix = '.' + ALLOWED_DOMAIN;
    return host === ALLOWED_DOMAIN || host.slice(-suffix.length) === suffix;
};

/**
 * Page handler passed to spider.queue: records the page in `map` and queues
 * every in-scope link that has not been seen yet.
 *
 * @param {Object} doc - Page object supplied by the spider. This handler uses
 *   `doc.url`, `doc.$(selector)` (a jQuery-style query function) and
 *   `doc.resolve(href)` to turn an href into an absolute URL.
 */
var handleRequest = function(doc) {
    var pageinfo = {
        url: doc.url,
        title: doc.$('title').first().text(),
        links: []
    };
    doc.$('a').each(function(i, elem) {
        var rawHref = elem.attribs.href;
        if (rawHref == null) {
            // Anchor without an href (e.g. a named anchor): nothing to follow.
            return;
        }
        // Drop the fragment so "#section" variants collapse to one URL.
        var url = doc.resolve(rawHref.split('#')[0]);
        // Every outgoing link is recorded, even ones that will not be crawled.
        pageinfo.links.push(url);
        if (list.indexOf(url) === -1 && isCrawlable(url)) {
            list.push(url);
            spider.queue(url, handleRequest);
        }
    });
    map.push(pageinfo);
};
// Seed the crawl with the site's home page; handleRequest queues further pages from its links.
spider.queue('https://www.site.com/', handleRequest);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment