Based on the fantastic node-crawler plugin (https://github.com/sylvinus/node-crawler).

Requires: npm install crawler

Usage: node simple-crawler.js URL EXTENSIONS
Example: node simple-crawler.js http://www.omgubuntu.co.uk/ jpg,png

Finds links on every page and subpage of URL and generates a URL list you can copy to a file and download with wget.
var Crawler = require("crawler").Crawler;

// process.argv = ['node', 'simple-crawler.js', URL, EXT]
if (process.argv.length < 4) {
  console.log('Usage: ' + process.argv[0] + ' ' + process.argv[1] + ' URL EXT');
  console.log('Example: ' + process.argv[0] + ' ' + process.argv[1] + ' http://www.omgubuntu.co.uk/ jpg,png');
  process.exit(1);
}

var url = process.argv[2],
    ext = process.argv[3];

// Convert the comma-separated extension list to a regex:
// png,jpg ==> /\.png$|\.jpg$/
var regexExt = new RegExp('\\.' + ext.split(',').join('$|\\.') + '$');

// URLs already printed, so each match is reported only once
var found = [];
// Pages already queued, so the crawler does not revisit them endlessly
var visited = [url];

var c = new Crawler({
  "maxConnections": 4,
  // This will be called for each crawled page
  "callback": function (error, result, $) {
    // $ is a jQuery instance scoped to the server-side DOM of the page;
    // it is missing when the page could not be fetched or parsed
    if ($) {
      $("a").each(function (index, a) {
        // Skip anchors without an href attribute
        if (!a.href) return;
        // Print every link whose extension matches, once
        if (regexExt.test(a.href) && found.indexOf(a.href) < 0) {
          console.log(a.href);
          found.push(a.href);
        }
        // Don't go outside of the base url, and don't queue a page twice
        if (a.href.indexOf(url) === 0 && visited.indexOf(a.href) < 0) {
          visited.push(a.href);
          c.queue(a.href);
        }
      });
    }
  }
});

c.queue(url);
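To download everything the crawler finds, redirect its output to a file and hand that file to wget, as the description suggests. A minimal sketch, assuming the script is saved as simple-crawler.js (the file name used above) and images.txt is an arbitrary output file name:

    node simple-crawler.js http://www.omgubuntu.co.uk/ jpg,png > images.txt
    wget -i images.txt

wget's -i flag reads the list of URLs to fetch from the given file, so the crawl and the download stay as two separate, repeatable steps.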