Skip to content

Instantly share code, notes, and snippets.

@RadNi
Last active January 31, 2019 10:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RadNi/82f5bb9c33c80aaee0be52a79f8b6454 to your computer and use it in GitHub Desktop.
Save RadNi/82f5bb9c33c80aaee0be52a79f8b6454 to your computer and use it in GitHub Desktop.
It is a simple web crawler implemented with the Node.js `crawler` package.
//require('events').EventEmitter.defaultMaxListeners = 0
var Crawler = require("crawler");
var baseURLS = ['http://www.google.com/','http://www.yahoo.com', 'http://www.amazon.com', 'http://www.sharif.ir/home'];
var depth = 3
var urls = new Set()
var c = new Crawler({
maxConnections : 100000,
retries: 2,
skipDuplicates: true,
// preRequest: function(options, done) {
//
// // console.log("This request: ", options.uri)
// done();
// },
callback : function (error, res, done) {
if(error){
console.log(error);
done()
}else {
var $ = res.$;
console.log(res.request.uri.href + " " + urls.size);
if($) {
var tags = $("a");
// console.log("inja" + " " + res.body)
// console.log($)
for (var a = 0; a < tags.length; a++) {
// console.log(res.request.uri.href+ " " + tags.length)
if (tags[a].attribs.href) {
// console.log(res.request.uri.href)
if (tags[a].attribs.href.startsWith("www") || tags[a].attribs.href.startsWith("http") || tags[a].attribs.href.startsWith("https")) {
// console.log(res.request.uri.href)
urls.add(tags[a].attribs.href)
}
}
}
}
else {
console.log("hereee")
}
done();
}
}
});
c.queue(baseURLS);
// c.queue("http://www.sharif.ir/home");
c.on('drain',function(){
depth--;
if (depth<=0) {
console.log(urls);
console.log("size: ", urls.size)
}
else {
console.log("Draiinnnnnnnnnnn, "+ depth);
// c.queue("http://www.sharif.ir/home")
c.queue(Array.from(urls))
}
});
@RadNi
Copy link
Author

RadNi commented Jan 31, 2019

To test, initialize a new Node project, run `npm install crawler`, then run `node crawler-project.js`.
`baseURLS` should contain the start points you want to crawl from.
The `depth` variable sets how many levels deep to crawl from the start points.

@RadNi
Copy link
Author

RadNi commented Jan 31, 2019

Sometimes the JavaScript heap runs out of memory; a naive workaround is to pass this option to Node:
--max_old_space_size=2000000

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment