A package crawler for the npm registry website, built on three modules (request, cheerio, and async) and inspired by node.io.
To seed the crawler, the site's main page is enough, because its "recently updated packages" section changes quite often.
/*
 * npm registry package crawler.
 *
 * Fetches a page, collects every anchor whose href starts with "/package/",
 * and queues the resulting absolute URLs for fetching in turn. Work is
 * scheduled through an async.queue so that only a bounded number of requests
 * are in flight at any time.
 */
var request = require('request'),
    cheerio = require('cheerio'),
    async = require('async');

var base = 'https://www.npmjs.org', // origin that relative package links are appended to; also the seed page
    concurrency = 2,                // number of queue workers running in parallel
    delay = 1000,                   // milliseconds each worker waits before issuing its request
    seen = Object.create(null);     // URLs already queued; prototype-less so any URL string is a safe key

/**
 * Queue worker: waits `delay` ms, fetches `task.url`, and enqueues every
 * not-yet-seen package link found in the response body.
 *
 * Failures are logged and then skipped so one bad page does not stop the
 * crawl; `next()` is always called exactly once per task.
 *
 * @param {{url: string}} task - The page to fetch.
 * @param {Function} next - async.queue completion callback.
 */
var q = async.queue(function(task, next) {
    // The delay is taken while holding the worker slot, so it throttles the
    // overall request rate to roughly `concurrency` requests per `delay` ms.
    setTimeout(function() {
        console.log('GET ' + task.url);
        request(task.url, function(error, response, body) {
            if (error) {
                console.error('ERROR ' + task.url + ': ' + error.message);
                return next();
            }
            if (response.statusCode !== 200) {
                console.error('SKIP ' + task.url + ' (HTTP ' + response.statusCode + ')');
                return next();
            }

            var $ = cheerio.load(body);
            $('a').each(function() {
                var href = $(this).attr('href');
                // Anchors without an href, or pointing anywhere other than a
                // package page, are not followed.
                if (!href || !/^\/package\//.test(href)) {
                    return;
                }
                var url = base + href;
                if (enqueue(url)) {
                    console.log('%s', url);
                    console.log(q.length());
                }
            });
            next();
        });
    }, delay);
}, concurrency);

/**
 * Push `url` onto the crawl queue unless it has been queued before.
 *
 * Without this check, pages that link to each other would be re-queued
 * forever and the queue would never drain.
 *
 * @param {string} url - Absolute URL to crawl.
 * @returns {boolean} true if the URL was newly queued, false if it was a duplicate.
 */
function enqueue(url) {
    if (seen[url]) {
        return false;
    }
    seen[url] = true;
    q.push({url: url});
    return true;
}

// Seed the crawler
enqueue(base);
A package crawler for the npm registry website, built on three modules (request, cheerio, and async) and inspired by node.io.
To seed the crawler, the site's main page is enough, because its "recently updated packages" section changes quite often.