Skip to content

Instantly share code, notes, and snippets.

@sandywu
Forked from cmoore4/gist:998126
Created March 6, 2012 06:03
Show Gist options
  • Save sandywu/1983976 to your computer and use it in GitHub Desktop.
Save sandywu/1983976 to your computer and use it in GitHub Desktop.
Node.io Scraper
// This is the library that'll handle all of our input tracking and job dispatching
var nodeio = require('node.io');
// Root of the crawl: the first page handed to the job.
var base_url = 'http://reddit.com';
// Every URL discovered so far, seeded with the root URL. This same array is
// passed to the job as its input and is consulted to avoid queueing a URL twice.
var links = [base_url];
// Meant to record pages that have already been scraped; also consulted when
// de-duplicating newly discovered URLs.
var crawled_links = [];
// Running total of pages processed without error; reported when the job completes.
var count = 0;
// Job definition handed to node.io: the input queue, the per-link worker and
// the completion hook.
var methods = {
    // Input takes an array whose entries are fed one at a time to "run".
    input: links,

    // Fetches one page, queues every previously unseen <a href> it contains and
    // emits when the page has been processed.
    //
    // link - URL of the page to scrape (an entry from the input queue).
    //
    // Must be invoked with the node.io job as `this`: getHtml, add, emit, retry
    // and skip are all called on it.
    run: function(link){
        // Captured because `this` inside the getHtml callback is not
        // guaranteed to be the job.
        var self = this;
        self.getHtml(link, function(err, $, data, headers){
            if (err){
                console.log('Error scraping page ' + link);
                console.log(err);
                // "retry" puts the link back in the input queue; anything
                // other than a timeout is treated as permanent and skipped.
                if (err.toString() === 'timeout'){
                    self.retry();
                } else {
                    self.skip();
                }
                return;
            }

            // Record the page as scraped so the de-duplication check below
            // actually has something to look at.
            crawled_links.push(link);

            // Declared locally: a shared global here would leak between runs
            // and throws under strict mode.
            var $anchors = $('a[href]');
            if ($anchors.length){
                // NOTE(review): the iterator name and its (element, index)
                // argument order are kept exactly as found; with jsdom/jQuery
                // the equivalent is .each(index, element) — confirm against
                // the node.io version in use.
                $anchors.foreach(function($el, idx){
                    var url = $el.attr('href');
                    if (links.indexOf(url) === -1 && crawled_links.indexOf(url) === -1){
                        // Track the url locally, then hand it to node.io via
                        // self.add() so it is scheduled as new input.
                        links.push(url);
                        self.add(url);
                        console.log(link + ': Added ' + url);
                    }
                });
            }

            // Emit on the captured job, not on the callback's `this`.
            self.emit();
            count += 1;
        });
    },

    // Called once the job is done; logs how many pages were read.
    //
    // callback - function supplied by node.io that must be invoked to finish.
    complete: function(callback){
        console.log('Read in ' + count + ' pages.');
        // Node.io requires this call even though there is nothing to clean up.
        callback();
    }
};
// Here's the actual call to start the Node.io job:
exports.job = new nodeio.Job({
timeout: 12, // 12 secodn timeout per thread
jsdom: true, // use jQuery instead of htmlParser
max: 5 //5 threads in this job
}, methods); // methods is our big object above.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment