Skip to content

Instantly share code, notes, and snippets.

@PaulGuo
Created June 28, 2015 13:57
Show Gist options
  • Save PaulGuo/66c8a588d7104699e6c4 to your computer and use it in GitHub Desktop.
Save PaulGuo/66c8a588d7104699e6c4 to your computer and use it in GitHub Desktop.
Spider for ONE
var Crawler = require("crawler");
var url = require('url');
var fs = require('fs');
// Scraped quote text keyed by volume number; serialized to ./contents.json
// once the crawl has finished.
var contents = {};
// Page URLs to crawl; populated and handed to the crawler in a single batch.
var queue = [];

// Crawler that extracts the quote from each fetched page and, once every
// queued request has been processed, dumps the collected quotes to disk.
var c = new Crawler({
    maxConnections: 10,
    forceUTF8: true,
    // Called once for each crawled page. `$` is a Cheerio handle for the
    // fetched document (a lean server-side implementation of core jQuery).
    callback: function (error, result, $) {
        // On a failed request there is no parsed document to query, so bail
        // out instead of throwing on an undefined `$`.
        if (error || !$) {
            console.error('request failed:', error);
            return;
        }

        var content = $('.one-cita-wrapper .one-cita').text().trim();
        // The volume number is whatever follows the first '.' in the title
        // (presumably "VOL.<n>" -- confirm against the live markup).
        var vol = parseInt($('.one-titulo').text().trim().split('.')[1], 10);

        // Pages without a parsable title would otherwise all be stored under
        // the bogus key "NaN", overwriting one another.
        if (isNaN(vol)) {
            console.error('skipped a page without a volume number in its title.');
            return;
        }

        contents[vol] = content;
        console.log('vol.%s done.', vol);
    },
    // Called when the crawler has no work left.
    onDrain: function () {
        var exitCode = 0;
        try {
            // Must be synchronous: process.exit() below terminates the process
            // immediately and would abandon a pending asynchronous write.
            fs.writeFileSync('./contents.json', JSON.stringify(contents));
        } catch (writeError) {
            console.error('could not write ./contents.json:', writeError);
            exitCode = 1;
        }
        process.exit(exitCode);
    }
});
// Collect the URL of every volume page, vol.1 through vol.992, then submit
// the whole list to the crawler at once (each page uses the default callback).
// To fetch a single volume instead, queue its URL directly, e.g.
// c.queue('http://wufazhuce.com/one/vol.992').
var i = 0;
while (i < 992) {
    i += 1;
    queue.push('http://wufazhuce.com/one/vol.' + i);
}
// Leave the counter one past the last volume, as a counting loop would.
i += 1;
c.queue(queue);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment