sibnerian/highland_batched_scraper.js

## highland_batched_scraper.js
var _ = require('highland');
var cheerio = require('cheerio');
var request = require('request');

// Thin adapter over the `request` library: fetches `url` and reports back
// through a two-argument Node.js-style callback, cb(err, body). The response
// object that `request` also hands over is intentionally not forwarded.
var req = function (url, cb) {
  var onResponse = function (error, response, body) {
    cb(error, body);
  };
  request(url, onResponse);
};

// Demonstration delay: hands `data` back untouched via cb(null, data) once
// 3000 ms have elapsed. The pipeline below uses it to pause on every batch.
var waititout = function (data, cb) {
  var PAUSE_MS = 3000;
  var release = function () {
    cb(null, data);
  };
  setTimeout(release, PAUSE_MS);
};

// Build a Highland stream over the pages whose titles we want...
_([
  'https://github.com/isibner',
  'https://github.com/cheeriojs/cheerio',
  'https://github.com/caolan/highland/issues/246',
  'http://highlandjs.org/'
])
  // ...group the URLs two at a time and send each length-2 array down the pipeline...
  .batch(2)
  // ...hold every array for 3 seconds (purely for demonstration purposes)...
  .map(_.wrapCallback(waititout))
  // ...unpack the arrays again so each URL travels on its own...
  .flatten()
  // ...GET the URL; the response body is what continues down the pipeline...
  .map(_.wrapCallback(req))
  // ...work through those requests one at a time (a hint Highland needs)...
  .series()
  // ...pull the <title> text out of each page with Cheerio...
  .map(function (html) {
    return cheerio.load(html)('title').text();
  })
  // ...and finally log it.
  .each(_.log);
	var _ = require('highland');
	var cheerio = require('cheerio');
	var request = require('request');

	// Thin adapter over the `request` library: fetches `url` and reports back
	// through a two-argument Node.js-style callback, cb(err, body). The response
	// object that `request` also hands over is intentionally not forwarded.
	var req = function (url, cb) {
		var onResponse = function (error, response, body) {
			cb(error, body);
		};
		request(url, onResponse);
	};

	// Demonstration delay: hands `data` back untouched via cb(null, data) once
	// 3000 ms have elapsed. The pipeline below uses it to pause on every batch.
	var waititout = function (data, cb) {
		var PAUSE_MS = 3000;
		var release = function () {
			cb(null, data);
		};
		setTimeout(release, PAUSE_MS);
	};

	// Build a Highland stream over the pages whose titles we want...
	_([
		'https://github.com/isibner',
		'https://github.com/cheeriojs/cheerio',
		'https://github.com/caolan/highland/issues/246',
		'http://highlandjs.org/'
	])
		// ...group the URLs two at a time and send each length-2 array down the pipeline...
		.batch(2)
		// ...hold every array for 3 seconds (purely for demonstration purposes)...
		.map(_.wrapCallback(waititout))
		// ...unpack the arrays again so each URL travels on its own...
		.flatten()
		// ...GET the URL; the response body is what continues down the pipeline...
		.map(_.wrapCallback(req))
		// ...work through those requests one at a time (a hint Highland needs)...
		.series()
		// ...pull the <title> text out of each page with Cheerio...
		.map(function (html) {
			return cheerio.load(html)('title').text();
		})
		// ...and finally log it.
		.each(_.log);