streaming-crawler.js: streams article links from the first X pages of Hacker News
var request = require('request');
var select = require('html-select');
var tokenize = require('html-tokenize');

var counter = 0;

function crawl(page, stopAt, cb) {
  counter++;

  // Emit a callback for every '.title a' element as it streams past.
  var selectStream = select('.title a', function (e) {
    var text = '';
    var href = '';
    e.createReadStream().on('data', function (row) {
      // Each row is a [type, buffer] pair from html-tokenize.
      if (row[0] === 'text') { text = row[1].toString(); }
      if (row[0] === 'open') {
        // Crude attribute extraction: strip the tag markup around the href.
        href = row[1].toString().replace('<a href="', '');
        href = href.replace('">', '');
      }
    }).on('end', function () {
      // Ignore rel="nofollow" links that also match the selector.
      if (/rel="nofollow/.test(href)) {
        return;
      }
      console.log({
        href: href,
        text: text
      });
    });
  }).on('end', function () {
    // Once this page is fully parsed, move on to the next one.
    if (page < stopAt) {
      crawl(page + 1, stopAt, cb);
    } else {
      cb();
    }
  });

  request('https://news.ycombinator.com/?p=' + page)
    .pipe(tokenize())
    .pipe(selectStream);

  // resume() is necessary to put selectStream into flowing mode,
  // since we never consume its output ourselves.
  selectStream.resume();
}

crawl(1, 5, function () {
  console.log('pages: ', counter);
  console.log('=== THE END ===');
});
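The original gist also required cheerio but never used it. For comparison, here is a minimal, hypothetical sketch of the same scrape done the buffered way with cheerio: the whole page is held in memory and queried like a DOM, trading the streaming pipeline above for simpler attribute access.

var request = require('request');
var cheerio = require('cheerio');

request('https://news.ycombinator.com/?p=1', function (err, res, body) {
  if (err) { return console.error(err); }
  var $ = cheerio.load(body);
  // Same '.title a' selector, but against a fully buffered document.
  $('.title a').each(function () {
    var a = $(this);
    console.log({ href: a.attr('href'), text: a.text() });
  });
});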