// Scrape millions of HTML files in a folder structure
// Dependencies: cheerio parses HTML, folder-walker produces a stream of
// directory entries, parallel-transform applies an async function to a stream,
// and ndjson serializes objects as newline-delimited JSON.
var $ = require('cheerio')
var fs = require('fs')
var walker = require('folder-walker')
var transform = require('parallel-transform')
var ndjson = require('ndjson')

// Stream of entries found under ./pageblobs.
var walk = walker('./pageblobs') // generated by abstract-blob-store
// Pass every entry through `scrape`; the 10 is parallel-transform's
// concurrency limit.
var scraper = transform(10, scrape)
// Serializes each object emitted by the scraper before it is written out.
var out = ndjson.serialize()

// entries -> scraped link objects -> NDJSON text -> stdout
walk.pipe(scraper).pipe(out).pipe(process.stdout)
/**
 * Read one walked entry from disk and collect the `href` attribute of every
 * element matching `.some-class`.
 *
 * @param {{type: string, filepath: string}} entry - Entry from the walker.
 *   Entries whose `type` is 'directory' are skipped.
 * @param {function(Error=, {links: string[]}=)} cb - Node-style callback.
 *   Called with no arguments for directories, with the read error if the file
 *   cannot be read, otherwise with `null` and `{links: [...]}`.
 */
function scrape (entry, cb) {
  // Directories carry no HTML; calling back with nothing emits no result.
  if (entry.type === 'directory') return cb()
  var file = entry.filepath
  fs.readFile(file, function (err, buff) {
    if (err) return cb(err)
    var htmlString = buff.toString()
    var parsedHTML = $.load(htmlString)
    var links = []
    // `each` is the side-effect iterator; `map` would build a result set that
    // is never used.
    parsedHTML('.some-class').each(function (i, foo) {
      // Wrap the raw element with the loaded document's own selector function
      // (same pattern as jQuery) rather than the module-level export.
      var href = parsedHTML(foo).attr('href')
      // Elements without an href yield undefined, which would be written out
      // as `null`; leave them out.
      if (href !== undefined) links.push(href)
    })
    cb(null, {links: links})
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment