Skip to content

Instantly share code, notes, and snippets.

@creationix
Created August 23, 2010 05:12
Show Gist options
  • Save creationix/544831 to your computer and use it in GitHub Desktop.
Save creationix/544831 to your computer and use it in GitHub Desktop.
Simple scraper tool
var http = require('http'),
Url = require('url'),
htmlparser = require('node-htmlparser'),
dom = require('jsdom/level1/core').dom.level1.core,
index = require('jsdom/browser/index'),
sizzleInit = require('sizzle').sizzleInit;
// Simple wrapper around node's http client, htmlparser, jsdom, and sizzle.
//
// Issues a plain-HTTP GET for `url`, buffers the whole response body as
// UTF-8, loads it into a fresh jsdom window and hands back a Sizzle selector
// function bound to that window's document.
//
//   url      - absolute http:// URL to download (port defaults to 80).
//   callback - function (err, Sizzle); called at most once, either with an
//              error as the first argument or with (null, Sizzle).
function fetch(url, callback) {
  var uri = Url.parse(url);

  // Several error sources below can fire for a single failure (and an error
  // may be followed by 'end'), so make sure the caller only hears back once.
  var finished = false;
  function done(err, Sizzle) {
    if (finished) return;
    finished = true;
    callback(err, Sizzle);
  }

  // Connect using the bare hostname: `uri.host` includes ":port" when the URL
  // names one explicitly, which is not a resolvable host name.
  var server = http.createClient(uri.port || 80, uri.hostname);
  // NOTE(review): the legacy client API appears to surface connection
  // failures on the client object as well as on the request; listen on both.
  server.on('error', done);

  // Keep the query string; `pathname` alone silently drops "?a=b".
  var path = (uri.pathname || '/') + (uri.search || '');
  var request = server.request('GET', path, {
    // The Host header, unlike the connection target, does carry the port.
    Host: uri.host
  });
  request.on('error', done);
  request.on('response', function (response) {
    response.setEncoding('utf8');
    var body = "";
    response.on('data', function (chunk) {
      body += chunk;
    });
    response.on('error', done);
    response.on('end', function () {
      var Sizzle;
      // Building the DOM can throw on unexpected markup; report that through
      // the callback instead of throwing out of an event handler. The
      // callback itself is invoked outside the try so its own exceptions are
      // not mistaken for parse failures.
      try {
        var browser = index.windowAugmentation(dom, {parser: htmlparser});
        browser.document.body.innerHTML = body;
        Sizzle = sizzleInit(browser, browser.document);
      } catch (err) {
        return done(err);
      }
      done(null, Sizzle);
    });
  });
  request.end();
}
// Grabs the main sections from the index page.
//
// Downloads http://scriptures.lds.org/ and, for every anchor whose href ends
// in "/contents", records the trimmed innerHTML of the ".smallcaps" elements
// found inside it (the last one wins when an anchor holds several).
//
//   callback - function (err, results); `results` maps href -> title text.
function getBooks(callback) {
  var contentsHref = /\/contents$/;

  fetch("http://scriptures.lds.org/", function (err, Sizzle) {
    if (err) {
      return callback(err);
    }

    var titlesByHref = {};

    // Only links that lead to a book's table of contents are of interest.
    var contentsLinks = Sizzle("a").filter(function (anchor) {
      return contentsHref.test(anchor.href);
    });

    contentsLinks.forEach(function (anchor) {
      var target = anchor.href;
      var labels = Sizzle(".smallcaps", anchor);
      labels.forEach(function (label) {
        titlesByHref[target] = label.innerHTML.trim();
      });
    });

    callback(null, titlesByHref);
  });
}
// Script entry point: scrape the index page and dump the href -> title map
// to stdout; any failure is rethrown so the process exits with a stack trace.
getBooks(function reportBooks(failure, sections) {
  if (failure) {
    throw failure;
  }
  console.dir(sections);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment