Skip to content

Instantly share code, notes, and snippets.

@dominykas
Last active August 29, 2015 14:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dominykas/df640c4c71d9f9ab5112 to your computer and use it in GitHub Desktop.
Save dominykas/df640c4c71d9f9ab5112 to your computer and use it in GitHub Desktop.
Parsing crawler proxy with Hapi
var _ = require("lodash"),
Hapi = require("hapi"),
Nipple = require("nipple"),
jsdom = require("jsdom"),
Url = require("url");
/**
 * Page parsers keyed by hostname. Every parser is called with the page URL,
 * its DOM document and a node-style callback, and reports the extracted data
 * through that callback.
 */
var parsers = {
    // Generic fallback: the page title plus the body text with every run of
    // whitespace collapsed into a single space and the ends trimmed.
    "default": function (url, document, cb) {
        var pageTitle = document.title;
        var rawText = document.body.textContent;
        var collapsedText = rawText.replace(/\s+/g, " ").trim();

        cb(null, {
            title: pageTitle,
            text: collapsedText
        });
    }
};
/**
 * Builds a DOM from a downloaded page, strips non-text elements from its body
 * and hands the document to the parser registered for the page's domain
 * (falling back to parsers["default"]).
 *
 * @param {string} url - Address the page came from; its hostname, minus a
 *     leading "www.", selects the parser.
 * @param {Buffer|string} responseBody - Page content; converted via toString().
 * @param {function} cb - Node-style callback, called with (err) when the DOM
 *     cannot be built, otherwise with whatever the selected parser reports.
 */
function parse(url, responseBody, cb) {
    // hostname is null for relative URLs; treat that as "no specific domain".
    var domain = Url.parse(url).hostname || "";
    if (domain.indexOf("www.") === 0) {
        domain = domain.substr(4);
    }

    // Only own entries count: a host such as "constructor" must not resolve to
    // a member inherited from Object.prototype (it would be called as a parser
    // and never invoke the callback).
    var hasOwnParser = Object.prototype.hasOwnProperty.call(parsers, domain);
    var parser = (hasOwnParser && parsers[domain]) || parsers["default"];

    // NOTE(review): jsdom.env() given a bare string may sniff it as a URL or
    // file path when it does not look like markup - confirm for the installed
    // jsdom version.
    jsdom.env(responseBody.toString(), function (e, win) {
        if (e) {
            return cb(e);
        }

        // clean out non-text content tags
        var tags = ["script", "link", "img", "style", "meta", "iframe", "noscript"]; // iframe inside noscript??
        var body = win.document.body;
        var doomed = [];
        var i;

        if (body) {
            // Collect everything first: the collections may be live, so they
            // are copied before any node is detached.
            for (i = 0; i < tags.length; i++) {
                doomed = doomed.concat(Array.prototype.slice.call(body.getElementsByTagName(tags[i])));
            }
            for (i = 0; i < doomed.length; i++) {
                if (doomed[i].parentNode) {
                    doomed[i].parentNode.removeChild(doomed[i]);
                }
            }
        }

        parser(url, win.document, function (e, parsed) {
            // Release the window once the parser is done with the document so
            // a long-running server does not accumulate DOMs.
            if (typeof win.close === "function") {
                win.close();
            }
            cb(e, parsed);
        });
    });
}
// Create the Hapi server and give it a single connection on port 3128.
var server = new Hapi.Server({});
var connectionOptions = {port: 3128};
server.connection(connectionOptions);
/**
 * Server method "getUrl": downloads `url` with Nipple and yields the response
 * body as a string. Registered with a cache policy of 10 seconds.
 *
 * @param {string} url - Address to download.
 * @param {function} cb - Called with (err) on failure or (null, body) on success.
 */
server.method('getUrl', function (url, cb) {
    console.log("Downloading", url);
    Nipple.get(url, function (err, res, payload) {
        if (err) {
            return cb(err);
        }
        // Hand back text rather than the raw payload so the cached value is
        // JSON-safe. NOTE(review): this assumes the default memory cache stores
        // values as JSON (which would turn a Buffer into a plain object on a
        // cache hit) - confirm against the installed catbox-memory.
        cb(null, payload == null ? payload : payload.toString());
    });
}, {
    cache: {
        expiresIn: 10 * 1000
    }
});
/**
 * Catch-all route: treats the requested URL as the page to fetch, downloads it
 * through the "getUrl" server method, runs it through parse() and replies with
 * the parsed result (or the first error encountered).
 */
server.route({
    method: '*',
    path: '/{p*}',
    handler: function (req, reply) {
        var url = req.url.href;

        // Only absolute http(s) URLs can be downloaded; a plain path (a request
        // made directly to this server rather than through it as a proxy) has
        // no host to fetch from, so it is rejected up front.
        var target = Url.parse(url);
        var isHttp = target.protocol === "http:" || target.protocol === "https:";
        if (!isHttp || !target.hostname) {
            return reply({error: "Expected an absolute http(s) URL"}).code(400);
        }

        req.server.methods.getUrl(url, function (err, payload) {
            if (err) {
                return reply(err);
            }
            console.log("Parsing", url);
            parse(url, payload, function (err, parsed) {
                reply(err || parsed);
            });
        });
    }
});
// Start the server. If the start callback is given an error, report it and
// rethrow instead of announcing "Listening" for a server that never came up.
server.start(function (err) {
    if (err) {
        console.error("Failed to start server", err);
        throw err;
    }
    console.log("Listening");
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment