Skip to content

Instantly share code, notes, and snippets.

@eliellis
Created December 31, 2012 01:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eliellis/4416752 to your computer and use it in GitHub Desktop.
Save eliellis/4416752 to your computer and use it in GitHub Desktop.
Screen scraper for Node.js that uses cheerio and request.
// Modules used by the scraper. All five names belong to one `var` declaration;
// previously a stray `;` after the async line ended the statement early and
// turned `cheerio` into an implicit global assignment.
var request = require('request'),
    http = require('http'),
    url = require('url'),
    async = require('async'),
    cheerio = require('cheerio');
/**
 * Reports whether the element bound as `this` has a `class` or `id`
 * attribute matching the given pattern.
 *
 * Reads the shared `$` selector function, so a document must have been
 * loaded into `$` before this is called.
 *
 * @param {string} string - Regular-expression source to look for; it is
 *     compiled as-is (not escaped).
 * @returns {boolean} true when the class or the id attribute matches.
 */
function testElement(string){
  // No 'g' flag: a global regex carries lastIndex state between test() calls.
  var tester = new RegExp(string);
  var element = $(this);

  var className = element.attr('class');
  // RegExp#test coerces a non-string argument to text, so an absent attribute
  // would be tested as the literal "undefined" and could match by accident.
  if (typeof className === 'string' && tester.test(className)){
    return true;
  }

  var id = element.attr('id');
  return typeof id === 'string' && tester.test(id);
}
/**
 * Rough readability-style body-text extractor.
 *
 * Collects every <div> whose class or id matches one of a few content-related
 * keywords, strips scripts and image-classed descendants from those divs, then
 * scores each candidate by text length (doubled when it matches
 * "instapaper_body") and reports the text of the highest-scoring one.
 *
 * All of the work is synchronous, so `finished` is invoked before this
 * function returns.
 *
 * @param {string} context - Markup handed to cheerio.load().
 * @param {function(string)} finished - Called once with the extracted text,
 *     or with '' when no candidate div exists.
 */
function parseInnards(context, finished){
  // testElement() reads the shared `$`, so the parsed document has to be
  // published there rather than kept in a local variable.
  $ = cheerio.load(context);

  var keywords = ['body', 'content', 'text', 'article', 'blog', 'post'];
  var candidates = [];

  $('div').each(function(){
    var element = this;
    var looksLikeContent = keywords.some(function(keyword){
      return testElement.call(element, keyword);
    });
    if (looksLikeContent){
      candidates.push($(element));
    }
  });

  // Strip noise from every candidate before measuring any of them: divs can
  // be nested, so removing nodes inside one also changes its ancestors' text.
  candidates.forEach(function(candidate){
    candidate.find('[class~=image]').remove();
    candidate.find('script').remove();
  });

  // Linear scan for the heaviest candidate; on a tie the earliest one wins.
  var best = null;
  var bestWeight = -1;
  candidates.forEach(function(candidate){
    var multiplier = testElement.call(candidate, 'instapaper_body') ? 2 : 1;
    var weight = multiplier * candidate.text().length;
    if (weight > bestWeight){
      best = candidate;
      bestWeight = weight;
    }
  });

  // With no matching div there is nothing to read; report empty text instead
  // of dereferencing a missing result.
  finished(best ? best.text() : '');
}
/**
 * Sends `payload` as a 200 JSON response with the scraper's cache header.
 *
 * @param {Object} res - HTTP response being written.
 * @param {Object} payload - Value to serialise with JSON.stringify.
 */
function sendJson(res, payload){
  res.writeHead(200, {'Content-type': 'application/json', 'Cache-Control': 'private, max-age=3600'});
  res.end(JSON.stringify(payload));
}

/**
 * HTTP handler: fetches the page named by the `url` query parameter and
 * responds with a JSON summary (title, Open Graph meta values, image sources).
 * When the `content` query parameter is exactly 'true', the extracted body
 * text from parseInnards() is included as `document.content`.
 *
 * Responds 400 when `url` is missing, malformed, or not an absolute http(s)
 * URL, and 500 when the upstream fetch or the text extraction fails.
 *
 * @param {Object} req - Incoming HTTP request; only `req.url` is read.
 * @param {Object} res - HTTP response to write to.
 */
function handleRequest(req, res){
  var userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.101 Safari/537.11';
  var requested = url.parse(req.url, true);
  var rawTarget = requested.query.url;
  var query = null;

  // A repeated `url` parameter parses to an array and a missing one to
  // undefined; only a single string value is usable.
  if (typeof rawTarget === 'string'){
    try {
      query = url.parse(decodeURI(rawTarget), true);
    } catch (decodeError) {
      // decodeURI throws URIError on a malformed percent escape; treat that
      // as bad input (reported as 400 below) instead of crashing the server.
      query = null;
    }
  }

  if (!query || !query.host || (query.protocol !== 'http:' && query.protocol !== 'https:')){
    res.writeHead(400);
    res.end('Missing or invalid "url" query parameter; expected an absolute http(s) URL.');
    return;
  }

  // `headers` must be an object keyed by header name, not a list of
  // "Name: value" strings.
  request({uri: query.href, headers: {'User-Agent': userAgent}}, function(error, response, body){
    if (error){
      console.warn(error);
      res.writeHead(500);
      res.end(error.message);
      return;
    }

    // Local to this response so concurrent requests cannot swap documents
    // underneath each other.
    var $ = cheerio.load(body);
    var page = {
      "url": query,
      "document": {
        title: $('head>title').text(),
        meta: {
          title: $('meta[property="og:title"]').attr("content"),
          author: $('meta[property="og:author"]').attr("content"),
          description: $('meta[property="og:description"]').attr("content"),
          url: $('meta[property="og:url"]').attr("content"),
          type: $('meta[property="og:type"]').attr("content"),
          image: $('meta[property="og:image"]').attr("content")
        },
        "content": undefined,
        "images": []
      }
    };

    // Collect each distinct image source once, in document order.
    var seen = Object.create(null);
    $('img').each(function(){
      var src = $(this).attr('src');
      // An <img> without a src attribute has nothing to report.
      if (typeof src === 'string' && !seen[src]){
        seen[src] = true;
        page.document.images.push(src);
      }
    });

    if (requested.query.content !== 'true'){
      sendJson(res, page);
      return;
    }

    try {
      parseInnards(body, function(text){
        page.document.content = text;
        sendJson(res, page);
      });
    } catch (extractError) {
      // A failure while extracting one page should fail this request only,
      // not take the whole process down with an uncaught exception.
      console.warn(extractError);
      if (!res.headersSent){
        res.writeHead(500);
        res.end(extractError.message);
      }
    }
  });
}
// Create the HTTP server with handleRequest as its request listener and
// start it listening on port 1337.
var server = http.createServer(handleRequest);
server.listen(1337);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment