Skip to content

Instantly share code, notes, and snippets.

@DenisMir
Created March 5, 2011 16:45
Show Gist options
  • Save DenisMir/3f9c45f72d1e95479498 to your computer and use it in GitHub Desktop.
Save DenisMir/3f9c45f72d1e95479498 to your computer and use it in GitHub Desktop.
sample scraping
// Dependencies: express provides the HTTP server, jjw is the scraping helper
// called by the /scrape route.
var express = require('express');
var jjw = require('jjw');

// Base search URL; the URL-encoded query is appended to it.
var GOOGLE_SEARCH_URI = "http://www.google.de/search?q=";

// The express (node) server every route and setting below is attached to.
var app = express.createServer();

// Body decoding is registered first; the /scrape route reads req.body.
app.use(express.bodyDecoder());

// Views: ".html" templates handled by jqtpl, rendered without a layout.
app.set("view engine", "html");
app.set("view options", {layout: false});
app.register(".html", require("jqtpl"));

// Static assets are served out of ./public next to this script.
app.configure(function () {
    var publicDir = __dirname + '/public';
    app.use(express.staticProvider(publicDir));
});
// Route handling all HTTP methods on /scrape.
//  - When the request carries a non-empty string `q` in its decoded body, the
//    search URI is built from it, scraped via jjw, and the "scrape" view is
//    rendered with the scrape result as locals.
//  - Otherwise (no body, or no usable `q`) the "scrape" view is rendered
//    without locals.
//  - A scraping error is forwarded to Express through next(err).
app.all('/scrape', function (req, res, next) {
    var query = req.body && req.body.q;
    var uri;

    // Without this guard a missing `q` would be encoded as the literal text
    // "undefined" and sent off as a search.
    if (typeof query !== "string" || query === "") {
        res.render("scrape");
        return;
    }

    uri = GOOGLE_SEARCH_URI + encodeURIComponent(query);
    console.log("scraping the uri: " + uri);

    jjw(uri, scrape, function (err, scrapeRes) {
        if (err) {
            // This callback runs asynchronously: throwing here would surface as
            // an uncaught exception instead of an error response, so hand the
            // error to Express' error handling.
            next(err);
            return;
        }
        console.log("scraping done...");
        res.render("scrape", {locals: scrapeRes});
    });
});
// scrape related
/**
 * Scraper definition passed to jjw by the /scrape route. Each property is a
 * function that receives the page's selector function and returns the data
 * extracted under that property's name.
 */
var scrape = {
    /**
     * Collects the search hits found in the page.
     *
     * @param {Function} $ jQuery-style selector function for the fetched page
     *     (presumably supplied by jjw -- confirm against its documentation).
     * @returns {Array} One {text, link, desc} object per "li.g" element that
     *     has a link href, a link text and a description; elements missing
     *     any of the three are skipped.
     */
    results: function ($) {
        var res = [];
        $("li.g").each(function () {
            var item = $(this), // wrap the current element once and reuse it
                linkNode, descNode, link, text, desc;

            // Strip the span.f nodes up front so their text does not end up
            // in the description read below.
            item.find("span.f").remove();

            linkNode = item.find("a.l");
            descNode = item.find("div.s");
            link = linkNode.attr("href");
            text = linkNode.text();
            // No second remove("span.f") here: remove(selector) filters the
            // matched div.s set itself rather than its descendants, so it was
            // a no-op, and the span.f nodes are already gone.
            desc = descNode.text();

            if (link && text && desc) {
                res.push({text: text, link: link, desc: desc});
            }
        });
        return res;
    }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment