Skip to content

Instantly share code, notes, and snippets.

@boogheta
Created December 9, 2014 16:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save boogheta/19c7380cdb6124684bf2 to your computer and use it in GitHub Desktop.
Save boogheta/19c7380cdb6124684bf2 to your computer and use it in GitHub Desktop.
Tryouts with sandcrawler on Libération's articles tagged with "sexe"
var sandcrawler = require("sandcrawler"),
//artoo = require("sandcrawler/node_modules/artoo-js"),
artoo = require("artoo-js"),
logger = require("sandcrawler-logger"),
fs = require("fs"),
data = [];
// Throttling plugin factory.
//
// `opts` is used as a number of seconds: the wait before a request is
// (opts + a random fraction of a second), converted to milliseconds.
// Returns a plugin function that registers a `beforeScraping` hook on the
// scraper it receives.
function throttle(opts) {
  return function(target) {
    target.beforeScraping(function(req, next) {
      // A falsy req.index (e.g. 0) goes through without any pause.
      if (!req.index) {
        next();
        return;
      }

      console.log("throttling now");
      var delayMs = (Math.random() + opts) * 1000;
      return setTimeout(next, delayMs);
    });
  };
}
// Scraper for the paginated listing at next.liberation.fr/sexe,100124.
// Every page yields the articles found in ".day li" plus the address of the
// following page; the articles are accumulated in the module-level `data`.
var scraper = sandcrawler.scraper("diego aime le sexe")
  .use(logger({color: "red"}))
  .url("http://next.liberation.fr/sexe,100124")
  // Pagination: hand back the next page URL taken from the previous result,
  // or false when the page exposed none.
  .iterate(function(i, req, res) {
    return res.data.next || false;
  })
  .use(throttle(2))
  .limit(3)
  // NOTE(review): this function relies on `$` and `.scrape()` being provided
  // by the scraping environment (presumably jQuery + artoo injected in the
  // page) -- it must not reference variables from this file.
  .jawascript(function(done) {
    // A jQuery collection is truthy even when it is empty, so test the first
    // matched element instead of the collection itself; otherwise a page
    // without a "next" link throws on `undefined.href`.
    var nextLink = $(".next a")[0];

    done({
      next: nextLink ? nextLink.href : null,
      data: $(".day li").scrape({
        title: {
          sel: "h2 a"
        },
        url: {
          sel: "h2 a",
          method: function() {
            var href = $(this).attr("href");
            // No anchor / no href attribute: nothing to build a URL from.
            if (!href) {
              return null;
            }
            // Site-relative links are made absolute.
            return href[0] === "/" ? "http://next.liberation.fr" + href : href;
          }
        },
        date: {
          sel: "time",
          attr: "datetime"
        },
        description: {
          sel: "h3",
          method: "text"
        },
        author: {
          sel: ".authorname",
          method: function() {
            // Collapse line breaks and runs of whitespace into single spaces.
            return $(this).text().trim().replace(/\s+/g, ' ');
          }
        }
      })
    });
  })
  .result(function(err, req, res) {
    // A failed page carries no usable payload: report it and keep what has
    // been collected so far instead of throwing on `res.data.data`.
    if (err || !res || !res.data) {
      console.log("page skipped, no data scraped", err);
      return;
    }
    data = data.concat(res.data.data);
    this.logger.info(data.length, "articles so far");
  });
// Run the scraper, then dump everything accumulated in `data` to disk as
// JSON and CSV. The dumps are written whether or not the run reported an
// error; the error (with `remains`) is only logged afterwards.
sandcrawler.run(scraper, function(err, remains) {
  var jsonDump = JSON.stringify(data, null, 2);
  fs.writeFileSync("articles-libe.json", jsonDump);

  // Column order of the CSV export.
  var csvColumns = ["date", "url", "title", "author", "description"];
  var csvDump = artoo.helpers.toCSVString(data, {order: csvColumns});
  fs.writeFileSync("articles-libe.csv", csvDump);

  if (!err) {
    return;
  }
  console.log("ca a chié coco!", err, remains);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment