Skip to content

Instantly share code, notes, and snippets.

@davidrenne
Forked from proudlygeek/artoo-imm.js
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidrenne/7ade3140556ca92228b7 to your computer and use it in GitHub Desktop.
Save davidrenne/7ade3140556ca92228b7 to your computer and use it in GitHub Desktop.
(function(artoo) {
// Artoo Spider for http://www.immobiliare.it/info/ufficio-stampa
/**
 * Scraper configuration shared by the spider.
 *
 * - iterator: selector of the element(s) handed to artoo.scrape.
 * - data: per-field extractors; function extractors are called with `this`
 *   set to the element being scraped.
 * - params: extra options forwarded to artoo.scrape.
 * - savePrettyJson: saves the first scraped record as "<id>.json".
 *
 * Every function extractor tolerates missing markup: instead of throwing a
 * TypeError on a page that lacks an element, it returns null (id) or
 * undefined (text fields), the same way `media` always did.
 */
var scrape = {
  iterator: '#wrapTestoStatiche',
  data: {
    // Numeric id taken from the PDF link, or from the media image URL when
    // the link is absent or is not an ImmoNews PDF. null when neither matches.
    'id': function() {
      var root = artoo.$(this);
      var href = root.find('.text-align_left a:last-child').attr('href');
      var match = href ? href.match(/ImmoNews\-(\d+)\.pdf$/) : null;
      if (!match) {
        var src = root.find('.media-det img').attr('src');
        match = src ? src.match(/\?id=(\d+)$/) : null;
      }
      return match ? match[1] : null;
    },
    'title': function() {
      return trimIfString(artoo.$(this).find('.title').html());
    },
    'date': { sel: '.data' },
    'source': function() {
      // Work on a clone so removing the date node does not alter the page.
      var html = artoo.$(this).find('.text-align_left').clone();
      html.find('.data').remove();
      // The value is read from the fifth line of the remaining text; shorter
      // text yields undefined instead of an exception.
      return trimIfString(html.text().split("\n")[4]);
    },
    'pdf_href': { sel: 'a:last-child', attr: 'href' },
    'media': function() {
      return trimIfString(artoo.$(this).find('.media-det').html());
    },
    'txt': function() {
      // Drop the media block from a clone, then keep the last div's markup.
      var clone = artoo.$(this).find('.dettaglio-comunicato').clone();
      clone.find('.media-det').remove();
      return trimIfString(clone.find('div:last-child').html());
    }
  },
  params: {
    limit: 1
  },
  // Saves obj[0] under a file name derived from its id. Does nothing when
  // the scrape produced no record, so one empty page cannot abort the run.
  savePrettyJson: function(obj) {
    if (!obj || !obj.length) {
      return;
    }
    artoo.savePrettyJson(obj[0], {filename: obj[0].id + '.json'});
  }
};

// Trims string values; passes anything else (e.g. undefined) through untouched.
function trimIfString(value) {
  return typeof value === 'string' ? value.trim() : value;
}
/**
 * Scrapes one fetched page and saves the outcome.
 *
 * Must be invoked with `this` set to an object exposing `iterator`, `data`,
 * `params` and `savePrettyJson` (the spider binds it to `scrape`).
 *
 * @param {*} data - Page content, wrapped with artoo.$ before querying.
 * @returns {*} Whatever artoo.scrape produced for the page.
 */
function process(data) {
  var config = this;
  var targets = artoo.$(data).find(config.iterator);
  var scraped = artoo.scrape(targets, config.data, config.params);
  config.savePrettyJson(scraped);
  return scraped;
}
/**
 * Crawls every URL in `urlList` with artoo.ajaxSpider.
 *
 * Each fetched page goes through `process` bound to `scrape`; the spider's
 * built-in `scrape` option is deliberately not used. The final spider
 * result is logged to the console.
 *
 * @param {string[]} urlList - URLs to fetch.
 * @param {*} params - Accepted for interface compatibility; currently unused.
 */
function spider(urlList, params) {
  var logResult = function(data) {
    console.log(data);
  };
  var options = {
    done: logResult,
    process: process.bind(scrape),
    throttle: 500
  };
  artoo.ajaxSpider(urlList, options);
}
/**
 * Builds one page URL per known id and hands the whole list to `spider`.
 * The URL prefix is also passed along as the spider's second argument.
 */
function run() {
  var urlTemplate = "http://www.immobiliare.it/info/ufficio-stampa?id=";
  // Page ids to fetch. For a quick trial, shrink this to a single id such as 1123.
  var ids = [1123,1105,1086,1058,1038,1028,1014,1000,986,984,966,954,949,941,939,917,885,883,881,873,871,867,869,863,865,861,857,854,852,850,810,831,806,802,800,796,786,778,777,774,742,726,725,722,721,717,713,709,697,685,678,673,669,665,662];
  var urlList = [];
  for (var i = 0; i < ids.length; i += 1) {
    urlList.push(urlTemplate + ids[i]);
  }
  spider(urlList, urlTemplate);
}
// Start the crawl immediately when this script is evaluated.
run();
})(artoo);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment