// Example of using icrawler to scrape data from PuntoLIS.it
// Dependencies: icrawler drives the crawl; fs is used to persist and restore
// the crawl state in ./data.json.
var icrawler = require('icrawler');
var fs = require('fs');
// PuntoLIS store-locator endpoints. Each '%s' is a placeholder that the crawler
// fills in with String.prototype.replace, one placeholder per call, left to right.
var sCookie = 'http://www.puntolis.it/storelocator/defaultsearch.aspx?idcustomer=111'; // Only for cookies
var URL = 'http://www.puntolis.it/storelocator/buildMenuProv.ashx?CodSer=111'; // Start URL. List of 'provincia's
var sLoc = 'http://www.puntolis.it/storelocator/buildMenuLoc.ashx?CodSer=111&ProvSel=%s'; // List of 'localita's for every provincia (PROV)
var sMarker = 'http://www.puntolis.it/storelocator/Result.aspx?provincia=%s&localita=%s&cap=XXXXX&Servizio=111'; // (PROV, LOC)
// Options object handed to icrawler; init/save hooks and any resumed state are
// attached to it before the crawl starts.
var opts = {
    // NOTE(review): presumably makes failed requests get retried before the
    // rest of the queue — confirm against the icrawler README.
    errorsFirst: true,
    // NOTE(review): presumably a wait time in milliseconds (30 s) — confirm
    // against the icrawler README which wait this controls.
    delay: 30000
};
// Initialisation hook: requests the cookie page (sCookie) once and hands the
// cookies from that response to the crawler.
//   needle - HTTP client supplied by the caller; only needle.get is used here
//   log    - supplied by the caller; unused
//   cb     - node-style callback, called as cb(err) on failure or
//            cb(null, cookies, {}) on success (third argument is an empty object;
//            presumably extra headers — confirm against the icrawler README)
opts.init = function(needle, log, cb){
    needle.get(sCookie, {}, function(err, res){
        if (err) return cb(err);
        cb(null, res.cookies, {});
    });
};
// Persistence hook: synchronously writes the given tasks and results to
// ./data.json (pretty-printed with 4-space indentation) so that a later run
// can pick them up again.
//   tasks   - stored under the "tasks" key
//   results - stored under the "results" key
opts.save = function(tasks, results){
    fs.writeFileSync('./data.json', JSON.stringify({tasks: tasks, results: results}, null, 4));
};
// Resume support: when ./data.json exists, load the state stored in it.
// An empty task list means there is nothing left to do, so the process exits
// immediately; otherwise the stored tasks and results are passed to the crawler
// through opts. A malformed data.json makes JSON.parse throw, which stops the
// script before anything is overwritten.
if(fs.existsSync('./data.json')){
    var data = JSON.parse(fs.readFileSync('./data.json', 'utf8'));
    if (data.tasks.length === 0) {
        console.log('All tasks done');
        process.exit(0);
    }
    opts.tasks = data.tasks;
    opts.results = data.results;
}
// Run the crawl. Each fetched document is classified by what it contains:
//   1. province list (has #TendinaProv): queue one locality-list URL per province;
//   2. locality list (has the onLocSelect() <select>): queue one result URL per locality;
//   3. anything else: treated as a result document whose <marker> elements are saved.
// The final callback exports everything that was saved via excelize.
icrawler(URL, opts, function(url, $, _, res){
    // Trimmed value of an attribute, or '' when the attribute is absent, so a
    // single incomplete marker cannot abort the page with a TypeError.
    function attrText(node, name) {
        var value = $(node).attr(name);
        return value == null ? '' : String(value).trim();
    }

    if ($('#TendinaProv').length > 0) {
        // slice(1) skips the first <option> (presumably a placeholder entry — confirm).
        $('#TendinaProv option').slice(1).each(function() {
            _.push(sLoc.replace('%s', $(this).attr('value')));
        });
    } else if ($('select[onchange="onLocSelect()"]').length > 0) {
        // The locality-list URL (sLoc) ends with the province value; the last two
        // characters are taken as the province code (assumes two-character codes).
        var prov = url.slice(-2);
        $('option').slice(1).each(function() {
            _.push(sMarker.replace('%s', prov).replace('%s', $(this).attr('id')));
        });
        _.log('===' + prov + '===');
    } else {
        $('marker').each(function() {
            _.save({
                Title: attrText(this, 'insegna'),
                Address: attrText(this, 'indirizzo'),
                Place: [
                    attrText(this, 'cap'),
                    attrText(this, 'localita'),
                    attrText(this, 'provincia')
                ].join(' ')
            });
            // NOTE(review): presumably advances icrawler's progress indicator — confirm.
            _.step();
        });
    }
}, function(result){
    // NOTE(review): argument meaning inferred from the values (output directory,
    // file name, sheet name) — confirm against the excelize documentation.
    require('excelize')(result, './', 'ADDR.xlsx', 'sheet', function(err){
        if (err) throw err;
        console.log(result.length + ' adresses saved.');
    });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment