LIS Map scraping (for Habr)
// Third-party modules used by the script, plus Node's file-system module.
const log = require('cllc')();
const tress = require('tress');
const needle = require('needle');
const cheerio = require('cheerio');
const fs = require('fs');

// Page requested first; the cookies from its response are reused for all later requests.
const sCookie = 'http://www.puntolis.it/storelocator/defaultsearch.aspx?idcustomer=111';
// First URL pushed into the queue; its response is scanned for '#TendinaProv' options.
const sProv = 'http://www.puntolis.it/storelocator/buildMenuProv.ashx?CodSer=111';
// Template: '%s' is replaced with the value of an option taken from '#TendinaProv'.
const sLoc = 'http://www.puntolis.it/storelocator/buildMenuLoc.ashx?CodSer=111&ProvSel=%s';
// Template: the first '%s' receives the last two characters of the current URL,
// the second '%s' receives the id of an option from the locality select.
const sMarker = 'http://www.puntolis.it/storelocator/Result.aspx?provincia=%s&localita=%s&cap=XXXXX&Servizio=111';

// Options object passed to every queued needle.get call; `cookies` is filled in at start-up.
const httpOptions = {};
// Accumulates one record per <marker> element; serialized to ./data.json when the queue drains.
const results = [];
// Job queue: every URL pushed into it is processed by crawl().
const q = tress(crawl);

// Success handler: puts the queue's concurrency back to 1.
q.success = () => {
    q.concurrency = 1;
};

// Retry handler: switches the queue's concurrency to -10000.
// NOTE(review): tress appears to treat a negative concurrency as a delay in
// milliseconds between jobs (here 10 s) — confirm against the tress docs.
q.retry = () => {
    q.concurrency = -10000;
};

// Drain handler: dumps everything collected so far to ./data.json
// (pretty-printed with 4 spaces), then closes the progress output and
// logs the final message.
q.drain = () => {
    const serialized = JSON.stringify(results, null, 4);
    fs.writeFileSync('./data.json', serialized);
    log.finish();
    log('Работа закончена');
};
// Start-up request: fetch the cookie page, remember its cookies for all
// subsequent requests, start the progress output and seed the queue with
// the first URL.
needle.get(sCookie, function(err, res){
    if (err) {
        throw err;
    }
    if (res.statusCode !== 200) {
        // Throw a real Error (not a bare number) so the failure carries a
        // message and a stack trace.
        throw new Error('Unexpected HTTP status ' + res.statusCode + ' for ' + sCookie);
    }
    httpOptions.cookies = res.cookies;
    log('Начало работы');
    // The three '%s' slots are the counters later advanced through log.step().
    log.start('Найдено провинций %s, Найдено коммун %s, Найдено маркеров %s.');
    q.push(sProv);
});
/**
 * Reads attribute `name` from a cheerio selection and trims it.
 * Returns '' when the attribute is missing, so a single incomplete element
 * cannot throw inside the HTTP callback and abort the whole run.
 */
function attrText(node, name){
    var value = node.attr(name);
    return typeof value === 'string' ? value.trim() : '';
}

/**
 * Fills the first '%s' placeholder of `template` with `value`.
 * A replacer function is used so that '$' sequences inside `value` are
 * inserted literally instead of being read as replacement patterns.
 */
function fillPlaceholder(template, value){
    return template.replace('%s', function(){
        return value;
    });
}

/**
 * Queue worker: downloads `url` and handles whatever the response contains.
 *  - options under '#TendinaProv' (first one skipped): queue one sLoc URL each;
 *  - options under the select with onchange="onLocSelect()" (first one skipped):
 *    queue one sMarker URL each;
 *  - <marker> elements: append a {Title, Address, Place} record to `results`.
 *
 * @param {string} url - address to download.
 * @param {Function} callback - queue callback; called with `true` on a failed
 *   request and with no arguments once the response has been processed.
 *   NOTE(review): `true` presumably asks tress to retry the job — confirm.
 */
function crawl(url, callback){
    needle.get(url, httpOptions, function(err, res){
        if (err || res.statusCode !== 200) {
            // Report the failure only while concurrency is still 1; the retry
            // handler of the queue changes it to -10000 and the success
            // handler restores 1.
            if (q.concurrency === 1) {
                log.e((err || res.statusCode) + ' - ' + url);
            }
            return callback(true);
        }

        var $ = cheerio.load(res.body);
        // The last two characters of the current URL are used as the
        // 'provincia' parameter of the marker URLs.
        var provinceCode = url.slice(-2);

        $('#TendinaProv option').slice(1).each(function() {
            q.push(fillPlaceholder(sLoc, $(this).attr('value')));
            log.step(); // presumably advances the first progress counter — confirm
        });

        $('select[onchange="onLocSelect()"] option').slice(1).each(function() {
            var withProvince = fillPlaceholder(sMarker, provinceCode);
            q.push(fillPlaceholder(withProvince, $(this).attr('id')));
            log.step(0, 1);
        });

        $('marker').each(function() {
            var marker = $(this);
            results.push({
                Title: attrText(marker, 'insegna'),
                Address: attrText(marker, 'indirizzo'),
                Place: [
                    attrText(marker, 'cap'),
                    attrText(marker, 'localita'),
                    attrText(marker, 'provincia')
                ].join(' ')
            });
            log.step(0, 0, 1);
        });

        callback();
    });
}
{
  "private": true,
  "name": "lis-scraper",
  "version": "0.0.1",
  "description": "Web scraping example for habrahabr",
  "main": "index.js",
  "author": "astur <astur@yandex.ru> (http://kozlov.am/)",
  "license": "WTFPL",
  "dependencies": {
    "cheerio": "^0.20.0",
    "needle": "^1.0.0",
    "tress": "^1.0.0",
    "cllc": "^0.0.9"
  }
}
This comment has been minimized.
This comment has been minimized.
Очень хорошая статья для начинающих. Очень круто и понятно расписано. Спасибо!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
thanks