Created
May 4, 2009 00:37
-
-
Save ericabouaf/106224 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Page structure of the scraped site and the XPath used to extract the data.
// The XPath selects every cell of the "sobi2Listing" table.
var xpath = '//table[@class="sobi2Listing"]/tr//td';
// Listing URL without its ".html" suffix; page URLs are derived from it.
var baseUrl = "http://www.abcvert.fr/annuaire/energies-renouvelables/solaire";

/**
 * Builds the URL of one page of the listing.
 *
 * The first page lives at "<baseUrl>.html"; every following page lives at
 * "<baseUrl>/Page-N.html", where N is the 1-based page number.
 *
 * @param {number} pageIndex Zero-based page index.
 * @returns {string} Absolute URL of the requested page.
 */
var getUrl = function(pageIndex) {
    if (pageIndex === 0) {
        return baseUrl + ".html";
    }
    return baseUrl + "/Page-" + (pageIndex + 1) + ".html";
};
// Contacts collected across all scraped pages, keyed by contact name.
// A later contact with the same name overwrites the earlier one.
var contacts = {};

// Range of pages to scrape: nToScrap pages starting at the zero-based index
// pageStart (the original author notes that it can go up to 35).
var pageStart = 0;
var nToScrap = 5;

// Walk every requested page. The upper bound is relative to pageStart so that
// a non-zero start still scrapes nToScrap pages.
for (var i = pageStart; i < pageStart + nToScrap; i++) {
    // YQL query fetching the page and applying the XPath to it.
    var q = "select * from html where url='" + getUrl(i) + "' and xpath='" + xpath + "'";
    var query = y.query(q);

    // The query result is XML; convert it to a plain JavaScript object.
    var json = y.xmlToJson(query.results);

    // A page without results must not abort the remaining pages.
    var list = (json && json.results) ? json.results.td : null;
    if (!list) {
        continue;
    }

    // NOTE(review): presumably a page with a single matching cell yields one
    // object rather than an array -- confirm against y.xmlToJson. Wrap it so
    // that the cell is processed instead of silently skipped.
    if (typeof list.length !== "number") {
        list = [list];
    }

    // Build one contact per cell of the page.
    for (var c = 0; c < list.length; c++) {
        var cell = list[c];
        try {
            // The link in the first paragraph carries the name and the URL.
            var link = cell.p["0"].a;
            var name = link.title;

            var item = {};
            item.name = name;
            item.url = link.href;

            // Optional spans, in order: address, zip code, city.
            var span = cell.span;
            if (span) {
                // "adress" (sic) is kept as-is: it is part of the output format.
                if (span["0"]) item.adress = span["0"].content;
                if (span["1"]) item.zipcode = span["1"].content;
                if (span["2"]) item.city = span["2"].content;
            }

            contacts[name] = item;
        } catch (ex) {
            // Deliberately best-effort: a cell that does not have the expected
            // structure is skipped rather than failing the whole run.
        }
    }
}
// Response object returned by the script: a format version plus the
// collected contacts.
response.object = {
    // Numeric literal: 0.10 is the number 0.1.
    version: 0.10,
    contacts: contacts
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment