Last active
June 27, 2024 16:16
-
-
Save ygotthilf/290a137950b50cb10fd7 to your computer and use it in GitHub Desktop.
Quick scraper for list/detail websites
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Third-party dependencies. `_` (lodash) and `moment` are not referenced in
// the visible part of this script; they are kept in case other code uses them.
var _ = require('lodash');
var request = require('request');
var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var moment = require('moment');

// HTTP headers sent with every request issued by this script.
var headers = {
  'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
};

// Detail-page field labels whose values are exported. They are compared
// verbatim against the scraped label text, so whitespace matters (note the
// trailing space in 'Site internet ').
var labels = [
  'SIRET',
  'Site internet ',
  'Téléphone',
];

// Site root; list and detail paths are appended to it.
var baseUrl = 'http://www.annuaire-des-vtc.fr/';

// Entry point: start scraping from the list page. This works before the
// function's definition because function declarations are hoisted.
requestListPage();
/**
 * Downloads the list page (`baseUrl + 'rhone-69/'`) and triggers a detail-page
 * request for every element matching `.link_black_blue_b_u`.
 *
 * Failures are reported on stderr; nothing is returned or thrown.
 *
 * @param {*} [index] Unused; kept so existing callers keep working.
 */
function requestListPage(index) {
  var listUrl = baseUrl + 'rhone-69/';

  request({
    url: listUrl,
    headers: headers
  }, function (error, response, body) {
    if (error) {
      console.error(error);
      return;
    }
    // On a non-200 response `error` is empty, so report the status explicitly
    // instead of logging `null`.
    if (response.statusCode !== 200) {
      console.error('Unexpected HTTP status ' + response.statusCode + ' for ' + listUrl);
      return;
    }

    var $ = cheerio.load(body);
    $('.link_black_blue_b_u').each(function (position, element) {
      var link = $(element).attr('href');
      // An element without an href would otherwise produce a request for
      // baseUrl + 'undefined'.
      if (!link) {
        return;
      }
      requestDetailPage(link);
    });
  });
}
/**
 * Downloads one detail page and prints a single `;`-separated line to stdout:
 * the text of `.title_h h1` followed by the value of every `.form_details`
 * entry whose label appears in `labels`.
 *
 * Failures are reported on stderr; nothing is returned or thrown.
 *
 * @param {string} path Path of the detail page, appended to `baseUrl`.
 */
function requestDetailPage(path) {
  var uri = baseUrl + path;

  request({
    // `encoding: null` makes `request` hand back the raw body so that the
    // decoding below is done explicitly by iconv.
    encoding: null,
    uri: uri,
    headers: headers
  }, function (error, response, body) {
    if (error) {
      console.error(error);
      return;
    }
    // On a non-200 response `error` is empty, so report the status explicitly
    // instead of logging `null`.
    if (response.statusCode !== 200) {
      console.error('Unexpected HTTP status ' + response.statusCode + ' for ' + uri);
      return;
    }

    // The body is expected to be a Buffer already; avoid the deprecated
    // `new Buffer()` constructor and only convert when it is not one.
    var rawBody = Buffer.isBuffer(body) ? body : Buffer.from(body);
    var $ = cheerio.load(iconv.decode(rawBody, 'utf-8'));

    var str = $('.title_h h1').text();
    $('.form_details').each(function (position, el) {
      var label = $(el).find('.title_details').text();
      // Labels are matched verbatim (including whitespace) against `labels`.
      if (labels.indexOf(label) > -1) {
        var value = $(el).find('.infos_details').text();
        // Strip line breaks so each record stays on a single output line.
        value = value.replace(/\n/g, '');
        str += ';' + value;
      }
    });

    console.log(str);
  });
}
/**
 * Searches `email` for the first substring that looks like an e-mail address.
 *
 * The pattern is not anchored, so surrounding text is allowed: this locates an
 * address inside a string rather than checking that the whole string is one.
 *
 * @param {string} email Text to search.
 * @returns {Array|null} The `RegExp.prototype.exec` match (element 0 is the
 *   matched address, `index` its position) or `null` when nothing matches.
 */
function validateEmail(email) {
  var emailPattern = /(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/;
  return emailPattern.exec(email);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment