Skip to content

Instantly share code, notes, and snippets.

@ygotthilf
Last active September 8, 2015 16:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ygotthilf/290a137950b50cb10fd7 to your computer and use it in GitHub Desktop.
Save ygotthilf/290a137950b50cb10fd7 to your computer and use it in GitHub Desktop.
Quick scraper for list/detail website
var _ = require('lodash');
var request = require('request');
var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var moment = require('moment');
// HTTP headers sent with every request made by this script: a mobile Safari
// User-Agent string that also carries a Googlebot compatibility token.
var headers = {
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
};
// Labels of the detail rows whose values are exported. They are compared for
// exact equality with the scraped label text, so whitespace matters (note the
// trailing space in 'Site internet ').
var labels = [
'SIRET',
'Site internet ',
'Téléphone',
];
// Root URL of the scraped site; list and detail paths are appended to it.
var baseUrl = 'http://www.annuaire-des-vtc.fr/';
// Entry point: kick off the scrape as soon as the script is loaded.
requestListPage()
/**
 * Fetch the 'rhone-69/' listing page and request the detail page of every
 * entry linked from it (anchors with class `link_black_blue_b_u`).
 *
 * Failures are reported on stderr; nothing is returned.
 *
 * @param {*} [index] - Unused; kept so the signature stays compatible.
 */
function requestListPage(index) {
  var listUrl = baseUrl + 'rhone-69/';
  request({
    url: listUrl,
    headers: headers
  }, function (error, response, body) {
    if (error) {
      return console.error(error);
    }
    // A non-200 answer carries no `error` object, so report the status
    // explicitly instead of logging `null`.
    if (response.statusCode !== 200) {
      return console.error('Unexpected HTTP status ' + response.statusCode + ' for ' + listUrl);
    }
    var $ = cheerio.load(body);
    $('.link_black_blue_b_u').each(function (position, element) {
      var link = $(element).attr('href');
      // An anchor without href would otherwise be requested as ".../undefined".
      if (!link) {
        return;
      }
      requestDetailPage(link);
    });
  });
}
/**
 * Fetch one detail page and print a single semicolon-separated line: the page
 * title followed by the value of every detail row whose label is listed in
 * `labels`.
 *
 * Failures are reported on stderr; nothing is returned.
 *
 * @param {string} path - Path of the detail page, appended to `baseUrl`.
 */
function requestDetailPage(path) {
  var uri = baseUrl + path;
  request({
    // encoding: null asks `request` for the raw body as a Buffer so that the
    // character decoding is done explicitly below.
    encoding: null,
    uri: uri,
    headers: headers
  }, function (error, response, body) {
    if (error) {
      return console.error(error);
    }
    // A non-200 answer carries no `error` object, so report the status
    // explicitly instead of logging `null`.
    if (response.statusCode !== 200) {
      return console.error('Unexpected HTTP status ' + response.statusCode + ' for ' + uri);
    }
    // `body` is already a Buffer here; wrapping it in the deprecated
    // `new Buffer()` constructor only made a needless copy.
    var $ = cheerio.load(iconv.decode(body, 'utf-8'));
    var str = $('.title_h h1').text();
    $('.form_details').each(function (index, el) {
      var label = $(el).find('.title_details').text();
      if (labels.indexOf(label) > -1) {
        var value = $(el).find('.infos_details').text();
        // Drop every line break (CR as well as LF) so each record stays on
        // one output line.
        value = value.replace(/[\r\n]/g, '');
        str += ';' + value;
      }
    });
    console.log(str);
  });
}
/**
 * Search a string for the first e-mail-like substring.
 *
 * The pattern is not anchored, so the address may appear anywhere inside
 * `email`.
 *
 * @param {string} email - Text to search.
 * @returns {?Array} The `RegExp#exec` match array (index 0 holds the matched
 *   address), or `null` when nothing matches.
 */
function validateEmail(email) {
  return /(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/.exec(email);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment