Skip to content

Instantly share code, notes, and snippets.

@julienarcin
Last active August 10, 2022 02:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save julienarcin/5dd53942d5ebd9766d2cbbcb6db6377b to your computer and use it in GitHub Desktop.
Save julienarcin/5dd53942d5ebd9766d2cbbcb6db6377b to your computer and use it in GitHub Desktop.
Scrapping annuaire expert comptable
<?php
/**
*
* Script that scrapes a directory of chartered accountants
* (annuaire.experts-comptables.org) and exports the results.
* Reference: https://www.growthhacking.fr/t/scraper-un-annuaire-expert-comptable/23995
*
* Julien Arcin
*
*/
/*** VARIABLES ***/
// File that receives one JSON-encoded record per line (opened in append mode).
$outputTxt = 'extract-json.txt';
// File that receives the semicolon-separated export (opened in append mode).
$outputCsv = 'extract.csv';
// Number of the last search-result page to request; pages 1..$pageMax are fetched.
// NOTE(review): presumably the page count of the directory when the script was written - confirm before re-running.
$pageMax = 2976;
/** SCRAPING CODE ***/
/**
 * Decode HTML entities (named and numeric) in a string to UTF-8 text.
 *
 * A single html_entity_decode() pass is used instead of the former
 * mb_convert_encoding(..., 'HTML-ENTITIES') pre-pass: that pseudo-encoding is
 * deprecated since PHP 8.2, and running two passes decoded twice (for example
 * "&#38;lt;" ended up as "<" instead of "&lt;").
 * ENT_QUOTES makes sure both quote styles (e.g. "&#39;") are decoded regardless
 * of the PHP version's default flags, and the charset is given explicitly so
 * the result does not depend on the default_charset ini setting.
 *
 * @param string $string Text that may contain HTML entities.
 * @return string The decoded UTF-8 text.
 */
function entity_decode($string) {
    return html_entity_decode((string) $string, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}
/**
 * Download a URL and return its body.
 *
 * Every request gets its own cURL handle. The previous code closed one shared
 * handle and then kept using it for the detail pages, which fails on PHP < 8
 * (on PHP 8+ it only worked because curl_close() is a no-op there).
 * The handle is released automatically when it goes out of scope.
 *
 * @param string $url Absolute URL to download.
 * @return string|false The response body, or false when the transfer failed.
 */
function fetch_page($url) {
    $handle = curl_init();
    if ($handle === false) {
        return false;
    }
    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_HEADER, false);

    return curl_exec($handle);
}

/**
 * Quote a value for the semicolon-separated export when (and only when) needed.
 *
 * Values without a separator, double quote or line break are returned as-is,
 * so ordinary rows stay byte-identical to rows appended by earlier runs.
 * Other values are wrapped in double quotes with inner quotes doubled.
 *
 * @param mixed $value Field value; it is cast to string.
 * @return string The field, ready to be joined with ';'.
 */
function csv_field($value) {
    $value = (string) $value;
    if (strpbrk($value, ";\"\r\n") === false) {
        return $value;
    }

    return '"' . str_replace('"', '""', $value) . '"';
}

$baseUrl = 'https://annuaire.experts-comptables.org';

// The outputs are opened in append mode, so the header row must only be
// written when the CSV is new or empty; otherwise every run would add one.
$csvNeedsHeader = !file_exists($outputCsv) || filesize($outputCsv) === 0;

// Open files
$ftxt = fopen($outputTxt, 'a');
$fcsv = fopen($outputCsv, 'a');
if ($ftxt === false || $fcsv === false) {
    echo "Unable to open the output files for writing\n";
    exit(1);
}

// Add headers to csv
if ($csvNeedsHeader) {
    fwrite($fcsv, 'NAME;STREET ADDRESS;LOCALITY;POSTAL CODE;PHONE;WEBSITE;ALL WEBSITES;LANGUAGES;MEMBERS' . "\n");
}

// JSON-LD property name on the detail page => key used in the exported record.
// The order matters: it defines the key order of the JSON output.
$jsonLdFields = [
    'streetAddress' => 'streetAddress',
    'addressLocality' => 'addressLocality',
    'postalCode' => 'postalCode',
    'latitude' => 'latitude',
    'longitude' => 'longitude',
    'telephone' => 'phone',
];

// For each page of search results
for ($i = 1; $i <= $pageMax; $i++) {
    $percentProgress = round($i / $pageMax * 100, 2);
    echo $percentProgress . '% - Scraping page ' . $i . "...";

    $url = $baseUrl . '/recherche/' . $i . '?localite=&lat=&lon=&type_localite=&comptable=&seed=46109';
    $contentSearch = fetch_page($url);
    if ($contentSearch === false) {
        // Skip the page instead of parsing "false" and silently finding nothing.
        echo "FAILED!\n";
        continue;
    }
    echo "OK!\n";

    // Capture all links on page
    preg_match_all('/<div class="name">\n\s*<a href="(\/expert-comptable\/[^"]+)"[^>]+>([^<]+)<\/a>/i', $contentSearch, $matchesIndex);

    // For each link
    foreach ($matchesIndex[1] as $indexLink => $link) {
        $url = $baseUrl . $link;
        echo ' - scraping ' . entity_decode($matchesIndex[2][$indexLink]) . "...";

        // Get link
        $contentLink = fetch_page($url);
        if ($contentLink === false) {
            // Do not export an empty record for a page that could not be downloaded.
            echo "FAILED!\n";
            continue;
        }

        // Capture data on page
        $dataLink = [];

        // Name
        if (preg_match('/<h1 class="info-grp_firm-name">([^<]+)<\/h1>/i', $contentLink, $matchLink)) {
            $dataLink['name'] = entity_decode($matchLink[1]);
        }

        // Logo (the placeholder image used for firms without a logo is ignored)
        if (preg_match('/class="logo" id="relativeTarget" src="([^"]*)"/i', $contentLink, $matchLink)) {
            if ($matchLink[1] !== '/build/images/content/cabinet-anonyme.aa87fff0.svg') {
                $dataLink['logo'] = $baseUrl . $matchLink[1];
            }
        }

        // Address, coordinates and phone, all read from the embedded JSON-LD
        foreach ($jsonLdFields as $property => $key) {
            if (preg_match('/"' . $property . '": "([^"]+)",/i', $contentLink, $matchLink)) {
                $dataLink[$key] = entity_decode($matchLink[1]);
            }
        }

        // Sites
        if (preg_match_all('/<a target="_blank" rel="noreferrer" href="([^"]+)"/i', $contentLink, $matchesLink)) {
            $dataLink['sites'] = $matchesLink[1];
        }

        // Languages: a comma-separated list; all whitespace is stripped before splitting
        if (preg_match('/<strong>\n\s*Langues parlées\s*<\/strong>\n\s*<\/h3>\n\s*<div>\n\s*([^<]+)<\/div>/i', $contentLink, $matchLink)) {
            $dataLink['languages'] = explode(',', preg_replace('/\s+/', '', $matchLink[1]));
        }

        // Members
        if (preg_match_all('/<img alt="[^"]+" class="avatar" src="([^"]+)">\n\s*<div class="info">\n\s*<span class="text-uppercase">([^<]+)<\/span><br\/>\n\s*<span>([^<]+)<\/span>/i', $contentLink, $matchesLink)) {
            $dataLink['members'] = [];
            foreach ($matchesLink[1] as $indexMember => $avatarPath) {
                // The placeholder avatar is exported as null
                $hasAvatar = $avatarPath !== '/build/images/content/anonyme.ad191e7f.jpg';
                // NOTE(review): the upper-cased span is stored as "first_name";
                // it may actually hold the family name - confirm against a live page.
                $dataLink['members'][$indexMember] = [
                    'avatar' => $hasAvatar ? $baseUrl . $avatarPath : null,
                    'first_name' => entity_decode($matchesLink[2][$indexMember]),
                    'last_name' => entity_decode($matchesLink[3][$indexMember]),
                ];
            }
        }

        // Generate member string ("First Last,First Last,...")
        $membersString = implode(',', array_map(function ($member) {
            return $member['first_name'] . ' ' . $member['last_name'];
        }, $dataLink['members'] ?? []));

        // Write to CSV, quoting any field that contains a separator, quote or line break
        $sites = $dataLink['sites'] ?? [];
        $row = [
            $dataLink['name'] ?? '',
            $dataLink['streetAddress'] ?? '',
            $dataLink['addressLocality'] ?? '',
            $dataLink['postalCode'] ?? '',
            $dataLink['phone'] ?? '',
            $sites[0] ?? '',
            implode(',', $sites),
            implode(',', $dataLink['languages'] ?? []),
            $membersString,
        ];
        fwrite($fcsv, implode(';', array_map('csv_field', $row)) . "\n");

        // Write to json (one record per line)
        fwrite($ftxt, json_encode($dataLink) . "\n");
        echo "OK!\n";
    }
}

fclose($ftxt);
fclose($fcsv);
echo "=========\n";
echo "Finished!\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment