Last active
August 10, 2022 02:56
-
-
Save julienarcin/5dd53942d5ebd9766d2cbbcb6db6377b to your computer and use it in GitHub Desktop.
Scrapping annuaire expert comptable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* | |
* Script pour scrapper un annuaire d'expert comptable | |
* Ref ici: https://www.growthhacking.fr/t/scraper-un-annuaire-expert-comptable/23995 | |
* | |
* Julien Arcin | |
* | |
*/ | |
/*** VARIABLES ***/
// Highest search-result page number to request (pages are fetched from 1 up to this value).
$pageMax = 2976;
// File receiving one JSON-encoded record per line.
$outputTxt = 'extract-json.txt';
// File receiving the semicolon-separated export.
$outputCsv = 'extract.csv';
/*** SCRAPING CODE ***/
/**
 * Decode HTML entities (named, decimal and hexadecimal) found in scraped text.
 *
 * @param string $string Text captured from a page, possibly containing entities such as &amp; or &#233;.
 * @return string The same text with each entity replaced by its UTF-8 character.
 */
function entity_decode($string) {
    // html_entity_decode() handles numeric entities on its own, so no
    // mb_convert_encoding(..., 'HTML-ENTITIES') pre-pass is needed (that
    // pseudo-encoding is deprecated as of PHP 8.2).
    // Flags and charset are explicit because the defaults differ between PHP
    // versions: before 8.1 the default (ENT_COMPAT) leaves &#39; undecoded.
    return html_entity_decode((string) $string, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}
// Main scraping script: walks every search-result page from 1 to $pageMax,
// follows each firm link found on it, extracts the firm details with regular
// expressions and appends them to both the CSV file and the JSON-lines file.

$baseUrl = 'https://annuaire.experts-comptables.org';

// Decide BEFORE opening whether the CSV still needs its header row: the file is
// opened in append mode, so writing the header unconditionally would repeat it
// in the middle of the data on every new run.
$csvNeedsHeader = !file_exists($outputCsv) || filesize($outputCsv) === 0;

// Open files
$ftxt = fopen($outputTxt, 'a');
$fcsv = fopen($outputCsv, 'a');
if ($ftxt === false || $fcsv === false) {
    echo "Unable to open the output files for writing\n";
    exit(1);
}

// Add headers to csv.
// fputcsv() is used for every row so that a field containing ';', '"' or a
// line break is quoted instead of silently shifting the columns.
if ($csvNeedsHeader) {
    fputcsv(
        $fcsv,
        ['NAME', 'STREET ADDRESS', 'LOCALITY', 'POSTAL CODE', 'PHONE', 'WEBSITE', 'ALL WEBSITES', 'LANGUAGES', 'MEMBERS'],
        ';',
        '"',
        '\\'
    );
}

// A single cURL handle is configured once and reused for every request.
// It must not be closed between requests: a closed handle cannot be reused.
$curl = curl_init();
if ($curl === false) {
    echo "Unable to initialise cURL\n";
    exit(1);
}
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_HEADER, false);

// Downloads $url and returns the response body, or null (after printing the
// cURL error) when the transfer failed.
$fetch = function ($url) use ($curl) {
    curl_setopt($curl, CURLOPT_URL, $url);
    $body = curl_exec($curl);
    if ($body === false) {
        echo 'FAILED (' . curl_error($curl) . ")\n";
        return null;
    }
    return $body;
};

// JSON-LD property name on the page => key used in the extracted record.
// The order here fixes the order of the keys in the JSON output.
$jsonLdFields = [
    'streetAddress'   => 'streetAddress',
    'addressLocality' => 'addressLocality',
    'postalCode'      => 'postalCode',
    'latitude'        => 'latitude',
    'longitude'       => 'longitude',
    'telephone'       => 'phone',
];

// For each page
for ($i = 1; $i <= $pageMax; $i++) {
    $percentProgress = round($i / $pageMax * 100, 2);
    echo $percentProgress . '% - Scraping page ' . $i . "...";

    $contentSearch = $fetch($baseUrl . '/recherche/' . $i . '?localite=&lat=&lon=&type_localite=&comptable=&seed=46109');
    if ($contentSearch === null) {
        // Nothing to parse for this page; carry on with the next one.
        continue;
    }
    echo "OK!\n";

    // Capture all links on page (group 1: relative URL, group 2: firm name)
    preg_match_all('/<div class="name">\n\s*<a href="(\/expert-comptable\/[^"]+)"[^>]+>([^<]+)<\/a>/i', $contentSearch, $matchesIndex);

    // For each link
    foreach ($matchesIndex[1] as $indexLink => $link) {
        echo ' - scraping ' . entity_decode($matchesIndex[2][$indexLink]) . "...";

        // Get link
        $contentLink = $fetch($baseUrl . $link);
        if ($contentLink === null) {
            continue;
        }

        // Capture data on page
        $dataLink = [];

        // Name
        if (preg_match('/<h1 class="info-grp_firm-name">([^<]+)<\/h1>/i', $contentLink, $matchLink)) {
            $dataLink['name'] = entity_decode($matchLink[1]);
        }

        // Logo (the placeholder image used for firms without a logo is ignored)
        if (preg_match('/class="logo" id="relativeTarget" src="([^"]*)"/i', $contentLink, $matchLink)
            && $matchLink[1] !== '/build/images/content/cabinet-anonyme.aa87fff0.svg'
        ) {
            $dataLink['logo'] = $baseUrl . $matchLink[1];
        }

        // Address, coordinates and telephone, read from the embedded JSON-LD
        foreach ($jsonLdFields as $jsonKey => $dataKey) {
            if (preg_match('/"' . $jsonKey . '": "([^"]+)",/i', $contentLink, $matchLink)) {
                $dataLink[$dataKey] = entity_decode($matchLink[1]);
            }
        }

        // Sites
        if (preg_match_all('/<a target="_blank" rel="noreferrer" href="([^"]+)"/i', $contentLink, $matchesLink)) {
            $dataLink['sites'] = $matchesLink[1];
        }

        // Languages: a comma-separated list. Only the whitespace around each
        // entry is dropped, so multi-word language names keep their spaces.
        if (preg_match('/<strong>\n\s*Langues parlées\s*<\/strong>\n\s*<\/h3>\n\s*<div>\n\s*([^<]+)<\/div>/i', $contentLink, $matchLink)) {
            $rawLanguages = trim(preg_replace('/\s+/', ' ', $matchLink[1]));
            $dataLink['languages'] = preg_split('/\s*,\s*/', $rawLanguages, -1, PREG_SPLIT_NO_EMPTY);
        }

        // Members (group 1: avatar URL, group 2: upper-cased span, group 3: following span)
        if (preg_match_all('/<img alt="[^"]+" class="avatar" src="([^"]+)">\n\s*<div class="info">\n\s*<span class="text-uppercase">([^<]+)<\/span><br\/>\n\s*<span>([^<]+)<\/span>/i', $contentLink, $matchesLink)) {
            $dataLink['members'] = [];
            foreach ($matchesLink[1] as $indexMember => $avatarPath) {
                $dataLink['members'][$indexMember] = [
                    // The generic placeholder picture is reported as "no avatar"
                    'avatar'     => $avatarPath !== '/build/images/content/anonyme.ad191e7f.jpg' ? $baseUrl . $avatarPath : null,
                    'first_name' => entity_decode($matchesLink[2][$indexMember]),
                    'last_name'  => entity_decode($matchesLink[3][$indexMember]),
                ];
            }
        }

        // Generate member string ("first last,first last,...")
        $memberNames = array_map(
            function ($member) {
                return $member['first_name'] . ' ' . $member['last_name'];
            },
            $dataLink['members'] ?? []
        );

        // Write to CSV
        $sites = $dataLink['sites'] ?? [];
        fputcsv(
            $fcsv,
            [
                $dataLink['name'] ?? '',
                $dataLink['streetAddress'] ?? '',
                $dataLink['addressLocality'] ?? '',
                $dataLink['postalCode'] ?? '',
                $dataLink['phone'] ?? '',
                $sites[0] ?? '',
                implode(',', $sites),
                implode(',', $dataLink['languages'] ?? []),
                implode(',', $memberNames),
            ],
            ';',
            '"',
            '\\'
        );

        // Write to json (one record per line). json_encode() returns false on
        // e.g. invalid UTF-8; report it instead of writing an empty line.
        $json = json_encode($dataLink);
        if ($json === false) {
            echo 'JSON ERROR (' . json_last_error_msg() . ")\n";
            continue;
        }
        fwrite($ftxt, $json . "\n");
        echo "OK!\n";
    }
}

curl_close($curl);
fclose($ftxt);
fclose($fcsv);
echo "=========\n";
echo "Finished!\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment