Skip to content

Instantly share code, notes, and snippets.

@kidager
Created June 27, 2018 07:18
Show Gist options
  • Save kidager/126c1ce46d445b5409bef1679910f694 to your computer and use it in GitHub Desktop.
Save kidager/126c1ce46d445b5409bef1679910f694 to your computer and use it in GitHub Desktop.
<?php
# Fetch one page of the listing search results. The commented-out URLs are
# alternative queries kept for reference.
$response = $guzzleClient->get(
    // 'list.htm?tri=initial&idtypebien=2,1&div=2238&idtt=1&naturebien=1,2,4&LISTING-LISTpg=2'
    'list.htm?tri=initial&idtypebien=2,1&div=2238&idtt=2,5&naturebien=1,2,4&LISTING-LISTpg=1'
    // 'list.htm?tri=initial&idtypebien=1,2&cp=75&idtt=1&naturebien=1,2,4&bd=DetailToList_SL'
);
$html = (string)$response->getBody();

# Buffer libxml parse errors instead of emitting PHP warnings while loading
# HTML; the previous setting is restored at the end of this script.
$internalErrors = libxml_use_internal_errors(true);

# NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding() is
# deprecated as of PHP 8.2 - left unchanged here to preserve behaviour.
$dom = new DOMDocument('1.0', 'UTF-8');
$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));

# Every element whose id starts with "annonce-" is treated as one ad.
$annonces = $getSubElements($dom, '//*[starts-with(@id, "annonce-")]');

# Closures do not inherit variables from the parent scope in PHP, so the HTTP
# client and the XPath helper must be imported explicitly with `use`.
$annonces->each(function ($element) use ($guzzleClient, $getSubElements) {
    # Get the link for each ad. The leading "." keeps the query relative to
    # the current ad element; a bare "//" is absolute in XPath and would match
    # from the document root whenever the helper passes a context node.
    $link = $getSubElements($element, './/a[@class="link_AB"]')->first();

    # Skip ads without a detail link instead of failing on a null node.
    if ($link === null) {
        return;
    }

    # Get the ad page HTML
    $resp = $guzzleClient->get($link->getAttribute('href'));
    $html = (string)$resp->getBody();
    $dom = new DOMDocument('1.0', 'UTF-8');
    $dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));

    # Find the right "script" tag
    # NOTE(review): index 7 is hard-coded and depends on the page layout;
    # data_get() yields null when that entry is missing.
    $scripts = $getSubElements($dom, '//script');
    $scriptContent = data_get($scripts->get(7), 'nodeValue');

    # Parse it: rewrite each
    #   Object.defineProperty(ConfigDetail, 'name', { value: "x", enumerable: ... });
    # statement into a `"name":"x"` line.
    # 1. Drop everything after the value (`", enumerable: ... });`), keeping the closing quote.
    $scriptContent = preg_replace('/",\n?\s*enumerable\:\s*.*\n?\s*\}\);/imu', '"', $scriptContent);
    # 2. Replace the `Object.defineProperty(ConfigDetail, '` prefix with an opening quote.
    $scriptContent = preg_replace('/\n\s*Object\.defineProperty\(\s*ConfigDetail,\s*\'/imu', "\n\"", $scriptContent);
    # 3. Join property name and value: `', { value: "` becomes `":"`.
    $scriptContent = preg_replace('/\',\s*\{\n?\s*value\:\s*"/imu', '":"', $scriptContent);
    # 4. Blank out any remaining line that starts with whitespace.
    $scriptContent = preg_replace('/\n(\t|\s)+.*/iu', "\n", $scriptContent);
    # 5. Collapse consecutive newlines into one.
    $scriptContent = preg_replace('/\n+/iu', "\n", $scriptContent);

    # This may still need some formatting too
    # NOTE(review): if $annonces is a Laravel-style collection, each() discards
    # this return value - confirm whether map() was intended.
    return $scriptContent;
});

# Discard buffered parse errors and restore the caller's libxml error mode.
libxml_clear_errors();
libxml_use_internal_errors($internalErrors);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment