Skip to content

Instantly share code, notes, and snippets.

@ptica
Created January 25, 2019 09:44
Show Gist options
  • Save ptica/94b05eb152ef20d284fab81a035a144d to your computer and use it in GitHub Desktop.
Save ptica/94b05eb152ef20d284fab81a035a144d to your computer and use it in GitHub Desktop.
require 'vendor/autoload.php';
/**
 * Collect the `href` of every `.pic` element found under the page's
 * `.images` element and append each one to $details.
 *
 * A page without an `.images` element contributes nothing; $details is
 * left exactly as it was passed in.
 *
 * @param object $dom     Document exposing querySelector().
 * @param array  $details Accumulator, modified in place (passed by reference).
 */
function scrape_details($dom, &$details) {
    $images = $dom->querySelector('.images');
    if ($images === null) {
        // No gallery on this page: calling querySelectorAll() on null
        // would be a fatal error, so there is simply nothing to collect.
        return;
    }

    foreach ($images->querySelectorAll('.pic') as $pic) {
        $details[] = $pic->getAttribute('href');
    }
}
/**
 * Find the link to the following page of results.
 *
 * Looks for an `a` element inside the `.navig-page-next` element and
 * returns its `href` attribute.
 *
 * @param object $dom Document exposing querySelector().
 *
 * @return string|null The link's href, or null when either the
 *                     pagination element or its anchor is missing.
 */
function get_next_page($dom) {
    $pager = $dom->querySelector('.navig-page-next');
    if (!$pager) {
        return null;
    }

    $anchor = $pager->querySelector('a');
    if (!$anchor) {
        return null;
    }

    return $anchor->getAttribute('href');
}
/**
 * Scrape a paginated listing: fetch $base.$url, collect its details via
 * scrape_details(), then follow the link returned by get_next_page()
 * until there is none.
 *
 * Pages are walked iteratively rather than recursively, so a long listing
 * cannot exhaust the call stack, and every visited URL is remembered so a
 * pagination link pointing back at an earlier page ends the walk instead
 * of looping forever.
 *
 * @param string $base    Prefix prepended to every page URL.
 * @param string $url     URL (relative to $base) of the first page.
 * @param array  $details Accumulator, modified in place (passed by reference).
 *                        Entries collected before a failure are kept.
 *
 * @throws \RuntimeException When a page cannot be fetched.
 */
function process_page($base, $url, &$details) {
    $visited = [];

    do {
        $visited[(string) $url] = true;

        $data = file_get_contents($base.$url);
        if ($data === false) {
            // file_get_contents() signals failure with false; parsing that
            // as HTML would only hide the real problem.
            throw new \RuntimeException('Unable to fetch page: '.$base.$url);
        }

        $dom = new IvoPetkov\HTML5DOMDocument();
        $dom->loadHtml($data);
        scrape_details($dom, $details);

        $url = get_next_page($dom);
        // Explicit null/'' comparison: a plain truthiness test would also
        // treat an href of '0' as "no next page".
    } while ($url !== null && $url !== '' && !isset($visited[$url]));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment