Skip to content

Instantly share code, notes, and snippets.

@amnuts
Last active April 21, 2021 06:44
Show Gist options
  • Save amnuts/7468033 to your computer and use it in GitHub Desktop.
Save amnuts/7468033 to your computer and use it in GitHub Desktop.
Example of how to scrape multiple pages using Zend\Dom from Zend Framework 2.
<?php
use \Zend\Dom\Query;
use \Zend\Debug\Debug;
/**
 * Fetch a page's source as UTF-8, caching it on disk so each URL is only
 * downloaded once.
 *
 * Cached copies live in a "cache" directory next to this file, named after
 * the MD5 hash of the URL. The directory is created on demand. A failed or
 * empty download is never written to the cache, so a transient error does
 * not get replayed on every later run.
 *
 * @param string $url Location to read; anything file_get_contents() accepts
 * @return string The UTF-8 source, or an empty string if nothing could be read
 */
function fetch($url)
{
    $cacheDir = __DIR__ . '/cache';
    $path = $cacheDir . '/' . md5($url);

    // Serve from the cache when possible; fall through to a fresh download
    // if the cached copy turns out to be unreadable.
    if (is_file($path)) {
        $cached = file_get_contents($path);
        if ($cached !== false) {
            return $cached;
        }
    }

    $content = file_get_contents($url);
    if ($content === false || $content === '') {
        // Nothing usable was retrieved: report "no content" without caching it.
        return '';
    }

    // Strict detection over the two expected encodings; if neither is
    // reported, treat the bytes as ISO-8859-1, which accepts any byte value.
    $encoding = mb_detect_encoding($content, 'UTF-8, ISO-8859-1', true);
    if ($encoding === false) {
        $encoding = 'ISO-8859-1';
    }
    $content = mb_convert_encoding($content, 'UTF-8', $encoding);

    // Caching is best-effort: when the directory cannot be created the
    // content is still returned, just not stored.
    if (!is_dir($cacheDir)) {
        mkdir($cacheDir, 0777, true);
    }
    if (is_dir($cacheDir)) {
        file_put_contents($path, $content, LOCK_EX);
    }

    return $content;
}
// Collected research interests, shaped as:
//   [department page heading => [faculty member name => [interest, ...]]]
$tags = [];

$site = fetch('http://sussex.academia.edu/');
$sdom = new Query($site);

// With no landing-page source there is nothing to select from, so iterate
// over an empty list instead of querying.
// NOTE(review): presumably Query cannot execute without a document — confirm.
$departmentLinks = ((string) $site === '')
    ? []
    : $sdom->execute('div#department_list ul li a');

foreach ($departmentLinks as $href) {
    $url = $href->getAttribute('href');
    if ($url === '') {
        // Anchor without a target: no department page to visit.
        continue;
    }

    $source = fetch($url);
    if ((string) $source === '') {
        // Department page could not be read; skip it rather than abort the run.
        continue;
    }

    $ddom = new Query($source);

    // The first <h1> labels the department; a page without one cannot be
    // keyed in $tags, so it is skipped instead of dereferencing null.
    $heading = $ddom->execute('h1')->current();
    if (!$heading) {
        continue;
    }
    $page = $heading->nodeValue;

    foreach ($ddom->execute('div#user_list fieldset') as $fieldset) {
        $xml = simplexml_import_dom($fieldset);

        // Only the fieldset whose legend reads "faculty" (any case) is of interest.
        if (strtolower((string) $xml->legend) !== 'faculty') {
            continue;
        }

        // Query is given a whole document, so the fieldset is serialised
        // back to markup to scope the next selector to this subtree.
        $subd = new Query($xml->asXml());
        foreach ($subd->execute('div.user_strip') as $userNode) {
            $userXml = simplexml_import_dom($userNode);

            // Same serialise-and-requery step, scoped to a single user strip.
            $linkd = new Query($userXml->asXml());
            $links = [];
            foreach ($linkd->execute('div.user_research_interests a.research_interest_link') as $link) {
                $links[] = $link->nodeValue;
            }

            // Record only people who list at least one research interest.
            if (count($links)) {
                $tags[$page][(string) $userXml->h3->a] = $links;
            }
        }
    }
}

Debug::dump($tags);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment