Skip to content

Instantly share code, notes, and snippets.

@coodix
Forked from bjornjohansen/sitemap-crawler.php
Created February 7, 2017 13:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save coodix/bc0965c2956b747ab4e7c77a0d2e83e9 to your computer and use it in GitHub Desktop.
Save coodix/bc0965c2956b747ab4e7c77a0d2e83e9 to your computer and use it in GitHub Desktop.
Basic sitemap crawler to warm up a full page cache
#!/usr/bin/php
<?php
// Use UTC so behaviour does not depend on the host's configured timezone.
date_default_timezone_set( 'UTC' );
// Default sitemap (or sitemap index) to crawl when none is given on the command line.
$sitemaps = array(
'https://bjornjohansen.no/sitemap_index.xml',
);
// Sitemap URLs passed as command line arguments replace the default list,
// e.g. `./sitemap-crawler.php https://example.com/sitemap.xml`.
// Only honoured under the CLI SAPI, where argv really is the command line.
if ( 'cli' === PHP_SAPI && isset( $_SERVER['argv'] ) && is_array( $_SERVER['argv'] ) && count( $_SERVER['argv'] ) > 1 ) {
    $sitemaps = array_slice( $_SERVER['argv'], 1 );
}
// The constructor downloads and parses every sitemap; run() then requests each collected URL.
$crawler = new BJ_Crawler( $sitemaps );
$crawler->run();
/**
 * Basic sitemap crawler used to warm up a full page cache.
 *
 * Sitemaps are downloaded and parsed as soon as they are added (the
 * constructor does this for every sitemap it is given). The page URLs they
 * list are collected and later requested by run(); response bodies are
 * discarded.
 */
class BJ_Crawler {
    /**
     * Sitemap URLs that have already been added. Also protects against
     * sitemap indexes that reference each other in a loop.
     *
     * @var string[]
     */
    protected $_sitemaps = null;

    /**
     * Unique page URLs collected from the sitemaps, keyed by the URL itself
     * so that duplicate detection is a constant-time lookup.
     *
     * @var string[]
     */
    protected $_urls = null;

    /**
     * Constructor
     *
     * @param array|string $sitemaps A string with the URL of an XML sitemap, or an array of such URLs. Sitemap index files work too.
     */
    public function __construct( $sitemaps = null ) {
        $this->_sitemaps = [];
        $this->_urls     = [];

        if ( is_null( $sitemaps ) ) {
            return;
        }

        if ( ! is_array( $sitemaps ) ) {
            $sitemaps = array( $sitemaps );
        }

        foreach ( $sitemaps as $sitemap ) {
            $this->add_sitemap( $sitemap );
        }
    }

    /**
     * Add a sitemap URL to our crawl stack. Sitemap index files work too.
     *
     * The sitemap is downloaded and parsed immediately: the sitemaps listed
     * in an index are added recursively, and the URLs listed in a URL set are
     * passed to add_url().
     *
     * @param string $sitemapurl URL to an XML sitemap or sitemap index
     *
     * @return false|null False if the URL is empty or the sitemap could not be fetched or parsed; null otherwise (including when it was already added).
     */
    public function add_sitemap( $sitemapurl ) {
        // <loc> values in pretty-printed sitemaps are often padded with whitespace.
        $sitemapurl = trim( (string) $sitemapurl );
        if ( '' === $sitemapurl ) {
            return false;
        }

        if ( in_array( $sitemapurl, $this->_sitemaps, true ) ) {
            return;
        }
        // Record the sitemap before fetching so that a failing or
        // self-referencing sitemap is never requested twice.
        $this->_sitemaps[] = $sitemapurl;

        $ch = curl_init();
        if ( false === $ch ) {
            return false;
        }
        curl_setopt( $ch, CURLOPT_URL, $sitemapurl );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
        $content          = curl_exec( $ch );
        $http_return_code = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );
        // Release the handle now instead of holding it through the recursion below.
        unset( $ch );

        if ( ! is_string( $content ) || 200 !== $http_return_code ) {
            return false;
        }

        // Parse with libxml errors collected internally: malformed XML then
        // yields false instead of warnings plus an uncaught exception.
        $previous_setting = libxml_use_internal_errors( true );
        $xml              = simplexml_load_string( $content, 'SimpleXMLElement', LIBXML_NOBLANKS );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous_setting );

        if ( false === $xml ) {
            return false;
        }

        switch ( $xml->getName() ) {
            case 'sitemapindex':
                foreach ( $xml->sitemap as $sitemap ) {
                    // Cast rather than reset(): calling reset() on an object is deprecated as of PHP 8.1.
                    $this->add_sitemap( (string) $sitemap->loc );
                }
                break;
            case 'urlset':
                foreach ( $xml->url as $url ) {
                    $this->add_url( (string) $url->loc );
                }
                break;
            default:
                // Unknown root element: nothing to collect.
                break;
        }
    }

    /**
     * Add a URL to our crawl stack
     *
     * Surrounding whitespace is stripped; empty and already known URLs are ignored.
     *
     * @param string $url URL to check
     */
    public function add_url( $url ) {
        $url = trim( (string) $url );
        if ( '' === $url ) {
            return;
        }

        if ( ! isset( $this->_urls[ $url ] ) ) {
            $this->_urls[ $url ] = $url;
        }
    }

    /**
     * Run the crawl
     *
     * Requests every collected URL, a few at a time in parallel using curl
     * multi. Responses are fetched and thrown away.
     *
     * @param int $concurrency Number of URLs requested in parallel per batch. Values below 1 are treated as 1.
     */
    public function run( $concurrency = 5 ) {
        $concurrency = max( 1, (int) $concurrency );

        // Split our URLs into chunks to use with curl multi
        $chunks = array_chunk( $this->_urls, $concurrency );

        foreach ( $chunks as $chunk ) {
            $mh      = curl_multi_init();
            $handles = [];

            foreach ( $chunk as $url ) {
                $ch = curl_init();
                if ( false === $ch ) {
                    continue;
                }
                curl_setopt( $ch, CURLOPT_URL, $url );
                // Keep the response body out of the script's output.
                curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
                curl_multi_add_handle( $mh, $ch );
                $handles[] = $ch;
            }

            $active = null;
            do {
                do {
                    $mrc = curl_multi_exec( $mh, $active );
                } while ( CURLM_CALL_MULTI_PERFORM === $mrc );

                // Wait for activity on any transfer. curl_multi_select() may
                // return -1 without waiting; pause briefly in that case and
                // still call curl_multi_exec() again, so the loop can neither
                // spin at full CPU nor stall with $active never updated.
                if ( $active && CURLM_OK === $mrc && -1 === curl_multi_select( $mh ) ) {
                    usleep( 10000 );
                }
            } while ( $active && CURLM_OK === $mrc );

            // Detach the finished transfers and release the multi handle
            // before starting the next batch.
            foreach ( $handles as $ch ) {
                curl_multi_remove_handle( $mh, $ch );
            }
            curl_multi_close( $mh );
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment