Skip to content

Instantly share code, notes, and snippets.

@bjornjohansen
Last active August 14, 2023 18:19
Show Gist options
  • Star 16 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save bjornjohansen/4905c93f9bd44e6084ec to your computer and use it in GitHub Desktop.
Save bjornjohansen/4905c93f9bd44e6084ec to your computer and use it in GitHub Desktop.
Basic sitemap crawler to warm up a full page cache
#!/usr/bin/php
<?php
/**
* @license http://www.wtfpl.net/txt/copying/ WTFPL
*/
// Pin the default timezone used by PHP's date/time functions to UTC.
date_default_timezone_set( 'UTC' );

// XML sitemaps (or sitemap index files) whose URLs should be requested.
$sitemaps = [ 'https://bjornjohansen.no/sitemap_index.xml' ];

// The constructor fetches and parses the sitemaps; run() then requests every URL found.
$crawler = new BJ_Crawler( $sitemaps );
$crawler->run();
/**
 * Sitemap crawler used to warm up a full page cache.
 *
 * Sitemaps are downloaded and parsed as soon as they are added (sitemap index
 * files are followed recursively). run() then requests every collected URL and
 * discards the responses.
 */
class BJ_Crawler {

	/**
	 * Sitemap URLs that have already been queued, so none is fetched twice.
	 *
	 * @var array
	 */
	protected $_sitemaps = null;

	/**
	 * Page URLs collected from the sitemaps, in discovery order, without duplicates.
	 *
	 * @var array
	 */
	protected $_urls = null;

	/**
	 * Constructor
	 *
	 * @param array|string $sitemaps A string with a URL to an XML sitemap, or an array with URLs to XML sitemaps. Sitemap index files work well too.
	 */
	public function __construct( $sitemaps = null ) {
		$this->_sitemaps = [];
		$this->_urls     = [];

		if ( is_null( $sitemaps ) ) {
			return;
		}

		if ( ! is_array( $sitemaps ) ) {
			$sitemaps = [ $sitemaps ];
		}

		foreach ( $sitemaps as $sitemap ) {
			$this->add_sitemap( $sitemap );
		}
	}

	/**
	 * Add a sitemap URL to our crawl stack. Sitemap index files work too.
	 *
	 * The sitemap is downloaded and parsed immediately: every <loc> of a
	 * <urlset> is passed to add_url(), every <loc> of a <sitemapindex> is
	 * passed back to this method.
	 *
	 * @param string $sitemapurl URL to an XML sitemap or sitemap index
	 * @return false|null False when the sitemap could not be fetched or parsed, null otherwise.
	 */
	public function add_sitemap( $sitemapurl ) {
		$sitemapurl = trim( (string) $sitemapurl );
		if ( '' === $sitemapurl ) {
			return false;
		}

		if ( in_array( $sitemapurl, $this->_sitemaps, true ) ) {
			return;
		}

		// Register before fetching so sitemap indexes that reference each other cannot recurse forever.
		$this->_sitemaps[] = $sitemapurl;

		$content = $this->fetch( $sitemapurl );
		if ( false === $content ) {
			return false;
		}

		$xml = $this->parse_sitemap( $content );
		if ( false === $xml ) {
			return false;
		}

		switch ( $xml->getName() ) {
			case 'sitemapindex':
				foreach ( $xml->sitemap as $sitemap ) {
					// Cast to string: <loc> is a SimpleXMLElement, not an array.
					$this->add_sitemap( (string) $sitemap->loc );
				}
				break;
			case 'urlset':
				foreach ( $xml->url as $url ) {
					$this->add_url( (string) $url->loc );
				}
				break;
			default:
				break;
		}
	}

	/**
	 * Add a URL to our crawl stack
	 *
	 * Surrounding whitespace is stripped; empty and already known URLs are ignored.
	 *
	 * @param string $url URL to check
	 */
	public function add_url( $url ) {
		$url = trim( (string) $url );
		if ( '' === $url ) {
			return;
		}

		if ( ! in_array( $url, $this->_urls, true ) ) {
			$this->_urls[] = $url;
		}
	}

	/**
	 * Run the crawl
	 *
	 * Requests every collected URL; response bodies are buffered by cURL and discarded.
	 */
	public function run() {
		// Split our URLs into chunks of 5 URLs to use with curl multi,
		// so at most 5 transfers are in flight at any time.
		foreach ( array_chunk( $this->_urls, 5 ) as $chunk ) {
			$mh      = curl_multi_init();
			$handles = [];

			foreach ( $chunk as $url ) {
				$ch = curl_init();
				curl_setopt( $ch, CURLOPT_URL, $url );
				// Keep the response body out of STDOUT.
				curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
				curl_multi_add_handle( $mh, $ch );
				$handles[] = $ch;
			}

			$active = null;
			do {
				do {
					$status = curl_multi_exec( $mh, $active );
				} while ( CURLM_CALL_MULTI_PERFORM === $status );

				if ( ! $active || CURLM_OK !== $status ) {
					break;
				}

				// curl_multi_select() returns -1 when the underlying select fails.
				// Pause briefly and drive the transfers again instead of spinning
				// without ever calling curl_multi_exec().
				if ( -1 === curl_multi_select( $mh ) ) {
					usleep( 10000 );
				}
			} while ( true );

			// Release every handle of this chunk before starting the next one.
			foreach ( $handles as $ch ) {
				curl_multi_remove_handle( $mh, $ch );
				$this->close_handle( $ch );
			}
			curl_multi_close( $mh );
		}
	}

	/**
	 * Download a document with cURL.
	 *
	 * @param string $url URL to download
	 * @return string|false The response body, or false on a transfer error or a non-200 HTTP status.
	 */
	protected function fetch( $url ) {
		$ch = curl_init();
		if ( false === $ch ) {
			return false;
		}

		curl_setopt( $ch, CURLOPT_URL, $url );
		curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );

		$content   = curl_exec( $ch );
		$http_code = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );
		$this->close_handle( $ch );

		if ( false === $content || 200 !== $http_code ) {
			return false;
		}

		return $content;
	}

	/**
	 * Parse the body of a sitemap or sitemap index.
	 *
	 * @param string $content XML document
	 * @return SimpleXMLElement|false The root element, or false when the document is not well-formed XML.
	 */
	protected function parse_sitemap( $content ) {
		// Collect libxml errors internally so malformed XML neither emits warnings nor throws.
		$previous = libxml_use_internal_errors( true );
		$xml      = simplexml_load_string( (string) $content, 'SimpleXMLElement', LIBXML_NOBLANKS );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous );

		// Callers must compare against false: an element without children is falsy, too.
		return $xml;
	}

	/**
	 * Close a cURL easy handle.
	 *
	 * @param resource|object $ch Handle returned by curl_init()
	 */
	protected function close_handle( $ch ) {
		// curl_close() has no effect from PHP 8.0 on, so only call it on
		// versions where it actually closes the handle.
		if ( PHP_VERSION_ID < 80000 ) {
			curl_close( $ch );
		}
	}
}
@bjornjohansen
Copy link
Author

@bjornjohansen I'm going to create a Composer package for cache warmup and would like to use parts of your code. Is this okay for you? Of course I'd add a source link to this gist file :)

Yes, of course @eliashaeussler

You don’t need to link back here. I’m hereby granting you a WTFPL license :-)

@eliashaeussler
Copy link

@bjornjohansen I'm going to create a Composer package for cache warmup and would like to use parts of your code. Is this okay for you? Of course I'd add a source link to this gist file :)

Yes, of course @eliashaeussler

You don’t need to link back here. I’m hereby granting you a WTFPL license :-)

Wow, that's very nice of you @bjornjohansen – thanks!

Here we go: https://packagist.org/packages/eliashaeussler/cache-warmup 🎉

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment