coodix/sitemap-crawler.php

## sitemap-crawler.php
#!/usr/bin/php
<?php

// Use UTC as the default timezone for every date/time function in this script.
date_default_timezone_set( 'UTC' );

// XML sitemaps (or sitemap index files) whose pages should be requested.
$sitemaps = [
	'https://bjornjohansen.no/sitemap_index.xml',
];

// The sitemaps are downloaded and parsed while the crawler is constructed;
// run() then requests every page URL that was collected from them.
$crawler = new BJ_Crawler( $sitemaps );
$crawler->run();


/**
 * Crawler that collects page URLs from XML sitemaps and requests each of them.
 *
 * Sitemaps are downloaded and parsed as soon as they are added (which includes the
 * ones passed to the constructor). The collected page URLs are only requested when
 * run() is called; the response bodies are discarded.
 */
class BJ_Crawler {

	/**
	 * URLs of the sitemaps and sitemap indexes that have already been queued.
	 *
	 * @var string[]
	 */
	protected $_sitemaps = [];

	/**
	 * Unique page URLs collected so far, in the order they were discovered.
	 *
	 * @var string[]
	 */
	protected $_urls = [];

	/**
	 * Constructor
	 *
	 * @param array|string $sitemaps A string with a URL to an XML sitemap, or an array with URLs to XML sitemaps. Sitemap index files work too.
	 */
	public function __construct( $sitemaps = null ) {

		$this->_sitemaps = [];
		$this->_urls     = [];

		if ( is_null( $sitemaps ) ) {
			return;
		}

		if ( ! is_array( $sitemaps ) ) {
			$sitemaps = [ $sitemaps ];
		}

		foreach ( $sitemaps as $sitemap ) {
			$this->add_sitemap( $sitemap );
		}
	}

	/**
	 * Download a sitemap and queue everything it lists. Sitemap index files work too:
	 * every sitemap they reference is added recursively.
	 *
	 * @param string $sitemapurl URL to an XML sitemap or sitemap index
	 *
	 * @return false|null False when the sitemap could not be downloaded (anything but
	 *                    HTTP 200) or parsed; null otherwise, including when the URL
	 *                    had already been added.
	 */
	public function add_sitemap( $sitemapurl ) {

		if ( in_array( $sitemapurl, $this->_sitemaps, true ) ) {
			return;
		}

		// Register before fetching, so an index that (indirectly) references itself cannot recurse forever.
		$this->_sitemaps[] = $sitemapurl;

		$content = $this->fetch_sitemap( $sitemapurl );
		if ( false === $content ) {
			return false;
		}

		try {
			$xml = new SimpleXMLElement( $content, LIBXML_NOBLANKS );
		} catch ( Exception $e ) {
			// The constructor throws on malformed or empty XML; one broken sitemap must not abort the whole crawl.
			return false;
		}

		// A root element with neither children nor attributes casts to false: nothing to collect.
		if ( ! $xml ) {
			return false;
		}

		switch ( $xml->getName() ) {
			case 'sitemapindex':
				foreach ( $xml->sitemap as $sitemap ) {
					$location = trim( (string) $sitemap->loc );
					if ( '' !== $location ) {
						$this->add_sitemap( $location );
					}
				}
				break;

			case 'urlset':
				foreach ( $xml->url as $url ) {
					$location = trim( (string) $url->loc );
					if ( '' !== $location ) {
						$this->add_url( $location );
					}
				}
				break;

			default:
				// Unknown root element: not a sitemap format we understand, ignore it.
				break;
		}
	}

	/**
	 * Add a URL to our crawl stack unless it is already there.
	 *
	 * @param string $url URL to request when run() is called
	 */
	public function add_url( $url ) {

		// Strict comparison: a loose in_array() treats distinct numeric-looking strings as duplicates.
		if ( in_array( $url, $this->_urls, true ) ) {
			return;
		}

		$this->_urls[] = $url;
	}

	/**
	 * Run the crawl: request every collected URL, a few at a time, using curl multi.
	 *
	 * @param int $concurrency Number of URLs to request in parallel. Values below 1 are treated as 1.
	 */
	public function run( $concurrency = 5 ) {

		// array_chunk() rejects sizes below 1, so clamp whatever the caller passed.
		$concurrency = max( 1, (int) $concurrency );

		foreach ( array_chunk( $this->_urls, $concurrency ) as $chunk ) {

			$mh      = curl_multi_init();
			$handles = [];

			foreach ( $chunk as $url ) {
				$ch = curl_init();
				curl_setopt( $ch, CURLOPT_URL, $url );
				curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
				curl_multi_add_handle( $mh, $ch );
				$handles[] = $ch;
			}

			$active = null;
			do {
				$mrc = curl_multi_exec( $mh, $active );

				if ( $active && CURLM_OK === $mrc && -1 === curl_multi_select( $mh ) ) {
					// select() failed without waiting. Back off briefly and call
					// curl_multi_exec() again; skipping the exec here would loop forever.
					usleep( 10000 );
				}
			} while ( $active && ( CURLM_OK === $mrc || CURLM_CALL_MULTI_PERFORM === $mrc ) );

			// Detach the finished transfers and release the multi handle before the next chunk.
			foreach ( $handles as $ch ) {
				curl_multi_remove_handle( $mh, $ch );
			}
			curl_multi_close( $mh );
		}
	}

	/**
	 * Download a sitemap document.
	 *
	 * The cURL handle is local to this method, so it is released on return instead of
	 * staying alive while child sitemaps are fetched recursively.
	 *
	 * @param string $url URL to download
	 *
	 * @return string|false The response body, or false on a transfer error or any HTTP status other than 200.
	 */
	private function fetch_sitemap( $url ) {

		$ch = curl_init();
		if ( false === $ch ) {
			return false;
		}

		curl_setopt( $ch, CURLOPT_URL, $url );
		curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );

		$content          = curl_exec( $ch );
		$http_return_code = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

		if ( false === $content || 200 !== $http_return_code ) {
			return false;
		}

		return $content;
	}

}
	#!/usr/bin/php
	<?php

	// Use UTC as the default timezone for every date/time function in this script.
	date_default_timezone_set( 'UTC' );

	// XML sitemaps (or sitemap index files) whose pages should be requested.
	$sitemaps = [
		'https://bjornjohansen.no/sitemap_index.xml',
	];

	// The sitemaps are downloaded and parsed while the crawler is constructed;
	// run() then requests every page URL that was collected from them.
	$crawler = new BJ_Crawler( $sitemaps );
	$crawler->run();


	/**
	 * Crawler that collects page URLs from XML sitemaps and requests each of them.
	 *
	 * Sitemaps are downloaded and parsed as soon as they are added (which includes the
	 * ones passed to the constructor). The collected page URLs are only requested when
	 * run() is called; the response bodies are discarded.
	 */
	class BJ_Crawler {

		/**
		 * URLs of the sitemaps and sitemap indexes that have already been queued.
		 *
		 * @var string[]
		 */
		protected $_sitemaps = [];

		/**
		 * Unique page URLs collected so far, in the order they were discovered.
		 *
		 * @var string[]
		 */
		protected $_urls = [];

		/**
		 * Constructor
		 *
		 * @param array|string $sitemaps A string with a URL to an XML sitemap, or an array with URLs to XML sitemaps. Sitemap index files work too.
		 */
		public function __construct( $sitemaps = null ) {

			$this->_sitemaps = [];
			$this->_urls     = [];

			if ( is_null( $sitemaps ) ) {
				return;
			}

			if ( ! is_array( $sitemaps ) ) {
				$sitemaps = [ $sitemaps ];
			}

			foreach ( $sitemaps as $sitemap ) {
				$this->add_sitemap( $sitemap );
			}
		}

		/**
		 * Download a sitemap and queue everything it lists. Sitemap index files work too:
		 * every sitemap they reference is added recursively.
		 *
		 * @param string $sitemapurl URL to an XML sitemap or sitemap index
		 *
		 * @return false|null False when the sitemap could not be downloaded (anything but
		 *                    HTTP 200) or parsed; null otherwise, including when the URL
		 *                    had already been added.
		 */
		public function add_sitemap( $sitemapurl ) {

			if ( in_array( $sitemapurl, $this->_sitemaps, true ) ) {
				return;
			}

			// Register before fetching, so an index that (indirectly) references itself cannot recurse forever.
			$this->_sitemaps[] = $sitemapurl;

			$content = $this->fetch_sitemap( $sitemapurl );
			if ( false === $content ) {
				return false;
			}

			try {
				$xml = new SimpleXMLElement( $content, LIBXML_NOBLANKS );
			} catch ( Exception $e ) {
				// The constructor throws on malformed or empty XML; one broken sitemap must not abort the whole crawl.
				return false;
			}

			// A root element with neither children nor attributes casts to false: nothing to collect.
			if ( ! $xml ) {
				return false;
			}

			switch ( $xml->getName() ) {
				case 'sitemapindex':
					foreach ( $xml->sitemap as $sitemap ) {
						$location = trim( (string) $sitemap->loc );
						if ( '' !== $location ) {
							$this->add_sitemap( $location );
						}
					}
					break;

				case 'urlset':
					foreach ( $xml->url as $url ) {
						$location = trim( (string) $url->loc );
						if ( '' !== $location ) {
							$this->add_url( $location );
						}
					}
					break;

				default:
					// Unknown root element: not a sitemap format we understand, ignore it.
					break;
			}
		}

		/**
		 * Add a URL to our crawl stack unless it is already there.
		 *
		 * @param string $url URL to request when run() is called
		 */
		public function add_url( $url ) {

			// Strict comparison: a loose in_array() treats distinct numeric-looking strings as duplicates.
			if ( in_array( $url, $this->_urls, true ) ) {
				return;
			}

			$this->_urls[] = $url;
		}

		/**
		 * Run the crawl: request every collected URL, a few at a time, using curl multi.
		 *
		 * @param int $concurrency Number of URLs to request in parallel. Values below 1 are treated as 1.
		 */
		public function run( $concurrency = 5 ) {

			// array_chunk() rejects sizes below 1, so clamp whatever the caller passed.
			$concurrency = max( 1, (int) $concurrency );

			foreach ( array_chunk( $this->_urls, $concurrency ) as $chunk ) {

				$mh      = curl_multi_init();
				$handles = [];

				foreach ( $chunk as $url ) {
					$ch = curl_init();
					curl_setopt( $ch, CURLOPT_URL, $url );
					curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
					curl_multi_add_handle( $mh, $ch );
					$handles[] = $ch;
				}

				$active = null;
				do {
					$mrc = curl_multi_exec( $mh, $active );

					if ( $active && CURLM_OK === $mrc && -1 === curl_multi_select( $mh ) ) {
						// select() failed without waiting. Back off briefly and call
						// curl_multi_exec() again; skipping the exec here would loop forever.
						usleep( 10000 );
					}
				} while ( $active && ( CURLM_OK === $mrc || CURLM_CALL_MULTI_PERFORM === $mrc ) );

				// Detach the finished transfers and release the multi handle before the next chunk.
				foreach ( $handles as $ch ) {
					curl_multi_remove_handle( $mh, $ch );
				}
				curl_multi_close( $mh );
			}
		}

		/**
		 * Download a sitemap document.
		 *
		 * The cURL handle is local to this method, so it is released on return instead of
		 * staying alive while child sitemaps are fetched recursively.
		 *
		 * @param string $url URL to download
		 *
		 * @return string|false The response body, or false on a transfer error or any HTTP status other than 200.
		 */
		private function fetch_sitemap( $url ) {

			$ch = curl_init();
			if ( false === $ch ) {
				return false;
			}

			curl_setopt( $ch, CURLOPT_URL, $url );
			curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );

			$content          = curl_exec( $ch );
			$http_return_code = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

			if ( false === $content || 200 !== $http_return_code ) {
				return false;
			}

			return $content;
		}

	}