Web Spider
@icambridge · Created December 22, 2011
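A small single-class web spider written in PHP on top of the cURL extension. It crawls a site page by page, follows redirects, collects and resolves links, records HTTP response codes, and queues binary attachments (images, PDFs, media files) for download. An example script that crawls a site and saves its attachments follows the class.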
<?php
/**
* Web Spider class
*
* @author Iain Cambridge
* @package WebSpider
* @version 0.1
*/
class WebSpider {
    /**
     * The version of the crawler.
     *
     * @var string
     */
    const VERSION = "0.1";

    /**
     * The URL of the page being crawled.
     *
     * @var string
     * @since 0.1
     */
    protected $url;

    /**
     * The links that have been found in
     * the current page view.
     *
     * @var array
     * @since 0.1
     */
    protected $links = array();

    /**
     * The base URL of the site's domain.
     *
     * @var string
     * @since 0.1
     */
    protected $siteUrl;

    /**
     * The array that will be used by curl_setopt_array
     * in WebSpider::executeCurl().
     *
     * @var array
     * @since 0.1
     */
    protected $curlOps = array();

    /**
     * The response that was returned from curl.
     *
     * @var string
     * @since 0.1
     */
    protected $curlResponse = null;

    /**
     * Holds the types of protocols the spider supports.
     *
     * @var array
     * @since 0.1
     */
    protected $supportedProtocols = array('http', 'https');

    /**
     * Contains the URLs that have already been crawled.
     *
     * @var array
     * @since 0.1
     */
    protected $crawled = array();

    /**
     * Whether the spider should crawl external sites.
     *
     * @var boolean
     * @since 0.1
     */
    protected $external = false;

    /**
     * The domain we are currently crawling.
     *
     * @var string
     * @since 0.1
     */
    protected $currentDomain;

    /**
     * The attachments for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $attachments = array();

    /**
     * The responses for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $responses = array();

    /**
     * The URLs of the CSS files for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $cssUrls = array();

    /**
     * The URLs of the JavaScript files for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $jsUrls = array();
    /**
     * Sets whether the spider should crawl external sites.
     *
     * @param boolean $external
     * @since 0.1
     */
    public function crawlExternal( $external ){
        if ( !is_bool($external) ){
            throw new RuntimeException("Invalid value given to external", 110);
        }
        $this->external = $external;
        return $this;
    }

    /**
     * Sets the default curl options which will
     * generally be used by the spider.
     *
     * @since 0.1
     */
    public function setDefaultOptions(){
        $this->setCurlOption( CURLOPT_USERAGENT, "WwwBot (".self::VERSION.")" )
             ->setCurlOption( CURLOPT_FOLLOWLOCATION, 1 )
             ->setCurlOption( CURLOPT_RETURNTRANSFER, 1 )
             ->setCurlOption( CURLOPT_MAXREDIRS, 10 )
             ->setCurlOption( CURLOPT_AUTOREFERER, 1 )
             ->setCurlOption( CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1 )
             ->setCurlOption( CURLOPT_CONNECTTIMEOUT, 5 );
        return $this;
    }

    /**
     * Sets a curl option by adding it to the array that is used
     * by curl_setopt_array in WebSpider::executeCurl().
     *
     * @param int $name A CURLOPT_* constant.
     * @param mixed $value
     * @since 0.1
     */
    public function setCurlOption( $name, $value ){
        $this->curlOps[$name] = $value;
        return $this;
    }
    /**
     * Executes the curl request. Realigns the current page URL to
     * the effective URL reported by curl.
     *
     * @since 0.1
     */
    public function executeCurl(){
        $ch = curl_init($this->url);
        curl_setopt_array($ch, $this->curlOps);
        $this->crawled[] = $this->url;
        // CURLOPT_FILE is a one-shot option set by saveTo(), so drop it from
        // the option array once it has been applied to this handle.
        if ( isset($this->curlOps[CURLOPT_FILE]) ){
            unset($this->curlOps[CURLOPT_FILE]);
        }
        $this->curlResponse = curl_exec($ch);
        // Since we follow redirects, fetch the effective URL so relative
        // links are resolved against the page we actually ended up on.
        $this->url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        if ( !in_array($this->url, $this->crawled) ){
            $this->crawled[] = $this->url;
        }
        if ( !preg_match("~^http(s)?://(([a-z0-9\-]+\.)+([a-z]{2,6}){1,2})(/.*)?$~iSU", $this->url, $found) ){
            // If this is thrown then we are in a whole world of trouble. :D
            throw new Exception("Invalid URL '".$this->url."' returned from curl request");
        }
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        $this->responses[$httpCode][] = $this->url;
        $this->siteUrl = 'http://'.$found[2].'/';
        return $this;
    }
    /**
     * Sets the file that the response data will be saved to.
     *
     * @param string $filename
     * @param boolean $overwrite
     * @since 0.1
     */
    public function saveTo( $filename, $overwrite = false ){
        if ( file_exists($filename) && $overwrite !== true ){
            throw new RuntimeException("File '".$filename."' already exists and overwrite isn't enabled.", 101);
        }
        $dirname = dirname($filename);
        if ( !is_dir($dirname) ){
            if ( !mkdir($dirname, 0777, true) ){
                throw new RuntimeException("'".$dirname."' directory doesn't exist and unable to create the directory.", 102);
            }
        }
        if ( !is_writable($dirname) ){
            throw new RuntimeException("Unable to write to directory '".$dirname."'", 103);
        }
        if ( file_exists($filename) && !is_writable($filename) ){
            throw new RuntimeException("Unable to write to the file '".$filename."'", 103);
        }
        if ( ($fp = fopen($filename, 'w+')) === false ){
            throw new RuntimeException("Unable to open file '".$filename."' for writing", 104);
        }
        $this->setCurlOption( CURLOPT_FILE, $fp );
        return $this;
    }
    /**
     * Searches the page content for CSS files.
     *
     * @since 0.1
     */
    public function getCss(){
        $css = $this->regexFind('~<link.*href=[\'"](.*\.css)[\'"].*>~isU');
        if ( isset($css[1]) ){
            for ( $i = 0; $i < sizeof($css[1]); $i++ ){
                if ( !in_array($css[1][$i], $this->cssUrls) ){
                    $this->cssUrls[] = $css[1][$i];
                }
            }
        }
        return $this;
    }

    /**
     * Searches the page content for JavaScript files.
     *
     * @since 0.1
     */
    public function getJs(){
        $js = $this->regexFind('~<script.*src=[\'"](.*)[\'"].*>~isU');
        if ( isset($js[1]) ){
            for ( $i = 0; $i < sizeof($js[1]); $i++ ){
                if ( !in_array($js[1][$i], $this->jsUrls) ){
                    $this->jsUrls[] = $js[1][$i];
                }
            }
        }
        return $this;
    }
    /**
     * Scrapes the page for links.
     *
     * @since 0.1
     */
    public function links(){
        preg_match_all("~<a.*href=[\"'](.*)[\"'].*>.*</a>~isU", $this->curlResponse, $matches);
        if ( empty($matches[0]) ){
            return;
        }
        for ( $i = 0; $i < sizeof($matches[0]); $i++ ){
            $link = $matches[1][$i];
            $link = str_replace('/../', '/', $link);
            // Make sure we don't visit mailto or javascript links.
            if ( preg_match('~^(mailto|javascript):~i', $link) ){
                continue;
            }
            // Skip pure in-page anchors and strip fragments from everything else.
            if ( preg_match('~^#~', $link) ){
                continue;
            }
            $link = preg_replace('~#.*$~', '', $link);
            // Resolve relative links to absolute URLs so they can be
            // fed back into setUrl().
            if ( !preg_match('~^[a-z0-9]+://~i', $link) ){
                if ( preg_match('~^/~', $link) ){
                    // Root-relative link.
                    $link = $this->siteUrl.ltrim($link, '/');
                } else {
                    // Relative link, resolved against the current page's directory.
                    $link = dirname($this->url).'/'.$link;
                }
            }
            // Check to see if it's an attachment.
            if ( preg_match("~\.(png|jp(e)?g|gif|mp4|mp3|pdf|docx|doc|ppt|psd)$~", $link) ){
                $this->attachments[] = $link;
                continue;
            }
            // Only follow protocols we support, and only leave the current
            // domain when external crawling is enabled.
            preg_match('~^([a-z0-9]+)://~i', $link, $protocol);
            if ( isset($protocol[1]) && in_array(strtolower($protocol[1]), $this->supportedProtocols) ){
                if ( $this->external === true ||
                     preg_match("~^http(s)?://".preg_quote($this->currentDomain, '~')."~i", $link) ){
                    $this->links[] = $link;
                }
            }
        }
        return $this->links;
    }
    /**
     * Advances to the next uncrawled URL and sets it as the current page.
     *
     * @since 0.1
     */
    public function next(){
        do {
            $url = next($this->links);
            if ( $url !== false ){
                $url = preg_replace('~#.*$~', '', $url);
                if ( !in_array($url, $this->crawled) ){
                    $this->setUrl($url);
                    break;
                }
            }
        } while ( $url !== false );
        return $url;
    }

    /**
     * Sets the URL for the page request.
     *
     * @param string $url
     * @since 0.1
     */
    public function setUrl( $url ){
        if ( !preg_match("~^http(s)?://(([a-z0-9\-]+\.)+([a-z]{2,6}){1,2})(/.*)?$~iSU", $url, $found) ){
            throw new Exception("Invalid URL '".$url."' provided in setUrl()");
        }
        $this->currentDomain = $found[2];
        $this->siteUrl = 'http://'.$found[2].'/';
        $this->url = $url;
        return true;
    }

    /**
     * Runs a regular expression find on the current page content.
     *
     * @param string $pattern
     * @param boolean $single Match once instead of globally.
     * @since 0.1
     */
    public function regexFind( $pattern, $single = false ){
        if ( $single !== true ){
            preg_match_all($pattern, $this->curlResponse, $found);
        } else {
            preg_match($pattern, $this->curlResponse, $found);
        }
        return $found;
    }
    /**
     * Gets the URLs that have already been crawled.
     *
     * @since 0.1
     */
    public function getCrawled(){
        return $this->crawled;
    }

    /**
     * Returns the URL for the current request.
     *
     * @since 0.1
     */
    public function getUrl(){
        return $this->url;
    }

    /**
     * Returns the responses for the current session.
     *
     * @since 0.1
     */
    public function getResponses(){
        return $this->responses;
    }

    /**
     * Returns the attachments for the current session.
     *
     * @since 0.1
     */
    public function getAttachments(){
        return $this->attachments;
    }

    /**
     * Wipes the attachments for the session.
     *
     * @since 0.1
     */
    public function wipeAttachments(){
        $this->attachments = array();
    }

    /**
     * Sets the links that the session will work through.
     *
     * @param array $links
     * @since 0.1
     */
    public function setLinks( array $links ){
        $this->links = $links;
    }

    /**
     * Returns the current domain.
     *
     * @since 0.1
     */
    public function getDomain(){
        return $this->currentDomain;
    }

    /**
     * Resets all the arrays containing session data.
     *
     * @since 0.1
     */
    public function __clone(){
        $this->siteUrl = '';
        $this->url = '';
        $this->links = array();
        $this->crawled = array();
        $this->responses = array();
    }
}
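Example usage: the script below drives the class. It crawls a single site with external crawling disabled, and for every page that yields attachments it clones the spider, works through the attachment queue with saveTo() and executeCurl(), then resumes the main crawl.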
<?php
ini_set('memory_limit', '-1');
require_once 'class.webspider.php';

$webSpider = new WebSpider();
$webSpider->setUrl('http://businesslink.gov.uk');
$webSpider->crawlExternal(false);
$webSpider->setDefaultOptions();

do {
    print "Crawling '".$webSpider->getUrl()."'".PHP_EOL;
    $webSpider->wipeAttachments();
    $webSpider->executeCurl();
    $webSpider->links();
    $attachments = $webSpider->getAttachments();
    if ( !empty($attachments) ){
        $downloadSpider = clone $webSpider;
        $downloadSpider->setLinks($attachments);
        // Reset the URL to the first attachment that is
        // to be downloaded.
        $firstDownload = current($attachments);
        $downloadSpider->setUrl($firstDownload);
        do {
            $basename = basename($downloadSpider->getUrl());
            print "\tDownloading ".$downloadSpider->getUrl()." : ";
            $downloadSpider->saveTo($basename, true);
            $downloadSpider->executeCurl();
            if ( !file_exists($basename) ){
                print "failed".PHP_EOL;
            } else {
                print "success".PHP_EOL;
            }
            // Do stuff with the download here.
        } while ( $downloadSpider->next() );
    }
} while ( $webSpider->next() );

print "Crawled ".sizeof($webSpider->getCrawled())." pages".PHP_EOL;
foreach ( $webSpider->getResponses() as $code => $urls ){
    print sizeof($urls)." out of ".sizeof($webSpider->getCrawled())." were ".$code.PHP_EOL;
}
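The regexFind() helper can also be used inside the crawl loop to scrape data from each fetched page. A minimal sketch, assuming hypothetical <h2 class="post-title"> markup; the pattern and the $posts array are illustrative and not part of the original script:

<?php
// Hypothetical scraping step, run after $webSpider->executeCurl():
// collects post titles from the fetched page into $posts.
$found = $webSpider->regexFind('~<h2 class="post-title">(.*)</h2>~isU');
if ( !empty($found[1]) ){
    foreach ( $found[1] as $title ){
        $posts[] = strip_tags($title);
    }
}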