Created
December 22, 2011 13:23
-
-
Save icambridge/1510300 to your computer and use it in GitHub Desktop.
Web Spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Web Spider class
 *
 * @author Iain Cambridge
 * @package WebSpider
 * @version 0.1
 */
class WebSpider {

    /**
     * The version of the crawler.
     *
     * @var string
     */
    const VERSION = "0.1";

    /**
     * The url of the page currently being requested/crawled.
     *
     * @var string
     * @since 0.1
     */
    protected $url;

    /**
     * The links that have been collected during the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $links = array();

    /**
     * The site's base url, i.e. "http://<domain>/".
     *
     * @var string
     * @since 0.1
     */
    protected $siteUrl;

    /**
     * The options array handed to curl_setopt_array()
     * in WebSpider::executeCurl().
     *
     * @var array
     * @since 0.1
     */
    protected $curlOps = array();

    /**
     * The response body returned by the last curl request.
     *
     * @var string
     * @since 0.1
     */
    protected $curlResponse = null;

    /**
     * The protocols the spider is willing to follow.
     *
     * @var array
     * @since 0.1
     */
    protected $supportedProtocols = array('http', 'https');

    /**
     * The urls that have already been crawled this session.
     *
     * @var array
     * @since 0.1
     */
    protected $crawled = array();

    /**
     * Whether the spider should follow links to external sites.
     *
     * @var boolean
     * @since 0.1
     */
    protected $external = false;

    /**
     * The domain we are currently crawling.
     *
     * @var string
     * @since 0.1
     */
    protected $currentDomain;

    /**
     * The attachment urls found during the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $attachments = array();

    /**
     * The crawled urls for the current session, keyed by HTTP status code.
     *
     * @var array
     * @since 0.1
     */
    protected $responses = array();

    /**
     * The urls of the CSS files found during the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $cssUrls = array();

    /**
     * The urls of the JavaScript files found during the current session.
     * Previously created implicitly by getJs(); declared here so the
     * property always exists.
     *
     * @var array
     * @since 0.1
     */
    protected $jsUrls = array();

    /**
     * Sets whether the spider is to crawl external sites.
     *
     * Bug fix: this previously assigned to an undeclared
     * $this->crawlExternal property, so the $external flag tested in
     * links() was never actually updated.
     *
     * @param boolean $external
     * @throws RuntimeException When a non-boolean value is given (code 110).
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function crawlExternal( $external ){
        if ( !is_bool($external) ){
            throw new RuntimeException("Invalid value given to external", 110);
        }
        $this->external = $external;
        return $this;
    }

    /**
     * Sets the default curl options which will
     * generally be used by the Spider.
     *
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function setDefaultOptions(){
        $this->setCurlOption( CURLOPT_USERAGENT , "WwwBot (".self::VERSION.") " )
             ->setCurlOption( CURLOPT_FOLLOWLOCATION , 1 )
             ->setCurlOption( CURLOPT_RETURNTRANSFER , 1 )
             ->setCurlOption( CURLOPT_MAXREDIRS , 10 )
             ->setCurlOption( CURLOPT_AUTOREFERER , 1 )
             ->setCurlOption( CURLOPT_HTTP_VERSION , CURL_HTTP_VERSION_1_1 )
             ->setCurlOption( CURLOPT_CONNECTTIMEOUT , 5 );
        return $this;
    }

    /**
     * Sets a curl option by adding it to the array that is used
     * by curl_setopt_array() in WebSpider::executeCurl().
     *
     * @param int $name A CURLOPT_* constant.
     * @param mixed $value
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function setCurlOption( $name, $value ){
        $this->curlOps[$name] = $value;
        return $this;
    }

    /**
     * Executes the curl request. Realigns the current page url to
     * the effective url (we follow redirects), records the HTTP
     * status code, and derives the site's base url.
     *
     * Bug fixes: the method previously ended with a bare "$this;"
     * statement (no return, breaking fluent chaining) and never
     * released the curl handle.
     *
     * @throws Exception When curl reports an effective url that does
     *                   not look like a valid http(s) url.
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function executeCurl( ) {
        $ch = curl_init($this->url);
        curl_setopt_array($ch, $this->curlOps);
        $this->crawled[] = $this->url;
        // CURLOPT_FILE is a one-shot option set by saveTo(); drop it so the
        // next request doesn't write into the same (now stale) file handle.
        if ( isset( $this->curlOps[CURLOPT_FILE] ) ){
            unset( $this->curlOps[CURLOPT_FILE] );
        }
        $this->curlResponse = curl_exec($ch);
        // Since we follow redirects, fetch the effective url so relative
        // links on the final page resolve correctly.
        $this->url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        if ( !in_array($this->url, $this->crawled) ){
            $this->crawled[] = $this->url;
        }
        if ( !preg_match( "~^http(s)?://(([a-z0-9\-]+\.)+([a-z]{2,6}){1,2})(/.*)?$~iSU" , $this->url , $found )){
            curl_close($ch);
            // If this is flung then we are in a whole world of trouble. :D
            throw new Exception("Invalid URL '".$this->url."' returned from curl request");
        }
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        $this->responses[ $httpCode ][] = $this->url;
        $this->siteUrl = 'http://'.$found[2].'/';
        return $this;
    }

    /**
     * Sets the file that the next request's response body will be
     * written to (via CURLOPT_FILE). Creates the directory if needed.
     *
     * @param string $filename
     * @param boolean $overwrite Whether an existing file may be replaced.
     * @throws RuntimeException Codes 101-104 for the various failure modes.
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function saveTo( $filename , $overwrite = false ){
        if ( file_exists($filename) && $overwrite !== true ) {
            throw new RuntimeException( "File '".$filename."' already exists and overwrite isn't enabled." , 101 );
        }
        $dirname = dirname( $filename );
        if ( !is_dir($dirname) ) {
            if ( !mkdir($dirname, 0777, true) ) {
                throw new RuntimeException( "'".$dirname."' directory doesn't exist and unable to create the directory." , 102 );
            }
        }
        if ( !is_writable($dirname) ){
            throw new RuntimeException( "Unable to write to directory '".$dirname."'" , 103 );
        }
        // Bug fix: the message previously interpolated $dirname here.
        if ( file_exists($filename) && !is_writable($filename) ){
            throw new RuntimeException( "Unable to write to the file '".$filename."'" , 103 );
        }
        if ( ($fp = fopen($filename, 'w+')) === false ){
            throw new RuntimeException( "Unable to create socket for writing file '".$filename."'" , 104 );
        }
        $this->setCurlOption( CURLOPT_FILE, $fp );
        return $this;
    }

    /**
     * Searches the page content for CSS file links and appends any new
     * ones to $this->cssUrls.
     *
     * Bug fixes: the guard was inverted ("!isset") so the loop never ran,
     * and in_array() was called without a needle argument.
     *
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function getCss(){
        $css = $this->regexFind('~<link.*href=[\'"](.*\.css)[\'"].*(/></link>)~isU');
        if ( isset($css[1]) ){
            for ( $i = 0; $i < sizeof($css[1]); $i++ ){
                if ( !in_array($css[1][$i], $this->cssUrls) ){
                    $this->cssUrls[] = $css[1][$i];
                }
            }
        }
        return $this;
    }

    /**
     * Searches the page content for JavaScript file links and appends
     * any new ones to $this->jsUrls.
     *
     * Bug fixes: the guard was inverted ("!isset") so the loop never ran,
     * and in_array() was called without a needle argument.
     *
     * @return WebSpider Fluent interface.
     * @since 0.1
     */
    public function getJs(){
        $js = $this->regexFind('~<javascript.*src=[\'"](.*)[\'"].*>~isU');
        if ( isset($js[1]) ){
            for ( $i = 0; $i < sizeof($js[1]); $i++ ){
                if ( !in_array($js[1][$i], $this->jsUrls) ){
                    $this->jsUrls[] = $js[1][$i];
                }
            }
        }
        return $this;
    }

    /**
     * Scrapes the current page for anchor links, normalises them, files
     * attachments separately, and resolves relative urls against the
     * site/page url.
     *
     * @return array|null The accumulated links, or null when the page
     *                    contained no anchors.
     * @since 0.1
     */
    public function links(){
        preg_match_all("~<a.*href=[\"'](.*)[\"'].*>.*</a>~isU", $this->curlResponse , $matches );
        if ( !isset($matches[0]) || empty($matches) ){
            return;
        }
        for ( $i = 0; $i < sizeof($matches[0]); $i++ ){
            $link = $matches[1][$i];
            $link = str_replace('/../', '/', $link);
            // Make sure we don't visit mailto or javascript links.
            if ( preg_match('~^(mailto|javascript)\:~isU',$link) ){
                continue;
            }
            // Check to see if it's an attachment.
            if ( preg_match("~\.(png|jp(e)?g|gif|mp4|mp3|pdf|docx|doc|ppt|psd)$~",$link) ){
                $this->attachments[] = $link;
                continue;
            }
            // Remove anchor fragments from non-pure-anchor links.
            if ( !preg_match("~^#~",$link) ){
                $link = preg_replace('~#(.*)$~', '', $link);
            }
            // Check to see if it's a protocol link and if it is, see if
            // it's one we are willing to support.
            if ( preg_match( "~^([a-z0-9]+)://~isU" , $link , $protocol ) ){
                if ( in_array($protocol[1], $this->supportedProtocols) ){
                    if ( ( $this->external == true ) ||
                         ( $this->external == false &&
                           preg_match("~^http(s)?://".$this->currentDomain.".*~isU",$link)) ){
                        $this->links[] = $link;
                    }
                }
            } elseif ( preg_match( "~^[/#].*~isU" , $link ) ) {
                // Site-root-relative (or pure anchor) link.
                $this->links[] = $this->siteUrl.ltrim($link,'/');
            } else {
                // Page-relative link. (A previous "^/" check here was
                // unreachable: it was already consumed by the elseif above.)
                $this->links[] = dirname($this->url).'/'.ltrim($link,'/');
            }
        }
        return $this->links;
    }

    /**
     * Advances to the next uncrawled url in the link list and makes it
     * the current url.
     *
     * @return string|false The next url, or false when the list is exhausted.
     * @since 0.1
     */
    public function next(){
        do {
            $url = next($this->links);
            if ( $url !== false ){
                $url = preg_replace('~#(.*)$~', '', $url);
                if ( !in_array($url, $this->crawled) ){
                    $this->setUrl($url);
                    break;
                }
            }
        } while( $url !== false );
        return $url;
    }

    /**
     * Sets the URL for the page request and derives the current domain
     * and site base url from it.
     *
     * Bug fix: the error message previously reported the stale
     * $this->url instead of the rejected $url argument.
     *
     * @param string $url
     * @throws Exception When the url is not a valid http(s) url.
     * @return boolean Always true on success.
     * @since 0.1
     */
    public function setUrl( $url ){
        if ( !preg_match( "~^http(s)?://(([a-z0-9\-]+\.)+([a-z]{2,6}){1,2})(/.*)?$~iSU" , $url , $found )){
            throw new Exception("Invalid URL '".$url."' provided in setUrl()");
        }
        $this->currentDomain = $found[2];
        $this->siteUrl = 'http://'.$found[2].'/';
        $this->url = $url;
        return true;
    }

    /**
     * Runs a regular expression find on the current page content.
     *
     * @param string $pattern A PCRE pattern.
     * @param boolean $single When true, only the first match is returned.
     * @return array The preg_match(_all) matches array.
     * @since 0.1
     */
    public function regexFind( $pattern , $single = false ){
        if ( $single != true ){
            preg_match_all($pattern,$this->curlResponse,$found);
        } else {
            preg_match($pattern,$this->curlResponse,$found);
        }
        return $found;
    }

    /**
     * Gets the urls that have been crawled by the crawler already.
     *
     * @return array
     * @since 0.1
     */
    public function getCrawled() {
        return $this->crawled;
    }

    /**
     * Returns the url for the current request.
     *
     * @return string
     * @since 0.1
     */
    public function getUrl(){
        return $this->url;
    }

    /**
     * Returns the responses for the current session.
     *
     * NOTE: kept (with its historical misspelling) for backward
     * compatibility; prefer getResponses().
     *
     * @return array
     * @since 0.1
     */
    public function getReponses(){
        return $this->responses;
    }

    /**
     * Correctly-spelled alias of getReponses().
     *
     * @return array
     * @since 0.1
     */
    public function getResponses(){
        return $this->responses;
    }

    /**
     * Returns the attachments for the current session.
     *
     * @return array
     * @since 0.1
     */
    public function getAttachments(){
        return $this->attachments;
    }

    /**
     * Wipes the attachments for the session.
     *
     * @since 0.1
     */
    public function wipeAttachments(){
        $this->attachments = array();
    }

    /**
     * Sets the links that the session will work through.
     *
     * @param array $links
     * @since 0.1
     */
    public function setLinks( array $links ){
        $this->links = $links;
    }

    /**
     * Returns the current domain.
     *
     * @return string
     * @since 0.1
     */
    public function getDomain(){
        return $this->currentDomain;
    }

    /**
     * Resets all the arrays containing session data so a cloned spider
     * starts with a clean session.
     *
     * @since 0.1
     */
    public function __clone(){
        $this->siteUrl = '';
        $this->url = '';
        $this->links = array();
        $this->crawled = array();
        $this->responses = array();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Driver script: crawls a site with WebSpider, downloading any
 * attachments it finds along the way, then prints a summary of the
 * HTTP status codes seen.
 */
ini_set('memory_limit', -1);

require_once 'class.webspider.php';

$webSpider = new WebSpider();
$webSpider->setUrl('http://businesslink.gov.uk');
$webSpider->crawlExternal(false);
$webSpider->setDefaultOptions();

do {
    print "Crawling '".$webSpider->getUrl()."'".PHP_EOL;
    // Reset the per-page attachment list before each request.
    $webSpider->wipeAttachments();
    $webSpider->executeCurl();
    $webSpider->links();
    $attachments = $webSpider->getAttachments();
    if ( !empty($attachments) ){
        // Clone the spider (__clone resets session state) and use the
        // copy to download every attachment found on this page.
        $downloadSpider = clone $webSpider;
        $downloadSpider->setLinks($attachments);
        // Reset the url to the first attachment that is to be downloaded.
        $firstDownload = current($attachments);
        $downloadSpider->setUrl( $firstDownload );
        do {
            $basename = basename($downloadSpider->getUrl());
            print "\tDownloading ".$downloadSpider->getUrl()." : ";
            $downloadSpider->saveTo($basename, true);
            $downloadSpider->executeCurl();
            if ( !file_exists($basename) ){
                print "failed".PHP_EOL;
            } else {
                print "success".PHP_EOL;
            }
            // Do stuff with download here.
        } while ( $downloadSpider->next() );
    }
} while ( $webSpider->next() );

// Bug fix: this previously printed sizeof($posts) where $posts was
// declared but never populated, so the count was always 0. Report the
// actually-crawled page count instead. (A dead debug call,
// regexFind('~dsadasd~') with a discarded result, was also removed.)
print "Crawled ".sizeof($webSpider->getCrawled())." pages".PHP_EOL;
foreach ( $webSpider->getReponses() as $code => $urls ){
    print sizeof($urls)." out of ".sizeof( $webSpider->getCrawled() )." were ".$code.PHP_EOL;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment