Web Spider
@icambridge · Created December 22, 2011
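A small single-class web spider written in PHP on top of the cURL extension. It crawls a site page by page, follows redirects, collects and resolves links, records HTTP response codes, and queues binary attachments (images, PDFs, media files) for download. An example script that crawls a site and saves its attachments follows the class.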
<?php
/**
* Web Spider class
*
* @author Iain Cambridge
* @package WebSpider
* @version 0.1
*/
class WebSpider {
    /**
     * The version of the crawler.
     *
     * @var string
     */
    const VERSION = "0.1";

    /**
     * The URL of the page being crawled.
     *
     * @var string
     * @since 0.1
     */
    protected $url;

    /**
     * The links that have been found in
     * the current page view.
     *
     * @var array
     * @since 0.1
     */
    protected $links = array();

    /**
     * The base URL of the site's domain.
     *
     * @var string
     * @since 0.1
     */
    protected $siteUrl;

    /**
     * The array that will be used by curl_setopt_array
     * in WebSpider::executeCurl().
     *
     * @var array
     * @since 0.1
     */
    protected $curlOps = array();

    /**
     * The response that was returned from curl.
     *
     * @var string
     * @since 0.1
     */
    protected $curlResponse = null;

    /**
     * Holds the types of protocols the spider supports.
     *
     * @var array
     * @since 0.1
     */
    protected $supportedProtocols = array('http', 'https');

    /**
     * Contains the URLs that have already been crawled.
     *
     * @var array
     * @since 0.1
     */
    protected $crawled = array();

    /**
     * Whether the spider should crawl external sites.
     *
     * @var boolean
     * @since 0.1
     */
    protected $external = false;

    /**
     * The domain we are currently crawling.
     *
     * @var string
     * @since 0.1
     */
    protected $currentDomain;

    /**
     * The attachments for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $attachments = array();

    /**
     * The responses for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $responses = array();

    /**
     * The URLs of the CSS files for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $cssUrls = array();

    /**
     * The URLs of the JavaScript files for the current session.
     *
     * @var array
     * @since 0.1
     */
    protected $jsUrls = array();
    /**
     * Sets whether the spider should crawl external sites.
     *
     * @param boolean $external
     * @since 0.1
     */
    public function crawlExternal( $external ){
        if ( !is_bool($external) ){
            throw new RuntimeException("Invalid value given to external", 110);
        }
        $this->external = $external;
        return $this;
    }

    /**
     * Sets the default curl options which will
     * generally be used by the spider.
     *
     * @since 0.1
     */
    public function setDefaultOptions(){
        $this->setCurlOption( CURLOPT_USERAGENT, "WwwBot (".self::VERSION.")" )
             ->setCurlOption( CURLOPT_FOLLOWLOCATION, 1 )
             ->setCurlOption( CURLOPT_RETURNTRANSFER, 1 )
             ->setCurlOption( CURLOPT_MAXREDIRS, 10 )
             ->setCurlOption( CURLOPT_AUTOREFERER, 1 )
             ->setCurlOption( CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1 )
             ->setCurlOption( CURLOPT_CONNECTTIMEOUT, 5 );
        return $this;
    }

    /**
     * Sets a curl option by adding it to the array that is used
     * by curl_setopt_array in WebSpider::executeCurl().
     *
     * @param int $name A CURLOPT_* constant.
     * @param mixed $value
     * @since 0.1
     */
    public function setCurlOption( $name, $value ){
        $this->curlOps[$name] = $value;
        return $this;
    }
    /**
     * Executes the curl request. Realigns the current page URL to
     * the effective URL reported by curl.
     *
     * @since 0.1
     */
    public function executeCurl(){
        $ch = curl_init($this->url);
        curl_setopt_array($ch, $this->curlOps);
        $this->crawled[] = $this->url;
        // CURLOPT_FILE is a one-shot option set by saveTo(), so drop it from
        // the option array once it has been applied to this handle.
        if ( isset($this->curlOps[CURLOPT_FILE]) ){
            unset($this->curlOps[CURLOPT_FILE]);
        }
        $this->curlResponse = curl_exec($ch);
        // Since we follow redirects, fetch the effective URL so relative
        // links are resolved against the page we actually ended up on.
        $this->url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        if ( !in_array($this->url, $this->crawled) ){
            $this->crawled[] = $this->url;
        }
        if ( !preg_match("~^http(s)?://(([a-z0-9\-]+\.)+([a-z]{2,6}){1,2})(/.*)?$~iSU", $this->url, $found) ){
            // If this is thrown then we are in a whole world of trouble. :D
            throw new Exception("Invalid URL '".$this->url."' returned from curl request");
        }
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        $this->responses[$httpCode][] = $this->url;
        $this->siteUrl = 'http://'.$found[2].'/';
        return $this;
    }
    /**
     * Sets the file that the response data will be saved to.
     *
     * @param string $filename
     * @param boolean $overwrite
     * @since 0.1
     */
    public function saveTo( $filename, $overwrite = false ){
        if ( file_exists($filename) && $overwrite !== true ){
            throw new RuntimeException("File '".$filename."' already exists and overwrite isn't enabled.", 101);
        }
        $dirname = dirname($filename);
        if ( !is_dir($dirname) ){
            if ( !mkdir($dirname, 0777, true) ){
                throw new RuntimeException("'".$dirname."' directory doesn't exist and unable to create the directory.", 102);
            }
        }
        if ( !is_writable($dirname) ){
            throw new RuntimeException("Unable to write to directory '".$dirname."'", 103);
        }
        if ( file_exists($filename) && !is_writable($filename) ){
            throw new RuntimeException("Unable to write to the file '".$filename."'", 103);
        }
        if ( ($fp = fopen($filename, 'w+')) === false ){
            throw new RuntimeException("Unable to open file '".$filename."' for writing", 104);
        }
        $this->setCurlOption( CURLOPT_FILE, $fp );
        return $this;
    }
    /**
     * Searches the page content for CSS files.
     *
     * @since 0.1
     */
    public function getCss(){
        $css = $this->regexFind('~<link.*href=[\'"](.*\.css)[\'"].*>~isU');
        if ( isset($css[1]) ){
            for ( $i = 0; $i < sizeof($css[1]); $i++ ){
                if ( !in_array($css[1][$i], $this->cssUrls) ){
                    $this->cssUrls[] = $css[1][$i];
                }
            }
        }
        return $this;
    }

    /**
     * Searches the page content for JavaScript files.
     *
     * @since 0.1
     */
    public function getJs(){
        $js = $this->regexFind('~<script.*src=[\'"](.*)[\'"].*>~isU');
        if ( isset($js[1]) ){
            for ( $i = 0; $i < sizeof($js[1]); $i++ ){
                if ( !in_array($js[1][$i], $this->jsUrls) ){
                    $this->jsUrls[] = $js[1][$i];
                }
            }
        }
        return $this;
    }
    /**
     * Scrapes the page for links.
     *
     * @since 0.1
     */
    public function links(){
        preg_match_all("~<a.*href=[\"'](.*)[\"'].*>.*</a>~isU", $this->curlResponse, $matches);
        if ( empty($matches[0]) ){
            return;
        }
        for ( $i = 0; $i < sizeof($matches[0]); $i++ ){
            $link = $matches[1][$i];
            $link = str_replace('/../', '/', $link);
            // Make sure we don't visit mailto or javascript links.
            if ( preg_match('~^(mailto|javascript):~i', $link) ){
                continue;
            }
            // Skip pure in-page anchors and strip fragments from everything else.
            if ( preg_match('~^#~', $link) ){
                continue;
            }
            $link = preg_replace('~#.*$~', '', $link);
            // Resolve relative links to absolute URLs so they can be
            // fed back into setUrl().
            if ( !preg_match('~^[a-z0-9]+://~i', $link) ){
                if ( preg_match('~^/~', $link) ){
                    // Root-relative link.
                    $link = $this->siteUrl.ltrim($link, '/');
                } else {
                    // Relative link, resolved against the current page's directory.
                    $link = dirname($this->url).'/'.$link;
                }
            }
            // Check to see if it's an attachment.
            if ( preg_match("~\.(png|jp(e)?g|gif|mp4|mp3|pdf|docx|doc|ppt|psd)$~", $link) ){
                $this->attachments[] = $link;
                continue;
            }
            // Only follow protocols we support, and only leave the current
            // domain when external crawling is enabled.
            preg_match('~^([a-z0-9]+)://~i', $link, $protocol);
            if ( isset($protocol[1]) && in_array(strtolower($protocol[1]), $this->supportedProtocols) ){
                if ( $this->external === true ||
                     preg_match("~^http(s)?://".preg_quote($this->currentDomain, '~')."~i", $link) ){
                    $this->links[] = $link;
                }
            }
        }
        return $this->links;
    }
    /**
     * Advances to the next uncrawled URL and sets it as the current page.
     *
     * @since 0.1
     */
    public function next(){
        do {
            $url = next($this->links);
            if ( $url !== false ){
                $url = preg_replace('~#.*$~', '', $url);
                if ( !in_array($url, $this->crawled) ){
                    $this->setUrl($url);
                    break;
                }
            }
        } while ( $url !== false );
        return $url;
    }

    /**
     * Sets the URL for the page request.
     *
     * @param string $url
     * @since 0.1
     */
    public function setUrl( $url ){
        if ( !preg_match("~^http(s)?://(([a-z0-9\-]+\.)+([a-z]{2,6}){1,2})(/.*)?$~iSU", $url, $found) ){
            throw new Exception("Invalid URL '".$url."' provided in setUrl()");
        }
        $this->currentDomain = $found[2];
        $this->siteUrl = 'http://'.$found[2].'/';
        $this->url = $url;
        return true;
    }

    /**
     * Runs a regular expression find on the current page content.
     *
     * @param string $pattern
     * @param boolean $single Match once instead of globally.
     * @since 0.1
     */
    public function regexFind( $pattern, $single = false ){
        if ( $single !== true ){
            preg_match_all($pattern, $this->curlResponse, $found);
        } else {
            preg_match($pattern, $this->curlResponse, $found);
        }
        return $found;
    }
    /**
     * Gets the URLs that have already been crawled.
     *
     * @since 0.1
     */
    public function getCrawled(){
        return $this->crawled;
    }

    /**
     * Returns the URL for the current request.
     *
     * @since 0.1
     */
    public function getUrl(){
        return $this->url;
    }

    /**
     * Returns the responses for the current session.
     *
     * @since 0.1
     */
    public function getResponses(){
        return $this->responses;
    }

    /**
     * Returns the attachments for the current session.
     *
     * @since 0.1
     */
    public function getAttachments(){
        return $this->attachments;
    }

    /**
     * Wipes the attachments for the session.
     *
     * @since 0.1
     */
    public function wipeAttachments(){
        $this->attachments = array();
    }

    /**
     * Sets the links that the session will work through.
     *
     * @param array $links
     * @since 0.1
     */
    public function setLinks( array $links ){
        $this->links = $links;
    }

    /**
     * Returns the current domain.
     *
     * @since 0.1
     */
    public function getDomain(){
        return $this->currentDomain;
    }

    /**
     * Resets all the arrays containing session data.
     *
     * @since 0.1
     */
    public function __clone(){
        $this->siteUrl = '';
        $this->url = '';
        $this->links = array();
        $this->crawled = array();
        $this->responses = array();
    }
}
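Example usage: the script below drives the class. It crawls a single site with external crawling disabled, and for every page that yields attachments it clones the spider, works through the attachment queue with saveTo() and executeCurl(), then resumes the main crawl.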
<?php
ini_set('memory_limit', '-1');
require_once 'class.webspider.php';

$webSpider = new WebSpider();
$webSpider->setUrl('http://businesslink.gov.uk');
$webSpider->crawlExternal(false);
$webSpider->setDefaultOptions();

do {
    print "Crawling '".$webSpider->getUrl()."'".PHP_EOL;
    $webSpider->wipeAttachments();
    $webSpider->executeCurl();
    $webSpider->links();
    $attachments = $webSpider->getAttachments();
    if ( !empty($attachments) ){
        $downloadSpider = clone $webSpider;
        $downloadSpider->setLinks($attachments);
        // Reset the URL to the first attachment that is
        // to be downloaded.
        $firstDownload = current($attachments);
        $downloadSpider->setUrl($firstDownload);
        do {
            $basename = basename($downloadSpider->getUrl());
            print "\tDownloading ".$downloadSpider->getUrl()." : ";
            $downloadSpider->saveTo($basename, true);
            $downloadSpider->executeCurl();
            if ( !file_exists($basename) ){
                print "failed".PHP_EOL;
            } else {
                print "success".PHP_EOL;
            }
            // Do stuff with the download here.
        } while ( $downloadSpider->next() );
    }
} while ( $webSpider->next() );

print "Crawled ".sizeof($webSpider->getCrawled())." pages".PHP_EOL;
foreach ( $webSpider->getResponses() as $code => $urls ){
    print sizeof($urls)." out of ".sizeof($webSpider->getCrawled())." were ".$code.PHP_EOL;
}
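The regexFind() helper can also be used inside the crawl loop to scrape data from each fetched page. A minimal sketch, assuming hypothetical <h2 class="post-title"> markup; the pattern and the $posts array are illustrative and not part of the original script:

<?php
// Hypothetical scraping step, run after $webSpider->executeCurl():
// collects post titles from the fetched page into $posts.
$found = $webSpider->regexFind('~<h2 class="post-title">(.*)</h2>~isU');
if ( !empty($found[1]) ){
    foreach ( $found[1] as $title ){
        $posts[] = strip_tags($title);
    }
}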