<?php

class Crawler
{
    protected $_url;
    protected $_depth;
    protected $_seen = array();
    protected $_host;
    protected $_sitemap;
    protected $_curl;
    protected $_limitUrls;
    protected $_pool;
    protected $newLinks = array(); // links collected from pool results in crawlPage()

    public function __construct($url, $depth = 3, $limitUrls = 100)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        $parsed = parse_url($url);
        $this->_host = $parsed['host'];
        $this->_curl = EpiCurl::getInstance();
        $this->_limitUrls = $limitUrls;
        $this->_pool = new Pool(10);
    }//__construct()
    public function run()
    {
        // check for a sitemap first; if there is none, crawl the site
        if($this->hasSitemap()) {
            $this->getContentFromSitemap();
        } else {
            $this->crawlPage($this->_url, $this->_depth);
        }
        return array_keys($this->_seen);
    }//run()
    public function crawlPage($url, $depth)
    {
        // stop once the URL limit has been reached
        if(count($this->_seen) >= $this->_limitUrls)
            return;
        if(!$this->isValid($url, $depth)) {
            return;
        }
        // mark the URL as seen
        $this->_seen[$url] = true;
        // fetch the content and the response code
        $req = $this->executeCurl($url);
        if($req->code == 200) {
            $links = $this->processLinks($req->data, $url);
            foreach($links as $link) {
                $task = new AsyncWebRequest($link);
                $this->_pool->submit($task);
            }
            $this->newLinks = array();
            // shutdown will wait for the current queue to be completed
            $this->_pool->shutdown();
            // garbage collection check / read results
            $this->_pool->collect(function($checkingTask) {
                // stop once the URL limit has been reached
                if(count($this->_seen) >= $this->_limitUrls)
                    return $checkingTask->isGarbage();
                $this->_seen[$checkingTask->url] = true;
                // keep the merged result; array_merge() does not modify its arguments
                $this->newLinks = array_merge($this->newLinks, $this->processLinks($checkingTask->response, $checkingTask->url));
                return $checkingTask->isGarbage();
            });
            foreach($this->newLinks as $link) {
                $task = new AsyncWebRequest($link);
                $this->_pool->submit($task);
            }
            // shutdown will wait for the current queue to be completed
            $this->_pool->shutdown();
            // garbage collection check / read results
            $this->_pool->collect(function($checkingTask) {
                // stop once the URL limit has been reached
                if(count($this->_seen) >= $this->_limitUrls)
                    return $checkingTask->isGarbage();
                $this->_seen[$checkingTask->url] = true;
                return $checkingTask->isGarbage();
            });
        }
    }//crawlPage()
    // ... other methods (hasSitemap(), getContentFromSitemap(), isValid(), executeCurl()) are elided in the gist ...
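    // NOTE: the following is only a hypothetical sketch of two of the elided helpers,
    // assuming EpiCurl::addURL() returns an object exposing ->code and ->data as used
    // in crawlPage() above; the gist does not show the original implementations.
    protected function executeCurl($url)
    {
        // queue the request with EpiCurl and return its response object
        return $this->_curl->addURL($url);
    }//executeCurl() (sketch)

    protected function isValid($url, $depth)
    {
        // assumed checks: depth not exhausted, same host, and not crawled before
        return $depth > 0
            && parse_url($url, PHP_URL_HOST) === $this->_host
            && !isset($this->_seen[$url]);
    }//isValid() (sketch)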
    /**
     * processLinks parses the content of a page, searching for all the "a" tags
     * and normalizing the link inside each one into an absolute URL.
     * @param  string $content the content of a page
     * @param  string $url     the URL the content belongs to
     * @return array           the absolute links found in the content
     */
    protected function processLinks($content, $url)
    {
        $crawler = new DomCrawler($content);
        $anchors = $crawler->filter('a');
        $links = [];
        foreach($anchors as $element) {
            $href = $element->getAttribute('href');
            if (0 !== strpos($href, 'http')) {
                // relative link: rebuild it against the page URL
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            $links[] = $href;
            // $this->crawlPage($href, $depth - 1); // old recursive call, now handled by the pool
        }
        return $links;
    }//processLinks()
}//Crawler

/**
 * Pool task that fetches a single URL; the response is read back in Pool::collect().
 */
class AsyncWebRequest extends Collectable
{
    public $response = null;
    public $url = null;

    public function __construct($url)
    {
        $this->url = $url;
    }//__construct()

    public function run()
    {
        $curl = EpiCurl::getInstance();
        $this->response = file_get_contents($this->url); //$curl->addURL($this->url);
    }//run()
}//AsyncWebRequest
/**
 * This goes in the "controller"
 */
$startURL = $_POST['url'];
$urls = [];
$crawler = new Crawler($startURL, 3);
$urls = $crawler->run();
// var_dump($urls);die();
// Some pages return their results in ~8 seconds, and the task manager shows
// the number of threads being used by the process
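// Note (added): Pool and Collectable come from the pthreads extension, which requires a
// thread-safe (ZTS) build of PHP and is intended for CLI use; EpiCurl is a separate cURL
// wrapper library, and DomCrawler appears to be Symfony's DomCrawler component.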