Skip to content

Instantly share code, notes, and snippets.

@alfchee
Created October 9, 2015 03:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alfchee/1aebdc435a9c70b5b56c to your computer and use it in GitHub Desktop.
Save alfchee/1aebdc435a9c70b5b56c to your computer and use it in GitHub Desktop.
<?php
class Crawler
{
protected $_url;
protected $_depth;
protected $_seen = array();
protected $_host;
protected $_sitemap;
protected $_curl;
protected $_limitUrls;
protected $_pool;
/**
 * Builds a crawler for the given start URL.
 *
 * @param string  $url       starting URL to crawl
 * @param integer $depth     maximum link depth to follow (default 3)
 * @param integer $limitUrls maximum number of URLs to collect (default 100)
 */
public function __construct($url, $depth = 3, $limitUrls = 100)
{
    $this->_url       = $url;
    $this->_depth     = $depth;
    $this->_limitUrls = $limitUrls;
    // host of the start URL — presumably used for same-site checks elsewhere
    $parts = parse_url($url);
    $this->_host = $parts['host'];
    // shared cURL wrapper and a pool of 10 worker threads
    $this->_curl = EpiCurl::getInstance();
    $this->_pool = new Pool(10);
}//__construct()
/**
 * Entry point: collects URLs via the sitemap when one exists, otherwise
 * by crawling from the start URL, and returns the list of URLs seen.
 *
 * @return array the URLs collected during the run
 */
public function run()
{
    if (!$this->hasSitemap()) {
        // no sitemap available — crawl the site starting from the configured URL
        $this->crawlPage($this->_url, $this->_depth);
    } else {
        $this->getContentFromSitemap();
    }
    return array_keys($this->_seen);
}//run()
/**
 * Crawls a single page: records it as seen, fetches it, extracts its links,
 * and fetches those links concurrently through the worker pool. Runs two
 * rounds: the page's own links, then the links discovered in their responses.
 *
 * @param string  $url   the page URL to crawl
 * @param integer $depth remaining depth budget (validated by ::isValid)
 */
public function crawlPage($url, $depth)
{
    // stop once the configured URL limit has been reached
    if (count($this->_seen) >= $this->_limitUrls) {
        return;
    }
    if (!$this->isValid($url, $depth)) {
        return;
    }
    // mark the URL as seen before fetching so it is not crawled again
    $this->_seen[$url] = true;
    // get content and return code
    $req = $this->executeCurl($url);
    if ($req->code != 200) {
        return;
    }
    // first round: submit every link found on this page to the pool
    // (processLinks takes only ($content, $url); the old $depth arg was stale)
    $links = $this->processLinks($req->data, $url);
    foreach ($links as $link) {
        $this->_pool->submit(new AsyncWebRequest($link));
    }
    $this->newLinks = array();
    // shutdown waits for the current queue to be completed
    $this->_pool->shutdown();
    // garbage collection check / read results; harvest second-level links
    $this->_pool->collect(function ($checkingTask) {
        if (count($this->_seen) >= $this->_limitUrls) {
            return $checkingTask->isGarbage();
        }
        $this->_seen[$checkingTask->url] = true;
        // BUG FIX: array_merge() returns a new array and does not modify its
        // arguments; the original discarded the result, so second-round links
        // were never accumulated and the loop below always saw an empty list.
        $this->newLinks = array_merge(
            $this->newLinks,
            $this->processLinks($checkingTask->response, $checkingTask->url)
        );
        return $checkingTask->isGarbage();
    });
    // second round: submit the newly discovered links
    foreach ($this->newLinks as $link) {
        $this->_pool->submit(new AsyncWebRequest($link));
    }
    // shutdown will wait for current queue to be completed
    $this->_pool->shutdown();
    // garbage collection check / read results
    $this->_pool->collect(function ($checkingTask) {
        if (count($this->_seen) >= $this->_limitUrls) {
            return $checkingTask->isGarbage();
        }
        $this->_seen[$checkingTask->url] = true;
        return $checkingTask->isGarbage();
    });
}//crawlPage()
......
/**
 * processLinks parses the content of a page, finds all "a" tags, and
 * normalizes each href into an absolute URL: hrefs that do not start with
 * "http" are treated as paths and resolved against $url's scheme,
 * credentials, host, and port.
 *
 * @param string $content the HTML content of a page
 * @param string $url the URL the content belongs to
 * @return array list of absolute URLs found in the page
 */
protected function processLinks($content, $url)
{
$crawler = new DomCrawler($content);
$anchors = $crawler->filter('a');
$links = [];
foreach($anchors as $element) {
$href = $element->getAttribute('href');
// relative link (no "http" prefix): rebuild it against the page URL
// NOTE(review): this also catches mailto:, #fragment, etc. — they get
// rewritten as paths; confirm that is acceptable for this crawler
if (0 !== strpos($href, 'http')) {
$path = '/' . ltrim($href, '/');
if (extension_loaded('http')) {
// pecl_http available: let it assemble the absolute URL
$href = http_build_url($url, array('path' => $path));
} else {
// manual fallback: scheme://[user:pass@]host[:port]/path
$parts = parse_url($url);
$href = $parts['scheme'] . '://';
if (isset($parts['user']) && isset($parts['pass'])) {
$href .= $parts['user'] . ':' . $parts['pass'] . '@';
}
$href .= $parts['host'];
if (isset($parts['port'])) {
$href .= ':' . $parts['port'];
}
$href .= $path;
}
}
$links[] = $href;
// $this->crawlPage($href,$depth - 1);
}
return $links;
}//processLinks()
}//Crawler
/**
 * Pool task that fetches one URL and stores the raw response body.
 * Collected by Crawler::crawlPage via Pool::collect().
 */
class AsyncWebRequest extends Collectable
{
    // raw response body, or false when the fetch fails (file_get_contents)
    public $response = null;
    // the URL this task fetches
    public $url = null;

    /**
     * @param string $url the URL to fetch when the task runs
     */
    public function __construct($url)
    {
        $this->url = $url;
    }//__construct()

    /**
     * Executed by a pool worker: downloads the URL into $this->response.
     * Removed the unused $curl local (EpiCurl was fetched but never used;
     * the actual request goes through file_get_contents).
     */
    public function run()
    {
        $this->response = file_get_contents($this->url);
    }//run()
}//AsyncWebRequest
/**
 * This goes in the "controller"
 */
// NOTE(review): $_POST['url'] is untrusted input that the crawler will fetch
// (SSRF risk). FILTER_VALIDATE_URL is a minimal guard — consider an
// allow-list of hosts/schemes as well. Also avoids the undefined-index
// notice the original raised when 'url' was missing from the request.
$startURL = isset($_POST['url']) ? filter_var($_POST['url'], FILTER_VALIDATE_URL) : false;
if ($startURL === false) {
    http_response_code(400);
    exit('Invalid or missing URL');
}
$urls = [];
$crawler = new Crawler($startURL, 3);
$urls = $crawler->run();
// var_dump($urls);die();
// Some pages give the result in ~8 seconds and the task manager shows
// the quantity of threads being used by the process
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment