<?php

class Crawler
{
    protected $_url;
    protected $_depth;
    protected $_seen = array();
    protected $_host;
    protected $_sitemap;
    protected $_curl;
    protected $_limitUrls;
    protected $_pool;
    protected $newLinks = array(); // links collected from pool results in crawlPage()

    public function __construct($url, $depth = 3, $limitUrls = 100)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        $parsed = parse_url($url);
        $this->_host = $parsed['host'];
        $this->_curl = EpiCurl::getInstance();
        $this->_limitUrls = $limitUrls;
        $this->_pool = new Pool(10);
    }//__construct()
    public function run()
    {
        // check for a sitemap first; if there is none, crawl the site
        if($this->hasSitemap()) {
            $this->getContentFromSitemap();
        } else {
            $this->crawlPage($this->_url, $this->_depth);
        }
        return array_keys($this->_seen);
    }//run()
    public function crawlPage($url, $depth)
    {
        // stop once the URL limit has been reached
        if(count($this->_seen) >= $this->_limitUrls)
            return;
        if(!$this->isValid($url, $depth)) {
            return;
        }
        // mark the URL as seen
        $this->_seen[$url] = true;
        // fetch the content and the response code
        $req = $this->executeCurl($url);
        if($req->code == 200) {
            $links = $this->processLinks($req->data, $url);
            foreach($links as $link) {
                $task = new AsyncWebRequest($link);
                $this->_pool->submit($task);
            }
            $this->newLinks = array();
            // shutdown will wait for the current queue to be completed
            $this->_pool->shutdown();
            // garbage collection check / read results
            $this->_pool->collect(function($checkingTask) {
                // stop once the URL limit has been reached
                if(count($this->_seen) >= $this->_limitUrls)
                    return $checkingTask->isGarbage();
                $this->_seen[$checkingTask->url] = true;
                // keep the merged result; array_merge() does not modify its arguments
                $this->newLinks = array_merge($this->newLinks, $this->processLinks($checkingTask->response, $checkingTask->url));
                return $checkingTask->isGarbage();
            });
            foreach($this->newLinks as $link) {
                $task = new AsyncWebRequest($link);
                $this->_pool->submit($task);
            }
            // shutdown will wait for the current queue to be completed
            $this->_pool->shutdown();
            // garbage collection check / read results
            $this->_pool->collect(function($checkingTask) {
                // stop once the URL limit has been reached
                if(count($this->_seen) >= $this->_limitUrls)
                    return $checkingTask->isGarbage();
                $this->_seen[$checkingTask->url] = true;
                return $checkingTask->isGarbage();
            });
        }
    }//crawlPage()
    // ... other methods (hasSitemap(), getContentFromSitemap(), isValid(), executeCurl()) are elided in the gist ...
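    // NOTE: the following is only a hypothetical sketch of two of the elided helpers,
    // assuming EpiCurl::addURL() returns an object exposing ->code and ->data as used
    // in crawlPage() above; the gist does not show the original implementations.
    protected function executeCurl($url)
    {
        // queue the request with EpiCurl and return its response object
        return $this->_curl->addURL($url);
    }//executeCurl() (sketch)

    protected function isValid($url, $depth)
    {
        // assumed checks: depth not exhausted, same host, and not crawled before
        return $depth > 0
            && parse_url($url, PHP_URL_HOST) === $this->_host
            && !isset($this->_seen[$url]);
    }//isValid() (sketch)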
    /**
     * processLinks parses the content of a page, searching for all the "a" tags
     * and normalizing the link inside each one into an absolute URL.
     * @param  string $content the content of a page
     * @param  string $url     the URL the content belongs to
     * @return array           the absolute links found in the content
     */
    protected function processLinks($content, $url)
    {
        $crawler = new DomCrawler($content);
        $anchors = $crawler->filter('a');
        $links = [];
        foreach($anchors as $element) {
            $href = $element->getAttribute('href');
            if (0 !== strpos($href, 'http')) {
                // relative link: rebuild it against the page URL
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            $links[] = $href;
            // $this->crawlPage($href, $depth - 1); // old recursive call, now handled by the pool
        }
        return $links;
    }//processLinks()
}//Crawler

/**
 * Pool task that fetches a single URL; the response is read back in Pool::collect().
 */
class AsyncWebRequest extends Collectable
{
    public $response = null;
    public $url = null;

    public function __construct($url)
    {
        $this->url = $url;
    }//__construct()

    public function run()
    {
        $curl = EpiCurl::getInstance();
        $this->response = file_get_contents($this->url); //$curl->addURL($this->url);
    }//run()
}//AsyncWebRequest
/**
 * This goes in the "controller"
 */
$startURL = $_POST['url'];
$urls = [];
$crawler = new Crawler($startURL, 3);
$urls = $crawler->run();
// var_dump($urls);die();
// Some pages return their results in ~8 seconds, and the task manager shows
// the number of threads being used by the process
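// Note (added): Pool and Collectable come from the pthreads extension, which requires a
// thread-safe (ZTS) build of PHP and is intended for CLI use; EpiCurl is a separate cURL
// wrapper library, and DomCrawler appears to be Symfony's DomCrawler component.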