Crawler written in PHP
<?php
namespace Crawler\Extractors;

class LinkExtractor {
    private static $excludes = array(
        '.png', '.gif', '.jpg', '.jpeg', '.svg', '.mp3', '.mp4', '.avi', '.mpeg', '.ps', '.swf', '.webm', '.ogg', '.pdf',
        '.3gp', '.apk', '.bmp', '.flac', '.gz', '.gzip', '.jpe', '.kml', '.kmz', '.m4a', '.mov', '.mpg', '.odp', '.oga', '.ogv', '.pps', '.pptx', '.qt', '.tar', '.tif', '.wav', '.wmv', '.zip',
        // Removed '.js', '.coffee', '.css', '.less', '.csv', '.xsl', '.xsd', '.xml', '.html', '.php', '.txt', '.atom', '.rss'
        // Implement later?
        '.doc', '.docx', '.ods', '.odt', '.xls', '.xlsx',
    );

    private static $excludedDomains = array(
        '.google.', '.facebook.', '.bing.'
    );

    private static function _getBaseUrl($parsed_url) {
        $scheme = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '//';
        $host = isset($parsed_url['host']) ? $parsed_url['host'] : '';
        $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
        return strtolower("$scheme$host$port");
    }
    public static function extract(\Crawler\Engine\Spider $spider) {
        $parsed = parse_url(strtolower($spider->getUrl()));
        if (!isset($parsed['scheme'])) {
            $parsed['scheme'] = 'http';
        }
        $base = self::_getBaseUrl($parsed);
        $host_length = strlen($parsed['host']);
        preg_match_all("/(href|src)=[\'\"]?([^\'\">]+)/i", $spider->getSource(), $out);
        $linkPattern = '/^(?:[;\/?:@&=+$,]|(?:[^\W_]|[-_.!~*\()\[\] ])|(?:%[\da-fA-F]{2}))*$/';
        $urls = array();
        if (is_array($out) && isset($out[2])) {
            foreach ($out[2] as $key=>$url) {
                if (substr($url, 0, 2) === '#!') {
                    // see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
                    $url = $base.(isset($parsed['path']) ? $parsed['path'] : '/').'?_escaped_fragment_='.substr($url, 2);
                } else if (substr($url, 0, 2) === '//') { // protocol-relative url
                    $url = $parsed['scheme'].':'.$url;
                } else if (substr($url, 0, 1) === '/') { // root-relative url
                    $url = $base.$url;
                } else if (substr($url, 0, 4) !== 'http') {
                    continue;
                }
                if (strlen($url) > 250) continue; // We ignore overly long urls
                $urll = strtolower($url);
                $parsed_url = parse_url($url);
                if ($parsed_url === false) continue; // We ignore invalid urls
                if (preg_match($linkPattern, $urll) !== 1) continue;
                $isExcluded = false;
                foreach (self::$excludes as $exclude) {
                    if (substr($urll, strlen($exclude) * -1) === $exclude) {
                        $isExcluded = true;
                        break;
                    }
                }
                foreach (self::$excludedDomains as $exclude) {
                    if (strpos($urll, $exclude) !== false) {
                        $isExcluded = true;
                        break;
                    }
                }
                if ($isExcluded) continue; // We ignore some extensions and domains
                if (\Crawler\Models\LinkModel::isPresent($url)) continue; // We don't add a link that is already present
                if (\Crawler\RobotsTxtParser::disallowed($url)) continue; // We respect robots.txt
                $urls[$url] = true;
            }
        }
        return array_keys($urls);
    }
}
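
A minimal usage sketch, not part of the original gist: it assumes the Crawler classes are autoloaded by the project's init.php and shows how the extractor is driven from a page that a Spider has already fetched.

<?php
// Hypothetical standalone example; init.php and the sample URL are assumptions.
require_once(__DIR__.'/../init.php');

$spider = new \Crawler\Engine\Spider('http://example.com/');
$spider->exec();

// Returns absolute, de-duplicated urls that are not excluded, already known, or blocked by robots.txt
$links = \Crawler\Extractors\LinkExtractor::extract($spider);
foreach ($links as $link) {
    fwrite(STDOUT, $link."\n");
}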
<?php
namespace Crawler\Models;

class LinkModel {
    public static function __callStatic($name, $arguments) {
        return call_user_func_array(array(self::get(), '_'.$name), $arguments);
    }

    private static $instance = null;

    public static function get() {
        if (is_null(self::$instance)) {
            self::$instance = new self();
        }
        return self::$instance;
    }

    private $presentStmt = null;
    private $detailsStmt = null;
    private $insertStmt = null;
    private $updateStmt = null;
    private $countQueuedStmt = null;
    private $countTotalStmt = null;

    private function __construct() {
        $this->presentStmt = \Crawler\Database::prepare('SELECT `id` FROM `urls` WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->detailsStmt = \Crawler\Database::prepare('SELECT `job_id` AS `job` FROM `urls` WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->insertStmt = \Crawler\Database::prepare('INSERT INTO `urls` (`url`, `is_crawled`, `executed`, `source`, `job_id`) VALUES (:url, :crawled, UTC_TIMESTAMP(), :source, :job)');
        $this->updateStmt = \Crawler\Database::prepare('UPDATE `urls` SET `is_crawled` = :crawled WHERE `url` = :url AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH) LIMIT 1;');
        $this->countQueuedStmt = \Crawler\Database::prepare('SELECT COUNT(id) AS `total` FROM `urls` WHERE (`url` LIKE :domaina OR url LIKE :domainb) AND `source` IS NULL AND `is_crawled` = 0 AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH);');
        $this->countTotalStmt = \Crawler\Database::prepare('SELECT COUNT(id) AS `total` FROM `urls` WHERE (`url` LIKE :domaina OR url LIKE :domainb) AND `source` IS NULL AND `executed` > (UTC_TIMESTAMP() - INTERVAL 1 MONTH);');
    }

    public function _isPresent($url) {
        $this->presentStmt->execute(array('url' => strtolower($url)));
        $result = $this->presentStmt->fetch(\PDO::FETCH_ASSOC);
        return is_array($result);
    }
    /**
     * crawled        : the engine extracted this url
     * redirectedFrom : the url it came from, when it was reached through a redirect
     *
     * In some cases, crawled != fetched: it means $url was a redirection from another url.
     * (See the usage sketch after this class.)
     */
    public function _add($url, $crawled = false, $redirectedFrom = null, $jobId = null) {
        $url = strtolower($url);
        if (is_null($jobId)) {
            $this->detailsStmt->execute(array('url' => $url));
            $result = $this->detailsStmt->fetch(\PDO::FETCH_ASSOC);
            // We check if the url already exists
            if (is_array($result)) {
                $this->_update($url, $crawled);
                // And return the job id if present!
                return (empty($result['job']) ? null : $result['job']);
            }
        }
        // We insert
        $this->insertStmt->execute(array(
            'url' => $url,
            'crawled' => $crawled,
            'source' => $redirectedFrom,
            'job' => $jobId
        ));
        return null;
    }

    public function _update($url, $crawled = false) {
        $url = strtolower($url);
        $this->updateStmt->execute(array(
            'url' => $url,
            'crawled' => $crawled
        ));
    }

    public function _countQueued($domain) {
        $this->countQueuedStmt->execute(array(
            'domaina' => 'http://'.$domain.'%',
            'domainb' => 'https://'.$domain.'%',
        ));
        $result = $this->countQueuedStmt->fetch(\PDO::FETCH_ASSOC);
        if (!is_array($result)) return 0;
        return $result['total'];
    }

    public function _countTotal($domain) {
        $this->countTotalStmt->execute(array(
            'domaina' => 'http://'.$domain.'%',
            'domainb' => 'https://'.$domain.'%',
        ));
        $result = $this->countTotalStmt->fetch(\PDO::FETCH_ASSOC);
        if (!is_array($result)) return 0;
        return $result['total'];
    }
}
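
A minimal usage sketch, not part of the gist, assuming \Crawler\Database is configured by init.php. Every static call such as LinkModel::isPresent() goes through __callStatic, which forwards it to the matching _-prefixed method on the singleton.

<?php
// Hypothetical example; init.php and the sample values are assumptions.
require_once(__DIR__.'/../init.php');

use \Crawler\Models\LinkModel;

$url = 'http://example.com/page';
if (!LinkModel::isPresent($url)) {               // routed to _isPresent()
    LinkModel::add($url, false, null, 42);       // routed to _add(); 42 is a sample beanstalkd job id
}
LinkModel::update($url, true);                   // routed to _update(): mark as crawled
$queued = LinkModel::countQueued('example.com'); // pending urls for this domain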
<?php
namespace Crawler\Engine;

class Spider {
    const MAX_DOWNLOAD_SIZE = 1024*1024*100; // in bytes, = 100 MB
    const LOW_PRIORITY = 1024; // = Default
    const MEDIUM_PRIORITY = 512;
    const HIGH_PRIORITY = 256;

    private $options = array(
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_FORBID_REUSE => true,
        CURLOPT_FRESH_CONNECT => true,
        CURLOPT_HEADER => false,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_ENCODING => ''
    );

    private $curl = null;
    private $url = null;
    private $urlParts = array();
    private $statusCode = null;
    private $source = null;

    public function __construct($url, $referer = null) {
        $this->options[CURLOPT_WRITEFUNCTION] = array($this, 'curl_handler_recv');
        if (!is_null($referer)) {
            $this->options[CURLOPT_REFERER] = $referer;
        }
        $this->curl = curl_init();
        curl_setopt($this->curl, CURLOPT_URL, $url);
        curl_setopt_array($this->curl, $this->options);
        $this->source = '';
    }

    public function curl_handler_recv($curl, $data) {
        $this->source .= $data;
        if (strlen($this->source) > self::MAX_DOWNLOAD_SIZE) return 0; // Abort the transfer if the page is too large
        return strlen($data);
    }
    public function exec() {
        $start = round(microtime(true) * 1000);
        curl_exec($this->getCurl());
        $this->getUrl();
        $this->getStatusCode();
        curl_close($this->getCurl());
        return round(microtime(true) * 1000) - $start;
    }

    public function getCurl() {
        return $this->curl;
    }

    public function getSource() {
        return $this->source;
    }

    public function getUrl() {
        if (is_null($this->url)) {
            $this->url = curl_getinfo($this->getCurl(), CURLINFO_EFFECTIVE_URL);
            $this->urlParts = parse_url($this->url);
        }
        return $this->url;
    }

    public function getUrlParts($key = null) {
        if (!is_null($key) && isset($this->urlParts[$key])) {
            return $this->urlParts[$key];
        }
        return $this->urlParts;
    }

    public function getStatusCode() {
        if (is_null($this->statusCode)) {
            $this->statusCode = curl_getinfo($this->getCurl(), CURLINFO_HTTP_CODE);
        }
        return $this->statusCode;
    }
}
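
A minimal sketch of how the Spider is driven; this example is not part of the gist, and the URL, referer and init.php bootstrap are assumptions. Construct it, call exec() to fetch the page (which also caches the effective URL and status code before closing the handle), then read the results.

<?php
// Hypothetical example; init.php is assumed to autoload the Crawler classes.
require_once(__DIR__.'/../init.php');

$spider = new \Crawler\Engine\Spider('http://example.com/', 'http://referrer.example/');
$duration = $spider->exec(); // elapsed time in milliseconds

fwrite(STDOUT, "Fetched ".$spider->getUrl()." in ".$duration."ms\n");
fwrite(STDOUT, "Status: ".$spider->getStatusCode()."\n");
fwrite(STDOUT, "Body length: ".strlen($spider->getSource())."\n");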
<?php
if (php_sapi_name() !== 'cli') exit(1);
require_once(__DIR__.'/../init.php');

define('WORKER_LIMIT_INSTANCES', 200);
define('CRAWLER_MAX_DEPTH', 10000);
define('CRAWLER_MAX_HIGH_URLS', 100);

use \Pheanstalk\Pheanstalk;
use \Crawler\Models\LinkModel;

$pheanstalk = new Pheanstalk('127.0.0.1');
$reloadedInitialTime = filemtime(__DIR__.'/../reloaded');
fwrite(STDOUT, "Started new instance of script (".$reloadedInitialTime.").\n");
$loopCounter = 0;
while (true) {
    clearstatcache();
    // Kill switch to stop the service
    if (intval(file_get_contents(__DIR__.'/../breakworker')) === 1) exit(1);
    // We check if we need to stop this worker (code update?)
    $autoReloadSystem = filemtime(__DIR__.'/../reloaded');
    if ($reloadedInitialTime !== $autoReloadSystem) {
        fwrite(STDOUT, "New update - Reloading script.\n");
        exit(0);
    }
    usleep(500000); // Give it some slack; half a second
    $loopCounter++;
    if ($loopCounter > WORKER_LIMIT_INSTANCES) break; // We count on Supervisord to reload workers
    // Grab the next job off the queue and reserve it
    $job = $pheanstalk->watch(QUEUE_NAME)
                      ->ignore('default')
                      ->reserve();
    // Remove the job from the queue
    $pheanstalk->delete($job);
    $data = json_decode($job->getData(), true);
    if (is_null($data)) {
        fwrite(STDERR, "[FATAL] Invalid job data: ".$job->getData()."\n");
        continue; // Nothing we can do with this job
    }
    if (!isset($data['retries'])) $data['retries'] = 0;
    if (!isset($data['priority'])) $data['priority'] = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
    if ($data['priority'] == \Crawler\Engine\Spider::LOW_PRIORITY) {
        // Normally, only new links are in low priority
        $data['priority'] = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
    }
    /*
     * The "Spider" fetches the website with a basic cURL request.
     * It also pre-fetches robots.txt on the first request to make sure we respect it.
     * It uses the following cURL options:
     *     CURLOPT_FOLLOWLOCATION => true,
     *     CURLOPT_FORBID_REUSE => true,
     *     CURLOPT_FRESH_CONNECT => true,
     *     CURLOPT_HEADER => false,
     *     CURLOPT_RETURNTRANSFER => true,
     *     CURLOPT_SSL_VERIFYPEER => false,
     *     CURLOPT_MAXREDIRS => 5,
     *     CURLOPT_TIMEOUT => 5,
     *     CURLOPT_ENCODING => ''
     */
    $spider = new \Crawler\Engine\Spider($data['url'], isset($data['referer']) ? $data['referer'] : null);
    $duration = $spider->exec();
    // First, we make sure we are not being black-listed,
    // so we analyze the status code:
    // for 401, 403 and 404, we retry once;
    // for 408, 429 and 503, we retry 3 times, with an increasing wait between requests.
    if (in_array($spider->getStatusCode(), array(401, 403, 404, 408, 429, 503))) {
        $data['retries']++;
        if ((in_array($spider->getStatusCode(), array(401, 403, 404)) && $data['retries'] <= 1) // Only one retry
            ||
            (in_array($spider->getStatusCode(), array(408, 429, 503)) && $data['retries'] <= 3) // 3 retries
        ) {
            $pheanstalk->putInTube(QUEUE_NAME, json_encode($data), $data['priority'], $data['retries'] * 30);
            continue;
        }
        // We get here (instead of the "if" branch) when the status code is in the array
        // but the retry limit has been reached; that means we stop for this url.
        // The next step simply records it in the link database and drops the job.
    }
    // We update the url in the database to indicate it has been crawled
    LinkModel::update($data['url'], true);
    if (strtolower($data['url']) !== strtolower($spider->getUrl())) {
        // We were redirected, so we add a new URL also marked as crawled, with $data['url'] as the origin
        $jobId = LinkModel::add($spider->getUrl(), true, $data['url']);
        // We remove the job of the redirect url because we already had it in the queue
        if (!is_null($jobId)) {
            // We catch the exception in case the url has already been processed
            try {
                $job = $pheanstalk->peek($jobId);
                $pheanstalk->delete($job);
            } catch (\Exception $e) {}
        }
    }
    $domainName = strtolower($spider->getUrlParts('host'));
    // Here is where I index the webpages.
    // I removed that code because it is not interesting in our case,
    // but if you are doing similar work, this is where you would plug in your own processing :)
    // (A hypothetical commented sketch of such a hook follows.)
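    // Sketch only, not the author's removed code: it would pull the <title> out of the
    // fetched HTML and hand it to a storage layer (PageModel is hypothetical).
    /*
    $doc = new \DOMDocument();
    @$doc->loadHTML($spider->getSource()); // suppress warnings on malformed HTML
    $titleNodes = $doc->getElementsByTagName('title');
    $title = ($titleNodes->length > 0) ? trim($titleNodes->item(0)->textContent) : '';
    \Crawler\Models\PageModel::index($data['url'], $title, $spider->getSource());
    */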
    // This code extracts all the links in the page so they can be added to the queue
    $links = \Crawler\Extractors\LinkExtractor::extract($spider);
    // And we add them now:
    $priority = $data['priority'];
    foreach ($links as $link) {
        $parsedDomain = strtolower(parse_url($link, PHP_URL_HOST));
        $jobsData = array(
            'url' => $link,
            'retries' => 0,
            'referer' => $spider->getUrl()
        );
        $jobsData['delay'] = ceil($duration * (rand(1, 10)/10000)); // Delay of 0.1 to 1 times the duration of the request, in seconds
        if ($jobsData['delay'] > 5) $jobsData['delay'] = 5;
        // We increase the wait time by the number of links already queued for this specific domain
        $jobsData['delay'] = $jobsData['delay'] + LinkModel::countQueued($parsedDomain);
        if ($data['priority'] === \Crawler\Engine\Spider::HIGH_PRIORITY) {
            // Allow roughly 5 simultaneous requests on high priority
            $jobsData['delay'] = floor($jobsData['delay'] / 10);
        }
        $iCountCrawledUrls = LinkModel::countTotal($parsedDomain);
        if ($iCountCrawledUrls > CRAWLER_MAX_DEPTH) break; // We stop crawling this domain
        if ($domainName === $parsedDomain) {
            if ($priority === \Crawler\Engine\Spider::HIGH_PRIORITY && $iCountCrawledUrls > CRAWLER_MAX_HIGH_URLS) {
                $priority = \Crawler\Engine\Spider::MEDIUM_PRIORITY;
            }
            $jobsData['priority'] = $priority;
        } else {
            $jobsData['priority'] = \Crawler\Engine\Spider::LOW_PRIORITY;
        }
        $jobId = $pheanstalk->putInTube(QUEUE_NAME, json_encode($jobsData), $jobsData['priority'], $jobsData['delay']);
        // The add method checks if the url is already present in the database,
        // to avoid adding the same url multiple times (and looping when two sites link to each other!)
        LinkModel::add($link, false, null, $jobId);
    }
}
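
Nothing in the gist shows how the very first URL enters the queue, so here is a minimal seeding sketch. It is an assumption rather than part of the original: it pushes one job into the same tube the worker reserves from (QUEUE_NAME, assumed to be defined in init.php), with the fields the worker expects.

<?php
// Hypothetical seed script; init.php and the sample URL are assumptions.
if (php_sapi_name() !== 'cli') exit(1);
require_once(__DIR__.'/../init.php');

use \Pheanstalk\Pheanstalk;

$pheanstalk = new Pheanstalk('127.0.0.1');
$job = array(
    'url' => 'http://example.com/',
    'retries' => 0,
    'priority' => \Crawler\Engine\Spider::HIGH_PRIORITY
);
$jobId = $pheanstalk->putInTube(QUEUE_NAME, json_encode($job), $job['priority'], 0);
// Record the seed url so the extractor does not queue it a second time
\Crawler\Models\LinkModel::add($job['url'], false, null, $jobId);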