kicken/Crawler.php

## Crawler.php
<?php

/**
 * Simple crawler: starting from one URL it fetches every queued page,
 * extracts the page's links with LinkExtractor and pushes them onto a
 * LinkQueue, repeating until the queue is empty.
 */
class Crawler {
    /** @var LinkQueue Pending URLs; a fresh queue is created by every crawl() call. */
    private $linkQueue;

    /** @var LinkExtractor Turns a parsed page into a list of URLs to follow. */
    private $linkExtractor;

    public function __construct(){
        $this->linkExtractor = new LinkExtractor;
    }

    /**
     * Crawl everything reachable from $startingUrl.
     *
     * Progress is echoed to standard output; nothing is returned.
     *
     * @param string $startingUrl Absolute URL of the first page to fetch.
     */
    public function crawl($startingUrl){
        $this->linkQueue = new LinkQueue();
        $this->linkQueue->enqueue($startingUrl);
        $this->run();
    }

    /**
     * Drain the queue: fetch each URL in FIFO order and enqueue the links it
     * contains. Pages that cannot be loaded are skipped.
     */
    private function run(){
        while (!$this->linkQueue->isEmpty()){
            $url = $this->linkQueue->dequeue();
            echo 'Crawling URL ', $url, PHP_EOL;

            $dom = $this->loadUrl($url);
            if ($dom){
                foreach ($this->linkExtractor->getLinks($dom, $url) as $newUrl){
                    $this->linkQueue->enqueue($newUrl);
                }
            }
        }
    }

    /**
     * Fetch $url and parse the response body as HTML.
     *
     * @param string $url URL to download.
     * @return DOMDocument|null Parsed document, or null when the download
     *                          fails, the body is empty, or it cannot be parsed.
     */
    private function loadUrl($url){
        $options = array(
            'http'=>array('method'=>"GET", 'header'=>"User-Agent: imagimediaBot/0.1\n")
        );
        $context = stream_context_create($options);

        $html = file_get_contents($url, false, $context);
        // An empty body has no links, and DOMDocument::loadHTML('') throws a
        // ValueError on PHP 8, which would abort the whole crawl.
        if ($html === false || trim($html) === ''){
            return null;
        }

        // Collect libxml's complaints about sloppy markup internally instead of
        // emitting warnings, then put the caller's setting back as it was.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DomDocument();
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        return $loaded ? $dom : null;
    }
}

## LinkExtractor.php
<?php
/**
 * Extracts the followable links from a parsed HTML page and returns them as
 * absolute URLs. Only http/https links that stay on the page's own host and
 * are not marked rel="nofollow" are returned.
 */
class LinkExtractor {
    /**
     * Collect every followable <a href> in $dom, resolved against $baseUrl.
     *
     * @param DOMDocument $dom     Parsed page.
     * @param string      $baseUrl Absolute URL the page was loaded from.
     * @return string[] Absolute URLs in document order (fragments removed);
     *                  empty when $baseUrl has no scheme or host.
     */
    public function getLinks(DOMDocument $dom, $baseUrl) {
        $base = parse_url($baseUrl);
        if (!is_array($base) || empty($base['scheme']) || empty($base['host'])){
            // Without a scheme and host there is nothing to resolve against.
            return [];
        }

        $baseScheme = $base['scheme'];
        $baseHost = $base['host'];
        // Keep an explicit port so relative links point back at the same server.
        $baseAuthority = isset($base['port']) ? $baseHost . ':' . $base['port'] : $baseHost;
        // parse_url() omits 'path' for URLs such as "http://example.com".
        $basePath = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

        $linkList = [];
        foreach ($dom->getElementsByTagName("a") as $anchor){
            if ($this->isValid($anchor, $baseHost)){
                $href = trim($anchor->getAttribute('href'));
                $linkList[] = $this->resolveUrl($href, $baseAuthority, $baseScheme, $basePath);
            }
        }

        return $linkList;
    }

    /**
     * Decide whether an anchor should be followed.
     *
     * @param DOMElement $anchor The <a> element.
     * @param string     $host   Host name of the page being crawled.
     * @return bool False for anchors without a usable href, nofollow links,
     *              pure fragments, non-http(s) schemes and other hosts.
     */
    private function isValid($anchor, $host){
        if (!$anchor->hasAttribute('href')){
            return false;
        }

        // rel may hold other tokens (noopener, external, ...): only nofollow
        // rejects the link, and the href checks below must still run.
        if ($anchor->hasAttribute('rel') && stripos($anchor->getAttribute('rel'), 'nofollow') !== false){
            return false;
        }

        $href = trim($anchor->getAttribute('href'));
        if ($href === '' || $href[0] === '#'){
            return false;
        }

        // Protocol-relative link ("//host/path"): it names a host, so it has
        // to pass the same-host check as well.
        if (substr($href, 0, 2) === '//'){
            $hrefHost = parse_url($href, PHP_URL_HOST);
            return is_string($hrefHost) && strcasecmp($hrefHost, $host) === 0;
        }

        // Anything that starts with "scheme:" is absolute; this also catches
        // mailto:, javascript:, tel: and friends, which have no "//".
        if (preg_match('@^([a-z][a-z0-9+.\-]*):@i', $href, $matches)){
            $scheme = strtolower($matches[1]);
            if (!in_array($scheme, ['http','https'], true)){
                return false;
            }

            $hrefHost = parse_url($href, PHP_URL_HOST);
            if (!is_string($hrefHost) || strcasecmp($hrefHost, $host) !== 0){
                return false;
            }
        }

        return true;
    }

    /**
     * Turn an href into an absolute URL.
     *
     * @param string $target Href value (already trimmed and validated).
     * @param string $host   Host of the base URL, including ":port" if any.
     * @param string $scheme Scheme of the base URL.
     * @param string $path   Path of the base URL; always starts with "/".
     * @return string Absolute URL without fragment.
     */
    private function resolveUrl($target, $host, $scheme, $path) {
        // A fragment never changes which document gets fetched.
        $hashPos = strpos($target, '#');
        if ($hashPos !== false){
            $target = substr($target, 0, $hashPos);
        }

        if (preg_match('@^https?://@i', $target)){
            return $target;
        }

        $origin = $scheme . "://" . $host;

        if ($target === ''){
            return $origin . $path;
        }
        if (substr($target, 0, 2) === '//'){
            return $scheme . ":" . $target;
        }
        if ($target[0] === '/'){
            return $origin . $target;
        }
        if ($target[0] === '?'){
            // Query-only reference: same document, different query string.
            return $origin . $path . $target;
        }

        // Relative path: resolve against the directory of the base document.
        $lastSlash = strrpos($path, '/');
        $directory = ($lastSlash === false) ? '/' : substr($path, 0, $lastSlash + 1);

        // Keep the query string out of the dot-segment handling.
        $suffix = '';
        $queryPos = strpos($target, '?');
        if ($queryPos !== false){
            $suffix = substr($target, $queryPos);
            $target = substr($target, 0, $queryPos);
        }

        return $origin . $this->removeDotSegments($directory . $target) . $suffix;
    }

    /**
     * Collapse "." and ".." segments of an absolute path; ".." never climbs
     * above the root.
     *
     * @param string $path Path starting with "/".
     * @return string Path with the dot segments resolved.
     */
    private function removeDotSegments($path) {
        $segments = explode('/', $path);
        $last = count($segments) - 1;
        $out = [];

        foreach ($segments as $index => $segment){
            if ($segment === '.' || $segment === '..'){
                // $out[0] is the empty segment in front of the leading slash.
                if ($segment === '..' && count($out) > 1){
                    array_pop($out);
                }
                if ($index === $last){
                    // A trailing dot segment refers to a directory: keep the slash.
                    $out[] = '';
                }
                continue;
            }
            $out[] = $segment;
        }

        return implode('/', $out);
    }
}

## LinkQueue.php
<?php

/**
 * FIFO queue of URLs that ignores any URL it has already accepted. Two URLs
 * count as the same when they differ only in http/https or a leading "www.".
 */
class LinkQueue extends \SplQueue {
    /** @var array Normalized URLs accepted so far, used as a set (url => true). */
    private $seen = [];

    /**
     * Add $url to the queue unless an equivalent URL was enqueued before.
     * Every accepted URL is echoed to standard output.
     *
     * @param string $url Absolute URL.
     */
    #[\ReturnTypeWillChange]
    public function enqueue($url){
        $normalized = $this->normalizeUrl($url);
        if (!isset($this->seen[$normalized])){
            parent::enqueue($url);
            $this->seen[$normalized] = true;
            echo 'Enqueued URL: ', $url, PHP_EOL;
        }
    }

    /**
     * Reduce a URL to the key used for duplicate detection.
     *
     * @param string $url Absolute URL.
     * @return string $url without its http(s):// prefix and leading "www.".
     */
    private function normalizeUrl($url){
        $url = preg_replace('@^https?://@i', '', $url);
        // The dot is escaped so that only a literal "www." prefix is removed;
        // host names are case-insensitive, hence the i flag.
        $url = preg_replace('@^www\.@i', '', $url);
        return $url;
    }
}

## main.php
<?php

// Load the classes relative to this script so the entry point works no matter
// what the include_path or the current working directory is.
require_once __DIR__ . '/Crawler.php';
require_once __DIR__ . '/LinkExtractor.php';
require_once __DIR__ . '/LinkQueue.php';

// Crawl the site starting from its landing page; progress is echoed as it goes.
$crawler = new Crawler();
$crawler->crawl('https://aoeex.com/phile/');
	<?php

	/**
	 * Simple crawler: starting from one URL it fetches every queued page,
	 * extracts the page's links with LinkExtractor and pushes them onto a
	 * LinkQueue, repeating until the queue is empty.
	 */
	class Crawler {
	    /** @var LinkQueue Pending URLs; a fresh queue is created by every crawl() call. */
	    private $linkQueue;

	    /** @var LinkExtractor Turns a parsed page into a list of URLs to follow. */
	    private $linkExtractor;

	    public function __construct(){
	        $this->linkExtractor = new LinkExtractor;
	    }

	    /**
	     * Crawl everything reachable from $startingUrl.
	     *
	     * Progress is echoed to standard output; nothing is returned.
	     *
	     * @param string $startingUrl Absolute URL of the first page to fetch.
	     */
	    public function crawl($startingUrl){
	        $this->linkQueue = new LinkQueue();
	        $this->linkQueue->enqueue($startingUrl);
	        $this->run();
	    }

	    /**
	     * Drain the queue: fetch each URL in FIFO order and enqueue the links it
	     * contains. Pages that cannot be loaded are skipped.
	     */
	    private function run(){
	        while (!$this->linkQueue->isEmpty()){
	            $url = $this->linkQueue->dequeue();
	            echo 'Crawling URL ', $url, PHP_EOL;

	            $dom = $this->loadUrl($url);
	            if ($dom){
	                foreach ($this->linkExtractor->getLinks($dom, $url) as $newUrl){
	                    $this->linkQueue->enqueue($newUrl);
	                }
	            }
	        }
	    }

	    /**
	     * Fetch $url and parse the response body as HTML.
	     *
	     * @param string $url URL to download.
	     * @return DOMDocument|null Parsed document, or null when the download
	     *                          fails, the body is empty, or it cannot be parsed.
	     */
	    private function loadUrl($url){
	        $options = array(
	            'http'=>array('method'=>"GET", 'header'=>"User-Agent: imagimediaBot/0.1\n")
	        );
	        $context = stream_context_create($options);

	        $html = file_get_contents($url, false, $context);
	        // An empty body has no links, and DOMDocument::loadHTML('') throws a
	        // ValueError on PHP 8, which would abort the whole crawl.
	        if ($html === false || trim($html) === ''){
	            return null;
	        }

	        // Collect libxml's complaints about sloppy markup internally instead of
	        // emitting warnings, then put the caller's setting back as it was.
	        $previousSetting = libxml_use_internal_errors(true);
	        $dom = new DomDocument();
	        $loaded = $dom->loadHTML($html);
	        libxml_clear_errors();
	        libxml_use_internal_errors($previousSetting);

	        return $loaded ? $dom : null;
	    }
	}
	<?php
	/**
	 * Extracts the followable links from a parsed HTML page and returns them as
	 * absolute URLs. Only http/https links that stay on the page's own host and
	 * are not marked rel="nofollow" are returned.
	 */
	class LinkExtractor {
	    /**
	     * Collect every followable <a href> in $dom, resolved against $baseUrl.
	     *
	     * @param DOMDocument $dom     Parsed page.
	     * @param string      $baseUrl Absolute URL the page was loaded from.
	     * @return string[] Absolute URLs in document order (fragments removed);
	     *                  empty when $baseUrl has no scheme or host.
	     */
	    public function getLinks(DOMDocument $dom, $baseUrl) {
	        $base = parse_url($baseUrl);
	        if (!is_array($base) || empty($base['scheme']) || empty($base['host'])){
	            // Without a scheme and host there is nothing to resolve against.
	            return [];
	        }

	        $baseScheme = $base['scheme'];
	        $baseHost = $base['host'];
	        // Keep an explicit port so relative links point back at the same server.
	        $baseAuthority = isset($base['port']) ? $baseHost . ':' . $base['port'] : $baseHost;
	        // parse_url() omits 'path' for URLs such as "http://example.com".
	        $basePath = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

	        $linkList = [];
	        foreach ($dom->getElementsByTagName("a") as $anchor){
	            if ($this->isValid($anchor, $baseHost)){
	                $href = trim($anchor->getAttribute('href'));
	                $linkList[] = $this->resolveUrl($href, $baseAuthority, $baseScheme, $basePath);
	            }
	        }

	        return $linkList;
	    }

	    /**
	     * Decide whether an anchor should be followed.
	     *
	     * @param DOMElement $anchor The <a> element.
	     * @param string     $host   Host name of the page being crawled.
	     * @return bool False for anchors without a usable href, nofollow links,
	     *              pure fragments, non-http(s) schemes and other hosts.
	     */
	    private function isValid($anchor, $host){
	        if (!$anchor->hasAttribute('href')){
	            return false;
	        }

	        // rel may hold other tokens (noopener, external, ...): only nofollow
	        // rejects the link, and the href checks below must still run.
	        if ($anchor->hasAttribute('rel') && stripos($anchor->getAttribute('rel'), 'nofollow') !== false){
	            return false;
	        }

	        $href = trim($anchor->getAttribute('href'));
	        if ($href === '' || $href[0] === '#'){
	            return false;
	        }

	        // Protocol-relative link ("//host/path"): it names a host, so it has
	        // to pass the same-host check as well.
	        if (substr($href, 0, 2) === '//'){
	            $hrefHost = parse_url($href, PHP_URL_HOST);
	            return is_string($hrefHost) && strcasecmp($hrefHost, $host) === 0;
	        }

	        // Anything that starts with "scheme:" is absolute; this also catches
	        // mailto:, javascript:, tel: and friends, which have no "//".
	        if (preg_match('@^([a-z][a-z0-9+.\-]*):@i', $href, $matches)){
	            $scheme = strtolower($matches[1]);
	            if (!in_array($scheme, ['http','https'], true)){
	                return false;
	            }

	            $hrefHost = parse_url($href, PHP_URL_HOST);
	            if (!is_string($hrefHost) || strcasecmp($hrefHost, $host) !== 0){
	                return false;
	            }
	        }

	        return true;
	    }

	    /**
	     * Turn an href into an absolute URL.
	     *
	     * @param string $target Href value (already trimmed and validated).
	     * @param string $host   Host of the base URL, including ":port" if any.
	     * @param string $scheme Scheme of the base URL.
	     * @param string $path   Path of the base URL; always starts with "/".
	     * @return string Absolute URL without fragment.
	     */
	    private function resolveUrl($target, $host, $scheme, $path) {
	        // A fragment never changes which document gets fetched.
	        $hashPos = strpos($target, '#');
	        if ($hashPos !== false){
	            $target = substr($target, 0, $hashPos);
	        }

	        if (preg_match('@^https?://@i', $target)){
	            return $target;
	        }

	        $origin = $scheme . "://" . $host;

	        if ($target === ''){
	            return $origin . $path;
	        }
	        if (substr($target, 0, 2) === '//'){
	            return $scheme . ":" . $target;
	        }
	        if ($target[0] === '/'){
	            return $origin . $target;
	        }
	        if ($target[0] === '?'){
	            // Query-only reference: same document, different query string.
	            return $origin . $path . $target;
	        }

	        // Relative path: resolve against the directory of the base document.
	        $lastSlash = strrpos($path, '/');
	        $directory = ($lastSlash === false) ? '/' : substr($path, 0, $lastSlash + 1);

	        // Keep the query string out of the dot-segment handling.
	        $suffix = '';
	        $queryPos = strpos($target, '?');
	        if ($queryPos !== false){
	            $suffix = substr($target, $queryPos);
	            $target = substr($target, 0, $queryPos);
	        }

	        return $origin . $this->removeDotSegments($directory . $target) . $suffix;
	    }

	    /**
	     * Collapse "." and ".." segments of an absolute path; ".." never climbs
	     * above the root.
	     *
	     * @param string $path Path starting with "/".
	     * @return string Path with the dot segments resolved.
	     */
	    private function removeDotSegments($path) {
	        $segments = explode('/', $path);
	        $last = count($segments) - 1;
	        $out = [];

	        foreach ($segments as $index => $segment){
	            if ($segment === '.' || $segment === '..'){
	                // $out[0] is the empty segment in front of the leading slash.
	                if ($segment === '..' && count($out) > 1){
	                    array_pop($out);
	                }
	                if ($index === $last){
	                    // A trailing dot segment refers to a directory: keep the slash.
	                    $out[] = '';
	                }
	                continue;
	            }
	            $out[] = $segment;
	        }

	        return implode('/', $out);
	    }
	}
	<?php

	/**
	 * FIFO queue of URLs that ignores any URL it has already accepted. Two URLs
	 * count as the same when they differ only in http/https or a leading "www.".
	 */
	class LinkQueue extends \SplQueue {
	    /** @var array Normalized URLs accepted so far, used as a set (url => true). */
	    private $seen = [];

	    /**
	     * Add $url to the queue unless an equivalent URL was enqueued before.
	     * Every accepted URL is echoed to standard output.
	     *
	     * @param string $url Absolute URL.
	     */
	    #[\ReturnTypeWillChange]
	    public function enqueue($url){
	        $normalized = $this->normalizeUrl($url);
	        if (!isset($this->seen[$normalized])){
	            parent::enqueue($url);
	            $this->seen[$normalized] = true;
	            echo 'Enqueued URL: ', $url, PHP_EOL;
	        }
	    }

	    /**
	     * Reduce a URL to the key used for duplicate detection.
	     *
	     * @param string $url Absolute URL.
	     * @return string $url without its http(s):// prefix and leading "www.".
	     */
	    private function normalizeUrl($url){
	        $url = preg_replace('@^https?://@i', '', $url);
	        // The dot is escaped so that only a literal "www." prefix is removed;
	        // host names are case-insensitive, hence the i flag.
	        $url = preg_replace('@^www\.@i', '', $url);
	        return $url;
	    }
	}
	<?php

	// Load the classes relative to this script so the entry point works no matter
	// what the include_path or the current working directory is.
	require_once __DIR__ . '/Crawler.php';
	require_once __DIR__ . '/LinkExtractor.php';
	require_once __DIR__ . '/LinkQueue.php';

	// Crawl the site starting from its landing page; progress is echoed as it goes.
	$crawler = new Crawler();
	$crawler->crawl('https://aoeex.com/phile/');