Zegnat/LinkExtractor.php

## LinkExtractor.php
<?php
declare(strict_types=1);

namespace Zegnat\Utilities;

use League\Uri\Schemes\Http;
use League\Uri\Modifiers\Resolve;
use League\Uri\UriException;

/**
 * Collects the URLs referenced by URL-carrying attributes inside a DOM subtree
 * and resolves them against a base URL.
 */
class LinkExtractor
{
    /**
     * URL attributes whose value is collected even when it is empty, keyed by
     * attribute name and listing the elements they are recognised on.
     * @url https://www.w3.org/TR/html5/index.html#attributes-1
     **/
    private $urlAttributes = [
        'cite' => ['blockquote', 'del', 'ins', 'q'],
        'href' => ['a', 'area', 'base'],
    ];

    /**
     * URL attributes whose value is only collected when it is non-empty after
     * whitespace stripping, keyed by attribute name and listing the elements
     * they are recognised on.
     * @url https://www.w3.org/TR/html5/index.html#attributes-1
     **/
    private $nonEmptyUrlAttributes = [
        'action' => ['form'],
        'data' => ['object'],
        'formaction' => ['button', 'input'],
        'href' => ['link'],
        'manifest' => ['html'],
        'poster' => ['video'],
        'src' => ['audio', 'embed', 'iframe', 'img', 'input', 'script', 'source', 'track', 'video'],
    ];

    /**
     * Characters identified as space characters per the HTML5 spec.
     * Single-quoted on purpose: the escape sequences are not expanded by PHP but
     * by PCRE, inside the character class built in htmlStripWhitespace().
     * @url https://www.w3.org/TR/html5/infrastructure.html#space-character
     **/
    private $htmlSpaceCharacters = ' \t\n\f\r';

    /** @var \DOMNode Node whose subtree is searched for URL attributes. */
    private $root;

    /** @var \DOMXPath XPath evaluator bound to the document that owns $root. */
    private $xpath;

    /** @var mixed URI object every extracted link is resolved against (BASE element already applied). */
    private $baseUrl;

    /** @var string[]|null Cached result of extract(); null until it has run once. */
    private $extracted = null;

    /**
     * Strips leading and trailing whitespace per the HTML5 spec.
     * @url https://www.w3.org/TR/html5/infrastructure.html#strip-leading-and-trailing-whitespace
     *
     * @param string $string Raw attribute value.
     * @return string The value without leading/trailing HTML space characters.
     **/
    private function htmlStripWhitespace(string $string): string
    {
        $pattern = sprintf('@^[%1$s]*|[%1$s]*$@', $this->htmlSpaceCharacters);
        $stripped = preg_replace($pattern, '', $string);

        // preg_replace() yields null on a PCRE failure; keep the input rather
        // than violating the declared string return type.
        return $stripped === null ? $string : $stripped;
    }

    /**
     * Resolves a URL against the stored base URL.
     *
     * @param string $url Absolute or relative URL.
     * @return string The resolved URL, or $url unchanged when it cannot be parsed.
     **/
    private function resolveUrl(string $url): string
    {
        $resolver = new Resolve($this->baseUrl);
        try {
            return strval($resolver->process(Http::createFromString($url)));
        } catch (UriException $e) {
            // Best effort: an unparsable URL is reported as-is.
            return $url;
        }
    }

    /**
     * @param \DOMNode $root    Document or node whose subtree will be searched.
     * @param string   $baseUrl URL that relative links are resolved against.
     *
     * @throws \League\Uri\UriException If the BASE element’s HREF could not be parsed as HTTP valid
     *                                  (presumably also when $baseUrl itself cannot be parsed).
     **/
    public function __construct(\DOMNode $root, string $baseUrl = '')
    {
        $this->root = $root;
        $ownerDocument = $root instanceof \DOMDocument ? $root : $root->ownerDocument;
        $this->xpath = new \DOMXPath($ownerDocument);
        $resolvedBase = Http::createFromString($baseUrl);

        // Update the base URL in case a BASE element was provided.
        // We are not going to care about the validity of the location of the BASE element:
        // the expression is absolute, so the whole document is searched.
        $base = $this->xpath->query('//base[@href]', $root);
        if ($base !== false && $base->length > 0) {
            // Same HTML5 whitespace rules as every other URL attribute, not PHP's trim().
            $href = $this->htmlStripWhitespace($base->item(0)->getAttribute('href'));
            $resolvedBase = (new Resolve($resolvedBase))->process(Http::createFromString($href));
        }

        $this->baseUrl = $resolvedBase;
    }

    /**
     * Returns every URL found in a recognised attribute/element combination
     * below the root node, resolved against the base URL.
     *
     * The result is computed once and cached for subsequent calls.
     *
     * @return string[] Resolved URLs in the order the XPath query returned them.
     **/
    public function extract(): array
    {
        if (is_array($this->extracted)) {
            return $this->extracted;
        }

        // One union expression selecting every candidate attribute in the subtree.
        $attributeNames = array_unique(array_merge(
            array_keys($this->urlAttributes),
            array_keys($this->nonEmptyUrlAttributes)
        ));
        $expression = implode(' | ', array_map(
            static function (string $attribute): string {
                return './/@' . $attribute;
            },
            $attributeNames
        ));

        $urlAttributes = $this->xpath->query($expression, $this->root);
        if ($urlAttributes === false) {
            // Failed query: treat as "no links" instead of iterating over false.
            $urlAttributes = [];
        }

        $links = [];
        foreach ($urlAttributes as $urlAttribute) {
            $name = $urlAttribute->name;
            $element = $urlAttribute->parentNode->tagName;
            $url = $this->htmlStripWhitespace($urlAttribute->value);

            $mayBeEmpty = array_key_exists($name, $this->urlAttributes)
                && in_array($element, $this->urlAttributes[$name], true);
            $mustHaveValue = array_key_exists($name, $this->nonEmptyUrlAttributes)
                && in_array($element, $this->nonEmptyUrlAttributes[$name], true);

            if ($mayBeEmpty || ($mustHaveValue && $url !== '')) {
                $links[] = $url;
            }
        }

        $this->extracted = array_map([$this, 'resolveUrl'], $links);

        return $this->extracted;
    }

    /**
     * Tells whether the given URL, once resolved against the base URL, is among
     * the extracted links.
     *
     * @param string $url Absolute or relative URL to look for.
     * @return bool True when an extracted link is identical to the resolved URL.
     **/
    public function linksTo(string $url): bool
    {
        // Strict comparison: loose in_array() treats numeric-looking strings
        // such as "1e3" and "1000" as equal.
        return in_array($this->resolveUrl($url), $this->extract(), true);
    }
}
	<?php
	declare(strict_types=1);

	namespace Zegnat\Utilities;

	use League\Uri\Schemes\Http;
	use League\Uri\Modifiers\Resolve;
	use League\Uri\UriException;

	/**
	 * Collects the URLs referenced by URL-carrying attributes inside a DOM subtree
	 * and resolves them against a base URL.
	 */
	class LinkExtractor
	{
	    /**
	     * URL attributes whose value is collected even when it is empty, keyed by
	     * attribute name and listing the elements they are recognised on.
	     * @url https://www.w3.org/TR/html5/index.html#attributes-1
	     **/
	    private $urlAttributes = [
	        'cite' => ['blockquote', 'del', 'ins', 'q'],
	        'href' => ['a', 'area', 'base'],
	    ];

	    /**
	     * URL attributes whose value is only collected when it is non-empty after
	     * whitespace stripping, keyed by attribute name and listing the elements
	     * they are recognised on.
	     * @url https://www.w3.org/TR/html5/index.html#attributes-1
	     **/
	    private $nonEmptyUrlAttributes = [
	        'action' => ['form'],
	        'data' => ['object'],
	        'formaction' => ['button', 'input'],
	        'href' => ['link'],
	        'manifest' => ['html'],
	        'poster' => ['video'],
	        'src' => ['audio', 'embed', 'iframe', 'img', 'input', 'script', 'source', 'track', 'video'],
	    ];

	    /**
	     * Characters identified as space characters per the HTML5 spec.
	     * Single-quoted on purpose: the escape sequences are not expanded by PHP but
	     * by PCRE, inside the character class built in htmlStripWhitespace().
	     * @url https://www.w3.org/TR/html5/infrastructure.html#space-character
	     **/
	    private $htmlSpaceCharacters = ' \t\n\f\r';

	    /** @var \DOMNode Node whose subtree is searched for URL attributes. */
	    private $root;

	    /** @var \DOMXPath XPath evaluator bound to the document that owns $root. */
	    private $xpath;

	    /** @var mixed URI object every extracted link is resolved against (BASE element already applied). */
	    private $baseUrl;

	    /** @var string[]|null Cached result of extract(); null until it has run once. */
	    private $extracted = null;

	    /**
	     * Strips leading and trailing whitespace per the HTML5 spec.
	     * @url https://www.w3.org/TR/html5/infrastructure.html#strip-leading-and-trailing-whitespace
	     *
	     * @param string $string Raw attribute value.
	     * @return string The value without leading/trailing HTML space characters.
	     **/
	    private function htmlStripWhitespace(string $string): string
	    {
	        // Two alternatives: a run of space characters anchored at the start,
	        // or a run anchored at the end; both are replaced by nothing.
	        $pattern = sprintf('@^[%1$s]*|[%1$s]*$@', $this->htmlSpaceCharacters);
	        $stripped = preg_replace($pattern, '', $string);

	        // preg_replace() yields null on a PCRE failure; keep the input rather
	        // than violating the declared string return type.
	        return $stripped === null ? $string : $stripped;
	    }

	    /**
	     * Resolves a URL against the stored base URL.
	     *
	     * @param string $url Absolute or relative URL.
	     * @return string The resolved URL, or $url unchanged when it cannot be parsed.
	     **/
	    private function resolveUrl(string $url): string
	    {
	        $resolver = new Resolve($this->baseUrl);
	        try {
	            return strval($resolver->process(Http::createFromString($url)));
	        } catch (UriException $e) {
	            // Best effort: an unparsable URL is reported as-is.
	            return $url;
	        }
	    }

	    /**
	     * @param \DOMNode $root    Document or node whose subtree will be searched.
	     * @param string   $baseUrl URL that relative links are resolved against.
	     *
	     * @throws \League\Uri\UriException If the BASE element’s HREF could not be parsed as HTTP valid
	     *                                  (presumably also when $baseUrl itself cannot be parsed).
	     **/
	    public function __construct(\DOMNode $root, string $baseUrl = '')
	    {
	        $this->root = $root;
	        $ownerDocument = $root instanceof \DOMDocument ? $root : $root->ownerDocument;
	        $this->xpath = new \DOMXPath($ownerDocument);
	        $resolvedBase = Http::createFromString($baseUrl);

	        // Update the base URL in case a BASE element was provided.
	        // We are not going to care about the validity of the location of the BASE element:
	        // the expression is absolute, so the whole document is searched.
	        $base = $this->xpath->query('//base[@href]', $root);
	        if ($base !== false && $base->length > 0) {
	            // Same HTML5 whitespace rules as every other URL attribute, not PHP's trim().
	            $href = $this->htmlStripWhitespace($base->item(0)->getAttribute('href'));
	            $resolvedBase = (new Resolve($resolvedBase))->process(Http::createFromString($href));
	        }

	        $this->baseUrl = $resolvedBase;
	    }

	    /**
	     * Returns every URL found in a recognised attribute/element combination
	     * below the root node, resolved against the base URL.
	     *
	     * The result is computed once and cached for subsequent calls.
	     *
	     * @return string[] Resolved URLs in the order the XPath query returned them.
	     **/
	    public function extract(): array
	    {
	        if (is_array($this->extracted)) {
	            return $this->extracted;
	        }

	        // One union expression selecting every candidate attribute in the subtree.
	        $attributeNames = array_unique(array_merge(
	            array_keys($this->urlAttributes),
	            array_keys($this->nonEmptyUrlAttributes)
	        ));
	        $expression = implode(' | ', array_map(
	            static function (string $attribute): string {
	                return './/@' . $attribute;
	            },
	            $attributeNames
	        ));

	        $urlAttributes = $this->xpath->query($expression, $this->root);
	        if ($urlAttributes === false) {
	            // Failed query: treat as "no links" instead of iterating over false.
	            $urlAttributes = [];
	        }

	        $links = [];
	        foreach ($urlAttributes as $urlAttribute) {
	            $name = $urlAttribute->name;
	            $element = $urlAttribute->parentNode->tagName;
	            $url = $this->htmlStripWhitespace($urlAttribute->value);

	            $mayBeEmpty = array_key_exists($name, $this->urlAttributes)
	                && in_array($element, $this->urlAttributes[$name], true);
	            $mustHaveValue = array_key_exists($name, $this->nonEmptyUrlAttributes)
	                && in_array($element, $this->nonEmptyUrlAttributes[$name], true);

	            if ($mayBeEmpty || ($mustHaveValue && $url !== '')) {
	                $links[] = $url;
	            }
	        }

	        $this->extracted = array_map([$this, 'resolveUrl'], $links);

	        return $this->extracted;
	    }

	    /**
	     * Tells whether the given URL, once resolved against the base URL, is among
	     * the extracted links.
	     *
	     * @param string $url Absolute or relative URL to look for.
	     * @return bool True when an extracted link is identical to the resolved URL.
	     **/
	    public function linksTo(string $url): bool
	    {
	        // Strict comparison: loose in_array() treats numeric-looking strings
	        // such as "1e3" and "1000" as equal.
	        return in_array($this->resolveUrl($url), $this->extract(), true);
	    }
	}