Skip to content

Instantly share code, notes, and snippets.

@Zegnat
Last active November 5, 2017 15:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Zegnat/6b93c6e36682a9554a99696a11fa5fa4 to your computer and use it in GitHub Desktop.
Save Zegnat/6b93c6e36682a9554a99696a11fa5fa4 to your computer and use it in GitHub Desktop.
<?php
declare(strict_types=1);
namespace Zegnat\Utilities;
use League\Uri\Schemes\Http;
use League\Uri\Modifiers\Resolve;
use League\Uri\UriException;
/**
 * Collects the URLs found in URL-carrying attributes at or beneath a DOM node
 * and resolves each of them against a base URL.
 *
 * The base URL is the one handed to the constructor, further resolved against
 * the HREF of the first BASE element found in the owning document (if any).
 */
class LinkExtractor
{
    /**
     * URL attributes, mapped to the elements they are recognised on.
     * Values of these attributes are collected even when they are empty.
     * @url https://www.w3.org/TR/html5/index.html#attributes-1
     **/
    private $urlAttributes = [
        'cite' => ['blockquote', 'del', 'ins', 'q'],
        'href' => ['a', 'area', 'base'],
    ];

    /**
     * URL attributes, mapped to the elements they are recognised on.
     * Values of these attributes are only collected when they are non-empty
     * after whitespace stripping.
     **/
    private $nonEmptyUrlAttributes = [
        'action' => ['form'],
        'data' => ['object'],
        'formaction' => ['button', 'input'],
        'href' => ['link'],
        'manifest' => ['html'],
        'poster' => ['video'],
        'src' => ['audio', 'embed', 'iframe', 'img', 'input', 'script', 'source', 'track', 'video'],
    ];

    /**
     * Characters identified as space characters per the HTML5 spec:
     * space, tab, line feed, form feed and carriage return.
     * Double-quoted so the escapes become the actual characters trim() needs.
     * @url https://www.w3.org/TR/html5/infrastructure.html#space-character
     **/
    private $htmlSpaceCharacters = " \t\n\f\r";

    /** @var \DOMNode Node whose subtree is searched for links. */
    private $root;

    /** @var \DOMXPath XPath evaluator bound to the document owning $root. */
    private $xpath;

    /** @var mixed Base URL object that extracted URLs are resolved against. */
    private $baseUrl;

    /** @var string[]|null Memoised result of extract(); null until first computed. */
    private $extracted = null;

    /**
     * Strips leading and trailing whitespace per the HTML5 spec.
     * @url https://www.w3.org/TR/html5/infrastructure.html#strip-leading-and-trailing-whitespace
     *
     * @param string $string Raw attribute value.
     * @return string The value without leading/trailing HTML space characters.
     **/
    private function htmlStripWhitespace(string $string): string
    {
        // trim() with an explicit character list cannot fail, unlike
        // preg_replace(), which may return null and violate the return type.
        return trim($string, $this->htmlSpaceCharacters);
    }

    /**
     * Resolves a URL against the base URL of this extractor.
     *
     * @param string $url URL reference to resolve.
     * @return string The resolved URL, or $url unchanged when a UriException is thrown.
     **/
    private function resolveUrl(string $url): string
    {
        $resolver = new Resolve($this->baseUrl);
        try {
            return strval($resolver->process(Http::createFromString($url)));
        } catch (UriException $e) {
            // Best effort: keep the reference as written when it cannot be handled.
            return $url;
        }
    }

    /**
     * Tells whether an attribute on a given element carries a URL that should be collected.
     *
     * @param string $name    Attribute name.
     * @param string $element Tag name of the element owning the attribute.
     * @param string $url     Attribute value, already stripped of whitespace.
     * @return bool
     **/
    private function isUrlAttribute(string $name, string $element, string $url): bool
    {
        if (in_array($element, $this->urlAttributes[$name] ?? [], true)) {
            return true;
        }

        return $url !== ''
            && in_array($element, $this->nonEmptyUrlAttributes[$name] ?? [], true);
    }

    /**
     * @param \DOMNode $root    Node (or whole document) to extract links from.
     * @param string   $baseUrl URL that relative references are resolved against.
     *
     * @throws League\Uri\UriException If the BASE element’s HREF could not be parsed as HTTP valid.
     **/
    public function __construct(\DOMNode $root, string $baseUrl = '')
    {
        $this->root = $root;
        $ownerDocument = $root instanceof \DOMDocument ? $root : $root->ownerDocument;
        $this->xpath = new \DOMXPath($ownerDocument);
        $baseUrl = Http::createFromString($baseUrl);
        // Update the base URL in case a BASE element was provided.
        // We are not going to care about the validity of the location of the BASE element.
        $base = $this->xpath->query('//base[@href]', $root);
        if ($base !== false && $base->length > 0) {
            // Use the same HTML5 whitespace rules as for every other URL attribute;
            // plain trim() would strip NUL and vertical tab but keep form feed.
            $href = $this->htmlStripWhitespace($base->item(0)->getAttribute('href'));
            $resolver = new Resolve($baseUrl);
            $baseUrl = $resolver->process(Http::createFromString($href));
        }
        $this->baseUrl = $baseUrl;
    }

    /**
     * Returns every link found at or below the root node, resolved against the base URL.
     *
     * The result is computed once and memoised for subsequent calls.
     *
     * @return string[] Resolved URLs, in the order the XPath query returned their attributes.
     **/
    public function extract(): array
    {
        if (is_array($this->extracted)) {
            return $this->extracted;
        }

        // One XPath union selecting every attribute name from either table.
        $attributeNames = array_unique(array_merge(
            array_keys($this->urlAttributes),
            array_keys($this->nonEmptyUrlAttributes)
        ));
        $expression = implode(' | ', array_map(
            function (string $attribute): string {
                return './/@' . $attribute;
            },
            $attributeNames
        ));

        $links = [];
        $attributes = $this->xpath->query($expression, $this->root);
        if ($attributes !== false) {
            foreach ($attributes as $attribute) {
                $owner = $attribute->ownerElement;
                if ($owner === null) {
                    continue;
                }
                $url = $this->htmlStripWhitespace($attribute->value);
                // The XPath matches by attribute name only; keep the value
                // solely when the owning element is listed for that attribute.
                if ($this->isUrlAttribute($attribute->name, $owner->tagName, $url)) {
                    $links[] = $url;
                }
            }
        }

        $this->extracted = array_map(
            [$this, 'resolveUrl'],
            $links
        );

        return $this->extracted;
    }

    /**
     * Tells whether the extracted links contain the given URL.
     *
     * @param string $url URL to look for; it is resolved against the base URL before comparing.
     * @return bool True when the resolved URL is identical to one of the extracted links.
     **/
    public function linksTo(string $url): bool
    {
        // Strict comparison: loose == would treat numeric-looking strings
        // such as "1e1" and "10" as the same link.
        return in_array($this->resolveUrl($url), $this->extract(), true);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment