Last active
November 5, 2017 15:28
-
-
Save Zegnat/6b93c6e36682a9554a99696a11fa5fa4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types=1); | |
namespace Zegnat\Utilities; | |
use League\Uri\Schemes\Http; | |
use League\Uri\Modifiers\Resolve; | |
use League\Uri\UriException; | |
/**
 * Collects the URLs referenced by URL-carrying attributes (href, src, cite, ...)
 * below a DOM node and resolves each of them against a base URL.
 */
class LinkExtractor
{
    /**
     * Attributes that are collected even when their (stripped) value is empty,
     * mapped to the element names they are recognised on.
     *
     * @link https://www.w3.org/TR/html5/index.html#attributes-1
     * @var array<string, string[]>
     */
    private $urlAttributes = [
        'cite' => ['blockquote', 'del', 'ins', 'q'],
        'href' => ['a', 'area', 'base'],
    ];

    /**
     * Attributes that are only collected when their (stripped) value is not
     * empty, mapped to the element names they are recognised on.
     *
     * @var array<string, string[]>
     */
    private $nonEmptyUrlAttributes = [
        'action' => ['form'],
        'data' => ['object'],
        'formaction' => ['button', 'input'],
        'href' => ['link'],
        'manifest' => ['html'],
        'poster' => ['video'],
        'src' => ['audio', 'embed', 'iframe', 'img', 'input', 'script', 'source', 'track', 'video'],
    ];

    /**
     * Characters identified as space characters per the HTML5 spec: space,
     * tab, line feed, form feed and carriage return. Double-quoted so the
     * property holds the real characters and can be used as a trim() mask.
     *
     * @link https://www.w3.org/TR/html5/infrastructure.html#space-character
     * @var string
     */
    private $htmlSpaceCharacters = " \t\n\f\r";

    /** @var \DOMNode Node whose descendants are searched for URL attributes. */
    private $root;

    /** @var \DOMXPath XPath evaluator bound to the owner document of $root. */
    private $xpath;

    /** @var mixed Parsed base URL (the value returned by League's Http/Resolve classes). */
    private $baseUrl;

    /** @var string[]|null Memoised result of extract(); null until first computed. */
    private $extracted = null;

    /**
     * Strips leading and trailing whitespace per the HTML5 spec.
     *
     * Uses trim() with an explicit character list; unlike a regular expression
     * replacement this can never yield null, so the string return type always holds.
     *
     * @link https://www.w3.org/TR/html5/infrastructure.html#strip-leading-and-trailing-whitespace
     *
     * @param string $string Raw attribute value.
     * @return string The value without leading/trailing HTML space characters.
     */
    private function htmlStripWhitespace(string $string): string
    {
        return trim($string, $this->htmlSpaceCharacters);
    }

    /**
     * Resolves a URL against the base URL of this extractor.
     *
     * @param string $url URL as found in the document (or passed to linksTo()).
     * @return string The resolved URL, or $url unchanged when a UriException
     *                is thrown while parsing or resolving it.
     */
    private function resolveUrl(string $url): string
    {
        $resolver = new Resolve($this->baseUrl);
        try {
            return strval($resolver->process(Http::createFromString($url)));
        } catch (UriException $e) {
            // Best effort: keep links that cannot be handled as HTTP URLs as they were written.
            return $url;
        }
    }

    /**
     * @param \DOMNode $root    Document, or node inside a document, to extract links from.
     * @param string   $baseUrl URL that relative links (and a BASE element's HREF) resolve against.
     *
     * @throws \InvalidArgumentException If $root is not a document and has no owner document.
     * @throws UriException If $baseUrl or the BASE element's HREF could not be parsed as HTTP valid.
     */
    public function __construct(\DOMNode $root, string $baseUrl = '')
    {
        $ownerDocument = $root instanceof \DOMDocument ? $root : $root->ownerDocument;
        if (!$ownerDocument instanceof \DOMDocument) {
            throw new \InvalidArgumentException('The root node must be, or belong to, a DOMDocument.');
        }

        $this->root = $root;
        $this->xpath = new \DOMXPath($ownerDocument);
        $baseUrl = Http::createFromString($baseUrl);

        // Update the base URL in case a BASE element was provided.
        // We are not going to care about the validity of the location of the BASE element.
        $base = $this->xpath->query('//base[@href]', $root);
        if ($base !== false && $base->length > 0) {
            // Strip with the same HTML5 whitespace rules used for every other URL attribute.
            $href = $this->htmlStripWhitespace($base->item(0)->getAttribute('href'));
            $resolver = new Resolve($baseUrl);
            $baseUrl = $resolver->process(Http::createFromString($href));
        }

        $this->baseUrl = $baseUrl;
    }

    /**
     * Returns every URL found in a recognised URL attribute at or below the root node.
     *
     * The result is computed once and memoised. Duplicates are kept, in the
     * order returned by the XPath query.
     *
     * @return string[] Resolved URLs (or the raw value where resolution failed).
     */
    public function extract(): array
    {
        if (is_array($this->extracted)) {
            return $this->extracted;
        }

        // Build one XPath union selecting every candidate attribute: ".//@cite | .//@href | ...".
        $attributeNames = array_unique(array_merge(
            array_keys($this->urlAttributes),
            array_keys($this->nonEmptyUrlAttributes)
        ));
        $expression = implode(' | ', array_map(
            static function (string $attribute): string {
                return './/@' . $attribute;
            },
            $attributeNames
        ));

        $links = [];
        foreach ($this->xpath->query($expression, $this->root) as $urlAttribute) {
            $name = $urlAttribute->name;
            $element = $urlAttribute->ownerElement->tagName;
            $url = $this->htmlStripWhitespace($urlAttribute->value);

            // The attribute only counts when it sits on an element it is defined for.
            $acceptsEmpty = in_array($element, $this->urlAttributes[$name] ?? [], true);
            $requiresValue = in_array($element, $this->nonEmptyUrlAttributes[$name] ?? [], true);

            if ($acceptsEmpty || ($requiresValue && $url !== '')) {
                $links[] = $url;
            }
        }

        $this->extracted = array_map([$this, 'resolveUrl'], $links);

        return $this->extracted;
    }

    /**
     * Tells whether the extracted links contain the given URL.
     *
     * The URL is resolved against the same base URL before it is compared.
     * The comparison is strict, so numeric-looking strings such as "1e3" and
     * "1000" are not treated as equal.
     *
     * @param string $url URL to look for.
     * @return bool True when the resolved URL is among the extracted links.
     */
    public function linksTo(string $url): bool
    {
        return in_array($this->resolveUrl($url), $this->extract(), true);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment