Last active
March 26, 2023 19:29
-
-
Save ulrischa/f99280e34ecec432d228eea44c89794b to your computer and use it in GitHub Desktop.
PHP check dead links with cURL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Recursive, single-host web crawler built on the cURL extension.
 *
 * Starting from a URL, crawl() downloads the page, extracts every <a href>
 * target and follows those that live on the same host as the start URL.
 * Links pointing to other hosts are only reported, never fetched.
 * Progress and errors are written to standard output (messages are German).
 *
 * Note: checkLink() and isValidUrl() are helpers that crawl() does not call.
 */
class WebCrawler {
    /**
     * cURL options shared by every request. Per-instance additions (user
     * agent, proxy) are merged into $curl_options by the constructor because
     * a class constant cannot be modified at runtime.
     */
    private const CURL_OPTIONS = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_CAINFO => "/path/to/cert.pem", // set this to the path of the CA certificate bundle
    ];

    /** @var array<string|int, bool> Set of already crawled URLs (URL => true). */
    private $visited_links = [];
    /** @var int|null Maximum recursion depth; null means unlimited. */
    private $max_depth;
    /** @var string User-Agent header sent with every request. */
    private $user_agent;
    /** @var string|null Proxy passed to CURLOPT_PROXY, or null for none. */
    private $proxy;
    /** @var string|null Host of the first crawled URL; links to other hosts are not followed. */
    private $base_host;
    /** @var array<int, mixed> Effective cURL options for this instance. */
    private $curl_options = [];

    /**
     * @param int|null    $max_depth  Maximum link depth to follow (start URL is depth 0); null = unlimited.
     * @param string      $user_agent User-Agent header value.
     * @param string|null $proxy      Proxy for CURLOPT_PROXY, or null to connect directly.
     */
    public function __construct($max_depth = null, $user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", $proxy = null) {
        $this->max_depth = $max_depth;
        $this->user_agent = $user_agent;
        $this->proxy = $proxy;

        // Build the option set on a copy: writing to self::CURL_OPTIONS is a fatal error.
        $options = self::CURL_OPTIONS;
        $options[CURLOPT_USERAGENT] = $user_agent;
        if ($proxy !== null) {
            $options[CURLOPT_PROXY] = $proxy;
        }
        $this->curl_options = $options;
    }

    /**
     * Crawls $url and, recursively, every same-host link found on it.
     *
     * The host of the first URL ever passed in becomes the base host.
     * URLs on a different host are reported and not fetched.
     *
     * @param string $url   Absolute URL to crawl.
     * @param int    $depth Current recursion depth (callers normally omit it).
     * @return void
     */
    public function crawl($url, $depth = 0) {
        if ($this->max_depth !== null && $depth > $this->max_depth) {
            return;
        }
        // Keyed lookup instead of a linear, loosely-typed in_array() scan.
        if (isset($this->visited_links[$url])) {
            return;
        }
        if ($this->base_host === null) {
            $this->base_host = parse_url($url, PHP_URL_HOST);
        }

        $url_host = parse_url($url, PHP_URL_HOST);
        if ($url_host !== null && $url_host !== $this->base_host) {
            echo "Link zu externer Domain gefunden: $url (depth=$depth)\n";
            return;
        }

        echo "Crawling: $url\n";
        $this->visited_links[$url] = true;

        $curl = curl_init();
        if ($curl === false) {
            echo "Fehler beim Abrufen von $url\n";
            return;
        }
        curl_setopt_array($curl, $this->curl_options);
        curl_setopt($curl, CURLOPT_URL, $url);
        $html = curl_exec($curl);
        curl_close($curl);

        if ($html === false) {
            echo "Fehler beim Abrufen von $url\n";
            return;
        }

        foreach ($this->extractLinks($html, $url) as $link) {
            $this->crawl($link, $depth + 1);
        }
    }

    /**
     * Resolves a link found on a page against that page's URL.
     *
     * Handles absolute URLs, scheme-relative ("//host/x"), root-relative
     * ("/x"), document-relative ("x", "../x") and query-/fragment-only
     * references. Relative paths are resolved against the directory of the
     * base path, "." and ".." segments are collapsed, and the base port is
     * preserved. The base URL's own query and fragment are not carried over
     * unless the reference is empty or fragment-only.
     *
     * @param string $url      The href value to resolve.
     * @param string $base_url Absolute URL of the page containing the link.
     * @return string The absolute URL, or $url unchanged if $base_url has no scheme/host.
     */
    private function makeAbsoluteUrl($url, $base_url) {
        // Already absolute: a scheme followed by "://" at the very start.
        // (Checking for "://" anywhere would misfire on "go?to=http://x".)
        if (preg_match('#^[a-z][a-z0-9+.-]*://#i', $url) === 1) {
            return $url;
        }

        $base = parse_url($base_url);
        if ($base === false || !isset($base['scheme'], $base['host'])) {
            return $url;
        }

        if (strpos($url, '//') === 0) {
            // Scheme-relative URL (e.g. "//example.com/x").
            return $base['scheme'] . ':' . $url;
        }

        $authority = $base['scheme'] . '://' . $base['host'];
        if (isset($base['port'])) {
            $authority .= ':' . $base['port'];
        }
        $base_path = isset($base['path']) ? $base['path'] : '/';

        // Split the reference into its path and its "?query#fragment" tail.
        $split = strcspn($url, '?#');
        $ref_path = (string) substr($url, 0, $split);
        $ref_tail = (string) substr($url, $split);

        if ($ref_path === '') {
            // Query-only, fragment-only or empty reference: same document.
            $path = $base_path;
            if (isset($base['query']) && ($ref_tail === '' || $ref_tail[0] === '#')) {
                $ref_tail = '?' . $base['query'] . $ref_tail;
            }
        } elseif ($ref_path[0] === '/') {
            // Root-relative path (e.g. "/path/to/file").
            $path = $ref_path;
        } else {
            // Relative to the directory of the current document
            // (e.g. "file.html" or "../file.html").
            $last_slash = strrpos($base_path, '/');
            $directory = $last_slash === false ? '/' : substr($base_path, 0, $last_slash + 1);
            $path = $directory . $ref_path;
        }

        return $authority . $this->removeDotSegments($path) . $ref_tail;
    }

    /**
     * Collapses "." and ".." segments in a URL path; never climbs above the root.
     *
     * @param string $path URL path.
     * @return string Normalised path starting with "/".
     */
    private function removeDotSegments($path) {
        if ($path === '' || $path[0] !== '/') {
            $path = '/' . $path;
        }

        $segments = explode('/', $path);
        $kept = [];
        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // $kept[0] is the empty string before the leading slash; keep it.
                if (count($kept) > 1) {
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }

        // A trailing "." or ".." refers to a directory, so keep the trailing slash.
        $last = end($segments);
        if ($last === '.' || $last === '..') {
            $kept[] = '';
        }

        $normalised = implode('/', $kept);
        return $normalised === '' ? '/' : $normalised;
    }

    /**
     * Extracts the targets of all <a href> elements as absolute URLs.
     *
     * Skipped: empty hrefs, fragment-only hrefs ("#top") and hrefs with a
     * scheme other than http/https (mailto:, tel:, javascript:, ...).
     * Fragments are stripped from the returned URLs.
     *
     * @param string $html     Page markup.
     * @param string $base_url URL the markup was loaded from.
     * @return string[] Absolute URLs in document order (may contain duplicates).
     */
    private function extractLinks($html, $base_url) {
        // DOMDocument::loadHTML() throws on an empty string in PHP 8.
        if (!is_string($html) || trim($html) === '') {
            return [];
        }

        // Collect parser warnings internally instead of silencing with "@".
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $result = [];
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $href = trim($anchor->getAttribute('href'));
            if ($href === '' || $href[0] === '#') {
                continue;
            }
            if (preg_match('#^([a-z][a-z0-9+.-]*):#i', $href, $scheme) === 1
                && !in_array(strtolower($scheme[1]), ['http', 'https'], true)) {
                continue;
            }

            $absolute = $this->makeAbsoluteUrl($href, $base_url);
            $hash = strpos($absolute, '#');
            if ($hash !== false) {
                $absolute = (string) substr($absolute, 0, $hash);
            }
            if ($absolute !== '') {
                $result[] = $absolute;
            }
        }
        return $result;
    }

    /**
     * Requests $url and prints whether it answered with HTTP 200.
     *
     * Redirects are followed (see CURL_OPTIONS); the status of the final
     * response is the one that is evaluated.
     *
     * @param string $url   URL to check.
     * @param int    $depth Depth at which the link was found (only used in the output).
     * @return void
     */
    private function checkLink($url, $depth) {
        $curl = curl_init();
        if ($curl === false) {
            echo "Link fehlerhaft: $url (depth=$depth)\n";
            return;
        }
        curl_setopt_array($curl, $this->curl_options);
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, true);
        $response = curl_exec($curl);
        $header_size = (int) curl_getinfo($curl, CURLINFO_HEADER_SIZE);
        curl_close($curl);

        if ($response === false) {
            echo "Link fehlerhaft: $url (depth=$depth)\n";
            return;
        }

        // Only look at the header section so body text cannot be mistaken for a status line.
        $status_code = $this->getStatusCode((string) substr($response, 0, $header_size));
        if ($status_code === 200) {
            echo "Link OK: $url (depth=$depth)\n";
        } else {
            echo "Link fehlerhaft: $url ($status_code) (depth=$depth)\n";
        }
    }

    /**
     * Returns the status code of the last HTTP status line in $headers.
     *
     * When redirects are followed, the header data contains one header block
     * per hop; the last status line belongs to the final response. The reason
     * phrase is optional, so "HTTP/2 404" is understood as well.
     *
     * @param string $headers Raw response header text.
     * @return int|null Status code, or null if no status line was found.
     */
    private function getStatusCode($headers) {
        $found = preg_match_all('/^HTTP\/\d+(?:\.\d+)?\s+(\d{3})\b/mi', (string) $headers, $matches);
        if ($found === false || $found === 0) {
            return null;
        }
        return intval(end($matches[1]));
    }

    /**
     * @param string $url Candidate URL.
     * @return bool True if FILTER_VALIDATE_URL accepts $url.
     */
    private function isValidUrl($url) {
        return filter_var($url, FILTER_VALIDATE_URL) !== false;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment