Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save hzhopen/9906254 to your computer and use it in GitHub Desktop.
Save hzhopen/9906254 to your computer and use it in GitHub Desktop.
检测一个网页上的链接是否都可以正常访问
<?php
// Check whether every link on a web page can be accessed normally.
// Address (page) to test:
$viewUrl = "";
// Base URI handed to _expandlinks() when the extracted links are expanded.
$host = "";
/**
 * Extract the href targets of all <a> tags found in an HTML document.
 *
 * The pattern captures quoted href values in group 2 (the "yes" branch of
 * the conditional subpattern) and unquoted ones in group 3 (the "no" branch).
 *
 * @param string $document HTML source to scan.
 * @return array Raw href values, quoted ones first, then unquoted ones.
 *               An empty array is returned when the document has no links.
 */
function _striplinks($document) {
    preg_match_all("'<\s*a\s.*?href\s*=\s*([\"\'])?(?(1) (.*?)\\1 | ([^\s\>]+))'isx", $document, $links);

    // Always hand back an array, even when nothing matched.
    $match = array();

    // catenate the non-empty matches from the conditional subpattern;
    // foreach replaces each(), which was removed in PHP 8.0.
    foreach (array(2, 3) as $group) {
        if (!isset($links[$group]) || !is_array($links[$group])) {
            continue;
        }
        foreach ($links[$group] as $val) {
            if (!empty($val)) {
                $match[] = $val;
            }
        }
    }

    // return the links
    return $match;
}
/*===================================================================*\
Function: _expandlinks
Purpose: expand each link into a fully qualified URL
Input: $links the link (string) or links (array of strings) to qualify
$URI the full URI to get the base from
Output: $expandedLinks the expanded link(s), same shape as $links
Notes: links starting with http://, https:// or mailto: are kept as they
are; a leading "/" is resolved against scheme://host[:port] of $URI;
everything else is resolved against the base directory of $URI.
Other schemes (javascript:, tel:, ...) and protocol-relative
"//host/..." links are still treated as paths.
\*===================================================================*/
function _expandlinks($links,$URI){
    $URI_PARTS = parse_url($URI);
    if (!is_array($URI_PARTS)) {
        // parse_url() returns false for seriously malformed URIs.
        $URI_PARTS = array();
    }

    // Root of the site: scheme://host[:port]. The port is kept so that
    // root-relative links stay on the same server as $URI.
    $scheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
    $authority = isset($URI_PARTS["host"]) ? $URI_PARTS["host"] : "";
    if (isset($URI_PARTS["port"])) {
        $authority .= ":".$URI_PARTS["port"];
    }
    $match_root = $scheme."://".$authority;

    // Base directory: the URI without its query string, without a trailing
    // "file.ext" segment and without a trailing slash.
    preg_match("/^[^\?]+/",$URI,$match);
    $match = isset($match[0]) ? $match[0] : "";
    // The lookbehind keeps the "//host.tld" part of a path-less URI such as
    // "http://example.com" from being mistaken for a "/file.ext" segment.
    $match = preg_replace("|(?<![:/])/[^\/\.]+\.[^\/\.]+$|","",$match);
    $match = preg_replace("|/$|","",$match);

    $search = array(
        // same-site absolute link -> root-relative (re-expanded by the next rule);
        // "(?=/)" stops "http://example.com.evil.org" matching host "example.com"
        "|^".preg_quote($match_root, "|")."(?=/)|i",
        // root-relative link
        "|^(\/)|i",
        // anything that is not already absolute (http, https) or a mailto: link
        "|^(?!https?://)(?!mailto:)|i",
        // collapse "/./"
        "|/\./|",
        // collapse "/dir/../"
        "|/[^\/]+/\.\./|"
    );
    $replace = array(
        "",
        $match_root."/",
        $match."/",
        "/",
        "/"
    );

    $expandedLinks = preg_replace($search,$replace,$links);
    return $expandedLinks;
}
/**
 * Probe a URL and print whether it could be reached.
 *
 * Prints "\n<key>" followed by either the red "Not Found" marker or
 * "---- ok". A link is reported as not found when no HTTP response could
 * be obtained at all, or when the final response status is 404 (any HTTP
 * version).
 *
 * @param string $url Fully qualified URL to probe.
 * @param mixed  $key Label echoed in front of the verdict.
 * @return void
 */
function _checkOk($url, $key){
    // get_headers() raises a warning and returns false on failure; the
    // warning is suppressed because the false result is handled below.
    $headers = @get_headers($url);

    // When redirects are followed, get_headers() lists the header lines of
    // every response in order, so the last status line is the final outcome.
    $status = 0;
    if (is_array($headers)) {
        foreach ($headers as $line) {
            if (is_string($line) && preg_match('#^HTTP/\S+\s+(\d{3})#i', $line, $m)) {
                $status = (int) $m[1];
            }
        }
    }

    // $status stays 0 when the request failed or no status line was seen.
    if ($status === 0 || $status === 404) {
        echo "\n".$key;
        echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Not Found </div>';
        echo "\n";
    } else {
        echo "\n".$key;
        echo "---- ok \n";
    }
}
/*
 * Fetch the configured page with curl, extract all of its links and
 * report for each one whether it can be reached.
 */
$ch = curl_init();
// The page address is configured in $viewUrl.
curl_setopt($ch, CURLOPT_URL, $viewUrl);
// Include the HTTP header in the returned data.
curl_setopt($ch, CURLOPT_HEADER, 1);
// The body must be downloaded too (no CURLOPT_NOBODY): the links are in it.
// Return the result instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$html = curl_exec($ch);
$info = curl_getinfo($ch);
if ($html === false) {
    echo "cURL Error: " . curl_error($ch);
    // Nothing was fetched; continue with an empty document so the report
    // below is simply empty instead of scanning a boolean.
    $html = "";
}
curl_close($ch);

$linkarr = _striplinks($html);

// Base used to complete relative links: $host when configured, otherwise
// the address of the page itself.
$baseUri = ($host !== "") ? $host : $viewUrl;

// Start from an empty list so a page without links is handled cleanly.
$linkresult = array();
if (is_array($linkarr)) {
    foreach ($linkarr as $k => $v) {
        $linkresult[$k] = _expandlinks($v, $baseUri);
    }
}
$linkresult = array_unique($linkresult);

echo '<pre>';
foreach ($linkresult as $key => $value){
    echo $key."--".$value;
    _checkOk($value,$key);
}
echo "\n ==============RUN OVER=============== \n";
//printf("<p>All links on this page:</p><pre>%s</pre>\n", var_export($linkresult , true));
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment