Created
April 1, 2014 01:52
-
-
Save hzhopen/9906254 to your computer and use it in GitHub Desktop.
检测一个网页上的链接是否都可以正常访问
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Check whether every link on a web page can be reached.

// URL of the page whose links should be tested.
$viewUrl = "";
// Base URI used to turn relative links into absolute ones
// (for example "http://example.com/dir/").
$host = "";
/**
 * Extract the href target of every <a> tag found in an HTML document.
 *
 * Quoted href values are returned first (in document order), followed by
 * the unquoted ones, which is the order the original implementation used.
 *
 * @param string $document HTML markup to scan.
 * @return array List of raw href values; an empty array when none is found.
 */
function _striplinks($document)
{
    // Group 1 captures an optional opening quote. When it is present the
    // conditional subpattern takes the quoted form (group 2), otherwise the
    // unquoted form (group 3).
    preg_match_all(
        "'<\s*a\s.*?href\s*=\s*([\"\'])?(?(1) (.*?)\\1 | ([^\s\>]+))'isx",
        (string) $document,
        $links
    );

    // Always return an array so callers can iterate without a null check.
    $match = array();

    // Concatenate the non-empty matches from both branches of the
    // conditional subpattern. each() was removed in PHP 8, so use foreach.
    foreach (array(2, 3) as $group) {
        if (empty($links[$group])) {
            continue;
        }
        foreach ($links[$group] as $val) {
            // Compare against '' rather than empty(): an href of "0" is valid.
            if ($val !== '') {
                $match[] = $val;
            }
        }
    }

    return $match;
}
/**
 * Expand each link into a fully qualified URL.
 *
 * Links that already carry a scheme (http:, https:, mailto:, ...) are left
 * alone apart from path normalisation. Protocol-relative links (//host/x)
 * inherit the scheme of the base, root-relative links (/x) are resolved
 * against the base's scheme://host[:port], and every other link is resolved
 * against the base's directory.
 *
 * @param string|array $links The link, or list of links, to qualify.
 * @param string       $URI   The full URI to take the base from.
 * @return string|array The expanded link(s), in the same shape as $links.
 *                      When $URI has no scheme or host there is nothing to
 *                      resolve against and $links is returned unchanged.
 */
function _expandlinks($links, $URI)
{
    $parts = parse_url((string) $URI);
    if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        return $links;
    }

    // scheme://host[:port] -- the target for root-relative links.
    $root = $parts['scheme'] . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $root .= ':' . $parts['port'];
    }

    // Directory of the base document: drop a trailing "name.ext" segment and
    // the trailing slash. Working on the parsed path (not the whole URI)
    // keeps a bare host such as "example.com" from being mistaken for a file.
    $dir = isset($parts['path']) ? $parts['path'] : '';
    $dir = preg_replace('|/[^/.]+\.[^/.]+$|', '', $dir);
    $dir = preg_replace('|/$|', '', $dir);
    $base = $root . $dir;

    // The patterns are applied in order to each link.
    $search = array(
        // Protocol-relative link: borrow the base scheme.
        '|^//|',
        // Same-site absolute link: strip the root so it is re-added in
        // canonical form. The lookahead stops "example.com" from matching
        // the start of "example.com.evil.org".
        '|^' . preg_quote($root, '|') . '(?=/)|i',
        // Root-relative link.
        '|^/|',
        // Anything without a scheme is relative to the base directory.
        '|^(?![a-z][a-z0-9+.\-]*:)|i',
        // Collapse "/./" and "/segment/../".
        '|/\./|',
        '|/[^/]+/\.\./|',
    );
    $replace = array(
        $parts['scheme'] . '://',
        '',
        $root . '/',
        $base . '/',
        '/',
        '/',
    );

    return preg_replace($search, $replace, $links);
}
/**
 * Request a URL's headers and print whether the link is reachable.
 *
 * The verdict is based on the status code of the last HTTP status line
 * returned, i.e. the final response after any redirects. A 404 is reported
 * as "Not Found"; any other 4xx/5xx status, or no response at all, is
 * reported as a failure; everything else is reported as ok.
 *
 * @param string     $url The absolute URL to probe.
 * @param int|string $key Label printed in front of the verdict.
 * @return void
 */
function _checkOk($url, $key)
{
    // get_headers() emits a warning and returns false when the request
    // cannot be made; that case is handled explicitly below.
    $headers = @get_headers($url);

    // 0 means "no status line seen" (request failed or malformed response).
    $status = 0;
    if (is_array($headers)) {
        foreach ($headers as $line) {
            if (is_string($line) && preg_match('|^HTTP/\S+\s+(\d{3})|i', $line, $m)) {
                $status = (int) $m[1];
            }
        }
    }

    echo "\n" . $key;
    if ($status === 404) {
        echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Not Found </div>';
        echo "\n";
    } elseif ($status === 0 || $status >= 400) {
        $reason = ($status === 0) ? 'no response' : 'HTTP ' . $status;
        echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Failed (' . $reason . ') </div>';
        echo "\n";
    } else {
        echo "---- ok \n";
    }
}
/*
 * Fetch the page with cURL, extract every link it contains, expand the
 * links to absolute URLs and check each one.
 */
$ch = curl_init();
// The page to fetch is configured in $viewUrl.
curl_setopt($ch, CURLOPT_URL, $viewUrl);
// Return the response instead of printing it. Response headers are not
// requested because only the HTML body is scanned for links.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$html = curl_exec($ch);
// Read the error text before the handle is closed.
$fetchError = ($html === false) ? curl_error($ch) : '';
curl_close($ch);

echo '<pre>';
if ($html === false) {
    // Nothing to scan when the fetch itself failed.
    echo "cURL Error: " . htmlspecialchars($fetchError, ENT_QUOTES, 'UTF-8');
} else {
    // Base URI used to complete relative links; fall back to the page URL
    // when no explicit base has been configured.
    $baseUri = ($host !== "") ? $host : $viewUrl;

    $linkresult = array();
    $linkarr = _striplinks($html);
    if (is_array($linkarr)) {
        foreach ($linkarr as $k => $v) {
            $linkresult[$k] = _expandlinks($v, $baseUri);
        }
    }
    // Probe each distinct URL only once.
    $linkresult = array_unique($linkresult);

    foreach ($linkresult as $key => $value) {
        // The link text comes from the remote page, so escape it for HTML.
        echo $key . "--" . htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
        _checkOk($value, $key);
    }
}
echo "\n ==============RUN OVER=============== \n";
echo '</pre>';
// Debug aid: printf("<p>All links on this page:</p><pre>%s</pre>\n", var_export($linkresult, true));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment