hzhopen/检测一个网页上的链接是否都可以正常访问

## 检测一个网页上的链接是否都可以正常访问
<?php
// Check whether every link on a web page can be reached.

// Address of the page to test:

$viewUrl = "";
// Base URL handed to _expandlinks() so relative links can be made absolute
$host = "";


/**
 * Extract the href targets of all <a> tags found in an HTML document.
 *
 * Quoted href values are listed first, followed by unquoted ones; empty
 * values are skipped.
 *
 * @param string $document HTML source to scan
 * @return array list of raw href values (empty array when none are found)
 */
function _striplinks($document) {
	preg_match_all("'<\s*a\s.*?href\s*=\s*([\"\'])?(?(1) (.*?)\\1 | ([^\s\>]+))'isx", $document, $links);
	// Always return an array, even when the document contains no links
	$match = array();
	// Group 2 holds quoted href values, group 3 the unquoted ones
	foreach (array(2, 3) as $group) {
		if (empty($links[$group])) {
			continue;
		}
		foreach ($links[$group] as $val) {
			if (!empty($val)) {
				$match[] = $val;
			}
		}
	}
	// return the links
	return $match;
}
/*===================================================================*\
	Function:	_expandlinks
	Purpose:	expand each link into a fully qualified URL, resolving
				relative links against the directory of $URI
	Input:		$links			a link (string) or list of links (array) to qualify
				$URI			the full URI to get the base from
	Output:		$expandedLinks	the expanded links, in the same shape as $links;
								$links is returned unchanged when $URI has no
								scheme or host to qualify against
\*===================================================================*/
function _expandlinks($links,$URI){
	$URI_PARTS = parse_url($URI);
	// Without a scheme and a host there is no base to expand against
	if (!is_array($URI_PARTS) || empty($URI_PARTS["scheme"]) || empty($URI_PARTS["host"])) {
		return $links;
	}

	// host[:port] -- the port is kept so links on non-default ports stay valid
	$authority = $URI_PARTS["host"];
	if (isset($URI_PARTS["port"])) {
		$authority .= ":".$URI_PARTS["port"];
	}
	// Prefix for root-relative links ("/foo")
	$match_root = $URI_PARTS["scheme"]."://".$authority;

	// Directory of the base document: drop a trailing "/file.ext", then a trailing "/".
	// Only the path is examined so a bare "http://host" is not mistaken for a file name.
	$path = isset($URI_PARTS["path"]) ? $URI_PARTS["path"] : "";
	$path = preg_replace('|/[^\/\.]+\.[^\/\.]+$|', "", $path);
	$path = preg_replace('|/$|', "", $path);
	$match = $match_root.$path;

	$search = array(
		// same-host absolute link -> root-relative; the lookahead stops
		// "host.tld.cn" or "host:8080" from matching a "host.tld" base
		"|^http://".preg_quote($authority, "|")."(?=/)|i",
		// root-relative link -> prefix with scheme://host
		'|^(\/)|i',
		// anything that is not http(s):// or mailto: is relative to the base directory
		'|^(?!https?://)(?!mailto:)|i',
		// collapse "/./"
		'|/\./|',
		// collapse "/dir/../"
		'|/[^\/]+/\.\./|'
	);
	$replace = array(
		"",
		$match_root."/",
		$match."/",
		"/",
		"/"
	);
	$expandedLinks = preg_replace($search,$replace,$links);
	return $expandedLinks;
}

/**
 * Request a URL and print whether it is reachable.
 *
 * The final HTTP status is used: get_headers() follows redirects and returns
 * the header lines of every response in order, so the last status line found
 * belongs to the response that was finally served.
 *
 * @param string $url fully qualified URL to probe
 * @param mixed  $key label printed in front of the result
 * @return void
 */
function _checkOk($url, $key){
	// get_headers() raises a warning and returns false when the request fails
	$headers = @get_headers($url);
	if ($headers === false) {
		echo "\n".$key;
		echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Request Failed </div>';
		echo "\n";
		return;
	}

	// Keep the status code of the last response in the redirect chain
	$status = 0;
	foreach ($headers as $line) {
		if (preg_match('|^HTTP/\S+\s+(\d{3})|i', $line, $parts)) {
			$status = (int) $parts[1];
		}
	}

	if ($status === 404) {
		echo "\n".$key;
		echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Not Found </div>';
		echo "\n";
	} else {
		echo "\n".$key;
		echo "---- ok \n";
	}
}


/*
 * Fetch the page with cURL, collect every link on it, and check each one.
 */
$ch = curl_init();
// Request the page configured in $viewUrl
curl_setopt($ch, CURLOPT_URL, $viewUrl);
// Keep the response headers in the returned string
curl_setopt($ch, CURLOPT_HEADER, 1);
// Return the response instead of printing it
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$html = curl_exec($ch);
$info = curl_getinfo($ch);
if ($html === false) {
	echo "cURL Error: " . curl_error($ch);
}
curl_close($ch);

// There is nothing to parse when the fetch failed
$linkarr = ($html === false) ? array() : _striplinks($html);

// Relative links are completed against $host; fall back to the page address when it is unset
$base = ($host !== "") ? $host : $viewUrl;
$linkresult = array();
if (is_array($linkarr)) {
	foreach ($linkarr as $k => $v) {
		$linkresult[$k] = _expandlinks($v, $base);
	}
}

// Check each distinct URL only once
$linkresult = array_unique($linkresult);

echo '<pre>';
foreach ($linkresult as $key => $value){
	echo $key."--".$value;
	_checkOk($value,$key);
}

echo "\n ==============RUN OVER=============== \n";
// Debug aid: dump every collected link
//printf("<p>All links on this page:</p><pre>%s</pre>\n", var_export($linkresult , true));
	<?php
	// Check whether every link on a web page can be reached.

	// Address of the page to test:

	$viewUrl = "";
	// Base URL handed to _expandlinks() so relative links can be made absolute
	$host = "";


	/**
	 * Extract the href targets of all <a> tags found in an HTML document.
	 *
	 * Quoted href values are listed first, followed by unquoted ones; empty
	 * values are skipped.
	 *
	 * @param string $document HTML source to scan
	 * @return array list of raw href values (empty array when none are found)
	 */
	function _striplinks($document) {
		preg_match_all("'<\s*a\s.*?href\s*=\s*([\"\'])?(?(1) (.*?)\\1 | ([^\s\>]+))'isx", $document, $links);
		// Always return an array, even when the document contains no links
		$match = array();
		// Group 2 holds quoted href values, group 3 the unquoted ones
		foreach (array(2, 3) as $group) {
			if (empty($links[$group])) {
				continue;
			}
			foreach ($links[$group] as $val) {
				if (!empty($val)) {
					$match[] = $val;
				}
			}
		}
		// return the links
		return $match;
	}
	/*===================================================================*\
		Function:	_expandlinks
		Purpose:	expand each link into a fully qualified URL, resolving
					relative links against the directory of $URI
		Input:		$links			a link (string) or list of links (array) to qualify
					$URI			the full URI to get the base from
		Output:		$expandedLinks	the expanded links, in the same shape as $links;
									$links is returned unchanged when $URI has no
									scheme or host to qualify against
	\*===================================================================*/
	function _expandlinks($links,$URI){
		$URI_PARTS = parse_url($URI);
		// Without a scheme and a host there is no base to expand against
		if (!is_array($URI_PARTS) || empty($URI_PARTS["scheme"]) || empty($URI_PARTS["host"])) {
			return $links;
		}

		// host[:port] -- the port is kept so links on non-default ports stay valid
		$authority = $URI_PARTS["host"];
		if (isset($URI_PARTS["port"])) {
			$authority .= ":".$URI_PARTS["port"];
		}
		// Prefix for root-relative links ("/foo")
		$match_root = $URI_PARTS["scheme"]."://".$authority;

		// Directory of the base document: drop a trailing "/file.ext", then a trailing "/".
		// Only the path is examined so a bare "http://host" is not mistaken for a file name.
		$path = isset($URI_PARTS["path"]) ? $URI_PARTS["path"] : "";
		$path = preg_replace('|/[^\/\.]+\.[^\/\.]+$|', "", $path);
		$path = preg_replace('|/$|', "", $path);
		$match = $match_root.$path;

		$search = array(
			// same-host absolute link -> root-relative; the lookahead stops
			// "host.tld.cn" or "host:8080" from matching a "host.tld" base
			"|^http://".preg_quote($authority, "|")."(?=/)|i",
			// root-relative link -> prefix with scheme://host
			'|^(\/)|i',
			// anything that is not http(s):// or mailto: is relative to the base directory
			'|^(?!https?://)(?!mailto:)|i',
			// collapse "/./"
			'|/\./|',
			// collapse "/dir/../"
			'|/[^\/]+/\.\./|'
		);
		$replace = array(
			"",
			$match_root."/",
			$match."/",
			"/",
			"/"
		);
		$expandedLinks = preg_replace($search,$replace,$links);
		return $expandedLinks;
	}

	/**
	 * Request a URL and print whether it is reachable.
	 *
	 * The final HTTP status is used: get_headers() follows redirects and returns
	 * the header lines of every response in order, so the last status line found
	 * belongs to the response that was finally served.
	 *
	 * @param string $url fully qualified URL to probe
	 * @param mixed  $key label printed in front of the result
	 * @return void
	 */
	function _checkOk($url, $key){
		// get_headers() raises a warning and returns false when the request fails
		$headers = @get_headers($url);
		if ($headers === false) {
			echo "\n".$key;
			echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Request Failed </div>';
			echo "\n";
			return;
		}

		// Keep the status code of the last response in the redirect chain
		$status = 0;
		foreach ($headers as $line) {
			if (preg_match('|^HTTP/\S+\s+(\d{3})|i', $line, $parts)) {
				$status = (int) $parts[1];
			}
		}

		if ($status === 404) {
			echo "\n".$key;
			echo '---<div style=" color: red; font-size: 18px; font-weight: bold;"> Not Found </div>';
			echo "\n";
		} else {
			echo "\n".$key;
			echo "---- ok \n";
		}
	}




	/*
	 * Fetch the page with cURL, collect every link on it, and check each one.
	 */
	$ch = curl_init();
	// Request the page configured in $viewUrl
	curl_setopt($ch, CURLOPT_URL, $viewUrl);
	// Keep the response headers in the returned string
	curl_setopt($ch, CURLOPT_HEADER, 1);
	// Return the response instead of printing it
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	$html = curl_exec($ch);
	$info = curl_getinfo($ch);
	if ($html === false) {
		echo "cURL Error: " . curl_error($ch);
	}
	curl_close($ch);

	// There is nothing to parse when the fetch failed
	$linkarr = ($html === false) ? array() : _striplinks($html);

	// Relative links are completed against $host; fall back to the page address when it is unset
	$base = ($host !== "") ? $host : $viewUrl;
	$linkresult = array();
	if (is_array($linkarr)) {
		foreach ($linkarr as $k => $v) {
			$linkresult[$k] = _expandlinks($v, $base);
		}
	}

	// Check each distinct URL only once
	$linkresult = array_unique($linkresult);

	echo '<pre>';
	foreach ($linkresult as $key => $value){
		echo $key."--".$value;
		_checkOk($value,$key);
	}

	echo "\n ==============RUN OVER=============== \n";
	// Debug aid: dump every collected link
	//printf("<p>All links on this page:</p><pre>%s</pre>\n", var_export($linkresult , true));