Created
June 2, 2012 16:22
-
-
Save zhangguiqiang/2859043 to your computer and use it in GitHub Desktop.
PHP:crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//获得当前的脚本网址 | |
function get_php_url(){ | |
if(!empty($_SERVER["REQUEST_URI"])){ | |
$scriptName = $_SERVER["REQUEST_URI"]; | |
$nowurl = $scriptName; | |
}else{ | |
$scriptName = $_SERVER["PHP_SELF"]; | |
if(empty($_SERVER["QUERY_STRING"])) $nowurl = $scriptName; | |
else $nowurl = $scriptName."?".$_SERVER["QUERY_STRING"]; | |
} | |
return $nowurl; | |
} | |
//把全角数字转为半角数字 | |
function GetAlabNum($fnum){ | |
$nums = array("0","1","2","3","4","5","6","7","8","9"); | |
$fnums = "0123456789"; | |
for($i=0;$i<=9;$i++) $fnum = str_replace($nums[$i],$fnums[$i],$fnum); | |
$fnum = ereg_replace("[^0-9\.]|^0{1,}","",$fnum); | |
if($fnum=="") $fnum=0; | |
return $fnum; | |
} | |
//去除HTML标记 | |
function Text2Html($txt){ | |
$txt = str_replace(" "," ",$txt); | |
$txt = str_replace("<","<",$txt); | |
$txt = str_replace(">",">",$txt); | |
$txt = preg_replace("/[\r\n]{1,}/isU","<br/>\r\n",$txt); | |
return $txt; | |
} | |
//清除HTML标记 | |
function ClearHtml($str){ | |
$str = str_replace('<','<',$str); | |
$str = str_replace('>','>',$str); | |
return $str; | |
} | |
//相对路径转化成绝对路径 | |
function relative_to_absolute($content, $feed_url) { | |
preg_match('/(http|https|ftp):\/\//', $feed_url, $protocol); | |
$server_url = preg_replace("/(http|https|ftp|news):\/\//", "", $feed_url); | |
$server_url = preg_replace("/\/.*/", "", $server_url); | |
if ($server_url == '') { | |
return $content; | |
} | |
if (isset($protocol[0])) { | |
$new_content = preg_replace('/href="\//', 'href="'.$protocol[0].$server_url.'/', $content); | |
$new_content = preg_replace('/src="\//', 'src="'.$protocol[0].$server_url.'/', $new_content); | |
} else { | |
$new_content = $content; | |
} | |
return $new_content; | |
} | |
//取得所有链接 | |
function get_all_url($code){ | |
preg_match_all('/<a\s+href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/i',$code,$arr); | |
return array('name'=>$arr[2],'url'=>$arr[1]); | |
} | |
//获取指定标记中的内容 | |
function get_tag_data($str, $start, $end){ | |
if ( $start == '' || $end == '' ){ | |
return; | |
} | |
$str = explode($start, $str); | |
$str = explode($end, $str[1]); | |
return $str[0]; | |
} | |
//HTML表格的每行转为CSV格式数组 | |
function get_tr_array($table) { | |
$table = preg_replace("'<td[^>]*?>'si",'"',$table); | |
$table = str_replace("</td>",'",',$table); | |
$table = str_replace("</tr>","{tr}",$table); | |
//去掉 HTML 标记 | |
$table = preg_replace("'<[\/\!]*?[^<>]*?>'si","",$table); | |
//去掉空白字符 | |
$table = preg_replace("'([\r\n])[\s]+'","",$table); | |
$table = str_replace(" ","",$table); | |
$table = str_replace(" ","",$table); | |
$table = explode(",{tr}",$table); | |
array_pop($table); | |
return $table; | |
} | |
//将HTML表格的每行每列转为数组,采集表格数据 | |
function get_td_array($table) { | |
$table = preg_replace("'<table[^>]*?>'si","",$table); | |
$table = preg_replace("'<tr[^>]*?>'si","",$table); | |
$table = preg_replace("'<td[^>]*?>'si","",$table); | |
$table = str_replace("</tr>","{tr}",$table); | |
$table = str_replace("</td>","{td}",$table); | |
//去掉 HTML 标记 | |
$table = preg_replace("'<[\/\!]*?[^<>]*?>'si","",$table); | |
//去掉空白字符 | |
$table = preg_replace("'([\r\n])[\s]+'","",$table); | |
$table = str_replace(" ","",$table); | |
$table = str_replace(" ","",$table); | |
$table = explode('{tr}', $table); | |
array_pop($table); | |
foreach ($table as $key=>$tr) { | |
$td = explode('{td}', $tr); | |
array_pop($td); | |
$td_array[] = $td; | |
} | |
return $td_array; | |
} | |
//返回字符串中的所有单词 $distinct=true 去除重复 | |
function split_en_str($str,$distinct=true) { | |
preg_match_all('/([a-zA-Z]+)/',$str,$match); | |
if ($distinct == true) { | |
$match[1] = array_unique($match[1]); | |
} | |
sort($match[1]); | |
return $match[1]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment