Created
October 19, 2012 07:36
-
-
Save zubinJiang/3916752 to your computer and use it in GitHub Desktop.
PHP中文分词
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
最常见的词语二分法: | |
// Demo: tokenise a sample sentence that mixes Chinese text, Latin letters and digits.
$sample = '这是我的网站www.7di.net!';
// If the input is GB2312-encoded, convert it to UTF-8 before tokenising:
// $sample = iconv('GB2312', 'UTF-8', $sample);
$result = spStr($sample);
// Dump the resulting token list.
print_r($result);
/**
 * Bigram (two-character) tokeniser for UTF-8 text containing Chinese.
 *
 * The input is broken into three kinds of tokens:
 *  - runs of ASCII digits,
 *  - runs of ASCII letters,
 *  - for every run of non-ASCII characters that is uninterrupted by
 *    punctuation, symbols or whitespace: each overlapping pair of adjacent
 *    characters, or the character itself when the run is a single character.
 *
 * Any other ASCII byte, and any Unicode punctuation/symbol/separator
 * (including full-width punctuation), only acts as a boundary.
 *
 * @param string $str UTF-8 encoded text. Non-ASCII runs that are not valid
 *                    UTF-8 are skipped; digits and letters are still returned.
 *
 * @return string[] Digit runs first, then letter runs, then the bigrams,
 *                  each group in order of appearance.
 */
function spStr($str)
{
    $str = (string) $str;

    // ASCII tokens are taken straight from the input; anything between them
    // is a separator by construction.
    preg_match_all('/[0-9]+/', $str, $numberMatches);
    preg_match_all('/[a-zA-Z]+/', $str, $wordMatches);

    // Byte-level match on purpose (no "u" modifier): it isolates the non-ASCII
    // runs even when the input is not valid UTF-8.
    preg_match_all('/[^\x00-\x7F]+/', $str, $runMatches);

    $cstr = [];
    foreach ($runMatches[0] as $run) {
        // Unicode property classes cover full-width and CJK punctuation
        // without a hand-maintained list of characters.
        $segments = preg_split('/[\p{P}\p{S}\p{Z}\p{Cc}\p{Cf}]+/u', $run, -1, PREG_SPLIT_NO_EMPTY);
        if ($segments === false) {
            // Not valid UTF-8: character boundaries cannot be determined.
            continue;
        }

        foreach ($segments as $segment) {
            // Split into real characters instead of assuming 3 bytes each.
            $chars = preg_split('//u', $segment, -1, PREG_SPLIT_NO_EMPTY);
            $count = count($chars);

            if ($count === 1) {
                // A lone character has no neighbour to pair with.
                $cstr[] = $chars[0];
                continue;
            }

            for ($i = 1; $i < $count; $i++) {
                $cstr[] = $chars[$i - 1] . $chars[$i];
            }
        }
    }

    return array_merge($numberMatches[0], $wordMatches[0], $cstr);
}
执行结果是:
Array ( [0] => 7 [1] => www [2] => di [3] => net [4] => 这是 [5] => 是我 [6] => 我的 [7] => 的网 [8] => 网站 ) | |
接下来,将以上结果转换为区位码,PHP 代码如下:
// Convert every token to GB2312 and translate Chinese tokens into zone codes.
// The list starts empty so that an empty $result still yields a valid
// (empty) string instead of passing an undefined variable to implode().
$codes = [];
foreach ($result as $token) {
    $gbToken = iconv('UTF-8', 'GB2312', $token);
    // iconv() returns false when the token has no GB2312 representation;
    // keep the original token so it does not silently vanish from the output.
    $codes[] = ($gbToken === false) ? $token : gbCode($gbToken);
}
// Space-separated list of zone codes and untouched ASCII tokens.
$code = implode(' ', $codes);
echo $code;
/**
 * Convert a GB2312-encoded string into its zone-code form.
 *
 * Each two-byte character becomes four decimal digits: the first byte minus
 * 160 and the second byte minus 160, each zero-padded to two digits.
 *
 * @param string $str GB2312-encoded bytes.
 *
 * @return mixed The concatenated zone codes, or $str unchanged when it is not
 *               a string made up solely of complete two-byte sequences in the
 *               0xA1-0xFE range (e.g. plain ASCII tokens).
 */
function gbCode($str)
{
    // \z (not $) so a trailing newline cannot slip through, and whole byte
    // pairs only so the loop below never reads past the end of the string.
    if (!is_string($str) || !preg_match('/^(?:[\xA1-\xFE]{2})+\z/', $str)) {
        return $str;
    }

    $zoneCodes = '';
    foreach (str_split($str, 2) as $pair) {
        // 160 (0xA0) is the offset between a GB2312 byte and its zone/position number.
        $zoneCodes .= sprintf('%02d%02d', ord($pair[0]) - 160, ord($pair[1]) - 160);
    }

    return $zoneCodes;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment