Created
July 26, 2013 03:54
-
-
Save zubinJiang/6086031 to your computer and use it in GitHub Desktop.
PHP敏感词匹配核心库,本函数适用于GBK的编码匹配
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Core sensitive-word matching library. Intended for matching GBK-encoded text.
 *
 * 1. Generate the tree-structured word file:
 *    @name   execMmakeWordsTree($filePaths="./wordsTree.txt")
 *    @param  filePaths => path of the file to generate
 *    @return bool#gbk
 *
 * 2. Load the data stored in the tree file:
 *    @name   getWordsTree()
 *    @return Array#utf-8
 *
 * 3. Load the sensitive-word list from the database:
 *    @name   getWordsArr()
 *    @return Array#gbk
 *
 * 4. Check whether a piece of content contains sensitive words:
 *    @name   execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
 *    @param  message        => content to check #Str
 *    @param  isrReplaceChar => whether to remove every character other than Chinese characters before matching #bool
 *    @param  isrReplaceHtml => whether to remove HTML tags from the content #bool
 *    @param  isRturnStr     => whether to return the matched sensitive words #bool
 *    @param  isMatchAll     => whether to collect every word found in the content; when false, return as soon as the first word is matched #bool
 *    @param  wordsArr       => custom array of additional sensitive words #Array
 *    @return Str/bool#gbk
 *
 * @time   2013/7/15 22:37
 * @author jiang.zubin@onlylady.com
 */
/**
 * Usage example:
 * require './CalssCensor.php';
 * $censor = new FApp_SensitiveWord_CalssCensor();
 * $censor->execMmakeWordsTree("./wordsTree.txt"); // or any other public method: getWordsTree(), getWordsArr(), execMatchWord(...)
 */
// Remove the execution time limit (building or loading the tree may be slow).
set_time_limit(0);
// Display errors; report everything except notices.
ini_set('display_errors', 1);
error_reporting(E_ALL ^ E_NOTICE);
define('API_ROOT', realpath(dirname(__FILE__)));

// Built-in self test: request this file with ?type=test to run one match
// against a sample message and print the matched words plus the elapsed time.
// isset() guards against the parameter being absent (an "undefined array key"
// warning on PHP 8, which E_NOTICE filtering no longer hides).
if (isset($_REQUEST['type']) && $_REQUEST['type'] == "test") {
    // microtime(true) returns the current time as a float number of seconds.
    $start = microtime(true);
    $censor = new FApp_SensitiveWord_CalssCensor();
    $message = "<b>电话</b>155.5708.8321婷婷 ㊣新浪合作★—委托发布←155.5708.8321 婷婷敬候佳音 【155.5708.8321】婷婷 灵活管理、全城区无论酒店宾馆只要到位理 质量,资源更新快、迎合客人各口味、听话、漂亮、满意为止。 收费合理公道、只要我们能去的地方都会有安全护航、玩家无需担心 放心愉悦。<span>【155.5708.8321】</span>婷婷欢迎新老朋友 我们这里有最精致的资源找我们放松您的心,小姐";
    $rstr = $censor->execMatchWord(
        $message,
        true,     // isrReplaceChar: keep only Chinese characters before matching
        true,     // isrReplaceHtml: strip HTML tags
        true,     // isRturnStr: return the matched words
        false,    // isMatchAll: stop at the first match
        array()   // wordsArr: no custom words
    );
    $end = microtime(true);
    // The label says milliseconds ("毫秒"), so convert the seconds difference.
    echo "匹配到的敏感词:", $rstr, "\t执行耗费时间:", ($end - $start) * 1000, "毫秒";
}
// Other entry points, kept for reference:
//$str = $censor->execMmakeWordsTree($filePaths="./wordsTree.txt");
//$arr = $censor->getWordsTree();
//$arr = $censor->getWordsArr();
/**
 * Sensitive-word matcher for GBK-encoded content.
 *
 * Words are stored as a character tree: every UTF-8 character of a word is one
 * array level, and the node reached after the last character carries the full
 * word under the key "_word_". The tree is serialized to a file by
 * execMmakeWordsTree() and loaded again by getWordsTree().
 *
 * Incoming messages are expected in GBK; they are converted to UTF-8 for
 * matching, and matched words are converted back to GBK before being returned.
 */
class FApp_SensitiveWord_CalssCensor
{
    private $read_dbhost;
    private $read_dbuser;
    private $read_dbpw;
    private $read_dbname;
    // Path of the tree file; set by execMmakeWordsTree().
    private $filePaths;
    private $dbcharset = 'gbk';
    // Table-name prefix used when querying the word table.
    private $dbtabpre = 'pre_';
    // Open mysqli connection, or null before the first database access.
    private $read_db = null;

    // Root of the word tree currently used for matching.
    public $tree_arr = array();
    // Words matched during the current check.
    public $result = array();
    public $exist_arr = array();
    // Tree nodes of the partial matches that are still alive while scanning.
    public $tree_list = array();

    /**
     * Check a message for sensitive words.
     *
     * @param string $message        GBK-encoded content to check
     * @param bool   $isrReplaceChar when true, drop every character outside U+4E00..U+9FFF before matching
     * @param bool   $isrReplaceHtml when true, strip HTML tags before matching
     * @param bool   $isRturnStr     when true, return the matched word(s); otherwise return true on a match
     * @param bool   $isMatchAll     when true, collect every match; when false, stop at the first one
     * @param array  $wordsArr       extra words, matched case-insensitively against the raw message
     * @return string|bool matched word(s) (GBK, comma separated) or true on a match, false otherwise
     */
    public function execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
    {
        $rawMessage = (string) $message;

        // Start from a clean state so an earlier call cannot leak partial matches.
        $this->result = array();
        $this->tree_list = array();
        $this->tree_arr = $this->getWordsTree();

        // Custom words are matched against the untouched message.
        if ($wordsArr && is_array($wordsArr)) {
            foreach ($wordsArr as $row) {
                if (!is_scalar($row) || (string) $row === '') {
                    continue; // an empty needle would match everything
                }
                // stripos() returns 0 for a hit at the very start, so compare with false.
                if (stripos($rawMessage, (string) $row) !== false) {
                    return $isRturnStr ? $row : true;
                }
            }
        }

        $text = $isrReplaceHtml ? strip_tags($rawMessage) : $rawMessage;
        // iconv() returns false on failure; the cast turns that into an empty string.
        $text = (string) iconv("gbk", "utf-8//IGNORE", $text);
        if ($isrReplaceChar) {
            $matches = array();
            preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $text, $matches);
            $text = isset($matches[0]) ? implode('', $matches[0]) : '';
        }

        $found = $this->check($text, $isMatchAll);
        $matchedWords = $this->result;

        // Reset to empty arrays (not unset) so the properties stay countable.
        $this->result = array();
        $this->tree_list = array();

        if (!$found) {
            return false;
        }
        if (!$isRturnStr) {
            return true;
        }
        $gbkWords = iconv("utf-8", "gbk", implode(",", $matchedWords));
        // A match did happen even if the conversion back to GBK failed.
        return $gbkWords !== false ? $gbkWords : true;
    }

    /**
     * Scan a UTF-8 message character by character and collect matches in $this->result.
     *
     * @param string $message    UTF-8 text
     * @param bool   $isMatchAll when false, return as soon as one word is found
     * @return bool true when at least one word was found
     */
    private function check(&$message, $isMatchAll)
    {
        $byteLen = strlen($message);
        $pos = 0;
        while ($pos < $byteLen) {
            // Derive the character width from the UTF-8 lead byte instead of
            // assuming every non-ASCII character is three bytes long.
            $lead = ord($message[$pos]);
            if ($lead < 0x80) {
                $width = 1;
            } elseif (($lead & 0xE0) === 0xC0) {
                $width = 2;
            } elseif (($lead & 0xF0) === 0xE0) {
                $width = 3;
            } elseif (($lead & 0xF8) === 0xF0) {
                $width = 4;
            } else {
                $width = 1; // stray continuation byte: consume it alone
            }
            $char = substr($message, $pos, $width);
            $pos += $width;

            $this->getWord($char);
            if (!$isMatchAll && count($this->result) > 0) {
                $this->tree_list = array();
                return true;
            }
        }
        $this->tree_list = array();
        return count($this->result) > 0;
    }

    /**
     * Advance every live partial match by one character.
     *
     * Each node in $this->tree_list represents a word prefix matched so far.
     * The tree root is added as a candidate so a new word may start at this
     * character. Nodes without a child for $char are dropped; children that
     * carry "_word_" are recorded as matches.
     *
     * @param string $char one UTF-8 character
     */
    private function getWord($char)
    {
        $candidates = $this->tree_list;
        $candidates[] = $this->tree_arr;

        $alive = array();
        foreach ($candidates as $node) {
            if (!is_array($node) || !isset($node[$char]) || !is_array($node[$char])) {
                continue;
            }
            $child = $node[$char];
            if (!empty($child["_word_"])) {
                $this->result[] = $child["_word_"];
            }
            $alive[] = $child;
        }
        $this->tree_list = $alive;
    }

    /**
     * Load the word tree from the tree file.
     *
     * Reads the path given to execMmakeWordsTree() when one was set, otherwise
     * './wordsTree.txt'. The file must only ever be produced by this class:
     * unserialize() is not safe on untrusted data.
     *
     * @return array the tree, or an empty array when the file is missing or invalid
     */
    public function getWordsTree()
    {
        $path = $this->filePaths ? $this->filePaths : './wordsTree.txt';
        if (!is_file($path) || !is_readable($path)) {
            return array();
        }
        $raw = file_get_contents($path);
        if ($raw === false || $raw === '') {
            return array();
        }
        $tree = unserialize($raw);
        return is_array($tree) ? $tree : array();
    }

    /**
     * Read the sensitive-word list from the database.
     *
     * @return array list of words as stored in the "find" column (empty when none or on query failure)
     */
    public function getWordsArr()
    {
        $words = array();
        $link = $this->dbReadLink();
        $rows = mysqli_query($link, "select find from " . $this->dbtabpre . "common_word where 1");
        if (!$rows) {
            return $words;
        }
        while ($row = mysqli_fetch_assoc($rows)) {
            $words[] = $row["find"];
        }
        mysqli_free_result($rows);
        return $words;
    }

    /**
     * Build the word tree from the database and write it to $filePaths.
     *
     * @param string $filePaths target file; './wordsTree.txt' is used when empty
     * @return bool true when the file was written
     */
    public function execMmakeWordsTree($filePaths)
    {
        $this->filePaths = $filePaths;
        $this->dbReadLink();
        return $this->makeWordsTree() !== false;
    }

    /**
     * Convert every database word to UTF-8, insert it into a tree and
     * serialize the tree to the target file.
     *
     * @return int|false result of file_put_contents()
     */
    private function makeWordsTree()
    {
        $tree = array();
        foreach ($this->getWordsArr() as $gbkWord) {
            $word = iconv("gbk", "utf-8//IGNORE", $gbkWord);
            if ($word === false || $word === '') {
                continue;
            }
            $this->str2MutiArray($word, $tree);
        }
        $target = $this->filePaths ? $this->filePaths : "./wordsTree.txt";
        return file_put_contents($target, serialize($tree));
    }

    /**
     * Insert one UTF-8 word into the tree, one array level per character.
     *
     * Built with references rather than eval(), so quotes or backslashes in a
     * word cannot break or inject code.
     *
     * @param string $str UTF-8 word
     * @param array  $x   tree root, modified in place
     * @return array the tree root
     */
    private function str2MutiArray($str, &$x)
    {
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if (!$chars) {
            return $x; // empty or not valid UTF-8
        }
        $node = &$x;
        foreach ($chars as $ch) {
            if (!isset($node[$ch]) || !is_array($node[$ch])) {
                $node[$ch] = array();
            }
            $node = &$node[$ch];
        }
        $node['_word_'] = $str;
        unset($node); // break the reference into the tree
        return $x;
    }

    /**
     * Open (once) and return the connection to the word database.
     *
     * Uses mysqli because the original mysql_* functions were removed in PHP 7.
     * Terminates the script when the connection cannot be established.
     *
     * @return mysqli
     */
    private function dbReadLink()
    {
        // Reuse the connection instead of opening a new one on every call.
        if ($this->read_db) {
            return $this->read_db;
        }
        // Connection settings for the word database:
        // host, user name, password and database name.
        $config = array(
            'read_dbhost' => '',
            'read_dbuser' => '',
            'read_dbpw' => '',
            'read_dbname' => '',
        );
        $this->read_dbhost = $config['read_dbhost'];
        $this->read_dbuser = $config['read_dbuser'];
        $this->read_dbpw = $config['read_dbpw'];
        $this->read_dbname = $config['read_dbname'];

        // Keep return-value error handling on PHP 8.1+, where mysqli throws by default.
        mysqli_report(MYSQLI_REPORT_OFF);
        $link = mysqli_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
        if (!$link) die('Could not connect: ' . mysqli_connect_error());
        $this->read_db = $link;
        mysqli_query($link, 'SET character_set_connection='.$this->dbcharset.', character_set_results='.$this->dbcharset.', character_set_client=binary');
        mysqli_select_db($link, $this->read_dbname);
        return $this->read_db;
    }
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment