Skip to content

Instantly share code, notes, and snippets.

@zubinJiang
Created July 26, 2013 03:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zubinJiang/6086031 to your computer and use it in GitHub Desktop.
Save zubinJiang/6086031 to your computer and use it in GitHub Desktop.
PHP敏感词匹配核心库,本函数适用于GBK的编码匹配
<?php
/**
* @todo 敏感词匹配核心库,本函数适用于GBK的编码匹配
*
* @fun1.生成Tree型文件方法:
* @name execMmakeWordsTree($filePaths="./wordsTree.txt")
* @param filePaths => 生成文件的路径
* @return bool#gbk
*
* @fun2.获取Tree型文件数据方法:
* @name getWordsTree()
* @return Array#utf-8
*
* @fun3.获取数据库中敏感词库方法:
* @name getWordsArr()
* @return Array#gbk
*
* @fun4.执行检查内容中是否存在敏感词方法:
* @name execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
* @param message => 匹配的内容#Str
* @param isrReplaceChar => 是否先移除内容中汉字以外的字符再匹配#bool
* @param isrReplaceHtml => 是否移除内容中html标签#bool
* @param isRturnStr => 是否返回匹配出的敏感词#bool
* @param isMatchAll => 是否匹配出内容中出现所有的词,false情况下,匹配出第一个词就Return#bool
* @param wordsArr => 自定义敏感词数组#Array
* @return Str/bool#gbk
*
* @time 2013/7/15 22:37
* @author jiang.zubin@onlylady.com
*/
/**
* 调度案例:
* require './CalssCensor.php';
* $censor = new FApp_SensitiveWord_CalssCensor();
* $censor->fun1();
*/
// Remove the execution time limit: matching against a large word tree can be slow.
set_time_limit(0);
// Display errors, reporting everything except notices.
ini_set('display_errors', 1);
error_reporting(E_ALL ^ E_NOTICE);
define('API_ROOT', realpath(dirname(__FILE__)));

// Manual smoke test: request this script with ?type=test to run one match
// against a sample message and print the matched word plus the elapsed time.
// isset() guards the lookup so a request without "type" raises no warning.
if (isset($_REQUEST['type']) && $_REQUEST['type'] === "test") {
    // microtime(true) returns the current time in seconds as a float.
    $start = microtime(true);
    $censor = new FApp_SensitiveWord_CalssCensor();
    $rstr = $censor->execMatchWord($message="<b>电话</b>155.5708.8321婷婷 ㊣新浪合作★—委托发布←155.5708.8321 婷婷敬候佳音 【155.5708.8321】婷婷 灵活管理、全城区无论酒店宾馆只要到位理 质量,资源更新快、迎合客人各口味、听话、漂亮、满意为止。 收费合理公道、只要我们能去的地方都会有安全护航、玩家无需担心 放心愉悦。<span>【155.5708.8321】</span>婷婷欢迎新老朋友 我们这里有最精致的资源找我们放松您的心,小姐", $isrReplaceChar=true, $isrReplaceHtml=true, $isRturnStr=true, $isMatchAll=false, $wordsArr=array());
    $end = microtime(true);
    // The label printed below says milliseconds, so convert the seconds difference.
    $elapsedMs = ($end - $start) * 1000;
    echo "匹配到的敏感词:",$rstr,"\t执行耗费时间:",$elapsedMs."毫秒";
}
// Other entry points, kept for reference:
//$str = $censor->execMmakeWordsTree($filePaths="./wordsTree.txt");
//$arr = $censor->getWordsTree();
//$arr = $censor->getWordsArr();
/**
 * Sensitive-word matcher for GBK-encoded text.
 *
 * Words are stored in a character trie (nested arrays keyed by one UTF-8
 * character each; a node that ends a word carries the word under the
 * "_word_" key). The trie is serialized to ./wordsTree.txt by
 * execMmakeWordsTree() and loaded again on every execMatchWord() call.
 * Incoming messages are converted from GBK to UTF-8 before matching and
 * matched words are converted back to GBK before being returned.
 */
class FApp_SensitiveWord_CalssCensor
{
    private $read_dbhost;
    private $read_dbuser;
    private $read_dbpw;
    private $read_dbname;
    private $filePaths;
    private $dbcharset = 'gbk';
    private $dbtabpre = 'pre_';
    private $read_db = null;

    // Trie loaded from the serialized tree file (UTF-8 keys).
    public $tree_arr = array();
    // Words matched during the current check (UTF-8).
    public $result = array();
    public $exist_arr = array();
    // Trie nodes of the partial matches currently in progress.
    public $tree_list = array();

    /**
     * Check whether a message contains a sensitive word.
     *
     * @param string $message        GBK-encoded content to inspect.
     * @param bool   $isrReplaceChar Keep only characters in U+4E00..U+9FFF before matching.
     * @param bool   $isrReplaceHtml Strip HTML tags before matching.
     * @param bool   $isRturnStr     Return the matched word(s) instead of just true.
     * @param bool   $isMatchAll     Collect every match; when false, stop at the first one.
     * @param array  $wordsArr       Extra caller-supplied words, searched case-insensitively
     *                               in the raw, unprocessed message before the trie is used.
     * @return string|bool Matched word(s) (GBK, comma separated) or true on a hit, false otherwise.
     */
    public function execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
    {
        $rawMessage = (string) $message;

        // Start every call from clean state so the object can be reused safely.
        $this->result = array();
        $this->tree_list = array();
        $this->tree_arr = $this->getWordsTree();

        $text = $isrReplaceHtml ? strip_tags($rawMessage) : $rawMessage;
        // iconv() may return false on undecodable input; treat that as an empty message.
        $text = (string) iconv("gbk", "utf-8//IGNORE", $text);
        if ($isrReplaceChar) {
            $matches = array();
            preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $text, $matches);
            $text = isset($matches[0]) ? join('', $matches[0]) : '';
        }

        if ($wordsArr && is_array($wordsArr)) {
            foreach ($wordsArr as $row) {
                if (!is_scalar($row) || (string) $row === '') {
                    continue;
                }
                // stripos() returns 0 for a hit at the very start, so compare against false.
                if (stripos($rawMessage, (string) $row) !== false) {
                    return $isRturnStr ? $row : true;
                }
            }
        }

        $matched = $this->check($text, $isMatchAll);
        $found = $this->result;
        // Reset rather than unset(): the property must stay an array for later calls.
        $this->result = array();

        if (!$matched) {
            return false;
        }
        if (!$isRturnStr || !$found) {
            return true;
        }
        $joined = iconv("utf-8", "gbk//IGNORE", implode(",", $found));
        // A failed conversion must not turn a positive match into "no match".
        return ($joined === false || $joined === '') ? true : $joined;
    }

    /**
     * Scan a UTF-8 message one character at a time and collect trie matches in $this->result.
     *
     * @param string $message    UTF-8 text.
     * @param bool   $isMatchAll When false, stop as soon as one word has been found.
     * @return bool True when at least one word matched.
     */
    private function check($message, $isMatchAll)
    {
        $this->tree_list = array();
        $message = (string) $message;
        $length = strlen($message);
        $offset = 0;
        while ($offset < $length) {
            $width = $this->utf8CharWidth(ord($message[$offset]));
            // substr() never reads past the end, even for a truncated trailing sequence.
            $char = substr($message, $offset, $width);
            $offset += $width;
            $this->getWord($char);
            if (!$isMatchAll && count($this->result) > 0) {
                break;
            }
        }
        // Partial matches must not leak into the next scan.
        $this->tree_list = array();
        return count($this->result) > 0;
    }

    /**
     * Byte length of the UTF-8 sequence introduced by the given lead byte.
     * Invalid or continuation bytes are consumed one at a time.
     */
    private function utf8CharWidth($leadByte)
    {
        if ($leadByte >= 0xF0 && $leadByte < 0xF8) {
            return 4;
        }
        if ($leadByte >= 0xE0 && $leadByte < 0xF0) {
            return 3;
        }
        if ($leadByte >= 0xC0 && $leadByte < 0xE0) {
            return 2;
        }
        return 1;
    }

    /**
     * Feed one character to the matcher.
     *
     * A new candidate is started at the trie root whenever the root has a
     * child for the character; every candidate is then advanced by the
     * character. Candidates without a matching child are dropped, and each
     * node that ends a word appends that word to $this->result.
     */
    private function getWord($char)
    {
        $candidates = $this->tree_list;
        if (is_array($this->tree_arr) && isset($this->tree_arr[$char])) {
            $candidates[] = $this->tree_arr;
        }

        $alive = array();
        foreach ($candidates as $node) {
            if (!is_array($node) || !isset($node[$char]) || !is_array($node[$char])) {
                continue;
            }
            $next = $node[$char];
            if (isset($next["_word_"]) && $next["_word_"] !== '') {
                $this->result[] = $next["_word_"];
            }
            $alive[] = $next;
        }
        $this->tree_list = $alive;
    }

    /**
     * Load the serialized trie from ./wordsTree.txt (relative to the working directory).
     *
     * NOTE(review): unserialize() must only ever see a file produced by
     * execMmakeWordsTree(); never point this at user-supplied data.
     *
     * @return array The trie, or an empty array when the file is missing or not a valid tree.
     */
    public function getWordsTree()
    {
        $path = './wordsTree.txt';
        if (!is_file($path) || !is_readable($path)) {
            return array();
        }
        $raw = file_get_contents($path);
        if ($raw === false || $raw === '') {
            return array();
        }
        $tree = unserialize($raw);
        return is_array($tree) ? $tree : array();
    }

    /**
     * Read the sensitive-word list from the database.
     *
     * @return array Values of the "find" column, as delivered by the connection (GBK result charset).
     */
    public function getWordsArr()
    {
        $words = array();
        $rows = $this->queryWords();
        if ($rows === false) {
            return $words;
        }
        while ($row = mysqli_fetch_assoc($rows)) {
            $words[] = $row["find"];
        }
        mysqli_free_result($rows);
        return $words;
    }

    /**
     * Build the trie from the database and write it to a file.
     *
     * @param string $filePaths Target file; "./wordsTree.txt" is used when empty.
     * @return bool True when the file was written.
     */
    public function execMmakeWordsTree($filePaths)
    {
        $this->filePaths = $filePaths;
        return $this->makeWordsTree() !== false;
    }

    /**
     * Fetch every word, insert it into a fresh trie and serialize the trie to disk.
     *
     * @return int|bool Bytes written, or false when the query or the write failed.
     */
    private function makeWordsTree()
    {
        $rows = $this->queryWords();
        if ($rows === false) {
            // Do not overwrite an existing tree file with an empty tree.
            return false;
        }
        $tree = array();
        while ($row = mysqli_fetch_assoc($rows)) {
            $word = iconv("gbk", "utf-8//IGNORE", $row["find"]);
            if ($word === false || $word === '') {
                continue;
            }
            $this->str2MutiArray($word, $tree);
        }
        mysqli_free_result($rows);

        $target = $this->filePaths ? $this->filePaths : "./wordsTree.txt";
        return file_put_contents($target, serialize($tree));
    }

    /**
     * Run the word-list query shared by getWordsArr() and makeWordsTree().
     *
     * @return mysqli_result|bool Result set, or false when the query failed.
     */
    private function queryWords()
    {
        $link = $this->dbReadLink();
        return mysqli_query($link, "select find from " . $this->dbtabpre . "common_word where 1");
    }

    /**
     * Insert one UTF-8 word into the trie, one character per level.
     *
     * The tree is built by walking references instead of eval(), so words
     * containing quotes or backslashes are stored verbatim.
     *
     * @param string $str UTF-8 word.
     * @param array  $x   Trie root, modified in place.
     * @return array The updated trie.
     */
    private function str2MutiArray($str, &$x)
    {
        $str = (string) $str;
        if ($str === '') {
            return $x;
        }
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // Not valid UTF-8: skip the word rather than corrupt the tree.
            return $x;
        }
        if (!is_array($x)) {
            $x = array();
        }

        $node = &$x;
        foreach ($chars as $ch) {
            if (!isset($node[$ch]) || !is_array($node[$ch])) {
                $node[$ch] = array();
            }
            $node = &$node[$ch];
        }
        $node['_word_'] = $str;
        unset($node);

        return $x;
    }

    /**
     * Open (or reuse) the read connection to the word database.
     *
     * Fill in $config below: read_dbhost (server address), read_dbuser,
     * read_dbpw and read_dbname. Terminates the script when the connection
     * cannot be established.
     *
     * @return mysqli The connection handle.
     */
    private function dbReadLink()
    {
        if ($this->read_db) {
            return $this->read_db;
        }

        $config = array(
            'read_dbhost' => '',
            'read_dbuser' => '',
            'read_dbpw' => '',
            'read_dbname' => '',
        );
        $this->read_dbhost = $config['read_dbhost'];
        $this->read_dbuser = $config['read_dbuser'];
        $this->read_dbpw = $config['read_dbpw'];
        $this->read_dbname = $config['read_dbname'];

        $link = mysqli_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
        if (!$link) {
            die('Could not connect: ' . mysqli_connect_error());
        }
        // Results come back in the configured charset; the client charset stays binary.
        mysqli_query($link, 'SET character_set_connection='.$this->dbcharset.', character_set_results='.$this->dbcharset.', character_set_client=binary');
        mysqli_select_db($link, $this->read_dbname);

        $this->read_db = $link;
        return $this->read_db;
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment