zubinJiang/gist:6086031

## gistfile1.txt
<?php
/**
 * @todo 敏感词匹配核心库，本函数适用于GBK的编码匹配
 *
 * @fun1.生产成Tree型文件方法：
 *		 @name execMmakeWordsTree($filePaths="./wordsTree.txt")
 *       @param filePaths => 生成文件的路径
 *       @return bool#gbk
 *
 * @fun2.获取Tree型文件数据方法：
 *		 @name getWordsTree()
 *		 @return Array#utf-8
 *
 * @fun3.获取数据库中敏感词库方法：
 *		 @name getWordsArr()
 *		 @return Array#gbk
 *
 * @fun4.执行检查内容中是否存在敏感词方法：
 *		 @name execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
 *		 @param message => 匹配的内容#Str
 *		 @param isrReplaceChar => 是否移内容中除汉字以外的字符再匹配#bool
 *		 @param isrReplaceHtml => 是否移除内容中html标签#bool
 *		 @param isRturnStr => 是否返回匹配出的敏感词#bool
 *       @param isMatchAll => 是否匹配出内容中出现所有的词，false情况下，匹配出第一个词就Return#bool
 *		 @param wordsArr => 自定定义敏感词数组#Array
 *       @return Str/bool#gbk
 *
 * @time 2013/7/15 22:37
 * @author jiang.zubin@onlylady.com
 */

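/**
 * Usage example:
 * require './CalssCensor.php';
 * $censor = new FApp_SensitiveWord_CalssCensor();
 * $censor->execMmakeWordsTree('./wordsTree.txt');                              // fun1: build the tree file from the database
 * $hit = $censor->execMatchWord($message, true, true, true, false, array());  // fun4: scan a GBK-encoded message
 */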

// Remove the execution time limit.
set_time_limit(0);

// Display errors and report everything except notices.
ini_set('display_errors',1);
error_reporting(E_ALL ^ E_NOTICE);
// Guarded so that including this file twice does not try to redefine the constant.
if (!defined('API_ROOT')) {
	define('API_ROOT', realpath(dirname(__FILE__)));
}


// Built-in self test: request this script with ?type=test to run one match
// against a sample message and print the matched word plus the elapsed time.
if (isset($_REQUEST['type']) && $_REQUEST['type'] === "test") {

	// microtime(true) returns the current time in seconds as a float.
	$start = microtime(true);

	$censor = new FApp_SensitiveWord_CalssCensor();
	$rstr = $censor->execMatchWord($message="<b>电话</b>155.5708.8321婷婷 ㊣新浪合作★—委托发布←155.5708.8321 婷婷敬候佳音 【155.5708.8321】婷婷 灵活管理、全城区无论酒店宾馆只要到位理　质量，资源更新快、迎合客人各口味、听话、漂亮、满意为止。 收费合理公道、只要我们能去的地方都会有安全护航、玩家无需担心 放心愉悦。<span>【155.5708.8321】</span>婷婷欢迎新老朋友 我们这里有最精致的资源找我们放松您的心，小姐", $isrReplaceChar=true, $isrReplaceHtml=true, $isRturnStr=true, $isMatchAll=false, $wordsArr=array());

	$end = microtime(true);

	// The difference is in seconds; convert it so the number agrees with the
	// "毫秒" (milliseconds) label that is printed next to it.
	$elapsedMs = ($end - $start) * 1000;

	echo "匹配到的敏感词:",$rstr,"\t执行耗费时间:",$elapsedMs."毫秒";
}
//$str = $censor->execMmakeWordsTree($filePaths="./wordsTree.txt");
//$arr = $censor->getWordsTree();
//$arr = $censor->getWordsArr();

/**
 * Sensitive-word filter for GBK-encoded text.
 *
 * The word list is read from the database table "<prefix>common_word"
 * (column "find"), converted to UTF-8 and compiled into a nested-array trie
 * keyed by single characters. A node that terminates a word carries the whole
 * word under the key "_word_". The trie is stored with serialize() in a file
 * ("./wordsTree.txt" unless another path was given to execMmakeWordsTree()).
 *
 * Messages are accepted in GBK, converted to UTF-8 for matching, and matched
 * words are converted back to GBK before being returned.
 */
class FApp_SensitiveWord_CalssCensor
{
	// Connection settings of the word database; populated by dbReadLink().
	private $read_dbhost;
	private $read_dbuser;
	private $read_dbpw;
	private $read_dbname;
	// Path of the serialized trie file chosen via execMmakeWordsTree().
	private $filePaths;

	private $dbcharset = 'gbk';
	private $dbtabpre = 'pre_';

	// mysqli link, opened lazily by dbReadLink().
	private $read_db = null;

	// Trie loaded from the tree file.
	public $tree_arr = array();
	// Words matched during the current scan (UTF-8).
	public $result = array();
	public $exist_arr = array();
	// Trie nodes of the partial matches that are still alive during a scan.
	public $tree_list = array();

	/**
	 * Check a GBK-encoded message for sensitive words.
	 *
	 * @param string $message        Text to scan (GBK).
	 * @param bool   $isrReplaceChar When true, everything except characters in
	 *                               U+4E00..U+9FFF is dropped before matching.
	 * @param bool   $isrReplaceHtml When true, HTML tags are stripped first.
	 * @param bool   $isRturnStr     When true the matched word(s) are returned,
	 *                               otherwise only true/false.
	 * @param bool   $isMatchAll     When false the scan stops at the first hit.
	 * @param array  $wordsArr       Extra words, matched case-insensitively
	 *                               against the raw, unprocessed message.
	 * @return string|bool Matched word(s) in GBK joined by "," (or true when
	 *                     $isRturnStr is false); false when nothing matched.
	 */
	public function execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
	{
		$rawMessage = $message;

		// Start from a clean state so that an earlier call on the same
		// instance cannot leak matches or half-followed trie paths.
		$this->result = array();
		$this->tree_list = array();
		$this->tree_arr = $this->getWordsTree();

		if ($isrReplaceHtml) {
			$message = strip_tags($message);
		}

		$message = iconv("gbk", "utf-8//IGNORE", $message);
		if ($message === false) {
			// Conversion failed entirely: nothing can be matched in the trie.
			$message = '';
		}

		if ($isrReplaceChar) {
			preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $message, $matches);
			$message = isset($matches[0]) ? join('', $matches[0]) : '';
		}

		if ($wordsArr && is_array($wordsArr)) {
			foreach ($wordsArr as $row) {
				if ((string)$row === '') {
					continue; // an empty needle can never be a meaningful hit
				}
				// Compare against false explicitly: a hit at offset 0 is still a hit.
				if (stripos($rawMessage, $row) !== false) {
					return $isRturnStr ? $row : true;
				}
			}
		}

		$succeed = $this->check($message, $isMatchAll);
		$matched = $this->result;

		// Reset rather than unset(): the properties must stay arrays for the next call.
		$this->result = array();
		$this->tree_list = array();

		if (!$succeed) {
			return false;
		}
		if (!$isRturnStr || !$matched) {
			return true;
		}

		return iconv("utf-8", "gbk", implode(",", $matched));
	}

	/**
	 * Walk the UTF-8 message one character at a time and feed it to the trie.
	 *
	 * @param string $message    UTF-8 text.
	 * @param bool   $isMatchAll When false, return as soon as one word matched.
	 * @return bool True when at least one word was found.
	 */
	private function check(&$message, $isMatchAll)
	{
		$strLen = strlen($message);
		$i = 0;

		while ($i < $strLen) {
			// Derive the sequence length from the lead byte instead of
			// assuming that every non-ASCII character is three bytes long.
			$lead = ord($message[$i]);
			if ($lead <= 0x7F) {
				$len = 1;
			} elseif (($lead & 0xE0) === 0xC0) {
				$len = 2;
			} elseif (($lead & 0xF0) === 0xE0) {
				$len = 3;
			} elseif (($lead & 0xF8) === 0xF0) {
				$len = 4;
			} else {
				$len = 1; // stray continuation / invalid byte: consume it alone
			}

			$char = substr($message, $i, $len);
			$i += $len;

			$this->getWord($char);

			if (!$isMatchAll && count($this->result) > 0) {
				return true;
			}
		}

		if (count($this->result) > 0) {
			return true;
		}

		$this->tree_list = array();

		return false;
	}

	/**
	 * Advance every live partial match by one character.
	 *
	 * If the character can start a word, a new candidate is opened at the trie
	 * root. Each candidate then steps to its child for $char; candidates with
	 * no such child are dropped, and candidates that reach a node carrying
	 * "_word_" append that word to $this->result.
	 *
	 * @param string $char One UTF-8 character.
	 * @return void
	 */
	private function getWord($char)
	{
		if (!empty($this->tree_arr[$char])) {
			$this->tree_list[] = $this->tree_arr;
		}

		$alive = array();
		foreach ($this->tree_list as $node) {
			if (!is_array($node) || !isset($node[$char]) || !is_array($node[$char])) {
				continue; // this candidate cannot be extended by $char
			}
			$next = $node[$char];
			if (!empty($next["_word_"])) {
				$this->result[] = $next["_word_"];
			}
			$alive[] = $next;
		}

		$this->tree_list = $alive;
	}

	/**
	 * Path of the serialized trie file.
	 *
	 * @return string The path passed to execMmakeWordsTree(), or the default.
	 */
	private function treeFilePath()
	{
		return $this->filePaths ? $this->filePaths : './wordsTree.txt';
	}

	/**
	 * Load the trie from the tree file.
	 *
	 * NOTE(review): unserialize() must only ever see the file written by
	 * makeWordsTree(); never point the path at data from an untrusted source.
	 *
	 * @return array The trie, or an empty array when the file is missing,
	 *               unreadable or does not contain a serialized array.
	 */
	public function getWordsTree()
	{
		$path = $this->treeFilePath();
		if (!is_file($path) || !is_readable($path)) {
			return array();
		}

		$raw = file_get_contents($path);
		if ($raw === false || $raw === '') {
			return array();
		}

		$tree = unserialize($raw);

		return is_array($tree) ? $tree : array();
	}

	/**
	 * Read the raw word list from the database.
	 *
	 * @return array Values of the "find" column as delivered by the database
	 *               connection; empty when the query fails or has no rows.
	 */
	public function getWordsArr()
	{
		$this->dbReadLink();

		$words = array();
		$res = mysqli_query($this->read_db, "select find from " . $this->dbtabpre . "common_word where 1");
		if ($res === false) {
			return $words;
		}

		while ($row = mysqli_fetch_assoc($res)) {
			$words[] = $row["find"];
		}
		mysqli_free_result($res);

		return $words;
	}


	/**
	 * Build the trie from the database and write it to $filePaths.
	 *
	 * @param string $filePaths Target file; an empty value selects "./wordsTree.txt".
	 * @return bool True when the file was written.
	 */
	public function execMmakeWordsTree($filePaths)
	{
		$this->filePaths = $filePaths;
		$this->dbReadLink();

		return $this->makeWordsTree() !== false;
	}

	/**
	 * Query all words, insert them into a fresh trie and serialize it to disk.
	 *
	 * @return int|bool Bytes written, or false when the query or the write
	 *                  failed. An existing tree file is left untouched when
	 *                  the query fails.
	 */
	private function makeWordsTree()
	{
		$res = mysqli_query($this->read_db, "select find from " . $this->dbtabpre . "common_word where 1");
		if ($res === false) {
			return false;
		}

		$tree = array();
		while ($row = mysqli_fetch_assoc($res)) {
			$word = iconv("gbk", "utf-8//IGNORE", $row["find"]);
			if ($word === false || $word === '') {
				continue;
			}
			$this->str2MutiArray($word, $tree);
		}
		mysqli_free_result($res);

		return file_put_contents($this->treeFilePath(), serialize($tree));
	}

	/**
	 * Insert one UTF-8 word into the trie.
	 *
	 * The word is split into characters and the trie is descended by
	 * reference, creating nodes as needed; no code is generated or evaluated,
	 * so quotes and backslashes in a word are handled like any other character.
	 *
	 * @param string $str UTF-8 word.
	 * @param array  $x   Trie root, modified in place.
	 * @return array The updated trie.
	 */
	private function str2MutiArray($str, &$x)
	{
		$chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
		if (!$chars) {
			return $x; // empty or not valid UTF-8
		}
		if (!is_array($x)) {
			$x = array();
		}

		$node =& $x;
		foreach ($chars as $ch) {
			if (!isset($node[$ch]) || !is_array($node[$ch])) {
				$node[$ch] = array();
			}
			$node =& $node[$ch];
		}
		$node['_word_'] = $str;
		unset($node); // break the reference so later writes cannot touch the trie

		return $x;
	}


	/**
	 * Open (or reuse) the connection to the word database.
	 *
	 * Terminates the script with die() when the connection cannot be opened.
	 *
	 * @return mysqli The open link.
	 */
	private function dbReadLink()
	{
		if ($this->read_db) {
			return $this->read_db;
		}

		// Connection settings: host, user, password and database name.
		$config = array(
			'read_dbhost'    => '',
			'read_dbuser'    => '',
			'read_dbpw'      => '',
			'read_dbname'    => '',
		);

		$this->read_dbhost    = $config['read_dbhost'];
		$this->read_dbuser    = $config['read_dbuser'];
		$this->read_dbpw      = $config['read_dbpw'];
		$this->read_dbname    = $config['read_dbname'];

		// Failures are detected through return values below, so make sure
		// mysqli reports them that way instead of throwing.
		mysqli_report(MYSQLI_REPORT_OFF);

		$this->read_db = mysqli_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
		if (!$this->read_db) die('Could not connect: ' . mysqli_connect_error());
		mysqli_query($this->read_db, 'SET character_set_connection='.$this->dbcharset.', character_set_results='.$this->dbcharset.', character_set_client=binary');
		mysqli_select_db($this->read_db, $this->read_dbname);
		return $this->read_db;
	}
}
?>
	<?php
	/**
	* @todo 敏感词匹配核心库，本函数适用于GBK的编码匹配
	*
	* @fun1.生产成Tree型文件方法：
	* @name execMmakeWordsTree($filePaths="./wordsTree.txt")
	* @param filePaths => 生成文件的路径
	* @return bool#gbk
	*
	* @fun2.获取Tree型文件数据方法：
	* @name getWordsTree()
	* @return Array#utf-8
	*
	* @fun3.获取数据库中敏感词库方法：
	* @name getWordsArr()
	* @return Array#gbk
	*
	* @fun4.执行检查内容中是否存在敏感词方法：
	* @name execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
	* @param message => 匹配的内容#Str
	* @param isrReplaceChar => 是否移内容中除汉字以外的字符再匹配#bool
	* @param isrReplaceHtml => 是否移除内容中html标签#bool
	* @param isRturnStr => 是否返回匹配出的敏感词#bool
	* @param isMatchAll => 是否匹配出内容中出现所有的词，false情况下，匹配出第一个词就Return#bool
	* @param wordsArr => 自定定义敏感词数组#Array
	* @return Str/bool#gbk
	*
	* @time 2013/7/15 22:37
	* @author jiang.zubin@onlylady.com
	*/

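	/**
	* Usage example:
	* require './CalssCensor.php';
	* $censor = new FApp_SensitiveWord_CalssCensor();
	* $censor->execMmakeWordsTree('./wordsTree.txt');                              // fun1: build the tree file from the database
	* $hit = $censor->execMatchWord($message, true, true, true, false, array());  // fun4: scan a GBK-encoded message
	*/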

	// Remove the execution time limit.
	set_time_limit(0);

	// Display errors and report everything except notices.
	ini_set('display_errors',1);
	error_reporting(E_ALL ^ E_NOTICE);
	// Guarded so that including this file twice does not try to redefine the constant.
	if (!defined('API_ROOT')) {
		define('API_ROOT', realpath(dirname(__FILE__)));
	}


	// Built-in self test: request this script with ?type=test to run one match
	// against a sample message and print the matched word plus the elapsed time.
	if (isset($_REQUEST['type']) && $_REQUEST['type'] === "test") {

		// microtime(true) returns the current time in seconds as a float.
		$start = microtime(true);

		$censor = new FApp_SensitiveWord_CalssCensor();
		$rstr = $censor->execMatchWord($message="<b>电话</b>155.5708.8321婷婷㊣新浪合作★—委托发布←155.5708.8321 婷婷敬候佳音【155.5708.8321】婷婷灵活管理、全城区无论酒店宾馆只要到位理　质量，资源更新快、迎合客人各口味、听话、漂亮、满意为止。收费合理公道、只要我们能去的地方都会有安全护航、玩家无需担心放心愉悦。<span>【155.5708.8321】</span>婷婷欢迎新老朋友我们这里有最精致的资源找我们放松您的心，小姐", $isrReplaceChar=true, $isrReplaceHtml=true, $isRturnStr=true, $isMatchAll=false, $wordsArr=array());

		$end = microtime(true);

		// The difference is in seconds; convert it so the number agrees with the
		// "毫秒" (milliseconds) label that is printed next to it.
		$elapsedMs = ($end - $start) * 1000;

		echo "匹配到的敏感词:",$rstr,"\t执行耗费时间:",$elapsedMs."毫秒";
	}
	//$str = $censor->execMmakeWordsTree($filePaths="./wordsTree.txt");
	//$arr = $censor->getWordsTree();
	//$arr = $censor->getWordsArr();

	/**
	 * Sensitive-word filter for GBK-encoded text.
	 *
	 * The word list is read from the database table "<prefix>common_word"
	 * (column "find"), converted to UTF-8 and compiled into a nested-array trie
	 * keyed by single characters. A node that terminates a word carries the whole
	 * word under the key "_word_". The trie is stored with serialize() in a file
	 * ("./wordsTree.txt" unless another path was given to execMmakeWordsTree()).
	 *
	 * Messages are accepted in GBK, converted to UTF-8 for matching, and matched
	 * words are converted back to GBK before being returned.
	 */
	class FApp_SensitiveWord_CalssCensor
	{
		// Connection settings of the word database; populated by dbReadLink().
		private $read_dbhost;
		private $read_dbuser;
		private $read_dbpw;
		private $read_dbname;
		// Path of the serialized trie file chosen via execMmakeWordsTree().
		private $filePaths;

		private $dbcharset = 'gbk';
		private $dbtabpre = 'pre_';

		// mysqli link, opened lazily by dbReadLink().
		private $read_db = null;

		// Trie loaded from the tree file.
		public $tree_arr = array();
		// Words matched during the current scan (UTF-8).
		public $result = array();
		public $exist_arr = array();
		// Trie nodes of the partial matches that are still alive during a scan.
		public $tree_list = array();

		/**
		 * Check a GBK-encoded message for sensitive words.
		 *
		 * @param string $message        Text to scan (GBK).
		 * @param bool   $isrReplaceChar When true, everything except characters in
		 *                               U+4E00..U+9FFF is dropped before matching.
		 * @param bool   $isrReplaceHtml When true, HTML tags are stripped first.
		 * @param bool   $isRturnStr     When true the matched word(s) are returned,
		 *                               otherwise only true/false.
		 * @param bool   $isMatchAll     When false the scan stops at the first hit.
		 * @param array  $wordsArr       Extra words, matched case-insensitively
		 *                               against the raw, unprocessed message.
		 * @return string|bool Matched word(s) in GBK joined by "," (or true when
		 *                     $isRturnStr is false); false when nothing matched.
		 */
		public function execMatchWord($message, $isrReplaceChar, $isrReplaceHtml, $isRturnStr, $isMatchAll, $wordsArr)
		{
			$rawMessage = $message;

			// Start from a clean state so that an earlier call on the same
			// instance cannot leak matches or half-followed trie paths.
			$this->result = array();
			$this->tree_list = array();
			$this->tree_arr = $this->getWordsTree();

			if ($isrReplaceHtml) {
				$message = strip_tags($message);
			}

			$message = iconv("gbk", "utf-8//IGNORE", $message);
			if ($message === false) {
				// Conversion failed entirely: nothing can be matched in the trie.
				$message = '';
			}

			if ($isrReplaceChar) {
				preg_match_all('/[\x{4e00}-\x{9fff}]+/u', $message, $matches);
				$message = isset($matches[0]) ? join('', $matches[0]) : '';
			}

			if ($wordsArr && is_array($wordsArr)) {
				foreach ($wordsArr as $row) {
					if ((string)$row === '') {
						continue; // an empty needle can never be a meaningful hit
					}
					// Compare against false explicitly: a hit at offset 0 is still a hit.
					if (stripos($rawMessage, $row) !== false) {
						return $isRturnStr ? $row : true;
					}
				}
			}

			$succeed = $this->check($message, $isMatchAll);
			$matched = $this->result;

			// Reset rather than unset(): the properties must stay arrays for the next call.
			$this->result = array();
			$this->tree_list = array();

			if (!$succeed) {
				return false;
			}
			if (!$isRturnStr || !$matched) {
				return true;
			}

			return iconv("utf-8", "gbk", implode(",", $matched));
		}

		/**
		 * Walk the UTF-8 message one character at a time and feed it to the trie.
		 *
		 * @param string $message    UTF-8 text.
		 * @param bool   $isMatchAll When false, return as soon as one word matched.
		 * @return bool True when at least one word was found.
		 */
		private function check(&$message, $isMatchAll)
		{
			$strLen = strlen($message);
			$i = 0;

			while ($i < $strLen) {
				// Derive the sequence length from the lead byte instead of
				// assuming that every non-ASCII character is three bytes long.
				$lead = ord($message[$i]);
				if ($lead <= 0x7F) {
					$len = 1;
				} elseif (($lead & 0xE0) === 0xC0) {
					$len = 2;
				} elseif (($lead & 0xF0) === 0xE0) {
					$len = 3;
				} elseif (($lead & 0xF8) === 0xF0) {
					$len = 4;
				} else {
					$len = 1; // stray continuation / invalid byte: consume it alone
				}

				$char = substr($message, $i, $len);
				$i += $len;

				$this->getWord($char);

				if (!$isMatchAll && count($this->result) > 0) {
					return true;
				}
			}

			if (count($this->result) > 0) {
				return true;
			}

			$this->tree_list = array();

			return false;
		}

		/**
		 * Advance every live partial match by one character.
		 *
		 * If the character can start a word, a new candidate is opened at the trie
		 * root. Each candidate then steps to its child for $char; candidates with
		 * no such child are dropped, and candidates that reach a node carrying
		 * "_word_" append that word to $this->result.
		 *
		 * @param string $char One UTF-8 character.
		 * @return void
		 */
		private function getWord($char)
		{
			if (!empty($this->tree_arr[$char])) {
				$this->tree_list[] = $this->tree_arr;
			}

			$alive = array();
			foreach ($this->tree_list as $node) {
				if (!is_array($node) || !isset($node[$char]) || !is_array($node[$char])) {
					continue; // this candidate cannot be extended by $char
				}
				$next = $node[$char];
				if (!empty($next["_word_"])) {
					$this->result[] = $next["_word_"];
				}
				$alive[] = $next;
			}

			$this->tree_list = $alive;
		}

		/**
		 * Path of the serialized trie file.
		 *
		 * @return string The path passed to execMmakeWordsTree(), or the default.
		 */
		private function treeFilePath()
		{
			return $this->filePaths ? $this->filePaths : './wordsTree.txt';
		}

		/**
		 * Load the trie from the tree file.
		 *
		 * NOTE(review): unserialize() must only ever see the file written by
		 * makeWordsTree(); never point the path at data from an untrusted source.
		 *
		 * @return array The trie, or an empty array when the file is missing,
		 *               unreadable or does not contain a serialized array.
		 */
		public function getWordsTree()
		{
			$path = $this->treeFilePath();
			if (!is_file($path) || !is_readable($path)) {
				return array();
			}

			$raw = file_get_contents($path);
			if ($raw === false || $raw === '') {
				return array();
			}

			$tree = unserialize($raw);

			return is_array($tree) ? $tree : array();
		}

		/**
		 * Read the raw word list from the database.
		 *
		 * @return array Values of the "find" column as delivered by the database
		 *               connection; empty when the query fails or has no rows.
		 */
		public function getWordsArr()
		{
			$this->dbReadLink();

			$words = array();
			$res = mysqli_query($this->read_db, "select find from " . $this->dbtabpre . "common_word where 1");
			if ($res === false) {
				return $words;
			}

			while ($row = mysqli_fetch_assoc($res)) {
				$words[] = $row["find"];
			}
			mysqli_free_result($res);

			return $words;
		}


		/**
		 * Build the trie from the database and write it to $filePaths.
		 *
		 * @param string $filePaths Target file; an empty value selects "./wordsTree.txt".
		 * @return bool True when the file was written.
		 */
		public function execMmakeWordsTree($filePaths)
		{
			$this->filePaths = $filePaths;
			$this->dbReadLink();

			return $this->makeWordsTree() !== false;
		}

		/**
		 * Query all words, insert them into a fresh trie and serialize it to disk.
		 *
		 * @return int|bool Bytes written, or false when the query or the write
		 *                  failed. An existing tree file is left untouched when
		 *                  the query fails.
		 */
		private function makeWordsTree()
		{
			$res = mysqli_query($this->read_db, "select find from " . $this->dbtabpre . "common_word where 1");
			if ($res === false) {
				return false;
			}

			$tree = array();
			while ($row = mysqli_fetch_assoc($res)) {
				$word = iconv("gbk", "utf-8//IGNORE", $row["find"]);
				if ($word === false || $word === '') {
					continue;
				}
				$this->str2MutiArray($word, $tree);
			}
			mysqli_free_result($res);

			return file_put_contents($this->treeFilePath(), serialize($tree));
		}

		/**
		 * Insert one UTF-8 word into the trie.
		 *
		 * The word is split into characters and the trie is descended by
		 * reference, creating nodes as needed; no code is generated or evaluated,
		 * so quotes and backslashes in a word are handled like any other character.
		 *
		 * @param string $str UTF-8 word.
		 * @param array  $x   Trie root, modified in place.
		 * @return array The updated trie.
		 */
		private function str2MutiArray($str, &$x)
		{
			$chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
			if (!$chars) {
				return $x; // empty or not valid UTF-8
			}
			if (!is_array($x)) {
				$x = array();
			}

			$node =& $x;
			foreach ($chars as $ch) {
				if (!isset($node[$ch]) || !is_array($node[$ch])) {
					$node[$ch] = array();
				}
				$node =& $node[$ch];
			}
			$node['_word_'] = $str;
			unset($node); // break the reference so later writes cannot touch the trie

			return $x;
		}


		/**
		 * Open (or reuse) the connection to the word database.
		 *
		 * Terminates the script with die() when the connection cannot be opened.
		 *
		 * @return mysqli The open link.
		 */
		private function dbReadLink()
		{
			if ($this->read_db) {
				return $this->read_db;
			}

			// Connection settings: host, user, password and database name.
			$config = array(
				'read_dbhost' => '',
				'read_dbuser' => '',
				'read_dbpw' => '',
				'read_dbname' => '',
			);

			$this->read_dbhost = $config['read_dbhost'];
			$this->read_dbuser = $config['read_dbuser'];
			$this->read_dbpw = $config['read_dbpw'];
			$this->read_dbname = $config['read_dbname'];

			// Failures are detected through return values below, so make sure
			// mysqli reports them that way instead of throwing.
			mysqli_report(MYSQLI_REPORT_OFF);

			$this->read_db = mysqli_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
			if (!$this->read_db) die('Could not connect: ' . mysqli_connect_error());
			mysqli_query($this->read_db, 'SET character_set_connection='.$this->dbcharset.', character_set_results='.$this->dbcharset.', character_set_client=binary');
			mysqli_select_db($this->read_db, $this->read_dbname);
			return $this->read_db;
		}
	}
	?>