zubinJiang/php发帖匹配敏感词

## php发帖匹配敏感词
<?php
// Remove PHP's execution time limit (0 = unlimited) for this script.
// Presumably needed because loading the word list and scanning a long message can be slow — confirm.
set_time_limit(0);

/**
 * Sensitive-word matcher for forum posts.
 *
 * The word list is read from the `<prefix>common_word` table (column `find`,
 * stored as GBK), converted to UTF-8 and compiled into a character trie:
 * nested arrays keyed by one UTF-8 character per level, where a node's
 * '_word_' entry holds the complete word ending at that node. A message is
 * then scanned one character at a time, advancing every partial match in
 * parallel.
 *
 * NOTE(review): the mysql_* extension used here was removed in PHP 7.0; the
 * database code only runs on PHP 5.x unless it is ported to mysqli/PDO.
 */
class Test_database
{
	// Connection settings for the read database (blank placeholders).
	private $read_dbhost = '';
	private $read_dbuser = '';
	private $read_dbpw = '';
	private $read_dbname = '';

	// Connection character set and table-name prefix.
	private $dbcharset = 'gbk';
	private $dbtabpre = 'pre_';

	private $read_db = null;
	private $write_db = null;

	private static $index=1;

	// Sensitive-word trie (see the class comment for its shape).
	public $tree_arr = array();
	// Words found by the most recent check() call.
	public $result = array();
	public $exist_arr = array();
	// Trie nodes of the partial matches that are still alive during a scan.
	public $tree_list = array();

	/**
	 * Connects to the database and builds the trie.
	 *
	 * The tree is rebuilt before it is used. Previously ./tree.txt was read
	 * first and regenerated afterwards, so matching ran against the previous
	 * run's data (or against `false` when the file did not exist yet).
	 */
	public function __construct()
	{
		$this->db_read_link();
		$this->maketree();
	}

	/**
	 * Opens the read connection, sets its character set and selects the database.
	 * Terminates the script if the connection or database selection fails.
	 *
	 * @return resource the MySQL link, also stored in $this->read_db
	 */
	public function db_read_link()
	{
		$this->read_db = mysql_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
		if (!$this->read_db)
		{
			die('Could not connect: ' . mysql_error());
		}
		mysql_query('SET character_set_connection='.$this->dbcharset.', character_set_results='.$this->dbcharset.', character_set_client=binary', $this->read_db);
		// NOTE(review): SET NAMES assigns client, connection and results charset
		// in one go, so it overrides the character_set_client=binary set above.
		mysql_query('SET names GBK', $this->read_db);
		if (!mysql_select_db($this->read_dbname, $this->read_db))
		{
			die('Could not select database: ' . mysql_error($this->read_db));
		}
		return $this->read_db;
	}

	/**
	 * Splits a string into UTF-8 characters.
	 *
	 * Works byte-wise (no /u modifier) so malformed input never makes the
	 * split fail: any byte that is not part of a well-formed sequence is
	 * returned as a one-byte "character".
	 *
	 * @param string $text
	 * @return array list of character strings, empty for an empty input
	 */
	private function splitUtf8Chars($text)
	{
		$matches = array();
		preg_match_all(
			'/[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3}|[\x80-\xFF]/',
			(string) $text,
			$matches
		);
		return isset($matches[0]) ? $matches[0] : array();
	}

	/**
	 * Inserts one word into the trie.
	 *
	 * The path is built by walking references instead of generating PHP code
	 * for eval(): a word containing a quote or backslash used to break the
	 * generated statement (and could inject code).
	 *
	 * @param string $str word to insert (UTF-8); an empty word is ignored
	 * @param array  $x   trie root, modified in place
	 * @return array the updated trie
	 */
	public function str2MutiArray($str, &$x)
	{
		$chars = $this->splitUtf8Chars($str);
		if (count($chars) === 0)
		{
			return $x;
		}
		if (!is_array($x))
		{
			$x = array();
		}

		$node = &$x;
		foreach ($chars as $ch)
		{
			if (!isset($node[$ch]) || !is_array($node[$ch]))
			{
				$node[$ch] = array();
			}
			$node = &$node[$ch];
		}
		$node['_word_'] = $str;
		unset($node); // drop the reference so later writes cannot touch the trie

		return $x;
	}

	/**
	 * Loads every sensitive word from the database, rebuilds the trie, stores
	 * it in $this->tree_arr and writes a serialized copy to ./tree.txt.
	 * Terminates the script if the query fails.
	 */
	public function maketree()
	{
		$res = mysql_query("select find from ".$this->dbtabpre."common_word where 1", $this->read_db);
		if (!$res)
		{
			die('Could not load sensitive words: ' . mysql_error($this->read_db));
		}

		$x = array();
		while($row = mysql_fetch_array($res, MYSQL_ASSOC))
		{
			$word = iconv("gbk", "utf-8//IGNORE", $row["find"]);
			if ($word === false || $word === '')
			{
				// unconvertible or empty entry: nothing to index
				continue;
			}
			$this->str2MutiArray($word, $x);
		}
		mysql_free_result($res);

		$this->tree_arr = $x;
		file_put_contents("./tree.txt", serialize($x));
	}

	/**
	 * Demo driver: scans ./message3.txt (GBK) for sensitive words and prints
	 * the hits together with start/end timestamps and the elapsed time.
	 */
	public function match_word()
	{
		$raw = file_get_contents('./message3.txt');
		if ($raw === false)
		{
			die('Could not read ./message3.txt');
		}
		$message = iconv("gbk", "utf-8", $raw);
		if ($message === false)
		{
			die('Could not convert ./message3.txt from GBK to UTF-8');
		}

		// The timestamps are assigned before being concatenated: the old
		// `"start time:".$start = time()."<hr/>"` stored "<ts><hr/>" in $start,
		// which made the subtraction below operate on non-numeric strings.
		// microtime() is used because whole seconds are too coarse for one scan.
		$start = microtime(true);
		echo "start time:".$start."<hr/>";
		$found = $this->check($message);
		$end = microtime(true);
		if ($found)
		{
			var_dump($this->result);
		}
		echo "end time:".$end."<hr/>";
		echo "diff time:".($end - $start)."<hr/>";
	}

	/**
	 * Scans a message and stops at the first position where a sensitive word ends.
	 *
	 * @param string $message UTF-8 text (taken by reference, never modified)
	 * @return bool true when a word was found (see $this->result), false otherwise
	 */
	public function check(&$message)
	{
		// Start clean so an earlier call cannot leak matches or partial state.
		$this->result = array();
		$this->tree_list = array();

		$found = false;
		// Proper character splitting: the old byte loop assumed every
		// non-ASCII character was 3 bytes long, which mis-read 2- and 4-byte
		// sequences and indexed past the end of the string.
		foreach ($this->splitUtf8Chars($message) as $char)
		{
			$this->getWord($char);
			if (count($this->result) > 0)
			{
				$found = true;
				break;
			}
		}

		$this->tree_list = array();
		return $found;
	}

	/**
	 * Core matching step: feeds one character to every partial match.
	 *
	 * A new walk is started at the root when the character can begin a word;
	 * each live walk then descends by the character. Walks with no such child
	 * are discarded, and every word ending at a reached node is appended to
	 * $this->result.
	 *
	 * @param string $char a single UTF-8 character
	 */
	public function getWord($char)
	{
		if (isset($this->tree_arr[$char]))
		{
			$this->tree_list[] = $this->tree_arr;
		}

		$alive = array();
		foreach ($this->tree_list as $node)
		{
			if (!isset($node[$char]) || !is_array($node[$char]))
			{
				continue;
			}
			$next = $node[$char];
			// isset rather than truthiness, so a word such as "0" still counts
			if (isset($next['_word_']))
			{
				$this->result[] = $next['_word_'];
			}
			$alive[] = $next;
		}
		$this->tree_list = $alive;
	}
}

// Entry point: constructing the object connects to the database and prepares the word tree;
// match_word() then scans ./message3.txt and echoes the timing output.
$db = new Test_database();
$db->match_word();
?>
	<?php
	// Remove PHP's execution time limit (0 = unlimited) for this script.
	// Presumably needed because loading the word list and scanning a long message can be slow — confirm.
	set_time_limit(0);

	/**
	 * Sensitive-word matcher for forum posts.
	 *
	 * The word list is read from the `<prefix>common_word` table (column `find`,
	 * stored as GBK), converted to UTF-8 and compiled into a character trie:
	 * nested arrays keyed by one UTF-8 character per level, where a node's
	 * '_word_' entry holds the complete word ending at that node. A message is
	 * then scanned one character at a time, advancing every partial match in
	 * parallel.
	 *
	 * NOTE(review): the mysql_* extension used here was removed in PHP 7.0; the
	 * database code only runs on PHP 5.x unless it is ported to mysqli/PDO.
	 */
	class Test_database
	{
		// Connection settings for the read database (blank placeholders).
		private $read_dbhost = '';
		private $read_dbuser = '';
		private $read_dbpw = '';
		private $read_dbname = '';

		// Connection character set and table-name prefix.
		private $dbcharset = 'gbk';
		private $dbtabpre = 'pre_';

		private $read_db = null;
		private $write_db = null;

		private static $index=1;

		// Sensitive-word trie (see the class comment for its shape).
		public $tree_arr = array();
		// Words found by the most recent check() call.
		public $result = array();
		public $exist_arr = array();
		// Trie nodes of the partial matches that are still alive during a scan.
		public $tree_list = array();

		/**
		 * Connects to the database and builds the trie.
		 *
		 * The tree is rebuilt before it is used. Previously ./tree.txt was read
		 * first and regenerated afterwards, so matching ran against the previous
		 * run's data (or against `false` when the file did not exist yet).
		 */
		public function __construct()
		{
			$this->db_read_link();
			$this->maketree();
		}

		/**
		 * Opens the read connection, sets its character set and selects the database.
		 * Terminates the script if the connection or database selection fails.
		 *
		 * @return resource the MySQL link, also stored in $this->read_db
		 */
		public function db_read_link()
		{
			$this->read_db = mysql_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
			if (!$this->read_db)
			{
				die('Could not connect: ' . mysql_error());
			}
			mysql_query('SET character_set_connection='.$this->dbcharset.', character_set_results='.$this->dbcharset.', character_set_client=binary', $this->read_db);
			// NOTE(review): SET NAMES assigns client, connection and results charset
			// in one go, so it overrides the character_set_client=binary set above.
			mysql_query('SET names GBK', $this->read_db);
			if (!mysql_select_db($this->read_dbname, $this->read_db))
			{
				die('Could not select database: ' . mysql_error($this->read_db));
			}
			return $this->read_db;
		}

		/**
		 * Splits a string into UTF-8 characters.
		 *
		 * Works byte-wise (no /u modifier) so malformed input never makes the
		 * split fail: any byte that is not part of a well-formed sequence is
		 * returned as a one-byte "character".
		 *
		 * @param string $text
		 * @return array list of character strings, empty for an empty input
		 */
		private function splitUtf8Chars($text)
		{
			$matches = array();
			preg_match_all(
				'/[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3}|[\x80-\xFF]/',
				(string) $text,
				$matches
			);
			return isset($matches[0]) ? $matches[0] : array();
		}

		/**
		 * Inserts one word into the trie.
		 *
		 * The path is built by walking references instead of generating PHP code
		 * for eval(): a word containing a quote or backslash used to break the
		 * generated statement (and could inject code).
		 *
		 * @param string $str word to insert (UTF-8); an empty word is ignored
		 * @param array  $x   trie root, modified in place
		 * @return array the updated trie
		 */
		public function str2MutiArray($str, &$x)
		{
			$chars = $this->splitUtf8Chars($str);
			if (count($chars) === 0)
			{
				return $x;
			}
			if (!is_array($x))
			{
				$x = array();
			}

			$node = &$x;
			foreach ($chars as $ch)
			{
				if (!isset($node[$ch]) || !is_array($node[$ch]))
				{
					$node[$ch] = array();
				}
				$node = &$node[$ch];
			}
			$node['_word_'] = $str;
			unset($node); // drop the reference so later writes cannot touch the trie

			return $x;
		}

		/**
		 * Loads every sensitive word from the database, rebuilds the trie, stores
		 * it in $this->tree_arr and writes a serialized copy to ./tree.txt.
		 * Terminates the script if the query fails.
		 */
		public function maketree()
		{
			$res = mysql_query("select find from ".$this->dbtabpre."common_word where 1", $this->read_db);
			if (!$res)
			{
				die('Could not load sensitive words: ' . mysql_error($this->read_db));
			}

			$x = array();
			while($row = mysql_fetch_array($res, MYSQL_ASSOC))
			{
				$word = iconv("gbk", "utf-8//IGNORE", $row["find"]);
				if ($word === false || $word === '')
				{
					// unconvertible or empty entry: nothing to index
					continue;
				}
				$this->str2MutiArray($word, $x);
			}
			mysql_free_result($res);

			$this->tree_arr = $x;
			file_put_contents("./tree.txt", serialize($x));
		}

		/**
		 * Demo driver: scans ./message3.txt (GBK) for sensitive words and prints
		 * the hits together with start/end timestamps and the elapsed time.
		 */
		public function match_word()
		{
			$raw = file_get_contents('./message3.txt');
			if ($raw === false)
			{
				die('Could not read ./message3.txt');
			}
			$message = iconv("gbk", "utf-8", $raw);
			if ($message === false)
			{
				die('Could not convert ./message3.txt from GBK to UTF-8');
			}

			// The timestamps are assigned before being concatenated: the old
			// `"start time:".$start = time()."<hr/>"` stored "<ts><hr/>" in $start,
			// which made the subtraction below operate on non-numeric strings.
			// microtime() is used because whole seconds are too coarse for one scan.
			$start = microtime(true);
			echo "start time:".$start."<hr/>";
			$found = $this->check($message);
			$end = microtime(true);
			if ($found)
			{
				var_dump($this->result);
			}
			echo "end time:".$end."<hr/>";
			echo "diff time:".($end - $start)."<hr/>";
		}

		/**
		 * Scans a message and stops at the first position where a sensitive word ends.
		 *
		 * @param string $message UTF-8 text (taken by reference, never modified)
		 * @return bool true when a word was found (see $this->result), false otherwise
		 */
		public function check(&$message)
		{
			// Start clean so an earlier call cannot leak matches or partial state.
			$this->result = array();
			$this->tree_list = array();

			$found = false;
			// Proper character splitting: the old byte loop assumed every
			// non-ASCII character was 3 bytes long, which mis-read 2- and 4-byte
			// sequences and indexed past the end of the string.
			foreach ($this->splitUtf8Chars($message) as $char)
			{
				$this->getWord($char);
				if (count($this->result) > 0)
				{
					$found = true;
					break;
				}
			}

			$this->tree_list = array();
			return $found;
		}

		/**
		 * Core matching step: feeds one character to every partial match.
		 *
		 * A new walk is started at the root when the character can begin a word;
		 * each live walk then descends by the character. Walks with no such child
		 * are discarded, and every word ending at a reached node is appended to
		 * $this->result.
		 *
		 * @param string $char a single UTF-8 character
		 */
		public function getWord($char)
		{
			if (isset($this->tree_arr[$char]))
			{
				$this->tree_list[] = $this->tree_arr;
			}

			$alive = array();
			foreach ($this->tree_list as $node)
			{
				if (!isset($node[$char]) || !is_array($node[$char]))
				{
					continue;
				}
				$next = $node[$char];
				// isset rather than truthiness, so a word such as "0" still counts
				if (isset($next['_word_']))
				{
					$this->result[] = $next['_word_'];
				}
				$alive[] = $next;
			}
			$this->tree_list = $alive;
		}
	}

	// Entry point: constructing the object connects to the database and prepares the word tree;
	// match_word() then scans ./message3.txt and echoes the timing output.
	$db = new Test_database();
	$db->match_word();
	?>