Skip to content

Instantly share code, notes, and snippets.

@zubinJiang
Last active December 14, 2015 16:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zubinJiang/5114701 to your computer and use it in GitHub Desktop.
Save zubinJiang/5114701 to your computer and use it in GitHub Desktop.
<?php
// Lift PHP's script execution time limit (0 = no limit) for this run.
set_time_limit(0);
/**
 * Sensitive-word filter backed by a character trie.
 *
 * The word list is read from the "<prefix>common_word" table, converted from
 * the database charset to UTF-8 and stored as a nested array: every level is
 * keyed by one UTF-8 character, and the node reached by the last character of
 * a word stores that word under the reserved key "_word_".
 *
 * NOTE(review): the class still uses the legacy ext/mysql API (mysql_connect()
 * and friends), which was removed in PHP 7.0. It is kept so db_read_link()
 * keeps returning the same kind of link; the body therefore stays PHP 5
 * compatible. Migrating to mysqli/PDO should be done for the whole project.
 */
class Test_database
{
    /** Reserved trie key holding the complete word at a terminal node. */
    const WORD_KEY = '_word_';

    private $read_dbhost = '';
    private $read_dbuser = '';
    private $read_dbpw = '';
    private $read_dbname = '';
    private $dbcharset = 'gbk';
    private $dbtabpre = 'pre_';
    private $read_db = null;
    private $write_db = null;
    private static $index = 1;

    // Sensitive-word trie (nested arrays keyed by single UTF-8 characters).
    public $tree_arr = array();
    // Words found by the most recent check() / getWord() calls.
    public $result = array();
    public $exist_arr = array();
    // Trie nodes of the partial matches that are still alive while scanning.
    public $tree_list = array();

    /**
     * Connects to the database and builds the in-memory word trie.
     *
     * The trie is built BEFORE it is used: the previous implementation loaded
     * ./tree.txt first and regenerated it afterwards, so the first run had no
     * trie at all and later runs matched against a stale word list.
     */
    public function __construct()
    {
        $this->db_read_link();
        $this->maketree();
    }

    /**
     * Opens the read connection, sets the connection charset and selects the
     * configured database.
     *
     * Terminates the script with die() when the connection or the database
     * selection fails.
     *
     * @return resource the MySQL link, also stored in $this->read_db
     */
    public function db_read_link()
    {
        $this->read_db = mysql_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
        if (!$this->read_db)
        {
            die('Could not connect: ' . mysql_error());
        }
        mysql_query('SET character_set_connection=' . $this->dbcharset . ', character_set_results=' . $this->dbcharset . ', character_set_client=binary', $this->read_db);
        // Use the configured charset instead of a second hard-coded copy of it.
        mysql_query('SET names ' . $this->dbcharset, $this->read_db);
        if (!mysql_select_db($this->read_dbname, $this->read_db))
        {
            die('Could not select database: ' . mysql_error($this->read_db));
        }
        return $this->read_db;
    }

    /**
     * Inserts one word into the nested-array trie.
     *
     * The word is split into UTF-8 characters; each character becomes one
     * level of $x and the final node receives the word under "_word_".
     * The trie is walked by reference instead of generating PHP source and
     * passing it to eval(), so words containing quotes or backslashes can
     * neither break the script nor inject code.
     *
     * @param string $str word to insert (UTF-8)
     * @param array  $x   trie to insert into, modified in place
     * @return array the updated trie (same content as $x)
     */
    public function str2MutiArray($str, &$x)
    {
        $chars = preg_split('//u', (string) $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false || count($chars) === 0)
        {
            // Invalid UTF-8 or an empty word: there is nothing to insert.
            return $x;
        }
        if (!is_array($x))
        {
            $x = array();
        }
        $node = &$x;
        foreach ($chars as $char)
        {
            if (!isset($node[$char]) || !is_array($node[$char]))
            {
                $node[$char] = array();
            }
            $node = &$node[$char];
        }
        $node[self::WORD_KEY] = $str;
        unset($node); // drop the reference so later writes cannot corrupt the trie
        return $x;
    }

    /**
     * Builds the trie from the word table, stores it in $this->tree_arr and
     * writes a serialized copy to ./tree.txt.
     *
     * Terminates the script with die() when the word list cannot be queried,
     * so that a failed query never produces (and persists) an empty filter.
     *
     * @return array the freshly built trie
     */
    public function maketree()
    {
        $query = mysql_query('select find from ' . $this->dbtabpre . 'common_word where 1', $this->read_db);
        if (!$query)
        {
            die('Could not load word list: ' . mysql_error($this->read_db));
        }
        $x = array();
        while ($row = mysql_fetch_array($query, MYSQL_ASSOC))
        {
            $word = iconv($this->dbcharset, 'utf-8//IGNORE', $row['find']);
            if ($word === false || $word === '')
            {
                // Skip rows that cannot be converted or are empty.
                continue;
            }
            $this->str2MutiArray($word, $x);
        }
        mysql_free_result($query);
        if (file_put_contents('./tree.txt', serialize($x)) === false)
        {
            trigger_error('Could not write ./tree.txt', E_USER_WARNING);
        }
        $this->tree_arr = $x;
        return $x;
    }

    /**
     * Runs the filter over ./message3.txt and prints timing information.
     *
     * Timestamps are taken with microtime(true): the former
     * `"..." . $start = time() . "<hr/>"` expression stored the string
     * "<timestamp><hr/>" in $start (assignment binds looser than
     * concatenation), so the reported difference was computed on non-numeric
     * strings, and time() only has one-second resolution anyway.
     *
     * Possible optimization (from the original note): keep the word library
     * in memory and strip bbcode from posts and blog entries before matching.
     */
    public function match_word()
    {
        $raw = file_get_contents('./message3.txt');
        if ($raw === false)
        {
            trigger_error('Could not read ./message3.txt', E_USER_WARNING);
            return;
        }
        $message = iconv($this->dbcharset, 'utf-8//IGNORE', $raw);
        if ($message === false)
        {
            trigger_error('Could not convert ./message3.txt to UTF-8', E_USER_WARNING);
            return;
        }
        $start = microtime(true);
        echo 'start time:' . $start . '<hr/>';
        $this->check($message);
        $end = microtime(true);
        echo 'end time:' . $end . '<hr/>';
        echo 'diff time:' . ($end - $start) . '<hr/>';
    }

    /**
     * Scans a UTF-8 message and stops at the first sensitive word.
     *
     * Matching state is reset on every call, so one instance can check many
     * messages. The matched word(s) stay available in $this->result and are
     * dumped with var_dump(), which is the script's existing output.
     *
     * @param string $message UTF-8 text to scan (not modified)
     * @return bool true when a sensitive word was found, false otherwise
     */
    public function check(&$message)
    {
        $this->result = array();
        $this->tree_list = array();
        $text = (string) $message;
        $strLen = strlen($text);
        $i = 0;
        while ($i < $strLen)
        {
            // Take the whole code point: UTF-8 characters are 1 to 4 bytes
            // long, not always 3 as the previous implementation assumed.
            $charLen = $this->utf8CharLength(ord($text[$i]));
            $char = substr($text, $i, $charLen);
            $i += $charLen;
            $this->getWord($char);
            if (count($this->result) > 0)
            {
                var_dump($this->result);
                $this->tree_list = array();
                return true;
            }
        }
        $this->tree_list = array();
        return false;
    }

    /**
     * Core matching step: feeds one character to every partial match.
     *
     * A new partial match is started at the trie root when the root has a
     * child for $char. Every partial match is then advanced by $char; matches
     * that cannot advance are dropped, and every match that reaches a
     * terminal node appends its word to $this->result.
     *
     * @param string $char a single UTF-8 character
     */
    public function getWord($char)
    {
        if (is_array($this->tree_arr) && isset($this->tree_arr[$char]))
        {
            $this->tree_list[] = $this->tree_arr;
        }
        $alive = array();
        foreach ($this->tree_list as $node)
        {
            if (!is_array($node) || !isset($node[$char]) || !is_array($node[$char]))
            {
                continue;
            }
            $next = $node[$char];
            // isset() rather than truthiness, so the word "0" is reported too.
            if (isset($next[self::WORD_KEY]))
            {
                $this->result[] = $next[self::WORD_KEY];
            }
            $alive[] = $next;
        }
        $this->tree_list = $alive;
    }

    /**
     * Returns the byte length of the UTF-8 sequence announced by a lead byte.
     *
     * Stray continuation bytes and invalid lead bytes count as one byte so
     * that scanning always moves forward.
     *
     * @param int $leadByte value of the first byte (0-255)
     * @return int sequence length in bytes, 1 to 4
     */
    private function utf8CharLength($leadByte)
    {
        if ($leadByte < 0x80)
        {
            return 1;
        }
        if (($leadByte & 0xE0) === 0xC0)
        {
            return 2;
        }
        if (($leadByte & 0xF0) === 0xE0)
        {
            return 3;
        }
        if (($leadByte & 0xF8) === 0xF0)
        {
            return 4;
        }
        return 1;
    }
}
// Script entry point: constructing the object runs the class constructor
// (database connection and word-tree setup); match_word() then scans
// ./message3.txt and prints the timing output.
$db = new Test_database();
$db->match_word();
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment