Last active
December 14, 2015 16:19
-
-
Save zubinJiang/5114701 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Remove the execution time limit for this script run.
set_time_limit(0);
/**
 * Sensitive-word matcher backed by a character trie.
 *
 * The word list is read from the `<prefix>common_word` table (column `find`,
 * GBK-encoded), converted to UTF-8 and stored as a nested array in which each
 * level is keyed by one UTF-8 character and the key `_word_` marks the end of
 * a complete word. Text is then scanned one character at a time while a list
 * of "active" trie positions is advanced in parallel.
 */
class Test_database
{
    // Read-connection settings (left blank in the source; fill in before use).
    private $read_dbhost = '';
    private $read_dbuser = '';
    private $read_dbpw = '';
    private $read_dbname = '';
    private $dbcharset = 'gbk';
    private $dbtabpre = 'pre_';
    private $read_db = null;
    private $write_db = null;
    private static $index = 1;

    // Sensitive-word trie (nested arrays keyed by single UTF-8 characters).
    public $tree_arr = array();
    // Words found by the most recent check() call.
    public $result = array();
    public $exist_arr = array();
    // Trie nodes that are still partial matches for the text read so far.
    public $tree_list = array();

    /**
     * Connects to the database and builds the trie.
     *
     * The trie is taken directly from maketree() rather than unserialized from
     * ./tree.txt: the file is only written by maketree(), so reading it first
     * yields nothing on the first run and a stale tree afterwards.
     */
    public function __construct()
    {
        $this->db_read_link();
        $this->maketree();
    }

    /**
     * Opens the read connection, sets the connection charset and selects the
     * database. Terminates the script with a message on any failure.
     *
     * Uses mysqli because the original mysql_* API no longer exists in PHP 7+.
     *
     * @return mysqli the open connection (also stored in $this->read_db)
     */
    public function db_read_link()
    {
        // PHP 8.1+ makes mysqli throw by default; switch that off so the
        // explicit return-value checks below behave the same on every version.
        mysqli_report(MYSQLI_REPORT_OFF);

        $link = mysqli_connect($this->read_dbhost, $this->read_dbuser, $this->read_dbpw);
        if (!$link)
        {
            die('Could not connect: ' . mysqli_connect_error());
        }
        // Equivalent of the former "SET names GBK", driven by $dbcharset.
        if (!mysqli_set_charset($link, $this->dbcharset))
        {
            die('Could not set charset: ' . mysqli_error($link));
        }
        if (!mysqli_select_db($link, $this->read_dbname))
        {
            die('Could not select database: ' . mysqli_error($link));
        }

        $this->read_db = $link;
        return $this->read_db;
    }

    /**
     * Inserts one word into a nested-array trie.
     *
     * Walks/creates one array level per UTF-8 character of $str and stores the
     * whole word under the `_word_` key of the final level. Implemented with
     * references instead of eval(), so quotes or backslashes in a word can
     * neither break nor inject code.
     *
     * @param string $str word to insert (UTF-8)
     * @param array  $x   trie root, modified in place
     * @return array the updated trie root
     */
    public function str2MutiArray($str, &$x)
    {
        $str = (string) $str;
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false)
        {
            // Not valid UTF-8: leave the trie untouched.
            return $x;
        }
        if (!is_array($x))
        {
            $x = array();
        }

        $node = &$x;
        foreach ($chars as $ch)
        {
            if (!isset($node[$ch]) || !is_array($node[$ch]))
            {
                $node[$ch] = array();
            }
            $node = &$node[$ch];
        }
        $node['_word_'] = $str;
        unset($node); // drop the reference so later writes cannot leak into the trie

        return $x;
    }

    /**
     * Rebuilds the trie from the word table, stores it in $this->tree_arr and
     * writes a serialized copy to ./tree.txt.
     */
    public function maketree()
    {
        $rows = mysqli_query($this->read_db, 'select find from ' . $this->dbtabpre . 'common_word where 1');
        if ($rows === false)
        {
            die('Could not load word list: ' . mysqli_error($this->read_db));
        }

        $tree = array();
        while ($row = mysqli_fetch_assoc($rows))
        {
            $word = iconv("gbk", "utf-8//IGNORE", $row["find"]);
            if ($word === false || $word === '')
            {
                continue; // unconvertible or empty entry
            }
            $this->str2MutiArray($word, $tree);
        }
        mysqli_free_result($rows);

        $this->tree_arr = $tree;
        file_put_contents("./tree.txt", serialize($tree));
    }

    /**
     * Demo driver: scans ./message3.txt (GBK) for sensitive words and prints
     * the start time, end time and elapsed seconds.
     *
     * Possible optimisation noted by the original author: keep the word
     * library in memory, and strip bbcode from posts/blog text before matching.
     */
    public function match_word()
    {
        $raw = file_get_contents('./message3.txt');
        if ($raw === false)
        {
            die('Could not read ./message3.txt');
        }
        $message = iconv("gbk", "utf-8//IGNORE", $raw);
        if ($message === false)
        {
            $message = '';
        }

        // Capture timestamps before concatenating: "$start = time().'<hr/>'"
        // would store a string and corrupt the subtraction below.
        $start = time();
        echo "start time:" . $start . "<hr/>";
        $this->check($message);
        $end = time();
        echo "end time:" . $end . "<hr/>";
        echo "diff time:" . ($end - $start) . "<hr/>";
    }

    /**
     * Scans $message and stops at the first sensitive word found.
     *
     * Characters are decoded from the UTF-8 lead byte, so 1- to 4-byte
     * sequences are all handled (not only 3-byte ones). Matching state is
     * reset on entry, so repeated calls are independent. On a hit the matched
     * words are dumped with var_dump() and left in $this->result.
     *
     * @param string $message UTF-8 text to scan
     * @return bool true if a sensitive word was found, false otherwise
     */
    public function check(&$message)
    {
        $this->result = array();
        $this->tree_list = array();

        $text = (string) $message;
        $strLen = strlen($text);
        $i = 0;
        while ($i < $strLen)
        {
            $lead = ord($text[$i]);
            if ($lead < 0x80)
            {
                $width = 1;
            }
            elseif (($lead & 0xE0) === 0xC0)
            {
                $width = 2;
            }
            elseif (($lead & 0xF0) === 0xE0)
            {
                $width = 3;
            }
            elseif (($lead & 0xF8) === 0xF0)
            {
                $width = 4;
            }
            else
            {
                $width = 1; // stray continuation/invalid byte: consume it alone
            }
            // substr() clamps at the end of the string, so a truncated
            // sequence cannot read out of bounds.
            $char = substr($text, $i, $width);
            $i += $width;

            $this->getWord($char);
            if (count($this->result) > 0)
            {
                var_dump($this->result);
                return true;
            }
        }

        $this->tree_list = array();
        return false;
    }

    /**
     * Core matching step: feeds one character to every active trie position.
     *
     * A new position at the trie root is started when a word can begin with
     * $char. Every active position is then advanced by $char; positions with
     * no such child are dropped, and positions that land on a `_word_` marker
     * append that word to $this->result.
     *
     * @param string $char a single UTF-8 character
     */
    public function getWord($char)
    {
        if (isset($this->tree_arr[$char]))
        {
            $this->tree_list[] = $this->tree_arr;
        }

        $alive = array();
        foreach ($this->tree_list as $node)
        {
            if (!is_array($node) || !isset($node[$char]) || !is_array($node[$char]))
            {
                continue; // dead end for this partial match
            }
            $next = $node[$char];
            if (isset($next["_word_"]))
            {
                $this->result[] = $next["_word_"];
            }
            $alive[] = $next;
        }
        $this->tree_list = $alive;
    }
}
// Build the matcher (its constructor connects to the database and builds the
// word tree), then run the timed scan of ./message3.txt.
$db = new Test_database();
$db->match_word();
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment