Last active
February 13, 2022 03:20
-
-
Save Eminlin/7496c1435d994a2d01a42b41cb3c106e to your computer and use it in GitHub Desktop.
DFA算法构建敏感词匹配
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Sensitive-word detector backed by a character trie (DFA).
 *
 * Words are split into UTF-8 characters and stored as nested arrays keyed by
 * character; a node that terminates a word carries the extra key 'end'.
 * Matching is case-insensitive and tolerates "separator" characters inserted
 * between the characters of a word (e.g. "a**b" still matches the word "ab").
 */
class SensitiveController extends Controller
{
    /**
     * Characters that may be inserted between the characters of a word
     * without preventing a match. Same set the original pattern
     * /[ +=*&$#@"')(~_]/ accepted, stored as a lookup table.
     *
     * @var array<string, bool>
     */
    private static $separators = array(
        ' ' => true, '+' => true, '=' => true, '*' => true, '&' => true,
        '$' => true, '#' => true, '@' => true, '"' => true, "'" => true,
        ')' => true, '(' => true, '~' => true, '_' => true,
    );

    /**
     * Trie root: character => child node; terminal nodes also hold 'end' => true.
     *
     * @var array
     */
    private $dict;

    /**
     * Build the trie from a list of sensitive words.
     *
     * @param array|\Traversable $words Words to detect (UTF-8 strings).
     */
    public function __construct($words) {
        $this->dict = array();
        foreach ($words as $word) {
            $chars = $this->unicodeSplit($word);
            if (count($chars) === 0) {
                // An empty (or entirely invalid) word would otherwise flag
                // the trie root itself as a terminal node.
                continue;
            }
            $node = &$this->dict;
            foreach ($chars as $char) {
                if (!isset($node[$char])) {
                    $node[$char] = array();
                }
                $node = &$node[$char];
            }
            $node['end'] = true;
            // Drop the reference so later assignments cannot write into the trie.
            unset($node);
        }
    }

    /**
     * Tell whether the text contains any of the configured words.
     *
     * Separator characters between the characters of a word are ignored, so
     * "a****b" matches the word "ab". Separators that are themselves part of
     * a stored word are matched literally as well; both interpretations are
     * followed in parallel so neither hides the other.
     *
     * @param string $str Text to scan (UTF-8).
     * @return bool True when at least one word is found.
     */
    public function contains($str) {
        // unicodeSplit() already case-folds, so no per-character lowering here.
        $chars = $this->unicodeSplit($str);
        $count = count($chars);
        for ($i = 0; $i < $count; $i++) {
            $first = $chars[$i];
            if (!isset($this->dict[$first])) {
                continue;
            }
            $start = $this->dict[$first];
            if (isset($start['end'])) {
                return true;
            }
            // Active trie positions, keyed by the characters consumed so far
            // so that identical positions reached by different routes merge.
            $active = array($first => $start);
            for ($j = $i + 1; $j < $count && !empty($active); $j++) {
                $char = $chars[$j];
                $skippable = isset(self::$separators[$char]);
                $next = array();
                foreach ($active as $path => $state) {
                    if (isset($state[$char])) {
                        $child = $state[$char];
                        if (isset($child['end'])) {
                            return true;
                        }
                        $next[$path . $char] = $child;
                    }
                    if ($skippable) {
                        // Treat the separator as noise and stay on this node.
                        $next[$path] = $state;
                    }
                }
                $active = $next;
            }
        }
        return false;
    }

    /**
     * Split a UTF-8 string into an array of characters.
     *
     * Bytes that do not form a structurally valid 2-, 3- or 4-byte sequence
     * (stray continuation bytes, truncated sequences, invalid lead bytes) are
     * dropped from the result.
     *
     * @param string $str      Input string.
     * @param bool   $caseword When true, characters are lower-cased.
     * @return array List of single-character strings.
     */
    public function unicodeSplit($str, $caseword = true) {
        $str = (string) $str;
        if ($caseword) {
            // ASCII-only, locale-independent folding: byte-wise strtolower()
            // can alter bytes >= 0x80 under some locales before PHP 8.2 and
            // thereby corrupt multi-byte sequences.
            $str = strtr($str, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');
        }
        $ret = array();
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);
            if ($c < 0x80) {
                $ret[] = $str[$i];
                continue;
            }
            // Sequence length announced by the lead byte.
            if (($c & 0xf8) == 0xf0) {
                $size = 4;
            } else if (($c & 0xf0) == 0xe0) {
                $size = 3;
            } else if (($c & 0xe0) == 0xc0) {
                $size = 2;
            } else {
                continue; // stray continuation byte or invalid lead byte
            }
            if ($len - $i < $size) {
                continue; // sequence truncated by end of string
            }
            $valid = true;
            for ($k = 1; $k < $size; $k++) {
                if ((ord($str[$i + $k]) & 0xc0) != 0x80) {
                    $valid = false;
                    break;
                }
            }
            if (!$valid) {
                // Drop only the lead byte; following bytes are re-examined.
                continue;
            }
            $uc = substr($str, $i, $size);
            $ret[] = $caseword ? $this->foldMultibyteChar($uc) : $uc;
            $i += $size - 1;
        }
        return $ret;
    }

    /**
     * Lower-case a single multi-byte character when mbstring is available.
     *
     * Sequences mbstring does not accept as UTF-8 are returned untouched so
     * they are never rewritten into a substitution character.
     *
     * @param string $char One multi-byte UTF-8 sequence.
     * @return string Lower-cased form, or the input when it cannot be folded.
     */
    private function foldMultibyteChar($char) {
        if (function_exists('mb_strtolower') && mb_check_encoding($char, 'UTF-8')) {
            return mb_strtolower($char, 'UTF-8');
        }
        return $char;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment