Skip to content

Instantly share code, notes, and snippets.

@Eminlin
Last active February 13, 2022 03:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Eminlin/7496c1435d994a2d01a42b41cb3c106e to your computer and use it in GitHub Desktop.
Save Eminlin/7496c1435d994a2d01a42b41cb3c106e to your computer and use it in GitHub Desktop.
DFA算法构建敏感词匹配
<?php
/**
 * Sensitive-word detector backed by a character trie (DFA-style matching).
 *
 * The constructor turns a word list into a nested-array trie keyed by single
 * UTF-8 characters; contains() then walks that trie from every position of
 * the input text.
 *
 * NOTE(review): case folding is delegated to strtolower(), which works on
 * bytes; non-ASCII letters are presumably not matched case-insensitively —
 * confirm whether mb_strtolower() is wanted here.
 */
class SensitiveController extends Controller
{
    /**
     * Trie of sensitive words.
     *
     * Each level maps one character to its child node (an array). A node that
     * carries the key 'end' terminates a complete word. Keys produced by
     * unicodeSplit() are always exactly one character, so they can never
     * collide with the three-character 'end' marker.
     *
     * @var array
     */
    private $dict;

    /**
     * Build the trie from a list of sensitive words.
     *
     * @param array|Traversable $words Words to register; each one is split
     *                                 (and lower-cased) with unicodeSplit().
     */
    public function __construct($words)
    {
        $this->dict = array();

        foreach ($words as $word) {
            $chars = $this->unicodeSplit($word);

            // An empty word (or one made only of bytes the splitter drops)
            // has no path in the trie; registering it would flag the root
            // itself as the end of a word.
            if (count($chars) === 0) {
                continue;
            }

            // Walk down from the root, creating missing nodes on the way.
            $node = &$this->dict;
            foreach ($chars as $char) {
                if (!isset($node[$char])) {
                    $node[$char] = array();
                }
                $node = &$node[$char];
            }
            $node['end'] = true;

            // Drop the reference so the cursor does not stay bound to the
            // last node once this word is done.
            unset($node);
        }
    }

    /**
     * Tell whether the text contains at least one registered word.
     *
     * Matching is attempted from every character position. While a word is
     * being followed, characters from a fixed separator set are skipped when
     * they are not themselves part of the word, so a word broken up with
     * such characters is still found (e.g. "b**ad" matches the word "bad").
     *
     * @param string $str Text to inspect.
     *
     * @return bool True as soon as any registered word is found.
     */
    public function contains($str)
    {
        // unicodeSplit() lower-cases by default, so tokens are compared
        // against the (also lower-cased) trie keys as-is.
        $chars = $this->unicodeSplit($str);
        $count = count($chars);

        // Characters that may be interleaved inside a word to disguise it.
        // All are single-byte, so a multi-byte token never matches.
        $separators = ' +=*&$#@"\')(~_';

        for ($i = 0; $i < $count; $i++) {
            if (!isset($this->dict[$chars[$i]])) {
                continue;
            }

            $node = $this->dict[$chars[$i]];
            if (isset($node['end'])) {
                return true;
            }

            for ($j = $i + 1; $j < $count; $j++) {
                $char = $chars[$j];

                if (isset($node[$char])) {
                    $node = $node[$char];
                    if (isset($node['end'])) {
                        return true;
                    }
                } elseif (strpos($separators, $char) === false) {
                    // Neither a continuation of the word nor a separator:
                    // no word starts at position $i.
                    break;
                }
            }
        }

        return false;
    }

    /**
     * Split a UTF-8 string into an array of single characters.
     *
     * Bytes that do not form a well-structured 1- to 4-byte UTF-8 sequence
     * (stray continuation bytes, unsupported lead bytes, truncated or broken
     * sequences) are silently dropped one byte at a time.
     *
     * @param string $str      Input string.
     * @param bool   $caseword When true (default) the string is passed
     *                         through strtolower() before splitting.
     *
     * @return array List of characters, each a 1- to 4-byte string.
     */
    public function unicodeSplit($str, $caseword = true)
    {
        if ($caseword) {
            $str = strtolower($str);
        }

        $ret = array();
        $len = strlen($str);
        $i = 0;

        while ($i < $len) {
            $lead = ord($str[$i]);

            // Plain ASCII byte.
            if ($lead < 0x80) {
                $ret[] = $str[$i];
                $i++;
                continue;
            }

            // Sequence length announced by the lead byte (0 = not a lead byte).
            if (($lead & 0xf8) === 0xf0) {
                $width = 4;
            } elseif (($lead & 0xf0) === 0xe0) {
                $width = 3;
            } elseif (($lead & 0xe0) === 0xc0) {
                $width = 2;
            } else {
                $width = 0;
            }

            // The sequence must fit in the string and every trailing byte
            // must be a continuation byte (10xxxxxx).
            $valid = $width > 0 && $i + $width <= $len;
            for ($k = 1; $valid && $k < $width; $k++) {
                $valid = (ord($str[$i + $k]) & 0xc0) === 0x80;
            }

            if ($valid) {
                $ret[] = substr($str, $i, $width);
                $i += $width;
            } else {
                // Skip just this byte and resynchronise on the next one.
                $i++;
            }
        }

        return $ret;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment