Skip to content

Instantly share code, notes, and snippets.

@wangzaixiang
Last active July 9, 2021 12:59
Show Gist options
  • Save wangzaixiang/3422ad6a443ec34504e3d315d7305900 to your computer and use it in GitHub Desktop.
Save wangzaixiang/3422ad6a443ec34504e3d315d7305900 to your computer and use it in GitHub Desktop.
一个高效的敏感词过滤实现,同时也演示一下 FP 的简洁和强大
package demo
import java.lang
import java.util.regex.Pattern
import scala.annotation.tailrec
import scala.collection.{BitSet, mutable}
import scala.io.Source
trait SensitiveEngine {
def setKeywords(words: List[String]): Unit
def process(input: String): String
}
/**
* a fast sensetive word replace util.
*/
object FastEngine extends SensitiveEngine {
class Node(val ch: Char, val depth: Int) {
var isKeyword: Boolean = false
val nexts: mutable.Map[Char, Node] = collection.mutable.Map[Char, Node]()
var fallthrough: Node = _ // TODO 暂时未实现,可以进一步优化,避免回溯,理论上性能会更高
def addWord(word: String): Unit = {
assert(word.nonEmpty)
val char = word.charAt(0)
val remain = word.substring(1)
val next: Node = nexts.get(char) match {
case Some(node) => node
case None =>
val it = new Node(char, depth + 1)
nexts(char) = it
it
}
if (remain.isEmpty) {
next.isKeyword = true
}
else {
next.addWord(remain)
}
}
def getNext(ch: Char): Option[Node] = nexts.get(ch)
}
private val root = new Node(0, 0)
private var rootBitSet: BitSet = _ // the root node have a lot of children, using the BitSet for fast detection, avoid hashmap lookup
override def setKeywords(words: List[String]): Unit = {
words.foreach(addWord)
rootBitSet = buildRootBitSet(root)
}
def addWord(word: String): Unit = {
root.addWord(word)
}
def process(input: String): String = {
process(input, 0, root, new StringBuilder(input.length))
}
@tailrec
def process(input: String, pos: Int, currentNode: Node, builder: StringBuilder): String = {
def emitNodeAsStar(node: Node): Unit = {
var i = 0
while (i < node.depth) {
builder.append('*')
i += 1
}
}
if (pos >= input.length) { // process the last node
if (currentNode.isKeyword) {
emitNodeAsStar(currentNode)
}
else {
val begin = pos - currentNode.depth
builder.append(input.substring(begin, begin + currentNode.depth))
}
builder.toString()
}
else {
val ch = input.charAt(pos)
val nextNode: Option[Node] =
if (currentNode == root) {
if (isRootChar(ch)) currentNode.getNext(ch)
else None
}
else currentNode.getNext(ch)
if(nextNode != None) {
process(input, pos + 1, nextNode.get, builder)
}
else { // nextNode == None
if (currentNode eq root) { // currentNode == root && nextNode == None
builder.append(ch)
process(input, pos + 1, root, builder) // process the next char
}
else { // currentNode != root && nextNode == None
if (currentNode.isKeyword) {
emitNodeAsStar(currentNode)
process(input, pos, root, builder) // continue process at pos
}
else { // TODO avoid backward, using the fallthrough field, then continue at the pos + 1 not begin + 1
val begin = pos - currentNode.depth
builder.append(input.charAt(begin))
process(input, begin + 1, root, builder)
}
}
}
}
}
private def buildRootBitSet(root: Node): BitSet = {
val mutable = collection.mutable.BitSet()
root.nexts.keySet.foreach { ch: Char =>
mutable.add(ch.toInt)
}
mutable.toImmutable
}
private def isRootChar(ch: Char): Boolean = rootBitSet.contains(ch.toInt)
}
/**
* a simple prototype implementation
*/
object PrototypeEngine extends SensitiveEngine {
var pattern: Pattern = _
override def setKeywords(words: List[String]): Unit = {
val regexp = words.mkString("|")
pattern = Pattern.compile(regexp)
}
override def process(input: String): String = {
val matcher = pattern.matcher(input)
matcher.replaceAll("***")
}
}
object Test1 {
val keywords =
"""
福音会
中国教徒
统一教
观音法门
清海无上师
盘古
李洪志
志洪李
李宏志
轮功
法轮
轮法功
三去车仑
氵去车仑
发论工
法x功
法o功
法0功
法一轮一功
轮子功
车仑工力
法lun
fa轮
法lg
flg
fl功
falungong
大法弟子
大纪元
dajiyuan
明慧网
明慧周报
正见网
新唐人
伪火
退党
tuidang
退dang
超越红墙
自fen
真善忍
"""
val keywords2 = """爱女人
|爱液
|按摩棒
|拔出来
|爆草
|包二奶
|暴干
|暴奸
|暴乳
|爆乳
|暴淫
|屄
|被操
|被插
|被干
|逼奸
|仓井空
|插暴
|操逼
|操黑
|操烂
|肏你
|肏死
|操死
|操我
|厕奴
|插比
|插b
|插逼
|插进
|插你
|插我
|插阴
|潮吹
|潮喷
|成人dv
|成人电影
|成人论坛
|成人小说
|成人电
|成人电影
|成人卡通
|成人聊
|成人片""".stripMargin
def buildEngine(engine: SensitiveEngine, kws: List[String]): SensitiveEngine = {
engine.setKeywords(kws)
engine
}
def main(args: Array[String]): Unit = {
val kws = Source.fromString(keywords).getLines().filter(_.nonEmpty).toList
val engine = buildEngine(FastEngine, kws)
val input = "在中国,法轮功是一种邪教,李洪志是邪教的头目,大法弟大法弟子"
val output = engine.process(input)
println(output)
}
def times(label: String, skip: Int, loops: Int)(f: => Unit): Unit = {
0.until(skip).foreach(_ => f)
val tm0 = System.currentTimeMillis()
0.until(loops).foreach(_ => f)
val tm1 = System.currentTimeMillis()
println(s"$label loops $loops time:${tm1 - tm0}ms, ${(tm1 - tm0) * 1.0 / loops}ms/per")
}
def main2(args: Array[String]): Unit = {
val kws = Source.fromString(keywords + "\n" + keywords2).getLines().filter(_.nonEmpty).toList
println(s"keywords:${kws.size}")
val engine1 = buildEngine(FastEngine, kws)
val engine2 = buildEngine(PrototypeEngine, kws)
val input = Source.fromFile("/tmp/app.e96ae46d.js").mkString
times("version1", skip = 1000, loops = 1000) {
engine1.process(input)
}
times("version2", skip = 100, loops = 100) {
engine2.process(input)
}
}
}
package demo
import java.io.{FileOutputStream, PrintWriter}
import java.util.regex.Pattern
import scala.annotation.tailrec
import scala.collection.immutable.IntMap
import scala.collection.{BitSet, mutable}
import scala.io.Source
trait SensitiveEngine {
def setKeywords(words: List[String]): Unit
def process(input: String): String
}
/**
* faster sensetive word replace util.
*/
object FastEngine extends SensitiveEngine {
class Node(val ch: Char, val depth: Int) {
var isKeyword: Boolean = false
var nexts: mutable.Map[Char, Node] = collection.mutable.Map[Char, Node]()
// optimize for small set (10) using array, otherwise using IntMap
private var next_chars: Array[Char] = _
private var next_nodes: Array[Node] = _
private var nn: IntMap[Node] = _
var fallthrough: Node = _ // TODO 暂时未实现,可以进一步优化,避免回溯,理论上性能会更高
def addWord(word: String): Unit = {
assert(word.nonEmpty)
val char = word.charAt(0)
val remain = word.substring(1)
val next: Node = nexts.get(char) match {
case Some(node) => node
case None =>
val it = new Node(char, depth + 1)
nexts(char) = it
it
}
if (remain.isEmpty) {
next.isKeyword = true
}
else {
next.addWord(remain)
}
}
def getNext(ch: Char): Node = {
if(next_chars != null){
var i = 0
while(i < next_chars.length) {
if(next_chars(i) == ch) return next_nodes(i)
i += 1
}
null
}
else nn.getOrElse(ch, null)
}
def afterInit(): Unit ={
if(nexts.size < 10) {
next_chars = nexts.keys.toArray.sorted
next_nodes = next_chars.map(ch => nexts(ch) )
}
else {
nn = IntMap.from( nexts.map( x=> (x._1.toInt, x._2) ) )
}
nexts.values.foreach(_.afterInit())
nexts = null // free memory
}
}
private val root = new Node(0, 0)
private var rootNodes: Array[Node] = _
override def setKeywords(words: List[String]): Unit = {
words.foreach(addWord)
rootNodes = new Array[Node](0x10000)
(0 to 0xFFFF).foreach { ch =>
rootNodes(ch) = root.nexts.getOrElse(ch.toChar, null)
}
root.afterInit()
}
def addWord(word: String): Unit = {
root.addWord(word)
}
def process(input: String): String = {
emits = 0
process(input, input.length, 0, root, new Array[Char](input.length), 0)
}
var emits: Int = _
@tailrec
def process(input: String, inputLength: Int, pos: Int, currentNode: Node, buffer: Array[Char], buffPos: Int): String = {
var _buffPos = buffPos;
if (pos >= inputLength) { // process the last node
if (currentNode.isKeyword) {
var i = 0
while(i < currentNode.depth) {
buffer(_buffPos) = '*'
_buffPos += 1
i += 1
}
emits += 1
}
else {
val begin = pos - currentNode.depth
var i = begin
while(i < pos){
buffer(_buffPos) = input.charAt(i)
_buffPos += 1
i += 1
}
}
new String(buffer)
}
else {
val ch = input.charAt(pos)
val nextNode: Node = // currentNode.getNext(ch)
if (currentNode eq root) getRootNext(ch)
else currentNode.getNext(ch)
if ((currentNode eq root) && (nextNode eq null)) { // currentNode == root && nextNode == None
buffer(_buffPos) = ch; _buffPos += 1
process(input, inputLength, pos+1, root, buffer, _buffPos)
}
else if(!(currentNode eq root) && (nextNode eq null)) { // currentNode != root && nextNode == None
if (currentNode.isKeyword) {
var i = 0
while(i < currentNode.depth){
buffer(_buffPos) = '*'
_buffPos += 1
i += 1
}
emits += 1
process(input, inputLength, pos, root, buffer, _buffPos)
}
else { // TODO avoid backward, using the fallthrough field, then continue at the pos + 1 not begin + 1
val begin = pos - currentNode.depth
buffer(_buffPos) = input.charAt(begin); _buffPos += 1
process(input, inputLength, begin + 1, root, buffer, _buffPos)
}
}
else { // nextNode != null
process(input, inputLength, pos+1, nextNode, buffer, _buffPos)
}
}
}
private def buildRootBitSet(root: Node): BitSet = {
val mutable = collection.mutable.BitSet()
root.nexts.keySet.foreach { ch: Char =>
mutable.add(ch.toInt)
}
mutable.toImmutable
}
private def getRootNext(ch: Char): Node = rootNodes(ch)
}
/**
* a simple prototype implementation
*/
object PrototypeEngine extends SensitiveEngine {
var pattern: Pattern = _
override def setKeywords(words: List[String]): Unit = {
val regexp = words.mkString("|")
pattern = Pattern.compile(regexp)
}
override def process(input: String): String = {
val matcher = pattern.matcher(input)
matcher.replaceAll("***")
}
}
object Test1 {
val keywords =
"""
福音会
中国教徒
统一教
观音法门
清海无上师
盘古
李洪志
志洪李
李宏志
轮功
法轮
轮法功
三去车仑
氵去车仑
发论工
法x功
法o功
法0功
法一轮一功
轮子功
车仑工力
法lun
fa轮
法lg
flg
fl功
falungong
大法弟子
大纪元
dajiyuan
明慧网
明慧周报
正见网
新唐人
伪火
退党
tuidang
退dang
超越红墙
自fen
真善忍
"""
def buildEngine[T <: SensitiveEngine](engine: T, kws: List[String]): T = {
engine.setKeywords(kws)
engine
}
def main1(args: Array[String]): Unit = {
val kws = Source.fromString(keywords).getLines().filter(_.nonEmpty).toList
val engine = buildEngine(FastEngine, kws)
val input = "在中国,法轮功是一种邪教,李洪志是邪教的头目,大法弟大法弟子"
val output = engine.process(input)
println(output)
}
def times(label: String, skip: Int, loops: Int)(f: => Unit): Unit = {
0.until(skip).foreach(_ => f)
val tm0 = System.currentTimeMillis()
0.until(loops).foreach(_ => f)
val tm1 = System.currentTimeMillis()
println(s"$label loops $loops time:${tm1 - tm0}ms, ${(tm1 - tm0) * 1.0 / loops}ms/per")
}
def load(file: String): List[String] = Source.fromResource(file).getLines()
.filter(_.nonEmpty).toList
def main(args: Array[String]): Unit = {
val kws = // Source.fromString(keywords).getLines().filter(_.nonEmpty).toList
load("demo/policy.txt") ++
load("demo/sex.txt") ++
load("demo/sen3.txt")
println(s"keywords:${kws.size}")
val engine1 = buildEngine(FastEngine, kws)
val engine2 = buildEngine(PrototypeEngine, kws)
// val input = Source.fromFile("/tmp/app.e96ae46d.js").mkString
val input = Source.fromFile("/tmp/sen1.txt").mkString
val output = engine1.process(input)
val file = new PrintWriter(new FileOutputStream("/tmp/sen2.txt"))
file.println(output)
file.close()
println(s"emits = ${engine1.emits}")
times("version1", skip = 1000, loops = 10_000) {
engine1.process(input)
}
// times("version2", skip = 100, loops = 100) {
// engine2.process(input)
// }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment