Skip to content

Instantly share code, notes, and snippets.

@geniuszxy
Last active February 28, 2020 08:11
Show Gist options
  • Save geniuszxy/fb2fa8c126986ef4e515624a3cad367f to your computer and use it in GitHub Desktop.
Save geniuszxy/fb2fa8c126986ef4e515624a3cad367f to your computer and use it in GitHub Desktop.
Efficient trie implemention without multiple dictionaries (or linked lists)
using System;
using System.Collections;
using System.Collections.Generic;
namespace Filter
{
using NodeKey = System.UInt64;
public class Trie
{
private struct NodeValue
{
// ID of this node
public int id;
// Is this the last character of a word?
public bool end;
public NodeValue(int id, bool end)
{
this.id = id;
this.end = end;
}
}
private struct RemoveHelper
{
public NodeKey key;
public bool end;
public RemoveHelper(NodeKey key, bool end)
{
this.key = key;
this.end = end;
}
}
// All nodes
private Dictionary<NodeKey, NodeValue> _nodes = new Dictionary<NodeKey, NodeValue>();
// The last inserted node ID
private int _maxId = 0;
// Combine a character and an ID
private static NodeKey MakeKey(char c, int prevId)
{
return (NodeKey)((prevId << 16) | c);
}
/// <summary>
/// Add a word
/// </summary>
public bool Add(string word)
{
if (word == null)
return false;
int len = word.Length, prevId = 0;
for (int i = 0; i < len;)
{
//TODO skip punctuation, whitespace, symbol?
NodeKey key = MakeKey(word[i], prevId);
NodeValue value;
++i;
if (!_nodes.TryGetValue(key, out value))
_nodes.Add(key, new NodeValue(prevId = ++_maxId, i == len));
// Only stores the shortest sequence
else if (value.end)
return false;
// Re-add the value, mark it is the end
else if (i == len)
_nodes[key] = new NodeValue(value.id, true);
else
prevId = value.id;
}
return true;
}
/// <summary>
/// Remove a word
/// </summary>
public bool Remove(string word)
{
if (word == null)
return false;
int len = word.Length, prevId = 0;
RemoveHelper[] helpers = new RemoveHelper[len];
for (int i = 0; i < len; i++)
{
NodeKey key = MakeKey(word[i], prevId);
NodeValue value;
if (!_nodes.TryGetValue(key, out value))
return false;
prevId = value.id;
helpers[i] = new RemoveHelper(key, value.end);
}
// The last character must be the end of a word
if (!helpers[--len].end)
return false;
// Make sure there is no longer sequences
foreach (var key in _nodes.Keys)
{
if ((int)(key >> 16) == prevId)
{
// Just remove the end flag
_nodes[helpers[len].key] = new NodeValue(prevId, false);
return true;
}
}
// Remove from the last character, until we find a shorter sequence.
do _nodes.Remove(helpers[len].key);
while (--len >= 0 && !helpers[len].end);
return true;
}
private struct Iterator : IEnumerable<string>, IEnumerator<string>
{
private Dictionary<NodeKey, NodeValue> nodes;
private string text;
private int i, j;
public Iterator(string text, Trie root)
{
this.text = text;
nodes = root._nodes;
i = j = -1;
}
public void Replace(char[] chars, char replace)
{
for (int k = i; k <= j; k++)
chars[k] = replace;
}
public void Remove(char[] chars, ref int length)
{
int jnext = j + 1;
if (jnext < length)
Array.Copy(chars, jnext, chars, i, length - jnext);
length -= jnext - i;
}
public bool MoveNext()
{
int len = text.Length;
while (++i < len)
{
// Does the first character match ?
NodeValue value;
if (!nodes.TryGetValue(MakeKey(text[i], 0), out value))
continue;
// The sequence contains only 1 character
if (value.end)
{
j = i;
return true;
}
// Match the remain sequences
for (j = i + 1; j < len; j++)
{
char c = text[j];
// Skip punctuation, whitespace, symbol
if (char.IsPunctuation(c) || char.IsWhiteSpace(c) || char.IsSymbol(c))
continue;
// Try to match the next character
if (!nodes.TryGetValue(MakeKey(c, value.id), out value))
break;
if (value.end)
return true;
}
}
return false;
}
public string Current
{
get { return text.Substring(i, j - i + 1); }
}
object IEnumerator.Current
{
get { return Current; }
}
public IEnumerator<string> GetEnumerator()
{
return this;
}
IEnumerator IEnumerable.GetEnumerator()
{
return this;
}
public void Dispose()
{
}
public void Reset()
{
}
}
private Iterator CreateIterator(string text)
{
return new Iterator(text, this);
}
/// <summary>
/// Does the text contains any added sequence?
/// </summary>
public bool Contains(string text)
{
return CreateIterator(text).MoveNext();
}
/// <summary>
/// Find the first occurence in the text
/// </summary>
public string First(string text)
{
var itor = CreateIterator(text);
if (itor.MoveNext())
return itor.Current;
return string.Empty;
}
/// <summary>
/// Find all occurences in the text
/// </summary>
public IEnumerable<string> All(string text)
{
return CreateIterator(text);
}
/// <summary>
/// Replace the filted sequence in the text with a specific character
/// </summary>
public string Replace(string text, char replace)
{
char[] chars = null;
var itor = CreateIterator(text);
while (itor.MoveNext())
itor.Replace(chars ?? (chars = text.ToCharArray()), replace);
return chars == null ? text : new string(chars);
}
/// <summary>
/// Remove the filted sequence in the text
/// </summary>
public string Filter(string text)
{
char[] chars = null;
int length = text.Length;
var itor = CreateIterator(text);
while (itor.MoveNext())
itor.Remove(chars ?? (chars = text.ToCharArray()), ref length);
return chars == null ? text : new string(chars, 0, length);
}
}
}
@geniuszxy
Copy link
Author

Usage

var trie = new Filter.Trie();
foreach (var t in filterList)
    trie.Add(t);

// Replace
text = trie.Replace(text, '*');

// Filter
text = trie.Filter(text);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment