Last active
February 28, 2020 08:11
-
-
Save geniuszxy/fb2fa8c126986ef4e515624a3cad367f to your computer and use it in GitHub Desktop.
Efficient trie implemention without multiple dictionaries (or linked lists)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections; | |
using System.Collections.Generic; | |
namespace Filter | |
{ | |
using NodeKey = System.UInt64; | |
public class Trie | |
{ | |
private struct NodeValue | |
{ | |
// ID of this node | |
public int id; | |
// Is this the last character of a word? | |
public bool end; | |
public NodeValue(int id, bool end) | |
{ | |
this.id = id; | |
this.end = end; | |
} | |
} | |
private struct RemoveHelper | |
{ | |
public NodeKey key; | |
public bool end; | |
public RemoveHelper(NodeKey key, bool end) | |
{ | |
this.key = key; | |
this.end = end; | |
} | |
} | |
// All nodes | |
private Dictionary<NodeKey, NodeValue> _nodes = new Dictionary<NodeKey, NodeValue>(); | |
// The last inserted node ID | |
private int _maxId = 0; | |
// Combine a character and an ID | |
private static NodeKey MakeKey(char c, int prevId) | |
{ | |
return (NodeKey)((prevId << 16) | c); | |
} | |
/// <summary> | |
/// Add a word | |
/// </summary> | |
public bool Add(string word) | |
{ | |
if (word == null) | |
return false; | |
int len = word.Length, prevId = 0; | |
for (int i = 0; i < len;) | |
{ | |
//TODO skip punctuation, whitespace, symbol? | |
NodeKey key = MakeKey(word[i], prevId); | |
NodeValue value; | |
++i; | |
if (!_nodes.TryGetValue(key, out value)) | |
_nodes.Add(key, new NodeValue(prevId = ++_maxId, i == len)); | |
// Only stores the shortest sequence | |
else if (value.end) | |
return false; | |
// Re-add the value, mark it is the end | |
else if (i == len) | |
_nodes[key] = new NodeValue(value.id, true); | |
else | |
prevId = value.id; | |
} | |
return true; | |
} | |
/// <summary> | |
/// Remove a word | |
/// </summary> | |
public bool Remove(string word) | |
{ | |
if (word == null) | |
return false; | |
int len = word.Length, prevId = 0; | |
RemoveHelper[] helpers = new RemoveHelper[len]; | |
for (int i = 0; i < len; i++) | |
{ | |
NodeKey key = MakeKey(word[i], prevId); | |
NodeValue value; | |
if (!_nodes.TryGetValue(key, out value)) | |
return false; | |
prevId = value.id; | |
helpers[i] = new RemoveHelper(key, value.end); | |
} | |
// The last character must be the end of a word | |
if (!helpers[--len].end) | |
return false; | |
// Make sure there is no longer sequences | |
foreach (var key in _nodes.Keys) | |
{ | |
if ((int)(key >> 16) == prevId) | |
{ | |
// Just remove the end flag | |
_nodes[helpers[len].key] = new NodeValue(prevId, false); | |
return true; | |
} | |
} | |
// Remove from the last character, until we find a shorter sequence. | |
do _nodes.Remove(helpers[len].key); | |
while (--len >= 0 && !helpers[len].end); | |
return true; | |
} | |
private struct Iterator : IEnumerable<string>, IEnumerator<string> | |
{ | |
private Dictionary<NodeKey, NodeValue> nodes; | |
private string text; | |
private int i, j; | |
public Iterator(string text, Trie root) | |
{ | |
this.text = text; | |
nodes = root._nodes; | |
i = j = -1; | |
} | |
public void Replace(char[] chars, char replace) | |
{ | |
for (int k = i; k <= j; k++) | |
chars[k] = replace; | |
} | |
public void Remove(char[] chars, ref int length) | |
{ | |
int jnext = j + 1; | |
if (jnext < length) | |
Array.Copy(chars, jnext, chars, i, length - jnext); | |
length -= jnext - i; | |
} | |
public bool MoveNext() | |
{ | |
int len = text.Length; | |
while (++i < len) | |
{ | |
// Does the first character match ? | |
NodeValue value; | |
if (!nodes.TryGetValue(MakeKey(text[i], 0), out value)) | |
continue; | |
// The sequence contains only 1 character | |
if (value.end) | |
{ | |
j = i; | |
return true; | |
} | |
// Match the remain sequences | |
for (j = i + 1; j < len; j++) | |
{ | |
char c = text[j]; | |
// Skip punctuation, whitespace, symbol | |
if (char.IsPunctuation(c) || char.IsWhiteSpace(c) || char.IsSymbol(c)) | |
continue; | |
// Try to match the next character | |
if (!nodes.TryGetValue(MakeKey(c, value.id), out value)) | |
break; | |
if (value.end) | |
return true; | |
} | |
} | |
return false; | |
} | |
public string Current | |
{ | |
get { return text.Substring(i, j - i + 1); } | |
} | |
object IEnumerator.Current | |
{ | |
get { return Current; } | |
} | |
public IEnumerator<string> GetEnumerator() | |
{ | |
return this; | |
} | |
IEnumerator IEnumerable.GetEnumerator() | |
{ | |
return this; | |
} | |
public void Dispose() | |
{ | |
} | |
public void Reset() | |
{ | |
} | |
} | |
private Iterator CreateIterator(string text) | |
{ | |
return new Iterator(text, this); | |
} | |
/// <summary> | |
/// Does the text contains any added sequence? | |
/// </summary> | |
public bool Contains(string text) | |
{ | |
return CreateIterator(text).MoveNext(); | |
} | |
/// <summary> | |
/// Find the first occurence in the text | |
/// </summary> | |
public string First(string text) | |
{ | |
var itor = CreateIterator(text); | |
if (itor.MoveNext()) | |
return itor.Current; | |
return string.Empty; | |
} | |
/// <summary> | |
/// Find all occurences in the text | |
/// </summary> | |
public IEnumerable<string> All(string text) | |
{ | |
return CreateIterator(text); | |
} | |
/// <summary> | |
/// Replace the filted sequence in the text with a specific character | |
/// </summary> | |
public string Replace(string text, char replace) | |
{ | |
char[] chars = null; | |
var itor = CreateIterator(text); | |
while (itor.MoveNext()) | |
itor.Replace(chars ?? (chars = text.ToCharArray()), replace); | |
return chars == null ? text : new string(chars); | |
} | |
/// <summary> | |
/// Remove the filted sequence in the text | |
/// </summary> | |
public string Filter(string text) | |
{ | |
char[] chars = null; | |
int length = text.Length; | |
var itor = CreateIterator(text); | |
while (itor.MoveNext()) | |
itor.Remove(chars ?? (chars = text.ToCharArray()), ref length); | |
return chars == null ? text : new string(chars, 0, length); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage