Skip to content

Instantly share code, notes, and snippets.

Created June 5, 2015 22:03
Show Gist options
  • Save anonymous/1e755a5f0880523d3ef3 to your computer and use it in GitHub Desktop.
Save anonymous/1e755a5f0880523d3ef3 to your computer and use it in GitHub Desktop.
A benchmark with a number of implementations for performing word prefix searches versus a built word dictionary, see http://stackoverflow.com/questions/30578450/optimal-compare-algorithm-for-finding-string-matches-in-list-of-strings-c-sharp
namespace TryOuts
{
using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
/// <summary>
/// Common interface
/// </summary>
public interface IWordMatcher
{
void BuildDictionary(IEnumerable<string> wordList);
IEnumerable<string> FindAllPrefixesFor(string searchTerm);
}
/// <summary>
/// Simplest very naive implementation.
/// </summary>
public sealed class NaiveWordMatcher : IWordMatcher
{
private List<string> _uniqueWords;
public void BuildDictionary(IEnumerable<string> wordList)
{
_uniqueWords = wordList.Distinct().ToList();
}
public IEnumerable<string> FindAllPrefixesFor(string searchTerm)
{
foreach (var w in _uniqueWords)
if (searchTerm.StartsWith(w))
yield return w;
}
}
/// <summary>
/// Simple implementation using binary search on a sorted list, see
/// http://stackoverflow.com/a/30604336/2573395, @JimMischel's answer.
/// </summary>
public sealed class JimMischel_SortedWordListMatcher : IWordMatcher
{
private List<string> _uniqueSortedWords;
public void BuildDictionary(IEnumerable<string> wordList)
{
_uniqueSortedWords = wordList.Distinct().ToList();
_uniqueSortedWords.Sort();
}
public IEnumerable<string> FindAllPrefixesFor(string searchTerm)
{
if (searchTerm.Length > 0)
{
var prefixLen = 1;
var prefix = searchTerm.Substring(0, prefixLen);
var ndx = _uniqueSortedWords.BinarySearch(prefix);
if (ndx < 0)
ndx = ~ndx;
while (prefixLen <= searchTerm.Length && ndx < _uniqueSortedWords.Count)
{
var cmp = String.Compare(_uniqueSortedWords[ndx], prefix);
if (cmp == 0)
{
yield return prefix;
ndx++;
}
else if (cmp > 0)
{
if (_uniqueSortedWords[ndx].Length <= prefixLen || _uniqueSortedWords[ndx][0] != prefix[0])
break;
if (++prefixLen <= searchTerm.Length)
prefix = searchTerm.Substring(0, prefixLen);
}
else
{
ndx++;
}
}
}
}
}
/// <summary>
/// Simple implementation using binary search on a sorted list, see
/// http://stackoverflow.com/a/30604336/2573395, @JimMischel's answer.
/// </summary>
public sealed class JimMischel_DictSortedWordListMatcher : IWordMatcher
{
public Dictionary<char, List<string>> MyDictionary;
public void BuildDictionary(IEnumerable<string> wordList)
{
MyDictionary = wordList
.Where(x => x.Length > 0)
.GroupBy(x => x[0])
.ToDictionary(
g => g.Key,
g => g.Distinct().OrderBy(x => x).Select(x => x.Substring(1)).ToList());
}
public IEnumerable<string> FindAllPrefixesFor(string searchTerm)
{
var list = default(List<string>);
if (searchTerm.Length > 0 && MyDictionary.TryGetValue(searchTerm[0], out list))
{
if (list[0].Length == 0)
yield return searchTerm.Substring(0, 1);
if (searchTerm.Length > 1)
{
var prefix = searchTerm.Substring(1, 1);
var ndx = list.BinarySearch(prefix);
if (ndx < 0)
ndx = ~ndx;
var toMatch = searchTerm.Substring(1);
while (ndx < list.Count)
{
var cmp = String.Compare(list[ndx], 0, toMatch, 0, list[ndx].Length);
if (cmp == 0)
{
yield return searchTerm.Substring(0, list[ndx].Length + 1);
ndx++;
}
else if (cmp > 0)
break;
else
ndx++;
}
}
}
}
}
/// <summary>
/// Slightly reworked version of @MattyMerrix CompareHelper (see
/// http://stackoverflow.com/a/30597532/2573395).
/// </summary>
public sealed class MattyMerrix_CompareHelper : IWordMatcher
{
private readonly StringBuilder ThisWord = new StringBuilder();
private List<string> CurrentWordList;
public Dictionary<char, List<string>> MyDictionary;
public void BuildDictionary(IEnumerable<string> wordList)
{
MyDictionary = wordList
.Where(x => x.Length > 0)
.GroupBy(x => x[0])
.ToDictionary(
g => g.Key,
g => g.Distinct().OrderBy(x => x).ToList());
}
public IEnumerable<string> FindAllPrefixesFor(string searchTerm)
{
int index = 0;
if (InitCompare(searchTerm[index++]))
yield return ThisWord.ToString();
if (CurrentWordList != null)
{
while (CurrentWordList.Count > 0 && index < searchTerm.Length)
if (NextCompare(index, searchTerm[index++]))
yield return ThisWord.ToString();
}
}
private bool InitCompare(char firstChar)
{
ThisWord.Clear();
if (MyDictionary.TryGetValue(firstChar, out CurrentWordList))
{
ThisWord.Append(firstChar);
return CurrentWordList[0].Length == 1;
}
return false;
}
private bool NextCompare(int index, char nextChar)
{
ThisWord.Append(nextChar);
CurrentWordList = CurrentWordList
.Where(word => (word.Length > index && word[index] == nextChar))
.ToList();
return (CurrentWordList.Count > 0 && CurrentWordList[0].Length == ThisWord.Length);
}
}
/// <summary>
/// Uses a simple non-compressed trie, storing a single character
/// per node and a dictionary for children. This is fast, but requires
/// a lot of memory overhead for each node.
/// </summary>
public sealed class SimpleTrieWordMatcher : IWordMatcher
{
private readonly Node _root = new Node(); // root represents empty string.
public void BuildDictionary(IEnumerable<string> wordList)
{
foreach (var word in wordList)
AddWord(word);
}
public void AddWord(string word)
{
var ndx = 0;
var cur = _root;
while (cur != null)
cur = cur.Add(word, ndx++);
}
public IEnumerable<string> FindAllPrefixesFor(string searchTerm)
{
var ndx = 0;
var cur = _root;
while (cur != null)
{
if (cur.IsTerminal)
yield return searchTerm.Substring(0, ndx);
cur = cur.Find(searchTerm, ndx++);
}
}
private class Node
{
public Dictionary<char, Node> Children;
public bool IsTerminal; // whether a full word ends here.
public Node Find(string word, int index)
{
var child = default(Node);
if (index < word.Length && Children != null)
Children.TryGetValue(word[index], out child);
return child;
}
public Node Add(string word, int toConsume)
{
var child = default(Node);
if (toConsume == word.Length)
this.IsTerminal = true;
else if (Children == null || !Children.TryGetValue(word[toConsume], out child))
{
if (Children == null)
Children = new Dictionary<char, Node>();
Children[word[toConsume]] = child = new Node();
}
return child;
}
}
}
/// <summary>
/// A simple compressed prefix trie implementation. It uses a
/// dictionary with default initial capacity for storing child nodes.
/// Very fast, but memory space requirements can be substantially
/// improved.
/// </summary>
public sealed class SimpleCompressedTrieWordMatcher : IWordMatcher
{
private readonly Node _root = new Node();
public void BuildDictionary(IEnumerable<string> wordList)
{
foreach (var word in wordList)
AddWord(word);
}
public void AddWord(string word)
{
var startIndex = 0;
var current = _root;
while (current != null)
current = current.Add(word, ref startIndex);
}
public IEnumerable<string> FindAllPrefixesFor(string searchTerm)
{
var startNdx = 0;
var cur = _root;
while (cur != null)
{
var matchLen = 0;
cur = cur.Find(searchTerm, ref startNdx, out matchLen);
if (matchLen >= 0)
yield return searchTerm.Substring(0, matchLen);
};
}
private class Node
{
private Dictionary<char, Node> _children;
public string PrefixValue = string.Empty;
public bool IsTerminal;
public Node Add(string word, ref int startIndex)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
if (n == PrefixValue.Length) // full prefix match
{
if (startIndex == word.Length) // full match
IsTerminal = true;
else
return AddToChild(word, ref startIndex);
}
else // partial match, need to split this node's prefix.
SplittingAdd(word, n, ref startIndex);
return null;
}
public Node Find(string word, ref int startIndex, out int matchLen)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
matchLen = -1;
if (n == PrefixValue.Length)
{
if (IsTerminal)
matchLen = startIndex;
var child = default(Node);
if (_children != null && startIndex < word.Length && _children.TryGetValue(word[startIndex], out child))
{
startIndex++; // consumed map key character.
return child;
}
}
return null;
}
private Node AddToChild(string word, ref int startIndex)
{
var key = word[startIndex++]; // consume the mapping character
var nextNode = default(Node);
if (_children == null)
_children = new Dictionary<char, Node>();
else if (_children.TryGetValue(key, out nextNode))
return nextNode;
var remainder = word.Substring(startIndex);
_children[key] = new Node() { PrefixValue = remainder, IsTerminal = true };
return null; // consumed.
}
private void SplittingAdd(string word, int n, ref int startIndex)
{
var curChildren = _children;
_children = new Dictionary<char, Node>();
_children[PrefixValue[n]] = new Node() {
PrefixValue = this.PrefixValue.Substring(n + 1),
IsTerminal = this.IsTerminal,
_children = curChildren
};
PrefixValue = PrefixValue.Substring(0, n);
IsTerminal = startIndex == word.Length;
if (!IsTerminal)
{
var prefix = word.Length > startIndex + 1 ? word.Substring(startIndex + 1) : string.Empty;
_children[word[startIndex]] = new Node() { PrefixValue = prefix, IsTerminal = true };
startIndex++;
}
}
private int FindSharedPrefix(string word, int startIndex)
{
var n = Math.Min(PrefixValue.Length, word.Length - startIndex);
var len = 0;
while (len < n && PrefixValue[len] == word[len + startIndex])
len++;
return len;
}
}
}
/// <summary>
/// Interface used in testing, to have different sources that can
/// generate a dictionary word list and a list of search terms.
/// </summary>
public interface IWordListProvider
{
string[] GenerateDictionary(int nStrings, int maxLen, Random rnd);
string[] GenerateSearchWords(int nStrings, Random rnd);
}
/// <summary>
/// Generates random character sequences
/// </summary>
public class RandomWordListGenerator : IWordListProvider
{
private static readonly char[] Alphabet = Enumerable.Range(0, 26).Select(x => (char)('A' + x)).ToArray();
private int _maxLen;
public string[] GenerateDictionary(int nStrings, int maxLen, Random rnd)
{
_maxLen = maxLen;
return GenerateRandomStrings(Alphabet, nStrings, _maxLen, rnd);
}
public string[] GenerateSearchWords(int nStrings, Random rnd)
{
return GenerateRandomStrings(Alphabet, nStrings, _maxLen, rnd);
}
private static string[] GenerateRandomStrings(char[] alphabet, int nStrings, int maxLen, Random rnd)
{
var alphabetLen = alphabet.Length;
var result = new string[nStrings];
var buffer = new char[maxLen];
for (var wi = 0; wi < nStrings; wi++)
{
var n = 1 + rnd.Next(maxLen);
for (var ci = 0; ci < n; ci++)
buffer[ci] = alphabet[rnd.Next(alphabetLen)];
result[wi] = new string(buffer, 0, n);
}
return result;
}
}
/// <summary>
/// Loads words from a word list file, that optionally has a delimited format.
/// </summary>
public class FileWordListGenerator : IWordListProvider
{
private static readonly string[] _seps = { " ", "\t", ",", ";", ":", "|", ".", "?", "!" };
private readonly List<string> _wordList = new List<string>();
private int _maxLen;
public FileWordListGenerator(string fileName, bool upcase)
{
if (upcase)
{
foreach (var line in File.ReadLines(fileName))
_wordList.AddRange(line.Split(_seps, StringSplitOptions.RemoveEmptyEntries).Select(x => x.ToUpperInvariant()));
}
else
{
foreach (var line in File.ReadLines(fileName))
_wordList.AddRange(line.Split(_seps, StringSplitOptions.RemoveEmptyEntries));
}
}
public string[] GenerateDictionary(int nStrings, int maxLen, Random rnd)
{
_maxLen = maxLen;
return SelectRandomStrings(nStrings, rnd);
}
public string[] GenerateSearchWords(int nStrings, Random rnd)
{
return SelectRandomStrings(nStrings, rnd);
}
private string[] SelectRandomStrings(int nStrings, Random rnd)
{
var result = new string[nStrings];
var i = 0;
while (i < nStrings)
{
result[i] = _wordList[rnd.Next(_wordList.Count)];
if (result[i].Length <= _maxLen)
i++;
}
return result;
}
}
/// <summary>
/// We need a trick to be able to accurately measure memory usage for the
/// implementations that would otherwise only store a reference to the
/// already allocated string from the input word list. I.e. we need to get
/// an actual copy.
/// </summary>
internal static class StringCopy
{
public static string MakeCopy(this string input)
{
return Encoding.Unicode.GetString(Encoding.Unicode.GetBytes(input));
}
}
/// <summary>
/// Timing & memory usage bench
/// </summary>
public static class WordPrefixSearchBench
{
public static void Run(int dictionarySize, int maxWordSize, int nToSearch, int nRepeats)
{
// Set up
var methods = new [] {
new Tester("Naive", true, () => new NaiveWordMatcher()),
new Tester("JimMischel", true, () => new JimMischel_SortedWordListMatcher()),
new Tester("JM_MM_DSBS", true, () => new JimMischel_DictSortedWordListMatcher()),
new Tester("SimpleTrie", true, () => new SimpleTrieWordMatcher()),
new Tester("CompessedTrie", true, () => new SimpleCompressedTrieWordMatcher()),
new Tester("MattyMerrix", true, () => new MattyMerrix_CompareHelper()),
};
var measurements = methods.ToDictionary(x => x.Name, x => new List<TestResult>());
var rnd = new Random();
var generator = new RandomWordListGenerator();
var dictionaryWords = generator.GenerateDictionary(dictionarySize, maxWordSize, rnd);
var matchedWordsList = new List<string>(dictionaryWords.Length);
var stopWatch = new Stopwatch();
Console.WriteLine("Matching {0} words to dictionary of {1} terms of max length {2}", nToSearch, dictionarySize, maxWordSize);
// We do nRepeats + 1, the first will be warmup run.
for (int i = 0; i < nRepeats + 1; i++)
{
var searchWords = generator.GenerateSearchWords(nToSearch, rnd);
var naiveResults = default(List<string>);
foreach (var method in methods)
{
if (i == 0)
Console.Write("Warmup with {0} ...", method.Name);
else
Console.Write("Running {0} (repeat #{1} of {2}) ...", method.Name, i, nRepeats);
matchedWordsList.Clear();
var result = method.Run(dictionaryWords, searchWords, matchedWordsList, stopWatch);
// Verify results vs. naive implemenation results.
if (naiveResults == null)
naiveResults = new List<string>(matchedWordsList);
else
VerifyResults(naiveResults, matchedWordsList);
if (i > 0)
measurements[method.Name].Add(result);
Console.CursorLeft = 0;
Console.Write(new string(' ', 79));
Console.CursorLeft = 0;
}
matchedWordsList.Clear();
}
// Report results
Console.WriteLine("{0,13} {1,22} {2,22} {3,22}", "Method", "Memory (MB)", "Build Time (s)", "Lookup Time (s)");
foreach (var m in measurements)
{
var perf = m.Value;
var minMem = (double)perf.Min(x => x.MemUsage) / (1024 * 1024);
var maxMem = (double)perf.Max(x => x.MemUsage) / (1024 * 1024);
var avgMem = perf.Average(x => x.MemUsage) / (1024 * 1024);
var minBuildTime = perf.Min(x => x.BuildTime.TotalSeconds);
var maxBuildTime = perf.Max(x => x.BuildTime.TotalSeconds);
var avgBuildTime = perf.Average(x => x.BuildTime.TotalSeconds);
var minLookupTime = perf.Min(x => x.LookupTime.TotalSeconds);
var maxLookupTime = perf.Max(x => x.LookupTime.TotalSeconds);
var avgLookupTime = perf.Average(x => x.LookupTime.TotalSeconds);
Console.WriteLine("{0,13} {1,22} {2,22} {3,22}", m.Key,
string.Format("{0:0.00}-{1:0.00}, {2:0.00}", minMem, maxMem, avgMem),
string.Format("{0:0.000}-{1:0.000}, {2:0.000}", minBuildTime, maxBuildTime, avgBuildTime),
string.Format("{0:0.000}-{1:0.000}, {2:0.000}", minLookupTime, maxLookupTime, avgLookupTime));
}
}
private class TestResult
{
public long MemStart;
public long MemEnd;
public long MemUsage
{
get { return MemEnd - MemStart; }
}
public TimeSpan BuildTime;
public TimeSpan TotalTime;
public TimeSpan LookupTime
{
get { return TotalTime - BuildTime; }
}
}
private class Tester
{
private readonly Func<IWordMatcher> _factory;
private readonly bool _needsWordCopy;
private IWordMatcher _matcher;
public Tester(string name, bool needsWordCopy, Func<IWordMatcher> factory)
{
Name = name;
_factory = factory;
_needsWordCopy = needsWordCopy;
}
public readonly string Name;
public TestResult Run(string[] dictionaryWords, string[] searchWords, List<string> matchedWords, Stopwatch timer)
{
var res = new TestResult();
res.MemStart = GetMemory();
var input = _needsWordCopy
? dictionaryWords.Select(x => x.MakeCopy()).ToArray()
: dictionaryWords;
// Build
_matcher = _factory();
timer.Restart();
_matcher.BuildDictionary(input);
timer.Stop();
res.BuildTime = timer.Elapsed;
input = null;
GC.Collect();
GC.WaitForPendingFinalizers();
// Lookup.
timer.Start();
foreach (var w in searchWords)
matchedWords.AddRange(_matcher.FindAllPrefixesFor(w));
timer.Stop();
res.TotalTime = timer.Elapsed;
res.MemEnd = GetMemory();
GC.KeepAlive(_matcher);
_matcher = null;
return res;
}
private long GetMemory()
{
GC.Collect();
GC.WaitForPendingFinalizers();
return GC.GetTotalMemory(false);
}
}
private static void VerifyResults(List<string> reference, List<string> results)
{
if (results.Count != reference.Count)
Console.WriteLine("Error: {0} matches expected, but was {1}", reference.Count, results.Count);
foreach (var w in reference.Where(x => !results.Contains(x)))
Console.WriteLine("{0} missing in results", w);
foreach (var w in results.Where(x => !reference.Contains(x)))
Console.WriteLine("{0} extra in results", w);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment