Created
June 5, 2015 22:03
-
-
Save anonymous/1e755a5f0880523d3ef3 to your computer and use it in GitHub Desktop.
A benchmark with a number of implementations for performing word prefix searches versus a built word dictionary, see http://stackoverflow.com/questions/30578450/optimal-compare-algorithm-for-finding-string-matches-in-list-of-strings-c-sharp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace TryOuts | |
{ | |
using System; | |
using System.IO; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Diagnostics; | |
/// <summary> | |
/// Common interface | |
/// </summary> | |
public interface IWordMatcher | |
{ | |
void BuildDictionary(IEnumerable<string> wordList); | |
IEnumerable<string> FindAllPrefixesFor(string searchTerm); | |
} | |
/// <summary> | |
/// Simplest very naive implementation. | |
/// </summary> | |
public sealed class NaiveWordMatcher : IWordMatcher | |
{ | |
private List<string> _uniqueWords; | |
public void BuildDictionary(IEnumerable<string> wordList) | |
{ | |
_uniqueWords = wordList.Distinct().ToList(); | |
} | |
public IEnumerable<string> FindAllPrefixesFor(string searchTerm) | |
{ | |
foreach (var w in _uniqueWords) | |
if (searchTerm.StartsWith(w)) | |
yield return w; | |
} | |
} | |
/// <summary> | |
/// Simple implementation using binary search on a sorted list, see | |
/// http://stackoverflow.com/a/30604336/2573395, @JimMischel's answer. | |
/// </summary> | |
public sealed class JimMischel_SortedWordListMatcher : IWordMatcher | |
{ | |
private List<string> _uniqueSortedWords; | |
public void BuildDictionary(IEnumerable<string> wordList) | |
{ | |
_uniqueSortedWords = wordList.Distinct().ToList(); | |
_uniqueSortedWords.Sort(); | |
} | |
public IEnumerable<string> FindAllPrefixesFor(string searchTerm) | |
{ | |
if (searchTerm.Length > 0) | |
{ | |
var prefixLen = 1; | |
var prefix = searchTerm.Substring(0, prefixLen); | |
var ndx = _uniqueSortedWords.BinarySearch(prefix); | |
if (ndx < 0) | |
ndx = ~ndx; | |
while (prefixLen <= searchTerm.Length && ndx < _uniqueSortedWords.Count) | |
{ | |
var cmp = String.Compare(_uniqueSortedWords[ndx], prefix); | |
if (cmp == 0) | |
{ | |
yield return prefix; | |
ndx++; | |
} | |
else if (cmp > 0) | |
{ | |
if (_uniqueSortedWords[ndx].Length <= prefixLen || _uniqueSortedWords[ndx][0] != prefix[0]) | |
break; | |
if (++prefixLen <= searchTerm.Length) | |
prefix = searchTerm.Substring(0, prefixLen); | |
} | |
else | |
{ | |
ndx++; | |
} | |
} | |
} | |
} | |
} | |
/// <summary> | |
/// Simple implementation using binary search on a sorted list, see | |
/// http://stackoverflow.com/a/30604336/2573395, @JimMischel's answer. | |
/// </summary> | |
public sealed class JimMischel_DictSortedWordListMatcher : IWordMatcher | |
{ | |
public Dictionary<char, List<string>> MyDictionary; | |
public void BuildDictionary(IEnumerable<string> wordList) | |
{ | |
MyDictionary = wordList | |
.Where(x => x.Length > 0) | |
.GroupBy(x => x[0]) | |
.ToDictionary( | |
g => g.Key, | |
g => g.Distinct().OrderBy(x => x).Select(x => x.Substring(1)).ToList()); | |
} | |
public IEnumerable<string> FindAllPrefixesFor(string searchTerm) | |
{ | |
var list = default(List<string>); | |
if (searchTerm.Length > 0 && MyDictionary.TryGetValue(searchTerm[0], out list)) | |
{ | |
if (list[0].Length == 0) | |
yield return searchTerm.Substring(0, 1); | |
if (searchTerm.Length > 1) | |
{ | |
var prefix = searchTerm.Substring(1, 1); | |
var ndx = list.BinarySearch(prefix); | |
if (ndx < 0) | |
ndx = ~ndx; | |
var toMatch = searchTerm.Substring(1); | |
while (ndx < list.Count) | |
{ | |
var cmp = String.Compare(list[ndx], 0, toMatch, 0, list[ndx].Length); | |
if (cmp == 0) | |
{ | |
yield return searchTerm.Substring(0, list[ndx].Length + 1); | |
ndx++; | |
} | |
else if (cmp > 0) | |
break; | |
else | |
ndx++; | |
} | |
} | |
} | |
} | |
} | |
/// <summary> | |
/// Slightly reworked version of @MattyMerrix CompareHelper (see | |
/// http://stackoverflow.com/a/30597532/2573395). | |
/// </summary> | |
public sealed class MattyMerrix_CompareHelper : IWordMatcher | |
{ | |
private readonly StringBuilder ThisWord = new StringBuilder(); | |
private List<string> CurrentWordList; | |
public Dictionary<char, List<string>> MyDictionary; | |
public void BuildDictionary(IEnumerable<string> wordList) | |
{ | |
MyDictionary = wordList | |
.Where(x => x.Length > 0) | |
.GroupBy(x => x[0]) | |
.ToDictionary( | |
g => g.Key, | |
g => g.Distinct().OrderBy(x => x).ToList()); | |
} | |
public IEnumerable<string> FindAllPrefixesFor(string searchTerm) | |
{ | |
int index = 0; | |
if (InitCompare(searchTerm[index++])) | |
yield return ThisWord.ToString(); | |
if (CurrentWordList != null) | |
{ | |
while (CurrentWordList.Count > 0 && index < searchTerm.Length) | |
if (NextCompare(index, searchTerm[index++])) | |
yield return ThisWord.ToString(); | |
} | |
} | |
private bool InitCompare(char firstChar) | |
{ | |
ThisWord.Clear(); | |
if (MyDictionary.TryGetValue(firstChar, out CurrentWordList)) | |
{ | |
ThisWord.Append(firstChar); | |
return CurrentWordList[0].Length == 1; | |
} | |
return false; | |
} | |
private bool NextCompare(int index, char nextChar) | |
{ | |
ThisWord.Append(nextChar); | |
CurrentWordList = CurrentWordList | |
.Where(word => (word.Length > index && word[index] == nextChar)) | |
.ToList(); | |
return (CurrentWordList.Count > 0 && CurrentWordList[0].Length == ThisWord.Length); | |
} | |
} | |
/// <summary> | |
/// Uses a simple non-compressed trie, storing a single character | |
/// per node and a dictionary for children. This is fast, but requires | |
/// a lot of memory overhead for each node. | |
/// </summary> | |
public sealed class SimpleTrieWordMatcher : IWordMatcher | |
{ | |
private readonly Node _root = new Node(); // root represents empty string. | |
public void BuildDictionary(IEnumerable<string> wordList) | |
{ | |
foreach (var word in wordList) | |
AddWord(word); | |
} | |
public void AddWord(string word) | |
{ | |
var ndx = 0; | |
var cur = _root; | |
while (cur != null) | |
cur = cur.Add(word, ndx++); | |
} | |
public IEnumerable<string> FindAllPrefixesFor(string searchTerm) | |
{ | |
var ndx = 0; | |
var cur = _root; | |
while (cur != null) | |
{ | |
if (cur.IsTerminal) | |
yield return searchTerm.Substring(0, ndx); | |
cur = cur.Find(searchTerm, ndx++); | |
} | |
} | |
private class Node | |
{ | |
public Dictionary<char, Node> Children; | |
public bool IsTerminal; // whether a full word ends here. | |
public Node Find(string word, int index) | |
{ | |
var child = default(Node); | |
if (index < word.Length && Children != null) | |
Children.TryGetValue(word[index], out child); | |
return child; | |
} | |
public Node Add(string word, int toConsume) | |
{ | |
var child = default(Node); | |
if (toConsume == word.Length) | |
this.IsTerminal = true; | |
else if (Children == null || !Children.TryGetValue(word[toConsume], out child)) | |
{ | |
if (Children == null) | |
Children = new Dictionary<char, Node>(); | |
Children[word[toConsume]] = child = new Node(); | |
} | |
return child; | |
} | |
} | |
} | |
/// <summary> | |
/// A simple compressed prefix trie implementation. It uses a | |
/// dictionary with default initial capacity for storing child nodes. | |
/// Very fast, but memory space requirements can be substantially | |
/// improved. | |
/// </summary> | |
public sealed class SimpleCompressedTrieWordMatcher : IWordMatcher | |
{ | |
private readonly Node _root = new Node(); | |
public void BuildDictionary(IEnumerable<string> wordList) | |
{ | |
foreach (var word in wordList) | |
AddWord(word); | |
} | |
public void AddWord(string word) | |
{ | |
var startIndex = 0; | |
var current = _root; | |
while (current != null) | |
current = current.Add(word, ref startIndex); | |
} | |
public IEnumerable<string> FindAllPrefixesFor(string searchTerm) | |
{ | |
var startNdx = 0; | |
var cur = _root; | |
while (cur != null) | |
{ | |
var matchLen = 0; | |
cur = cur.Find(searchTerm, ref startNdx, out matchLen); | |
if (matchLen >= 0) | |
yield return searchTerm.Substring(0, matchLen); | |
}; | |
} | |
private class Node | |
{ | |
private Dictionary<char, Node> _children; | |
public string PrefixValue = string.Empty; | |
public bool IsTerminal; | |
public Node Add(string word, ref int startIndex) | |
{ | |
var n = FindSharedPrefix(word, startIndex); | |
startIndex += n; | |
if (n == PrefixValue.Length) // full prefix match | |
{ | |
if (startIndex == word.Length) // full match | |
IsTerminal = true; | |
else | |
return AddToChild(word, ref startIndex); | |
} | |
else // partial match, need to split this node's prefix. | |
SplittingAdd(word, n, ref startIndex); | |
return null; | |
} | |
public Node Find(string word, ref int startIndex, out int matchLen) | |
{ | |
var n = FindSharedPrefix(word, startIndex); | |
startIndex += n; | |
matchLen = -1; | |
if (n == PrefixValue.Length) | |
{ | |
if (IsTerminal) | |
matchLen = startIndex; | |
var child = default(Node); | |
if (_children != null && startIndex < word.Length && _children.TryGetValue(word[startIndex], out child)) | |
{ | |
startIndex++; // consumed map key character. | |
return child; | |
} | |
} | |
return null; | |
} | |
private Node AddToChild(string word, ref int startIndex) | |
{ | |
var key = word[startIndex++]; // consume the mapping character | |
var nextNode = default(Node); | |
if (_children == null) | |
_children = new Dictionary<char, Node>(); | |
else if (_children.TryGetValue(key, out nextNode)) | |
return nextNode; | |
var remainder = word.Substring(startIndex); | |
_children[key] = new Node() { PrefixValue = remainder, IsTerminal = true }; | |
return null; // consumed. | |
} | |
private void SplittingAdd(string word, int n, ref int startIndex) | |
{ | |
var curChildren = _children; | |
_children = new Dictionary<char, Node>(); | |
_children[PrefixValue[n]] = new Node() { | |
PrefixValue = this.PrefixValue.Substring(n + 1), | |
IsTerminal = this.IsTerminal, | |
_children = curChildren | |
}; | |
PrefixValue = PrefixValue.Substring(0, n); | |
IsTerminal = startIndex == word.Length; | |
if (!IsTerminal) | |
{ | |
var prefix = word.Length > startIndex + 1 ? word.Substring(startIndex + 1) : string.Empty; | |
_children[word[startIndex]] = new Node() { PrefixValue = prefix, IsTerminal = true }; | |
startIndex++; | |
} | |
} | |
private int FindSharedPrefix(string word, int startIndex) | |
{ | |
var n = Math.Min(PrefixValue.Length, word.Length - startIndex); | |
var len = 0; | |
while (len < n && PrefixValue[len] == word[len + startIndex]) | |
len++; | |
return len; | |
} | |
} | |
} | |
/// <summary> | |
/// Interface used in testing, to have different sources that can | |
/// generate a dictionary word list and a list of search terms. | |
/// </summary> | |
public interface IWordListProvider | |
{ | |
string[] GenerateDictionary(int nStrings, int maxLen, Random rnd); | |
string[] GenerateSearchWords(int nStrings, Random rnd); | |
} | |
/// <summary> | |
/// Generates random character sequences | |
/// </summary> | |
public class RandomWordListGenerator : IWordListProvider | |
{ | |
private static readonly char[] Alphabet = Enumerable.Range(0, 26).Select(x => (char)('A' + x)).ToArray(); | |
private int _maxLen; | |
public string[] GenerateDictionary(int nStrings, int maxLen, Random rnd) | |
{ | |
_maxLen = maxLen; | |
return GenerateRandomStrings(Alphabet, nStrings, _maxLen, rnd); | |
} | |
public string[] GenerateSearchWords(int nStrings, Random rnd) | |
{ | |
return GenerateRandomStrings(Alphabet, nStrings, _maxLen, rnd); | |
} | |
private static string[] GenerateRandomStrings(char[] alphabet, int nStrings, int maxLen, Random rnd) | |
{ | |
var alphabetLen = alphabet.Length; | |
var result = new string[nStrings]; | |
var buffer = new char[maxLen]; | |
for (var wi = 0; wi < nStrings; wi++) | |
{ | |
var n = 1 + rnd.Next(maxLen); | |
for (var ci = 0; ci < n; ci++) | |
buffer[ci] = alphabet[rnd.Next(alphabetLen)]; | |
result[wi] = new string(buffer, 0, n); | |
} | |
return result; | |
} | |
} | |
/// <summary> | |
/// Loads words from a word list file, that optionally has a delimited format. | |
/// </summary> | |
public class FileWordListGenerator : IWordListProvider | |
{ | |
private static readonly string[] _seps = { " ", "\t", ",", ";", ":", "|", ".", "?", "!" }; | |
private readonly List<string> _wordList = new List<string>(); | |
private int _maxLen; | |
public FileWordListGenerator(string fileName, bool upcase) | |
{ | |
if (upcase) | |
{ | |
foreach (var line in File.ReadLines(fileName)) | |
_wordList.AddRange(line.Split(_seps, StringSplitOptions.RemoveEmptyEntries).Select(x => x.ToUpperInvariant())); | |
} | |
else | |
{ | |
foreach (var line in File.ReadLines(fileName)) | |
_wordList.AddRange(line.Split(_seps, StringSplitOptions.RemoveEmptyEntries)); | |
} | |
} | |
public string[] GenerateDictionary(int nStrings, int maxLen, Random rnd) | |
{ | |
_maxLen = maxLen; | |
return SelectRandomStrings(nStrings, rnd); | |
} | |
public string[] GenerateSearchWords(int nStrings, Random rnd) | |
{ | |
return SelectRandomStrings(nStrings, rnd); | |
} | |
private string[] SelectRandomStrings(int nStrings, Random rnd) | |
{ | |
var result = new string[nStrings]; | |
var i = 0; | |
while (i < nStrings) | |
{ | |
result[i] = _wordList[rnd.Next(_wordList.Count)]; | |
if (result[i].Length <= _maxLen) | |
i++; | |
} | |
return result; | |
} | |
} | |
/// <summary> | |
/// We need a trick to be able to accurately measure memory usage for the | |
/// implementations that would otherwise only store a reference to the | |
/// already allocated string from the input word list. I.e. we need to get | |
/// an actual copy. | |
/// </summary> | |
internal static class StringCopy | |
{ | |
public static string MakeCopy(this string input) | |
{ | |
return Encoding.Unicode.GetString(Encoding.Unicode.GetBytes(input)); | |
} | |
} | |
/// <summary> | |
/// Timing & memory usage bench | |
/// </summary> | |
public static class WordPrefixSearchBench | |
{ | |
public static void Run(int dictionarySize, int maxWordSize, int nToSearch, int nRepeats) | |
{ | |
// Set up | |
var methods = new [] { | |
new Tester("Naive", true, () => new NaiveWordMatcher()), | |
new Tester("JimMischel", true, () => new JimMischel_SortedWordListMatcher()), | |
new Tester("JM_MM_DSBS", true, () => new JimMischel_DictSortedWordListMatcher()), | |
new Tester("SimpleTrie", true, () => new SimpleTrieWordMatcher()), | |
new Tester("CompessedTrie", true, () => new SimpleCompressedTrieWordMatcher()), | |
new Tester("MattyMerrix", true, () => new MattyMerrix_CompareHelper()), | |
}; | |
var measurements = methods.ToDictionary(x => x.Name, x => new List<TestResult>()); | |
var rnd = new Random(); | |
var generator = new RandomWordListGenerator(); | |
var dictionaryWords = generator.GenerateDictionary(dictionarySize, maxWordSize, rnd); | |
var matchedWordsList = new List<string>(dictionaryWords.Length); | |
var stopWatch = new Stopwatch(); | |
Console.WriteLine("Matching {0} words to dictionary of {1} terms of max length {2}", nToSearch, dictionarySize, maxWordSize); | |
// We do nRepeats + 1, the first will be warmup run. | |
for (int i = 0; i < nRepeats + 1; i++) | |
{ | |
var searchWords = generator.GenerateSearchWords(nToSearch, rnd); | |
var naiveResults = default(List<string>); | |
foreach (var method in methods) | |
{ | |
if (i == 0) | |
Console.Write("Warmup with {0} ...", method.Name); | |
else | |
Console.Write("Running {0} (repeat #{1} of {2}) ...", method.Name, i, nRepeats); | |
matchedWordsList.Clear(); | |
var result = method.Run(dictionaryWords, searchWords, matchedWordsList, stopWatch); | |
// Verify results vs. naive implemenation results. | |
if (naiveResults == null) | |
naiveResults = new List<string>(matchedWordsList); | |
else | |
VerifyResults(naiveResults, matchedWordsList); | |
if (i > 0) | |
measurements[method.Name].Add(result); | |
Console.CursorLeft = 0; | |
Console.Write(new string(' ', 79)); | |
Console.CursorLeft = 0; | |
} | |
matchedWordsList.Clear(); | |
} | |
// Report results | |
Console.WriteLine("{0,13} {1,22} {2,22} {3,22}", "Method", "Memory (MB)", "Build Time (s)", "Lookup Time (s)"); | |
foreach (var m in measurements) | |
{ | |
var perf = m.Value; | |
var minMem = (double)perf.Min(x => x.MemUsage) / (1024 * 1024); | |
var maxMem = (double)perf.Max(x => x.MemUsage) / (1024 * 1024); | |
var avgMem = perf.Average(x => x.MemUsage) / (1024 * 1024); | |
var minBuildTime = perf.Min(x => x.BuildTime.TotalSeconds); | |
var maxBuildTime = perf.Max(x => x.BuildTime.TotalSeconds); | |
var avgBuildTime = perf.Average(x => x.BuildTime.TotalSeconds); | |
var minLookupTime = perf.Min(x => x.LookupTime.TotalSeconds); | |
var maxLookupTime = perf.Max(x => x.LookupTime.TotalSeconds); | |
var avgLookupTime = perf.Average(x => x.LookupTime.TotalSeconds); | |
Console.WriteLine("{0,13} {1,22} {2,22} {3,22}", m.Key, | |
string.Format("{0:0.00}-{1:0.00}, {2:0.00}", minMem, maxMem, avgMem), | |
string.Format("{0:0.000}-{1:0.000}, {2:0.000}", minBuildTime, maxBuildTime, avgBuildTime), | |
string.Format("{0:0.000}-{1:0.000}, {2:0.000}", minLookupTime, maxLookupTime, avgLookupTime)); | |
} | |
} | |
private class TestResult | |
{ | |
public long MemStart; | |
public long MemEnd; | |
public long MemUsage | |
{ | |
get { return MemEnd - MemStart; } | |
} | |
public TimeSpan BuildTime; | |
public TimeSpan TotalTime; | |
public TimeSpan LookupTime | |
{ | |
get { return TotalTime - BuildTime; } | |
} | |
} | |
private class Tester | |
{ | |
private readonly Func<IWordMatcher> _factory; | |
private readonly bool _needsWordCopy; | |
private IWordMatcher _matcher; | |
public Tester(string name, bool needsWordCopy, Func<IWordMatcher> factory) | |
{ | |
Name = name; | |
_factory = factory; | |
_needsWordCopy = needsWordCopy; | |
} | |
public readonly string Name; | |
public TestResult Run(string[] dictionaryWords, string[] searchWords, List<string> matchedWords, Stopwatch timer) | |
{ | |
var res = new TestResult(); | |
res.MemStart = GetMemory(); | |
var input = _needsWordCopy | |
? dictionaryWords.Select(x => x.MakeCopy()).ToArray() | |
: dictionaryWords; | |
// Build | |
_matcher = _factory(); | |
timer.Restart(); | |
_matcher.BuildDictionary(input); | |
timer.Stop(); | |
res.BuildTime = timer.Elapsed; | |
input = null; | |
GC.Collect(); | |
GC.WaitForPendingFinalizers(); | |
// Lookup. | |
timer.Start(); | |
foreach (var w in searchWords) | |
matchedWords.AddRange(_matcher.FindAllPrefixesFor(w)); | |
timer.Stop(); | |
res.TotalTime = timer.Elapsed; | |
res.MemEnd = GetMemory(); | |
GC.KeepAlive(_matcher); | |
_matcher = null; | |
return res; | |
} | |
private long GetMemory() | |
{ | |
GC.Collect(); | |
GC.WaitForPendingFinalizers(); | |
return GC.GetTotalMemory(false); | |
} | |
} | |
private static void VerifyResults(List<string> reference, List<string> results) | |
{ | |
if (results.Count != reference.Count) | |
Console.WriteLine("Error: {0} matches expected, but was {1}", reference.Count, results.Count); | |
foreach (var w in reference.Where(x => !results.Contains(x))) | |
Console.WriteLine("{0} missing in results", w); | |
foreach (var w in results.Where(x => !reference.Contains(x))) | |
Console.WriteLine("{0} extra in results", w); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment