Last active
October 31, 2019 17:49
-
-
Save kzu/e6222670201ab630103671177f17e76d to your computer and use it in GitHub Desktop.
Using SymSpell to detect 1-char misspellings without hardcoding a gazillion misspellings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Requires package reference to https://www.nuget.org/packages/symspell | |
using System; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
/// <summary>
/// Console sample that uses SymSpell to suggest the correctly spelled (and correctly cased)
/// known word for input that is within a 1-edit distance of it.
/// </summary>
class Program
{
    /// <summary>
    /// Reads terms from standard input, one per line, and writes the closest known word for
    /// each of them. Stops at the first empty line or when the input stream ends.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Create the spell checker.
        var initialCapacity = 100;
        var maxEditDistanceDictionary = 2; // maximum edit distance per dictionary precalculation
        var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary);

        // This would get the current known-metadata values
        string[] words =
        {
            "PrivateAssets",
            "IncludeAssets",
            "ExcludeAssets"
        };

        // The SymSpell API seems to always lowercase all words and then the edit distance includes
        // case changes, which isn't very useful (i.e. PrivateAsets would be reported as a 3 edit
        // distance because the internal words dictionary would contain privateassets).
        // So we can keep a lookup dictionary to get back the proper case value for the
        // suggestion.
        var lookup = words.ToDictionary(x => x, x => x, StringComparer.OrdinalIgnoreCase);

        // Feed the known words to SymSpell as a space-separated in-memory corpus. The stream is
        // only needed while the dictionary is being built, so release it right after.
        using (var corpus = new MemoryStream(Encoding.UTF8.GetBytes(string.Join(' ', words))))
        {
            symSpell.CreateDictionary(corpus);
        }

        string line;
        // Console.ReadLine returns null once the input stream is exhausted (e.g. piped input or
        // Ctrl+Z / Ctrl+D), so treat that the same as an empty line instead of dereferencing null.
        while ((line = Console.ReadLine()) != null && line.Length > 0)
        {
            // Get closest suggestion within a 1-edit distance. Could be loosened to 2?
            var suggestions = symSpell.Lookup(line.Trim().ToLowerInvariant(), SymSpell.Verbosity.Closest, 1);

            // Just render if there's any suggestion at all
            foreach (var suggestion in suggestions)
            {
                // Map the suggested term back to its original casing; fall back to the raw
                // term rather than throwing if it is somehow missing from the lookup.
                Console.WriteLine(lookup.TryGetValue(suggestion.term, out var properCase)
                    ? properCase
                    : suggestion.term);
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The following are examples of returned suggestions:
PrivateAsets
PrivateAssets
privateasets
PrivateAssets
PrivateAsset
PrivateAssets
PivateAsets
PrivateAset