Skip to content

Instantly share code, notes, and snippets.

@kreeben
Forked from ayende/better-tokenizer.cs
Last active July 10, 2017 11:58
Show Gist options
  • Save kreeben/e5c239c53f944f1d777ac947d23a8d2d to your computer and use it in GitHub Desktop.
Save kreeben/e5c239c53f944f1d777ac947d23a8d2d to your computer and use it in GitHub Desktop.
/// <summary>
/// Scans <paramref name="value"/> for words and appends the (Start, Length) span of
/// each accepted word to <paramref name="tokens"/>.
/// </summary>
/// <remarks>
/// A word is a maximal run of characters for which <c>IsNoice</c> returns false.
/// A word is accepted when <c>IsStopword(value, start, length)</c> returns false.
/// Both helpers are defined outside this method.
/// </remarks>
/// <param name="value">The text to tokenize.</param>
/// <param name="tokens">
/// Receives the spans, as offsets into <paramref name="value"/>. The method only
/// appends to this list.
/// </param>
public void Tokenize(string value, List<(int Start, int Length)> tokens)
{
    // start  = index of the first character of the word currently being read
    // length = number of word characters read so far (0 while between words)
    int length = 0, start = 0;
    for (int i = 0; i < value.Length; i++)
    {
        var ch = char.ToLowerInvariant(value[i]);
        if (!IsNoice(ch))
        {
            length++;
            continue;
        }

        // A noise character terminates the current word, if there is one.
        if (length > 0 && IsStopword(value, start, length) == false)
        {
            tokens.Add((start, length));
        }

        // Restart right after the noise character whether or not the word was
        // emitted. Previously a rejected stopword left start/length untouched,
        // so its characters leaked into the span of the following word.
        start = i + 1;
        length = 0;
    }

    // The input may end in the middle of a word; flush it.
    if (length > 0 && IsStopword(value, start, length) == false)
    {
        tokens.Add((start, length));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment