-
-
Save mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[Config(typeof(Config))] | |
public class Program | |
{ | |
static void Main(string[] args) | |
{ | |
var summary = BenchmarkRunner.Run<Program>(); | |
} | |
private class Config : ManualConfig | |
{ | |
public Config() | |
{ | |
Add(Job.Clr.WithLaunchCount(1)); | |
Add(new GCDiagnoser()); | |
} | |
} | |
[Params("Foo;Bar", "Foo;FooBar;Whatever", "Bar;blaat;foo", "blaat;foo;Bar", "foo;Bar;Blaat", "foo;FooBar;Blaat", | |
"Bar1;Bar2;Bar3;Bar4;Bar", "Bar1;Bar2;Bar3;Bar4;NoMatch", "Foo;FooBar;Whatever", "Some;Other;Really;Interesting;Tokens")] | |
public string Value { get; set; } | |
public string Match = "Bar"; | |
private static char [] delimeter = new [] { ';' }; | |
[Benchmark(Baseline = true)] | |
public bool StringSplit() | |
{ | |
// To make is a fair comparision! | |
if (string.IsNullOrEmpty(Match)) return false; | |
if (string.IsNullOrEmpty(Value)) return false; | |
var tokens = Value.Split(delimeter); | |
foreach (var token in tokens) | |
{ | |
if (token == Match) | |
return true; | |
} | |
return false; | |
} | |
[Benchmark] | |
public bool ContainsToken() | |
{ | |
return ContainsToken(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenXoofx() | |
{ | |
return ContainsTokenXoofx(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenXoofxUnsafe() | |
{ | |
return ContainsTokenXoofxUnsafe(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenHellBrick() | |
{ | |
return ContainsTokenHellBrick(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenMonty() | |
{ | |
return ContainsTokenMonty(Value, Match); | |
} | |
/// <summary> | |
/// Code taken from https://twitter.com/Nick_Craver/status/722741298575319040 | |
/// Checks whether the *entire* token exists, correctly handling seperators | |
/// (optional at start and end) so: "Foo;Bar" contains "Bar", but | |
/// "Foo;FooBar;Whatever" does not | |
/// </summary> | |
public static bool ContainsToken(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int lastIndex = -1, idx, endIndex = value.Length - token.Length, tokenLength = token.Length; | |
while ((idx = value.IndexOf(token, lastIndex + 1)) > lastIndex) | |
{ | |
lastIndex = idx; | |
if ((idx == 0 || (value[idx - 1] == delimiter)) | |
&& (idx == endIndex || (value[idx + tokenLength] == delimiter))) | |
{ | |
return true; | |
} | |
} | |
return false; | |
} | |
// See https://gist.github.com/mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5#gistcomment-1757392 | |
public static bool ContainsTokenXoofx(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int length = value.Length; | |
int tokenLength = token.Length; | |
for (int i = 0; i < length; i++) | |
{ | |
for (int j = 0; j < tokenLength; j++, i++) | |
{ | |
if (i >= length || value[i] != token[j]) | |
{ | |
goto next; | |
} | |
} | |
if ((i == length || (i < length && value[i] == delimiter))) | |
{ | |
return true; | |
} | |
next: | |
i = value.IndexOf(delimiter, i); | |
if (i < 0) | |
{ | |
break; | |
} | |
} | |
return false; | |
} | |
public unsafe static bool ContainsTokenXoofxUnsafe(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int length = value.Length; | |
int tokenLength = token.Length; | |
int i = 0; | |
fixed (char* pValue = value) | |
fixed (char* pToken = token) | |
while (true) | |
{ | |
for (int j = 0; j < tokenLength; j++, i++) | |
{ | |
if (i >= length || pValue[i] != pToken[j]) | |
{ | |
goto next; | |
} | |
} | |
if (i == length || pValue[i] == delimiter) | |
{ | |
return true; | |
} | |
next: | |
i = value.IndexOf(delimiter, i) + 1; | |
if (i == 0) | |
{ | |
break; | |
} | |
} | |
return false; | |
} | |
public static unsafe bool ContainsTokenHellBrick(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int length = value.Length; | |
int tokenLength = token.Length; | |
int currentTokenOffset = 0; | |
int maxTokenOffset = length - tokenLength; | |
while (currentTokenOffset <= maxTokenOffset) | |
{ | |
int nextDelimiterOffset = value.IndexOf(delimiter, currentTokenOffset); | |
if (nextDelimiterOffset < 0) | |
nextDelimiterOffset = length; | |
if (tokenLength == nextDelimiterOffset - currentTokenOffset) | |
{ | |
for (int i = 0; i < tokenLength; i++) | |
{ | |
if (value[currentTokenOffset + i] != token[i]) | |
goto next; | |
} | |
return true; | |
} | |
next: | |
currentTokenOffset = nextDelimiterOffset + 1; | |
} | |
return false; | |
} | |
public static bool ContainsTokenMonty(string value, string token, char delimiter = ';') | |
{ | |
// this code assumes you're running x64 | |
// it _works_ regardless, but some ifdef junk for x32 | |
// would be handy if you care about ideal x32 perf | |
const int charsPerLong = sizeof(long) / sizeof(char); // 4 | |
const int charsPerInt = sizeof(int) / sizeof(char); // 2 | |
const int bytesPerChar = sizeof(char) / sizeof(byte); // 2 | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
var valueLength = value.Length; | |
var tokenLength = token.Length; | |
if (tokenLength > valueLength) return false; | |
int tokenLongs; | |
bool tokenTrailingInt, tokenTrailingChar; | |
{ | |
var remainingChars = tokenLength; | |
tokenLongs = remainingChars / charsPerLong; | |
tokenTrailingInt = (tokenLength & 0x02) != 0; | |
tokenTrailingChar = (tokenLength & 0x01) != 0; | |
} | |
var tokenByteLength = tokenLength * bytesPerChar; | |
unsafe | |
{ | |
fixed (char* valuePtr = value, tokenPtr = token) | |
{ | |
var curValuePtr = valuePtr; | |
var endValuePtr = valuePtr + valueLength; | |
while (true) | |
{ | |
// 8-byte steps | |
long* tokenLongPtr = (long*)tokenPtr; | |
{ | |
for (var i = 0; i < tokenLongs; i++) | |
{ | |
var tokenLong = *tokenLongPtr; | |
var valueLong = *((long*)curValuePtr); | |
if (tokenLong == valueLong) | |
{ | |
// we have a match, continue | |
tokenLongPtr++; | |
curValuePtr += charsPerLong; | |
continue; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
} | |
// can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride | |
int* tokenIntPtr = (int*)tokenLongPtr; | |
if (tokenTrailingInt) | |
{ | |
var tokenInt = *tokenIntPtr; | |
var valueInt = *((int*)curValuePtr); | |
if (tokenInt == valueInt) | |
{ | |
// we have a match, continue | |
tokenIntPtr++; | |
curValuePtr += charsPerInt; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
// likewise, only 1 2-byte value or it'd be handled in the 4-byte check | |
char* tokenCharPtr = (char*)tokenIntPtr; | |
if (tokenTrailingChar) | |
{ | |
var tokenChar = *tokenCharPtr; | |
var valueChar = *curValuePtr; | |
if (tokenChar == valueChar) | |
{ | |
// we have a match, continue | |
tokenCharPtr++; | |
curValuePtr++; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
// at this point, the full text of token has been matched, so we need to check if the _next_ char | |
// in value is a delimiter or the end of value | |
if (curValuePtr == endValuePtr || *curValuePtr == delimiter) | |
{ | |
return true; | |
} | |
// in this case, we found a string that starts with value, but is not value | |
// "Foo" in "FooBar" for example | |
// advance here to skip the rest of whatever's before | |
// the next delimiter | |
advanceToNextDelimiter: | |
// our own IndexOf()-esque functionality | |
// at the end of this curValuePtr points to the | |
// character _after_ the next delimiter in the string | |
// | |
// not using IndexOf() is to save us the trouble | |
// of tracking additional state, or doing extra work, | |
// to move between the index-into-a-string world and | |
// our constantly-advancing-pointers world | |
while (true) | |
{ | |
var curVal = *curValuePtr; | |
if (curVal == delimiter) | |
{ | |
// got one! | |
curValuePtr++; | |
if (curValuePtr == endValuePtr) | |
{ | |
// but there wasn't anything after it | |
return false; | |
} | |
break; | |
} | |
curValuePtr++; | |
if (curValuePtr == endValuePtr) | |
{ | |
// no more delimiters | |
return false; | |
} | |
} | |
// we only need to do this check after a delimiter is advanced | |
// because this check _itself_ guarantees that we won't | |
// exceed the end of value before we're done comparing | |
// against token | |
var remainingBytesInValue = ((byte*)endValuePtr) - ((byte*)curValuePtr); // we do this bytes to save an op | |
if (remainingBytesInValue < tokenByteLength) | |
{ | |
// not possible for token to fit in string, fail early | |
return false; | |
} | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
BTW, @kevin-montrose: I found that your algorithms won't support tokens larger than 6 chars anywhere but the beginning of the list. I haven't tried to debug them, though.