-
-
Save mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[Config(typeof(Config))] | |
public class Program | |
{ | |
static void Main(string[] args) | |
{ | |
var summary = BenchmarkRunner.Run<Program>(); | |
} | |
private class Config : ManualConfig | |
{ | |
public Config() | |
{ | |
Add(Job.Clr.WithLaunchCount(1)); | |
Add(new GCDiagnoser()); | |
} | |
} | |
[Params("Foo;Bar", "Foo;FooBar;Whatever", "Bar;blaat;foo", "blaat;foo;Bar", "foo;Bar;Blaat", "foo;FooBar;Blaat", | |
"Bar1;Bar2;Bar3;Bar4;Bar", "Bar1;Bar2;Bar3;Bar4;NoMatch", "Foo;FooBar;Whatever", "Some;Other;Really;Interesting;Tokens")] | |
public string Value { get; set; } | |
public string Match = "Bar"; | |
private static char [] delimeter = new [] { ';' }; | |
[Benchmark(Baseline = true)] | |
public bool StringSplit() | |
{ | |
// To make is a fair comparision! | |
if (string.IsNullOrEmpty(Match)) return false; | |
if (string.IsNullOrEmpty(Value)) return false; | |
var tokens = Value.Split(delimeter); | |
foreach (var token in tokens) | |
{ | |
if (token == Match) | |
return true; | |
} | |
return false; | |
} | |
[Benchmark] | |
public bool ContainsToken() | |
{ | |
return ContainsToken(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenXoofx() | |
{ | |
return ContainsTokenXoofx(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenXoofxUnsafe() | |
{ | |
return ContainsTokenXoofxUnsafe(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenHellBrick() | |
{ | |
return ContainsTokenHellBrick(Value, Match); | |
} | |
[Benchmark] | |
public bool ContainsTokenMonty() | |
{ | |
return ContainsTokenMonty(Value, Match); | |
} | |
/// <summary> | |
/// Code taken from https://twitter.com/Nick_Craver/status/722741298575319040 | |
/// Checks whether the *entire* token exists, correctly handling seperators | |
/// (optional at start and end) so: "Foo;Bar" contains "Bar", but | |
/// "Foo;FooBar;Whatever" does not | |
/// </summary> | |
public static bool ContainsToken(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int lastIndex = -1, idx, endIndex = value.Length - token.Length, tokenLength = token.Length; | |
while ((idx = value.IndexOf(token, lastIndex + 1)) > lastIndex) | |
{ | |
lastIndex = idx; | |
if ((idx == 0 || (value[idx - 1] == delimiter)) | |
&& (idx == endIndex || (value[idx + tokenLength] == delimiter))) | |
{ | |
return true; | |
} | |
} | |
return false; | |
} | |
// See https://gist.github.com/mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5#gistcomment-1757392 | |
public static bool ContainsTokenXoofx(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int length = value.Length; | |
int tokenLength = token.Length; | |
for (int i = 0; i < length; i++) | |
{ | |
for (int j = 0; j < tokenLength; j++, i++) | |
{ | |
if (i >= length || value[i] != token[j]) | |
{ | |
goto next; | |
} | |
} | |
if ((i == length || (i < length && value[i] == delimiter))) | |
{ | |
return true; | |
} | |
next: | |
i = value.IndexOf(delimiter, i); | |
if (i < 0) | |
{ | |
break; | |
} | |
} | |
return false; | |
} | |
public unsafe static bool ContainsTokenXoofxUnsafe(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int length = value.Length; | |
int tokenLength = token.Length; | |
int i = 0; | |
fixed (char* pValue = value) | |
fixed (char* pToken = token) | |
while (true) | |
{ | |
for (int j = 0; j < tokenLength; j++, i++) | |
{ | |
if (i >= length || pValue[i] != pToken[j]) | |
{ | |
goto next; | |
} | |
} | |
if (i == length || pValue[i] == delimiter) | |
{ | |
return true; | |
} | |
next: | |
i = value.IndexOf(delimiter, i) + 1; | |
if (i == 0) | |
{ | |
break; | |
} | |
} | |
return false; | |
} | |
public static unsafe bool ContainsTokenHellBrick(string value, string token, char delimiter = ';') | |
{ | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
int length = value.Length; | |
int tokenLength = token.Length; | |
int currentTokenOffset = 0; | |
int maxTokenOffset = length - tokenLength; | |
while (currentTokenOffset <= maxTokenOffset) | |
{ | |
int nextDelimiterOffset = value.IndexOf(delimiter, currentTokenOffset); | |
if (nextDelimiterOffset < 0) | |
nextDelimiterOffset = length; | |
if (tokenLength == nextDelimiterOffset - currentTokenOffset) | |
{ | |
for (int i = 0; i < tokenLength; i++) | |
{ | |
if (value[currentTokenOffset + i] != token[i]) | |
goto next; | |
} | |
return true; | |
} | |
next: | |
currentTokenOffset = nextDelimiterOffset + 1; | |
} | |
return false; | |
} | |
public static bool ContainsTokenMonty(string value, string token, char delimiter = ';') | |
{ | |
// this code assumes you're running x64 | |
// it _works_ regardless, but some ifdef junk for x32 | |
// would be handy if you care about ideal x32 perf | |
const int charsPerLong = sizeof(long) / sizeof(char); // 4 | |
const int charsPerInt = sizeof(int) / sizeof(char); // 2 | |
const int bytesPerChar = sizeof(char) / sizeof(byte); // 2 | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
var valueLength = value.Length; | |
var tokenLength = token.Length; | |
if (tokenLength > valueLength) return false; | |
int tokenLongs; | |
bool tokenTrailingInt, tokenTrailingChar; | |
{ | |
var remainingChars = tokenLength; | |
tokenLongs = remainingChars / charsPerLong; | |
tokenTrailingInt = (tokenLength & 0x02) != 0; | |
tokenTrailingChar = (tokenLength & 0x01) != 0; | |
} | |
var tokenByteLength = tokenLength * bytesPerChar; | |
unsafe | |
{ | |
fixed (char* valuePtr = value, tokenPtr = token) | |
{ | |
var curValuePtr = valuePtr; | |
var endValuePtr = valuePtr + valueLength; | |
while (true) | |
{ | |
// 8-byte steps | |
long* tokenLongPtr = (long*)tokenPtr; | |
{ | |
for (var i = 0; i < tokenLongs; i++) | |
{ | |
var tokenLong = *tokenLongPtr; | |
var valueLong = *((long*)curValuePtr); | |
if (tokenLong == valueLong) | |
{ | |
// we have a match, continue | |
tokenLongPtr++; | |
curValuePtr += charsPerLong; | |
continue; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
} | |
// can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride | |
int* tokenIntPtr = (int*)tokenLongPtr; | |
if (tokenTrailingInt) | |
{ | |
var tokenInt = *tokenIntPtr; | |
var valueInt = *((int*)curValuePtr); | |
if (tokenInt == valueInt) | |
{ | |
// we have a match, continue | |
tokenIntPtr++; | |
curValuePtr += charsPerInt; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
// likewise, only 1 2-byte value or it'd be handled in the 4-byte check | |
char* tokenCharPtr = (char*)tokenIntPtr; | |
if (tokenTrailingChar) | |
{ | |
var tokenChar = *tokenCharPtr; | |
var valueChar = *curValuePtr; | |
if (tokenChar == valueChar) | |
{ | |
// we have a match, continue | |
tokenCharPtr++; | |
curValuePtr++; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
// at this point, the full text of token has been matched, so we need to check if the _next_ char | |
// in value is a delimiter or the end of value | |
if (curValuePtr == endValuePtr || *curValuePtr == delimiter) | |
{ | |
return true; | |
} | |
// in this case, we found a string that starts with value, but is not value | |
// "Foo" in "FooBar" for example | |
// advance here to skip the rest of whatever's before | |
// the next delimiter | |
advanceToNextDelimiter: | |
// our own IndexOf()-esque functionality | |
// at the end of this curValuePtr points to the | |
// character _after_ the next delimiter in the string | |
// | |
// not using IndexOf() is to save us the trouble | |
// of tracking additional state, or doing extra work, | |
// to move between the index-into-a-string world and | |
// our constantly-advancing-pointers world | |
while (true) | |
{ | |
var curVal = *curValuePtr; | |
if (curVal == delimiter) | |
{ | |
// got one! | |
curValuePtr++; | |
if (curValuePtr == endValuePtr) | |
{ | |
// but there wasn't anything after it | |
return false; | |
} | |
break; | |
} | |
curValuePtr++; | |
if (curValuePtr == endValuePtr) | |
{ | |
// no more delimiters | |
return false; | |
} | |
} | |
// we only need to do this check after a delimiter is advanced | |
// because this check _itself_ guarantees that we won't | |
// exceed the end of value before we're done comparing | |
// against token | |
var remainingBytesInValue = ((byte*)endValuePtr) - ((byte*)curValuePtr); // we do this bytes to save an op | |
if (remainingBytesInValue < tokenByteLength) | |
{ | |
// not possible for token to fit in string, fail early | |
return false; | |
} | |
} | |
} | |
} | |
} | |
} |
This is epic! =)
Ok I couldn't resist so here is another variation :)
https://gist.github.com/bbowyersmyth/c308aa62e871d44024c0bab552567961
Minimize the expense looking for a token as there is most likely not a match. Monty2 would probably be quicker on longer partial matches.
Use loop unrolling when looking for the next token.
BenchmarkDotNet=v0.9.4.0
OS=Microsoft Windows NT 6.2.9200.0
Processor=Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz, ProcessorCount=4
Frequency=3234371 ticks, Resolution=309.1791 ns, Timer=TSC
HostCLR=MS.NET 4.0.30319.42000, Arch=64-bit RELEASE [RyuJIT]
JitModules=clrjit-v4.6.1078.0
Type=ContainsToken Mode=Throughput Platform=X64
Jit=RyuJit
Method | Median | StdDev | Value |
---|---|---|---|
ContainsTokenUnroll | 11.1478 ns | 0.3191 ns | Bar1;Bar2;Bar3;Bar4;Bar |
ContainsTokenMonty2 | 15.4258 ns | 0.2308 ns | Bar1;Bar2;Bar3;Bar4;Bar |
ContainsTokenUnroll | 12.4808 ns | 2.7918 ns | Bar1;Bar2;Bar3;Bar4;NoMatch |
ContainsTokenMonty2 | 17.0302 ns | 0.3017 ns | Bar1;Bar2;Bar3;Bar4;NoMatch |
ContainsTokenUnroll | 3.9933 ns | 0.1028 ns | Bar;blaat;foo |
ContainsTokenMonty2 | 5.7593 ns | 0.1258 ns | Bar;blaat;foo |
ContainsTokenUnroll | 5.7880 ns | 0.1428 ns | Foo;Bar |
ContainsTokenMonty2 | 8.0731 ns | 0.1841 ns | Foo;Bar |
ContainsTokenUnroll | 11.8456 ns | 0.2204 ns | Foo;FooBar;Whatever |
ContainsTokenMonty2 | 15.0076 ns | 0.2669 ns | Foo;FooBar;Whatever |
ContainsTokenUnroll | 18.5287 ns | 0.2764 ns | Some;Other;Really;Interesting;Tokens |
ContainsTokenMonty2 | 21.6304 ns | 0.4823 ns | Some;Other;Really;Interesting;Tokens |
ContainsTokenUnroll | 8.7534 ns | 0.1295 ns | blaat;foo;Bar |
ContainsTokenMonty2 | 11.2296 ns | 0.2047 ns | blaat;foo;Bar |
ContainsTokenUnroll | 5.9761 ns | 0.1274 ns | foo;Bar;Blaat |
ContainsTokenMonty2 | 8.1325 ns | 0.2437 ns | foo;Bar;Blaat |
ContainsTokenUnroll | 10.5742 ns | 0.2372 ns | foo;FooBar;Blaat |
ContainsTokenMonty2 | 14.2297 ns | 0.2002 ns | foo;FooBar;Blaat |
I know I'm late to the game, but here's my take. On a Core i7-4810MQ @ 2.8GHz running 4.6.1 x64 w/ RyuJIT, it appears to consistently beat the previous winners (including Monty2). Enjoy!
public static bool ContainsTokenAfward(string value, string token, char delimiter = ';')
{
if (token == null || value == null) return false;
var valueLength = value.Length;
var tokenLength = token.Length;
// if the token has any length, we only need to make sure the value is long enough to hold it
if (tokenLength == 0 || tokenLength > valueLength) return false;
// algorithm::
// check the first segment to see if it matches; return true if it does
// look for {Delimiter}{token[0]} via int* cast
// match the remaining characters of the token if found
// we assume we're on little-endian and that strings are terminated by null
// we also only do ordinal comparison
unsafe
{
fixed (char* valueStartPtr = value, tokenStartPtr = token)
{
// keep up with where we are in the value and token strings
var valuePtr = valueStartPtr;
var tokenPtr = tokenStartPtr;
// keep up with the ending pointer for the value
var valueEndPtr = valueStartPtr + valueLength;
// see if the first segment is the token
if (*tokenPtr == *valuePtr)
{
// go through at a rate of 2 chars per check...
while (*(uint*)++tokenPtr == *(uint*)++valuePtr)
{
++tokenPtr;
if (++valuePtr >= valueEndPtr)
{
// we're at the end of value and it matches
return true;
}
}
if (*tokenPtr == '\0')
{
// we matched the whole token, so make sure the next char is a delimiter
if (*valuePtr == delimiter)
{
return true;
}
}
// we matched all but the final character, so check it and the delimiter
else if (*tokenPtr == *valuePtr)
{
if (*++valuePtr == delimiter)
{
return true;
}
}
}
// set our search limit (no point going further if token hasn't been found yet)
var valueEffEndPtr = valueEndPtr - tokenLength;
// build a matching value that is the delimiter + the first token char
var matchVal = (int)delimiter | ((int)*tokenStartPtr << 16);
// save off the position of the 2nd token character for later...
var resetPtr = tokenStartPtr + 1;
// search through the string to find the correct location
while (valuePtr < valueEffEndPtr)
{
// look for the next segment delimiter where the segment starts with the correct character
var checkSegment = true;
while (*(int*)valuePtr != matchVal)
{
++valuePtr;
if (valuePtr > valueEffEndPtr)
{
checkSegment = false;
break;
}
}
if (checkSegment)
{
// valuePtr is pointing at the delimiter, so move to the 2nd char in the segment
valuePtr += 2;
// reset our matching pointer
tokenPtr = resetPtr;
// go through at a rate of 2 chars per check...
while (*(int*)tokenPtr == *(int*)valuePtr)
{
tokenPtr += 2;
if ((valuePtr += 2) >= valueEndPtr)
{
// we're at the end of value and it matches
return true;
}
}
if (*tokenPtr == '\0')
{
// we matched the whole token, so make sure the next char is a delimiter
if (*valuePtr == delimiter)
{
return true;
}
}
// we matched all but the final character, so check it and the delimiter
else if (*tokenPtr == *valuePtr)
{
if (*++valuePtr == delimiter)
{
return true;
}
}
}
}
// we didn't find it
return false;
}
}
}
BTW, @kevin-montrose: I found that your algorithms won't support tokens larger than 6 chars anywhere but the beginning of the list. I haven't tried to debug them, though.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice! Interesting results, for sure :) Still glad I'm in second place in most tests (not all) ;).