Skip to content

Instantly share code, notes, and snippets.

@mattwarren
Last active July 25, 2023 00:20
Show Gist options
  • Save mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5 to your computer and use it in GitHub Desktop.
Save mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5 to your computer and use it in GitHub Desktop.
[Config(typeof(Config))]
public class Program
{
static void Main(string[] args)
{
var summary = BenchmarkRunner.Run<Program>();
}
private class Config : ManualConfig
{
public Config()
{
Add(Job.Clr.WithLaunchCount(1));
Add(new GCDiagnoser());
}
}
[Params("Foo;Bar", "Foo;FooBar;Whatever", "Bar;blaat;foo", "blaat;foo;Bar", "foo;Bar;Blaat", "foo;FooBar;Blaat",
"Bar1;Bar2;Bar3;Bar4;Bar", "Bar1;Bar2;Bar3;Bar4;NoMatch", "Foo;FooBar;Whatever", "Some;Other;Really;Interesting;Tokens")]
public string Value { get; set; }
public string Match = "Bar";
private static char [] delimeter = new [] { ';' };
[Benchmark(Baseline = true)]
public bool StringSplit()
{
// To make is a fair comparision!
if (string.IsNullOrEmpty(Match)) return false;
if (string.IsNullOrEmpty(Value)) return false;
var tokens = Value.Split(delimeter);
foreach (var token in tokens)
{
if (token == Match)
return true;
}
return false;
}
[Benchmark]
public bool ContainsToken()
{
return ContainsToken(Value, Match);
}
[Benchmark]
public bool ContainsTokenXoofx()
{
return ContainsTokenXoofx(Value, Match);
}
[Benchmark]
public bool ContainsTokenXoofxUnsafe()
{
return ContainsTokenXoofxUnsafe(Value, Match);
}
[Benchmark]
public bool ContainsTokenHellBrick()
{
return ContainsTokenHellBrick(Value, Match);
}
[Benchmark]
public bool ContainsTokenMonty()
{
return ContainsTokenMonty(Value, Match);
}
/// <summary>
/// Code taken from https://twitter.com/Nick_Craver/status/722741298575319040
/// Checks whether the *entire* token exists, correctly handling seperators
/// (optional at start and end) so: "Foo;Bar" contains "Bar", but
/// "Foo;FooBar;Whatever" does not
/// </summary>
public static bool ContainsToken(string value, string token, char delimiter = ';')
{
if (string.IsNullOrEmpty(token)) return false;
if (string.IsNullOrEmpty(value)) return false;
int lastIndex = -1, idx, endIndex = value.Length - token.Length, tokenLength = token.Length;
while ((idx = value.IndexOf(token, lastIndex + 1)) > lastIndex)
{
lastIndex = idx;
if ((idx == 0 || (value[idx - 1] == delimiter))
&& (idx == endIndex || (value[idx + tokenLength] == delimiter)))
{
return true;
}
}
return false;
}
// See https://gist.github.com/mattwarren/f0594a9f3afa9377a4bbc2bcf8e573c5#gistcomment-1757392
public static bool ContainsTokenXoofx(string value, string token, char delimiter = ';')
{
if (string.IsNullOrEmpty(token)) return false;
if (string.IsNullOrEmpty(value)) return false;
int length = value.Length;
int tokenLength = token.Length;
for (int i = 0; i < length; i++)
{
for (int j = 0; j < tokenLength; j++, i++)
{
if (i >= length || value[i] != token[j])
{
goto next;
}
}
if ((i == length || (i < length && value[i] == delimiter)))
{
return true;
}
next:
i = value.IndexOf(delimiter, i);
if (i < 0)
{
break;
}
}
return false;
}
public unsafe static bool ContainsTokenXoofxUnsafe(string value, string token, char delimiter = ';')
{
if (string.IsNullOrEmpty(token)) return false;
if (string.IsNullOrEmpty(value)) return false;
int length = value.Length;
int tokenLength = token.Length;
int i = 0;
fixed (char* pValue = value)
fixed (char* pToken = token)
while (true)
{
for (int j = 0; j < tokenLength; j++, i++)
{
if (i >= length || pValue[i] != pToken[j])
{
goto next;
}
}
if (i == length || pValue[i] == delimiter)
{
return true;
}
next:
i = value.IndexOf(delimiter, i) + 1;
if (i == 0)
{
break;
}
}
return false;
}
public static unsafe bool ContainsTokenHellBrick(string value, string token, char delimiter = ';')
{
if (string.IsNullOrEmpty(token)) return false;
if (string.IsNullOrEmpty(value)) return false;
int length = value.Length;
int tokenLength = token.Length;
int currentTokenOffset = 0;
int maxTokenOffset = length - tokenLength;
while (currentTokenOffset <= maxTokenOffset)
{
int nextDelimiterOffset = value.IndexOf(delimiter, currentTokenOffset);
if (nextDelimiterOffset < 0)
nextDelimiterOffset = length;
if (tokenLength == nextDelimiterOffset - currentTokenOffset)
{
for (int i = 0; i < tokenLength; i++)
{
if (value[currentTokenOffset + i] != token[i])
goto next;
}
return true;
}
next:
currentTokenOffset = nextDelimiterOffset + 1;
}
return false;
}
public static bool ContainsTokenMonty(string value, string token, char delimiter = ';')
{
// this code assumes you're running x64
// it _works_ regardless, but some ifdef junk for x32
// would be handy if you care about ideal x32 perf
const int charsPerLong = sizeof(long) / sizeof(char); // 4
const int charsPerInt = sizeof(int) / sizeof(char); // 2
const int bytesPerChar = sizeof(char) / sizeof(byte); // 2
if (string.IsNullOrEmpty(token)) return false;
if (string.IsNullOrEmpty(value)) return false;
var valueLength = value.Length;
var tokenLength = token.Length;
if (tokenLength > valueLength) return false;
int tokenLongs;
bool tokenTrailingInt, tokenTrailingChar;
{
var remainingChars = tokenLength;
tokenLongs = remainingChars / charsPerLong;
tokenTrailingInt = (tokenLength & 0x02) != 0;
tokenTrailingChar = (tokenLength & 0x01) != 0;
}
var tokenByteLength = tokenLength * bytesPerChar;
unsafe
{
fixed (char* valuePtr = value, tokenPtr = token)
{
var curValuePtr = valuePtr;
var endValuePtr = valuePtr + valueLength;
while (true)
{
// 8-byte steps
long* tokenLongPtr = (long*)tokenPtr;
{
for (var i = 0; i < tokenLongs; i++)
{
var tokenLong = *tokenLongPtr;
var valueLong = *((long*)curValuePtr);
if (tokenLong == valueLong)
{
// we have a match, continue
tokenLongPtr++;
curValuePtr += charsPerLong;
continue;
}
else
{
goto advanceToNextDelimiter;
}
}
}
// can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride
int* tokenIntPtr = (int*)tokenLongPtr;
if (tokenTrailingInt)
{
var tokenInt = *tokenIntPtr;
var valueInt = *((int*)curValuePtr);
if (tokenInt == valueInt)
{
// we have a match, continue
tokenIntPtr++;
curValuePtr += charsPerInt;
}
else
{
goto advanceToNextDelimiter;
}
}
// likewise, only 1 2-byte value or it'd be handled in the 4-byte check
char* tokenCharPtr = (char*)tokenIntPtr;
if (tokenTrailingChar)
{
var tokenChar = *tokenCharPtr;
var valueChar = *curValuePtr;
if (tokenChar == valueChar)
{
// we have a match, continue
tokenCharPtr++;
curValuePtr++;
}
else
{
goto advanceToNextDelimiter;
}
}
// at this point, the full text of token has been matched, so we need to check if the _next_ char
// in value is a delimiter or the end of value
if (curValuePtr == endValuePtr || *curValuePtr == delimiter)
{
return true;
}
// in this case, we found a string that starts with value, but is not value
// "Foo" in "FooBar" for example
// advance here to skip the rest of whatever's before
// the next delimiter
advanceToNextDelimiter:
// our own IndexOf()-esque functionality
// at the end of this curValuePtr points to the
// character _after_ the next delimiter in the string
//
// not using IndexOf() is to save us the trouble
// of tracking additional state, or doing extra work,
// to move between the index-into-a-string world and
// our constantly-advancing-pointers world
while (true)
{
var curVal = *curValuePtr;
if (curVal == delimiter)
{
// got one!
curValuePtr++;
if (curValuePtr == endValuePtr)
{
// but there wasn't anything after it
return false;
}
break;
}
curValuePtr++;
if (curValuePtr == endValuePtr)
{
// no more delimiters
return false;
}
}
// we only need to do this check after a delimiter is advanced
// because this check _itself_ guarantees that we won't
// exceed the end of value before we're done comparing
// against token
var remainingBytesInValue = ((byte*)endValuePtr) - ((byte*)curValuePtr); // we do this bytes to save an op
if (remainingBytesInValue < tokenByteLength)
{
// not possible for token to fit in string, fail early
return false;
}
}
}
}
}
}
@bbowyersmyth
Copy link

Ok I couldn't resist so here is another variation :)
https://gist.github.com/bbowyersmyth/c308aa62e871d44024c0bab552567961

Minimize the expense looking for a token as there is most likely not a match. Monty2 would probably be quicker on longer partial matches.
Use loop unrolling when looking for the next token.

BenchmarkDotNet=v0.9.4.0
OS=Microsoft Windows NT 6.2.9200.0
Processor=Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz, ProcessorCount=4
Frequency=3234371 ticks, Resolution=309.1791 ns, Timer=TSC
HostCLR=MS.NET 4.0.30319.42000, Arch=64-bit RELEASE [RyuJIT]
JitModules=clrjit-v4.6.1078.0

Type=ContainsToken  Mode=Throughput  Platform=X64  
Jit=RyuJit  
Method Median StdDev Value
ContainsTokenUnroll 11.1478 ns 0.3191 ns Bar1;Bar2;Bar3;Bar4;Bar
ContainsTokenMonty2 15.4258 ns 0.2308 ns Bar1;Bar2;Bar3;Bar4;Bar
ContainsTokenUnroll 12.4808 ns 2.7918 ns Bar1;Bar2;Bar3;Bar4;NoMatch
ContainsTokenMonty2 17.0302 ns 0.3017 ns Bar1;Bar2;Bar3;Bar4;NoMatch
ContainsTokenUnroll 3.9933 ns 0.1028 ns Bar;blaat;foo
ContainsTokenMonty2 5.7593 ns 0.1258 ns Bar;blaat;foo
ContainsTokenUnroll 5.7880 ns 0.1428 ns Foo;Bar
ContainsTokenMonty2 8.0731 ns 0.1841 ns Foo;Bar
ContainsTokenUnroll 11.8456 ns 0.2204 ns Foo;FooBar;Whatever
ContainsTokenMonty2 15.0076 ns 0.2669 ns Foo;FooBar;Whatever
ContainsTokenUnroll 18.5287 ns 0.2764 ns Some;Other;Really;Interesting;Tokens
ContainsTokenMonty2 21.6304 ns 0.4823 ns Some;Other;Really;Interesting;Tokens
ContainsTokenUnroll 8.7534 ns 0.1295 ns blaat;foo;Bar
ContainsTokenMonty2 11.2296 ns 0.2047 ns blaat;foo;Bar
ContainsTokenUnroll 5.9761 ns 0.1274 ns foo;Bar;Blaat
ContainsTokenMonty2 8.1325 ns 0.2437 ns foo;Bar;Blaat
ContainsTokenUnroll 10.5742 ns 0.2372 ns foo;FooBar;Blaat
ContainsTokenMonty2 14.2297 ns 0.2002 ns foo;FooBar;Blaat

@ioctlLR
Copy link

ioctlLR commented May 25, 2016

I know I'm late to the game, but here's my take. On a Core i7-4810MQ @ 2.8GHz running 4.6.1 x64 w/ RyuJIT, it appears to consistently beat the previous winners (including Monty2). Enjoy!

public static bool ContainsTokenAfward(string value, string token, char delimiter = ';')
{
    if (token == null || value == null) return false;

    var valueLength = value.Length;
    var tokenLength = token.Length;

    // if the token has any length, we only need to make sure the value is long enough to hold it
    if (tokenLength == 0 || tokenLength > valueLength) return false;

    // algorithm::
    // check the first segment to see if it matches; return true if it does
    // look for {Delimiter}{token[0]} via int* cast
    // match the remaining characters of the token if found

    // we assume we're on little-endian and that strings are terminated by null
    // we also only do ordinal comparison

    unsafe
    {
        fixed (char* valueStartPtr = value, tokenStartPtr = token)
        {
            // keep up with where we are in the value and token strings
            var valuePtr = valueStartPtr;
            var tokenPtr = tokenStartPtr;

            // keep up with the ending pointer for the value
            var valueEndPtr = valueStartPtr + valueLength;

            // see if the first segment is the token
            if (*tokenPtr == *valuePtr)
            {
                // go through at a rate of 2 chars per check...
                while (*(uint*)++tokenPtr == *(uint*)++valuePtr)
                {
                    ++tokenPtr;
                    if (++valuePtr >= valueEndPtr)
                    {
                        // we're at the end of value and it matches
                        return true;
                    }
                }
                if (*tokenPtr == '\0')
                {
                    // we matched the whole token, so make sure the next char is a delimiter
                    if (*valuePtr == delimiter)
                    {
                        return true;
                    }
                }
                // we matched all but the final character, so check it and the delimiter
                else if (*tokenPtr == *valuePtr)
                {
                    if (*++valuePtr == delimiter)
                    {
                        return true;
                    }
                }
            }

            // set our search limit (no point going further if token hasn't been found yet)
            var valueEffEndPtr = valueEndPtr - tokenLength;

            // build a matching value that is the delimiter + the first token char
            var matchVal = (int)delimiter | ((int)*tokenStartPtr << 16);

            // save off the position of the 2nd token character for later...
            var resetPtr = tokenStartPtr + 1;

            // search through the string to find the correct location
            while (valuePtr < valueEffEndPtr)
            {
                // look for the next segment delimiter where the segment starts with the correct character
                var checkSegment = true;
                while (*(int*)valuePtr != matchVal)
                {
                    ++valuePtr;
                    if (valuePtr > valueEffEndPtr)
                    {
                        checkSegment = false;
                        break;
                    }
                }

                if (checkSegment)
                {
                    // valuePtr is pointing at the delimiter, so move to the 2nd char in the segment
                    valuePtr += 2;

                    // reset our matching pointer
                    tokenPtr = resetPtr;

                    // go through at a rate of 2 chars per check...
                    while (*(int*)tokenPtr == *(int*)valuePtr)
                    {
                        tokenPtr += 2;
                        if ((valuePtr += 2) >= valueEndPtr)
                        {
                            // we're at the end of value and it matches
                            return true;
                        }
                    }
                    if (*tokenPtr == '\0')
                    {
                        // we matched the whole token, so make sure the next char is a delimiter
                        if (*valuePtr == delimiter)
                        {
                            return true;
                        }
                    }
                    // we matched all but the final character, so check it and the delimiter
                    else if (*tokenPtr == *valuePtr)
                    {
                        if (*++valuePtr == delimiter)
                        {
                            return true;
                        }
                    }
                }
            }

            // we didn't find it
            return false;
        }
    }
}

@ioctlLR
Copy link

ioctlLR commented May 25, 2016

BTW, @kevin-montrose: I found that your algorithms won't support tokens larger than 6 chars anywhere but the beginning of the list. I haven't tried to debug them, though.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment