kevin-montrose/ContainsTokenMonty2.cs Secret

## ContainsTokenMonty2.cs
public static bool ContainsTokenMonty2(string value, string token, char delimiter = ';')
{
    // this code REQUIRES little endian architecture
    //   it will not function correctly on big endian

    // this code assumes you're running x64
    //   it _works_ regardless, but some ifdef junk for x32
    //   would be handy if you care about ideal x32 perf

    const int charsPerLong = sizeof(long) / sizeof(char);   // 4
    const int charsPerInt = sizeof(int) / sizeof(char);     // 2
    const int bytesPerChar = sizeof(char) / sizeof(byte);   // 2

    if (string.IsNullOrEmpty(token)) return false;
    if (string.IsNullOrEmpty(value)) return false;

    var delimiterTwice = (delimiter << 16) | delimiter;

    var valueLength = value.Length;
    var tokenLength = token.Length;

    if (tokenLength > valueLength) return false;

    int tokenLongs;
    bool tokenTrailingInt, tokenTrailingChar;
    {
        var remainingChars = tokenLength;
        tokenLongs = remainingChars / charsPerLong;
        tokenTrailingInt = (tokenLength & 0x02) != 0;
        tokenTrailingChar = (tokenLength & 0x01) != 0;
    }

    var tokenByteLength = tokenLength * bytesPerChar;

    unsafe
    {
        fixed (char* valuePtr = value, tokenPtr = token)
        {
            var curValuePtr = valuePtr;
            var endValuePtr = valuePtr + valueLength ;

            while (true)
            {
                // 8-byte steps
                long* tokenLongPtr = (long*)tokenPtr;
                {
                    for (var i = 0; i < tokenLongs; i++)
                    {
                        var tokenLong = *tokenLongPtr;

                        var valueLong = *((long*)curValuePtr);

                        if (tokenLong == valueLong)
                        {
                            // we have a match, continue
                            tokenLongPtr++;
                            curValuePtr += charsPerLong;
                            continue;
                        }
                        else
                        {
                            goto advanceToNextDelimiter;
                        }
                    }
                }

                // can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride
                int* tokenIntPtr = (int*)tokenLongPtr;
                if (tokenTrailingInt)
                {
                    var tokenInt = *tokenIntPtr;

                    var valueInt = *((int*)curValuePtr);

                    if (tokenInt == valueInt)
                    {
                        // we have a match, continue
                        tokenIntPtr++;
                        curValuePtr += charsPerInt;
                    }
                    else
                    {
                        goto advanceToNextDelimiter;
                    }
                }

                // likewise, only 1 2-byte value or it'd be handled in the 4-byte check
                char* tokenCharPtr = (char*)tokenIntPtr;
                if (tokenTrailingChar)
                {
                    var tokenChar = *tokenCharPtr;

                    var valueChar = *curValuePtr;

                    if (tokenChar == valueChar)
                    {
                        // we have a match, continue
                        tokenCharPtr++;
                        curValuePtr++;
                    }
                    else
                    {
                        goto advanceToNextDelimiter;
                    }
                }

                // at this point, the full text of token has been matched, so we need to check if the _next_ char
                //    in value is a delimiter or the end of value
                if (curValuePtr == endValuePtr || *curValuePtr == delimiter)
                {
                    return true;
                }

            // in this case, we found a string that starts with value, but is not value
            //    "Foo" in "FooBar" for example

            // advance here to skip the rest of whatever's before
            //   the next delimiter
            advanceToNextDelimiter:

                // Really dirty trick time
                //
                // C# strings are length prefixed _and_ null terminated (see: https://msdn.microsoft.com/en-us/library/aa664784(v=vs.71).aspx)
                //   with '\0' (that is a 2-byte char) so we can read 1 past the "end" of the string w/o issue.
                // This lets us process an int at a time (2 chars), so if we precalc a mask and do some fancy
                //   bit twiddling we can avoid a branch per-char.
                while (true)
                {
                    var curVal = *((int*)curValuePtr);

                    var masked = curVal ^ delimiterTwice;   // whole char chunks will be zero if any equal delimiter
                    var temp = masked & 0x7FFF7FFF;         // zero out the high bits of both chars
                    temp = temp + 0x7FFF7FFF;               // will overflow _into_ the high bits of any other bits are set in either value
                    temp = (int)(temp & 0x80008000);        // high bits of each char will be set if any of the values was zero
                    temp = temp | 0x7FFF7FFF;               // whole char will be ones if any of the bits in the char are
                    temp = ~temp;                           // if both chars aren't matches, temp will be set entirely; flip so it'll be zero in that case
                    var neitherMatch = temp == 0;

                    if (neitherMatch)
                    {
                        curValuePtr += charsPerInt;
                        if(curValuePtr >= endValuePtr)
                        {
                            return false;
                        }
                        continue;
                    }

                    var top16 = temp & 0xFFFF0000;
                    if (top16 != 0)
                    {
                        // got one, and it's in the top of the int under C# rules (big endian),
                        //     but because we're little endian it's actually _at_
                        //     the tail end of the word (so advance two)
                        curValuePtr += charsPerInt;

                        break;
                    }

                    var bottom16 = temp & 0x0000FFFF;
                    if (bottom16 != 0)
                    {
                        // got one, and it's at the bottom of the int under C# rules (big endian),
                        //     but because we're little endian it's actually _at_
                        //     the head of the word (so just advance one)
                        curValuePtr += 1;
                    }
                }

                // we only need to do this check after a delimiter is advanced
                //    because this check _itself_ guarantees that we won't
                //    exceed the end of value before we're done comparing
                //    against token
                var remainingBytesInValue = ((byte*)endValuePtr) - ((byte*)curValuePtr);    // we do this bytes to save an op
                if (remainingBytesInValue < tokenByteLength)
                {
                    // not possible for token to fit in string, fail early
                    return false;
                }
            }
        }
    }
}
	public static bool ContainsTokenMonty2(string value, string token, char delimiter = ';')
	{
	// this code REQUIRES little endian architecture
	// it will not function correctly on big endian

	// this code assumes you're running x64
	// it _works_ regardless, but some ifdef junk for x32
	// would be handy if you care about ideal x32 perf

	const int charsPerLong = sizeof(long) / sizeof(char); // 4
	const int charsPerInt = sizeof(int) / sizeof(char); // 2
	const int bytesPerChar = sizeof(char) / sizeof(byte); // 2

	if (string.IsNullOrEmpty(token)) return false;
	if (string.IsNullOrEmpty(value)) return false;

	var delimiterTwice = (delimiter << 16) \| delimiter;

	var valueLength = value.Length;
	var tokenLength = token.Length;

	if (tokenLength > valueLength) return false;

	int tokenLongs;
	bool tokenTrailingInt, tokenTrailingChar;
	{
	var remainingChars = tokenLength;
	tokenLongs = remainingChars / charsPerLong;
	tokenTrailingInt = (tokenLength & 0x02) != 0;
	tokenTrailingChar = (tokenLength & 0x01) != 0;
	}

	var tokenByteLength = tokenLength * bytesPerChar;

	unsafe
	{
	fixed (char* valuePtr = value, tokenPtr = token)
	{
	var curValuePtr = valuePtr;
	var endValuePtr = valuePtr + valueLength ;

	while (true)
	{
	// 8-byte steps
	long* tokenLongPtr = (long*)tokenPtr;
	{
	for (var i = 0; i < tokenLongs; i++)
	{
	var tokenLong = *tokenLongPtr;

	var valueLong = ((long)curValuePtr);

	if (tokenLong == valueLong)
	{
	// we have a match, continue
	tokenLongPtr++;
	curValuePtr += charsPerLong;
	continue;
	}
	else
	{
	goto advanceToNextDelimiter;
	}
	}
	}

	// can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride
	int* tokenIntPtr = (int*)tokenLongPtr;
	if (tokenTrailingInt)
	{
	var tokenInt = *tokenIntPtr;

	var valueInt = ((int)curValuePtr);

	if (tokenInt == valueInt)
	{
	// we have a match, continue
	tokenIntPtr++;
	curValuePtr += charsPerInt;
	}
	else
	{
	goto advanceToNextDelimiter;
	}
	}

	// likewise, only 1 2-byte value or it'd be handled in the 4-byte check
	char* tokenCharPtr = (char*)tokenIntPtr;
	if (tokenTrailingChar)
	{
	var tokenChar = *tokenCharPtr;

	var valueChar = *curValuePtr;

	if (tokenChar == valueChar)
	{
	// we have a match, continue
	tokenCharPtr++;
	curValuePtr++;
	}
	else
	{
	goto advanceToNextDelimiter;
	}
	}

	// at this point, the full text of token has been matched, so we need to check if the _next_ char
	// in value is a delimiter or the end of value
	if (curValuePtr == endValuePtr \|\| *curValuePtr == delimiter)
	{
	return true;
	}

	// in this case, we found a string that starts with value, but is not value
	// "Foo" in "FooBar" for example

	// advance here to skip the rest of whatever's before
	// the next delimiter
	advanceToNextDelimiter:

	// Really dirty trick time
	//
	// C# strings are length prefixed _and_ null terminated (see: https://msdn.microsoft.com/en-us/library/aa664784(v=vs.71).aspx)
	// with '\0' (that is a 2-byte char) so we can read 1 past the "end" of the string w/o issue.
	// This lets us process an int at a time (2 chars), so if we precalc a mask and do some fancy
	// bit twiddling we can avoid a branch per-char.
	while (true)
	{
	var curVal = ((int)curValuePtr);

	var masked = curVal ^ delimiterTwice; // whole char chunks will be zero if any equal delimiter
	var temp = masked & 0x7FFF7FFF; // zero out the high bits of both chars
	temp = temp + 0x7FFF7FFF; // will overflow _into_ the high bits of any other bits are set in either value
	temp = (int)(temp & 0x80008000); // high bits of each char will be set if any of the values was zero
	temp = temp \| 0x7FFF7FFF; // whole char will be ones if any of the bits in the char are
	temp = ~temp; // if both chars aren't matches, temp will be set entirely; flip so it'll be zero in that case
	var neitherMatch = temp == 0;

	if (neitherMatch)
	{
	curValuePtr += charsPerInt;
	if(curValuePtr >= endValuePtr)
	{
	return false;
	}
	continue;
	}

	var top16 = temp & 0xFFFF0000;
	if (top16 != 0)
	{
	// got one, and it's in the top of the int under C# rules (big endian),
	// but because we're little endian it's actually _at_
	// the tail end of the word (so advance two)
	curValuePtr += charsPerInt;

	break;
	}

	var bottom16 = temp & 0x0000FFFF;
	if (bottom16 != 0)
	{
	// got one, and it's at the bottom of the int under C# rules (big endian),
	// but because we're little endian it's actually _at_
	// the head of the word (so just advance one)
	curValuePtr += 1;
	}
	}

	// we only need to do this check after a delimiter is advanced
	// because this check _itself_ guarantees that we won't
	// exceed the end of value before we're done comparing
	// against token
	var remainingBytesInValue = ((byte)endValuePtr) - ((byte)curValuePtr); // we do this bytes to save an op
	if (remainingBytesInValue < tokenByteLength)
	{
	// not possible for token to fit in string, fail early
	return false;
	}
	}
	}
	}
	}