Skip to content

Instantly share code, notes, and snippets.

@kevin-montrose
Last active April 25, 2016 22:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kevin-montrose/7bb52718af7dc3fba3258db6ab7e6469 to your computer and use it in GitHub Desktop.
Save kevin-montrose/7bb52718af7dc3fba3258db6ab7e6469 to your computer and use it in GitHub Desktop.
public static bool ContainsTokenMonty2(string value, string token, char delimiter = ';')
{
// this code REQUIRES little endian architecture
// it will not function correctly on big endian
// this code assumes you're running x64
// it _works_ regardless, but some ifdef junk for x32
// would be handy if you care about ideal x32 perf
const int charsPerLong = sizeof(long) / sizeof(char); // 4
const int charsPerInt = sizeof(int) / sizeof(char); // 2
const int bytesPerChar = sizeof(char) / sizeof(byte); // 2
if (string.IsNullOrEmpty(token)) return false;
if (string.IsNullOrEmpty(value)) return false;
var delimiterTwice = (delimiter << 16) | delimiter;
var valueLength = value.Length;
var tokenLength = token.Length;
if (tokenLength > valueLength) return false;
int tokenLongs;
bool tokenTrailingInt, tokenTrailingChar;
{
var remainingChars = tokenLength;
tokenLongs = remainingChars / charsPerLong;
tokenTrailingInt = (tokenLength & 0x02) != 0;
tokenTrailingChar = (tokenLength & 0x01) != 0;
}
var tokenByteLength = tokenLength * bytesPerChar;
unsafe
{
fixed (char* valuePtr = value, tokenPtr = token)
{
var curValuePtr = valuePtr;
var endValuePtr = valuePtr + valueLength ;
while (true)
{
// 8-byte steps
long* tokenLongPtr = (long*)tokenPtr;
{
for (var i = 0; i < tokenLongs; i++)
{
var tokenLong = *tokenLongPtr;
var valueLong = *((long*)curValuePtr);
if (tokenLong == valueLong)
{
// we have a match, continue
tokenLongPtr++;
curValuePtr += charsPerLong;
continue;
}
else
{
goto advanceToNextDelimiter;
}
}
}
// can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride
int* tokenIntPtr = (int*)tokenLongPtr;
if (tokenTrailingInt)
{
var tokenInt = *tokenIntPtr;
var valueInt = *((int*)curValuePtr);
if (tokenInt == valueInt)
{
// we have a match, continue
tokenIntPtr++;
curValuePtr += charsPerInt;
}
else
{
goto advanceToNextDelimiter;
}
}
// likewise, only 1 2-byte value or it'd be handled in the 4-byte check
char* tokenCharPtr = (char*)tokenIntPtr;
if (tokenTrailingChar)
{
var tokenChar = *tokenCharPtr;
var valueChar = *curValuePtr;
if (tokenChar == valueChar)
{
// we have a match, continue
tokenCharPtr++;
curValuePtr++;
}
else
{
goto advanceToNextDelimiter;
}
}
// at this point, the full text of token has been matched, so we need to check if the _next_ char
// in value is a delimiter or the end of value
if (curValuePtr == endValuePtr || *curValuePtr == delimiter)
{
return true;
}
// in this case, we found a string that starts with value, but is not value
// "Foo" in "FooBar" for example
// advance here to skip the rest of whatever's before
// the next delimiter
advanceToNextDelimiter:
// Really dirty trick time
//
// C# strings are length prefixed _and_ null terminated (see: https://msdn.microsoft.com/en-us/library/aa664784(v=vs.71).aspx)
// with '\0' (that is a 2-byte char) so we can read 1 past the "end" of the string w/o issue.
// This lets us process an int at a time (2 chars), so if we precalc a mask and do some fancy
// bit twiddling we can avoid a branch per-char.
while (true)
{
var curVal = *((int*)curValuePtr);
var masked = curVal ^ delimiterTwice; // whole char chunks will be zero if any equal delimiter
var temp = masked & 0x7FFF7FFF; // zero out the high bits of both chars
temp = temp + 0x7FFF7FFF; // will overflow _into_ the high bits of any other bits are set in either value
temp = (int)(temp & 0x80008000); // high bits of each char will be set if any of the values was zero
temp = temp | 0x7FFF7FFF; // whole char will be ones if any of the bits in the char are
temp = ~temp; // if both chars aren't matches, temp will be set entirely; flip so it'll be zero in that case
var neitherMatch = temp == 0;
if (neitherMatch)
{
curValuePtr += charsPerInt;
if(curValuePtr >= endValuePtr)
{
return false;
}
continue;
}
var top16 = temp & 0xFFFF0000;
if (top16 != 0)
{
// got one, and it's in the top of the int under C# rules (big endian),
// but because we're little endian it's actually _at_
// the tail end of the word (so advance two)
curValuePtr += charsPerInt;
break;
}
var bottom16 = temp & 0x0000FFFF;
if (bottom16 != 0)
{
// got one, and it's at the bottom of the int under C# rules (big endian),
// but because we're little endian it's actually _at_
// the head of the word (so just advance one)
curValuePtr += 1;
}
}
// we only need to do this check after a delimiter is advanced
// because this check _itself_ guarantees that we won't
// exceed the end of value before we're done comparing
// against token
var remainingBytesInValue = ((byte*)endValuePtr) - ((byte*)curValuePtr); // we do this bytes to save an op
if (remainingBytesInValue < tokenByteLength)
{
// not possible for token to fit in string, fail early
return false;
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment