-
-
Save kevin-montrose/7bb52718af7dc3fba3258db6ab7e6469 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static bool ContainsTokenMonty2(string value, string token, char delimiter = ';') | |
{ | |
// this code REQUIRES little endian architecture | |
// it will not function correctly on big endian | |
// this code assumes you're running x64 | |
// it _works_ regardless, but some ifdef junk for x32 | |
// would be handy if you care about ideal x32 perf | |
const int charsPerLong = sizeof(long) / sizeof(char); // 4 | |
const int charsPerInt = sizeof(int) / sizeof(char); // 2 | |
const int bytesPerChar = sizeof(char) / sizeof(byte); // 2 | |
if (string.IsNullOrEmpty(token)) return false; | |
if (string.IsNullOrEmpty(value)) return false; | |
var delimiterTwice = (delimiter << 16) | delimiter; | |
var valueLength = value.Length; | |
var tokenLength = token.Length; | |
if (tokenLength > valueLength) return false; | |
int tokenLongs; | |
bool tokenTrailingInt, tokenTrailingChar; | |
{ | |
var remainingChars = tokenLength; | |
tokenLongs = remainingChars / charsPerLong; | |
tokenTrailingInt = (tokenLength & 0x02) != 0; | |
tokenTrailingChar = (tokenLength & 0x01) != 0; | |
} | |
var tokenByteLength = tokenLength * bytesPerChar; | |
unsafe | |
{ | |
fixed (char* valuePtr = value, tokenPtr = token) | |
{ | |
var curValuePtr = valuePtr; | |
var endValuePtr = valuePtr + valueLength ; | |
while (true) | |
{ | |
// 8-byte steps | |
long* tokenLongPtr = (long*)tokenPtr; | |
{ | |
for (var i = 0; i < tokenLongs; i++) | |
{ | |
var tokenLong = *tokenLongPtr; | |
var valueLong = *((long*)curValuePtr); | |
if (tokenLong == valueLong) | |
{ | |
// we have a match, continue | |
tokenLongPtr++; | |
curValuePtr += charsPerLong; | |
continue; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
} | |
// can only be 1 4-byte value, 'cause if there were 2 it'd have been handled in the 8-byte stride | |
int* tokenIntPtr = (int*)tokenLongPtr; | |
if (tokenTrailingInt) | |
{ | |
var tokenInt = *tokenIntPtr; | |
var valueInt = *((int*)curValuePtr); | |
if (tokenInt == valueInt) | |
{ | |
// we have a match, continue | |
tokenIntPtr++; | |
curValuePtr += charsPerInt; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
// likewise, only 1 2-byte value or it'd be handled in the 4-byte check | |
char* tokenCharPtr = (char*)tokenIntPtr; | |
if (tokenTrailingChar) | |
{ | |
var tokenChar = *tokenCharPtr; | |
var valueChar = *curValuePtr; | |
if (tokenChar == valueChar) | |
{ | |
// we have a match, continue | |
tokenCharPtr++; | |
curValuePtr++; | |
} | |
else | |
{ | |
goto advanceToNextDelimiter; | |
} | |
} | |
// at this point, the full text of token has been matched, so we need to check if the _next_ char | |
// in value is a delimiter or the end of value | |
if (curValuePtr == endValuePtr || *curValuePtr == delimiter) | |
{ | |
return true; | |
} | |
// in this case, we found a string that starts with value, but is not value | |
// "Foo" in "FooBar" for example | |
// advance here to skip the rest of whatever's before | |
// the next delimiter | |
advanceToNextDelimiter: | |
// Really dirty trick time | |
// | |
// C# strings are length prefixed _and_ null terminated (see: https://msdn.microsoft.com/en-us/library/aa664784(v=vs.71).aspx) | |
// with '\0' (that is a 2-byte char) so we can read 1 past the "end" of the string w/o issue. | |
// This lets us process an int at a time (2 chars), so if we precalc a mask and do some fancy | |
// bit twiddling we can avoid a branch per-char. | |
while (true) | |
{ | |
var curVal = *((int*)curValuePtr); | |
var masked = curVal ^ delimiterTwice; // whole char chunks will be zero if any equal delimiter | |
var temp = masked & 0x7FFF7FFF; // zero out the high bits of both chars | |
temp = temp + 0x7FFF7FFF; // will overflow _into_ the high bits of any other bits are set in either value | |
temp = (int)(temp & 0x80008000); // high bits of each char will be set if any of the values was zero | |
temp = temp | 0x7FFF7FFF; // whole char will be ones if any of the bits in the char are | |
temp = ~temp; // if both chars aren't matches, temp will be set entirely; flip so it'll be zero in that case | |
var neitherMatch = temp == 0; | |
if (neitherMatch) | |
{ | |
curValuePtr += charsPerInt; | |
if(curValuePtr >= endValuePtr) | |
{ | |
return false; | |
} | |
continue; | |
} | |
var top16 = temp & 0xFFFF0000; | |
if (top16 != 0) | |
{ | |
// got one, and it's in the top of the int under C# rules (big endian), | |
// but because we're little endian it's actually _at_ | |
// the tail end of the word (so advance two) | |
curValuePtr += charsPerInt; | |
break; | |
} | |
var bottom16 = temp & 0x0000FFFF; | |
if (bottom16 != 0) | |
{ | |
// got one, and it's at the bottom of the int under C# rules (big endian), | |
// but because we're little endian it's actually _at_ | |
// the head of the word (so just advance one) | |
curValuePtr += 1; | |
} | |
} | |
// we only need to do this check after a delimiter is advanced | |
// because this check _itself_ guarantees that we won't | |
// exceed the end of value before we're done comparing | |
// against token | |
var remainingBytesInValue = ((byte*)endValuePtr) - ((byte*)curValuePtr); // we do this bytes to save an op | |
if (remainingBytesInValue < tokenByteLength) | |
{ | |
// not possible for token to fit in string, fail early | |
return false; | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment