Skip to content

Instantly share code, notes, and snippets.

@bbowyersmyth
Created July 24, 2016 07:13
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save bbowyersmyth/81cd1b6155d70fb9c6b416ec3660cb89 to your computer and use it in GitHub Desktop.
using BenchmarkDotNet.Attributes;
using System;
namespace ConsoleApplication2
{
[Config("jobs=RyuJitX64")]
public class IndexOf
{
// Not found
//[Params(1, 12, 20, 30, 40, 50, 100, 200, 500)]
//public int length = 0;
// Different at pos n
[Params(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)]
public int offset = 0;
public int length = 15;
public string _string;
[Setup]
public void Setup()
{
// Not found
//_string = new string('a', length);
// Different at pos n
_string = new string('a', length);
var chars = _string.ToCharArray();
chars[offset] = 'X';
_string = new string(chars);
}
[Benchmark]
public int NewIndexOf()
{
return NewIndexOf('X', 0, length);
}
[Benchmark]
public int NewIndexOfMask()
{
return NewIndexOfMask('X', 0, length);
}
public unsafe int NewIndexOf(char value, int startIndex, int count)
{
if (startIndex < 0 || startIndex > _string.Length)
throw new ArgumentOutOfRangeException("startIndex", "ArgumentOutOfRange_Index");
if (count < 0 || count > _string.Length - startIndex)
throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_Count");
fixed (char* pChars = _string)
{
char* pCh = pChars + startIndex;
while (count >= 4)
{
if (*pCh == value) goto ReturnIndex;
if (*(pCh + 1) == value) goto ReturnIndex1;
if (*(pCh + 2) == value) goto ReturnIndex2;
if (*(pCh + 3) == value) goto ReturnIndex3;
count -= 4;
pCh += 4;
}
while (count > 0)
{
if (*pCh == value) goto ReturnIndex;
count--;
pCh++;
}
return -1;
ReturnIndex3: pCh++;
ReturnIndex2: pCh++;
ReturnIndex1: pCh++;
ReturnIndex:
return (int)(pCh - pChars);
}
}
public unsafe int NewIndexOfMask(char value, int startIndex, int count)
{
if (startIndex < 0 || startIndex > _string.Length)
throw new ArgumentOutOfRangeException("startIndex", "ArgumentOutOfRange_Index");
if (count < 0 || count > _string.Length - startIndex)
throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_Count");
fixed (char* pChars = _string)
{
char* pCh = pChars + startIndex;
char* pEnd = pCh + count;
#if BIT64
// We want to stick to the normal loop if
// the high bit is set on the char otherwise the
// 'optimized' version will cause massive performance
// degradations. (see notes below)
if (value <= '\u7fff')
{
// We use a different implementation on x64
// than on x86, processing one word at a time. This
// enables us to avoid 3 branches per iteration than
// had we used the loop below, but also takes some
// more operations (xor, add, or). It may also
// report false positives if a character in the
// string has its high bit set.
// STEP 1: Aligning the pointer
// We need to align the pointer on 8 bytes,
// since we'll be casting it to ulong* and
// reading from it.
// Chunk: the number of chars we'll search
// per iteration of the loop
const int Chunk = 12;
// We want to calculate the number of chars
// until the next aligned address and combine
// that with the chars required for at least
// one iteration of the optimized loop. If it's
// not enough, jump straight to the fallback loop.
// This calculates the number of bytes until the next aligned address
// We have to use uint and 0u - to stop the jit from generating
// some redundant code while doing math
uint ualign = (0u - (uint)pCh) % 8; // 0 -> 0, 2 -> 6, 4 -> 4, 6 -> 2, 8 -> 0
// Contract.Assert(ualign % 2 == 0); // char* from strings should always have an even address
ualign /= 2; // we're counting in chars, not bytes, so divide by sizeof(char) which is 2
// Contract.Assert(((uint)pCh + ualign) % 8 == 0);
// Now we have to cast to int to prevent Roslyn
// from converting everything to long when mixing
// ints and uints
int alignment = (int)ualign;
if (alignment + Chunk > count)
{
goto FallbackLoop;
}
char* pAlign = pCh + alignment;
while (pCh != pAlign)
{
if (*pCh == value)
goto ReturnIndex;
pCh++;
}
//Contract.Assert((int)pCh % 8 == 0);
//Contract.Assert(count >= Chunk);
// STEP 2: The main loop
// Here, we're going to take value and repeat it
// 4 times into a word; e.g. if someone gave us
// the char 0x1234, we want to create the word
// 0x1234123412341234.
// Then, we're going to read a word at a time
// from pCh and xor it with this mask. If value
// is present in the word, this is going to
// create a zero in its place.
// As an example, say we're reading the substring
// "abcd" and we're looking for 'a'. When we xor
// it, this will happen:
// 0x0061 0x0062 0x0063 0x0064
// ^ 0x0061 0x0061 0x0061 0x0061
// =============================
// 0x0000 0x0003 0x0002 0x0005
// As you can see, the position that contains 'a'
// has been zeroed out.
// STEP 2.1: Detecting 0
// We then add ZeroMask to the word, which
// has the same effect as adding 0x7fff to
// each of the individual chars. This will
// set the high bit of the char only if the
// char is 0 or > 0x8000. Finally, we OR
// each of the chars with 0x7fff, causing
// all of the high bits to be set except the msb.
// This has the effect that if all of the
// chars > 0 and <= 0x8000
// (which should be the common case if the char isn't present),
// then all the bits in the word will be set
// and we know that 0 isn't in the word.
// As noted, it's possible for this to misfire
// if the word contains something > 0x8000,
// since adding 0x7fff to that will cause it
// to overflow and the high bit to not be set.
// No worries, however, since we check for
// misfires (see below) before returning.
// Continuing on our previous example, here
// is what this step will do to the string:
// 0x0000 0x0003 0x0002 0x0005
// + 0x7fff 0x7fff 0x7fff 0x7fff
// =============================
// 0x7fff 0x8002 0x8001 0x8004
// | 0x7fff 0x7fff 0x7fff 0x7fff
// =============================
// 0x7fff 0xffff 0xffff 0xffff
// ^
// Bit unset here!
// Note: This is based on the trick from
// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord,
// using 0x7fff instead of 0x7f7f because chars are 2 in size.
const ulong ZeroMask = 0x7fff7fff7fff7fff;
const ulong AllBitsSet = ulong.MaxValue;
ulong valueMask = ((uint)value << 16) | value;
valueMask = (valueMask << 32) | valueMask;
char* pStop = pEnd - Chunk; // we want to make sure there are at least Chunk chars left
Loop:
do // we know there's enough room the first time around, due to the earlier assert
{
// Code repetition is for the purposes of loop unrolling,
// so we don't have to write to pCh and check the
// loop condition as often
ulong zeroed = *(ulong*)pCh ^ valueMask;
if (((zeroed + ZeroMask) | ZeroMask) != AllBitsSet) goto MaybeReturn;
zeroed = *(ulong*)(pCh + 4) ^ valueMask;
if (((zeroed + ZeroMask) | ZeroMask) != AllBitsSet) goto MaybeReturn4;
zeroed = *(ulong*)(pCh + 8) ^ valueMask;
if (((zeroed + ZeroMask) | ZeroMask) != AllBitsSet) goto MaybeReturn8;
pCh += Chunk;
}
while (pCh <= pStop);
// STEP 3.2: The fallback loop
// If we weren't able to find it within the optimized
// loop, go to the fallback loop and process each character
// individually.
goto FallbackLoop;
// STEP 3.1: Detection and misfires
// If we reached here, we either
// 1) saw 0 in the xored string, or
// 2) saw something > 0x8000 in the xored string.
// The latter case can happen when a char with
// its high bit set (excluding value | 0x8000,
// which will produce 0x8000 when xored) is
// present in the original string. We know that
// the high bit has to come from the char in the
// string, rather than value, since earlier we
// checked value < '\u8000'. (Otherwise, we would
// be entering this codepath quite frequently for
// values with the msb set.)
// Check each char individually to prevent
// false positives. We have to do this anyways,
// since we don't know which position in the
// word the value would be at.
MaybeReturn8: pCh += 4;
MaybeReturn4: pCh += 4;
MaybeReturn:
if (pCh[0] == value)
goto ReturnIndex;
if (pCh[1] == value)
goto ReturnIndex1;
if (pCh[2] == value)
goto ReturnIndex2;
if (pCh[3] == value)
goto ReturnIndex3;
// We misfired if we got here; so, continue searching at the next word.
// Note that we're incrementing pCh by 4 rather than Chunk,
// since we don't want to skip over the next word in the loop.
// (This will still leave pCh 8-byte aligned.)
// Also note that we have to repeat the loop condition check,
// since the main loop is a do... while loop and assumes there's
// at least Chunk chars left at the beginning.
pCh += 4; // We just processed one word, which on 64-bit is 4 chars
if (pCh <= pStop)
goto Loop;
}
else
{
#endif // BIT64
// On x86 wee use a (relatively) straightforward
// implementation: Just loop unroll, and check each
// char for value.
// We also go down this codepath on 64-bit
// for chars >= '\u8000', see notes above.
char* pStop = pEnd - 4; // we need at least 4 chars for 1 iteration of the loop
while (pCh <= pStop)
{
if (*pCh == value) goto ReturnIndex;
if (*(pCh + 1) == value) goto ReturnIndex1;
if (*(pCh + 2) == value) goto ReturnIndex2;
if (*(pCh + 3) == value) goto ReturnIndex3;
pCh += 4;
}
#if BIT64
}
FallbackLoop:
#endif // BIT64
while (pCh != pEnd)
{
if (*pCh == value)
goto ReturnIndex;
pCh++;
}
return -1;
ReturnIndex3: pCh++;
ReturnIndex2: pCh++;
ReturnIndex1: pCh++;
ReturnIndex:
return (int)(pCh - pChars);
}
}
}
}
@bbowyersmyth
Copy link
Author

String of 15 chars, index found at offset

Method offset Median StdDev
NewIndexOf 0 1.6149 ns 0.0740 ns
NewIndexOfMask 0 2.6913 ns 0.0988 ns
NewIndexOf 1 1.6181 ns 0.0642 ns
NewIndexOfMask 1 2.9676 ns 0.1080 ns
NewIndexOf 2 1.8864 ns 0.0648 ns
NewIndexOfMask 2 4.0452 ns 0.0927 ns
NewIndexOf 3 2.1521 ns 0.0462 ns
NewIndexOfMask 3 4.3100 ns 0.1334 ns
NewIndexOf 4 2.4232 ns 0.0744 ns
NewIndexOfMask 4 4.6741 ns 0.0856 ns
NewIndexOf 5 2.4193 ns 0.0662 ns
NewIndexOfMask 5 4.6721 ns 0.1335 ns
NewIndexOf 6 2.6926 ns 0.0643 ns
NewIndexOfMask 6 4.6809 ns 0.1202 ns
NewIndexOf 7 3.0878 ns 0.1684 ns
NewIndexOfMask 7 4.9488 ns 0.0819 ns
NewIndexOf 8 3.2379 ns 0.0745 ns
NewIndexOfMask 8 4.9556 ns 0.1158 ns
NewIndexOf 9 3.2328 ns 0.0707 ns
NewIndexOfMask 9 5.2193 ns 0.0904 ns
NewIndexOf 10 3.5007 ns 0.0833 ns
NewIndexOfMask 10 5.2204 ns 0.1189 ns
NewIndexOf 11 4.0354 ns 0.0689 ns
NewIndexOfMask 11 5.3074 ns 0.1058 ns
NewIndexOf 12 3.7641 ns 0.1487 ns
NewIndexOfMask 12 5.5144 ns 0.1315 ns
NewIndexOf 13 4.3020 ns 0.1599 ns
NewIndexOfMask 13 5.7480 ns 0.0861 ns
NewIndexOf 14 4.9432 ns 0.0456 ns
NewIndexOfMask 14 5.6020 ns 0.1650 ns

Not found in string of length

Method length Median StdDev
NewIndexOf 1 1.6256 ns 0.0741 ns
NewIndexOfMask 1 2.4346 ns 0.0758 ns
NewIndexOf 12 3.5212 ns 0.1139 ns
NewIndexOfMask 12 8.1650 ns 0.1636 ns
NewIndexOf 20 5.2177 ns 0.1367 ns
NewIndexOfMask 20 8.4462 ns 0.2003 ns
NewIndexOf 30 7.5331 ns 0.1905 ns
NewIndexOfMask 30 8.1159 ns 0.1703 ns
NewIndexOf 40 9.7737 ns 0.2373 ns
NewIndexOfMask 40 9.3015 ns 0.1808 ns
NewIndexOf 50 11.4125 ns 0.2679 ns
NewIndexOfMask 50 10.7487 ns 0.2397 ns
NewIndexOf 100 30.1480 ns 0.8667 ns
NewIndexOfMask 100 18.7080 ns 0.3850 ns
NewIndexOf 200 49.1154 ns 1.3110 ns
NewIndexOfMask 200 36.5417 ns 0.7455 ns
NewIndexOf 500 109.2226 ns 0.3247 ns
NewIndexOfMask 500 80.0745 ns 0.9029 ns

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment