Skip to content

Instantly share code, notes, and snippets.

@EgorBo
Last active December 22, 2021 11:07
Show Gist options
  • Save EgorBo/1d059726dae285e3a1db501896e8a1bd to your computer and use it in GitHub Desktop.
Faster_SpanHelpers_IndexOf.cs
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Text;
using BenchmarkDotNet.Attributes;
/// <summary>
/// Benchmark harness comparing the BCL's <c>SpanHelpers.IndexOfAny(byte, byte)</c>
/// against the experimental <c>SpanHelpers_Egor.IndexOfAny</c> on three HTTP-style
/// payloads of increasing size. Each payload ends with "\r\n\r\n", so the search
/// for '\r'/'\n' first matches at the header terminator.
/// </summary>
public class P
{
    // "Connection: keep-alive\r\n\r\n" (26 bytes) — short input, scalar/SSE territory.
    static byte[] data1 = { 67, 111, 110, 110, 101, 99, 116, 105, 111, 110, 58, 32, 107, 101, 101, 112, 45, 97, 108, 105, 118, 101, 13, 10, 13, 10 };
    // "Host: 10.0.0.102:5000\r\nConnection: keep-alive\r\n\r\n" (49 bytes) — mid-size input.
    static byte[] data2 = { 72, 111, 115, 116, 58, 32, 49, 48, 46, 48, 46, 48, 46, 49, 48, 50, 58, 53, 48, 48, 48, 13, 10, 67, 111, 110, 110, 101, 99, 116, 105, 111, 110, 58, 32, 107, 101, 101, 112, 45, 97, 108, 105, 118, 101, 13, 10, 13, 10 };
    // "Accept: application/json,...\r\nHost: ...\r\nConnection: keep-alive\r\n\r\n" (150 bytes) — large input, double-AVX territory.
    static byte[] data3 = { 65, 99, 99, 101, 112, 116, 58, 32, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 47, 106, 115, 111, 110, 44, 116, 101, 120, 116, 47, 104, 116, 109, 108, 59, 113, 61, 48, 46, 57, 44, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 47, 120, 104, 116, 109, 108, 43, 120, 109, 108, 59, 113, 61, 48, 46, 57, 44, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 47, 120, 109, 108, 59, 113, 61, 48, 46, 56, 44, 42, 47, 42, 59, 113, 61, 48, 46, 55, 13, 10, 72, 111, 115, 116, 58, 32, 49, 48, 46, 48, 46, 48, 46, 49, 48, 50, 58, 53, 48, 48, 48, 13, 10, 67, 111, 110, 110, 101, 99, 116, 105, 111, 110, 58, 32, 107, 101, 101, 112, 45, 97, 108, 105, 118, 101, 13, 10, 13, 10 };

    /// <summary>
    /// Entry point: sanity-checks that both implementations agree on every payload,
    /// then launches the BenchmarkDotNet run.
    /// </summary>
    static void Main()
    {
        var p = new P();
        // NOTE: Debug.Assert only fires in DEBUG builds; BenchmarkDotNet itself
        // compiles benchmarks in Release, so these checks matter when running
        // this Main directly from a Debug configuration.
        Debug.Assert(p.BCL_data1() == p.Egor_data1());
        Debug.Assert(p.BCL_data2() == p.Egor_data2());
        Debug.Assert(p.BCL_data3() == p.Egor_data3());
        BenchmarkDotNet.Running.BenchmarkRunner.Run<P>();
    }

    // Baseline: BCL implementation, 26-byte input.
    [Benchmark]
    public int BCL_data1()
    {
        var d = data1;
        return SpanHelpers.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    // Baseline: BCL implementation, 49-byte input.
    [Benchmark]
    public int BCL_data2()
    {
        var d = data2;
        return SpanHelpers.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    // Baseline: BCL implementation, 150-byte input.
    [Benchmark]
    public int BCL_data3()
    {
        var d = data3;
        return SpanHelpers.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    // Candidate: experimental implementation, 26-byte input.
    [Benchmark]
    public int Egor_data1()
    {
        var d = data1;
        return SpanHelpers_Egor.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    // Candidate: experimental implementation, 49-byte input.
    [Benchmark]
    public int Egor_data2()
    {
        var d = data2;
        return SpanHelpers_Egor.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    // Candidate: experimental implementation, 150-byte input.
    [Benchmark]
    public int Egor_data3()
    {
        var d = data3;
        return SpanHelpers_Egor.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }
}
// My changes:
/// <summary>
/// Experimental vectorized IndexOfAny(byte, byte): a scalar loop for short inputs,
/// an SSE2 path for 16+ bytes, an AVX2 path for 32+ bytes, and a 2x-unrolled AVX2
/// path for 64+ bytes.
/// </summary>
public static class SpanHelpers_Egor
{
    /// <summary>
    /// Returns the index of the first occurrence of <paramref name="value0"/> or
    /// <paramref name="value1"/> within the <paramref name="length"/> bytes starting
    /// at <paramref name="searchSpace"/>, or -1 when neither value is present.
    /// </summary>
    /// <param name="searchSpace">Reference to the first byte of the buffer; the caller
    /// guarantees at least <paramref name="length"/> readable bytes.</param>
    /// <param name="value0">First byte value to search for.</param>
    /// <param name="value1">Second byte value to search for.</param>
    /// <param name="length">Number of bytes to examine; must be non-negative.</param>
    /// <returns>Zero-based index of the first match, or -1.</returns>
    public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, nint length)
    {
        Debug.Assert(length >= 0);
        nuint ulen = (nuint)length;
        // Guard the vector paths with IsSupported so the method degrades to the scalar
        // loop (instead of throwing PlatformNotSupportedException) on hardware without
        // SSE2/AVX2. The JIT treats IsSupported as a constant, so dead branches vanish.
        if (Avx2.IsSupported && ulen >= 64)
            goto DOUBLE_AVX;
        if (Avx2.IsSupported && ulen >= 32)
            goto AVX;
        if (Sse2.IsSupported && ulen >= 16)
            goto SSE;
        // Scalar fallback: short inputs, or hardware without the intrinsics.
        for (nuint i = 0; i < ulen; i++)
        {
            byte x = Unsafe.AddByteOffset(ref searchSpace, i);
            if (x == value0 || x == value1)
                return (int)i;
        }
        return -1;
    SSE:
        {
            var vec0 = Vector128.Create(value0);
            var vec1 = Vector128.Create(value1);
            nuint offset = 0;
            ref byte b = ref searchSpace;
            while (offset < ulen)
            {
                var vector = Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref b, offset));
                // MoveMask packs the per-byte equality results into bit flags:
                // bit i of 'matches' corresponds to byte (offset + i).
                var matches = Sse2.MoveMask(
                    Sse2.Or(
                        Sse2.CompareEqual(vec0, vector),
                        Sse2.CompareEqual(vec1, vector))
                    .AsByte());
                if (matches == 0)
                {
                    offset += (nuint)Vector128<byte>.Count;
                    // BUGFIX: a read at 'offset' covers offset .. offset+15, so the
                    // last chunk must be re-anchored whenever offset + 16 > ulen —
                    // the old check (offset > ulen) still allowed reads past the
                    // buffer (and false matches from out-of-range bytes) whenever
                    // ulen was not a multiple of 16. Overlapping the previous chunk
                    // is harmless: those bytes were already checked as non-matching.
                    if (offset < ulen && offset + (nuint)Vector128<byte>.Count > ulen)
                        offset = ulen - (nuint)Vector128<byte>.Count;
                    continue;
                }
                return (int)(offset + (nuint)BitOperations.TrailingZeroCount(matches));
            }
            return -1;
        }
    AVX:
        {
            var vec0 = Vector256.Create(value0);
            var vec1 = Vector256.Create(value1);
            nuint offset = 0;
            ref byte b = ref searchSpace;
            while (offset < ulen)
            {
                var vector = Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref b, offset));
                var matches = Avx2.MoveMask(
                    Avx2.Or(
                        Avx2.CompareEqual(vec0, vector),
                        Avx2.CompareEqual(vec1, vector))
                    .AsByte());
                if (matches == 0)
                {
                    offset += (nuint)Vector256<byte>.Count;
                    // BUGFIX: same end-of-buffer re-anchoring as the SSE path —
                    // the final 32-byte read must end exactly at ulen.
                    if (offset < ulen && offset + (nuint)Vector256<byte>.Count > ulen)
                        offset = ulen - (nuint)Vector256<byte>.Count;
                    continue;
                }
                return (int)(offset + (nuint)BitOperations.TrailingZeroCount(matches));
            }
            return -1;
        }
    DOUBLE_AVX:
        {
            var vec0 = Vector256.Create(value0);
            var vec1 = Vector256.Create(value1);
            nuint offset = 0;
            ref byte b = ref searchSpace;
            while (offset < ulen)
            {
                // Process two 32-byte chunks per iteration to hide instruction latency.
                Vector256<byte> vector1 = Unsafe.ReadUnaligned<Vector256<byte>>(
                    ref Unsafe.AddByteOffset(ref b, offset));
                Vector256<byte> vector2 = Unsafe.ReadUnaligned<Vector256<byte>>(
                    ref Unsafe.AddByteOffset(ref b, offset + (nuint)Vector256<byte>.Count));
                Vector256<byte> cmp1 = Avx2.CompareEqual(vec0, vector1);
                Vector256<byte> cmp2 = Avx2.CompareEqual(vec1, vector1);
                Vector256<byte> cmp3 = Avx2.CompareEqual(vec0, vector2);
                Vector256<byte> cmp4 = Avx2.CompareEqual(vec1, vector2);
                Vector256<byte> or1 = Avx2.Or(cmp1, cmp2).AsByte();
                Vector256<byte> or2 = Avx2.Or(cmp3, cmp4).AsByte();
                int matches1 = Avx2.MoveMask(or1);
                int matches2 = Avx2.MoveMask(or2);
                if ((matches1 | matches2) == 0)
                {
                    offset += (nuint)Vector256<byte>.Count * 2;
                    // BUGFIX: each iteration reads 64 bytes, so re-anchor the final
                    // iteration whenever offset + 64 > ulen (the old offset > ulen
                    // check let the pair of reads run past the buffer). The overlap
                    // with already-checked bytes is harmless.
                    if (offset < ulen && offset + ((nuint)Vector256<byte>.Count * 2) > ulen)
                        offset = ulen - ((nuint)Vector256<byte>.Count * 2);
                    continue;
                }
                if (matches1 != 0)
                {
                    return (int)(offset + (nuint)BitOperations.TrailingZeroCount(matches1));
                }
                // First chunk was clean, so the first match lives in the second one.
                return (int)(offset + (nuint)Vector256<byte>.Count + (nuint)BitOperations.TrailingZeroCount(matches2));
            }
        }
        return -1;
    }
}
//
//
// Extracted from SpanHelpers.byte.cs (100%)
//
//
/// <summary>
/// Verbatim extract of the BCL implementation of IndexOfAny(byte, byte)
/// (SpanHelpers.byte.cs), kept here unmodified as the benchmark baseline.
/// </summary>
public static class SpanHelpers
{
/// <summary>Reads an unaligned Vector&lt;byte&gt; starting at start + offset.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<byte> LoadVector(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
/// <summary>Reads an unaligned Vector256&lt;byte&gt; starting at start + offset.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> LoadVector256(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
/// <summary>Reads an unaligned Vector128&lt;byte&gt; starting at start + offset.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> LoadVector128(ref byte start, nuint offset)
=> Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
// Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138
/// <summary>
/// Given a Vector&lt;byte&gt; whose matching lanes are all-ones (a CompareEqual result),
/// returns the byte index of the first matching lane.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(Vector<byte> match)
{
var vector64 = Vector.AsVectorUInt64(match);
ulong candidate = 0;
int i = 0;
// Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001
for (; i < Vector<ulong>.Count; i++)
{
candidate = vector64[i];
if (candidate != 0)
{
break;
}
}
// Single LEA instruction with jitted const (using function result)
return i * 8 + LocateFirstFoundByte(candidate);
}
/// <summary>Byte index of the lowest set byte inside a 64-bit match word.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int LocateFirstFoundByte(ulong match)
=> BitOperations.TrailingZeroCount(match) >> 3;
/// <summary>
/// ARM64 helper: given a CompareEqual result, finds the index of the first
/// matching lane via AddPairwise narrowing; returns false when no lane matched.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool TryFindFirstMatchedLane(Vector128<byte> mask, Vector128<byte> compareResult, ref int matchedLane)
{
Debug.Assert(AdvSimd.Arm64.IsSupported);
// Find the first lane that is set inside compareResult.
Vector128<byte> maskedSelectedLanes = AdvSimd.And(compareResult, mask);
Vector128<byte> pairwiseSelectedLane = AdvSimd.Arm64.AddPairwise(maskedSelectedLanes, maskedSelectedLanes);
ulong selectedLanes = pairwiseSelectedLane.AsUInt64().ToScalar();
if (selectedLanes == 0)
{
// all lanes are zero, so nothing matched.
return false;
}
// Find the first lane that is set inside compareResult.
matchedLane = BitOperations.TrailingZeroCount(selectedLanes) >> 2;
return true;
}
/// <summary>
/// Returns the index of the first occurrence of value0 or value1 in the
/// length bytes starting at searchSpace, or -1 if neither is present.
/// Dispatches between a scalar unrolled loop, SSE2/AVX2, AdvSimd (ARM64),
/// and Vector&lt;T&gt; paths depending on hardware support and input length.
/// </summary>
public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int length)
{
Debug.Assert(length >= 0);
uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions
nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
nuint lengthToExamine = (nuint)(uint)length;
if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
{
// Avx2 branch also operates on Sse2 sizes, so check is combined.
nint vectorDiff = (nint)length - Vector128<byte>.Count;
if (vectorDiff >= 0)
{
// >= Sse2 intrinsics are supported, and length is enough to use them so use that path.
// We jump forward to the intrinsics at the end of the method so a naive branch predict
// will choose the non-intrinsic path so short lengths which don't gain anything aren't
// overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths
// more than make this back from the intrinsics.
lengthToExamine = (nuint)vectorDiff;
goto IntrinsicsCompare;
}
}
else if (Vector.IsHardwareAccelerated)
{
// Calculate lengthToExamine here for test, as it is used later
nint vectorDiff = (nint)length - Vector<byte>.Count;
if (vectorDiff >= 0)
{
// Similar as above for Vector version
lengthToExamine = (nuint)vectorDiff;
goto IntrinsicsCompare;
}
}
// Scalar path: manually unrolled 8-at-a-time, then 4, then 1.
uint lookUp;
while (lengthToExamine >= 8)
{
lengthToExamine -= 8;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found1;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found2;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found3;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found4;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found5;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found6;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found7;
offset += 8;
}
if (lengthToExamine >= 4)
{
lengthToExamine -= 4;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found1;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found2;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found3;
offset += 4;
}
while (lengthToExamine > 0)
{
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found;
offset += 1;
lengthToExamine -= 1;
}
NotFound:
return -1;
Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
return (int)offset;
Found1:
return (int)(offset + 1);
Found2:
return (int)(offset + 2);
Found3:
return (int)(offset + 3);
Found4:
return (int)(offset + 4);
Found5:
return (int)(offset + 5);
Found6:
return (int)(offset + 6);
Found7:
return (int)(offset + 7);
IntrinsicsCompare:
// When we move into a Vectorized block, we process everything of Vector size;
// and then for any remainder we do a final compare of Vector size but starting at
// the end and forwards, which may overlap on an earlier compare.
// We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported.
if (Sse2.IsSupported)
{
int matches;
if (Avx2.IsSupported)
{
Vector256<byte> search;
// Guard as we may only have a valid size for Vector128; when we will move to the Sse2
// We have already subtracted Vector128<byte>.Count from lengthToExamine so compare against that
// to see if we have double the size for Vector256<byte>.Count
if (lengthToExamine >= (nuint)Vector128<byte>.Count)
{
Vector256<byte> values0 = Vector256.Create(value0);
Vector256<byte> values1 = Vector256.Create(value1);
// Subtract Vector128<byte>.Count so we have now subtracted Vector256<byte>.Count
lengthToExamine -= (nuint)Vector128<byte>.Count;
// First time this checks again against 0, however we will move into final compare if it fails.
while (lengthToExamine > offset)
{
search = LoadVector256(ref searchSpace, offset);
// Bitwise Or to combine the flagged matches for the second value to our match flags
matches = Avx2.MoveMask(
Avx2.Or(
Avx2.CompareEqual(values0, search),
Avx2.CompareEqual(values1, search)));
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
// None matched
offset += (nuint)Vector256<byte>.Count;
continue;
}
goto IntrinsicsMatch;
}
// Move to Vector length from end for final compare
search = LoadVector256(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
// Same as method as above
matches = Avx2.MoveMask(
Avx2.Or(
Avx2.CompareEqual(values0, search),
Avx2.CompareEqual(values1, search)));
if (matches == 0)
{
// None matched
goto NotFound;
}
goto IntrinsicsMatch;
}
}
// Initial size check was done on method entry.
Debug.Assert(length >= Vector128<byte>.Count);
{
Vector128<byte> search;
Vector128<byte> values0 = Vector128.Create(value0);
Vector128<byte> values1 = Vector128.Create(value1);
// First time this checks against 0 and we will move into final compare if it fails.
while (lengthToExamine > offset)
{
search = LoadVector128(ref searchSpace, offset);
matches = Sse2.MoveMask(
Sse2.Or(
Sse2.CompareEqual(values0, search),
Sse2.CompareEqual(values1, search))
.AsByte());
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
// None matched
offset += (nuint)Vector128<byte>.Count;
continue;
}
goto IntrinsicsMatch;
}
// Move to Vector length from end for final compare
search = LoadVector128(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
// Same as method as above
matches = Sse2.MoveMask(
Sse2.Or(
Sse2.CompareEqual(values0, search),
Sse2.CompareEqual(values1, search)));
if (matches == 0)
{
// None matched
goto NotFound;
}
}
IntrinsicsMatch:
// Find bitflag offset of first difference and add to current offset
offset += (nuint)BitOperations.TrailingZeroCount(matches);
goto Found;
}
else if (AdvSimd.Arm64.IsSupported)
{
// Mask to help find the first lane in compareResult that is set.
// LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on.
Vector128<byte> mask = Vector128.Create((ushort)0x1001).AsByte();
int matchedLane = 0;
Vector128<byte> search;
Vector128<byte> matches;
Vector128<byte> values0 = Vector128.Create(value0);
Vector128<byte> values1 = Vector128.Create(value1);
// First time this checks against 0 and we will move into final compare if it fails.
while (lengthToExamine > offset)
{
search = LoadVector128(ref searchSpace, offset);
matches = AdvSimd.Or(
AdvSimd.CompareEqual(values0, search),
AdvSimd.CompareEqual(values1, search));
if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane))
{
// Zero flags set so no matches
offset += (nuint)Vector128<byte>.Count;
continue;
}
// Find bitflag offset of first match and add to current offset
offset += (uint)matchedLane;
goto Found;
}
// Move to Vector length from end for final compare
search = LoadVector128(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
// Same as method as above
matches = AdvSimd.Or(
AdvSimd.CompareEqual(values0, search),
AdvSimd.CompareEqual(values1, search));
if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane))
{
// None matched
goto NotFound;
}
// Find bitflag offset of first match and add to current offset
offset += (nuint)(uint)matchedLane;
goto Found;
}
else if (Vector.IsHardwareAccelerated)
{
Vector<byte> values0 = new Vector<byte>(value0);
Vector<byte> values1 = new Vector<byte>(value1);
Vector<byte> search;
// First time this checks against 0 and we will move into final compare if it fails.
while (lengthToExamine > offset)
{
search = LoadVector(ref searchSpace, offset);
search = Vector.BitwiseOr(
Vector.Equals(search, values0),
Vector.Equals(search, values1));
if (Vector<byte>.Zero.Equals(search))
{
// None matched
offset += (nuint)Vector<byte>.Count;
continue;
}
goto VectorMatch;
}
// Move to Vector length from end for final compare
search = LoadVector(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
search = Vector.BitwiseOr(
Vector.Equals(search, values0),
Vector.Equals(search, values1));
if (Vector<byte>.Zero.Equals(search))
{
// None matched
goto NotFound;
}
VectorMatch:
offset += (nuint)LocateFirstFoundByte(search);
goto Found;
}
Debug.Fail("Unreachable");
goto NotFound;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment