Avx2/branch-optimized binary search in .NET

Binary search is theoretically optimal, but in practice it can be sped up substantially with AVX2 and branchless code, even in .NET Core.

Memory access is the limiting factor for binary search. Every element we access for comparison loads a whole cache line, so we can load a 32-byte vector almost for free, check whether it contains the target value, and if it does not, reduce the search space by 32/sizeof(T) elements instead of one.
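
As a rough standalone illustration of that idea (not part of the gist's code below): for `long` keys a 32-byte vector holds `Vector256<long>.Count == 4` elements, and a single `Avx2.CompareGreaterThan` plus `Avx2.MoveMask` tells us how many of those 4 lanes lie below the target. This sketch uses the BCL `BitOperations` helper where the code below uses `BitUtil.NumberOfLeadingZeros`.

```csharp
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class VectorCompareDemo
{
    static void Main()
    {
        if (!Avx2.IsSupported) return;

        // Four sorted longs, i.e. exactly one 32-byte vector.
        Span<long> data = stackalloc long[] { 10, 20, 30, 40 };
        long target = 25;

        var vec = Unsafe.ReadUnaligned<Vector256<long>>(
            ref Unsafe.As<long, byte>(ref MemoryMarshal.GetReference(data)));
        var valVec = Vector256.Create(target);

        // Lanes where target > element become all-ones; MoveMask packs the
        // sign bits of the 32 bytes into an int, 8 bits per long lane.
        var gt = Avx2.CompareGreaterThan(valVec, vec); // _mm256_cmpgt_epi64
        int mask = Avx2.MoveMask(gt.AsByte());

        // Number of "greater" lanes == number of set bytes / sizeof(long).
        int greaterCount = (32 - BitOperations.LeadingZeroCount((uint)mask)) / sizeof(long);
        Console.WriteLine($"mask=0x{mask:X8}, target is greater than {greaterCount} of the 4 elements");
        // Output: mask=0x0000FFFF, target is greater than 2 of the 4 elements
    }
}
```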

AVX512, with its _mm256_cmpge_epi64_mask instruction, should improve this even more, but it is not available in .NET yet.

Results for `long`: searching for [0,1,2,3,...] in a [0,2,4,6,...] array (50% hit rate).

SearchBench

| Case | MOPS | Elapsed | GC0 | GC1 | GC2 | Memory |
| --- | --- | --- | --- | --- | --- | --- |
| BS_Classic_2 | 570.38 | 15 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_1 | 569.90 | 15 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_1 | 505.60 | 17 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_2 | 504.03 | 17 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_4 | 503.77 | 17 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_4 | 288.84 | 29 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_8 | 285.34 | 29 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_16 | 186.61 | 45 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_8 | 183.58 | 46 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_32 | 151.42 | 55 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_64 | 117.92 | 71 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_16 | 111.73 | 75 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_128 | 95.59 | 88 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_32 | 90.19 | 93 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_64 | 71.43 | 117 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_256 | 63.05 | 133 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_128 | 60.40 | 139 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_256 | 47.07 | 178 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_512 | 40.02 | 210 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_1024 | 31.25 | 268 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_512 | 28.62 | 293 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_4096 | 22.84 | 367 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_1024 | 22.35 | 375 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Avx_16384 | 19.77 | 424 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_4096 | 19.03 | 441 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |
| BS_Classic_16384 | 17.71 | 474 ms | 0.0 | 0.0 | 0.0 | 0.000 MB |

```csharp
/// <summary>
/// Performs binary search and returns the index of the value or its negative binary complement.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining
#if HAS_AGGR_OPT
            | MethodImplOptions.AggressiveOptimization
#endif
)]
[SuppressMessage("ReSharper", "HeapView.BoxingAllocation")] // false warnings for (type)(object)value pattern
public static int BinarySearch<T>(ref T vecStart, int length, T value, KeyComparer<T> comparer = default)
{
#if HAS_INTRINSICS
    if (Avx2.IsSupported)
    {
        if (typeof(T) == typeof(sbyte))
            return BinarySearchAvx(ref Unsafe.As<T, sbyte>(ref vecStart), length, (sbyte) (object) value);

        if (typeof(T) == typeof(short))
            return BinarySearchAvx(ref Unsafe.As<T, short>(ref vecStart), length, (short) (object) value);

        if (typeof(T) == typeof(int))
            return BinarySearchAvx(ref Unsafe.As<T, int>(ref vecStart), length, (int) (object) value);

        if (typeof(T) == typeof(long)
            || typeof(T) == typeof(Timestamp)
            || typeof(T) == typeof(DateTime)
        )
            return BinarySearchAvx(ref Unsafe.As<T, long>(ref vecStart), length, (long) (object) value);
    }
#endif

    // This one is actually very hard to beat in the general case
    // because of memory access (cache miss) costs. In the classic
    // algorithm every memory access is useful, i.e. it halves the
    // search space. K-ary search has K-2 useless memory accesses.
    // E.g. for SIMD-ized search with K = 4 we do 4 memory accesses
    // but reduce the search space to the same size as 2 accesses
    // in the classic algorithm. SIMD doesn't speed up memory access,
    // which is the main cost for a high number of items.
    return BinarySearchClassic(ref vecStart, length, value, comparer);
}

#if HAS_INTRINSICS
[MethodImpl(MethodImplOptions.AggressiveInlining
#if HAS_AGGR_OPT
            | MethodImplOptions.AggressiveOptimization
#endif
)]
internal static int BinarySearchAvx(ref long vecStart, int length, long value)
{
    unchecked
    {
        int i;
        int c;
        int lo = 0;
        int hi = length - 1;
        var valVec = Vector256.Create(value);
        while (hi - lo > Vector256<long>.Count - 1)
        {
            i = (int) (((uint) hi + (uint) lo) >> 1) - (Vector256<long>.Count >> 1);

            var vec = Unsafe.ReadUnaligned<Vector256<long>>(ref Unsafe.As<long, byte>(ref Unsafe.Add(ref vecStart, i)));

            // AVX512 has _mm256_cmpge_epi64_mask that should allow combining the two operations
            // and avoiding the edge-case check in the `mask == 0` case below
            var gt = Avx2.CompareGreaterThan(valVec, vec); // _mm256_cmpgt_epi64
            var mask = Avx2.MoveMask(gt.AsByte());

            if (mask == 0) // val is smaller than all in vec
            {
                // but could be equal to the first element
                c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, i)));
                if (c == 0)
                {
                    lo = i;
                    goto RETURN;
                }

                hi = i - 1;
            }
            else if (mask == -1) // val is larger than all in vec
            {
                lo = i + Vector256<long>.Count;
            }
            else
            {
                var clz = BitUtil.NumberOfLeadingZeros(mask);
                var index = (32 - clz) / Unsafe.SizeOf<long>();
                lo = i + index;

                c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)));
                goto RETURN;
            }
        }

        while ((c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)))) > 0
               & ++lo <= hi) // if using branchless & then need to correct lo below
        {
        }

        // correct back non-short-circuit & evaluation
        lo -= UnsafeEx.Clt(c, 1); // (int)(c < 1)

        RETURN:
        var ceq1 = -UnsafeEx.Ceq(c, 0); // (int)(c == 0)
        return (ceq1 & lo) | (~ceq1 & ~lo);
    }
}
#endif

/// <summary>
/// Performs classic binary search and returns the index of the value or its negative binary complement.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining
#if HAS_AGGR_OPT
            | MethodImplOptions.AggressiveOptimization
#endif
)]
internal static int BinarySearchClassic<T>(ref T vecStart, int length, T value, KeyComparer<T> comparer = default)
{
    unchecked
    {
        int lo = 0;
        int hi = length - 1;
        // If length == 0, hi == -1, and the loop will not be entered
        while (lo <= hi)
        {
            // PERF: `lo` or `hi` will never be negative inside the loop,
            // so computing the median using uints is safe since we know
            // `length <= int.MaxValue`, and indices are >= 0
            // and thus cannot overflow a uint.
            // Saves one subtraction per loop compared to
            // `int i = lo + ((hi - lo) >> 1);`
            int i = (int) (((uint) hi + (uint) lo) >> 1);

            int c = comparer.Compare(value, UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, i)));
            if (c == 0)
            {
                return i;
            }

            if (c > 0)
            {
                lo = i + 1;
            }
            else
            {
                hi = i - 1;
            }
        }

        // If none found, then return a negative number that is the bitwise complement
        // of the index of the next element that is larger than the value or, if there is
        // no larger element, the bitwise complement of `length`, which
        // is `lo` at this point.
        return ~lo;
    }
}
```
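
For reference, a hypothetical usage sketch showing the return convention (the index when the value is found, the bitwise complement of the insertion point otherwise). The `VectorSearch` class name is an assumption based on the T4 template linked at the end; the actual type and namespace in Spreads may differ.

```csharp
// Hypothetical usage; assumes the methods above are exposed on a `VectorSearch`
// class and that `KeyComparer<long>` has a usable default, as in the Spreads code.
var data = new long[] { 0, 2, 4, 6, 8 };

int found = VectorSearch.BinarySearch(ref data[0], data.Length, 4L);
// found == 2: the index of the value

int missing = VectorSearch.BinarySearch(ref data[0], data.Length, 5L);
// missing == ~3: bitwise complement of the index of the first larger element,
// so the insertion point is ~missing == 3
```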

In .NET it's not trivial to cast a boolean to an int, e.g.:

```csharp
int lo = 0;
...
var c = a.CompareTo(b) == 0; // boolean
var cInt = *(int*)&c;
```

or similar manipulations.

It looks simple but is actually quite slow (when we count cycles, not milliseconds), probably due to bad codegen, local variable allocation and the like.

In some cases we could write branchless code that is faster than code that uses an if statement. E.g. in binary search we return the current index `lo` if the values are equal, or `lo`'s binary complement otherwise.

```csharp
int lo = 0;
...
var c = a.CompareTo(b);
return c == 0 ? lo : ~lo;
```

Some modern compilers convert ternary expressions to cmov, but it's tricky. Even @lemire writes this:

> The condition move instructions are pretty much standard and old at this point. Sadly, I only know how to convince one compiler (GNU GCC) to reliably produce conditional move instructions. And I have no clue how to control how Java, Swift, Rust or any other language deals with branches.

So instead of entertaining any hope that the .NET JIT could or would support it, we could avoid branches using bitwise operations.

```csharp
int lo = 0;
...
var ceq = a.CompareTo(b) == 0; // boolean
var ceqInt = -(*(int*)&ceq); // -1 if equal, 0 otherwise
return (ceqInt & lo) | (~ceqInt & ~lo);
```

However, this cast from bool to int is slow no matter how I tried to do it (pointers/unsafe).

Interestingly, the IL instruction `ceq` already returns exactly 1 or 0; it is C#-the-language that exposes the result as a boolean.

To get access to the int result we could write our own unsafe method:

```
  .method public hidebysig static int32 Ceq(int32 first, int32 second) cil managed aggressiveinlining
  {
    ldarg.0
    ldarg.1
    ceq
    ret
  }
```

and similar methods for greater/less than operations.
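
For context, this is roughly the C#-visible shape of that helper family. It is a sketch only: in Spreads.Native the bodies are raw IL `ceq`/`clt`/`cgt` instructions rather than the branchy C# shown here, and the exact set of overloads may differ.

```csharp
// Sketch of the helper surface only. The real methods are implemented in IL
// so the comparison result stays an int and never round-trips through bool.
public static class UnsafeEx
{
    public static int Ceq(int first, int second) => first == second ? 1 : 0; // IL: ceq
    public static int Clt(int first, int second) => first < second ? 1 : 0;  // IL: clt
    public static int Cgt(int first, int second) => first > second ? 1 : 0;  // IL: cgt
}
```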

Using this method as `UnsafeEx.Ceq(first: int, second: int): int`, we could rewrite the code as:

```csharp
int lo = 0;
...
var c = a.CompareTo(b);
var ceqInt = -UnsafeEx.Ceq(c, 0); // -1 if equal, 0 otherwise
return (ceqInt & lo) | (~ceqInt & ~lo);
```

This finally performs very fast, and it is faster than the branchy version when the branch is not predictable. The bitwise expression selects `lo` when `ceqInt` is -1 (all bits set) and `~lo` when it is 0.

There is another example where a branchless eager `&` comparison with a subsequent correction via `UnsafeEx.Clt` eliminates one more branch. Benchmarks are inconclusive for this usage (branchless is definitely not slower, even though it does more work).

```csharp
while ((c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)))) > 0
       & ++lo <= hi) // if using branchless & then need to correct lo below
{
}

// correct back non-short-circuit & evaluation
lo -= UnsafeEx.Clt(c, 1); // (int)(c < 1)
```
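
For comparison, the short-circuit version of the same loop needs no correction, because `++lo` is evaluated only while the comparison still says to keep moving right:

```csharp
// Branchy equivalent (short-circuit &&): ++lo is skipped when c <= 0,
// so lo never overshoots and no correction is needed afterwards.
while ((c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)))) > 0
       && ++lo <= hi)
{
}
```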

Such methods that return an int from a comparison would be a good addition to System.Runtime.CompilerServices.Unsafe, but they are also available in the Spreads.Native repo and NuGet package.

The code for Avx2- and branch-optimized binary search for sbyte, short, int and long is generated from a T4 template here: https://github.com/Spreads/Spreads/blob/master/src/Spreads.Core/Algorithms/VectorSearch.Avx.tt
