Skip to content

Instantly share code, notes, and snippets.

@Const-me
Created March 3, 2021 09:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Const-me/ef93b842cd3c13c47c0b5d0ebff4a0a8 to your computer and use it in GitHub Desktop.
Save Const-me/ef93b842cd3c13c47c0b5d0ebff4a0a8 to your computer and use it in GitHub Desktop.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
/// <summary>
/// Counts the bytes of two equally sized buffers whose absolute difference is at or above a threshold.
/// Uses ARM AdvSimd (NEON) intrinsics when the hardware supports them, and a scalar loop otherwise.
/// </summary>
static class MotionDetectNeon
{
    /// <summary>Count of bytes in a 128-bit SIMD vector.</summary>
    const int vectorSize = 16;

    /// <summary>Compute absolute difference between a and b, and count the elements with difference at or above the threshold.</summary>
    /// <param name="a">16 bytes from the first buffer.</param>
    /// <param name="b">16 bytes from the second buffer.</param>
    /// <param name="threshold">Threshold value broadcast to all 16 lanes.</param>
    /// <param name="acc">Running accumulator; every counted element decrements one of its lanes by 1.</param>
    /// <returns>The updated accumulator. The lanes hold negated counts, the caller flips the sign once at the end.</returns>
    [MethodImpl( MethodImplOptions.AggressiveInlining )]
    static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc )
    {
        // Integer absolute difference
        Vector128<byte> diff = AdvSimd.AbsoluteDifference( a, b );
        // Compare with the threshold; lanes with diff >= threshold become 0xFF, the rest become 0
        Vector128<byte> passed = AdvSimd.CompareGreaterThanOrEqual( diff, threshold );
        // Long pairwise add: expand int8 into int16, add them pairwise
        // The `passed` value is either 0xFF or 0, reinterpreting to signed number gets us -1 for true / 0 for false
        Vector128<short> sum16 = AdvSimd.AddPairwiseWidening( passed.AsSByte() );
        // Long pairwise add + accumulate: expand int16 into int32, add them pairwise, and accumulate
        return AdvSimd.AddPairwiseWideningAndAdd( acc, sum16 );
    }

    /// <summary>Scalar version of the same count, used for the tail of the buffers and when AdvSimd is unavailable.</summary>
    /// <param name="lhs">First sequence of bytes.</param>
    /// <param name="rhs">Second sequence of bytes, must be at least as long as <paramref name="lhs"/>.</param>
    /// <param name="thresholdValue">Minimum absolute difference for an element to be counted.</param>
    /// <returns>Count of positions where the absolute difference is at or above the threshold.</returns>
    static int countScalar( ReadOnlySpan<byte> lhs, ReadOnlySpan<byte> rhs, byte thresholdValue )
    {
        int count = 0;
        for( int i = 0; i < lhs.Length; i++ )
        {
            // Bytes are promoted to int before the subtraction, so the difference can't wrap around
            int diff = Math.Abs( lhs[ i ] - rhs[ i ] );
            if( diff >= thresholdValue )
                count++;
        }
        return count;
    }

    /// <summary>Count the positions where two byte arrays differ by at least the threshold.</summary>
    /// <param name="lhs">First array.</param>
    /// <param name="rhs">Second array, must have the same length as <paramref name="lhs"/>.</param>
    /// <param name="thresholdValue">Minimum absolute difference for an element to be counted.</param>
    /// <returns>Count of elements with absolute difference at or above <paramref name="thresholdValue"/>.</returns>
    /// <exception cref="ArgumentNullException">Either array is null.</exception>
    /// <exception cref="ArgumentException">The arrays have different lengths.</exception>
    public static int IntrinsicTest( byte[] lhs, byte[] rhs, byte thresholdValue = 16 )
    {
        if( lhs is null )
            throw new ArgumentNullException( nameof( lhs ) );
        if( rhs is null )
            throw new ArgumentNullException( nameof( rhs ) );
        if( lhs.Length != rhs.Length )
            throw new ArgumentException( "Both arrays must have the same length.", nameof( rhs ) );

        // Without NEON support the intrinsics throw PlatformNotSupportedException, fall back to the scalar loop
        if( !AdvSimd.IsSupported )
            return countScalar( lhs, rhs, thresholdValue );

        // Length of the prefix which consists of complete 16-byte vectors
        int alignedLength = lhs.Length - ( lhs.Length % vectorSize );

        Vector128<byte> threshold = AdvSimd.DuplicateToVector128( thresholdValue );
        Vector128<int> acc = Vector128<int>.Zero;

        // C# arrays are on the GC heap, all the stuff there is movable.
        // The unsafe code below pins the arrays only once for the duration of the complete loop.
        // Otherwise, the runtime would pin/unpin both arrays for each 16-byte slice being processed.
        unsafe
        {
            fixed( byte* p1begin = lhs )
            fixed( byte* p2begin = rhs )
            {
                byte* p1 = p1begin;
                byte* p2 = p2begin;
                byte* p1EndAligned = p1 + alignedLength;
                while( p1 < p1EndAligned )
                {
                    // Load 16-byte vectors from both pointers
                    Vector128<byte> a = AdvSimd.LoadVector128( p1 );
                    Vector128<byte> b = AdvSimd.LoadVector128( p2 );
                    p1 += vectorSize;
                    p2 += vectorSize;
                    // Apply threshold and count these elements
                    acc = countAboveThreshold( a, b, threshold, acc );
                }
            }
        }

        // Compute horizontal sum of all 4 lanes in the accumulator
        Vector64<int> acc2 = AdvSimd.Add( acc.GetLower(), acc.GetUpper() );
        acc2 = AdvSimd.AddPairwise( acc2, acc2 );
        // Comparison instructions return -1 instead of +1 for `true`, inverting sign of the result.
        // This way is slightly faster than Negate or And on each loop iteration.
        int count = -acc2.ToScalar();

        // Count the last few bytes with scalar code.
        // Padding them with zeros into a full vector is incorrect: the padding has a difference of 0,
        // which passes the `>=` test when thresholdValue is 0, inflating the result.
        if( alignedLength != lhs.Length )
            count += countScalar( lhs.AsSpan( alignedLength ), rhs.AsSpan( alignedLength ), thresholdValue );

        return count;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment