Created
March 3, 2021 09:07
-
-
Save Const-me/ef93b842cd3c13c47c0b5d0ebff4a0a8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.Arm; | |
/// <summary>
/// Counts how many byte positions of two equally sized buffers differ by at least a threshold,
/// using ARM AdvSimd (NEON) intrinsics with a scalar fallback for other hardware.
/// </summary>
static class MotionDetectNeon
{
    /// <summary>Number of bytes in one 128-bit vector.</summary>
    const int VectorBytes = 16;

    /// <summary>
    /// Computes the absolute difference between <paramref name="a"/> and <paramref name="b"/> and accumulates,
    /// with a NEGATIVE sign, the number of lanes whose difference is greater than or equal to the threshold.
    /// </summary>
    /// <param name="a">16 bytes from the first buffer.</param>
    /// <param name="b">16 bytes from the second buffer.</param>
    /// <param name="threshold">The threshold value broadcast to all 16 lanes.</param>
    /// <param name="acc">Running accumulator; each passing lane subtracts 1 from one of its four int lanes.</param>
    /// <returns>The updated accumulator.</returns>
    [MethodImpl( MethodImplOptions.AggressiveInlining )]
    static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc )
    {
        // Unsigned integer absolute difference, per byte
        Vector128<byte> diff = AdvSimd.AbsoluteDifference( a, b );
        // Compare with the threshold; lanes become 0xFF when diff >= threshold, 0 otherwise
        Vector128<byte> passed = AdvSimd.CompareGreaterThanOrEqual( diff, threshold );
        // Long pairwise add: expand int8 into int16, add them pairwise.
        // Reinterpreting 0xFF as a signed byte gives -1 for true / 0 for false.
        Vector128<short> sum16 = AdvSimd.AddPairwiseWidening( passed.AsSByte() );
        // Long pairwise add + accumulate: expand int16 into int32, add them pairwise, and accumulate
        return AdvSimd.AddPairwiseWideningAndAdd( acc, sum16 );
    }

    /// <summary>Scalar equivalent of the vector code, used when AdvSimd is not available on the current hardware.</summary>
    static int countScalar( byte[] lhs, byte[] rhs, byte thresholdValue )
    {
        int count = 0;
        for( int i = 0; i < lhs.Length; i++ )
        {
            // Bytes are promoted to int before the subtraction, so the difference cannot wrap around
            if( Math.Abs( lhs[ i ] - rhs[ i ] ) >= thresholdValue )
                count++;
        }
        return count;
    }

    /// <summary>
    /// Counts the positions where the absolute difference between <paramref name="lhs"/> and
    /// <paramref name="rhs"/> is greater than or equal to <paramref name="thresholdValue"/>.
    /// </summary>
    /// <param name="lhs">First buffer.</param>
    /// <param name="rhs">Second buffer; must have the same length as <paramref name="lhs"/>.</param>
    /// <param name="thresholdValue">Minimum absolute difference for a position to be counted.</param>
    /// <returns>The number of counted positions.</returns>
    /// <exception cref="ArgumentNullException">Either array is null.</exception>
    /// <exception cref="ArgumentException">The arrays have different lengths.</exception>
    public static int IntrinsicTest( byte[] lhs, byte[] rhs, byte thresholdValue = 16 )
    {
        if( lhs == null )
            throw new ArgumentNullException( nameof( lhs ) );
        if( rhs == null )
            throw new ArgumentNullException( nameof( rhs ) );
        if( lhs.Length != rhs.Length )
            throw new ArgumentException( "Both arrays must have the same length.", nameof( rhs ) );

        // An absolute difference is never negative, so a zero threshold accepts every element.
        // Answering here is also required for correctness: the tail handling below pads both vectors
        // with zeros, and with a zero threshold those padding lanes would pass the >= test and be counted.
        if( thresholdValue == 0 )
            return lhs.Length;

        // The intrinsics throw PlatformNotSupportedException on hardware without AdvSimd
        if( !AdvSimd.IsSupported )
            return countScalar( lhs, rhs, thresholdValue );

        Vector128<byte> threshold = AdvSimd.DuplicateToVector128( thresholdValue );
        Vector128<int> acc = Vector128<int>.Zero;

        // C# arrays are on the GC heap, all the stuff there is movable.
        // The unsafe code below pins the arrays only once for the duration of the complete function.
        unsafe
        {
            fixed( byte* p1begin = lhs )
            fixed( byte* p2begin = rhs )
            {
                byte* p1 = p1begin;
                byte* p2 = p2begin;
                byte* p1EndAligned = p1 + ( lhs.Length / VectorBytes ) * VectorBytes;
                while( p1 < p1EndAligned )
                {
                    // Load 16-byte vectors from both pointers
                    Vector128<byte> a = AdvSimd.LoadVector128( p1 );
                    Vector128<byte> b = AdvSimd.LoadVector128( p2 );
                    p1 += VectorBytes;
                    p2 += VectorBytes;
                    // Apply threshold and count these elements
                    acc = countAboveThreshold( a, b, threshold, acc );
                }

                int remainder = lhs.Length % VectorBytes;
                if( remainder != 0 )
                {
                    // Need to deal with the remainder; allocate room for two vectors on the stack
                    byte* localBuffer = stackalloc byte[ VectorBytes * 2 ];
                    // Copy the last few payload bytes into the local buffer
                    for( int i = 0; i < remainder; i++ )
                    {
                        localBuffer[ i ] = p1[ i ];
                        localBuffer[ i + VectorBytes ] = p2[ i ];
                    }
                    // The content of newly allocated stack memory is undefined, so zero the padding explicitly.
                    // Equal padding gives a difference of 0, which is below any non-zero threshold.
                    for( int i = remainder; i < VectorBytes; i++ )
                    {
                        localBuffer[ i ] = 0;
                        localBuffer[ i + VectorBytes ] = 0;
                    }
                    // Load vectors from the local buffer
                    Vector128<byte> a = AdvSimd.LoadVector128( localBuffer );
                    Vector128<byte> b = AdvSimd.LoadVector128( localBuffer + VectorBytes );
                    // Apply threshold and count these elements
                    acc = countAboveThreshold( a, b, threshold, acc );
                }
            }
        }

        // Compute horizontal sum of all 4 lanes in the accumulator
        Vector64<int> acc2 = AdvSimd.Add( acc.GetLower(), acc.GetUpper() );
        acc2 = AdvSimd.AddPairwise( acc2, acc2 );
        // Comparison instructions return -1 instead of +1 for `true`, inverting sign of the result.
        // Negating once here is cheaper than Negate or And on each loop iteration.
        return -acc2.ToScalar();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment