Const-me/SimdUtilsPublic.dolby.cpp

## SimdUtilsPublic.dolby.cpp
// Raw bit pattern of the 32-bit IEEE-754 float 384.0f: sign 0, exponent 2^8, significand 1.5.
// Converter used to obtain it: https://www.h-schmidt.net/FloatConverter/IEEE754.html
// 384.0 was the bias value passed to a52_frame in C# code.
// Floats in [256, 512) are spaced 2^-15 apart, so ( raw bits of x ) - zeroLevel equals ( x - 384.0 ) * 32768 for any x in that range.
// NOTE(review): presumably the biased samples stay within about +-1.0 of 384.0, which makes that difference a 16-bit PCM value - confirm against the decoder.
constexpr int zeroLevel = 0x43c00000;

#ifdef _MSC_VER
// The SSE code path has a single integer register type, __m128i.
// Give it the same names the NEON branch uses for its vector types, so the code shared by both branches can spell the types one way.
using int16x8_t = __m128i;	// viewed as 8 lanes of int16
using int32x4_t = __m128i;	// viewed as 4 lanes of int32

// Fetch the 16 bytes holding src[0..3] as four 32-bit integer lanes.
// The float bit patterns are taken as-is, no numeric conversion happens; the load has no alignment requirement.
__forceinline int32x4_t load4( const float* src )
{
	const __m128i* const source = ( const __m128i* )src;
	return _mm_loadu_si128( source );
}
// Produce a vector with zeroLevel broadcast into each of the 4 int32 lanes.
__forceinline int32x4_t zeroVector()
{
	const int32x4_t bias = _mm_set1_epi32( zeroLevel );
	return bias;
}
// Write the 8 int16 lanes of `v` to dest[0..7]; the store has no alignment requirement.
__forceinline void store8( int16_t* dest, int16x8_t v )
{
	__m128i* const target = ( __m128i* )dest;
	_mm_storeu_si128( target, v );
}
#else
// Fetch src[0..3] as four 32-bit integer lanes.
// The float bit patterns are taken as-is, no numeric conversion happens.
__forceinline int32x4_t load4( const float* src )
{
	// vld1q_s32 takes `const int32_t*`. Cast to exactly that type instead of `const int*`:
	// the two are different types on toolchains where int32_t is `long`, and the call would not compile there.
	return vld1q_s32( (const int32_t*)(const void*)src );
}
// Produce a vector with zeroLevel duplicated into each of the 4 signed 32-bit lanes.
__forceinline int32x4_t zeroVector()
{
	const int32x4_t bias = vdupq_n_s32( zeroLevel );
	return bias;
}
// Write the 8 int16 lanes of `v` to dest[0..7].
__forceinline void store8( int16_t* dest, int16x8_t v )
{
	int16_t* const target = dest;
	vst1q_s16( target, v );
}
#endif

// Turn the 8 floats at src[0..7] into 8 signed 16-bit integers.
// Each float's raw bit pattern is treated as an int32, `zero` is subtracted from it with integer arithmetic,
// and the difference is narrowed to int16 with signed saturation. Lane order follows memory order.
// `zero` is presumably the value returned by zeroVector() - confirm against the callers.
__forceinline int16x8_t convert8( const float* src, const int32x4_t zero )
{
	// Raw bit patterns of src[0..3] and src[4..7]
	const int32x4_t firstBits = load4( src );
	const int32x4_t secondBits = load4( src + 4 );

#ifdef _MSC_VER
	const __m128i first = _mm_sub_epi32( firstBits, zero );
	const __m128i second = _mm_sub_epi32( secondBits, zero );
	// Pack both halves into 8 int16 lanes with signed saturation; `first` lands in the lower 4 lanes
	return _mm_packs_epi32( first, second );
#else
	// vqmovn_s32 is "signed saturating extract narrow": every int32 lane is saturated to the int16 range,
	// giving a 4-lane half-width vector. The two halves are then joined, `first` in the lower 4 lanes.
	const int16x4_t first = vqmovn_s32( vsubq_s32( firstBits, zero ) );
	const int16x4_t second = vqmovn_s32( vsubq_s32( secondBits, zero ) );
	return vcombine_s16( first, second );
#endif
}
	// Raw bit pattern of the 32-bit IEEE-754 float 384.0f: sign 0, exponent 2^8, significand 1.5.
	// Converter used to obtain it: https://www.h-schmidt.net/FloatConverter/IEEE754.html
	// 384.0 was the bias value passed to a52_frame in C# code.
	// Floats in [256, 512) are spaced 2^-15 apart, so ( raw bits of x ) - zeroLevel equals ( x - 384.0 ) * 32768 for any x in that range.
	// NOTE(review): presumably the biased samples stay within about +-1.0 of 384.0, which makes that difference a 16-bit PCM value - confirm against the decoder.
	constexpr int zeroLevel = 0x43c00000;

	#ifdef _MSC_VER
	// The SSE code path has a single integer register type, __m128i.
	// Give it the same names the NEON branch uses for its vector types, so the code shared by both branches can spell the types one way.
	using int16x8_t = __m128i;	// viewed as 8 lanes of int16
	using int32x4_t = __m128i;	// viewed as 4 lanes of int32

	// Fetch the 16 bytes holding src[0..3] as four 32-bit integer lanes.
	// The float bit patterns are taken as-is, no numeric conversion happens; the load has no alignment requirement.
	__forceinline int32x4_t load4( const float* src )
	{
		const __m128i* const source = ( const __m128i* )src;
		return _mm_loadu_si128( source );
	}
	// Produce a vector with zeroLevel broadcast into each of the 4 int32 lanes.
	__forceinline int32x4_t zeroVector()
	{
		const int32x4_t bias = _mm_set1_epi32( zeroLevel );
		return bias;
	}
	// Write the 8 int16 lanes of `v` to dest[0..7]; the store has no alignment requirement.
	__forceinline void store8( int16_t* dest, int16x8_t v )
	{
		__m128i* const target = ( __m128i* )dest;
		_mm_storeu_si128( target, v );
	}
	#else
	// Fetch src[0..3] as four 32-bit integer lanes.
	// The float bit patterns are taken as-is, no numeric conversion happens.
	__forceinline int32x4_t load4( const float* src )
	{
		// Both casts must be pointer casts: without the `*` the expression casts a pointer to `const void`
		// and then to `const int`, which is ill-formed and is not the pointer vld1q_s32 needs.
		// vld1q_s32 takes `const int32_t*`, so cast to exactly that type.
		return vld1q_s32( (const int32_t*)(const void*)src );
	}
	// Produce a vector with zeroLevel duplicated into each of the 4 signed 32-bit lanes.
	__forceinline int32x4_t zeroVector()
	{
		const int32x4_t bias = vdupq_n_s32( zeroLevel );
		return bias;
	}
	// Write the 8 int16 lanes of `v` to dest[0..7].
	__forceinline void store8( int16_t* dest, int16x8_t v )
	{
		int16_t* const target = dest;
		vst1q_s16( target, v );
	}
	#endif

	// Turn the 8 floats at src[0..7] into 8 signed 16-bit integers.
	// Each float's raw bit pattern is treated as an int32, `zero` is subtracted from it with integer arithmetic,
	// and the difference is narrowed to int16 with signed saturation. Lane order follows memory order.
	// `zero` is presumably the value returned by zeroVector() - confirm against the callers.
	__forceinline int16x8_t convert8( const float* src, const int32x4_t zero )
	{
		// Raw bit patterns of src[0..3] and src[4..7]
		const int32x4_t firstBits = load4( src );
		const int32x4_t secondBits = load4( src + 4 );

	#ifdef _MSC_VER
		const __m128i first = _mm_sub_epi32( firstBits, zero );
		const __m128i second = _mm_sub_epi32( secondBits, zero );
		// Pack both halves into 8 int16 lanes with signed saturation; `first` lands in the lower 4 lanes
		return _mm_packs_epi32( first, second );
	#else
		// vqmovn_s32 is "signed saturating extract narrow": every int32 lane is saturated to the int16 range,
		// giving a 4-lane half-width vector. The two halves are then joined, `first` in the lower 4 lanes.
		const int16x4_t first = vqmovn_s32( vsubq_s32( firstBits, zero ) );
		const int16x4_t second = vqmovn_s32( vsubq_s32( secondBits, zero ) );
		return vcombine_s16( first, second );
	#endif
	}