Skip to content

Instantly share code, notes, and snippets.

@Const-me
Created December 8, 2019 08:47
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Const-me/a6d36f70a3a77de00c61cf4f6c17c7ac to your computer and use it in GitHub Desktop.
#include <stdint.h>
#include <cmath>
#include <atlfile.h>
#include <intrin.h>
#include <array>
// These headers are from there: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics
#include "Intrinsics/avx.hpp"
#include "Intrinsics/avx2.hpp"
#include "Intrinsics/sse.hpp"
#include "Intrinsics/sse2.hpp"
using namespace Intrinsics::Avx;
// 1GB, in bytes
constexpr uint64_t gb = ( 1 << 30 );
// Total count of different 32-bit floats, 2^32: the data files hold one entry per bit pattern
constexpr uint64_t totalFloats = 4 * gb;
// Count of __m256 lanes in the file
constexpr int64_t totalVectors = totalFloats / 8;
// Output path for produceValues(); switch between local and UNC path depending on which machine runs it
// LPCTSTR destFileName = LR"(C:\Temp\rcpss-amd.bin)";
LPCTSTR destFileName = LR"(\\CONST-PC\Temp\rcpss-intel.bin)";
// Input paths for graphValues() / compareValues(), one 16GB file per CPU vendor
LPCTSTR readAmd = LR"(C:\Temp\rcpss-amd.bin)";
LPCTSTR readIntel = LR"(D:\Temp\rcpss-intel.bin)";
// ==== I/O Boilerplate ====
// Evaluate an HRESULT-returning expression; on failure, print the expression's text and the
// error code, and return that HRESULT from the enclosing function.
// Only usable inside functions returning HRESULT (or a compatible integer type).
// Note: the local was renamed from `__hr` — identifiers containing a double underscore
// are reserved to the implementation in C++ ([lex.name]).
#define CHECK( hr ) { const HRESULT hr_ = (hr); if( FAILED( hr_ ) ) { printf( "%s failed: 0x%08x\n", #hr, hr_ ); return hr_; } }
// RAII wrapper over a 16GB memory-mapped file that stores one float per possible 32-bit float bit pattern.
class MappedFile
{
CAtlFile file;
CAtlFileMapping<__m256> map;
public:
// Open or create the file at `path` and map it into the address space.
// write = true: (re)create the file, resize it to 16GB, map it read/write.
// write = false: open an existing file, map it read-only.
// Returns S_OK on success, otherwise the failing HRESULT (already printed by CHECK).
HRESULT create( LPCTSTR path, bool write )
{
const DWORD acc = write ? GENERIC_READ | GENERIC_WRITE : GENERIC_READ;
const DWORD mode = write ? CREATE_ALWAYS : OPEN_EXISTING;
CHECK( file.Create( path, acc, 0, mode ) );
if( write )
{
// 4 bytes per float * 2^32 floats = 16GB
CHECK( file.SetSize( 4 * totalFloats ) );
CHECK( map.MapFile( file, 0, 0, PAGE_READWRITE, FILE_MAP_ALL_ACCESS ) );
}
else
CHECK( map.MapFile( file ) );
return S_OK;
}
// Pointer to the mapped data
operator __m256* const( ) { return map; }
// Use mapped table to read a single float: the bit pattern of the argument is the table index.
float operator[] ( float x ) const
{
// Reinterpret the float's bits as a 32-bit index; CopyMemory avoids strict-aliasing / alignment issues
uint32_t idx;
CopyMemory( &idx, &x, 4 );
const float* ptr = (const float*)( map.operator __m256 *( ) );
return ptr[ idx ];
}
};
// __m256 elements per block. Each element has 8 floats.
// 16M vectors * 32 bytes = 512MB per block; totalVectors / blockSize = 32 blocks total.
constexpr int64_t blockSize = 16 * 1024 * 1024;
// ==== Producing these 16GB binary files ====
// Fill one block of the output with rcp_ps() results.
// The i-th float written receives rcp_ps() of the float whose bit pattern is firstValue + i.
inline void computeBlock( __m256* pointer, uint32_t firstValue )
{
	using namespace Intrinsics::Avx::Int32;
	// Bit patterns of the 8 source floats in the current vector, [ firstValue .. firstValue + 7 ]
	__m256i bitPatterns = set1_epu32( firstValue ) + set_epu32( 0, 1, 2, 3, 4, 5, 6, 7 );
	const __m256i step = set1_epu32( 8 );
	for( __m256* const endPointer = pointer + blockSize; pointer < endPointer; pointer++, bitPatterns += step )
	{
		// Reinterpret the integers as floats, approximate the reciprocals, store the vector
		const __m256 approx = rcp_ps( castall_ps( bitPatterns ) );
		storeu_ps( (float*)pointer, approx );
	}
}
// Create the 16GB output file at destPath and fill it with rcp_ps() of every possible float.
// Returns 0 on success, 1 on I/O failure.
int produceValues( LPCTSTR destPath )
{
	// Create and map the 16GB destination file
	MappedFile output;
	const HRESULT hr = output.create( destPath, true );
	if( FAILED( hr ) )
		return 1;
	// Blocks are independent 512MB slices of the mapping, so the loop parallelizes trivially
#pragma omp parallel for
	for( int64_t block = 0; block < totalVectors; block += blockSize )
		computeBlock( output + block, (uint32_t)( block * 8 ) );
	return 0;
}
// ==== Producing .tsv for Excel ====
int graphValues()
{
// Open both files
MappedFile amd, intel;
if( FAILED( amd.create( readAmd, false ) ) || FAILED( intel.create( readIntel, false ) ) )
return 1;
// Compare the content
constexpr int valuesPerInt = 64;
constexpr double mul = 1.0 / valuesPerInt;
printf( "x\tamd\tintel\n" );
for( int i = 1; i <= 10 * 64; i++ )
{
const double x = mul * i;
const double precise = 1.0 / x;
const float f = (float)x;
const double approxAmd = amd[ f ];
const double approxIntel = intel[ f ];
printf( "%f\t%f\t%f\n", x, abs( approxAmd - precise ), abs( approxIntel - precise ) );
}
return 0;
}
// ==== Summarizing errors over the complete range of floats ====
// Accumulates maximum and average relative error over a stream of 8-lane float vectors.
// Despite the name, it tracks maximum + running total (for the average), not minimum.
class MinMaxError
{
// Per-lane running maximum of the error values; reduced horizontally in getMaxError()
__m256 maxError;
// Per-lane running sum of the error values, kept in double precision to limit rounding drift
__m256d totalError;
// Count of individual float values accumulated so far
size_t valuesCount;
// Reduce the 8 lanes of maxError to a single scalar maximum
double getMaxError() const
{
using namespace Intrinsics::Sse;
// Compute horizontal maximum of 8 values in maxError: fold 8 -> 4 -> 2 -> 1 lanes
__m128 result = castps256_ps128( maxError );
result = max_ps( result, extractf128_ps<1>( maxError ) );
result = max_ps( result, movehl_ps( result, result ) );
result = max_ss( result, shuffle_ps<1, 1, 1, 1>( result, result ) );
return cvtss_f32( result );
}
// Reduce the 4 lanes of totalError to a single sum, then divide by the count of values
double getAvgError() const
{
using namespace Intrinsics::Sse;
// Compute horizontal sum of 4 values in totalError: fold 4 -> 2 -> 1 lanes
__m128d result = castpd256_pd128( totalError );
result += extractf128_pd<1>( totalError );
result += shuffle_pd<1, 1>( result, result );
// Return the average
return cvtsd_f64( result ) / valuesCount;
}
public:
MinMaxError()
{
maxError = setzero_ps();
totalError = setzero_pd();
valuesCount = 0;
}
// Accumulate a vector where all 8 lanes hold valid error values
__forceinline void updateAll( __m256 v )
{
// Accumulate max. error
maxError = max_ps( maxError, v );
// Accumulate total error, widening the floats to doubles one 4-lane half at a time
totalError += cvtps_pd( castps256_ps128( v ) );
totalError += cvtps_pd( extractf128_ps<1>( v ) );
// Increment values count
valuesCount += 8;
}
// Accumulate only the lanes selected by the mask.
// maskScalar is movemask_ps() of maskVec; both describe the same set of lanes.
__forceinline void updateSome( __m256 v, int maskScalar, __m256 maskVec )
{
if( 0 == maskScalar )
return;
// Accumulate max. error only in the selected lanes
maxError = blendv_ps( maxError, max_ps( maxError, v ), maskVec );
// Accumulate total error. The float mask survives cvtps_pd because an all-ones lane is a
// negative NaN whose sign bit is preserved by the conversion, and blendv selects on the
// sign bit — NOTE(review): relies on blendv testing only the sign bit; confirm for this wrapper.
__m256d dbl, dblMask;
dbl = cvtps_pd( castps256_ps128( v ) );
dblMask = cvtps_pd( castps256_ps128( maskVec ) );
totalError = blendv_pd( totalError, totalError + dbl, dblMask );
dbl = cvtps_pd( extractf128_ps<1>( v ) );
dblMask = cvtps_pd( extractf128_ps<1>( maskVec ) );
totalError = blendv_pd( totalError, totalError + dbl, dblMask );
// Increment values count by the count of selected lanes
valuesCount += __popcnt( (unsigned int)maskScalar );
}
// Merge statistics accumulated by another (e.g. per-thread) instance
void mergeFrom( const MinMaxError& that )
{
maxError = max_ps( maxError, that.maxError );
totalError += that.totalError;
valuesCount += that.valuesCount;
}
// Print a one-line summary of the accumulated statistics
void print( const char* what ) const
{
printf( "%s: max rel. error %f, avg. rel. error %f, %zu values\n",
what, getMaxError(), getAvgError(), valuesCount );
}
};
// Build a per-lane mask: all-ones where x is a finite, non-zero float; zero elsewhere.
__forceinline __m256 finiteValues( __m256 x )
{
using namespace Intrinsics::Avx::Int32;
__m256i i = castps_all( x );
const __m256i expBits = set1_epu32( 0x7F800000 );
i &= expBits; // expBits if INF or NAN, something else otherwise
i = ( i == expBits ); // 0xFFFFFFFF if INF or NAN, 0 otherwise
i = ~i; // 0 if INF or NAN, 0xFFFFFFFF if it's finite
// Also require x != 0. NaN lanes are already zeroed by the integer mask above,
// so the compare's behavior on unordered inputs can't affect the result.
return castall_ps( i ) & cmpneq_ps( x, setzero_ps() );
}
// Test for exact bitwise equality of two vectors
__forceinline bool isEqual( __m256 a, __m256 b )
{
	// XOR is zero in every lane iff all 256 bits match; testz checks the whole register for zero
	const __m256 diffBits = xor_ps( a, b );
	const __m256i asInt = castps_all( diffBits );
	return testz_all( asInt, asInt );
}
// Absolute value of all 8 lanes
__forceinline __m256 abs_ps( __m256 v )
{
	// -0.0f has only the sign bit set; andnot computes ( ~signBits ) & v, clearing the sign of every lane
	const __m256 signBits = set1_ps( -0.0f );
	return andnot_ps( signBits, v );
}
// Relative error of the approximation: | ( approx - precise ) / precise |, per lane
__forceinline __m256 relativeError( __m256 precise, __m256 approx )
{
	return abs_ps( ( approx - precise ) / precise );
}
// Accumulates error statistics for both CPU vendors in a single pass over the data.
class ErrorSummary
{
MinMaxError m_amd, m_intel;
public:
// Accumulate one vector of 8 samples.
// amd / intel are the vendors' stored approximations; sourceValues are the inputs they were computed from.
__forceinline void update( __m256 amd, __m256 intel, __m256 sourceValues )
{
// Only measure lanes where the source is a non-zero finite float
__m256 goodValuesMask = cmpneq_ps( sourceValues, setzero_ps() );
goodValuesMask &= finiteValues( sourceValues );
const __m256 amdFinite = finiteValues( amd );
// Just in case, ensure the Intel/AMD have same finite values
if( !isEqual( amdFinite, finiteValues( intel ) ) )
__debugbreak();
goodValuesMask &= amdFinite;
// Exact (full-precision) reciprocal to measure the approximations against
const __m256 precise = set1_ps( 1 ) / sourceValues;
// Skip lanes where even the exact reciprocal is not a non-zero finite number
goodValuesMask &= finiteValues( precise );
const int goodValuesScalar = movemask_ps( goodValuesMask );
if( 0 == goodValuesScalar )
return;
amd = relativeError( precise, amd );
intel = relativeError( precise, intel );
if( 0xFF == goodValuesScalar )
{
// All 8 lanes are good, use the cheaper unmasked accumulation
m_amd.updateAll( amd );
m_intel.updateAll( intel );
}
else
{
m_amd.updateSome( amd, goodValuesScalar, goodValuesMask );
m_intel.updateSome( intel, goodValuesScalar, goodValuesMask );
}
}
// Merge statistics accumulated by another (e.g. per-thread) instance
void mergeFrom( const ErrorSummary& that )
{
m_amd.mergeFrom( that.m_amd );
m_intel.mergeFrom( that.m_intel );
}
// Print the summaries for both vendors
void print() const
{
m_amd.print( "AMD" );
m_intel.print( "Intel" );
}
};
inline void compareBlock( const __m256* ptrAmd, const __m256* ptrIntel, uint32_t firstValue, ErrorSummary& es )
{
const __m256i offsets = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
__m256i vec = _mm256_add_epi32( _mm256_set1_epi32( (int)firstValue ), offsets );
const __m256* const endPointer = ptrAmd + blockSize;
while( ptrAmd < endPointer )
{
// Compute the result
const __m256 floats = _mm256_castsi256_ps( vec );
const __m256 amd = _mm256_loadu_ps( (const float*)ptrAmd );
const __m256 intel = _mm256_loadu_ps( (const float*)ptrIntel );
es.update( amd, intel, floats );
// Advance pointer + values
ptrAmd++;
ptrIntel++;
vec = _mm256_add_epi32( vec, _mm256_set1_epi32( 8 ) );
}
}
int compareValues()
{
// Open both files
MappedFile amd, intel;
if( FAILED( amd.create( readAmd, false ) ) || FAILED( intel.create( readIntel, false ) ) )
return 1;
ErrorSummary summary;
CComAutoCriticalSection cs;
using CsLock = CComCritSecLock<CComAutoCriticalSection>;
#pragma omp parallel for
for( int64_t i = 0; i < totalVectors; i += blockSize )
{
ErrorSummary localSummary;
compareBlock( amd + i, intel + i, (uint32_t)( i * 8 ), localSummary );
{
CsLock __lock{ cs };
summary.mergeFrom( localSummary );
}
}
summary.print();
return 0;
}
// Entry point. Uncomment exactly one step of the workflow:
// step 1 — produceValues() on each vendor's CPU; step 2 — graphValues() or compareValues() on both files.
int main()
{
// return produceValues( destFileName );
// return graphValues();
return compareValues();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment