Created
December 8, 2019 08:47
-
-
Save Const-me/a6d36f70a3a77de00c61cf4f6c17c7ac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <atlfile.h> | |
#include <intrin.h> | |
#include <array> | |
// These headers are from there: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics | |
#include "Intrinsics/avx.hpp" | |
#include "Intrinsics/avx2.hpp" | |
#include "Intrinsics/sse.hpp" | |
#include "Intrinsics/sse2.hpp" | |
using namespace Intrinsics::Avx; | |
constexpr uint64_t gb = ( 1 << 30 ); | |
// Total count of different 32-bit floats | |
constexpr uint64_t totalFloats = 4 * gb; | |
// Count of __m256 lanes in the file | |
constexpr int64_t totalVectors = totalFloats / 8; | |
// LPCTSTR destFileName = LR"(C:\Temp\rcpss-amd.bin)"; | |
LPCTSTR destFileName = LR"(\\CONST-PC\Temp\rcpss-intel.bin)"; | |
LPCTSTR readAmd = LR"(C:\Temp\rcpss-amd.bin)"; | |
LPCTSTR readIntel = LR"(D:\Temp\rcpss-intel.bin)"; | |
// ==== I/O Boilerplate ==== | |
#define CHECK( hr ) { const HRESULT __hr = (hr); if( FAILED( __hr ) ) { printf( "%s failed: 0x%08x\n", #hr, __hr ); return __hr; } } | |
class MappedFile | |
{ | |
CAtlFile file; | |
CAtlFileMapping<__m256> map; | |
public: | |
HRESULT create( LPCTSTR path, bool write ) | |
{ | |
const DWORD acc = write ? GENERIC_READ | GENERIC_WRITE : GENERIC_READ; | |
const DWORD mode = write ? CREATE_ALWAYS : OPEN_EXISTING; | |
CHECK( file.Create( path, acc, 0, mode ) ); | |
if( write ) | |
{ | |
CHECK( file.SetSize( 4 * totalFloats ) ); | |
CHECK( map.MapFile( file, 0, 0, PAGE_READWRITE, FILE_MAP_ALL_ACCESS ) ); | |
} | |
else | |
CHECK( map.MapFile( file ) ); | |
return S_OK; | |
} | |
// Pointer to the mapped data | |
operator __m256* const( ) { return map; } | |
// Use mapped table to read a single float | |
float operator[] ( float x ) const | |
{ | |
uint32_t idx; | |
CopyMemory( &idx, &x, 4 ); | |
const float* ptr = (const float*)( map.operator __m256 *( ) ); | |
return ptr[ idx ]; | |
} | |
}; | |
// __m256 elements per block. Each element has 8 floats. | |
constexpr int64_t blockSize = 16 * 1024 * 1024; | |
// ==== Producing these 16GB binary files ==== | |
inline void computeBlock( __m256* pointer, uint32_t firstValue ) | |
{ | |
using namespace Intrinsics::Avx::Int32; | |
const __m256i offsets = set_epu32( 0, 1, 2, 3, 4, 5, 6, 7 ); | |
__m256i vec = set1_epu32( firstValue ) + offsets; | |
__m256* const endPointer = pointer + blockSize; | |
while( pointer < endPointer ) | |
{ | |
// Compute the result | |
__m256 floats = castall_ps( vec ); | |
floats = rcp_ps( floats ); | |
storeu_ps( (float*)pointer, floats ); | |
// Advance pointer + values | |
pointer++; | |
vec += set1_epu32( 8 ); | |
} | |
} | |
int produceValues( LPCTSTR destPath ) | |
{ | |
// Create 16GB memory-mapped file | |
MappedFile file; | |
if( FAILED( file.create( destPath, true ) ) ) | |
return 1; | |
// Write the values | |
#pragma omp parallel for | |
for( int64_t i = 0; i < totalVectors; i += blockSize ) | |
computeBlock( file + i, (uint32_t)( i * 8 ) ); | |
return 0; | |
} | |
// ==== Producing .tsv for Excel ==== | |
int graphValues() | |
{ | |
// Open both files | |
MappedFile amd, intel; | |
if( FAILED( amd.create( readAmd, false ) ) || FAILED( intel.create( readIntel, false ) ) ) | |
return 1; | |
// Compare the content | |
constexpr int valuesPerInt = 64; | |
constexpr double mul = 1.0 / valuesPerInt; | |
printf( "x\tamd\tintel\n" ); | |
for( int i = 1; i <= 10 * 64; i++ ) | |
{ | |
const double x = mul * i; | |
const double precise = 1.0 / x; | |
const float f = (float)x; | |
const double approxAmd = amd[ f ]; | |
const double approxIntel = intel[ f ]; | |
printf( "%f\t%f\t%f\n", x, abs( approxAmd - precise ), abs( approxIntel - precise ) ); | |
} | |
return 0; | |
} | |
// ==== Summarizing errors over the complete range of floats ==== | |
class MinMaxError | |
{ | |
__m256 maxError; | |
__m256d totalError; | |
size_t valuesCount; | |
double getMaxError() const | |
{ | |
using namespace Intrinsics::Sse; | |
// Compute horizontal maximum of 8 values in maxAbsError | |
__m128 result = castps256_ps128( maxError ); | |
result = max_ps( result, extractf128_ps<1>( maxError ) ); | |
result = max_ps( result, movehl_ps( result, result ) ); | |
result = max_ss( result, shuffle_ps<1, 1, 1, 1>( result, result ) ); | |
return cvtss_f32( result ); | |
} | |
double getAvgError() const | |
{ | |
using namespace Intrinsics::Sse; | |
// Compute horizontal sum of 4 values in totalAbsError | |
__m128d result = castpd256_pd128( totalError ); | |
result += extractf128_pd<1>( totalError ); | |
result += shuffle_pd<1, 1>( result, result ); | |
// Return the average | |
return cvtsd_f64( result ) / valuesCount; | |
} | |
public: | |
MinMaxError() | |
{ | |
maxError = setzero_ps(); | |
totalError = setzero_pd(); | |
valuesCount = 0; | |
} | |
__forceinline void updateAll( __m256 v ) | |
{ | |
// Accumulate max. error | |
maxError = max_ps( maxError, v ); | |
// Accumulate total error | |
totalError += cvtps_pd( castps256_ps128( v ) ); | |
totalError += cvtps_pd( extractf128_ps<1>( v ) ); | |
// Increment values count | |
valuesCount += 8; | |
} | |
__forceinline void updateSome( __m256 v, int maskScalar, __m256 maskVec ) | |
{ | |
if( 0 == maskScalar ) | |
return; | |
// Accumulate max. error | |
maxError = blendv_ps( maxError, max_ps( maxError, v ), maskVec ); | |
// Accumulate total error | |
__m256d dbl, dblMask; | |
dbl = cvtps_pd( castps256_ps128( v ) ); | |
dblMask = cvtps_pd( castps256_ps128( maskVec ) ); | |
totalError = blendv_pd( totalError, totalError + dbl, dblMask ); | |
dbl = cvtps_pd( extractf128_ps<1>( v ) ); | |
dblMask = cvtps_pd( extractf128_ps<1>( maskVec ) ); | |
totalError = blendv_pd( totalError, totalError + dbl, dblMask ); | |
// Increment values count | |
valuesCount += __popcnt( (unsigned int)maskScalar ); | |
} | |
void mergeFrom( const MinMaxError& that ) | |
{ | |
maxError = max_ps( maxError, that.maxError ); | |
totalError += that.totalError; | |
valuesCount += that.valuesCount; | |
} | |
void print( const char* what ) const | |
{ | |
printf( "%s: max rel. error %f, avg. rel. error %f, %zu values\n", | |
what, getMaxError(), getAvgError(), valuesCount ); | |
} | |
}; | |
__forceinline __m256 finiteValues( __m256 x ) | |
{ | |
using namespace Intrinsics::Avx::Int32; | |
__m256i i = castps_all( x ); | |
const __m256i expBits = set1_epu32( 0x7F800000 ); | |
i &= expBits; // expBits if INF or NAN, something else otherwise | |
i = ( i == expBits ); // 0xFFFFFFFF if INF or NAN, 0 otherwise | |
i = ~i; // 0 if INF or NAN, 0xFFFFFFFF if it's finite | |
return castall_ps( i ) & cmpneq_ps( x, setzero_ps() ); | |
} | |
// Test for exact bitwise equality | |
__forceinline bool isEqual( __m256 a, __m256 b ) | |
{ | |
const __m256i x = castps_all( xor_ps( a, b ) ); | |
return testz_all( x, x ); | |
} | |
// Absolute value | |
__forceinline __m256 abs_ps( __m256 v ) | |
{ | |
return andnot_ps( set1_ps( -0.0f ), v ); | |
} | |
__forceinline __m256 relativeError( __m256 precise, __m256 approx ) | |
{ | |
const __m256 error = approx - precise; | |
const __m256 relError = error / precise; | |
return abs_ps( relError ); | |
} | |
class ErrorSummary | |
{ | |
MinMaxError m_amd, m_intel; | |
public: | |
__forceinline void update( __m256 amd, __m256 intel, __m256 sourceValues ) | |
{ | |
__m256 goodValuesMask = cmpneq_ps( sourceValues, setzero_ps() ); | |
goodValuesMask &= finiteValues( sourceValues ); | |
const __m256 amdFinite = finiteValues( amd ); | |
// Just in case, ensure the Intel/AMD have same finite values | |
if( !isEqual( amdFinite, finiteValues( intel ) ) ) | |
__debugbreak(); | |
goodValuesMask &= amdFinite; | |
const __m256 precise = set1_ps( 1 ) / sourceValues; | |
goodValuesMask &= finiteValues( precise ); | |
const int goodValuesScalar = movemask_ps( goodValuesMask ); | |
if( 0 == goodValuesScalar ) | |
return; | |
amd = relativeError( precise, amd ); | |
intel = relativeError( precise, intel ); | |
if( 0xFF == goodValuesScalar ) | |
{ | |
m_amd.updateAll( amd ); | |
m_intel.updateAll( intel ); | |
} | |
else | |
{ | |
m_amd.updateSome( amd, goodValuesScalar, goodValuesMask ); | |
m_intel.updateSome( intel, goodValuesScalar, goodValuesMask ); | |
} | |
} | |
void mergeFrom( const ErrorSummary& that ) | |
{ | |
m_amd.mergeFrom( that.m_amd ); | |
m_intel.mergeFrom( that.m_intel ); | |
} | |
void print() const | |
{ | |
m_amd.print( "AMD" ); | |
m_intel.print( "Intel" ); | |
} | |
}; | |
inline void compareBlock( const __m256* ptrAmd, const __m256* ptrIntel, uint32_t firstValue, ErrorSummary& es ) | |
{ | |
const __m256i offsets = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 ); | |
__m256i vec = _mm256_add_epi32( _mm256_set1_epi32( (int)firstValue ), offsets ); | |
const __m256* const endPointer = ptrAmd + blockSize; | |
while( ptrAmd < endPointer ) | |
{ | |
// Compute the result | |
const __m256 floats = _mm256_castsi256_ps( vec ); | |
const __m256 amd = _mm256_loadu_ps( (const float*)ptrAmd ); | |
const __m256 intel = _mm256_loadu_ps( (const float*)ptrIntel ); | |
es.update( amd, intel, floats ); | |
// Advance pointer + values | |
ptrAmd++; | |
ptrIntel++; | |
vec = _mm256_add_epi32( vec, _mm256_set1_epi32( 8 ) ); | |
} | |
} | |
int compareValues() | |
{ | |
// Open both files | |
MappedFile amd, intel; | |
if( FAILED( amd.create( readAmd, false ) ) || FAILED( intel.create( readIntel, false ) ) ) | |
return 1; | |
ErrorSummary summary; | |
CComAutoCriticalSection cs; | |
using CsLock = CComCritSecLock<CComAutoCriticalSection>; | |
#pragma omp parallel for | |
for( int64_t i = 0; i < totalVectors; i += blockSize ) | |
{ | |
ErrorSummary localSummary; | |
compareBlock( amd + i, intel + i, (uint32_t)( i * 8 ), localSummary ); | |
{ | |
CsLock __lock{ cs }; | |
summary.mergeFrom( localSummary ); | |
} | |
} | |
summary.print(); | |
return 0; | |
} | |
int main() | |
{ | |
// return produceValues( destFileName ); | |
// return graphValues(); | |
return compareValues(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment