Skip to content

Instantly share code, notes, and snippets.

@Const-me
Created December 8, 2019 08:47
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Const-me/a6d36f70a3a77de00c61cf4f6c17c7ac to your computer and use it in GitHub Desktop.
#include <stdint.h>
#include <cmath>
#include <atlfile.h>
#include <intrin.h>
#include <array>
// These headers are from there: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics
#include "Intrinsics/avx.hpp"
#include "Intrinsics/avx2.hpp"
#include "Intrinsics/sse.hpp"
#include "Intrinsics/sse2.hpp"
using namespace Intrinsics::Avx;
// 1GB, in bytes
constexpr uint64_t gb = ( 1 << 30 );
// Total count of different 32-bit floats, 2^32: the data files hold one entry per bit pattern
constexpr uint64_t totalFloats = 4 * gb;
// Count of __m256 lanes in the file
constexpr int64_t totalVectors = totalFloats / 8;
// Output path for produceValues(); switch between local and UNC path depending on which machine runs it
// LPCTSTR destFileName = LR"(C:\Temp\rcpss-amd.bin)";
LPCTSTR destFileName = LR"(\\CONST-PC\Temp\rcpss-intel.bin)";
// Input paths for graphValues() / compareValues(), one 16GB file per CPU vendor
LPCTSTR readAmd = LR"(C:\Temp\rcpss-amd.bin)";
LPCTSTR readIntel = LR"(D:\Temp\rcpss-intel.bin)";
// ==== I/O Boilerplate ====
// Evaluate an HRESULT-returning expression; on failure, print the expression's text and the
// error code, and return that HRESULT from the enclosing function.
// Only usable inside functions returning HRESULT (or a compatible integer type).
// Note: the local was renamed from `__hr` — identifiers containing a double underscore
// are reserved to the implementation in C++ ([lex.name]).
#define CHECK( hr ) { const HRESULT hr_ = (hr); if( FAILED( hr_ ) ) { printf( "%s failed: 0x%08x\n", #hr, hr_ ); return hr_; } }
// RAII wrapper over a 16GB memory-mapped file that stores one float per possible 32-bit float bit pattern.
class MappedFile
{
CAtlFile file;
CAtlFileMapping<__m256> map;
public:
// Open or create the file at `path` and map it into the address space.
// write = true: (re)create the file, resize it to 16GB, map it read/write.
// write = false: open an existing file, map it read-only.
// Returns S_OK on success, otherwise the failing HRESULT (already printed by CHECK).
HRESULT create( LPCTSTR path, bool write )
{
const DWORD acc = write ? GENERIC_READ | GENERIC_WRITE : GENERIC_READ;
const DWORD mode = write ? CREATE_ALWAYS : OPEN_EXISTING;
CHECK( file.Create( path, acc, 0, mode ) );
if( write )
{
// 4 bytes per float * 2^32 floats = 16GB
CHECK( file.SetSize( 4 * totalFloats ) );
CHECK( map.MapFile( file, 0, 0, PAGE_READWRITE, FILE_MAP_ALL_ACCESS ) );
}
else
CHECK( map.MapFile( file ) );
return S_OK;
}
// Pointer to the mapped data
operator __m256* const( ) { return map; }
// Use mapped table to read a single float: the bit pattern of the argument is the table index.
float operator[] ( float x ) const
{
// Reinterpret the float's bits as a 32-bit index; CopyMemory avoids strict-aliasing / alignment issues
uint32_t idx;
CopyMemory( &idx, &x, 4 );
const float* ptr = (const float*)( map.operator __m256 *( ) );
return ptr[ idx ];
}
};
// __m256 elements per block. Each element has 8 floats.
// 16M vectors * 32 bytes = 512MB per block; totalVectors / blockSize = 32 blocks total.
constexpr int64_t blockSize = 16 * 1024 * 1024;
// ==== Producing these 16GB binary files ====
// Fill one block of the output with rcp_ps() results.
// The i-th float written receives rcp_ps() of the float whose bit pattern is firstValue + i.
inline void computeBlock( __m256* pointer, uint32_t firstValue )
{
	using namespace Intrinsics::Avx::Int32;
	// Bit patterns of the 8 source floats in the current vector, [ firstValue .. firstValue + 7 ]
	__m256i bitPatterns = set1_epu32( firstValue ) + set_epu32( 0, 1, 2, 3, 4, 5, 6, 7 );
	const __m256i step = set1_epu32( 8 );
	for( __m256* const endPointer = pointer + blockSize; pointer < endPointer; pointer++, bitPatterns += step )
	{
		// Reinterpret the integers as floats, approximate the reciprocals, store the vector
		const __m256 approx = rcp_ps( castall_ps( bitPatterns ) );
		storeu_ps( (float*)pointer, approx );
	}
}
// Create the 16GB output file at destPath and fill it with rcp_ps() of every possible float.
// Returns 0 on success, 1 on I/O failure.
int produceValues( LPCTSTR destPath )
{
	// Create and map the 16GB destination file
	MappedFile output;
	const HRESULT hr = output.create( destPath, true );
	if( FAILED( hr ) )
		return 1;
	// Blocks are independent 512MB slices of the mapping, so the loop parallelizes trivially
#pragma omp parallel for
	for( int64_t block = 0; block < totalVectors; block += blockSize )
		computeBlock( output + block, (uint32_t)( block * 8 ) );
	return 0;
}
// ==== Producing .tsv for Excel ====
int graphValues()
{
// Open both files
MappedFile amd, intel;
if( FAILED( amd.create( readAmd, false ) ) || FAILED( intel.create( readIntel, false ) ) )
return 1;
// Compare the content
constexpr int valuesPerInt = 64;
constexpr double mul = 1.0 / valuesPerInt;
printf( "x\tamd\tintel\n" );
for( int i = 1; i <= 10 * 64; i++ )
{
const double x = mul * i;
const double precise = 1.0 / x;
const float f = (float)x;
const double approxAmd = amd[ f ];
const double approxIntel = intel[ f ];
printf( "%f\t%f\t%f\n", x, abs( approxAmd - precise ), abs( approxIntel - precise ) );
}
return 0;
}
// ==== Summarizing errors over the complete range of floats ====
// Accumulates maximum and average relative error over a stream of 8-lane float vectors.
// Despite the name, it tracks maximum + running total (for the average), not minimum.
class MinMaxError
{
// Per-lane running maximum of the error values; reduced horizontally in getMaxError()
__m256 maxError;
// Per-lane running sum of the error values, kept in double precision to limit rounding drift
__m256d totalError;
// Count of individual float values accumulated so far
size_t valuesCount;
// Reduce the 8 lanes of maxError to a single scalar maximum
double getMaxError() const
{
using namespace Intrinsics::Sse;
// Compute horizontal maximum of 8 values in maxError: fold 8 -> 4 -> 2 -> 1 lanes
__m128 result = castps256_ps128( maxError );
result = max_ps( result, extractf128_ps<1>( maxError ) );
result = max_ps( result, movehl_ps( result, result ) );
result = max_ss( result, shuffle_ps<1, 1, 1, 1>( result, result ) );
return cvtss_f32( result );
}
// Reduce the 4 lanes of totalError to a single sum, then divide by the count of values
double getAvgError() const
{
using namespace Intrinsics::Sse;
// Compute horizontal sum of 4 values in totalError: fold 4 -> 2 -> 1 lanes
__m128d result = castpd256_pd128( totalError );
result += extractf128_pd<1>( totalError );
result += shuffle_pd<1, 1>( result, result );
// Return the average
return cvtsd_f64( result ) / valuesCount;
}
public:
MinMaxError()
{
maxError = setzero_ps();
totalError = setzero_pd();
valuesCount = 0;
}
// Accumulate a vector where all 8 lanes hold valid error values
__forceinline void updateAll( __m256 v )
{
// Accumulate max. error
maxError = max_ps( maxError, v );
// Accumulate total error, widening the floats to doubles one 4-lane half at a time
totalError += cvtps_pd( castps256_ps128( v ) );
totalError += cvtps_pd( extractf128_ps<1>( v ) );
// Increment values count
valuesCount += 8;
}
// Accumulate only the lanes selected by the mask.
// maskScalar is movemask_ps() of maskVec; both describe the same set of lanes.
__forceinline void updateSome( __m256 v, int maskScalar, __m256 maskVec )
{
if( 0 == maskScalar )
return;
// Accumulate max. error only in the selected lanes
maxError = blendv_ps( maxError, max_ps( maxError, v ), maskVec );
// Accumulate total error. The float mask survives cvtps_pd because an all-ones lane is a
// negative NaN whose sign bit is preserved by the conversion, and blendv selects on the
// sign bit — NOTE(review): relies on blendv testing only the sign bit; confirm for this wrapper.
__m256d dbl, dblMask;
dbl = cvtps_pd( castps256_ps128( v ) );
dblMask = cvtps_pd( castps256_ps128( maskVec ) );
totalError = blendv_pd( totalError, totalError + dbl, dblMask );
dbl = cvtps_pd( extractf128_ps<1>( v ) );
dblMask = cvtps_pd( extractf128_ps<1>( maskVec ) );
totalError = blendv_pd( totalError, totalError + dbl, dblMask );
// Increment values count by the count of selected lanes
valuesCount += __popcnt( (unsigned int)maskScalar );
}
// Merge statistics accumulated by another (e.g. per-thread) instance
void mergeFrom( const MinMaxError& that )
{
maxError = max_ps( maxError, that.maxError );
totalError += that.totalError;
valuesCount += that.valuesCount;
}
// Print a one-line summary of the accumulated statistics
void print( const char* what ) const
{
printf( "%s: max rel. error %f, avg. rel. error %f, %zu values\n",
what, getMaxError(), getAvgError(), valuesCount );
}
};
// Build a per-lane mask: all-ones where x is a finite, non-zero float; zero elsewhere.
__forceinline __m256 finiteValues( __m256 x )
{
using namespace Intrinsics::Avx::Int32;
__m256i i = castps_all( x );
const __m256i expBits = set1_epu32( 0x7F800000 );
i &= expBits; // expBits if INF or NAN, something else otherwise
i = ( i == expBits ); // 0xFFFFFFFF if INF or NAN, 0 otherwise
i = ~i; // 0 if INF or NAN, 0xFFFFFFFF if it's finite
// Also require x != 0. NaN lanes are already zeroed by the integer mask above,
// so the compare's behavior on unordered inputs can't affect the result.
return castall_ps( i ) & cmpneq_ps( x, setzero_ps() );
}
// Test for exact bitwise equality of two vectors
__forceinline bool isEqual( __m256 a, __m256 b )
{
	// XOR is zero in every lane iff all 256 bits match; testz checks the whole register for zero
	const __m256 diffBits = xor_ps( a, b );
	const __m256i asInt = castps_all( diffBits );
	return testz_all( asInt, asInt );
}
// Absolute value of all 8 lanes
__forceinline __m256 abs_ps( __m256 v )
{
	// -0.0f has only the sign bit set; andnot computes ( ~signBits ) & v, clearing the sign of every lane
	const __m256 signBits = set1_ps( -0.0f );
	return andnot_ps( signBits, v );
}
// Relative error of the approximation: | ( approx - precise ) / precise |, per lane
__forceinline __m256 relativeError( __m256 precise, __m256 approx )
{
	return abs_ps( ( approx - precise ) / precise );
}
// Accumulates error statistics for both CPU vendors in a single pass over the data.
class ErrorSummary
{
MinMaxError m_amd, m_intel;
public:
// Accumulate one vector of 8 samples.
// amd / intel are the vendors' stored approximations; sourceValues are the inputs they were computed from.
__forceinline void update( __m256 amd, __m256 intel, __m256 sourceValues )
{
// Only measure lanes where the source is a non-zero finite float
__m256 goodValuesMask = cmpneq_ps( sourceValues, setzero_ps() );
goodValuesMask &= finiteValues( sourceValues );
const __m256 amdFinite = finiteValues( amd );
// Just in case, ensure the Intel/AMD have same finite values
if( !isEqual( amdFinite, finiteValues( intel ) ) )
__debugbreak();
goodValuesMask &= amdFinite;
// Exact (full-precision) reciprocal to measure the approximations against
const __m256 precise = set1_ps( 1 ) / sourceValues;
// Skip lanes where even the exact reciprocal is not a non-zero finite number
goodValuesMask &= finiteValues( precise );
const int goodValuesScalar = movemask_ps( goodValuesMask );
if( 0 == goodValuesScalar )
return;
amd = relativeError( precise, amd );
intel = relativeError( precise, intel );
if( 0xFF == goodValuesScalar )
{
// All 8 lanes are good, use the cheaper unmasked accumulation
m_amd.updateAll( amd );
m_intel.updateAll( intel );
}
else
{
m_amd.updateSome( amd, goodValuesScalar, goodValuesMask );
m_intel.updateSome( intel, goodValuesScalar, goodValuesMask );
}
}
// Merge statistics accumulated by another (e.g. per-thread) instance
void mergeFrom( const ErrorSummary& that )
{
m_amd.mergeFrom( that.m_amd );
m_intel.mergeFrom( that.m_intel );
}
// Print the summaries for both vendors
void print() const
{
m_amd.print( "AMD" );
m_intel.print( "Intel" );
}
};
inline void compareBlock( const __m256* ptrAmd, const __m256* ptrIntel, uint32_t firstValue, ErrorSummary& es )
{
const __m256i offsets = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
__m256i vec = _mm256_add_epi32( _mm256_set1_epi32( (int)firstValue ), offsets );
const __m256* const endPointer = ptrAmd + blockSize;
while( ptrAmd < endPointer )
{
// Compute the result
const __m256 floats = _mm256_castsi256_ps( vec );
const __m256 amd = _mm256_loadu_ps( (const float*)ptrAmd );
const __m256 intel = _mm256_loadu_ps( (const float*)ptrIntel );
es.update( amd, intel, floats );
// Advance pointer + values
ptrAmd++;
ptrIntel++;
vec = _mm256_add_epi32( vec, _mm256_set1_epi32( 8 ) );
}
}
int compareValues()
{
// Open both files
MappedFile amd, intel;
if( FAILED( amd.create( readAmd, false ) ) || FAILED( intel.create( readIntel, false ) ) )
return 1;
ErrorSummary summary;
CComAutoCriticalSection cs;
using CsLock = CComCritSecLock<CComAutoCriticalSection>;
#pragma omp parallel for
for( int64_t i = 0; i < totalVectors; i += blockSize )
{
ErrorSummary localSummary;
compareBlock( amd + i, intel + i, (uint32_t)( i * 8 ), localSummary );
{
CsLock __lock{ cs };
summary.mergeFrom( localSummary );
}
}
summary.print();
return 0;
}
// Entry point. Uncomment exactly one step of the workflow:
// step 1 — produceValues() on each vendor's CPU; step 2 — graphValues() or compareValues() on both files.
int main()
{
// return produceValues( destFileName );
// return graphValues();
return compareValues();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment