Created
September 26, 2020 18:06
-
-
Save bosilca/b1f4463fb7401f6d2af2dcc8dc3eb1eb to your computer and use it in GitHub Desktop.
Playground for the AVX512 support on KNL / KNC.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>  /* uint32_t / int32_t are used on every compiler branch */
#include <immintrin.h>
/* Capability bits reported by has_intel_AVX_features() and tested at
 * run time through AVX_HAS_FLAGS(). One bit per ISA extension. */
#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
#define OMPI_OP_AVX_HAS_AVX512F_FLAG  0x00000100
#define OMPI_OP_AVX_HAS_AVX2_FLAG     0x00000020
#define OMPI_OP_AVX_HAS_AVX_FLAG      0x00000010
#define OMPI_OP_AVX_HAS_SSE4_1_FLAG   0x00000008
#define OMPI_OP_AVX_HAS_SSE3_FLAG     0x00000004
#define OMPI_OP_AVX_HAS_SSE2_FLAG     0x00000002
#define OMPI_OP_AVX_HAS_SSE_FLAG      0x00000001

/* Feature bits detected on the running CPU; populated once in main(). */
static int intel_avx_flags = 0;
/** | |
* A slightly modified code from | |
* https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family | |
*/ | |
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300) | |
/**
 * Build a bitmask of OMPI_OP_AVX_HAS_* capability flags using the Intel
 * compiler's _may_i_use_cpu_feature() builtin, which combines a CPUID
 * probe with an OS-support check for each _FEATURE_* constant.
 *
 * NOTE(review): this branch uses uint32_t but <stdint.h> is only
 * included in the non-Intel #else branch below — confirm the Intel
 * headers pull it in transitively.
 */
static uint32_t has_intel_AVX_features(void)
{
    uint32_t flags = 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F)  ? OMPI_OP_AVX_HAS_AVX512F_FLAG  : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX2)     ? OMPI_OP_AVX_HAS_AVX2_FLAG     : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_AVX)      ? OMPI_OP_AVX_HAS_AVX_FLAG      : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1)   ? OMPI_OP_AVX_HAS_SSE4_1_FLAG   : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE3)     ? OMPI_OP_AVX_HAS_SSE3_FLAG     : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE2)     ? OMPI_OP_AVX_HAS_SSE2_FLAG     : 0;
    flags |= _may_i_use_cpu_feature(_FEATURE_SSE)      ? OMPI_OP_AVX_HAS_SSE_FLAG      : 0;
    return flags;
}
#else /* non-Intel compiler */ | |
#include <stdint.h> | |
#if defined(_MSC_VER) | |
#include <intrin.h> | |
#endif | |
/**
 * Execute CPUID with the given leaf (eax) and sub-leaf (ecx) and store
 * the resulting EAX/EBX/ECX/EDX registers into abcd[0..3].
 *
 * NOTE(review): MSVC's __cpuidex is declared with int[4]; abcd is
 * uint32_t* here — confirm this compiles cleanly (it may only warn).
 */
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
{
#if defined(_MSC_VER)
    __cpuidex(abcd, eax, ecx);
#else
    uint32_t ebx = 0, edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )
    /* in case of PIC under 32-bit EBX cannot be clobbered: save it in
     * EDI around the CPUID instruction and return the value via "=D" */
    __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
#else
    __asm__ ( "cpuid" : "+b" (ebx),
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
    "+a" (eax), "+c" (ecx), "=d" (edx) );
    abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
static uint32_t has_intel_AVX_features(void) | |
{ | |
/* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */ | |
const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX | |
const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX | |
const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX | |
const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX | |
const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX | |
const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX | |
const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX | |
const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX | |
uint32_t flags = 0, abcd[4]; | |
run_cpuid( 1, 0, abcd ); | |
flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0; | |
flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0; | |
flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0; | |
flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0; | |
flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0; | |
#if defined(__APPLE__) | |
uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */ | |
// OS supports extended processor state management ? | |
if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask ) | |
return 0; | |
#endif /* defined(__APPLE__) */ | |
run_cpuid( 7, 0, abcd ); | |
flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0; | |
flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0; | |
flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0; | |
return flags; | |
} | |
#endif /* non-Intel compiler */ | |
/* True iff every capability bit in _flag is set in intel_avx_flags. */
#define AVX_HAS_FLAGS(_flag) \
    ((((_flag) & intel_avx_flags)) == (_flag))
void do_add_int32_avx512(const int32_t *in, int32_t *out, int *count) | |
{ | |
int left_over = *count; | |
if( AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { | |
int types_per_step = (512 / 8) / sizeof(int32_t); | |
for( ; left_over >= types_per_step; left_over -= types_per_step ) { | |
__m512i vecA = _mm512_loadu_si512((__m512*)in); | |
in += types_per_step; | |
__m512i vecB = _mm512_loadu_si512((__m512*)out); | |
__m512i res = _mm512_add_epi32(vecA, vecB); | |
_mm512_storeu_si512((__m512*)out, res); | |
out += types_per_step; | |
} | |
*count = left_over; | |
} else { | |
printf("Lacks AVX512 capabilities\n"); | |
} | |
} | |
void do_add_int32_avx2(const int32_t *in, int32_t *out, int *count) | |
{ | |
int left_over = *count; | |
if( AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { | |
int types_per_step = (256 / 8) / sizeof(int32_t); /* AVX2 */ | |
for( ; left_over >= types_per_step; left_over -= types_per_step ) { | |
__m256i vecA = _mm256_loadu_si256((__m256i*)in); | |
in += types_per_step; | |
__m256i vecB = _mm256_loadu_si256((__m256i*)out); | |
__m256i res = _mm256_add_epi32(vecA, vecB); | |
_mm256_storeu_si256((__m256i*)out, res); | |
out += types_per_step; | |
} | |
*count = left_over; | |
} else { | |
printf("Lacks AVX2 capabilities\n"); | |
} | |
} | |
void do_add_int32_sse(const int32_t *in, int32_t *out, int *count) | |
{ | |
int left_over = *count; | |
if( AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { | |
int types_per_step = (128 / 8) / sizeof(int32_t); /* AVX */ | |
for( ; left_over >= types_per_step; left_over -= types_per_step ) { | |
__m128i vecA = _mm_lddqu_si128((__m128i*)in); | |
in += types_per_step; | |
__m128i vecB = _mm_lddqu_si128((__m128i*)out); | |
__m128i res = _mm_add_epi32(vecA, vecB); | |
_mm_storeu_si128((__m128i*)out, res); | |
out += types_per_step; | |
} | |
*count = left_over; | |
} else { | |
printf("Lacks SSE4 capabilities\n"); | |
} | |
} | |
/**
 * out[i] += in[i] for count elements, cascading from the widest SIMD
 * implementation available at run time (AVX-512 -> AVX2 -> SSE) down
 * to a scalar tail loop.
 *
 * BUGFIX: each SIMD helper advances only its local pointer copies, so
 * the caller must skip past the elements a stage consumed before
 * invoking the next stage. The original passed the unadvanced pointers
 * to every stage, re-summing the front of the buffers (double/triple
 * counting) whenever a later stage had work left to do.
 */
void do_add_int32(const int32_t *in, int32_t *out, int count)
{
    int left_over = count;
    int before;

    do_add_int32_avx512(in, out, &left_over);
    in  += count - left_over;   /* skip the elements AVX-512 consumed */
    out += count - left_over;
    if( 0 == left_over ) return;

    before = left_over;
    do_add_int32_avx2(in, out, &left_over);
    in  += before - left_over;  /* skip the elements AVX2 consumed */
    out += before - left_over;
    if( 0 == left_over ) return;

    before = left_over;
    do_add_int32_sse(in, out, &left_over);
    in  += before - left_over;  /* skip the elements SSE consumed */
    out += before - left_over;

    /* Scalar tail for whatever the SIMD stages could not cover. */
    for( int i = 0; i < left_over; i++ )
        out[i] += in[i];
}
int main(int argc, char* argv[]) | |
{ | |
int32_t count = 1024, *in, *out; | |
intel_avx_flags = has_intel_AVX_features(); | |
in = (int32_t*)malloc(count * sizeof(int32_t)); | |
out = (int32_t*)malloc(count * sizeof(int32_t)); | |
do_add_int32(in, out, count); | |
free(in); | |
free(out); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When I compile this using gcc 9.3.0 targeting KNL, I get this: