Skip to content

Instantly share code, notes, and snippets.

@bosilca
Created September 26, 2020 18:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bosilca/b1f4463fb7401f6d2af2dcc8dc3eb1eb to your computer and use it in GitHub Desktop.
Save bosilca/b1f4463fb7401f6d2af2dcc8dc3eb1eb to your computer and use it in GitHub Desktop.
Playground for the AVX512 support on KNL / KNC.
#include <stdlib.h>
#include <stdio.h>
#include <immintrin.h>
/*
 * Bit flags, one per x86 SIMD instruction-set extension. They are OR-ed
 * together by has_intel_AVX_features() and tested with AVX_HAS_FLAGS().
 */
#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
#define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100
#define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020
#define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010
#define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008
#define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004
#define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002
#define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001
/* Set of detected feature flags; assigned in main() from has_intel_AVX_features(). */
static int intel_avx_flags = 0;
/**
* A slightly modified code from
* https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
/**
 * Intel-compiler variant: ask _may_i_use_cpu_feature() about each SIMD
 * extension of interest and translate every positive answer into the
 * matching OMPI_OP_AVX_HAS_*_FLAG bit.
 *
 * @return bitwise OR of the flags for all features reported as usable.
 */
static uint32_t has_intel_AVX_features(void)
{
    uint32_t detected = 0;

    if( _may_i_use_cpu_feature(_FEATURE_AVX512F) ) {
        detected |= OMPI_OP_AVX_HAS_AVX512F_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_AVX512BW) ) {
        detected |= OMPI_OP_AVX_HAS_AVX512BW_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_AVX2) ) {
        detected |= OMPI_OP_AVX_HAS_AVX2_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_AVX) ) {
        detected |= OMPI_OP_AVX_HAS_AVX_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_SSE4_1) ) {
        detected |= OMPI_OP_AVX_HAS_SSE4_1_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_SSE3) ) {
        detected |= OMPI_OP_AVX_HAS_SSE3_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_SSE2) ) {
        detected |= OMPI_OP_AVX_HAS_SSE2_FLAG;
    }
    if( _may_i_use_cpu_feature(_FEATURE_SSE) ) {
        detected |= OMPI_OP_AVX_HAS_SSE_FLAG;
    }
    return detected;
}
#else /* non-Intel compiler */
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
/**
 * Execute the CPUID instruction for one leaf/subleaf pair.
 *
 * @param eax   value loaded into EAX before CPUID (the leaf)
 * @param ecx   value loaded into ECX before CPUID (the subleaf)
 * @param abcd  output array of at least 4 entries; on the inline-assembly
 *              path it receives EAX, EBX, ECX, EDX in that order
 */
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
{
#if defined(_MSC_VER)
/* NOTE(review): __cpuidex is documented with an int[4] output; abcd is a
 * uint32_t* here - confirm this builds cleanly without a cast. */
__cpuidex(abcd, eax, ecx);
#else
uint32_t ebx = 0, edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )
/* in case of PIC under 32-bit EBX cannot be clobbered */
/* EBX is parked in EDI across CPUID and swapped back afterwards, so the
 * EBX result is read out of EDI (the "=D" constraint). */
__asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
#else
__asm__ ( "cpuid" : "+b" (ebx),
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
"+a" (eax), "+c" (ecx), "=d" (edx) );
/* eax and ecx are in/out operands: they now hold the values CPUID returned */
abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
/**
 * Detect SIMD instruction-set support by querying CPUID directly.
 *
 * Leaf 1 supplies the SSE, SSE2, SSE3, SSE4.1 and AVX bits; leaf 7
 * (subleaf 0) supplies AVX2, AVX512F and AVX512BW. Leaf 7 is only queried
 * when leaf 0 reports it as a valid basic leaf, because CPUID returns
 * unrelated data for leaves above the supported maximum.
 *
 * NOTE(review): only the CPU feature bits are inspected; whether the OS
 * actually saves the YMM/ZMM state (XGETBV) is not verified here.
 *
 * @return bitwise OR of the OMPI_OP_AVX_HAS_*_FLAG values of every detected
 *         feature; 0 on Apple builds when FMA, MOVBE or OSXSAVE is missing.
 */
static uint32_t has_intel_AVX_features(void)
{
    /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
    const uint32_t avx512f_mask = (1U << 16);   /* AVX512F  (EAX = 7, ECX = 0) : EBX */
    const uint32_t avx512_bw_mask = (1U << 30); /* AVX512BW (EAX = 7, ECX = 0) : EBX */
    const uint32_t avx2_mask = (1U << 5);       /* AVX2     (EAX = 7, ECX = 0) : EBX */
    const uint32_t avx_mask = (1U << 28);       /* AVX      (EAX = 1, ECX = 0) : ECX */
    const uint32_t sse4_1_mask = (1U << 19);    /* SSE4.1   (EAX = 1, ECX = 0) : ECX */
    const uint32_t sse3_mask = (1U << 0);       /* SSE3     (EAX = 1, ECX = 0) : ECX */
    const uint32_t sse2_mask = (1U << 26);      /* SSE2     (EAX = 1, ECX = 0) : EDX */
    const uint32_t sse_mask = (1U << 25);       /* SSE      (EAX = 1, ECX = 0) : EDX (bit 15 is CMOV) */
    uint32_t flags = 0, abcd[4], max_leaf;

    /* Leaf 0 returns the highest supported basic leaf in EAX. */
    run_cpuid( 0, 0, abcd );
    max_leaf = abcd[0];
    if( max_leaf < 1 )
        return 0;

    run_cpuid( 1, 0, abcd );
    flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
    flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
    flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
    flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
    flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
#if defined(__APPLE__)
    const uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
    /* OS supports extended processor state management ? */
    if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
        return 0;
#endif /* defined(__APPLE__) */

    /* Without leaf 7 there is no AVX2/AVX512 information to read. */
    if( max_leaf < 7 )
        return flags;

    run_cpuid( 7, 0, abcd );
    flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
    flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
    flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
    return flags;
}
#endif /* non-Intel compiler */
/* Evaluates to true when every bit set in _flag is also set in intel_avx_flags. */
#define AVX_HAS_FLAGS(_flag) \
(((_flag) & intel_avx_flags) == (_flag))
/**
 * Add in[] into out[] (out[i] += in[i]) sixteen int32 values at a time using
 * 512-bit vectors.
 *
 * Only AVX512F is required: the unaligned load/store and _mm512_add_epi32
 * all belong to the foundation subset, so processors that implement AVX512F
 * without AVX512BW (e.g. KNL) can run this kernel.
 *
 * @param in     source operand, read only
 * @param out    destination operand, updated in place
 * @param count  in: number of elements available; out: number of elements
 *               NOT processed (the remainder smaller than one vector).
 *               Left untouched when AVX512F is unavailable.
 *
 * The pointers are advanced on local copies only; the caller must skip the
 * processed elements itself, using the change in *count.
 */
void do_add_int32_avx512(const int32_t *in, int32_t *out, int *count)
{
    int left_over = *count;
    if( AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG) ) {
        const int types_per_step = (int)((512 / 8) / sizeof(int32_t));
        for( ; left_over >= types_per_step; left_over -= types_per_step ) {
            /* the si512 load/store intrinsics take untyped pointers */
            __m512i vecA = _mm512_loadu_si512((const void*)in);
            in += types_per_step;
            __m512i vecB = _mm512_loadu_si512((const void*)out);
            __m512i res = _mm512_add_epi32(vecA, vecB);
            _mm512_storeu_si512((void*)out, res);
            out += types_per_step;
        }
        *count = left_over;
    } else {
        printf("Lacks AVX512 capabilities\n");
    }
}
/**
 * 256-bit kernel: out[i] += in[i], eight int32 values per iteration.
 *
 * Runs only when both the AVX and AVX2 flags are set; otherwise a message
 * is printed and *count is left as it was.
 *
 * @param in     source operand
 * @param out    destination operand, updated in place
 * @param count  in: elements available; out: elements left unprocessed
 */
void do_add_int32_avx2(const int32_t *in, int32_t *out, int *count)
{
    int remaining = *count;

    if( !AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) {
        printf("Lacks AVX2 capabilities\n");
        return;
    }

    /* number of int32 lanes in one 256-bit register */
    const int lanes = (256 / 8) / sizeof(int32_t);
    while( remaining >= lanes ) {
        const __m256i src = _mm256_loadu_si256((const __m256i*)in);
        const __m256i dst = _mm256_loadu_si256((const __m256i*)out);
        _mm256_storeu_si256((__m256i*)out, _mm256_add_epi32(src, dst));
        in += lanes;
        out += lanes;
        remaining -= lanes;
    }
    *count = remaining;
}
/**
 * 128-bit kernel: out[i] += in[i], four int32 values per iteration.
 *
 * Runs only when both the SSE3 and SSE4.1 flags are set; otherwise a
 * message is printed and *count is left as it was.
 *
 * @param in     source operand
 * @param out    destination operand, updated in place
 * @param count  in: elements available; out: elements left unprocessed
 */
void do_add_int32_sse(const int32_t *in, int32_t *out, int *count)
{
    int remaining = *count;

    if( !AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) {
        printf("Lacks SSE4 capabilities\n");
        return;
    }

    /* number of int32 lanes in one 128-bit register */
    const int lanes = (128 / 8) / sizeof(int32_t);
    while( remaining >= lanes ) {
        const __m128i src = _mm_lddqu_si128((const __m128i*)in);
        const __m128i dst = _mm_lddqu_si128((const __m128i*)out);
        _mm_storeu_si128((__m128i*)out, _mm_add_epi32(src, dst));
        in += lanes;
        out += lanes;
        remaining -= lanes;
    }
    *count = remaining;
}
/**
 * Compute out[i] += in[i] for count int32 elements.
 *
 * The widest available vector kernel is tried first (AVX512, then AVX2,
 * then SSE); whatever remains is finished with a scalar loop.
 *
 * Each kernel receives the pointers by value and reports only how many
 * elements it left unprocessed, so after every kernel call the pointers are
 * moved past the elements that kernel consumed. Without this, the next
 * stage would restart at element 0, adding the leading elements twice and
 * never touching the real tail.
 *
 * @param in     source operand, count elements
 * @param out    destination operand, count elements, updated in place
 * @param count  number of elements; values <= 0 result in no work
 */
void do_add_int32(const int32_t *in, int32_t *out, int count)
{
    int left_over = count;
    int before = left_over;

    do_add_int32_avx512(in, out, &left_over);
    if( 0 == left_over ) return;
    in += before - left_over;
    out += before - left_over;

    before = left_over;
    do_add_int32_avx2(in, out, &left_over);
    if( 0 == left_over ) return;
    in += before - left_over;
    out += before - left_over;

    before = left_over;
    do_add_int32_sse(in, out, &left_over);
    in += before - left_over;
    out += before - left_over;

    /* Scalar tail. The addition is done on unsigned values so that overflow
     * wraps around like the vector kernels instead of being undefined. */
    for( int i = 0; i < left_over; i++ ) {
        out[i] = (int32_t)((uint32_t)out[i] + (uint32_t)in[i]);
    }
}
/**
 * Playground driver: detect the SIMD features, add two known vectors with
 * do_add_int32() and verify the result.
 *
 * The buffers are filled with known values before the addition (they were
 * previously read uninitialised) and both allocations are checked.
 *
 * @return EXIT_SUCCESS when every element matches the expected sum,
 *         EXIT_FAILURE on allocation failure or on any mismatch.
 */
int main(int argc, char* argv[])
{
    const int32_t count = 1024;
    int32_t *in, *out;
    int errors = 0;

    (void)argc;
    (void)argv;

    intel_avx_flags = (int)has_intel_AVX_features();

    in = malloc((size_t)count * sizeof *in);
    out = malloc((size_t)count * sizeof *out);
    if( NULL == in || NULL == out ) {
        fprintf(stderr, "Failed to allocate 2 buffers of %d elements\n", (int)count);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    for( int32_t i = 0; i < count; i++ ) {
        in[i] = i;
        out[i] = 2 * i;
    }

    do_add_int32(in, out, count);

    /* out[i] started as 2*i and had in[i] == i added to it */
    for( int32_t i = 0; i < count; i++ ) {
        if( out[i] != 3 * i ) {
            errors++;
        }
    }
    if( 0 != errors ) {
        fprintf(stderr, "%d of %d elements have an unexpected value\n", errors, (int)count);
    }

    free(in);
    free(out);
    return (0 == errors) ? EXIT_SUCCESS : EXIT_FAILURE;
}
@hppritcha
Copy link

When I compile this with gcc 9.3.0 targeting KNL, I get this:

hpp@nid00192[]:/XXXXXX/hpp>./check_avx
Lacks AVX512 capabilities

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment