Last active
May 2, 2018 15:24
-
-
Save manodeep/ffdc60024fd6df8b5264657f0be2f967 to your computer and use it in GitHub Desktop.
Simple AVX512F kernel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const int64_t bits_set_in_avx512_mask_float[] = { B16(0) }; | |
const uint16_t masks_per_misalignment_value_float[] = {0b1111111111111111, | |
0b0000000000000001, | |
0b0000000000000011, | |
0b0000000000000111, | |
0b0000000000001111, | |
0b0000000000011111, | |
0b0000000000111111, | |
0b0000000001111111, | |
0b0000000011111111, | |
0b0000000111111111, | |
0b0000001111111111, | |
0b0000011111111111, | |
0b0000111111111111, | |
0b0001111111111111, | |
0b0011111111111111, | |
0b0111111111111111}; | |
const int64_t bits_set_in_avx512_mask_double[] = { B8(0) }; | |
const uint8_t masks_per_misalignment_value_double[] = {0b11111111, | |
0b00000001, | |
0b00000011, | |
0b00000111, | |
0b00001111, | |
0b00011111, | |
0b00111111, | |
0b01111111}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* File: avx_calls.h */ | |
/* | |
This file is a part of the Corrfunc package | |
Copyright (C) 2015-- Manodeep Sinha (manodeep@gmail.com) | |
License: MIT LICENSE. See LICENSE file under the top-level | |
directory at https://github.com/manodeep/Corrfunc/ | |
*/ | |
#pragma once | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <immintrin.h> | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
#include "function_precision.h" | |
#define PREFETCH(mem) asm ("prefetcht0 %0"::"m"(mem)) | |
#if defined(__GNUC__) || defined(__GNUG__) | |
#define AVX512_BIT_COUNT_INT(X) __builtin_popcount(X) | |
#else | |
#define AVX512_BIT_COUNT_INT(X) _popcnt32(X) | |
#endif | |
#define AVX512_BIT_COUNT_LONG(X) _popcnt64(X) | |
#define AVX512_BIT_COUNT_UNSIGNED_INT(X) _mm_popcnt_u32(X) | |
#define AVX512_BIT_COUNT_UNSIGNED_LONG(X) _mm_popcnt_u64(X) | |
#define AVX512_MASK_BITWISE_AND(X,Y) _mm512_kand(X,Y) | |
#define AVX512_MASK_BITWISE_OR(X,Y) _mm512_kor(X,Y) | |
#define AVX512_MASK_BITWISE_XOR_FLOATS(X,Y) _mm512_kxor(X,Y) | |
#define AVX512_MASK_BITWISE_AND_NOT(X,Y) _mm512_kandn(X,Y) //~X & Y | |
/* For setting up the array that contains the number bits set in a mask */ | |
// Can be used to generate up to 16 bit lookup tables | |
# define B2(n) n, n+1, n+1, n+2 | |
# define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2) | |
# define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2) | |
# define B8(n) B6(n), B6(n+1), B6(n+1), B6(n+2) | |
# define B10(n) B8(n), B8(n+1), B8(n+1), B8(n+2) | |
# define B12(n) B10(n), B10(n+1), B10(n+1), B10(n+2) | |
# define B14(n) B12(n), B12(n+1), B12(n+1), B12(n+2) | |
# define B16(n) B14(0),B14(1), B14(1), B14(2) | |
#ifndef DOUBLE_PREC | |
#define DOUBLE float | |
#define AVX512_NVEC 16 | |
#define AVX512_MASK __mmask16 | |
#define AVX512_FLOATS __m512 | |
#define AVX512_INTS __m512i | |
#define AVX512_SET_INT(X) _mm512_set1_epi32(X) | |
#define AVX512_SETZERO_INT() _mm512_setzero_epi32() | |
#if 0 | |
/* commenting out the integer math operations since they are either cumbersome or produce results of different SIMD widths*/ | |
#define AVX512_ADD_INTS(X, Y) _mm512_add_epi32(X, Y) | |
#define AVX512_MASK_ADD_INTS(FALSEVALS, MASK, X, Y) _mm512_mask_add_epi32(FALSEVALS, MASK, X, Y) | |
#define AVX512_MASKZ_ADD_INTS(MASK, X, Y) _mm512_maskz_add_epi32(MASK, X, Y) | |
#define AVX512_MULTIPLY_INTS(X, Y) _mm512_mul_epi32(X, Y) | |
#define AVX512_MASK_MULTIPLY_INTS(FALSEVALS, MASK, X, Y) _mm512_mask_mul_epi32(FALSEVALS, MASK, X, Y) | |
#define AVX512_MASKZ_MULTIPLY_INTS(MASK, X, Y) _mm512_maskz_mul_epi32(MASK, X, Y) | |
#endif /*end of integer math*/ | |
#define AVX512_SETZERO_FLOAT() _mm512_setzero_ps() | |
#define AVX512_LOAD_FLOATS_UNALIGNED(X) _mm512_loadu_ps(X) | |
#define AVX512_MASK_LOAD_FLOATS_UNALIGNED(FALSEVALS, MASK, X) _mm512_mask_loadu_ps(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_LOAD_FLOATS_UNALIGNED(MASK, X) _mm512_maskz_loadu_ps(MASK, X) | |
#define AVX512_LOAD_FLOATS_ALIGNED(X) _mm512_load_ps(X) | |
#define AVX512_MASK_LOAD_FLOATS_ALIGNED(FALSEVALS, MASK, X) _mm512_mask_load_ps(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_LOAD_FLOATS_ALIGNED(MASK, X) _mm512_maskz_load_ps(MASK, X) | |
#define AVX512_MULTIPLY_FLOATS(X,Y) _mm512_mul_ps(X,Y) | |
#define AVX512_MASK_MULTIPLY_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_mul_ps(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_MULTIPLY_FLOATS(MASK, X,Y) _mm512_maskz_mul_ps(MASK, X,Y) | |
#define AVX512_DIVIDE_FLOATS(X,Y) _mm512_div_ps(X,Y) | |
#define AVX512_MASK_DIVIDE_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_div_ps(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_DIVIDE_FLOATS(MASK, X,Y) _mm512_maskz_div_ps(MASK, X,Y) | |
#define AVX512_ADD_FLOATS(X,Y) _mm512_add_ps(X,Y) | |
#define AVX512_MASK_ADD_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_add_ps(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_ADD_FLOATS(MASK, X,Y) _mm512_maskz_add_ps(MASK, X,Y) | |
#define AVX512_SUBTRACT_FLOATS(X,Y) _mm512_sub_ps(X,Y) | |
#define AVX512_MASK_SUBTRACT_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_sub_ps(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_SUBTRACT_FLOATS(MASK, X,Y) _mm512_maskz_sub_ps(MASK, X,Y) | |
/* returns Z + XY*/ | |
#define AVX512_FMA_ADD_FLOATS(X,Y,Z) _mm512_fmadd_ps(X,Y,Z) | |
#define AVX512_MASK_FMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_mask_fmadd_ps(X, MASK, Y, Z) | |
#define AVX512_MASKZ_FMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_maskz_fmadd_ps(MASK, X, Y, Z) | |
/* returns Z - XY*/ | |
#define AVX512_FNMA_ADD_FLOATS(X, Y, Z) _mm512_fnmadd_ps(X, Y, Z) | |
#define AVX512_MASK_FNMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_mask_fnmadd_ps(X, MASK, Y, Z) | |
#define AVX512_MASKZ_FNMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_maskz_fnmadd_ps(MASK, X, Y, Z) | |
/* returns XY - Z */ | |
#define AVX512_FMA_SUBTRACT_FLOATS(X,Y,Z) _mm512_fmsub_ps(X,Y,Z) | |
#define AVX512_MASK_FMA_SUBTRACT_FLOATS(X, MASK, Y, Z) _mm512_mask_fmsub_ps(X, MASK, Y, Z) | |
#define AVX512_MASKZ_FMA_SUBTRACT_FLOATS(X, MASK, Y, Z) _mm512_maskz_fmsub_ps(MASK, X, Y, Z) | |
#ifdef __INTEL_COMPILER | |
#define AVX512_HORIZONTAL_SUM_FLOATS(X) _mm512_reduce_add_ps(X) | |
#define AVX512_MASK_HORIZONTAL_SUM_FLOATS(MASK, X) _mm512_mask_reduce_add_ps(MASK, X) | |
#else | |
#define AVX512_HORIZONTAL_SUM_FLOATS(X) _horizontal_sum_floats(X) | |
#define AVX512_MASK_HORIZONTAL_SUM_FLOATS(MASK, X) _horizontal_mask_sum_floats(X) | |
#endif | |
#define AVX512_SQRT_FLOAT(X) _mm512_sqrt_ps(X) | |
#define AVX512_MASK_SQRT_FLOAT(FALSEVALS, MASK, X) _mm512_mask_sqrt_ps(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_SQRT_FLOAT(MASK, X) _mm512_maskz_sqrt_ps(MASK, X) | |
#define AVX512_SVML_SQRT_FLOAT(X) _mm512_svml_sqrt_ps(X) | |
#define AVX512_TRUNCATE_FLOAT_TO_INT(X) _mm512_cvttps_epi32(X) | |
#define AVX512_STORE_FLOATS_TO_MEMORY(X,Y) _mm512_storeu_ps(X,Y) | |
#define AVX512_SQUARE_FLOAT(X) _mm512_mul_ps(X,X) | |
#define AVX512_LOG_FLOAT(X) _mm512_log_ps(X) | |
#define AVX512_LOG10_FLOAT(X) _mm512_log10_ps(X) | |
#define AVX512_LOG2_FLOAT(X) _mm512_log2_ps(X) | |
#define AVX512_RECIPROCAL_FLOATS(X) _mm512_rcp14_ps(X) | |
#define AVX512_MASK_RECIPROCAL_FLOATS(FALSEVALS, MASK, X) _mm512_mask_rcp14_ps(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_RECIPROCAL_FLOATS(MASK, X) _mm512_maskz_rcp14_ps(MASK, X) | |
#define AVX512_SET_FLOAT(X) _mm512_set1_ps(X) | |
// X OP Y | |
#define AVX512_COMPARE_FLOATS(X, Y, OP) _mm512_cmp_ps_mask(X, Y, OP) | |
//Mask operations (new in AVX512) | |
#define AVX512_MASK_COMPARE_FLOATS(M, X, Y, OP) _mm512_mask_cmp_ps_mask(M, X, Y, OP) | |
#define AVX512_BLEND_FLOATS_WITH_MASK(MASK, FALSE,TRUE) _mm512_mask_blend_ps(MASK, FALSE,TRUE) | |
#define AVX512_BLEND_INTS_WITH_MASK(MASK, FALSE, TRUE) _mm512_mask_blend_epi32(MASK, FALSE, TRUE) | |
//Trig | |
#ifdef __INTEL_COMPILER | |
/* Needs SVML */ | |
#define AVX512_ARC_COSINE(X, order) _mm512_acos_ps(X) | |
#else | |
//Other compilers do not have the vectorized arc-cosine | |
#define AVX512_ARC_COSINE(X, order) inv_cosine_avx512(X, order) | |
#endif | |
//Max | |
#define AVX512_MAX_FLOATS(X,Y) _mm512_max_ps(X,Y) | |
//Absolute value | |
#define AVX512_ABS_FLOAT(X) _mm512_abs_ps(X) | |
//Casting (does not actual convert between types) | |
#define AVX512_CAST_FLOAT_TO_INT(X) _mm512_castps_si512(X) | |
#define AVX512_CAST_INT_TO_FLOAT(X) _mm512_castsi512_ps(X) | |
#else //DOUBLE PRECISION CALCULATIONS | |
#define DOUBLE double | |
#define AVX512_NVEC 8 | |
#define AVX512_MASK __mmask8 | |
#define AVX512_FLOATS __m512d | |
//This is AVX2 and not AVX512F | |
#define AVX512_INTS __m256i | |
#define AVX512_SET_INT(X) _mm256_set1_epi32(X) | |
#define AVX512_SETZERO_INT() _mm256_setzero_si256() | |
#if 0 | |
#define AVX512_ADD_INTS(X, Y) _mm256_add_epi32(X, Y) | |
#define AVX512_MULTIPLY_INTS(X, Y) _mm256_mul_epi32(X, Y) | |
#if defined(__AVX512VL__) | |
#define AVX512_MASK_ADD_INTS(FALSEVALS, MASK, X, Y) _mm256_mask_add_epi32(FALSEVALS, MASK, X, Y) | |
#define AVX512_MASKZ_ADD_INTS(MASK, X, Y) _mm256_maskz_add_epi32(MASK, X, Y) | |
#define AVX512_MASK_MULTIPLY_INTS(FALSEVALS, MASK, X, Y) _mm256_mask_mul_epi32(FALSEVALS, MASK, X, Y) | |
#define AVX512_MASKZ_MULTIPLY_INTS(MASK, X, Y) _mm256_maskz_mul_epi32(MASK, X, Y) | |
#elif defined(__AVX512F__) | |
#define AVX512_MASK_ADD_INTS(FALSEVALS, MASK, X, Y) _mm512_castsi512_si256(_mm512_mask_add_epi32(_mm512_castsi256_si512(FALSEVALS), MASK, _mm512_castsi256_si512(X), _mm512_castsi256_si512(Y))) | |
#define AVX512_MASKZ_ADD_INTS(MASK, X, Y) _mm512_castsi512_si256(_mm512_maskz_add_epi32(MASK, _mm512_castsi256_si512(X), _mm512_castsi256_si512(Y))) | |
#define AVX512_MASK_MULTIPLY_INTS(FALSEVALS, MASK, X, Y) _mm512_castsi512_si256(_mm512_mask_mul_epi32(_mm512_castsi256_si512(FALSEVALS), MASK, _mm512_castsi256_si512(X), _mm512_castsi256_si512(Y))) | |
#define AVX512_MASKZ_MULTIPLY_INTS(MASK, X, Y) _mm512_castsi512_si256(_mm512_maskz_mul_epi32(MASK, _mm512_castsi256_si512(X), _mm512_castsi256_si512(Y))) | |
#endif | |
#endif /* commenting out the int math operations since they are either cumbersome or produce results of different SIMD widths*/ | |
#define AVX512_SETZERO_FLOAT() _mm512_setzero_pd() | |
#define AVX512_LOAD_FLOATS_UNALIGNED(X) _mm512_loadu_pd(X) | |
#define AVX512_MASK_LOAD_FLOATS_UNALIGNED(FALSEVALS, MASK, X) _mm512_mask_loadu_pd(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_LOAD_FLOATS_UNALIGNED(MASK, X) _mm512_maskz_loadu_pd(MASK, X) | |
#define AVX512_LOAD_FLOATS_ALIGNED(X) _mm512_load_pd(X) | |
#define AVX512_MASK_LOAD_FLOATS_ALIGNED(FALSEVALS, MASK, X) _mm512_mask_load_pd(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_LOAD_FLOATS_ALIGNED(MASK, X) _mm512_maskz_load_pd(MASK, X) | |
#define AVX512_MULTIPLY_FLOATS(X,Y) _mm512_mul_pd(X,Y) | |
#define AVX512_MASK_MULTIPLY_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_mul_pd(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_MULTIPLY_FLOATS(MASK, X,Y) _mm512_maskz_mul_pd(MASK, X,Y) | |
#define AVX512_DIVIDE_FLOATS(X,Y) _mm512_div_pd(X,Y) | |
#define AVX512_MASK_DIVIDE_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_div_pd(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_DIVIDE_FLOATS(MASK, X,Y) _mm512_maskz_div_pd(MASK, X,Y) | |
#define AVX512_ADD_FLOATS(X,Y) _mm512_add_pd(X,Y) | |
#define AVX512_MASK_ADD_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_add_pd(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_ADD_FLOATS(MASK, X,Y) _mm512_maskz_add_pd(MASK, X,Y) | |
#define AVX512_SUBTRACT_FLOATS(X,Y) _mm512_sub_pd(X,Y) | |
#define AVX512_MASK_SUBTRACT_FLOATS(FALSEVALS, MASK, X,Y) _mm512_mask_sub_pd(FALSEVALS, MASK, X,Y) | |
#define AVX512_MASKZ_SUBTRACT_FLOATS(MASK, X,Y) _mm512_maskz_sub_pd(MASK, X,Y) | |
/* returns Z + XY*/ | |
#define AVX512_FMA_ADD_FLOATS(X,Y,Z) _mm512_fmadd_pd(X,Y,Z) | |
#define AVX512_MASK_FMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_mask_fmadd_pd(X, MASK, Y, Z) | |
#define AVX512_MASKZ_FMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_maskz_fmadd_pd(MASK, X, Y, Z) | |
/* returns Z - XY*/ | |
#define AVX512_FNMA_ADD_FLOATS(X, Y, Z) _mm512_fnmadd_pd(X, Y, Z) | |
#define AVX512_MASK_FNMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_mask_fnmadd_pd(X, MASK, Y, Z) | |
#define AVX512_MASKZ_FNMA_ADD_FLOATS(X, MASK, Y, Z) _mm512_maskz_fnmadd_pd(MASK, X, Y, Z) | |
/* returns XY - Z */ | |
#define AVX512_FMA_SUBTRACT_FLOATS(X,Y,Z) _mm512_fmsub_pd(X,Y,Z) | |
#define AVX512_MASK_FMA_SUBTRACT_FLOATS(X, MASK, Y, Z) _mm512_mask_fmsub_pd(X, MASK, Y, Z) | |
#define AVX512_MASKZ_FMA_SUBTRACT_FLOATS(X, MASK, Y, Z) _mm512_maskz_fmsub_pd(MASK, X, Y, Z) | |
#ifdef __INTEL_COMPILER | |
#define AVX512_HORIZONTAL_SUM_FLOATS(X) _mm512_reduce_add_pd(X) | |
#define AVX512_MASK_HORIZONTAL_SUM_FLOATS(MASK, X) _mm512_mask_reduce_add_pd(MASK, X) | |
#endif | |
#define AVX512_SQRT_FLOAT(X) _mm512_sqrt_pd(X) | |
#define AVX512_MASK_SQRT_FLOAT(FALSEVALS, MASK, X) _mm512_mask_sqrt_pd(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_SQRT_FLOAT(MASK, X) _mm512_maskz_sqrt_pd(MASK, X) | |
#define AVX512_SVML_SQRT_FLOAT(X) _mm512_svml_sqrt_pd(X) | |
#define AVX512_TRUNCATE_FLOAT_TO_INT(X) _mm512_cvttpd_epi32(X) | |
#define AVX512_STORE_FLOATS_TO_MEMORY(X,Y) _mm512_storeu_pd(X,Y) | |
#define AVX512_SQUARE_FLOAT(X) _mm512_mul_pd(X,X) | |
#define AVX512_LOG_FLOAT(X) _mm512_log_pd(X) | |
#define AVX512_LOG2_FLOAT(X) _mm512_log2_pd(X) | |
#define AVX512_LOG10_FLOAT(X) _mm512_log10_pd(X) | |
#define AVX512_RECIPROCAL_FLOATS(X) _mm512_rcp14_pd(X) | |
#define AVX512_MASK_RECIPROCAL_FLOATS(FALSEVALS, MASK, X) _mm512_mask_rcp14_pd(FALSEVALS, MASK, X) | |
#define AVX512_MASKZ_RECIPROCAL_FLOATS(MASK, X) _mm512_maskz_rcp14_pd(MASK, X) | |
// X OP Y | |
#define AVX512_COMPARE_FLOATS(X, Y, OP) _mm512_cmp_pd_mask(X, Y, OP) | |
#define AVX512_MASK_COMPARE_FLOATS(M, X, Y, OP) _mm512_mask_cmp_pd_mask(M, X, Y, OP) | |
#define AVX512_SET_FLOAT(X) _mm512_set1_pd(X) | |
#define AVX512_BLEND_FLOATS_WITH_MASK(MASK, FALSEVALUE,TRUEVALUE) _mm512_mask_blend_pd(MASK, FALSEVALUE,TRUEVALUE) | |
#if defined(__AVX512VL__) | |
#define AVX512_BLEND_INTS_WITH_MASK(MASK, FALSE,TRUE) _mm256_mask_blend_epi32(MASK, FALSE,TRUE) | |
#elif defined(__AVX2__) | |
#define AVX512_BLEND_INTS_WITH_MASK(MASK, FALSE,TRUE) _mm256_blend_epi32(FALSE, TRUE, MASK)//AVX2 | |
#else | |
#define AVX512_BLEND_INTS_WITH_MASK(MASK, FALSE, TRUE) _blend_epi32_with_mask(MASK, FALSE, TRUE) | |
#endif | |
//Trig | |
#ifdef __INTEL_COMPILER | |
#define AVX512_ARC_COSINE(X, order) _mm512_acos_pd(X) | |
#else | |
#define AVX512_ARC_COSINE(X, order) inv_cosine_avx512(X, order) | |
#endif | |
//Max | |
#define AVX512_MAX_FLOATS(X,Y) _mm512_max_pd(X,Y) | |
//Absolute value | |
#define AVX512_ABS_FLOAT(X) _mm512_abs_pd(X) | |
//Casting (does not actual convert between types) | |
#define AVX512_CAST_FLOAT_TO_INT(X) _mm512_castpd_si512(X) | |
#define AVX512_CAST_INT_TO_FLOAT(X) _mm512_castsi512_pd(_mm512_castsi256_si512(X)) | |
#endif //DOUBLE_PREC | |
#ifndef __INTEL_COMPILER | |
#include "fast_acos.h" | |
static inline AVX512_FLOATS inv_cosine_avx512(const AVX512_FLOATS X, const int order) | |
{ | |
union cos{ | |
AVX512_FLOATS m; | |
DOUBLE x[AVX512_NVEC]; | |
}; | |
union cos union_costheta; | |
union cos union_returnvalue; | |
union_costheta.m = X; | |
const DOUBLE minus_one = (DOUBLE) -1.0; | |
const DOUBLE one = (DOUBLE) 1.0; | |
//Force everything to be in range [0,1] | |
for(int ii=0;ii<AVX512_NVEC;ii++) { | |
const DOUBLE costheta = union_costheta.x[ii]; | |
union_costheta.x[ii] = costheta <= minus_one ? minus_one:costheta; | |
union_costheta.x[ii] = costheta >= one ? one:costheta; | |
} | |
if(order == 0) { | |
for(int ii=0;ii<AVX512_NVEC;ii++) { | |
const DOUBLE costheta = union_costheta.x[ii]; | |
union_returnvalue.x[ii] = ACOS(costheta); | |
} | |
} else { | |
//fast acos | |
/*Taken from associated C++ code in http://www.geometrictools.com/GTEngine/Include/Mathematics/GteACosEstimate.h*/ | |
for(int ii=0;ii<AVX512_NVEC;ii++) { | |
union_returnvalue.x[ii] = FAST_ACOS(union_costheta.x[ii]); | |
} | |
} | |
return union_returnvalue.m; | |
} | |
#endif | |
extern const int64_t bits_set_in_avx512_mask_float[]; | |
extern const uint16_t masks_per_misalignment_value_float[]; | |
extern const int64_t bits_set_in_avx512_mask_double[]; | |
extern const uint8_t masks_per_misalignment_value_double[]; | |
union int16 { | |
AVX512_INTS m_ibin; | |
int ibin[AVX512_NVEC]; | |
}; | |
union float16{ | |
AVX512_FLOATS m_Dperp; | |
DOUBLE Dperp[AVX512_NVEC]; | |
}; | |
union float16_weights{ | |
AVX512_FLOATS m_weights; | |
DOUBLE weights[AVX512_NVEC]; | |
}; | |
#define CHECK_AND_FAST_DIVIDE(result, numerator, denominator, mask, num_nr_steps) { \ | |
int _ii; \ | |
if (fast_divide == 0) { \ | |
result = AVX512_DIVIDE_FLOATS(numerator, denominator); \ | |
/* The divide is the actual operation we need */ \ | |
/* but divides are about 10x slower than multiplies. So, I am replacing it */ \ | |
/* with a approximate reciprocal in floating point */ \ | |
/* + 2 iterations of newton-raphson in case of DOUBLE */ \ | |
} else { \ | |
/* following blocks do an approximate reciprocal followed by two iterations of Newton-Raphson */ \ | |
const AVX512_FLOATS rc = AVX512_MASKZ_RECIPROCAL_FLOATS(mask, denominator); \ | |
/* We have the double->float->approx. reciprocal->double process done. */ \ | |
/* Now improve the accuracy of the divide with newton-raphson. */ \ | |
/* Ist iteration of NewtonRaphson */ \ | |
const AVX512_FLOATS two = AVX512_SET_FLOAT((DOUBLE) 2.0); \ | |
AVX512_FLOATS rc_iter = rc; \ | |
for(_ii=0;ii<num_nr_steps;_ii++) { \ | |
rc_iter = AVX512_MULTIPLY_FLOATS(rc_iter, \ | |
AVX512_FNMA_ADD_FLOATS(denominator, rc_iter, two));/*2.0 - l^2*rc */ \ | |
} \ | |
result = AVX512_MASKZ_MULTIPLY_FLOATS(mask, numerator, rc_iter); \ | |
} /* end of FAST_DIVIDE */ \ | |
} | |
#ifdef __cplusplus | |
} | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
for(int64_t i=0;i<N0;i++) { | |
const AVX512_FLOATS m_xpos = AVX512_SET_FLOAT(*x0++); | |
const AVX512_FLOATS m_ypos = AVX512_SET_FLOAT(*y0++); | |
const AVX512_FLOATS m_zpos = AVX512_SET_FLOAT(*z0++); | |
DOUBLE *localx1 = x1, *localy1 = y1, *localz1 = z1; | |
for(int64_t j=0;j<N1;j++) { | |
AVX512_MASK m_mask_left = (N1 - j) >= AVX512_NVEC ? ~0:masks_per_misalignment_value_DOUBLE[N1-j]; | |
const AVX512_FLOATS m_x1 = AVX512_MASKZ_LOAD_FLOATS_UNALIGNED(m_mask_left, localx1); | |
const AVX512_FLOATS m_y1 = AVX512_MASKZ_LOAD_FLOATS_UNALIGNED(m_mask_left, localy1); | |
const AVX512_FLOATS m_z1 = AVX512_MASKZ_LOAD_FLOATS_UNALIGNED(m_mask_left, localz1); | |
/* this might actually exceed the allocated range but we will never dereference that */ | |
localx1 += AVX512_NVEC; | |
localy1 += AVX512_NVEC; | |
localz1 += AVX512_NVEC; | |
const AVX512_FLOATS m_xdiff = AVX512_SUBTRACT_FLOATS(m_x1, m_xpos); //(x[j:j+NVEC-1] - x0) | |
const AVX512_FLOATS m_ydiff = AVX512_SUBTRACT_FLOATS(m_y1, m_ypos); //(y[j:j+NVEC-1] - y0) | |
const AVX512_FLOATS m_zdiff = AVX512_SUBTRACT_FLOATS(m_z1, m_zpos); //(z[j:j+NVEC-1] - z1 | |
const AVX512_FLOATS m_sqr_xdiff = AVX512_SQUARE_FLOAT(m_xdiff); //(x0 - x[j])^2 | |
const AVX512_FLOATS x2py2 = AVX512_FMA_ADD_FLOATS(m_ydiff, m_ydiff, m_sqr_xdiff);/* dy*dy + dx^2*/ | |
const AVX512_FLOATS r2 = AVX512_FMA_ADD_FLOATS(m_zdiff, m_zdiff, x2py2);/* dz*dz + (dy^2 + dx^2)*/ | |
const AVX512_MASK m_rpmax_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_pimax, r2, m_sqr_rpmax, _CMP_LT_OQ); | |
/* Create a combined mask */ | |
/* This gives us the mask for all sqr_rpmin <= r2 < sqr_rpmax */ | |
m_mask_left = AVX512_MASK_COMPARE_FLOATS(m_rpmax_mask, r2, m_sqr_rpmin, _CMP_GE_OQ); | |
if(m_mask_left == 0) { | |
continue; | |
} | |
/* Loop backwards through nbins. m_mask_left contains all the points that */ | |
/* are less than rpmax at the beginning of the loop. */ | |
for(int kbin=nbin-1;kbin>=1;kbin--) { | |
const AVX512_MASK m_bin_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_left, r2,m_rupp_sqr[kbin-1],_CMP_GE_OS); | |
npairs[kbin] += bits_set_in_avx512_mask_DOUBLE[m_bin_mask]; | |
/* ANDNOT(X, Y) -> NOT X AND Y */ | |
m_mask_left = AVX512_MASK_BITWISE_AND_NOT(m_bin_mask, m_mask_left); | |
if(m_mask_left == 0) { | |
break; | |
} | |
} | |
} | |
} |
The avx512_calls.[ch]
files exist to using the same simple_avx512.c
file for both double
and float
arrays.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A very simple AVX512F kernel for Python in Astronomy 2018