Code for fast, accurate float summation
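The listings below compare a naive single-precision loop against compensated (Kahan/Neumaier) and vectorized variants. As a standalone illustration of why this matters (hypothetical, not one of the gist's files; the exact amount of drift depends on the platform), a single float accumulator visibly loses low-order bits once the running sum grows large relative to each addend:

// Standalone illustration (hypothetical): a single-precision accumulator
// drifts once the running sum is large compared to the values being added.
#include <cstdio>

int main() {
    float fsum = 0.f;
    double dsum = 0.0;
    for (int i = 0; i < 10'000'000; i++) {
        fsum += 0.1f;  // float accumulator loses low-order bits of each addend
        dsum += 0.1f;  // double accumulator keeps them at this input size
    }
    // The exact answer is 10'000'000 * 0.1f, roughly 1,000,000; the float
    // accumulator typically ends up far from it.
    std::printf("float accumulator:  %.3f\n", fsum);
    std::printf("double accumulator: %.3f\n", dsum);
}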
#include <cstddef>
#include <cmath>

#ifdef __FAST_MATH__
#error Compensated summation is unsafe with -ffast-math (/fp:fast)
#endif

// Kahan compensated addition: c carries the (negated) rounding error.
inline static void kadd(double& sum, double& c, double y) {
    y -= c;
    auto t = sum + y;
    c = (t - sum) - y;
    sum = t;
}

double accurate(const float* values, std::size_t len) {
    double sum = 0, c = 0;
    for (const float* const end = values + len; values < end; values++) {
        auto y = static_cast<double>(*values);
        kadd(sum, c, y);
    }
    return sum - c;  // c holds the negated lost low-order part
}
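A minimal driver for the function above (hypothetical; the gist doesn't ship one) that contrasts it with a plain single-precision loop:

// Hypothetical usage sketch for accurate() above.
#include <cstddef>
#include <iostream>
#include <vector>

double accurate(const float* values, std::size_t len);  // defined above

int main() {
    std::vector<float> data(1'000'000, 0.1f);
    float plain = 0.f;
    for (float v : data) plain += v;  // single-precision accumulator for comparison
    std::cout.precision(17);
    std::cout << "plain float sum:  " << plain << "\n"
              << "Kahan double sum: " << accurate(data.data(), data.size()) << "\n";
}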
#include <cstddef>
#include <cstdint>
#include <cmath>  // std::abs(double) for Neumaier summation
#if defined(_MSC_VER)
#  include <intrin.h>
#elif defined(__GNUC__)
#  include <x86intrin.h>
#endif

#ifdef __FAST_MATH__
#error Compensated summation is unsafe with -ffast-math (/fp:fast)
#endif

using std::size_t;
using std::uint32_t;
using std::uint8_t;
using std::uintptr_t;
enum class Method { Kahan, Neumaier, LongDouble };

// Kahan summation
inline static void ksum(double& sum, double& c, double y) {
    y -= c;
    auto t = sum + y;
    c = (t - sum) - y;
    sum = t;
}

inline static void ksum(__m256d& sumV, __m256d& cV, __m256d y) {
    y = _mm256_sub_pd(y, cV);
    __m256d t = _mm256_add_pd(sumV, y);
    cV = _mm256_sub_pd(_mm256_sub_pd(t, sumV), y);
    sumV = t;
}

#ifdef __AVX512F__
inline static void ksum(__m512d& sumV, __m512d& cV, __m512d y) {
    y = _mm512_sub_pd(y, cV);
    __m512d t = _mm512_add_pd(sumV, y);
    cV = _mm512_sub_pd(_mm512_sub_pd(t, sumV), y);
    sumV = t;
}
#endif
// Neumaier summation
inline static void nsum(double& sum, double& c, double y) {
    auto t = sum + y;
    auto sabs = std::abs(sum);
    auto yabs = std::abs(y);
    auto t1 = sabs >= yabs ? sum : y;  // larger-magnitude operand
    auto t3 = sabs >= yabs ? y : sum;  // smaller-magnitude operand
    c += (t1 - t) + t3;
    sum = t;
}

inline static void nsum(__m256d& sumV, __m256d& cV, __m256d y) {
    const __m256d NOT_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
    __m256d t = _mm256_add_pd(sumV, y);
    __m256d sabs = _mm256_and_pd(sumV, NOT_SIGN_BIT);
    __m256d yabs = _mm256_and_pd(y, NOT_SIGN_BIT);
    __m256d mask = _mm256_cmp_pd(sabs, yabs, _CMP_GE_OQ);
    // blendv selects its second operand where the mask is set, so sumV must be
    // the second operand for t1 (the larger-magnitude term) when |sum| >= |y|.
    __m256d t1 = _mm256_blendv_pd(y, sumV, mask);  // larger-magnitude operand
    __m256d t3 = _mm256_blendv_pd(sumV, y, mask);  // smaller-magnitude operand
    cV = _mm256_add_pd(cV, _mm256_add_pd(_mm256_sub_pd(t1, t), t3));
    sumV = t;
}

#ifdef __AVX512F__
inline static void nsum(__m512d& sumV, __m512d& cV, __m512d y) {
    const __m512d NOT_SIGN_BIT = _mm512_castsi512_pd(_mm512_set1_epi64(0x7FFF'FFFF'FFFF'FFFF));
    __m512d t = _mm512_add_pd(sumV, y);
    __m512d sabs = _mm512_and_pd(sumV, NOT_SIGN_BIT);
    __m512d yabs = _mm512_and_pd(y, NOT_SIGN_BIT);
    __mmask8 mask = _mm512_cmp_pd_mask(sabs, yabs, _CMP_GE_OQ);
    // mask_blend selects its second operand where the mask is set (|sum| >= |y|).
    __m512d t1 = _mm512_mask_blend_pd(mask, y, sumV);  // larger-magnitude operand
    __m512d t3 = _mm512_mask_blend_pd(mask, sumV, y);  // smaller-magnitude operand
    cV = _mm512_add_pd(cV, _mm512_add_pd(_mm512_sub_pd(t1, t), t3));
    sumV = t;
}
#endif
// Loads the first N floats starting at p.
inline static __m128 mm_load_partial_ps(const float* p, size_t N) {
    // This uses BMI2, which doesn't exist in Intel Sandy/Ivy Bridge or AMD
    // Jaguar/Puma/Bulldozer/Piledriver/Steamroller, but does exist in all other
    // AVX-supporting CPUs.
    uint32_t k1 = _bzhi_u32(-1, N);   // set N low bits
    k1 = _pdep_u32(k1, 0x80808080);   // deposit the set bits into high bit of each byte
    __m128i k2 = _mm_set1_epi32(k1);  // broadcast to vector
    k2 = _mm_cvtepi8_epi32(k2);       // expand 8-bit els into 32-bit els
    return _mm_maskload_ps(p, k2);
}

#ifdef __AVX512F__
inline static __m256 mm256_load_partial_ps(const float* p, size_t N) {
    uint8_t k1 = _bzhi_u32(0xFF, N);  // set N low bits
    return _mm256_maskz_load_ps(k1, p);
}
#endif

inline static void zeroVecArr(__m256d* r, size_t N) {
    for (size_t i = 0; i < N; i++) r[i] = _mm256_setzero_pd();
}

#ifdef __AVX512F__
inline static void zeroVecArr(__m512d* r, size_t N) {
    for (size_t i = 0; i < N; i++) r[i] = _mm512_setzero_pd();
}
#endif
/******************************************************************************/

// One single-precision accumulator.
float naive(const float* values, size_t len) {
    float sum = 0.f;
    for (const float* const end = values + len; values < end; values++) {
        sum += *values;
    }
    return sum;
}

// Variable number of single-precision accumulators.
template<size_t NACC>
float fast(const float* values, size_t len) {
    float sums[NACC] = {};
    const float* const end = values + len;
    while (values < end - NACC) {
        for (size_t i = 0; i < NACC; i++) {
            sums[i] += *values++;
        }
    }
    float sum = 0.f;
    while (values < end)
        sum += *values++;
    for (size_t i = 0; i < NACC; i++)
        sum += sums[i];
    return sum;
}
// Single accumulator with Kahan or Neumaier summation or LongDouble
// intermediate precision.
template <Method M>
double accurate(const float* values, size_t len) {
    double sum = 0, c = 0;
    long double ldsum = 0;
    for (const float* const end = values + len; values < end; values++) {
        auto y = static_cast<double>(*values);
        if (M == Method::Kahan)
            ksum(sum, c, y);
        else if (M == Method::Neumaier)
            nsum(sum, c, y);
        else
            ldsum += y;
    }
    if (M == Method::Neumaier) sum += c;
    if (M == Method::LongDouble) return ldsum;
    return sum;
}
// Variable number of accumulators with Kahan or Neumaier summation, AVX+BMI2.
template<Method M, size_t NACC>
double fastAccurate(const float* values, size_t len) {
    constexpr size_t ELS_PER_VEC = 4;  // 4 doubles per 256b vec
    const float* const end = values + len;
#define SUM(a, b, c) (M == Method::Kahan ? ksum(a, b, c) : nsum(a, b, c))
    double sum = 0., c = 0.;
    // Align to 16B boundary
    while (reinterpret_cast<uintptr_t>(values) % 16 && values < end) {
        SUM(sum, c, *values);
        values++;
    }
    __m256d sumV[NACC], cV[NACC];
    zeroVecArr(sumV, NACC);
    zeroVecArr(cV, NACC);
    // Continue the compensation from the alignment loop.
    sumV[0] = _mm256_setr_pd(sum, 0., 0., 0.);
    cV[0] = _mm256_setr_pd(c, 0., 0., 0.);
    // Main vectorized loop:
    while (values < end - NACC * ELS_PER_VEC) {
        for (size_t i = 0; i < NACC; i++) {
            __m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
            SUM(sumV[i], cV[i], y);
            values += ELS_PER_VEC;
        }
    }
    // Up to NACC * ELS_PER_VEC values remain.
    while (values < end - ELS_PER_VEC) {
        __m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
        SUM(sumV[0], cV[0], y);
        values += ELS_PER_VEC;
    }
    // Up to ELS_PER_VEC values remain. Use masked loads.
    __m256d y = _mm256_cvtps_pd(mm_load_partial_ps(values, end - values));
    SUM(sumV[0], cV[0], y);
    // Fold the accumulators together.
    for (size_t i = 1; i < NACC; i++) {
        if (M == Method::Neumaier) {
            sumV[i] = _mm256_add_pd(sumV[i], cV[i]);
        }
        SUM(sumV[0], cV[0], sumV[i]);
    }
    if (M == Method::Neumaier)
        sumV[0] = _mm256_add_pd(sumV[0], cV[0]);
    // Horizontally add the elements of sumV[0].
    // (Consider using compensated summation here, but we can probably assume that
    // all of our accumulators are similar magnitude at this point.)
    __m128d lo = _mm256_castpd256_pd128(sumV[0]);
    __m128d hi = _mm256_extractf128_pd(sumV[0], 1);
    lo = _mm_add_pd(lo, hi);                     // 0+2, 1+3
    __m128d hi64 = _mm_unpackhi_pd(lo, lo);
    return _mm_cvtsd_f64(_mm_add_sd(lo, hi64));  // 0+2+1+3
#undef SUM
}
#ifdef __AVX512F__
// Variable number of accumulators with Kahan or Neumaier summation, AVX-512.
template<Method M, size_t NACC>
double fastAccurateAVX512(const float* values, size_t len) {
    constexpr size_t ELS_PER_VEC = 8;  // 8 doubles per 512b vec
    const float* const end = values + len;
#define SUM(a, b, c) (M == Method::Kahan ? ksum(a, b, c) : nsum(a, b, c))
    double sum = 0., c = 0.;
    // Align to 32B boundary
    while (reinterpret_cast<uintptr_t>(values) % 32 && values < end) {
        SUM(sum, c, *values);
        values++;
    }
    __m512d sumV[NACC], cV[NACC];
    zeroVecArr(sumV, NACC);
    zeroVecArr(cV, NACC);
    // Continue the compensation from the alignment loop.
    sumV[0] = _mm512_setr_pd(sum, 0., 0., 0., 0., 0., 0., 0.);
    cV[0] = _mm512_setr_pd(c, 0., 0., 0., 0., 0., 0., 0.);
    // Main vectorized loop:
    while (values < end - NACC * ELS_PER_VEC) {
        for (size_t i = 0; i < NACC; i++) {
            __m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
            SUM(sumV[i], cV[i], y);
            values += ELS_PER_VEC;
        }
    }
    // Up to NACC * ELS_PER_VEC values remain.
    while (values < end - ELS_PER_VEC) {
        __m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
        SUM(sumV[0], cV[0], y);
        values += ELS_PER_VEC;
    }
    // Up to ELS_PER_VEC values remain.
    __m512d y = _mm512_cvtps_pd(mm256_load_partial_ps(values, end - values));
    SUM(sumV[0], cV[0], y);
    // Fold the accumulators together.
    for (size_t i = 1; i < NACC; i++) {
        if (M == Method::Neumaier) {
            sumV[i] = _mm512_add_pd(sumV[i], cV[i]);
        }
        SUM(sumV[0], cV[0], sumV[i]);
    }
    if (M == Method::Neumaier)
        sumV[0] = _mm512_add_pd(sumV[0], cV[0]);
    // Horizontally add the elements of sumV[0].
    // (Consider using compensated summation here, but we can probably assume that
    // all of our accumulators are similar magnitude at this point.)
    // (Note: this intrinsic doesn't map to a single instruction.)
    return _mm512_reduce_add_pd(sumV[0]);
#undef SUM
}
#endif
/******************************************************************************/

#include <chrono>
#include <iostream>
#include <iomanip>
#include <cstdlib>

int main() {
    // 187,500 floats = 750 KB, which fits in a 1 MB L2 cache.
    constexpr size_t N = 187'500;
    float* data = new float[N];
    std::srand(1);
    for (size_t i = 0; i < N; i++) {
        data[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX / 500'000);
    }

    std::cout << std::setprecision(25);
    std::cout << "naive " << naive(data, N) << std::endl;
    std::cout << "fast " << fast<4>(data, N) << std::endl;
    std::cout << "accurate<Method::Kahan> " << accurate<Method::Kahan>(data, N) << std::endl;
    std::cout << "accurate<Method::Neumaier> " << accurate<Method::Neumaier>(data, N) << std::endl;
    std::cout << "accurate<Method::LongDouble> " << accurate<Method::LongDouble>(data, N) << std::endl;
    std::cout << "fastAccurate<Method::Kahan> " << fastAccurate<Method::Kahan, 4>(data, N) << std::endl;
    std::cout << "fastAccurate<Method::Neumaier> " << fastAccurate<Method::Neumaier, 4>(data, N) << std::endl;
#ifdef __AVX512F__
    std::cout << "fastAccurateAVX512<Method::Kahan> " << fastAccurateAVX512<Method::Kahan, 4>(data, N) << std::endl;
    std::cout << "fastAccurateAVX512<Method::Neumaier> " << fastAccurateAVX512<Method::Neumaier, 4>(data, N) << std::endl;
#endif

    double noelim = 0.0;  // keeps the benchmarked calls from being optimized away
    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
    for (size_t i = 0; i < 2000; i++) {
        // noelim += naive(data, N);
        noelim += fast<9>(data, N);
        // noelim += accurate<Method::Kahan>(data, N);
        // noelim += accurate<Method::Neumaier>(data, N);
        // noelim += accurate<Method::LongDouble>(data, N);
        // noelim += fastAccurate<Method::Kahan, 3>(data, N);
        // noelim += fastAccurate<Method::Neumaier, 10>(data, N);
        // noelim += fastAccurateAVX512<Method::Kahan, 10>(data, N);
        // noelim += fastAccurateAVX512<Method::Neumaier, 2>(data, N);
    }
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
    std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[µs]" << std::endl;
    std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() << "[ns]" << std::endl;
    std::cout << noelim << std::endl;

    delete[] data;
}
// 187,500 elements, summed 2000 times; [N] = number of accumulators.
// naive [1]: 403 ms
// naive [2]: 200 ms
// naive [3]: 133 ms
// naive [4]: 100 ms
// naive [5]: 80 ms
// naive [6]: 67 ms
// naive [7]: 57 ms
// naive [8]: 50 ms
// naive [9]: 51 ms
// naive [10]: 46 ms
// scalar Kahan: 1601 ms
// scalar Neumaier: 1302 ms
// scalar LongDouble: 300 ms
// AVX Kahan [1]: 404 ms
// AVX Kahan [2]: 232 ms
// AVX Kahan [3]: 153 ms
// AVX Kahan [4]: 115 ms
// AVX Kahan [6]: 87 ms
// AVX Kahan [8]: 74 ms
// AVX Kahan [10]: 74 ms
// AVX Neumaier [1]: 167 ms
// AVX Neumaier [2]: 157 ms
// AVX Neumaier [3]: 156 ms
// AVX Neumaier [4]: 156 ms
// AVX Neumaier [6]: 160 ms
// AVX Neumaier [8]: 161 ms
// AVX Neumaier [10]: 163 ms
// AVX-512 Kahan [1]: 233 ms
// AVX-512 Kahan [2]: 136 ms
// AVX-512 Kahan [3]: 91 ms
// AVX-512 Kahan [4]: 68 ms
// AVX-512 Kahan [6]: 51 ms
// AVX-512 Kahan [8]: 50 ms
// AVX-512 Kahan [10]: 49 ms
// AVX-512 Neumaier [1]: 95 ms
// AVX-512 Neumaier [2]: 94 ms
// AVX-512 Neumaier [4]: 89 ms
// AVX-512 Neumaier [6]: 91 ms
// AVX-512 Neumaier [8]: 96 ms
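The benchmark above selects the AVX-512 kernels at compile time via __AVX512F__. If a single binary needs to choose a kernel at runtime instead, a sketch along the following lines would work with GCC/Clang. This is an assumption, not part of the gist: __builtin_cpu_supports is a compiler builtin, and the sketch ignores the separate-compilation care needed to keep each kernel's instructions truly isolated from the fallback path.

// Hypothetical runtime dispatcher: pick the widest compensated kernel the CPU
// supports, falling back to the scalar template. Assumes the functions above
// are in scope.
double sumDispatch(const float* values, size_t len) {
#ifdef __AVX512F__
    if (__builtin_cpu_supports("avx512f"))
        return fastAccurateAVX512<Method::Neumaier, 4>(values, len);
#endif
    if (__builtin_cpu_supports("avx") && __builtin_cpu_supports("bmi2"))
        return fastAccurate<Method::Neumaier, 4>(values, len);
    return accurate<Method::Neumaier>(values, len);
}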
#include <cstddef>
#include <cstdint>
#if defined(_MSC_VER)
#  include <intrin.h>
#elif defined(__GNUC__)
#  include <x86intrin.h>
#endif

#ifdef __FAST_MATH__
#error Compensated summation is unsafe with -ffast-math (/fp:fast)
#endif

using std::size_t;
using std::uint32_t;
using std::uint8_t;
using std::uintptr_t;
// Kahan summation
inline static void kadd(double& sum, double& c, double y) {
    y -= c;
    auto t = sum + y;
    c = (t - sum) - y;
    sum = t;
}

inline static void kadd(__m256d& sumV, __m256d& cV, __m256d y) {
    y = _mm256_sub_pd(y, cV);
    __m256d t = _mm256_add_pd(sumV, y);
    cV = _mm256_sub_pd(_mm256_sub_pd(t, sumV), y);
    sumV = t;
}

#ifdef __AVX512F__
inline static void kadd(__m512d& sumV, __m512d& cV, __m512d y) {
    y = _mm512_sub_pd(y, cV);
    __m512d t = _mm512_add_pd(sumV, y);
    cV = _mm512_sub_pd(_mm512_sub_pd(t, sumV), y);
    sumV = t;
}
#endif

// Loads the first N floats starting at p.
inline static __m128 mm_load_partial_ps(const float* p, size_t N) {
    // This uses BMI2, which doesn't exist in Intel Sandy/Ivy Bridge or AMD
    // Jaguar/Puma/Bulldozer/Piledriver/Steamroller, but does exist in all other
    // AVX-supporting CPUs.
    uint32_t k1 = _bzhi_u32(-1, N);   // set N low bits
    k1 = _pdep_u32(k1, 0x80808080);   // deposit the set bits into high bit of each byte
    __m128i k2 = _mm_set1_epi32(k1);  // broadcast to vector
    k2 = _mm_cvtepi8_epi32(k2);       // expand 8-bit els into 32-bit els
    return _mm_maskload_ps(p, k2);
}

#ifdef __AVX512F__
inline static __m256 mm256_load_partial_ps(const float* p, size_t N) {
    uint8_t k1 = _bzhi_u32(0xFF, N);  // set N low bits
    return _mm256_maskz_load_ps(k1, p);
}
#endif

// Zeros r
inline static void zeroVecArr(__m256d* r, size_t N) {
    for (size_t i = 0; i < N; i++) r[i] = _mm256_setzero_pd();
}

#ifdef __AVX512F__
inline static void zeroVecArr(__m512d* r, size_t N) {
    for (size_t i = 0; i < N; i++) r[i] = _mm512_setzero_pd();
}
#endif
double fastAccurate(const float* values, size_t len) {
    constexpr size_t NACC = 8;         // 8 vector accumulators
    constexpr size_t ELS_PER_VEC = 4;  // 4 doubles per 256b vec
    const float* const end = values + len;
    double sum = 0., c = 0.;
    // Align to 16B boundary
    while (reinterpret_cast<uintptr_t>(values) % 16 && values < end) {
        kadd(sum, c, *values);
        values++;
    }
    __m256d sumV[NACC], cV[NACC];
    zeroVecArr(sumV, NACC);
    zeroVecArr(cV, NACC);
    // Continue the compensation from the alignment loop.
    sumV[0] = _mm256_setr_pd(sum, 0., 0., 0.);
    cV[0] = _mm256_setr_pd(c, 0., 0., 0.);
    // Main vectorized loop:
    while (values < end - NACC * ELS_PER_VEC) {
        for (size_t i = 0; i < NACC; i++) {
            __m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
            kadd(sumV[i], cV[i], y);
            values += ELS_PER_VEC;
        }
    }
    // Up to NACC * ELS_PER_VEC values remain.
    while (values < end - ELS_PER_VEC) {
        __m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
        kadd(sumV[0], cV[0], y);
        values += ELS_PER_VEC;
    }
    // Up to ELS_PER_VEC values remain. Use masked loads.
    __m256d y = _mm256_cvtps_pd(mm_load_partial_ps(values, end - values));
    kadd(sumV[0], cV[0], y);
    // Fold the accumulators together.
    for (size_t i = 1; i < NACC; i++)
        kadd(sumV[0], cV[0], sumV[i]);
    // Horizontally add the elements of sumV[0].
    // (Consider using compensated summation here, but we can probably assume that
    // all of our accumulators are similar magnitude at this point.)
    __m128d lo = _mm256_castpd256_pd128(sumV[0]);
    __m128d hi = _mm256_extractf128_pd(sumV[0], 1);
    lo = _mm_add_pd(lo, hi);                     // 0+2, 1+3
    __m128d hi64 = _mm_unpackhi_pd(lo, lo);
    return _mm_cvtsd_f64(_mm_add_sd(lo, hi64));  // 0+2+1+3
}
#ifdef __AVX512F__
double fastAccurateAVX512(const float* values, size_t len) {
    constexpr size_t NACC = 8;         // 8 vector accumulators
    constexpr size_t ELS_PER_VEC = 8;  // 8 doubles per 512b vec
    const float* const end = values + len;
    double sum = 0., c = 0.;
    // Align to 32B boundary
    while (reinterpret_cast<uintptr_t>(values) % 32 && values < end) {
        kadd(sum, c, *values);
        values++;
    }
    __m512d sumV[NACC], cV[NACC];
    zeroVecArr(sumV, NACC);
    zeroVecArr(cV, NACC);
    // Continue the compensation from the alignment loop.
    sumV[0] = _mm512_setr_pd(sum, 0., 0., 0., 0., 0., 0., 0.);
    cV[0] = _mm512_setr_pd(c, 0., 0., 0., 0., 0., 0., 0.);
    // Main vectorized loop:
    while (values < end - NACC * ELS_PER_VEC) {
        for (size_t i = 0; i < NACC; i++) {
            __m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
            kadd(sumV[i], cV[i], y);
            values += ELS_PER_VEC;
        }
    }
    // Up to NACC * ELS_PER_VEC = 64 values remain.
    while (values < end - ELS_PER_VEC) {
        __m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
        kadd(sumV[0], cV[0], y);
        values += ELS_PER_VEC;
    }
    // Up to ELS_PER_VEC values remain.
    __m512d y = _mm512_cvtps_pd(mm256_load_partial_ps(values, end - values));
    kadd(sumV[0], cV[0], y);
    // Fold the accumulators together.
    for (size_t i = 1; i < NACC; i++)
        kadd(sumV[0], cV[0], sumV[i]);
    // Horizontally add the elements of sumV[0].
    // (Consider using compensated summation here, but we can probably assume that
    // all of our accumulators are similar magnitude at this point.)
    // (Note: this intrinsic doesn't map to a single instruction.)
    return _mm512_reduce_add_pd(sumV[0]);
}
#endif
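A small driver for the Kahan-only translation unit above (hypothetical; the build flags are assumptions, roughly -O2 -mavx -mbmi2, plus -mavx512f to enable the wider path):

// Hypothetical driver for fastAccurate()/fastAccurateAVX512() above.
#include <cstddef>
#include <iostream>
#include <vector>

double fastAccurate(const float* values, std::size_t len);        // defined above
#ifdef __AVX512F__
double fastAccurateAVX512(const float* values, std::size_t len);  // defined above
#endif

int main() {
    std::vector<float> data(1'000'000);
    for (std::size_t i = 0; i < data.size(); i++) data[i] = 0.25f * (i % 8);
    std::cout.precision(17);
    std::cout << "AVX+BMI2 Kahan: " << fastAccurate(data.data(), data.size()) << "\n";
#ifdef __AVX512F__
    std::cout << "AVX-512 Kahan:  " << fastAccurateAVX512(data.data(), data.size()) << "\n";
#endif
}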
#include <cstddef>

float naive(const float* values, std::size_t len) {
    float sum = 0.f;
    for (const float* const end = values + len; values < end; values++) {
        sum += *values;
    }
    return sum;
}
Nice, thanks for making this available. Please specify a licence for this code.