@zbjornson
Last active December 21, 2022 17:32
Code for fast, accurate float summation
#include <cstddef>
#include <cmath>
#ifdef __FAST_MATH__
#error Compensated summation is unsafe with -ffast-math (/fp:fast)
#endif
inline static void kadd(double& sum, double& c, double y) {
y -= c;
auto t = sum + y;
c = (t - sum) - y;
sum = t;
}
double accurate(const float* values, std::size_t len) {
double sum = 0, c = 0;
for (const float* const end = values + len; values < end; values++) {
auto y = static_cast<double>(*values);
kadd(sum, c, y);
}
return sum - c; // c holds the negative of the accumulated rounding error
}
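As a quick illustration of why the compensation matters, here is a minimal usage sketch (an aside, not part of the gist: it assumes it is compiled together with the accurate() function above, and example() is a hypothetical name):

#include <cstdio>
#include <vector>
int example() {
    // 10 million copies of 0.1f: a lone float accumulator drifts visibly,
    // since a float ULP near one million is ~0.06, while the Kahan-compensated
    // double sum stays essentially at the true value (~1,000,000.015).
    std::vector<float> v(10'000'000, 0.1f);
    float plain = 0.f;
    for (float x : v) plain += x;
    double comp = accurate(v.data(), v.size());
    std::printf("plain float sum: %f\ncompensated sum: %f\n", plain, comp);
    return 0;
}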
#include <cmath>   // std::abs(double) in nsum()
#include <cstddef>
#include <cstdint>
#if defined(_MSC_VER)
# include <intrin.h>
#elif defined(__GNUC__)
# include <x86intrin.h>
#endif
#ifdef __FAST_MATH__
#error Compensated summation is unsafe with -ffast-math (/fp:fast)
#endif
using std::size_t;
using std::uint32_t;
enum class Method { Kahan, Neumaier, LongDouble };
// Kahan summation
inline static void ksum(double& sum, double& c, double y) {
y -= c;
auto t = sum + y;
c = (t - sum) - y;
sum = t;
}
inline static void ksum(__m256d& sumV, __m256d& cV, __m256d y) {
y = _mm256_sub_pd(y, cV);
__m256d t = _mm256_add_pd(sumV, y);
cV = _mm256_sub_pd(_mm256_sub_pd(t, sumV), y);
sumV = t;
}
#ifdef __AVX512F__
inline static void ksum(__m512d& sumV, __m512d& cV, __m512d y) {
y = _mm512_sub_pd(y, cV);
__m512d t = _mm512_add_pd(sumV, y);
cV = _mm512_sub_pd(_mm512_sub_pd(t, sumV), y);
sumV = t;
}
#endif
// Neumaier summation
inline static void nsum(double& sum, double& c, double y) {
auto t = sum + y;
auto sabs = std::abs(sum);
auto yabs = std::abs(y);
auto t1 = sabs >= yabs ? sum : y;
auto t3 = sabs >= yabs ? y : sum;
c += (t1 - t) + t3;
sum = t;
}
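// A classic illustration (a hypothetical side demo with plain doubles, not used
// by the benchmark) of where Kahan drops the correction but Neumaier keeps it,
// namely when an addend is far larger than the running sum: the exact sum of
// {1.0, 1e100, 1.0, -1e100} is 2.0; ksum finishes with sum == 0.0, while nsum
// finishes with sum == 0.0 and c == 2.0, so the final sum + c recovers 2.0.
inline static void kahan_vs_neumaier_demo() {
    const double xs[4] = {1.0, 1e100, 1.0, -1e100};
    double ks = 0, kc = 0, ns = 0, nc = 0;
    for (double x : xs) {
        ksum(ks, kc, x);
        nsum(ns, nc, x);
    }
    // ks == 0.0 here; ns + nc == 2.0
    (void)ks; (void)kc; (void)ns; (void)nc;
}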
inline static void nsum(__m256d& sumV, __m256d& cV, __m256d y) {
const __m256d NOT_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
__m256d t = _mm256_add_pd(sumV, y);
__m256d sabs = _mm256_and_pd(sumV, NOT_SIGN_BIT);
__m256d yabs = _mm256_and_pd(y, NOT_SIGN_BIT);
__m256d mask = _mm256_cmp_pd(sabs, yabs, _CMP_GE_OQ);
__m256d t1 = _mm256_blendv_pd(sumV, y, mask);
__m256d t3 = _mm256_blendv_pd(y, sumV, mask);
cV = _mm256_add_pd(cV, _mm256_add_pd(_mm256_sub_pd(t1, t), t3));
sumV = t;
}
#ifdef __AVX512F__
inline static void nsum(__m512d& sumV, __m512d& cV, __m512d y) {
const __m512d NOT_SIGN_BIT = _mm512_castsi512_pd(_mm512_set1_epi64(0x7FFF'FFFF'FFFF'FFFF));
__m512d t = _mm512_add_pd(sumV, y);
__m512d sabs = _mm512_and_pd(sumV, NOT_SIGN_BIT);
__m512d yabs = _mm512_and_pd(y, NOT_SIGN_BIT);
__mmask8 mask = _mm512_cmp_pd_mask(sabs, yabs, _CMP_GE_OQ);
__m512d t1 = _mm512_mask_blend_pd(mask, sumV, y);
__m512d t3 = _mm512_mask_blend_pd(mask, y, sumV);
cV = _mm512_add_pd(cV, _mm512_add_pd(_mm512_sub_pd(t1, t), t3));
sumV = t;
}
#endif
// Loads the first N floats starting at p.
inline static __m128 mm_load_partial_ps(const float* p, size_t N) {
// This uses BMI2, which doesn't exist in Intel Sandy/Ivy Bridge or AMD
// Jaguar/Puma/Bulldozer/Piledriver/Steamroller, but does exist in all other
// AVX-supporting CPUs.
uint32_t k1 = _bzhi_u32(-1, N); // set N low bits
k1 = _pdep_u32(k1, 0x80808080); // deposit the set bits into high bit of each byte
__m128i k2 = _mm_set1_epi32(k1); // broadcast to vector
k2 = _mm_cvtepi8_epi32(k2); // expand 8-bit els into 32-bit els
return _mm_maskload_ps(p, k2);
}
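// Worked example of the mask construction above (an illustrative aside): for
// N == 3, _bzhi_u32(-1, 3) == 0x7 and _pdep_u32(0x7, 0x80808080) == 0x00808080;
// sign-extending those bytes into 32-bit lanes sets the high bit of lanes 0..2,
// so _mm_maskload_ps reads exactly the first three floats and zeroes lane 3.
// A scalar sanity check might look like this (hypothetical helper):
inline static bool mm_load_partial_ps_matches(const float* p, size_t N) {
    alignas(16) float out[4];
    _mm_store_ps(out, mm_load_partial_ps(p, N));
    for (size_t i = 0; i < 4; i++)
        if (out[i] != (i < N ? p[i] : 0.f)) return false;
    return true;
}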
#ifdef __AVX512F__
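// Note: _mm256_maskz_load_ps is a 256-bit opmask load, which also requires
// AVX-512VL (true of mainstream AVX-512 CPUs, but not of the AVX-512F-only
// Knights Landing parts).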
inline static __m256 mm256_load_partial_ps(const float* p, size_t N) {
uint8_t k1 = _bzhi_u32(0xFF, N); // set N low bits
return _mm256_maskz_load_ps(k1, p);
}
#endif
inline static void zeroVecArr(__m256d* r, size_t N) {
for (size_t i = 0; i < N; i++) r[i] = _mm256_setzero_pd();
}
#ifdef __AVX512F__
inline static void zeroVecArr(__m512d* r, size_t N) {
for (size_t i = 0; i < N; i++) r[i] = _mm512_setzero_pd();
}
#endif
/******************************************************************************/
// One single-precision accumulator.
float naive(const float* values, size_t len) {
float sum = 0.f;
for (const float* const end = values + len; values < end; values++) {
sum += *values;
}
return sum;
}
// Variable number of single-precision accumulators.
template<size_t NACC>
float fast(const float* values, size_t len) {
float sums[NACC] = {};
const float* const end = values + len;
while (values < end - NACC) {
for (size_t i = 0; i < NACC; i++) {
sums[i] += *values++;
}
}
float sum = 0.f;
while (values < end)
sum += *values++;
for (size_t i = 0; i < NACC; i++)
sum += sums[i];
return sum;
}
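// The extra accumulators break the loop-carried dependency on a single sum:
// with NACC independent partial sums the floating-point adds can overlap in
// the pipeline, which is where the speedups in the timings further below come
// from. A typical call (hypothetical; any NACC near the FP-add latency, e.g.
// 8, is a reasonable starting point) would be:
//     float s = fast<8>(values, len);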
// Single accumulator with Kahan or Neumaier summation or LongDouble
// intermediate precision.
template <Method M>
double accurate(const float* values, size_t len) {
double sum = 0, c = 0;
long double ldsum = 0;
for (const float* const end = values + len; values < end; values++) {
auto y = static_cast<double>(*values);
if (M == Method::Kahan)
ksum(sum, c, y);
else if (M == Method::Neumaier)
nsum(sum, c, y);
else
ldsum += y;
}
if (M == Method::Neumaier) sum += c;
if (M == Method::LongDouble) return ldsum;
return sum;
}
// Variable number of accumulators with Kahan or Neumaier summation, AVX+BMI2.
template<Method M, size_t NACC>
double fastAccurate(const float* values, size_t len) {
constexpr size_t ELS_PER_VEC = 4; // 4 doubles per 256b vec
const float* const end = values + len;
#define SUM(a, b, c) M == Method::Kahan ? ksum(a, b, c) : nsum(a, b, c);
double sum = 0., c = 0.;
// Align to 16B boundary
while (reinterpret_cast<uintptr_t>(values) % 16 && values < end) {
SUM(sum, c, *values);
values++;
}
__m256d sumV[NACC], cV[NACC];
zeroVecArr(sumV, NACC);
zeroVecArr(cV, NACC);
// Continue the compensation from the alignment loop.
sumV[0] = _mm256_setr_pd(sum, 0., 0., 0.);
cV[0] = _mm256_setr_pd(c, 0., 0., 0.);
// Main vectorized loop:
while (values < end - NACC * ELS_PER_VEC) {
for (size_t i = 0; i < NACC; i++) {
__m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
SUM(sumV[i], cV[i], y);
values += ELS_PER_VEC;
}
}
// Up to NACC * ELS_PER_VEC values remain.
while (values < end - ELS_PER_VEC) {
__m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
SUM(sumV[0], cV[0], y);
values += ELS_PER_VEC;
}
// Up to ELS_PER_VEC values remain. Use masked loads.
__m256d y = _mm256_cvtps_pd(mm_load_partial_ps(values, end - values));
SUM(sumV[0], cV[0], y);
// Fold the accumulators together.
for (size_t i = 1; i < NACC; i++) {
if (M == Method::Neumaier) {
sumV[i] = _mm256_add_pd(sumV[i], cV[i]);
}
SUM(sumV[0], cV[0], sumV[i]);
}
if (M == Method::Neumaier)
sumV[0] = _mm256_add_pd(sumV[0], cV[0]);
// Horizontally add the elements of sumV[0].
// (Consider using compensated summation here, but we can probably assume that
// all of our accumulators are similar magnitude at this point.)
__m128d lo = _mm256_castpd256_pd128(sumV[0]);
__m128d hi = _mm256_extractf128_pd(sumV[0], 1);
lo = _mm_add_pd(lo, hi); // 0+2, 1+3
__m128d hi64 = _mm_unpackhi_pd(lo, lo);
return _mm_cvtsd_f64(_mm_add_sd(lo, hi64)); // 0+2+1+3
#undef SUM
}
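// A consistency check one might add (hypothetical helper, not part of the
// benchmark): every lane runs the same compensation as the scalar version, so
// the two should agree to within a small relative tolerance.
inline static bool fastAccurateAgrees(const float* values, size_t len) {
    double a = accurate<Method::Neumaier>(values, len);
    double b = fastAccurate<Method::Neumaier, 8>(values, len);
    return std::abs(a - b) <= 1e-12 * std::abs(a);
}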
#ifdef __AVX512F__
// Variable number of accumulators with Kahan or Neumaier summation, AVX-512.
template<Method M, size_t NACC>
double fastAccurateAVX512(const float* values, size_t len) {
constexpr size_t ELS_PER_VEC = 8; // 8 doubles per 512b vec
const float* const end = values + len;
#define SUM(a, b, c) M == Method::Kahan ? ksum(a, b, c) : nsum(a, b, c);
double sum = 0., c = 0.;
// Align to 32B boundary
while (reinterpret_cast<uintptr_t>(values) % 32 && values < end) {
SUM(sum, c, *values);
values++;
}
__m512d sumV[NACC], cV[NACC];
zeroVecArr(sumV, NACC);
zeroVecArr(cV, NACC);
// Continue the compensation from the alignment loop.
sumV[0] = _mm512_setr_pd(sum, 0., 0., 0., 0., 0., 0., 0.);
cV[0] = _mm512_setr_pd(c, 0., 0., 0., 0., 0., 0., 0.);
// Main vectorized loop:
while (values < end - NACC * ELS_PER_VEC) {
for (size_t i = 0; i < NACC; i++) {
__m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
SUM(sumV[i], cV[i], y);
values += ELS_PER_VEC;
}
}
// Up to NACC * ELS_PER_VEC values remain.
while (values < end - ELS_PER_VEC) {
__m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
SUM(sumV[0], cV[0], y);
values += ELS_PER_VEC;
}
// Up to ELS_PER_VEC values remain.
__m512d y = _mm512_cvtps_pd(mm256_load_partial_ps(values, end - values));
SUM(sumV[0], cV[0], y);
// Fold the accumulators together.
for (size_t i = 1; i < NACC; i++) {
if (M == Method::Neumaier) {
sumV[i] = _mm512_add_pd(sumV[i], cV[i]);
}
SUM(sumV[0], cV[0], sumV[i]);
}
if (M == Method::Neumaier)
sumV[0] = _mm512_add_pd(sumV[0], cV[0]);
// Horizontally add the elements of sumV[0].
// (Consider using compensated summation here, but we can probably assume that
// all of our accumulators are similar magnitude at this point.)
// (Note: this intrinsic doesn't map to a single instruction.)
return _mm512_reduce_add_pd(sumV[0]);
#undef SUM
}
#endif
/******************************************************************************/
#include <chrono>
#include <iostream>
#include <iomanip>
#include <cstdlib>
int main() {
// 187,500 floats = 750 KB, sized to fit in a 1 MB L2 cache
constexpr size_t N = 187'500;
float* data = new float[N];
std::srand(1);
for (size_t i = 0; i < N; i++) {
data[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX / 500'000);
}
std::cout << std::setprecision(25);
std::cout << "naive " << naive(data, N) << std::endl;
std::cout << "fast " << fast<4>(data, N) << std::endl;
std::cout << "accurate<Method::Kahan> " << accurate<Method::Kahan>(data, N) << std::endl;
std::cout << "accurate<Method::Neumaier> " << accurate<Method::Neumaier>(data, N) << std::endl;
std::cout << "accurate<Method::LongDouble> " << accurate<Method::LongDouble>(data, N) << std::endl;
std::cout << "fastAccurate<Method::Kahan> " << fastAccurate<Method::Kahan, 4>(data, N) << std::endl;
std::cout << "fastAccurate<Method::Neumaier> " << fastAccurate<Method::Neumaier, 4>(data, N) << std::endl;
std::cout << "fastAccurateAVX512<Method::Kahan> " << fastAccurateAVX512<Method::Kahan, 4>(data, N) << std::endl;
std::cout << "fastAccurateAVX512<Method::Neumaier> " << fastAccurateAVX512<Method::Neumaier, 4>(data, N) << std::endl;
double noelim = 0.0;
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
for (size_t i = 0; i < 2000; i++) {
// noelim += naive(data, N);
noelim += fast<9>(data, N);
// noelim += accurate<Method::Kahan>(data, N);
// noelim += accurate<Method::Neumaier>(data, N);
// noelim += accurate<Method::LongDouble>(data, N);
// noelim += fastAccurate<Method::Kahan, 3>(data, N);
// noelim += fastAccurate<Method::Neumaier, 10>(data, N);
// noelim += fastAccurateAVX512<Method::Kahan, 10>(data, N);
// noelim += fastAccurateAVX512<Method::Neumaier, 2>(data, N);
}
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[µs]" << std::endl;
std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::nanoseconds> (end - begin).count() << "[ns]" << std::endl;
std::cout << noelim << std::endl;
}
// 187,500 elements, summed 2000 times
// naive [1]: 403 ms
// naive [2]: 200 ms
// naive [3]: 133 ms
// naive [4]: 100 ms
// naive [5]: 80 ms
// naive [6]: 67 ms
// naive [7]: 57 ms
// naive [8]: 50 ms
// naive [9]: 51 ms
// naive [10]: 46 ms
// scalar Kahan: 1601 ms
// scalar Neumaier: 1302 ms
// scalar LongDouble: 300 ms
// AVX Kahan [1]: 404 ms
// AVX Kahan [2]: 232 ms
// AVX Kahan [3]: 153 ms
// AVX Kahan [4]: 115 ms
// AVX Kahan [6]: 87 ms
// AVX Kahan [8]: 74 ms
// AVX Kahan [10]: 74 ms
// AVX Neumaier [1]: 167 ms
// AVX Neumaier [2]: 157 ms
// AVX Neumaier [3]: 156 ms
// AVX Neumaier [4]: 156 ms
// AVX Neumaier [6]: 160 ms
// AVX Neumaier [8]: 161 ms
// AVX Neumaier [10]: 163 ms
// AVX-512 Kahan [1]: 233 ms
// AVX-512 Kahan [2]: 136 ms
// AVX-512 Kahan [3]: 91 ms
// AVX-512 Kahan [4]: 68 ms
// AVX-512 Kahan [6]: 51 ms
// AVX-512 Kahan [8]: 50 ms
// AVX-512 Kahan [10]: 49 ms
// AVX-512 Neumaier [1]: 95 ms
// AVX-512 Neumaier [2]: 94 ms
// AVX-512 Neumaier [4]: 89 ms
// AVX-512 Neumaier [6]: 91 ms
// AVX-512 Neumaier [8]: 96 ms
#include <cstddef>
#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
#ifdef __FAST_MATH__
#error Compensated summation is unsafe with -ffast-math (/fp:fast)
#endif
using std::size_t;
using std::uint32_t;
// Kahan summation
inline static void kadd(double& sum, double& c, double y) {
y -= c;
auto t = sum + y;
c = (t - sum) - y;
sum = t;
}
inline static void kadd(__m256d& sumV, __m256d& cV, __m256d y) {
y = _mm256_sub_pd(y, cV);
__m256d t = _mm256_add_pd(sumV, y);
cV = _mm256_sub_pd(_mm256_sub_pd(t, sumV), y);
sumV = t;
}
#ifdef __AVX512F__
inline static void kadd(__m512d& sumV, __m512d& cV, __m512d y) {
y = _mm512_sub_pd(y, cV);
__m512d t = _mm512_add_pd(sumV, y);
cV = _mm512_sub_pd(_mm512_sub_pd(t, sumV), y);
sumV = t;
}
#endif
// Loads the first N floats starting at p.
inline static __m128 mm_load_partial_ps(const float* p, size_t N) {
// This uses BMI2, which doesn't exist in Intel Sandy/Ivy Bridge or AMD
// Jaguar/Puma/Bulldozer/Piledriver/Steamroller, but does exist in all other
// AVX-supporting CPUs.
uint32_t k1 = _bzhi_u32(-1, N); // set N low bits
k1 = _pdep_u32(k1, 0x80808080); // deposit the set bits into high bit of each byte
__m128i k2 = _mm_set1_epi32(k1); // broadcast to vector
k2 = _mm_cvtepi8_epi32(k2); // expand 8-bit els into 32-bit els
return _mm_maskload_ps(p, k2);
}
#ifdef __AVX512F__
inline static __m256 mm256_load_partial_ps(const float* p, size_t N) {
uint8_t k1 = _bzhi_u32(0xFF, N); // set N low bits
return _mm256_maskz_load_ps(k1, p);
}
#endif
// Zeros r
inline static void zeroVecArr(__m256d* r, size_t N) {
for (size_t i = 0; i < N; i++) r[i] = _mm256_setzero_pd();
}
#ifdef __AVX512F__
inline static void zeroVecArr(__m512d* r, size_t N) {
for (size_t i = 0; i < N; i++) r[i] = _mm512_setzero_pd();
}
#endif
double fastAccurate(const float* values, size_t len) {
constexpr size_t NACC = 8; // 8 vector accumulators
constexpr size_t ELS_PER_VEC = 4; // 4 doubles per 256b vec
const float* const end = values + len;
double sum = 0., c = 0.;
// Align to 16B boundary
while (reinterpret_cast<uintptr_t>(values) % 16 && values < end) {
kadd(sum, c, *values);
values++;
}
__m256d sumV[NACC], cV[NACC];
zeroVecArr(sumV, NACC);
zeroVecArr(cV, NACC);
// Continue the compensation from the alignment loop.
sumV[0] = _mm256_setr_pd(sum, 0., 0., 0.);
cV[0] = _mm256_setr_pd(c, 0., 0., 0.);
// Main vectorized loop:
while (values < end - NACC * ELS_PER_VEC) {
for (size_t i = 0; i < NACC; i++) {
__m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
kadd(sumV[i], cV[i], y);
values += ELS_PER_VEC;
}
}
// Up to NACC * ELS_PER_VEC values remain.
while (values < end - ELS_PER_VEC) {
__m256d y = _mm256_cvtps_pd(_mm_load_ps(values));
kadd(sumV[0], cV[0], y);
values += ELS_PER_VEC;
}
// Up to ELS_PER_VEC values remain. Use masked loads.
__m256d y = _mm256_cvtps_pd(mm_load_partial_ps(values, end - values));
kadd(sumV[0], cV[0], y);
// Fold the accumulators together.
for (size_t i = 1; i < NACC; i++)
kadd(sumV[0], cV[0], sumV[i]);
// Horizontally add the elements of sumV[0].
// (Consider using compensated summation here, but we can probably assume that
// all of our accumulators are similar magnitude at this point.)
__m128d lo = _mm256_castpd256_pd128(sumV[0]);
__m128d hi = _mm256_extractf128_pd(sumV[0], 1);
lo = _mm_add_pd(lo, hi); // 0+2, 1+3
__m128d hi64 = _mm_unpackhi_pd(lo, lo);
return _mm_cvtsd_f64(_mm_add_sd(lo, hi64)); // 0+2+1+3
}
#ifdef __AVX512F__
double fastAccurateAVX512(const float* values, size_t len) {
constexpr size_t NACC = 8; // 8 vector accumulators
constexpr size_t ELS_PER_VEC = 8; // 8 doubles per 512b vec
const float* const end = values + len;
double sum = 0., c = 0.;
// Align to 32B boundary
while (reinterpret_cast<uintptr_t>(values) % 32 && values < end) {
kadd(sum, c, *values);
values++;
}
__m512d sumV[NACC], cV[NACC];
zeroVecArr(sumV, NACC);
zeroVecArr(cV, NACC);
// Continue the compensation from the alignment loop.
sumV[0] = _mm512_setr_pd(sum, 0., 0., 0., 0., 0., 0., 0.);
cV[0] = _mm512_setr_pd(c, 0., 0., 0., 0., 0., 0., 0.);
// Main vectorized loop:
while (values < end - NACC * ELS_PER_VEC) {
for (size_t i = 0; i < NACC; i++) {
__m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
kadd(sumV[i], cV[i], y);
values += ELS_PER_VEC;
}
}
// Up to NACC * ELS_PER_VEC = 64 values remain.
while (values < end - ELS_PER_VEC) {
__m512d y = _mm512_cvtps_pd(_mm256_load_ps(values));
kadd(sumV[0], cV[0], y);
values += ELS_PER_VEC;
}
// Up to ELS_PER_VEC values remain.
__m512d y = _mm512_cvtps_pd(mm256_load_partial_ps(values, end - values));
kadd(sumV[0], cV[0], y);
// Fold the accumulators together.
for (size_t i = 1; i < NACC; i++)
kadd(sumV[0], cV[0], sumV[i]);
// Horizontally add the elements of sumV[0].
// (Consider using compensated summation here, but we can probably assume that
// all of our accumulators are similar magnitude at this point.)
// (Note: this intrinsic doesn't map to a single instruction.)
return _mm512_reduce_add_pd(sumV[0]);
}
#endif
#include <cstddef>
float naive(const float* values, std::size_t len) {
float sum = 0.f;
for (const float* const end = values + len; values < end; values++) {
sum += *values;
}
return sum;
}
@MaxBarraclough

Nice, thanks for making this available. Please specify a licence for this code.