@alfanick
Last active August 17, 2016 16:19
Funny doubles behaviour
                  clang++             clang++/fast-math   g++                 g++/fast-math
Naive             36028796817637376   36028796884746240   36028796817637376   36028796884746240
Naive OpenMP      36028796884746240   36028796884746240   36028796884746240   36028796884746240
Vectorization     36028796884746240   36028796884746240   -                   -

Correct answer -- 36028796884746240
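A quick integer sanity check of that value (a standalone snippet, not part of the gist; the constants come from the table above): the data is 0, 1, ..., 2^28-1, so the exact sum is 2^27 * (2^28 - 1).

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t n = uint64_t(1) << 28;
  const uint64_t exact = n / 2 * (n - 1);                // sum of 0..n-1 = n*(n-1)/2
  assert(exact == 36028796884746240ULL);                 // the "correct answer" above
  assert(exact - 36028796817637376ULL == (1ULL << 26));  // naive result is low by 2^26
}

So the naive sum comes out short by exactly 67108864 = 2^26.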

  • clang++ -- clang++ -march=native -O3 -std=c++11 -fopenmp -L/usr/local/opt/llvm/lib -o main ./main.cpp

  • clang++/fast-math -- clang++ -march=native -O3 -std=c++11 -fopenmp -ffast-math -L/usr/local/opt/llvm/lib -o main ./main.cpp

  • g++ -- g++-5 -march=native -O3 -std=c++11 -fopenmp -o main ./main.cpp

  • g++/fast-math -- g++-5 -march=native -O3 -std=c++11 -fopenmp -ffast-math -o main ./main.cpp

  • clang version 3.8.1 (tags/RELEASE_381/final)

  • g++-5 (Homebrew gcc 5.3.0) 5.3.0

  • OS X 10.11, Intel i7

OpenMP is not required; just comment it out. I was playing with vectorization and compared the results against the naive solution (which I assumed would be correct), so I ended up debugging the vectorized version instead of the naive one...

I am aware of float/double inaccuracy, but I never thought the error could be this large (the naive sum is off by 67108864). Is there any other explanation for it? Even stranger, fast-math produces no error at all... I assumed it was a clang bug, but the code behaves the same on g++ and at -O1 and -O2, so I must be missing something.

Can anyone enlighten me? Is this just floating-point rounding error, and if so, why does fast-math give the correct result?
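For scale, here is a minimal sketch of the rounding effect in isolation, using nothing but plain IEEE 754 doubles (it has nothing to do with the benchmark code itself): the final sum is around 2^55, and at that magnitude consecutive doubles are 8 apart, so a single addition of a small element can silently lose a few units.

#include <cstdio>

int main() {
  double big = 36028797018963968.0;          // 2^55; representable doubles are 8 apart here
  std::printf("%.1f\n", (big + 3.0) - big);  // 0.0 -- the +3 is rounded away entirely
  std::printf("%.1f\n", (big + 5.0) - big);  // 8.0 -- rounded up to the next representable double
}

Spread over 2^28 sequential additions into one accumulator, per-add errors of this size can plausibly add up to the observed 67108864, while the OpenMP and vectorized variants keep several partial sums that each stay much smaller, which would explain why they land on the exact value. The full benchmark code: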

#include <iostream>
#include <chrono>
#include <cstdlib>
#include <functional>
#include <string>

#include <libiomp/omp.h>   // OpenMP header from Homebrew llvm on OS X; only the pragma is actually used

double benchmark(std::string name, std::function<double()> f, size_t iterations = 100);

int main() {
  const size_t data_size = (1 << 28);
  double *data = new double[data_size];

  // data[i] = i, so the exact sum is 2^27 * (2^28 - 1) = 36028796884746240
  for (size_t i = 0; i < data_size; i++)
    data[i] = i;

  // plain sequential accumulation into a single double
  benchmark("naive", [=]() {
    double sum = 0;
    for (size_t i = 0; i < data_size; i++) {
      sum += data[i];
    }
    return sum;
  });

  // OpenMP reduction: each thread keeps its own partial sum
  benchmark("naive openmp", [=]() {
    double sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for (size_t i = 0; i < data_size; i++) {
      sum += data[i];
    }
    return sum;
  });

  // explicit clang vectorization/interleaving hints
  benchmark("vectorize and interleave", [=]() {
    double sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (size_t i = 0; i < data_size; i++) {
      sum += data[i];
    }
    return sum;
  });

  delete[] data;

  return EXIT_SUCCESS;
}

double benchmark(std::string name, std::function<double()> f, size_t iterations) {
  std::cerr << "Executing " << name << ":" << std::endl;

  double sum = 0;
  auto start_time = std::chrono::high_resolution_clock::now();
  for (size_t i = 0; i < iterations; i++) {
    sum = f();
  }
  auto end_time = std::chrono::high_resolution_clock::now();

  double avg = (std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) / iterations).count();

  std::cerr.precision(17);
  std::cerr << "\tsum is " << std::fixed << sum << std::endl;
  std::cerr << "\taverage " << avg << "ms" << std::endl << std::endl;

  return avg;
}
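The diff below compares the summation loop compiled into packed vaddpd adds on four %ymm accumulators against a scalar chain of vaddsd into a single %xmm0. Here is a hand-written sketch of the same multi-accumulator idea (not the compiler's code, just an illustration; it assumes the 0..2^28-1 data from main.cpp, and the 16 partials mirror the 4 registers x 4 lanes of the packed loop):

#include <cstddef>
#include <cstdio>

// Sketch only -- the multi-accumulator idea, not the compiler's output.
double sum_partials(const double *data, std::size_t n) {
  const std::size_t K = 16;             // e.g. 4 ymm registers x 4 doubles per register
  double part[K] = {0};

  std::size_t i = 0;
  for (; i + K <= n; i += K)            // K independent running sums
    for (std::size_t k = 0; k < K; k++)
      part[k] += data[i + k];

  double sum = 0;
  for (; i < n; i++)                    // leftover elements if n is not a multiple of K
    sum += data[i];
  for (std::size_t k = 0; k < K; k++)   // final horizontal reduction
    sum += part[k];
  return sum;
}

int main() {
  const std::size_t n = std::size_t(1) << 28;
  double *data = new double[n];
  for (std::size_t i = 0; i < n; i++) data[i] = i;
  std::printf("%.0f\n", sum_partials(data, n));  // should print 36028796884746240
  delete[] data;
}

Each of the 16 partials stays below 2^53, so with integer-valued data like this they are accumulated exactly and only the short final reduction can round. -ffast-math allows the compiler to reassociate the plain loop into this shape, which would explain why the fast-math builds report the exact value as well. The assembly diff (< is the packed loop, > the scalar one):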
877c877
< ## BB#0: ## %min.iters.checked
---
> ## BB#0:
887c887
< vxorpd %ymm0, %ymm0, %ymm0
---
> vxorpd %xmm0, %xmm0, %xmm0
889,891d888
< vxorpd %ymm1, %ymm1, %ymm1
< vxorpd %ymm2, %ymm2, %ymm2
< vxorpd %ymm3, %ymm3, %ymm3
893,911c890,907
< LBB8_1: ## %vector.body
< ## =>This Inner Loop Header: Depth=1
< vaddpd (%rax,%rcx,8), %ymm0, %ymm0
< vaddpd 32(%rax,%rcx,8), %ymm1, %ymm1
< vaddpd 64(%rax,%rcx,8), %ymm2, %ymm2
< vaddpd 96(%rax,%rcx,8), %ymm3, %ymm3
< vaddpd 128(%rax,%rcx,8), %ymm0, %ymm0
< vaddpd 160(%rax,%rcx,8), %ymm1, %ymm1
< vaddpd 192(%rax,%rcx,8), %ymm2, %ymm2
< vaddpd 224(%rax,%rcx,8), %ymm3, %ymm3
< vaddpd 256(%rax,%rcx,8), %ymm0, %ymm0
< vaddpd 288(%rax,%rcx,8), %ymm1, %ymm1
< vaddpd 320(%rax,%rcx,8), %ymm2, %ymm2
< vaddpd 352(%rax,%rcx,8), %ymm3, %ymm3
< vaddpd 384(%rax,%rcx,8), %ymm0, %ymm0
< vaddpd 416(%rax,%rcx,8), %ymm1, %ymm1
< vaddpd 448(%rax,%rcx,8), %ymm2, %ymm2
< vaddpd 480(%rax,%rcx,8), %ymm3, %ymm3
< addq $64, %rcx
---
> LBB8_1: ## =>This Inner Loop Header: Depth=1
> vaddsd (%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 8(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 16(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 24(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 32(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 40(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 48(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 56(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 64(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 72(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 80(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 88(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 96(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 104(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 112(%rax,%rcx,8), %xmm0, %xmm0
> vaddsd 120(%rax,%rcx,8), %xmm0, %xmm0
> addq $16, %rcx
914,920c910
< ## BB#2: ## %middle.block
< vaddpd %ymm0, %ymm1, %ymm0
< vaddpd %ymm0, %ymm2, %ymm0
< vaddpd %ymm0, %ymm3, %ymm0
< vextractf128 $1, %ymm0, %xmm1
< vaddpd %ymm1, %ymm0, %ymm0
< vhaddpd %ymm0, %ymm0, %ymm0
---
> ## BB#2: ## %"_ZNSt3__128__invoke_void_return_wrapperIdE6__callIJRZ4mainE3$_0EEEdDpOT_.exit"
922d911
< vzeroupper
1248,1249c1237,1238
< vmovsd -88(%rbp), %xmm0 ## xmm0 = mem[0],zero
< vaddsd (%rbx), %xmm0, %xmm0
---
> vmovsd (%rbx), %xmm0 ## xmm0 = mem[0],zero
> vaddsd -88(%rbp), %xmm0, %xmm0
1295,1296c1284,1285
< vmovsd (%rax), %xmm0 ## xmm0 = mem[0],zero
< vaddsd (%rcx), %xmm0, %xmm0
---
> vmovsd (%rcx), %xmm0 ## xmm0 = mem[0],zero
> vaddsd (%rax), %xmm0, %xmm0