-
-
Save Lewiscowles1986/90191c59c9aedf3d08bf0b129065cccc to your computer and use it in GitHub Desktop.
// Integer and float benchmark for Win32 and Win64
// Results are below main(), line 91
#include <stdlib.h> | |
#include <stdio.h> | |
#ifdef _WIN32 | |
#include <sys/timeb.h> | |
#else | |
#include <sys/time.h> | |
#endif | |
#include <time.h> | |
// Returns the current time in seconds as a double, with a fractional part.
//
// On _WIN32 the value is built from _ftime() (seconds plus milliseconds);
// elsewhere it is built from gettimeofday() (seconds plus microseconds).
// Callers in this file only use the difference between two calls.
//
// If gettimeofday() fails, the error is reported through perror() and 0.0 is
// returned, so the result never depends on an unwritten timeval.
double
mygettime(void) {
#ifdef _WIN32
    struct _timeb tb;
    _ftime(&tb);
    return (double)tb.time + (0.001 * (double)tb.millitm);
#else
    struct timeval tv;
    if (gettimeofday(&tv, NULL) < 0) {
        perror("oops");
        // tv was not filled in; reading it would use an indeterminate value.
        return 0.0;
    }
    return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
#endif
}
template< typename Type > | |
void my_test(const char* name) { | |
volatile Type v = 0; | |
// Do not use constants or repeating values | |
// to avoid loop unroll optimizations. | |
// All values >0 to avoid division by 0 | |
Type v0 = (Type)(rand() % 256)/16 + 1; | |
Type v1 = (Type)(rand() % 256)/16 + 1; | |
Type v2 = (Type)(rand() % 256)/16 + 1; | |
Type v3 = (Type)(rand() % 256)/16 + 1; | |
Type v4 = (Type)(rand() % 256)/16 + 1; | |
Type v5 = (Type)(rand() % 256)/16 + 1; | |
Type v6 = (Type)(rand() % 256)/16 + 1; | |
Type v7 = (Type)(rand() % 256)/16 + 1; | |
double t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v += v0; | |
v += v2; | |
v += v4; | |
v += v6; | |
} | |
printf("%s add: %f\n", name, mygettime() - t1); | |
t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v -= v1; | |
v -= v3; | |
v -= v5; | |
v -= v7; | |
} | |
printf("%s sub: %f\n", name, mygettime() - t1); | |
t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v *= v0; | |
v *= v2; | |
v *= v4; | |
v *= v6; | |
} | |
printf("%s mul: %f\n", name, mygettime() - t1); | |
t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v /= v1; | |
v /= v3; | |
v /= v5; | |
v /= v7; | |
} | |
printf("%s div: %f\n", name, mygettime() - t1); | |
} | |
// Entry point: runs my_test once for each of short, int, long, long long,
// float and double, in that order. The string argument is the label my_test
// prints in front of each timing line.
int main() {
    // Integer types.
    my_test<short>(" short");
    my_test<int>(" int");
    my_test<long>(" long");
    my_test<long long>("long long");

    // Floating-point types.
    my_test<float>(" float");
    my_test<double>(" double");

    return 0;
}
Using `volatile Type sink = v;` inside the loop (after every `v *= foo`) would force the compiler to store every result separately (so it couldn't optimize the add loop to a single multiply), without having to make `v` itself `volatile`. The compiler could then still keep `v` in a register.
You can instead use inline asm to force the compiler to have a result in a register without adding any extra instructions, if you don't mind using GNU C extensions. See https://kojirion.github.io/2016/04/04/Profiling.html for an `escape` function, from Chandler Carruth's CppCon2015 talk: "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!" https://www.youtube.com/watch?v=nXaxk27zwlk. (Using `perf` on Linux.)
Also note that repeated division will quickly make `v = 0`, which is the fastest case for `div` / `idiv`. The other operations don't have data-dependent performance, but divide does on most CPUs. Especially for 64-bit, a dividend of `0` can be 2x faster than large dividends.
E.g. Agner Fog lists `idiv r64` latency for Haswell as 39-103 (and throughput 24-81) (http://agner.org/optimize/). It is not such a big effect for 32-bit and smaller, e.g. latency = 22-29 for Haswell `idiv r32`.
Signed overflow is UB, although I don't think the compiler will "see it" at compile time, so it shouldn't actually be a problem on normal architectures.
Most of the difference between add and mul is hidden by using `volatile` inside the loop; instead of seeing 3x the latency for this dependency chain, you only see `5+1` vs. `5+3`.