Created
March 13, 2018 17:56
-
-
Save Lewiscowles1986/90191c59c9aedf3d08bf0b129065cccc to your computer and use it in GitHub Desktop.
Benchmarking PC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Integer and float benchmark for Win32 and Win64
// Results are below main(), line 91
#include <stdlib.h> | |
#include <stdio.h> | |
#ifdef _WIN32 | |
#include <sys/timeb.h> | |
#else | |
#include <sys/time.h> | |
#endif | |
#include <time.h> | |
/**
 * Returns the current wall-clock time in seconds as a double.
 *
 * On Windows the value comes from _ftime() (millisecond fields); elsewhere it
 * comes from gettimeofday() (microsecond fields). Callers in this file only
 * use the difference between two readings.
 *
 * @return Seconds (with a fractional part) reported by the platform clock, or
 *         0.0 on the non-Windows path if gettimeofday() reports an error.
 */
double
mygettime(void) {
#ifdef _WIN32
    struct _timeb tb;
    _ftime(&tb);
    return static_cast<double>(tb.time) + (0.001 * static_cast<double>(tb.millitm));
#else
    // Zero-initialise so that no indeterminate value can ever be read from tv.
    struct timeval tv = {0, 0};
    if (gettimeofday(&tv, 0) < 0) {
        perror("oops");
        // Do not build a timestamp from a structure the call failed to fill.
        return 0.0;
    }
    return static_cast<double>(tv.tv_sec) + (0.000001 * static_cast<double>(tv.tv_usec));
#endif
}
template< typename Type > | |
void my_test(const char* name) { | |
volatile Type v = 0; | |
// Do not use constants or repeating values | |
// to avoid loop unroll optimizations. | |
// All values >0 to avoid division by 0 | |
Type v0 = (Type)(rand() % 256)/16 + 1; | |
Type v1 = (Type)(rand() % 256)/16 + 1; | |
Type v2 = (Type)(rand() % 256)/16 + 1; | |
Type v3 = (Type)(rand() % 256)/16 + 1; | |
Type v4 = (Type)(rand() % 256)/16 + 1; | |
Type v5 = (Type)(rand() % 256)/16 + 1; | |
Type v6 = (Type)(rand() % 256)/16 + 1; | |
Type v7 = (Type)(rand() % 256)/16 + 1; | |
double t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v += v0; | |
v += v2; | |
v += v4; | |
v += v6; | |
} | |
printf("%s add: %f\n", name, mygettime() - t1); | |
t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v -= v1; | |
v -= v3; | |
v -= v5; | |
v -= v7; | |
} | |
printf("%s sub: %f\n", name, mygettime() - t1); | |
t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v *= v0; | |
v *= v2; | |
v *= v4; | |
v *= v6; | |
} | |
printf("%s mul: %f\n", name, mygettime() - t1); | |
t1 = mygettime(); | |
for (size_t i = 0; i < 100000000; ++i) { | |
v /= v1; | |
v /= v3; | |
v /= v5; | |
v /= v7; | |
} | |
printf("%s div: %f\n", name, mygettime() - t1); | |
} | |
// Program entry point: runs the arithmetic benchmark once for each built-in
// numeric type, passing the label that prefixes that type's result lines.
int main() {
    // Integer instantiations.
    my_test< short >(" short");
    my_test< int >(" int");
    my_test< long >(" long");
    my_test< long long >("long long");

    // Floating-point instantiations.
    my_test< float >(" float");
    my_test< double >(" double");

    return EXIT_SUCCESS;
}
Also note that repeated division will quickly make `v = 0`, which is the fastest case for `div` / `idiv`. The other operations don't have data-dependent performance, but divide does on most CPUs. Especially for 64-bit, a dividend of `0` can be 2x faster than large dividends.
E.g. Agner Fog lists `idiv r64` latency for Haswell as 39-103 (and throughput 24-81); see http://agner.org/optimize/. The effect is not as big for 32-bit and smaller, e.g. latency = 22-29 for Haswell `idiv r32`.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Using `volatile Type sink = v;` inside the loop (after every `v *= foo`) would force it to store every result separately (so it couldn't optimize the add loop to a single multiply), without having to use `volatile v`. So the compiler could still keep `v` in a register.
You can instead use inline asm to force the compiler to have a result in a register without adding any extra instructions, if you don't mind using GNU C extensions. See https://kojirion.github.io/2016/04/04/Profiling.html for an `escape` function, from Chandler Carruth's CppCon2015 talk: "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!" https://www.youtube.com/watch?v=nXaxk27zwlk. (Using `perf` on Linux.)