Created
June 8, 2012 15:23
-
-
Save cslarsen/2896137 to your computer and use it in GitHub Desktop.
Shows how gcc cancels bad "optimizations"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Simple program I wrote after reading Felix Von Leitner's notes | |
* on optimization at http://www.fefe.de/source-code-optimization.pdf | |
* | |
* In the 90s, it used to be faster to do (x<<8)+(x<<6) than x*320, | |
* but today with the Intel Core i7, it's actually faster to do | |
* x*320. | |
* | |
* And the compiler knows it! | |
* | |
* If you compile this with `g++ -O3 -g -S` and check the output, you'll | |
* see that both mul320_normal() and mul320_fast() are both optimized | |
* to use imull $320, %edi, %eax. Running them should give same numbers | |
* for CPU time. | |
* | |
* GCC actually CANCELS my hand-woven optimization, because it recognizes | |
* it as a FALSE speed optimization! | |
* | |
* If you compile with `g++ g -S` (NO optimizations) you can see that | |
* the code is not changed, and in fact, mul320_normal() is, indeed, | |
* faster on an i7. | |
* | |
* Written by Christian Stigen Larsen, | |
* http://csl.sublevel3.org | |
* | |
* Public domain, 2012-05-08 | |
* | |
* Compiling: | |
* | |
* ~/tmp/speed csl$ g++ mul.cpp -o mul | |
* ~/tmp/speed csl$ time ./mul | |
* doing 1 billion 320*x ... took 3.939162 secs | |
* doing 1 billion (x<<8)+(x<<6) ... took 4.624359 secs | |
* | |
* real 0m8.571s | |
* user 0m8.564s | |
* sys 0m0.005s | |
*/ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <sys/resource.h> | |
static double mark; | |
double getrusage() | |
{ | |
struct rusage ru; | |
getrusage(RUSAGE_SELF, &ru); | |
return ru.ru_utime.tv_sec + ru.ru_utime.tv_usec / 1000000.0; | |
} | |
void mark_time() | |
{ | |
mark = getrusage(); | |
} | |
double elapsed_secs() | |
{ | |
return getrusage() - mark; | |
} | |
int mul320_normal(int x) | |
{ | |
/* | |
* Normal, simple x*320. | |
* Will be compiled to imull $320, %edi, %eax, | |
* which is faster than doing SHL and ADD. | |
*/ | |
return x*320; | |
} | |
int mul320_fast(int x) | |
{ | |
/* | |
* This is NOT really fast. | |
* It used to, in the 90s, but | |
* modern compilers know your | |
* CPU quite well, so this is | |
* actually optimized by gcc -O3 | |
* to imull 320, edi, eax. | |
*/ | |
return (x<<8) + (x<<6); | |
} | |
int main(int argc, char** argv) | |
{ | |
int x = atoi(argv[0]); | |
int loop = 1000000000; | |
{ | |
printf("doing 1 billion 320*x ... "); | |
fflush(stdout); | |
mark_time(); | |
for ( int n=0; n<loop; ++n ) | |
int tmp = mul320_normal(x); | |
double secs = elapsed_secs(); | |
printf("took %f secs\n", secs); | |
} | |
{ | |
printf("doing 1 billion (x<<8)+(x<<6) ... "); | |
fflush(stdout); | |
mark_time(); | |
for ( int n=0; n<loop; ++n ) | |
int tmp = mul320_fast(x); | |
double secs = elapsed_secs(); | |
printf("took %f secs\n", secs); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I used this compiler: