cslarsen/mul.cpp

## mul.cpp
/*
 * Simple program I wrote after reading Felix Von Leitner's notes
 * on optimization at http://www.fefe.de/source-code-optimization.pdf
 *
 * In the 90s, it used to be faster to do (x<<8)+(x<<6) than x*320,
 * but today with the Intel Core i7, it's actually faster to do
 * x*320.
 *
 * And the compiler knows it!
 *
 * If you compile this with `g++ -O3 -g -S` and check the output, you'll
 * see that both mul320_normal() and mul320_fast() are both optimized
 * to use imull $320, %edi, %eax.  Running them should give same numbers
 * for CPU time.
 *
 * GCC actually CANCELS my hand-woven optimization, because it recognizes
 * it as a FALSE speed optimization!
 *
 * If you compile with `g++ g -S` (NO optimizations) you can see that
 * the code is not changed, and in fact, mul320_normal() is, indeed,
 * faster on an i7.
 *
 * Written by Christian Stigen Larsen,
 * http://csl.sublevel3.org
 *
 * Public domain, 2012-05-08
 *
 * Compiling:
 *
 * ~/tmp/speed csl$ g++ mul.cpp -o mul
 * ~/tmp/speed csl$ time ./mul
 * doing 1 billion 320*x ... took 3.939162 secs
 * doing 1 billion (x<<8)+(x<<6) ... took 4.624359 secs
 *
 * real 0m8.571s
 * user 0m8.564s
 * sys  0m0.005s
 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

static double mark;

double getrusage()
{
  struct rusage ru;
  getrusage(RUSAGE_SELF, &ru);
  return ru.ru_utime.tv_sec + ru.ru_utime.tv_usec / 1000000.0;
}

void mark_time()
{
  mark = getrusage();
}

double elapsed_secs()
{
  return getrusage() - mark;
}

int mul320_normal(int x)
{
  /*
   * Normal, simple x*320.
   * Will be compiled to imull $320, %edi, %eax,
   * which is faster than doing SHL and ADD.
   */
  return x*320;
}

int mul320_fast(int x)
{
  /*
   * This is NOT really fast.
   * It used to, in the 90s, but
   * modern compilers know your
   * CPU quite well, so this is
   * actually optimized by gcc -O3
   * to imull 320, edi, eax.
   */
  return (x<<8) + (x<<6);
}

int main(int argc, char** argv)
{
  int x = atoi(argv[0]);

  int loop = 1000000000;

  {
    printf("doing 1 billion 320*x ... ");
    fflush(stdout);
    mark_time();

    for ( int n=0; n<loop; ++n )
      int tmp = mul320_normal(x);

    double secs = elapsed_secs();
    printf("took %f secs\n", secs);
  }

  {
    printf("doing 1 billion (x<<8)+(x<<6) ... ");
    fflush(stdout);
    mark_time();

    for ( int n=0; n<loop; ++n )
      int tmp = mul320_fast(x);

    double secs = elapsed_secs();
    printf("took %f secs\n", secs);
  }

  return 0;
}
	/*
	* Simple program I wrote after reading Felix Von Leitner's notes
	* on optimization at http://www.fefe.de/source-code-optimization.pdf
	*
	* In the 90s, it used to be faster to do (x<<8)+(x<<6) than x*320,
	* but today with the Intel Core i7, it's actually faster to do
	* x*320.
	*
	* And the compiler knows it!
	*
	* If you compile this with `g++ -O3 -g -S` and check the output, you'll
	* see that both mul320_normal() and mul320_fast() are both optimized
	* to use imull $320, %edi, %eax. Running them should give same numbers
	* for CPU time.
	*
	* GCC actually CANCELS my hand-woven optimization, because it recognizes
	* it as a FALSE speed optimization!
	*
	* If you compile with `g++ g -S` (NO optimizations) you can see that
	* the code is not changed, and in fact, mul320_normal() is, indeed,
	* faster on an i7.
	*
	* Written by Christian Stigen Larsen,
	* http://csl.sublevel3.org
	*
	* Public domain, 2012-05-08
	*
	* Compiling:
	*
	* ~/tmp/speed csl$ g++ mul.cpp -o mul
	* ~/tmp/speed csl$ time ./mul
	* doing 1 billion 320*x ... took 3.939162 secs
	* doing 1 billion (x<<8)+(x<<6) ... took 4.624359 secs
	*
	* real 0m8.571s
	* user 0m8.564s
	* sys 0m0.005s
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/resource.h>

	static double mark;

	double getrusage()
	{
	struct rusage ru;
	getrusage(RUSAGE_SELF, &ru);
	return ru.ru_utime.tv_sec + ru.ru_utime.tv_usec / 1000000.0;
	}

	void mark_time()
	{
	mark = getrusage();
	}

	double elapsed_secs()
	{
	return getrusage() - mark;
	}

	int mul320_normal(int x)
	{
	/*
	* Normal, simple x*320.
	* Will be compiled to imull $320, %edi, %eax,
	* which is faster than doing SHL and ADD.
	*/
	return x*320;
	}

	int mul320_fast(int x)
	{
	/*
	* This is NOT really fast.
	* It used to, in the 90s, but
	* modern compilers know your
	* CPU quite well, so this is
	* actually optimized by gcc -O3
	* to imull 320, edi, eax.
	*/
	return (x<<8) + (x<<6);
	}

	int main(int argc, char** argv)
	{
	int x = atoi(argv[0]);

	int loop = 1000000000;

	{
	printf("doing 1 billion 320*x ... ");
	fflush(stdout);
	mark_time();

	for ( int n=0; n<loop; ++n )
	int tmp = mul320_normal(x);

	double secs = elapsed_secs();
	printf("took %f secs\n", secs);
	}

	{
	printf("doing 1 billion (x<<8)+(x<<6) ... ");
	fflush(stdout);
	mark_time();

	for ( int n=0; n<loop; ++n )
	int tmp = mul320_fast(x);

	double secs = elapsed_secs();
	printf("took %f secs\n", secs);
	}

	return 0;
	}