kragen/hashnum.c

## hashnum.c
/* How fast is hashing an integer?  Versus comparing it?  Hashing is
 * faster. */

/* gcc-4.3.2 -O5 hashnum.c -o hashnum; time ./hashnum; time ./hashnum foo */

/* Doing 1024 million remainder operations took 8.152 seconds; doing
 * the same number of compare and conditional-jump operations took
 * 9.142 seconds. (Best of three, on a Celeron E1200, 32-bit mode.)
 *
 * I think this means that finite maps implemented as comparison-based
 * trees will have a really hard time competing with hashing on a CPU
 * like this one when the keys are integers, because traversing every
 * tree node will be slightly more expensive than doing the entire
 * hash function.  That means that if your tree is ten levels deep, it
 * will take you slightly more than ten times as long to get to the
 * right tree node than it would take you to get to the right hash
 * bucket.
 *
 * I note with interest that comparing without doing a conditional
 * jump is about four times faster (although I haven’t done the
 * experiment very rigorously; it was an unintentional experiment as I
 * struggled to defeat the optimizer).  So trees could maybe get back
 * some of their advantage if you could use a code sequence something
 * like this:
 *
 *     loop:
 *         load treenode pointed to by %right into registers %nodekey, %left, %right
 *         cmp %needle, %nodekey
 *         je foundit
 *         cmovle %left, %right       # %right <- %left if %needle < %nodekey else %right
 *         jmp loop
 *
 * Without having tried it, though, I assume that GCC didn’t emit an
 * analogous code sequence in this test because it would have been
 * slower, not because I’m better at optimizing assembly than GCC’s
 * optimizer.  Linus explains in
 * <http://ondioline.org/mail/cmov-a-bad-idea-on-out-of-order-cpus>:
 *
 * > CMOV (and, more generically, any “predicated instruction”) tends
 * > to generally a bad idea on an aggressively out-of-order CPU.
 *
 * The difference might actually be larger, since I’m superstitiously
 * writing to memory to avoid being optimized into oblivion, while
 * tree traversal is normally done without writing to memory.
 */

#include <stdlib.h>
#include <stdio.h>

enum { n_numbers = 1024 };
int numbers[n_numbers], out_numbers[n_numbers];

int main(int argc, char **argv)
{
  int ii, jj;

  for (ii = 0; ii != n_numbers; ii++) numbers[ii] = rand();

  if (argc == 1) {
    puts("Hashing...");
    for (ii = 0; ii < 1000*1000; ii++) {
      for (jj = 0; jj < n_numbers; jj++) out_numbers[jj] = numbers[jj] % 1021; /* simple hash */
    }
  } else {
    puts("Comparing...");
    for (ii = 0; ii < 1000*1000; ii++) {
      for (jj = 0; jj < n_numbers; jj++) {
        if (numbers[jj] < 1234567890) {
          out_numbers[jj] = -37;
        } else {
          /* See footnote comment [0]. */
          out_numbers[jj] = numbers[jj];
        }
      }
    }
  }

  return 0;
}

/* [0] If I use just a constant here, GCC’s optimizer is smart enough
 * to avoid using a conditional jump.  One particularly clever code
 * sequence it emitted looked like this:

   8048450:	31 c0                	xor    %eax,%eax
   8048452:	81 3c 95 40 b0 04 08 	cmpl   $0x499602d1,0x804b040(,%edx,4)
   8048459:	d1 02 96 49
   804845d:	0f 9f c0             	setg   %al
   8048460:	83 e8 01             	sub    $0x1,%eax
   8048463:	83 e0 c6             	and    $0xffffffc6,%eax
   8048466:	83 c0 15             	add    $0x15,%eax
   8048469:	89 04 95 40 a0 04 08 	mov    %eax,0x804a040(,%edx,4)

 * If you can figure out what it’s going to store without writing
 * anything down, you should win a prize.  It turns out it's really
 * not the comparison that's expensive, but the conditional jump.
 */
	/* How fast is hashing an integer? Versus comparing it? Hashing is
	* faster. */

	/* gcc-4.3.2 -O5 hashnum.c -o hashnum; time ./hashnum; time ./hashnum foo */

	/* Doing 1024 million remainder operations took 8.152 seconds; doing
	* the same number of compare and conditional-jump operations took
	* 9.142 seconds. (Best of three, on a Celeron E1200, 32-bit mode.)
	*
	* I think this means that finite maps implemented as comparison-based
	* trees will have a really hard time competing with hashing on a CPU
	* like this one when the keys are integers, because traversing every
	* tree node will be slightly more expensive than doing the entire
	* hash function. That means that if your tree is ten levels deep, it
	* will take you slightly more than ten times as long to get to the
	* right tree node than it would take you to get to the right hash
	* bucket.
	*
	* I note with interest that comparing without doing a conditional
	* jump is about four times faster (although I haven’t done the
	* experiment very rigorously; it was an unintentional experiment as I
	* struggled to defeat the optimizer). So trees could maybe get back
	* some of their advantage if you could use a code sequence something
	* like this:
	*
	* loop:
	* load treenode pointed to by %right into registers %nodekey, %left, %right
	* cmp %needle, %nodekey
	* je foundit
	* cmovle %left, %right # %right <- %left if %needle < %nodekey else %right
	* jmp loop
	*
	* Without having tried it, though, I assume that GCC didn’t emit an
	* analogous code sequence in this test because it would have been
	* slower, not because I’m better at optimizing assembly than GCC’s
	* optimizer. Linus explains in
	* <http://ondioline.org/mail/cmov-a-bad-idea-on-out-of-order-cpus>:
	*
	* > CMOV (and, more generically, any “predicated instruction”) tends
	* > to generally a bad idea on an aggressively out-of-order CPU.
	*
	* The difference might actually be larger, since I’m superstitiously
	* writing to memory to avoid being optimized into oblivion, while
	* tree traversal is normally done without writing to memory.
	*/

	#include <stdlib.h>
	#include <stdio.h>

	enum { n_numbers = 1024 };
	int numbers[n_numbers], out_numbers[n_numbers];

	int main(int argc, char **argv)
	{
	int ii, jj;

	for (ii = 0; ii != n_numbers; ii++) numbers[ii] = rand();

	if (argc == 1) {
	puts("Hashing...");
	for (ii = 0; ii < 1000*1000; ii++) {
	for (jj = 0; jj < n_numbers; jj++) out_numbers[jj] = numbers[jj] % 1021; /* simple hash */
	}
	} else {
	puts("Comparing...");
	for (ii = 0; ii < 1000*1000; ii++) {
	for (jj = 0; jj < n_numbers; jj++) {
	if (numbers[jj] < 1234567890) {
	out_numbers[jj] = -37;
	} else {
	/* See footnote comment [0]. */
	out_numbers[jj] = numbers[jj];
	}
	}
	}
	}

	return 0;
	}

	/* [0] If I use just a constant here, GCC’s optimizer is smart enough
	* to avoid using a conditional jump. One particularly clever code
	* sequence it emitted looked like this:

	8048450: 31 c0 xor %eax,%eax
	8048452: 81 3c 95 40 b0 04 08 cmpl $0x499602d1,0x804b040(,%edx,4)
	8048459: d1 02 96 49
	804845d: 0f 9f c0 setg %al
	8048460: 83 e8 01 sub $0x1,%eax
	8048463: 83 e0 c6 and $0xffffffc6,%eax
	8048466: 83 c0 15 add $0x15,%eax
	8048469: 89 04 95 40 a0 04 08 mov %eax,0x804a040(,%edx,4)

	* If you can figure out what it’s going to store without writing
	* anything down, you should win a prize. It turns out it's really
	* not the comparison that's expensive, but the conditional jump.
	*/