@OhMeadhbh
Last active July 28, 2023 12:22
Efficiently Calculating the Parity of a 32 Bit Integer
Read this file first.
Then read the comments at the top of parity_test.c
Then read the comments above each algorithm implementation in parity.c
So there I was, talking about efficient implementations of fundamental functions
with Palmer over at SiFive. Back in the day, I used to participate in informal
contests on rec.games.programmer to see who could craft the most efficient
implementations for population count or bit reversal or bit-blitting. For
programmer types, it's great fun.
This day, however, we picked "how do you efficiently calculate the parity of a
32 bit integer" as a practical example of a typical problem a programmer might
encounter.
After thinking about it for a while and coming up with a number of different
implementations, we felt we had exhausted the solution space. Practical
ramifications of our conversation included a few key take-aways:
1. C compilers are pretty good these days, but if you're seriously worried about
efficiency, you want to learn how to read assembly for your target CPU and
take a look at what the compiler produces.
2. You can often get a good sense for efficiency by looking at the assembly
output, but DEFINITELY run some timing tests. Modern CPUs are complex beasts
where subtle changes in code can lead to radical changes in run-time
behaviour.
And that's what this code is about. It's set up to test several different
techniques for producing the parity value of a 32 bit integer. As a bonus, it's
pretty easy to tell the compiler to spit out the assembly code it's generating.
I don't think any of the approaches provided here are completely new; you can
probably find them discussed out on the internet. Palmer described a few of
them in a very succinct manner, exposing the relationship between CPU design
and the production of optimized code. I haven't seen such a discussion online
before, so I figured it might be cool to step through a couple of different
implementations of a particular problem, highlighting how features in modern
CPUs make some optimizations more or less effective.
In addition to telling you raw numbers, the test program gives you information
about relative performance. It's not a perfect test, however, as it's intended
to run on a Unix-like operating system (Darwin, Leenucks, BSD, etc.) and it's
likely machines
running each of these OSes have things going on in the background. But if you
run it several times, it will probably give you a decent picture of what parity
algorithms run faster on your hardware.
Enjoy!
# Makefile
#
# Copyright (c) 2018, Meadhbh S. Hamrick
# Released under a BSD Open Source License, see parity_test.c for details
default :
	@echo "try one of the following:"
	@echo "make test   - builds the parity_test executable"
	@echo "make clean  - removes object files and executables"
	@echo "make asm    - compiles parity.c to assembly language"
	@echo "make custom - makes a version that includes an assembly implementation"

test : parity_test

parity_test : parity_test.o parity.o
	$(CC) $(LDFLAGS) -o parity_test parity_test.o parity.o

parity_test.o : parity_test.c parity.h
	$(CC) $(CFLAGS) -c -o parity_test.o parity_test.c

parity.o : parity.c parity.h
	$(CC) $(CFLAGS) -c -o parity.o parity.c

clean :
	$(RM) parity_test parity_test.o parity.o parity.s \
	parity_custom.o custom_test.o custom_test

asm :
	$(CC) -S $(CFLAGS) -c -o parity.s parity.c

custom : custom_test

custom_test : custom_test.o parity.o parity_custom.o
	$(CC) $(LDFLAGS) -o custom_test custom_test.o parity.o parity_custom.o

custom_test.o : parity_test.c parity.h
	$(CC) $(CFLAGS) -D_CUSTOM -c -o custom_test.o parity_test.c
/* parity.c
**
** Copyright (c) 2018, Meadhbh S. Hamrick
** Released under a BSD Open Source License, see parity_test.c for details
**
** This file contains several different functions that calculate the
** parity of a 32 bit integer. See comments in the parity_test.c file for
** instructions on how to build this file and its test harness. The
** objective of this package is to demonstrate how much of a performance
** boost you can get out of optimizing your algorithm.
**
** The function we're looking at here is the even parity function. The
** parity of a string of bits is defined as "the number of one bits
** needed to be added to ensure the total number of one bits in the
** string is even." In other words, if you have an odd number of bits in
** your string, your parity is 1 because you need to add a single one bit
** to have an even number of ones. If you have an even number of bits in
** your bit string, you don't need to add any one bits because you
** already have an even number, so your parity is 0. The parity function
** has a long history of being used as a very simple error correction
** scheme for memory, disk storage and data streaming over a network (or
** heaven forbid, a modem.)
*/
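As a cross-check on the definition above, here's a minimal reference implementation: count the one bits and take the low bit of the count. Note this is a sketch, not part of the original files, and `__builtin_popcount` is a GCC/Clang extension rather than standard C; on another compiler you'd substitute a portable bit count.

```c
#include <stdint.h>

/* Reference parity: the low bit of the population count. Uses the
** GCC/Clang __builtin_popcount extension (not standard C). */
static uint32_t parity_reference( uint32_t input ) {
  return (uint32_t)( __builtin_popcount( input ) & 1 );
}
```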
#include "parity.h"
/*
** This is the parity Look-Up Table (aka "the lut".) It is a 256-entry
** table which contains the parity of each of the octets 0 through 255.
** We use it
** in several of the parity implementations below.
*/
static uint8_t lut[ 256 ] = {
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
  0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
};
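Hand-writing 256 literals is an invitation to typos. As a sketch (the `lut_init` helper is my own, not part of the original files), the same table can be generated at startup using the per-octet XOR-fold that shows up again later in parity_fast_xor:

```c
#include <stdint.h>

/* Fill a 256-entry parity table at run time. Each octet's parity is
** computed by XOR-folding its bits down to a single bit. */
static void lut_init( uint8_t table[ 256 ] ) {
  for( int i = 0; i < 256; i++ ) {
    int p = i;
    p ^= p >> 4;
    p ^= p >> 2;
    p ^= p >> 1;
    table[ i ] = (uint8_t)( p & 1 );
  }
}
```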
/*
** Okay. Here's our first, naive implementation. This function takes a 32
** bit integer as input and returns the parity of the input (either a
** zero or a one.) The implementation is pretty straight forward; we loop
** through the bits in the input and count how many one bits we find. We
** then return the least significant bit of the count because it turns
** out that's actually the parity.
*/
uint32_t parity_naive( uint32_t input ) {
  uint32_t i, count;
  for( i = 0, count = 0; i < 32; i++ ) {
    if( 1 == ( input & 1 ) ) {
      count++;
    }
    input = input >> 1;
  }
  return count & 1;
}
/*
** So let's have a conversation about instruction pipelines. Don't know
** about pipelining in CPUs? Check out this link:
**
** https://en.wikipedia.org/wiki/Instruction_pipelining
**
** One of the problems with if-thens is when a CPU executes a Jump
** instruction, it means all the work ahead of the jump in the pipeline
** has to be thrown away. You're JUMPING to a different location where
** there will be new instructions. Your CPU has to re-fill that
** pipeline, which takes time.
**
** One way to avoid this is to use logical functions like AND, OR or
** XOR to combine the existing state (the current parity) with an input
** bit to find the new parity.
**
** So in this function, we're not counting bits. We start by saying our
** initial parity is zero. Then we look at the first bit; if it's one, we
** know our parity is now one. If it's zero, the parity is still
** zero. Then we shift the input one bit to the right and do it
** again. Here's a truth table for how we combine each of the 32 input
** bits into the running state of the function:
**
** +----------------+-----------------+----------------+
** | Current Parity | Input Bit | New Parity |
** +----------------+-----------------+----------------+
** | 0 | 0 | 0 |
** | 0 | 1 | 1 |
** | 1 | 0 | 1 |
** | 1 | 1 | 0 |
** +----------------+-----------------+----------------+
**
** Clever readers will recognize this as the truth table for the XOR
** function. So our new and improved function simply XORs the existing
** state together with each of the 32 bits in the input one at a time.
**
** As an interesting aside, a similar technique was used to fix a famous
** bug in the TENEX operating system back in the golden era of
** computing. More info at:
**
** http://meadhbh.hamrick.rocks/home/technical-sh-t/considerations-for-passwords-in-secure-systems
*/
uint32_t parity_xor( uint32_t input ) {
  uint32_t i, state;
  for( i = 0, state = 0; i < 32; i++ ) {
    state = state ^ ( input & 1 );
    input = input >> 1;
  }
  return state;
}
/*
** So you may be thinking, "If Jump instructions cause the instruction
** pipeline to plotz, why are we using a loop that ends with a Jump
** instruction?"
**
** Good question.
**
** "Loop Unrolling" is a time-honored technique for optimizing code. In
** addition to avoiding the hit to the pipeline, you probably know how
** many times you have to execute the code in the loop. Therefore you
** don't have to execute the code which checks whether it's time to exit
** the loop.
**
** You know, I don't recommend the Wikipedia as the be-all, end-all
** reference for good coding practice, but this article *IS* a good
** introduction to loop unrolling and it has some decent external links:
**
** https://en.wikipedia.org/wiki/Loop_unrolling
**
** At the time I looked at this article, it had a reference to a book by
** Michael Abrash. For the record, I love Mr. Abrash. He wrote a book
** called "The Zen of Assembly Language" back in the day that emphasized
** one important point: DON'T ASSUME ANYTHING ABOUT OPTIMIZATION; ALWAYS
** TEST YOUR CODE.
**
** So you probably want to look at the timing tests for this function. It
** *MAY* not be as fast as you suspect.
**
** Why is this?
**
** So it turns out modern CPUs have this thing called "Branch
** Prediction." This is logic in your CPU that tries to guess which way a
** conditional branch will go so the CPU can better optimize its
** pipeline. As mi amigo Palmer pointed out to me recently, branch
** predictors are pretty good for small loops. Here's the requisite
** Wikipedia reference for more details:
**
** https://en.wikipedia.org/wiki/Branch_predictor
**
** I would be willing to bet, however, that older CPUs (like the original
** 8086) that don't do branch prediction may actually run this code
** appreciably faster.
**
** Also, check out the output when you compile this function down to
** assembly (see comments in parity_test.c for more info.) Check out
** what happens when you add increasing levels of optimization.
**
** So, to be clear here, we're "unrolling" the loop from the parity_xor()
** function. That is, instead of having a for-loop, we just do 32 XORs.
*/
uint32_t parity_unrolled( uint32_t input ) {
  uint32_t state = 0;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  input = input >> 1;
  state = state ^ ( input & 1 );
  return state;
}
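Typing the same two statements 32 times invites copy-paste errors. As a sketch (the STEP/STEP4 macros and the parity_unrolled_macro name are mine, not part of the original files), the same straight-line body can be generated with a macro, so the compiler still sees an unrolled loop while the source stays short:

```c
#include <stdint.h>

/* One round of the unrolled loop: fold the low bit of input into state,
** then shift input down one bit. Expanding STEP 32 times reproduces the
** straight-line body of parity_unrolled (with one harmless extra shift). */
#define STEP  do { state ^= input & 1; input >>= 1; } while( 0 )
#define STEP4 STEP; STEP; STEP; STEP

static uint32_t parity_unrolled_macro( uint32_t input ) {
  uint32_t state = 0;
  STEP4; STEP4; STEP4; STEP4;
  STEP4; STEP4; STEP4; STEP4;
  return state;
}

#undef STEP
#undef STEP4
```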
/*
** But we can do better.
**
** Each of the bits in the state and input values is independent. While
** it's true we're only concerned about the low bit of the state or input
** variables, we don't *HAVE* to AND all the other bits out to zero after
** every shift.
**
** In this implementation we're avoiding AND-ing the input before XOR-ing
** it with the state after each shift. But we *DO* need to AND it with
** one before we return it. Still, we're getting rid of 31 ANDs.
**
** You would think this implementation would run a little faster than the
** previous one, but definitely test this assertion by running the
** parity_test executable.
*/
uint32_t parity_better_unrolled( uint32_t input ) {
  uint32_t state = 0;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  input = input >> 1;
  state = state ^ input;
  return state & 1;
}
/*
** So remember that big look-up table at the beginning of this file?
** Yeah. Now is when we actually use it.
**
** The idea behind the look-up table is we pre-compute the parity of the
** octets from 0 to 255 and store those pre-computed values in a
** table. So now instead of COMPUTING the parity of an octet, we use it
** as an index into a table where we fetch the pre-computed value. This
** is a simple example of a "time-space tradeoff" and of course there's a
** wikipedia entry on it:
**
** https://en.wikipedia.org/wiki/Space%E2%80%93time_tradeoff
**
** In this implementation, we treat the uint32_t input as an array of
** four octets and use each as an offset into the lookup table, XOR-ing
** the new parity we looked up with the existing state variable.
*/
uint32_t parity_lut( uint32_t input ) {
  uint32_t state;
  // Actually, we can save ourselves an XOR operation if we just
  // assign our initial state as the first looked-up value.
  state = (uint32_t) lut[ input & 0xFF ];
  input = input >> 8;
  // But the rest of the bytes get XOR-ed into the existing state.
  state = state ^ (uint32_t) lut[ input & 0xFF ];
  input = input >> 8;
  state = state ^ (uint32_t) lut[ input & 0xFF ];
  input = input >> 8;
  state = state ^ (uint32_t) lut[ input & 0xFF ];
  // And the state variable will ONLY be a zero or a one, so we don't
  // need to AND it with one.
  return state;
}
/*
** So when I was a kid, a few CPUs implemented shift operations so that
** EACH bit shifted took one clock cycle. So, shifting a register 8 bits
** to the right took eight times as long as shifting it one bit to the
** right.
**
** I don't think that's true with modern CPUs, but what the heck, let's
** test that assertion.
**
** In this implementation, we treat the input as an array of four octets
** and use pointer manipulation to extract four different indexes into
** the look-up table, again, XOR-ing the looked-up values together.
*/
uint32_t parity_lut_bis( uint32_t input ) {
  uint32_t state;
  // FWIW, I know a lot of people who really frown on creating
  // pointers to things on the stack. If you're coding this at 3AM,
  // you can easily cause a core dump. The more I think about it, the
  // more I think I like the previous implementation. It seems a lot
  // clearer as to what we're trying to do & I bet this isn't
  // especially faster than the previous implementation.
  uint8_t * ptr = (uint8_t *) & input;
  state = (uint32_t) lut[ ptr[0] ];
  state = state ^ (uint32_t) lut[ ptr[1] ];
  state = state ^ (uint32_t) lut[ ptr[2] ];
  state = state ^ (uint32_t) lut[ ptr[3] ];
  return state;
}
/*
** So by now you've probably figured out that the parity of a 32 bit
** integer is the XOR of all its bits. So if the function bit() returned
** the bit at a particular offset, the parity function for a 4 bit value
** could be calculated like so:
**
** bit(0) ^ bit(1) ^ bit(2) ^ bit(3)
**
** Another way to think about this is to form an expression tree
** representing these calculations:
**
** ( ^ )
** / \
** bit(0) ( ^ )
** / \
** bit(1) ( ^ )
** / \
** bit(2) bit(3)
**
** If the idea of an expression tree is new to you, here's the wikipedia
** link:
**
** https://en.wikipedia.org/wiki/Binary_expression_tree
**
** Expression trees can often help you identify opportunities for
** optimization. In this case we have a very unbalanced tree. Because
** the XOR operation is associative, we can balance the tree thusly:
**
** ( ^ )
** / \
** / \
** ( ^ ) ( ^ )
** / \ / \
** / \ / \
** bit(0) bit(2) bit(1) bit(3)
**
** Take a close look at how we chose the particular bits along the bottom
** row. Through the miracle of the commutative property of the XOR
** function, we can pick any permutation of bit indexes. This particular
** permutation lets us calculate two XORs at a time by doing this:
**
** input = input ^ ( input >> 2 )
**
** After executing this statement, bit zero of the input will be
** bit(0) ^ bit(2) and bit one of the input will be bit(1) ^ bit(3).
** (incidentally, this is exactly what's in the middle layer of
** expression nodes in the tree above.) If we repeat this operation,
** shifting only by one, it's the equivalent of performing the operation
** in the top node:
**
** input = input ^ ( input >> 1 )
**
** This can be extended to any width bit field. In the case of our 32 bit
** input, we could do this:
**
** input = ( input ^ ( input >> 16 ) ) & 0xFFFF
** input = ( input ^ ( input >> 8 ) ) & 0xFF
** input = ( input ^ ( input >> 4 ) ) & 0xF
** input = ( input ^ ( input >> 2 ) ) & 0x3
** input = ( input ^ ( input >> 1 ) ) & 0x1
**
** But since we're going to throw away the high order bits of input
** anyway and we don't use them again, we can eliminate all the
** intermediate ANDs and do this:
**
** input = input ^ ( input >> 16 )
** input = input ^ ( input >> 8 )
** input = input ^ ( input >> 4 )
** input = input ^ ( input >> 2 )
** input = ( input ^ ( input >> 1 ) ) & 0x1
*/
uint32_t parity_fast_xor( uint32_t input ) {
  input = input ^ ( input >> 16 );
  input = input ^ ( input >> 8 );
  input = input ^ ( input >> 4 );
  input = input ^ ( input >> 2 );
  return ( input ^ ( input >> 1 ) ) & 0x1;
}
/*
** PARITY WARS - EPISODE V : REVENGE OF THE LOOK-UP TABLES
**
** Hey, but remember that look-up table? We could eliminate a couple of
** shifts and XORs by using it.
**
** At this point you might be thinking "hey, why not just use a 4GB
** look-up table and do the calculation in one operation?" This may
** actually be an option in the glorious future, but for now we need a
** look-up table that can easily fit in the processor's cache. At the
** time I wrote this, 16k or 32k caches for embedded CPUs were not
** uncommon, but 4GB caches are nowhere to be seen.
**
** Even if you DID have a 4GB cache, it may not be worth it. It
** probably takes at least one clock cycle per 1, 4 or 16 bytes of memory
** to load from main memory into the cache, so a 256 byte table will take
** 256, 64 or 16 cycles to load. Assuming you have a 4GB cache some day,
** it's still going to take at least 2^(32-4) cycles to load. Adding 2^28
** machine cycles just so you can save the number of cycles it takes to
** execute these lines is probably not going to be a net-positive:
**
** input = input ^ ( input >> 16 );
** input = input ^ ( input >> 8 );
**
** But definitely test this hypothesis; you *should* be able to use a 4GB
** lookup table on a modern 64 bit CPU, but I'm going to leave that as an
** exercise for the reader. If you have access to CPUs with different
** cache sizes, it might be instructive to see how different sized
** look-up tables affect performance. But, this also is an exercise for
** the reader.
*/
uint32_t parity_faster_xor( uint32_t input ) {
  input = input ^ ( input >> 16 );
  input = input ^ ( input >> 8 );
  return lut[ input & 0xFF ];
}
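As a middle ground between the 256-entry table and the impractical 4GB one, here's a sketch of the "different sized look-up tables" exercise: a 65,536-entry table that folds 32 bits down to 16 with a single shift/XOR, then finishes with one lookup. The `lut16` names are mine, not part of the original files, and whether the bigger table actually wins is exactly the kind of thing the author says to measure, since it depends on your data cache.

```c
#include <stdint.h>

/* A 64K-entry parity table: one shift/XOR folds 32 bits to 16, then a
** single lookup finishes the job. Bigger table, fewer operations; the
** trade-off depends entirely on cache behaviour, so time it. */
static uint8_t lut16[ 65536 ];

static void lut16_init( void ) {
  for( uint32_t i = 0; i < 65536; i++ ) {
    uint32_t p = i;
    p ^= p >> 8;
    p ^= p >> 4;
    p ^= p >> 2;
    p ^= p >> 1;
    lut16[ i ] = (uint8_t)( p & 1 );
  }
}

static uint32_t parity_lut16( uint32_t input ) {
  input = input ^ ( input >> 16 );
  return lut16[ input & 0xFFFF ];
}
```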
/*
**
** After digging around, I found this implementation at Stephan Brumme's
** site at:
**
** http://bits.stephan-brumme.com/parity.html
**
** It's pretty cool. It does the shift and XOR trick multiple times to
** get down to a 4 bit value. Then it shifts a specially constructed 16
** bit constant by that many bits and as if by magic, the least
** significant bit is the parity of the 32 bit value.
**
** The magic here comes from the construction of the special
** constant. Remember that the shift and XOR trick takes a n-bit value
** and produces a n/2-bit value with the same parity. Brumme's
** implementation does effectively the same thing as the lookup table in
** the parity_lut implementation above, but it uses the 16 bit constant
** as the look-up table. Each of the bits in the special constant encodes
** the parity of its index. Here's the table from Brumme's web
** page redrawn:
**
** Index Parity
** 0 0000 0 --------------------------------------+
** 1 0001 1 ------------------------------------+ |
** 2 0010 1 ----------------------------------+ | |
** 3 0011 0 --------------------------------+ | | |
** | | | |
** 4 0100 1 ----------------------------+ | | | |
** 5 0101 0 --------------------------+ | | | | |
** 6 0110 0 ------------------------+ | | | | | |
** 7 0111 1 ----------------------+ | | | | | | |
** | | | | | | | |
** 8 1000 1 ------------------+ | | | | | | | |
** 9 1001 0 ----------------+ | | | | | | | | |
** A 1010 0 --------------+ | | | | | | | | | |
** B 1011 1 ------------+ | | | | | | | | | | |
** | | | | | | | | | | | |
** C 1100 0 --------+ | | | | | | | | | | | |
** D 1101 1 ------+ | | | | | | | | | | | | |
** E 1110 1 ----+ | | | | | | | | | | | | | |
** F 1111 0 --+ | | | | | | | | | | | | | | |
** | | | | | | | | | | | | | | | |
** v v v v v v v v v v v v v v v v
** Magic Constant -> 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 0
** Magic Constant (in Hex) -> 0x6996
**
** I don't know if Stephan devised this shift look-up table scheme
** himself or if he's copying it from someone else, but it's definitely
** clever. I'm not sure if it's going to be faster than the look-up table
** used in parity_faster_xor() on a modern CPU, but if you're on a CPU
** with a fast shifter and a (very) small data cache, I wouldn't be
** surprised if it *was* faster.
*/
uint32_t parity_shift_lut( uint32_t input ) {
  input = input ^ ( input >> 16 );
  input = input ^ ( input >> 8 );
  input = input ^ ( input >> 4 );
  input = input & 0xF;
  return ( 0x6996 >> input ) & 1;
}
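As a quick sanity check (my own helper, not in the original files) that the magic constant really is a 16-entry look-up table: bit n of 0x6996 should equal the parity of n for every n from 0 to 15.

```c
#include <stdint.h>

/* Return 1 if bit n of 0x6996 equals the parity of n for n = 0..15,
** which is the property that makes the shift trick work. */
static int magic_constant_is_parity_table( void ) {
  for( uint32_t n = 0; n < 16; n++ ) {
    uint32_t p = n;
    p ^= p >> 2;      /* fold the nybble's four bits... */
    p ^= p >> 1;      /* ...down to one parity bit      */
    if( ( ( 0x6996 >> n ) & 1 ) != ( p & 1 ) ) {
      return 0;
    }
  }
  return 1;
}
```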
/*
** Here's another algorithm on Brumme's web page, though I think I saw
** this one on the USENET back in the day. Stephan's description of the
** algorithm is pretty good, so you may want to look at it as well:
**
** http://bits.stephan-brumme.com/parity.html
**
** This algorithm starts by calculating the parity of the eight four-bit
** nybbles in the thirty-two bit input value, depositing a zero or a one
** into that nybble. For example, if you started with the value
** 0xEBBF6632, after the first half of the algorithm you would have an
** intermediary value of 0x11100001.
**
** Next, it multiplies this value by 0x11111111 (note: that's a hexadecimal
** constant, not a binary constant.) You can see why this is interesting if
** you think about how we performed multiplication in grade school:
**
**             ABCDEFGH
**           x 11111111
**           ----------
**             ABCDEFGH
**            ABCDEFGH
**           ABCDEFGH
**          ABCDEFGH
**         ABCDEFGH
**        ABCDEFGH
**       ABCDEFGH
**    + ABCDEFGH
**      ---------------
**      *******X*******
**             ^
**             |
**             +--------- (A + B + C + D + E + F + G + H)
**
** The digit in the product highlighted above is going to be the sum of
** each of the digits in the intermediate value we calculated in the
** previous step. As it turns out, the low bit of the sum of these
** intermediary parities is the parity of the whole thirty-two bit
** value. And because we constrained each of the hex digits of the
** intermediary value to a one or a zero, we don't have to worry about
** previous digits carrying over into this middle digit of the product.
**
** So now the parity is in bit 28 of the resulting product. All we need
** do now is shift and use the AND operation to eliminate any extraneous
** data in the higher order bits.
**
** In the limited testing I was able to do, I found this implementation
** to be just slightly slower than the XOR/look-up hybrid in
** parity_faster_xor() on an Intel i5 CPU running (*shudder*) Windows
** 10. But it's close enough to warrant a little extra investigation. If
** you're using a CPU with a (very) small data cache and a fast multiply,
** I bet this implementation would be faster. But, as always, run some
** tests to be sure.
*/
uint32_t parity_and_mult( uint32_t input ) {
  input = input ^ ( input >> 1 );
  input = input ^ ( input >> 2 );
  input = input & 0x11111111;
  input = input * 0x11111111;
  return ( input >> 28 ) & 1;
}
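To convince yourself the multiply trick really works, here's a self-contained sketch (the `_sketch` and `parity_by_counting` names are mine; the logic is a copy of parity_and_mult above) that can be cross-checked against a plain bit count, including on the worked example 0xEBBF6632 from the comment.

```c
#include <stdint.h>

/* Copy of the shift/AND/multiply algorithm, self-contained for testing. */
static uint32_t parity_and_mult_sketch( uint32_t input ) {
  input = input ^ ( input >> 1 );
  input = input ^ ( input >> 2 );
  input = input & 0x11111111;   /* one parity bit per nybble          */
  input = input * 0x11111111;   /* sum the nybble parities...         */
  return ( input >> 28 ) & 1;   /* ...and read the sum's low bit      */
}

/* Plain bit-counting reference to check against. */
static uint32_t parity_by_counting( uint32_t input ) {
  uint32_t count = 0;
  while( input ) {
    count += input & 1;
    input >>= 1;
  }
  return count & 1;
}
```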
/* parity.h
**
** Copyright (c) 2018, Meadhbh S. Hamrick
** Released under a BSD Open Source License, see parity_test.c for details
*/
#ifndef _H_PARITY
#define _H_PARITY
#include <stdint.h>
uint32_t parity_naive( uint32_t input );
uint32_t parity_xor( uint32_t input );
uint32_t parity_unrolled( uint32_t input );
uint32_t parity_better_unrolled( uint32_t input );
uint32_t parity_lut( uint32_t input );
uint32_t parity_lut_bis( uint32_t input );
uint32_t parity_fast_xor( uint32_t input );
uint32_t parity_faster_xor( uint32_t input );
uint32_t parity_shift_lut( uint32_t input );
uint32_t parity_and_mult( uint32_t input );
#ifdef _CUSTOM
uint32_t parity_custom( uint32_t input );
#endif
#endif
# parity_custom.s
#
# Copyright (c) 2018 Meadhbh S. Hamrick, All Rights Reserved
# Released under a BSD Open Source License. See parity_test.c for license
# text.
#
# This is a very bad implementation of a custom assembly parity algorithm.
# Hopefully you recognize this as Intel assembly. My system seems to have
# come with v2.26 of the Gnu Assembler (GAS). You can find more information
# about it at:
#
# https://sourceware.org/binutils/docs/as/index.html
#
# If you want to try your hand at writing some assembly, replace the
# extremely bad implementation below with an implementation of your own
# design and recompile thusly:
#
# gcc -c -o parity_custom.o parity_custom.s
# gcc -c -o parity.o -D_CUSTOM parity.c
# gcc -c -o custom_test.o -D_CUSTOM parity_test.c
# gcc -o custom_test custom_test.o parity.o parity_custom.o
#
# This will produce a version of the test program called custom_test which
# may be executed normally. Or... you can just type:
#
# make custom
#
.ident "Custom Assembly Implementation"
.file "parity_custom.c"
.text
.globl parity_custom
.type parity_custom, @function
parity_custom:
	.cfi_startproc
	xorl	%eax, %eax
	ret
	.cfi_endproc
/* parity_test.c
**
** Copyright (c) 2018, Meadhbh S. Hamrick
**
** Redistribution and use in source and binary forms, with or without
** modification, are permitted provided that the following conditions are
** met:
**
** 1. Redistributions of source code must retain the above copyright
** notice, this list of conditions and the following disclaimer.
**
** 2. Redistributions in binary form must reproduce the above copyright
** notice, this list of conditions and the following disclaimer in the
** documentation and/or other materials provided with the distribution.
**
** 3. Neither the name of the copyright holder nor the names of its
** contributors may be used to endorse or promote products derived from
** this software without specific prior written permission.
**
** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
** IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
** TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
** PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
** HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
** TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
** This is a quick bit of test code to verify a few fun conversations about
** how to most efficiently calculate the parity of a 32 bit integer. The
** implementation is broken into three files: parity_test.c, parity.c and
** parity.h. This file, parity_test.c, contains the main function which
** loops through a number of test fixtures and verifies our
** implementation(s) are working correctly. The actual implementations are
** defined in parity.c while parity.h defines the interface to the
** implementations in parity.c.
**
** What you're supposed to learn from this is how radically different a
** "smart" implementation can be. There's more detail in comments in
** parity.c, but what you *should* see is the naive implementation is
** plenty fast, but as we make successive optimizations, we get impressive
** improvements in speed.
**
** To compile this program, type `make test` on the command line, or do the
** following:
**
** gcc -c -o parity.o parity.c
** gcc -c -o parity_test.o parity_test.c
** gcc -o parity_test parity_test.o parity.o
**
** It might be fun to also add optimization flags like -O3, but the
** implementations are pretty simple, so the default optimization(s) may be
** pretty good to start with. But... I don't know. Give it a try. See what
** happens. I'm not too proud to say I've been pleasantly surprised by
** compilers' ability to optimize code in the last couple of decades.
**
** If you want to see what happens when you turn on aggressive optimization,
** try building with this command:
**
** CFLAGS=-O3 make test && ./parity_test
**
** This program runs several self-tests on the implementations defined in
** parity.c. You SHOULD see a test value printed on a line followed by the
** values returned by different implementations. Something's seriously
** wrong if the different implementations don't return the same value.
**
** Next, the test program runs each of the implementations _COUNT_TIME
** times and measures how long it takes. Pedants will probably point out
** that we're including the time it takes to CALL the implementation in
** the timing data. They're right, we are. But the goal is to show
** ball-park figures for relative performance, not to collect absolute
** timing data. (After all, this code uses Unix standard library calls to
** get time-of-day data, and it's highly likely you're running it on a
** machine that's doing all sorts of other tasks in the background. If
** you get weird results, try running the program a couple of times and
** make sure no other CPU-intensive processes are running.)
**
** Another fun thing to try is to compile the parity.c file down to
** assembly language with this command:
**
** gcc -S -c -o parity.s parity.c
**
** This requires you to understand assembly language for your platform,
** but if you do, it's pretty revealing to see what the compiler is up
** to. FWIW, the reason this test is split into multiple files is so it's
** easy to compile just the parity implementations to assembly.
**
** Anyway, enjoy. Also, don't forget to read the comments in parity.c.
**
** And hella-props to Palmer Dabbelt at SiFive for pointing out to me how
** many ways there are to calculate parity.
*/
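The real implementations live in parity.c; as a rough sketch of the two ends of the spectrum this harness exercises, a naive bit-by-bit loop and an XOR fold might look like this (hypothetical names — not the ones parity.h actually exports):

```c
#include <stdint.h>

/* Naive: examine all 32 bits one at a time. */
static uint32_t sketch_parity_naive( uint32_t x ) {
    uint32_t i, p = 0;
    for( i = 0; i < 32; i++ ) {
        p ^= ( x >> i ) & 1;
    }
    return p;
}

/* XOR fold: halve the problem five times; the parity of the whole
** word ends up in the low bit. */
static uint32_t sketch_parity_xor( uint32_t x ) {
    x ^= x >> 16;
    x ^= x >> 8;
    x ^= x >> 4;
    x ^= x >> 2;
    x ^= x >> 1;
    return x & 1;
}
```

Both take and return a uint32_t, matching the function-pointer type used in the implementations table below.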
#ifndef _COUNT_TIME
#define _COUNT_TIME 1048576
#endif
#ifndef _CUSTOM
#define _COUNT_IMPL 10
#else
#define _COUNT_IMPL 11
#endif
#define _COUNT_TEST 10
#include "parity.h"
#include <stdio.h>
#include <sys/time.h>
typedef struct {
uint32_t test;
uint32_t count;
} tFixture;
typedef struct {
  const char * name;
uint32_t (*func)( uint32_t );
} tImplementation;
static tFixture fixtures[ _COUNT_TEST ] = {
{0x00000000, 0},
{0x00000001, 1},
{0x00000002, 1},
{0x00000004, 1},
{0x00000008, 1},
{0x00000011, 0},
{0xAAAAAAAA, 0},
{0x55555555, 0},
{0xA5A5A5A4, 1},
{0x3C3CC3C3, 0}
};
static tImplementation implementations[ _COUNT_IMPL ] = {
{"Naive", parity_naive}
,{"XOR", parity_xor}
,{"Unrolled", parity_unrolled}
,{"Better Unrolled", parity_better_unrolled}
,{"Lookup Table", parity_lut}
,{"Lookup Table (bis)", parity_lut_bis}
,{"Fast XOR", parity_fast_xor}
,{"Faster XOR", parity_faster_xor}
,{"Shift Lookup", parity_shift_lut}
,{"AND and Multiply", parity_and_mult}
#ifdef _CUSTOM
,{"Custom Assembly", parity_custom}
#endif
};
void list_implementations( void ) {
int i;
printf( "I know of the following implementations: \n" );
for( i = 0; i < _COUNT_IMPL; i++ ) {
printf( " %02d - %-20s\n", i + 1, implementations[ i ].name );
}
printf( "\n" );
}
void test_implementations( void ) {
int i, j;
printf( "Testing %d implementations\n", _COUNT_IMPL );
printf( " ## Fixture ^" );
for( i = 0; i < _COUNT_IMPL; i++ ) {
printf( " %02d", i + 1);
}
printf( "\n" );
for( i = 0; i < _COUNT_TEST; i++ ) {
printf( " %02d %08X %1d", i + 1, fixtures[i].test, fixtures[i].count );
for( j = 0; j < _COUNT_IMPL; j++ ) {
printf( " %2d", (*implementations[j].func)( fixtures[i].test ) );
}
printf( "\n" );
}
printf( "\n" );
}
uint64_t gen_timestamp( void ) {
struct timeval time_data;
gettimeofday( & time_data, NULL );
  /* Cast before multiplying: on a platform with a 32 bit time_t, the
  ** product would overflow before the conversion to uint64_t. */
  return (uint64_t) time_data.tv_sec * 1000000 + (uint64_t) time_data.tv_usec;
}
/* Currently unused helper: returns how many times faster `extent` is
** than `base` (i.e. the ratio base / extent). */
double _diff( uint64_t extent, uint64_t base ) {
  return (double) base / (double) extent;
}
void time_implementations( void ) {
int i, j;
uint64_t naive, current, begin, end;
uint32_t (* impl )( uint32_t );
printf( "Timing %d implementations by calling each %d times\n",
_COUNT_IMPL, _COUNT_TIME );
printf( " ## Implementation usec times faster than Naive\n" );
for( i = 0; i < _COUNT_IMPL; i++ ) {
impl = implementations[i].func;
begin = gen_timestamp();
for( j = 0; j < _COUNT_TIME; j++ ) {
(*impl)( j );
}
end = gen_timestamp();
if( 0 == i ) {
naive = end - begin;
      printf( " %02d %-20s %07llu\n", i + 1, implementations[i].name,
              (unsigned long long) naive );
} else {
current = end - begin;
      printf( " %02d %-20s %07llu %5.1f\n", i + 1,
              implementations[i].name, (unsigned long long) current,
              (double) naive / (double) current );
}
}
printf( "\n" );
}
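One more caveat about the timing loop above: the return value of each call is discarded. Because the implementations live in a separate translation unit (parity.o), the compiler can't prove the calls are pure and delete the loop — but with link-time optimization or a single-file build it could. Folding the results into a volatile accumulator is a cheap way to keep the loop honest. A sketch (parity_sketch here is a stand-in for any of the real implementations, not a function from parity.c):

```c
#include <stdint.h>

/* Stand-in for one of the implementations in parity.c. */
static uint32_t parity_sketch( uint32_t x ) {
    x ^= x >> 16; x ^= x >> 8; x ^= x >> 4; x ^= x >> 2; x ^= x >> 1;
    return x & 1;
}

/* XOR every result into a volatile sink; the stores to a volatile
** object can't be elided, so the calls must actually happen. */
static uint32_t time_with_sink( uint32_t count ) {
    uint32_t j;
    volatile uint32_t sink = 0;
    for( j = 0; j < count; j++ ) {
        sink ^= parity_sketch( j );
    }
    return sink;
}
```

Printing (or returning) the sink afterwards guarantees the work survives even aggressive optimization.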
int main( int argc, char * argv [] ) {
  (void) argc;
  (void) argv;
  list_implementations();
  test_implementations();
  time_implementations();
  return 0;
}