ned14/hamming_test.cpp

## hamming_test.cpp
/* hamming_test.cpp
Test generic hamming coding routines (SECDEC). These can fix 1 bit errors and detect two bit errors.
(C) 2015 Niall Douglas http://www.nedprod.com

g++ -std=c++11 -o hamming_test -O3 -fverbose-asm -Wa,-adhln hamming_test.cpp -march=ivybridge > hamming_test.S

On my 3.9Ghz Ivy Bridge Intel 3770K:

For no intrinsics:
Calculating 4Kb: 12.06Mb/sec
     Fixing 4Kb: 14.98Mb/sec

With intrinsics (bitscan for set bits):
Calculating 4Kb: 73.78Mb/sec
     Fixing 4Kb: 64.15Mb/sec

With tables (bitscan for set bits):
Calculating 4Kb: 82.41Mb/sec
     Fixing 4Kb: 72.58Mb/sec

With prefetch of packed tables and slice by 8 with fixed bit masking:
Calculating 4Kb: 168.68Mb/sec (22 corrected cycles/byte)
     Fixing 4Kb: 110.97Mb/sec

With prefetch of packed tables and slice by 8 with fixed bit masking and OpenMP:
Calculating 4Kb: 683Mb/sec
     Fixing 4Kb: 383Mb/sec
*/

#include <atomic>
#include <iostream>
#include <random>
#include <vector>
#include <chrono>
#include <stdexcept>
#include <string.h>

#define USE_INTRINSICS

/*! \class secdec_ecc
\brief Calculates the single error correcting double error detecting (SECDEC) Hamming Error Correcting Code for a \em blocksize block of bytes.

After construction, no state is modified and therefore this class is safe for static storage.
*/
template<size_t blocksize> class secdec_ecc
{
public:
  typedef unsigned int result_type; //!< The largest ECC which can be calculated
private:
  static constexpr size_t bits_per_byte=8;
  typedef unsigned char unit_type;  // The batch unit of processing
  result_type bitsvalid;
  // Many CPUs (x86) are slow doing variable bit shifts, so keep a table
  result_type ecc_twospowers[sizeof(result_type)*bits_per_byte];
  unsigned short ecc_table[blocksize*bits_per_byte];
  static bool _is_single_bit_set(unsigned long long x)
  {
#ifdef USE_INTRINSICS
    return __builtin_popcountl(x)==1;
#else
    x -= (x >> 1) & 0x5555555555555555ULL;
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
    unsigned long long count=(x * 0x0101010101010101ULL)>>56;
    return count==1;
#endif
  }
public:
  //! Constructs an instance, configuring the necessary lookup tables
  secdec_ecc()
  {
    for(size_t n=0; n<sizeof(result_type)*bits_per_byte; n++)
      ecc_twospowers[n]=((result_type)1<<n);
    result_type length=blocksize*8;
    // This is (data bits + parity bits + 1) <= 2^(parity bits)
    for(result_type p=1; p<sizeof(result_type)*bits_per_byte; p++)
      if((length+p+1)<=ecc_twospowers[p])
      {
        bitsvalid=p;
        break;
      }
    if((7+bitsvalid)/8>sizeof(result_type))
      throw std::runtime_error("ECC would exceed the size of result_type!");
    for(result_type i=0; i<blocksize*bits_per_byte; i++)
    {
      // Make a code bit
      result_type b=i+1;
#ifdef USE_INTRINSICS
      result_type topbit=8*sizeof(result_type)-__builtin_clz(b);
      b+=topbit;
      if(b>=ecc_twospowers[topbit]) b++;
      //while(b>ecc_twospowers(_topbit+1)) _topbit++;
      //b+=_topbit;
      //if(b>=ecc_twospowers(_topbit)) b++;
#else
      for(size_t p=0; ecc_twospowers[p]<(b+1); p++)
        b++;
#endif
      ecc_table[i]=b;
      if(b>(unsigned short)-1)
        throw std::runtime_error("Precalculated table has exceeded its bounds");
    }
  }
  //! The number of bits valid in result_type
  size_t result_bits_valid() const noexcept
  {
    return bitsvalid;
  }
  //! Accumulate ECC from buffer
  result_type operator()(result_type ecc, const char *buffer) const noexcept
  {
#if 1
    // Process in lumps of eight
    const unit_type *_buffer=(const unit_type *) buffer;
#pragma omp parallel for reduction(^:ecc)
    for(size_t i=0; i<blocksize; i+=sizeof(unit_type)*8)
    {
      union { unsigned long long v; unit_type c[8]; };
      result_type prefetch[8];
      v=*(unsigned long long *)(&_buffer[0+i/sizeof(unit_type)]); // min 1 cycle
#define ROUND(n) \
        prefetch[0]=ecc_table[(i+0)*8+n]; \
        prefetch[1]=ecc_table[(i+1)*8+n]; \
        prefetch[2]=ecc_table[(i+2)*8+n]; \
        prefetch[3]=ecc_table[(i+3)*8+n]; \
        prefetch[4]=ecc_table[(i+4)*8+n]; \
        prefetch[5]=ecc_table[(i+5)*8+n]; \
        prefetch[6]=ecc_table[(i+6)*8+n]; \
        prefetch[7]=ecc_table[(i+7)*8+n]; \
        if(c[0]&((unit_type)1<<n))\
          ecc^=prefetch[0];\
        if(c[1]&((unit_type)1<<n))\
          ecc^=prefetch[1];\
        if(c[2]&((unit_type)1<<n))\
          ecc^=prefetch[2];\
        if(c[3]&((unit_type)1<<n))\
          ecc^=prefetch[3];\
        if(c[4]&((unit_type)1<<n))\
          ecc^=prefetch[4];\
        if(c[5]&((unit_type)1<<n))\
          ecc^=prefetch[5];\
        if(c[6]&((unit_type)1<<n))\
          ecc^=prefetch[6];\
        if(c[7]&((unit_type)1<<n))\
          ecc^=prefetch[7];
      ROUND(0)                                                    // prefetch = min 8, bit test and xor = min 16, total = 24
      ROUND(1)
      ROUND(2)
      ROUND(3)
      ROUND(4)
      ROUND(5)
      ROUND(6)
      ROUND(7)
#undef ROUND                                                      // total should be 1+(8*24/3)=65
    }
    return ecc;
#else
    const unit_type *_buffer=(const unit_type *) buffer;
#pragma omp parallel for reduction(^:ecc)
    for(size_t i=0; i<blocksize; i+=sizeof(unit_type))
    {
      unit_type c=_buffer[i/sizeof(unit_type)];
      if(!c)
        continue;
      result_type prefetch[bits_per_byte*sizeof(unit_type)];
//      for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)
//          prefetch[n]=ecc_table[i*8+n];
#define ROUND(n) \
          prefetch[n]=ecc_table[i*8+n];
      ROUND(0)
      ROUND(1)
      ROUND(2)
      ROUND(3)
      ROUND(4)
      ROUND(5)
      ROUND(6)
      ROUND(7)
#undef ROUND
//      for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)
//      {
//        if(c&((unit_type)1<<n))
//          ecc^=prefetch[n];
//      }
#define ROUND(n) \
        if(c&((unit_type)1<<n))\
          ecc^=prefetch[n];
      ROUND(0)
      ROUND(1)
      ROUND(2)
      ROUND(3)
      ROUND(4)
      ROUND(5)
      ROUND(6)
      ROUND(7)
#undef ROUND
    }
    return ecc;
#endif
  }
  result_type operator()(const char *buffer) const noexcept { return (*this)(0, buffer); }
  //! Accumulate ECC from partial buffer where \em length <= \em blocksize
  result_type operator()(result_type ecc, const char *buffer, size_t length) const noexcept
  {
    const unit_type *_buffer=(const unit_type *) buffer;
#pragma omp parallel for reduction(^:ecc)
    for(size_t i=0; i<length; i+=sizeof(unit_type))
    {
      unit_type c=_buffer[i/sizeof(unit_type)];                 // min 1 cycle
      if(!c)                                                    // min 1 cycle
        continue;
      char bitset[bits_per_byte*sizeof(unit_type)];
      // Most compilers will roll this out
      for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)   // min 16 cycles
        bitset[n]=!!(c&((unit_type)1<<n));
      result_type localecc=0;
      for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)
      {
        if(bitset[n])                                           // min 8 cycles
          localecc^=ecc_table[i*8+n];                           // min 16 cycles
      }
      ecc^=localecc;                                            // min 1 cycle. Total cycles = min 43 cycles/byte
    }
    return ecc;
  }
  result_type operator()(const char *buffer, size_t length) const noexcept { return (*this)(0, buffer, length); }
  //! Given the original ECC and the new ECC for a buffer, find the bad bit. Return (size_t)-1 if not found (e.g. ECC corrupt)
  size_t find_bad_bit(result_type good_ecc, result_type bad_ecc) const noexcept
  {
    result_type length=blocksize*8, eccdiff=good_ecc^bad_ecc;
    if(_is_single_bit_set(eccdiff))
      return (size_t)-1;
    for(result_type i=0, b=1; i<length; i++, b++)
    {
      // Skip parity bits
      while(_is_single_bit_set(b))
        b++;
      if(b==eccdiff)
        return i;
    }
    return (size_t)-1;
  }
  //! Verifies and heals when possible a buffer, returning non zero if the buffer is error free
  int verify(char *buffer, result_type good_ecc) const noexcept
  {
    result_type this_ecc=(*this)(0, buffer);
    if(this_ecc==good_ecc)
      return 1; // no errors
    size_t badbit=find_bad_bit(good_ecc, this_ecc);
    if((size_t)-1==badbit)
      return 0; // parity corrupt?
    buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
    this_ecc=(*this)(0, buffer);
    if(this_ecc==good_ecc)
      return 2; // error healed
    // Put the bit back
    buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
    return 0; // more than one bit was corrupt
  }
  //! Verifies and heals when possible a partial buffer, returning non zero if the buffer is error free
  int verify(char *buffer, size_t length, result_type good_ecc) const noexcept
  {
    result_type this_ecc=(*this)(0, buffer, length);
    if(this_ecc==good_ecc)
      return 1; // no errors
    size_t badbit=find_bad_bit(good_ecc, this_ecc);
    if((size_t)-1==badbit)
      return 0; // parity corrupt?
    buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
    this_ecc=(*this)(0, buffer, length);
    if(this_ecc==good_ecc)
      return 2; // error healed
    // Put the bit back
    buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
    return 0; // more than one bit was corrupt
  }
};

int main(int argc, const char *argv[])
{
  int errcode=0;
  const size_t bytes=4096;
  std::vector<char> buffer(bytes);
  {
    std::random_device r;
    std::random_device::result_type *b=(std::random_device::result_type *) buffer.data();
    for(size_t n=0; n<buffer.size()/sizeof(*b); n++)
      b[n]=r();
  }
  secdec_ecc<4096> engine;
  size_t eccbits=engine.result_bits_valid();
  std::cout << "ECC will be " << eccbits << " bits long" << std::endl;
  size_t ecc=engine(buffer.data());
  std::cout << "ECC was calculated to be " << std::hex << ecc << std::dec << std::endl;

  auto end=std::chrono::high_resolution_clock::now(), begin=std::chrono::high_resolution_clock::now();
  unsigned long long _begin=__rdtsc();
  auto diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
#if 0
  do
  {
    end=std::chrono::high_resolution_clock::now();
  } while(std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin).count()<1);
  unsigned long long _end=__rdtsc();
  std::cout << "There are " << (_end-_begin) << " TSCs in 1 second." << std::endl;
#endif

  std::cout << "Flipping every bit in the buffer to see if it is correctly detected ..." << std::endl;
  begin=std::chrono::high_resolution_clock::now();
  for(size_t toflip=0; toflip<bytes*8; toflip++)
  {
    buffer[toflip/8]^=((size_t)1<<(toflip%8));
    size_t newecc=engine(buffer.data());
    if(ecc==newecc)
    {
      std::cerr << "ERROR: Flipping bit " << toflip << " not detected!" << std::endl;
      errcode=2;
    }
    else
    {
      size_t badbit=engine.find_bad_bit(ecc, newecc);
      if(badbit!=toflip)
      {
        std::cerr << "ERROR: Bad bit " << badbit << " is not the bit " << toflip << " we flipped!" << std::endl;
        errcode=2;
      }
//      else
//        std::cout << "SUCCESS: Bit flip " << toflip << " correctly detected" << std::endl;
    }
    if(2!=engine.verify(buffer.data(), ecc))
    {
      std::cerr << "ERROR: verify() did not heal the buffer!" << std::endl;
      errcode=2;
    }
  }
  end=std::chrono::high_resolution_clock::now();
  diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
  std::cout << "Checking and fixing is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl;

  std::cout << "\nFlipping two bits in the buffer to see if it is correctly detected ..." << std::endl;
  buffer[0]^=1;
  begin=std::chrono::high_resolution_clock::now();
  for(size_t toflip=1; toflip<bytes*8; toflip++)
  {
    buffer[toflip/8]^=((size_t)1<<(toflip%8));
    size_t newecc=engine(buffer.data());
    if(ecc==newecc)
    {
      std::cerr << "ERROR: Flipping bits 0 and " << toflip << " not detected!" << std::endl;
      errcode=3;
    }
//    else
//    {
//      std::cout << "SUCCESS: Flipping bits 0 and " << toflip << " correctly detected" << std::endl;
//    }
//    if(0!=engine.verify(buffer.data(), ecc))
//    {
//      std::cerr << "ERROR: verify() thought it healed the buffer!" << std::endl;
//      errcode=2;
//    }
    buffer[toflip/8]^=((size_t)1<<(toflip%8));
  }
  end=std::chrono::high_resolution_clock::now();
  diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
  std::cout << "Calculating is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl;

  std::cout << "\nCalculating speeds ..." << std::endl;
  size_t foo=0;
  begin=std::chrono::high_resolution_clock::now();
  _begin=__rdtsc();
  for(size_t n=0; n<10000; n++)
  {
    buffer[0]=(char)n;
    foo+=engine(buffer.data());
  }
  _end=__rdtsc();
  end=std::chrono::high_resolution_clock::now();
  diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
  if(foo)
    std::cout << "Calculating is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec, or " << ((_end-_begin)/10000.0/4096) << " cycles/byte" << std::endl;
  foo=0;
  begin=std::chrono::high_resolution_clock::now();
  for(size_t n=0; n<10000; n++)
  {
    buffer[0]=(char)n;
    foo+=engine.verify(buffer.data(), ecc);
  }
  end=std::chrono::high_resolution_clock::now();
  diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
  if(foo)
    std::cout << "Checking and fixing is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl;
  return errcode;
}
	/* hamming_test.cpp
	Test generic hamming coding routines (SECDEC). These can fix 1 bit errors and detect two bit errors.
	(C) 2015 Niall Douglas http://www.nedprod.com

	g++ -std=c++11 -o hamming_test -O3 -fverbose-asm -Wa,-adhln hamming_test.cpp -march=ivybridge > hamming_test.S

	On my 3.9Ghz Ivy Bridge Intel 3770K:

	For no intrinsics:
	Calculating 4Kb: 12.06Mb/sec
	Fixing 4Kb: 14.98Mb/sec

	With intrinsics (bitscan for set bits):
	Calculating 4Kb: 73.78Mb/sec
	Fixing 4Kb: 64.15Mb/sec

	With tables (bitscan for set bits):
	Calculating 4Kb: 82.41Mb/sec
	Fixing 4Kb: 72.58Mb/sec

	With prefetch of packed tables and slice by 8 with fixed bit masking:
	Calculating 4Kb: 168.68Mb/sec (22 corrected cycles/byte)
	Fixing 4Kb: 110.97Mb/sec

	With prefetch of packed tables and slice by 8 with fixed bit masking and OpenMP:
	Calculating 4Kb: 683Mb/sec
	Fixing 4Kb: 383Mb/sec
	*/

	#include <atomic>
	#include <iostream>
	#include <random>
	#include <vector>
	#include <chrono>
	#include <stdexcept>
	#include <string.h>

	#define USE_INTRINSICS

	/*! \class secdec_ecc
	\brief Calculates the single error correcting double error detecting (SECDEC) Hamming Error Correcting Code for a \em blocksize block of bytes.

	After construction, no state is modified and therefore this class is safe for static storage.
	*/
	template<size_t blocksize> class secdec_ecc
	{
	public:
	typedef unsigned int result_type; //!< The largest ECC which can be calculated
	private:
	static constexpr size_t bits_per_byte=8;
	typedef unsigned char unit_type; // The batch unit of processing
	result_type bitsvalid;
	// Many CPUs (x86) are slow doing variable bit shifts, so keep a table
	result_type ecc_twospowers[sizeof(result_type)*bits_per_byte];
	unsigned short ecc_table[blocksize*bits_per_byte];
	static bool _is_single_bit_set(unsigned long long x)
	{
	#ifdef USE_INTRINSICS
	return __builtin_popcountl(x)==1;
	#else
	x -= (x >> 1) & 0x5555555555555555ULL;
	x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
	x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
	unsigned long long count=(x * 0x0101010101010101ULL)>>56;
	return count==1;
	#endif
	}
	public:
	//! Constructs an instance, configuring the necessary lookup tables
	secdec_ecc()
	{
	for(size_t n=0; n<sizeof(result_type)*bits_per_byte; n++)
	ecc_twospowers[n]=((result_type)1<<n);
	result_type length=blocksize*8;
	// This is (data bits + parity bits + 1) <= 2^(parity bits)
	for(result_type p=1; p<sizeof(result_type)*bits_per_byte; p++)
	if((length+p+1)<=ecc_twospowers[p])
	{
	bitsvalid=p;
	break;
	}
	if((7+bitsvalid)/8>sizeof(result_type))
	throw std::runtime_error("ECC would exceed the size of result_type!");
	for(result_type i=0; i<blocksize*bits_per_byte; i++)
	{
	// Make a code bit
	result_type b=i+1;
	#ifdef USE_INTRINSICS
	result_type topbit=8*sizeof(result_type)-__builtin_clz(b);
	b+=topbit;
	if(b>=ecc_twospowers[topbit]) b++;
	//while(b>ecc_twospowers(_topbit+1)) _topbit++;
	//b+=_topbit;
	//if(b>=ecc_twospowers(_topbit)) b++;
	#else
	for(size_t p=0; ecc_twospowers[p]<(b+1); p++)
	b++;
	#endif
	ecc_table[i]=b;
	if(b>(unsigned short)-1)
	throw std::runtime_error("Precalculated table has exceeded its bounds");
	}
	}
	//! The number of bits valid in result_type
	size_t result_bits_valid() const noexcept
	{
	return bitsvalid;
	}
	//! Accumulate ECC from buffer
	result_type operator()(result_type ecc, const char *buffer) const noexcept
	{
	#if 1
	// Process in lumps of eight
	const unit_type _buffer=(const unit_type ) buffer;
	#pragma omp parallel for reduction(^:ecc)
	for(size_t i=0; i<blocksize; i+=sizeof(unit_type)*8)
	{
	union { unsigned long long v; unit_type c[8]; };
	result_type prefetch[8];
	v=(unsigned long long )(&_buffer[0+i/sizeof(unit_type)]); // min 1 cycle
	#define ROUND(n) \
	prefetch[0]=ecc_table[(i+0)*8+n]; \
	prefetch[1]=ecc_table[(i+1)*8+n]; \
	prefetch[2]=ecc_table[(i+2)*8+n]; \
	prefetch[3]=ecc_table[(i+3)*8+n]; \
	prefetch[4]=ecc_table[(i+4)*8+n]; \
	prefetch[5]=ecc_table[(i+5)*8+n]; \
	prefetch[6]=ecc_table[(i+6)*8+n]; \
	prefetch[7]=ecc_table[(i+7)*8+n]; \
	if(c[0]&((unit_type)1<<n))\
	ecc^=prefetch[0];\
	if(c[1]&((unit_type)1<<n))\
	ecc^=prefetch[1];\
	if(c[2]&((unit_type)1<<n))\
	ecc^=prefetch[2];\
	if(c[3]&((unit_type)1<<n))\
	ecc^=prefetch[3];\
	if(c[4]&((unit_type)1<<n))\
	ecc^=prefetch[4];\
	if(c[5]&((unit_type)1<<n))\
	ecc^=prefetch[5];\
	if(c[6]&((unit_type)1<<n))\
	ecc^=prefetch[6];\
	if(c[7]&((unit_type)1<<n))\
	ecc^=prefetch[7];
	ROUND(0) // prefetch = min 8, bit test and xor = min 16, total = 24
	ROUND(1)
	ROUND(2)
	ROUND(3)
	ROUND(4)
	ROUND(5)
	ROUND(6)
	ROUND(7)
	#undef ROUND // total should be 1+(8*24/3)=65
	}
	return ecc;
	#else
	const unit_type _buffer=(const unit_type ) buffer;
	#pragma omp parallel for reduction(^:ecc)
	for(size_t i=0; i<blocksize; i+=sizeof(unit_type))
	{
	unit_type c=_buffer[i/sizeof(unit_type)];
	if(!c)
	continue;
	result_type prefetch[bits_per_byte*sizeof(unit_type)];
	// for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)
	// prefetch[n]=ecc_table[i*8+n];
	#define ROUND(n) \
	prefetch[n]=ecc_table[i*8+n];
	ROUND(0)
	ROUND(1)
	ROUND(2)
	ROUND(3)
	ROUND(4)
	ROUND(5)
	ROUND(6)
	ROUND(7)
	#undef ROUND
	// for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)
	// {
	// if(c&((unit_type)1<<n))
	// ecc^=prefetch[n];
	// }
	#define ROUND(n) \
	if(c&((unit_type)1<<n))\
	ecc^=prefetch[n];
	ROUND(0)
	ROUND(1)
	ROUND(2)
	ROUND(3)
	ROUND(4)
	ROUND(5)
	ROUND(6)
	ROUND(7)
	#undef ROUND
	}
	return ecc;
	#endif
	}
	result_type operator()(const char buffer) const noexcept { return (this)(0, buffer); }
	//! Accumulate ECC from partial buffer where \em length <= \em blocksize
	result_type operator()(result_type ecc, const char *buffer, size_t length) const noexcept
	{
	const unit_type _buffer=(const unit_type ) buffer;
	#pragma omp parallel for reduction(^:ecc)
	for(size_t i=0; i<length; i+=sizeof(unit_type))
	{
	unit_type c=_buffer[i/sizeof(unit_type)]; // min 1 cycle
	if(!c) // min 1 cycle
	continue;
	char bitset[bits_per_byte*sizeof(unit_type)];
	// Most compilers will roll this out
	for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++) // min 16 cycles
	bitset[n]=!!(c&((unit_type)1<<n));
	result_type localecc=0;
	for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++)
	{
	if(bitset[n]) // min 8 cycles
	localecc^=ecc_table[i*8+n]; // min 16 cycles
	}
	ecc^=localecc; // min 1 cycle. Total cycles = min 43 cycles/byte
	}
	return ecc;
	}
	result_type operator()(const char buffer, size_t length) const noexcept { return (this)(0, buffer, length); }
	//! Given the original ECC and the new ECC for a buffer, find the bad bit. Return (size_t)-1 if not found (e.g. ECC corrupt)
	size_t find_bad_bit(result_type good_ecc, result_type bad_ecc) const noexcept
	{
	result_type length=blocksize*8, eccdiff=good_ecc^bad_ecc;
	if(_is_single_bit_set(eccdiff))
	return (size_t)-1;
	for(result_type i=0, b=1; i<length; i++, b++)
	{
	// Skip parity bits
	while(_is_single_bit_set(b))
	b++;
	if(b==eccdiff)
	return i;
	}
	return (size_t)-1;
	}
	//! Verifies and heals when possible a buffer, returning non zero if the buffer is error free
	int verify(char *buffer, result_type good_ecc) const noexcept
	{
	result_type this_ecc=(*this)(0, buffer);
	if(this_ecc==good_ecc)
	return 1; // no errors
	size_t badbit=find_bad_bit(good_ecc, this_ecc);
	if((size_t)-1==badbit)
	return 0; // parity corrupt?
	buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
	this_ecc=(*this)(0, buffer);
	if(this_ecc==good_ecc)
	return 2; // error healed
	// Put the bit back
	buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
	return 0; // more than one bit was corrupt
	}
	//! Verifies and heals when possible a partial buffer, returning non zero if the buffer is error free
	int verify(char *buffer, size_t length, result_type good_ecc) const noexcept
	{
	result_type this_ecc=(*this)(0, buffer, length);
	if(this_ecc==good_ecc)
	return 1; // no errors
	size_t badbit=find_bad_bit(good_ecc, this_ecc);
	if((size_t)-1==badbit)
	return 0; // parity corrupt?
	buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
	this_ecc=(*this)(0, buffer, length);
	if(this_ecc==good_ecc)
	return 2; // error healed
	// Put the bit back
	buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8];
	return 0; // more than one bit was corrupt
	}
	};

	int main(int argc, const char *argv[])
	{
	int errcode=0;
	const size_t bytes=4096;
	std::vector<char> buffer(bytes);
	{
	std::random_device r;
	std::random_device::result_type b=(std::random_device::result_type ) buffer.data();
	for(size_t n=0; n<buffer.size()/sizeof(*b); n++)
	b[n]=r();
	}
	secdec_ecc<4096> engine;
	size_t eccbits=engine.result_bits_valid();
	std::cout << "ECC will be " << eccbits << " bits long" << std::endl;
	size_t ecc=engine(buffer.data());
	std::cout << "ECC was calculated to be " << std::hex << ecc << std::dec << std::endl;

	auto end=std::chrono::high_resolution_clock::now(), begin=std::chrono::high_resolution_clock::now();
	unsigned long long _begin=__rdtsc();
	auto diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
	#if 0
	do
	{
	end=std::chrono::high_resolution_clock::now();
	} while(std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin).count()<1);
	unsigned long long _end=__rdtsc();
	std::cout << "There are " << (_end-_begin) << " TSCs in 1 second." << std::endl;
	#endif

	std::cout << "Flipping every bit in the buffer to see if it is correctly detected ..." << std::endl;
	begin=std::chrono::high_resolution_clock::now();
	for(size_t toflip=0; toflip<bytes*8; toflip++)
	{
	buffer[toflip/8]^=((size_t)1<<(toflip%8));
	size_t newecc=engine(buffer.data());
	if(ecc==newecc)
	{
	std::cerr << "ERROR: Flipping bit " << toflip << " not detected!" << std::endl;
	errcode=2;
	}
	else
	{
	size_t badbit=engine.find_bad_bit(ecc, newecc);
	if(badbit!=toflip)
	{
	std::cerr << "ERROR: Bad bit " << badbit << " is not the bit " << toflip << " we flipped!" << std::endl;
	errcode=2;
	}
	// else
	// std::cout << "SUCCESS: Bit flip " << toflip << " correctly detected" << std::endl;
	}
	if(2!=engine.verify(buffer.data(), ecc))
	{
	std::cerr << "ERROR: verify() did not heal the buffer!" << std::endl;
	errcode=2;
	}
	}
	end=std::chrono::high_resolution_clock::now();
	diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
	std::cout << "Checking and fixing is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl;

	std::cout << "\nFlipping two bits in the buffer to see if it is correctly detected ..." << std::endl;
	buffer[0]^=1;
	begin=std::chrono::high_resolution_clock::now();
	for(size_t toflip=1; toflip<bytes*8; toflip++)
	{
	buffer[toflip/8]^=((size_t)1<<(toflip%8));
	size_t newecc=engine(buffer.data());
	if(ecc==newecc)
	{
	std::cerr << "ERROR: Flipping bits 0 and " << toflip << " not detected!" << std::endl;
	errcode=3;
	}
	// else
	// {
	// std::cout << "SUCCESS: Flipping bits 0 and " << toflip << " correctly detected" << std::endl;
	// }
	// if(0!=engine.verify(buffer.data(), ecc))
	// {
	// std::cerr << "ERROR: verify() thought it healed the buffer!" << std::endl;
	// errcode=2;
	// }
	buffer[toflip/8]^=((size_t)1<<(toflip%8));
	}
	end=std::chrono::high_resolution_clock::now();
	diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
	std::cout << "Calculating is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl;

	std::cout << "\nCalculating speeds ..." << std::endl;
	size_t foo=0;
	begin=std::chrono::high_resolution_clock::now();
	_begin=__rdtsc();
	for(size_t n=0; n<10000; n++)
	{
	buffer[0]=(char)n;
	foo+=engine(buffer.data());
	}
	_end=__rdtsc();
	end=std::chrono::high_resolution_clock::now();
	diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
	if(foo)
	std::cout << "Calculating is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec, or " << ((_end-_begin)/10000.0/4096) << " cycles/byte" << std::endl;
	foo=0;
	begin=std::chrono::high_resolution_clock::now();
	for(size_t n=0; n<10000; n++)
	{
	buffer[0]=(char)n;
	foo+=engine.verify(buffer.data(), ecc);
	}
	end=std::chrono::high_resolution_clock::now();
	diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin);
	if(foo)
	std::cout << "Checking and fixing is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl;
	return errcode;
	}