Last active
April 6, 2020 20:28
-
-
Save ned14/fe863db40482bc4d62bd to your computer and use it in GitHub Desktop.
More generalised C++ hamming code testing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* hamming_test.cpp | |
Test generic hamming coding routines (SECDEC). These can fix 1 bit errors and detect two bit errors. | |
(C) 2015 Niall Douglas http://www.nedprod.com | |
g++ -std=c++11 -o hamming_test -O3 -fverbose-asm -Wa,-adhln hamming_test.cpp -march=ivybridge > hamming_test.S | |
On my 3.9Ghz Ivy Bridge Intel 3770K: | |
For no intrinsics: | |
Calculating 4Kb: 12.06Mb/sec | |
Fixing 4Kb: 14.98Mb/sec | |
With intrinsics (bitscan for set bits): | |
Calculating 4Kb: 73.78Mb/sec | |
Fixing 4Kb: 64.15Mb/sec | |
With tables (bitscan for set bits): | |
Calculating 4Kb: 82.41Mb/sec | |
Fixing 4Kb: 72.58Mb/sec | |
With prefetch of packed tables and slice by 8 with fixed bit masking: | |
Calculating 4Kb: 168.68Mb/sec (22 corrected cycles/byte) | |
Fixing 4Kb: 110.97Mb/sec | |
With prefetch of packed tables and slice by 8 with fixed bit masking and OpenMP: | |
Calculating 4Kb: 683Mb/sec | |
Fixing 4Kb: 383Mb/sec | |
*/ | |
#include <atomic> | |
#include <iostream> | |
#include <random> | |
#include <vector> | |
#include <chrono> | |
#include <stdexcept> | |
#include <string.h> | |
#define USE_INTRINSICS | |
/*! \class secdec_ecc | |
\brief Calculates the single error correcting double error detecting (SECDEC) Hamming Error Correcting Code for a \em blocksize block of bytes. | |
After construction, no state is modified and therefore this class is safe for static storage. | |
*/ | |
template<size_t blocksize> class secdec_ecc | |
{ | |
public: | |
typedef unsigned int result_type; //!< The largest ECC which can be calculated | |
private: | |
static constexpr size_t bits_per_byte=8; | |
typedef unsigned char unit_type; // The batch unit of processing | |
result_type bitsvalid; | |
// Many CPUs (x86) are slow doing variable bit shifts, so keep a table | |
result_type ecc_twospowers[sizeof(result_type)*bits_per_byte]; | |
unsigned short ecc_table[blocksize*bits_per_byte]; | |
static bool _is_single_bit_set(unsigned long long x) | |
{ | |
#ifdef USE_INTRINSICS | |
return __builtin_popcountl(x)==1; | |
#else | |
x -= (x >> 1) & 0x5555555555555555ULL; | |
x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); | |
x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; | |
unsigned long long count=(x * 0x0101010101010101ULL)>>56; | |
return count==1; | |
#endif | |
} | |
public: | |
//! Constructs an instance, configuring the necessary lookup tables | |
secdec_ecc() | |
{ | |
for(size_t n=0; n<sizeof(result_type)*bits_per_byte; n++) | |
ecc_twospowers[n]=((result_type)1<<n); | |
result_type length=blocksize*8; | |
// This is (data bits + parity bits + 1) <= 2^(parity bits) | |
for(result_type p=1; p<sizeof(result_type)*bits_per_byte; p++) | |
if((length+p+1)<=ecc_twospowers[p]) | |
{ | |
bitsvalid=p; | |
break; | |
} | |
if((7+bitsvalid)/8>sizeof(result_type)) | |
throw std::runtime_error("ECC would exceed the size of result_type!"); | |
for(result_type i=0; i<blocksize*bits_per_byte; i++) | |
{ | |
// Make a code bit | |
result_type b=i+1; | |
#ifdef USE_INTRINSICS | |
result_type topbit=8*sizeof(result_type)-__builtin_clz(b); | |
b+=topbit; | |
if(b>=ecc_twospowers[topbit]) b++; | |
//while(b>ecc_twospowers(_topbit+1)) _topbit++; | |
//b+=_topbit; | |
//if(b>=ecc_twospowers(_topbit)) b++; | |
#else | |
for(size_t p=0; ecc_twospowers[p]<(b+1); p++) | |
b++; | |
#endif | |
ecc_table[i]=b; | |
if(b>(unsigned short)-1) | |
throw std::runtime_error("Precalculated table has exceeded its bounds"); | |
} | |
} | |
//! The number of bits valid in result_type | |
size_t result_bits_valid() const noexcept | |
{ | |
return bitsvalid; | |
} | |
//! Accumulate ECC from buffer | |
result_type operator()(result_type ecc, const char *buffer) const noexcept | |
{ | |
#if 1 | |
// Process in lumps of eight | |
const unit_type *_buffer=(const unit_type *) buffer; | |
#pragma omp parallel for reduction(^:ecc) | |
for(size_t i=0; i<blocksize; i+=sizeof(unit_type)*8) | |
{ | |
union { unsigned long long v; unit_type c[8]; }; | |
result_type prefetch[8]; | |
v=*(unsigned long long *)(&_buffer[0+i/sizeof(unit_type)]); // min 1 cycle | |
#define ROUND(n) \ | |
prefetch[0]=ecc_table[(i+0)*8+n]; \ | |
prefetch[1]=ecc_table[(i+1)*8+n]; \ | |
prefetch[2]=ecc_table[(i+2)*8+n]; \ | |
prefetch[3]=ecc_table[(i+3)*8+n]; \ | |
prefetch[4]=ecc_table[(i+4)*8+n]; \ | |
prefetch[5]=ecc_table[(i+5)*8+n]; \ | |
prefetch[6]=ecc_table[(i+6)*8+n]; \ | |
prefetch[7]=ecc_table[(i+7)*8+n]; \ | |
if(c[0]&((unit_type)1<<n))\ | |
ecc^=prefetch[0];\ | |
if(c[1]&((unit_type)1<<n))\ | |
ecc^=prefetch[1];\ | |
if(c[2]&((unit_type)1<<n))\ | |
ecc^=prefetch[2];\ | |
if(c[3]&((unit_type)1<<n))\ | |
ecc^=prefetch[3];\ | |
if(c[4]&((unit_type)1<<n))\ | |
ecc^=prefetch[4];\ | |
if(c[5]&((unit_type)1<<n))\ | |
ecc^=prefetch[5];\ | |
if(c[6]&((unit_type)1<<n))\ | |
ecc^=prefetch[6];\ | |
if(c[7]&((unit_type)1<<n))\ | |
ecc^=prefetch[7]; | |
ROUND(0) // prefetch = min 8, bit test and xor = min 16, total = 24 | |
ROUND(1) | |
ROUND(2) | |
ROUND(3) | |
ROUND(4) | |
ROUND(5) | |
ROUND(6) | |
ROUND(7) | |
#undef ROUND // total should be 1+(8*24/3)=65 | |
} | |
return ecc; | |
#else | |
const unit_type *_buffer=(const unit_type *) buffer; | |
#pragma omp parallel for reduction(^:ecc) | |
for(size_t i=0; i<blocksize; i+=sizeof(unit_type)) | |
{ | |
unit_type c=_buffer[i/sizeof(unit_type)]; | |
if(!c) | |
continue; | |
result_type prefetch[bits_per_byte*sizeof(unit_type)]; | |
// for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++) | |
// prefetch[n]=ecc_table[i*8+n]; | |
#define ROUND(n) \ | |
prefetch[n]=ecc_table[i*8+n]; | |
ROUND(0) | |
ROUND(1) | |
ROUND(2) | |
ROUND(3) | |
ROUND(4) | |
ROUND(5) | |
ROUND(6) | |
ROUND(7) | |
#undef ROUND | |
// for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++) | |
// { | |
// if(c&((unit_type)1<<n)) | |
// ecc^=prefetch[n]; | |
// } | |
#define ROUND(n) \ | |
if(c&((unit_type)1<<n))\ | |
ecc^=prefetch[n]; | |
ROUND(0) | |
ROUND(1) | |
ROUND(2) | |
ROUND(3) | |
ROUND(4) | |
ROUND(5) | |
ROUND(6) | |
ROUND(7) | |
#undef ROUND | |
} | |
return ecc; | |
#endif | |
} | |
result_type operator()(const char *buffer) const noexcept { return (*this)(0, buffer); } | |
//! Accumulate ECC from partial buffer where \em length <= \em blocksize | |
result_type operator()(result_type ecc, const char *buffer, size_t length) const noexcept | |
{ | |
const unit_type *_buffer=(const unit_type *) buffer; | |
#pragma omp parallel for reduction(^:ecc) | |
for(size_t i=0; i<length; i+=sizeof(unit_type)) | |
{ | |
unit_type c=_buffer[i/sizeof(unit_type)]; // min 1 cycle | |
if(!c) // min 1 cycle | |
continue; | |
char bitset[bits_per_byte*sizeof(unit_type)]; | |
// Most compilers will roll this out | |
for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++) // min 16 cycles | |
bitset[n]=!!(c&((unit_type)1<<n)); | |
result_type localecc=0; | |
for(size_t n=0; n<bits_per_byte*sizeof(unit_type); n++) | |
{ | |
if(bitset[n]) // min 8 cycles | |
localecc^=ecc_table[i*8+n]; // min 16 cycles | |
} | |
ecc^=localecc; // min 1 cycle. Total cycles = min 43 cycles/byte | |
} | |
return ecc; | |
} | |
result_type operator()(const char *buffer, size_t length) const noexcept { return (*this)(0, buffer, length); } | |
//! Given the original ECC and the new ECC for a buffer, find the bad bit. Return (size_t)-1 if not found (e.g. ECC corrupt) | |
size_t find_bad_bit(result_type good_ecc, result_type bad_ecc) const noexcept | |
{ | |
result_type length=blocksize*8, eccdiff=good_ecc^bad_ecc; | |
if(_is_single_bit_set(eccdiff)) | |
return (size_t)-1; | |
for(result_type i=0, b=1; i<length; i++, b++) | |
{ | |
// Skip parity bits | |
while(_is_single_bit_set(b)) | |
b++; | |
if(b==eccdiff) | |
return i; | |
} | |
return (size_t)-1; | |
} | |
//! Verifies and heals when possible a buffer, returning non zero if the buffer is error free | |
int verify(char *buffer, result_type good_ecc) const noexcept | |
{ | |
result_type this_ecc=(*this)(0, buffer); | |
if(this_ecc==good_ecc) | |
return 1; // no errors | |
size_t badbit=find_bad_bit(good_ecc, this_ecc); | |
if((size_t)-1==badbit) | |
return 0; // parity corrupt? | |
buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8]; | |
this_ecc=(*this)(0, buffer); | |
if(this_ecc==good_ecc) | |
return 2; // error healed | |
// Put the bit back | |
buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8]; | |
return 0; // more than one bit was corrupt | |
} | |
//! Verifies and heals when possible a partial buffer, returning non zero if the buffer is error free | |
int verify(char *buffer, size_t length, result_type good_ecc) const noexcept | |
{ | |
result_type this_ecc=(*this)(0, buffer, length); | |
if(this_ecc==good_ecc) | |
return 1; // no errors | |
size_t badbit=find_bad_bit(good_ecc, this_ecc); | |
if((size_t)-1==badbit) | |
return 0; // parity corrupt? | |
buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8]; | |
this_ecc=(*this)(0, buffer, length); | |
if(this_ecc==good_ecc) | |
return 2; // error healed | |
// Put the bit back | |
buffer[badbit/8]^=(unsigned char) ecc_twospowers[badbit%8]; | |
return 0; // more than one bit was corrupt | |
} | |
}; | |
int main(int argc, const char *argv[]) | |
{ | |
int errcode=0; | |
const size_t bytes=4096; | |
std::vector<char> buffer(bytes); | |
{ | |
std::random_device r; | |
std::random_device::result_type *b=(std::random_device::result_type *) buffer.data(); | |
for(size_t n=0; n<buffer.size()/sizeof(*b); n++) | |
b[n]=r(); | |
} | |
secdec_ecc<4096> engine; | |
size_t eccbits=engine.result_bits_valid(); | |
std::cout << "ECC will be " << eccbits << " bits long" << std::endl; | |
size_t ecc=engine(buffer.data()); | |
std::cout << "ECC was calculated to be " << std::hex << ecc << std::dec << std::endl; | |
auto end=std::chrono::high_resolution_clock::now(), begin=std::chrono::high_resolution_clock::now(); | |
unsigned long long _begin=__rdtsc(); | |
auto diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin); | |
#if 0 | |
do | |
{ | |
end=std::chrono::high_resolution_clock::now(); | |
} while(std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin).count()<1); | |
unsigned long long _end=__rdtsc(); | |
std::cout << "There are " << (_end-_begin) << " TSCs in 1 second." << std::endl; | |
#endif | |
std::cout << "Flipping every bit in the buffer to see if it is correctly detected ..." << std::endl; | |
begin=std::chrono::high_resolution_clock::now(); | |
for(size_t toflip=0; toflip<bytes*8; toflip++) | |
{ | |
buffer[toflip/8]^=((size_t)1<<(toflip%8)); | |
size_t newecc=engine(buffer.data()); | |
if(ecc==newecc) | |
{ | |
std::cerr << "ERROR: Flipping bit " << toflip << " not detected!" << std::endl; | |
errcode=2; | |
} | |
else | |
{ | |
size_t badbit=engine.find_bad_bit(ecc, newecc); | |
if(badbit!=toflip) | |
{ | |
std::cerr << "ERROR: Bad bit " << badbit << " is not the bit " << toflip << " we flipped!" << std::endl; | |
errcode=2; | |
} | |
// else | |
// std::cout << "SUCCESS: Bit flip " << toflip << " correctly detected" << std::endl; | |
} | |
if(2!=engine.verify(buffer.data(), ecc)) | |
{ | |
std::cerr << "ERROR: verify() did not heal the buffer!" << std::endl; | |
errcode=2; | |
} | |
} | |
end=std::chrono::high_resolution_clock::now(); | |
diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin); | |
std::cout << "Checking and fixing is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl; | |
std::cout << "\nFlipping two bits in the buffer to see if it is correctly detected ..." << std::endl; | |
buffer[0]^=1; | |
begin=std::chrono::high_resolution_clock::now(); | |
for(size_t toflip=1; toflip<bytes*8; toflip++) | |
{ | |
buffer[toflip/8]^=((size_t)1<<(toflip%8)); | |
size_t newecc=engine(buffer.data()); | |
if(ecc==newecc) | |
{ | |
std::cerr << "ERROR: Flipping bits 0 and " << toflip << " not detected!" << std::endl; | |
errcode=3; | |
} | |
// else | |
// { | |
// std::cout << "SUCCESS: Flipping bits 0 and " << toflip << " correctly detected" << std::endl; | |
// } | |
// if(0!=engine.verify(buffer.data(), ecc)) | |
// { | |
// std::cerr << "ERROR: verify() thought it healed the buffer!" << std::endl; | |
// errcode=2; | |
// } | |
buffer[toflip/8]^=((size_t)1<<(toflip%8)); | |
} | |
end=std::chrono::high_resolution_clock::now(); | |
diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin); | |
std::cout << "Calculating is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl; | |
std::cout << "\nCalculating speeds ..." << std::endl; | |
size_t foo=0; | |
begin=std::chrono::high_resolution_clock::now(); | |
_begin=__rdtsc(); | |
for(size_t n=0; n<10000; n++) | |
{ | |
buffer[0]=(char)n; | |
foo+=engine(buffer.data()); | |
} | |
_end=__rdtsc(); | |
end=std::chrono::high_resolution_clock::now(); | |
diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin); | |
if(foo) | |
std::cout << "Calculating is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec, or " << ((_end-_begin)/10000.0/4096) << " cycles/byte" << std::endl; | |
foo=0; | |
begin=std::chrono::high_resolution_clock::now(); | |
for(size_t n=0; n<10000; n++) | |
{ | |
buffer[0]=(char)n; | |
foo+=engine.verify(buffer.data(), ecc); | |
} | |
end=std::chrono::high_resolution_clock::now(); | |
diff=std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(end-begin); | |
if(foo) | |
std::cout << "Checking and fixing is approximately " << (bytes*10000/diff.count()/1024/1024) << " Mb/sec" << std::endl; | |
return errcode; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I found this one: Hamming Code in C++