degski/test_hexstring_to_int.cpp

## test_hexstring_to_int.cpp

#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include <intrin.h>
#include <frozen/map.h>
#include <frozen/unordered_map.h>

// This post is relevant: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
// I don't think this compiles with MSVC, on windows, use Clang/LLVM (or mingw/gcc).

#include <plf/plf_nanotimer.h> // https://github.com/mattreecebentley/plf_nanotimer

const signed char digittoval [ 256 ] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
    9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1 };
// returns a value with the high 16 bits set if not valid
// otherwise returns the conversion of the 4 hex digits at src into the bottom
// 16 bits of the 32-bit
// return register
//
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_lookup (
    const uint8_t *src ) { // strictly speaking, static inline is a C-ism
    uint32_t v1 =
        digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
    uint32_t v2 = digittoval [ src [ 1 ] ];
    uint32_t v3 = digittoval [ src [ 2 ] ];
    uint32_t v4 = digittoval [ src [ 3 ] ];
    return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}

uint32_t hex_to_u32_lookup_inline (
    const uint8_t *src ) { // strictly speaking, static inline is a C-ism
    uint32_t v1 =
        digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
    uint32_t v2 = digittoval [ src [ 1 ] ];
    uint32_t v3 = digittoval [ src [ 2 ] ];
    uint32_t v4 = digittoval [ src [ 3 ] ];
    return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_lookup_alt (
    const uint8_t *src ) { // strictly speaking, static inline is a C-ism
    uint32_t v1 =
        digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
    uint32_t v2 = digittoval [ src [ 1 ] ];
    uint32_t v3 = digittoval [ src [ 2 ] ];
    uint32_t v4 = digittoval [ src [ 3 ] ];
    return v1 * 16 * 16 * 16 + v2 * 16 * 16 + v3 * 16 + v4;
}

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t bogus ( const uint8_t *src ) {
    return 0;
}


static inline uint32_t convertone ( uint8_t c ) {
    uint32_t v = ( c & 0xF ) + 9 * ( c >> 6 );
    return v;
}

// no error checking
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_math ( const uint8_t *src ) {
    uint32_t v1 = convertone ( src [ 0 ] );
    uint32_t v2 = convertone ( src [ 1 ] );
    uint32_t v3 = convertone ( src [ 2 ] );
    uint32_t v4 = convertone ( src [ 3 ] );
    return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
#ifdef __BMI2__
// no error checking
// http://0x80.pl/notesen/2014-10-09-pext-convert-ascii-hex-to-num.html
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_mula ( const uint8_t *src ) {
    uint32_t val;
    memcpy ( &val, src, 4 );
    const uint32_t input = __builtin_bswap32 ( val );
    const uint32_t letter = input & 0x40404040;
    const uint32_t shift = letter >> 3 | letter >> 6; // 9 if char ~ [a-fA-F], 0 otherwise
    const uint32_t adjusted = input + shift;
    // gather lower nibbles of each byte
    return _pext_u32 ( adjusted, 0x0f0f0f0f ); // slow on AMD!
}
#endif
// no error checking
// https://johnnylee-sde.github.io/Fast-hex-number-string-to-int/
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_lee ( const uint8_t *src ) {
    uint32_t val;
    memcpy ( &val, src, 4 );
    val = ( val & 0xf0f0f0f ) + 9 * ( val >> 6 & 0x1010101 );
    val = ( val | val << 12 ) & 0xff00ff00;
    return ( val >> 24 | val ) & 0xffff;
}

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_aqrit ( const uint8_t *src ) {
    uint32_t in;
    uint64_t v, x;
    const int64_t magic = INT64_C ( 0x1001001000000000 );
    memcpy ( &in, src, 4 );
    v = in;
    x = ( ( ( 0x00404040 & v ) >> 6 ) * 9 ) + ( v & 0x000F0F0F ); // do 3
    x = ( ( ( uint64_t ) ( ( int64_t ) x * magic ) ) >> 48 ) & ~15; // bswap and pack
    v = ( ( v >> 30 ) * 9 ) + ( ( v >> 24 ) & 0x0F ); // do the 4th
    return ( x | v );
}

#ifdef __SSE2__
uint64_t hex_to_u64_sse ( const uint8_t* string ) {

    /*
        '0' .. '9' = 0x30 .. 0x39
        'a' .. 'f' = 0x61 .. 0x66
        'A' .. 'F' = 0x41 .. 0x46
    */

    __m128i input = _mm_loadu_si128 ( ( const __m128i* )string );
    const __m128i mask = _mm_set1_epi8 ( 0x0f );
    const __m128i hi_nibbles = _mm_and_si128 ( _mm_srli_epi32 ( input, 4 ), mask );
    const __m128i lo_nibbles = _mm_and_si128 ( input, mask );

    const int digits = 0x80;
    const int letters = 0x40;
    const int error1 = 0x10;
    const int error2 = 0x20;

    // pretend that input & 0xf0 == 0x30
    const __m128i lo_ascii_09 = _mm_shuffle_epi8 ( _mm_setr_epi8 (
        0 + digits,
        1 + digits,
        2 + digits,
        3 + digits,
        4 + digits,
        5 + digits,
        6 + digits,
        7 + digits,
        8 + digits,
        9 + digits,
        error1, error1, error1, error1, error1, error1 ), lo_nibbles );

// pretend that input & 0xf0 == 0x40 or 0x60
    const __m128i lo_ascii_af = _mm_shuffle_epi8 ( _mm_setr_epi8 (
        error1,
        10 + letters,
        11 + letters,
        12 + letters,
        13 + letters,
        14 + letters,
        15 + letters,
        error1,
        error1,
        error1,
        error1, error1, error1, error1, error1, error1 ), lo_nibbles );

// check value of higher nibble
    const __m128i hi_class = _mm_shuffle_epi8 ( _mm_setr_epi8 (
        error2, error2, error2, digits /* 0x30 */,
        letters /* 0x40 */, error2, letters /* 0x60 */, error2,
        error2, error2, error2, error2,
        error2, error2, error2, error2 ), hi_nibbles );

// and find out which preposition was true
    const __m128i ascii_09_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_09 ), hi_class );
    const __m128i ascii_af_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_af ), hi_class );
    const __m128i ascii_09 = _mm_and_si128 ( ascii_09_mask, lo_ascii_09 );
    const __m128i ascii_af = _mm_and_si128 ( ascii_af_mask, lo_ascii_af );

#if 0
    // MSBs of error vector indicates invalid characters
    const __m128i errorvec = _mm_xor_si128 ( ascii_09_mask | ascii_af_mask, _mm_set1_epi8 ( -1 ) );
    if ( _mm_movemask_epi8 ( errorvec ) )
        abort ( );
#endif

    const __m128i result = _mm_and_si128 ( _mm_or_si128 ( ascii_09, ascii_af ), mask );

    // now each byte of result have value 0 .. 15, we're going to merge nibbles:

    const __m128i t0 = result;
    const __m128i t1 = _mm_srli_epi32 ( result, 4 );
    const __m128i t3 = _mm_or_si128 ( t0, t1 );
    const __m128i t4 = _mm_and_si128 ( t3, _mm_set1_epi16 ( 0x00ff ) ); // keep just lower bytes in words
    const __m128i t5 = _mm_packus_epi16 ( t4, t4 ); // now lower part of the reg has 8-byte value

    return _mm_cvtsi128_si64 ( t5 );
}

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u64_sse_wrapper ( const uint8_t* string ) {
    // wrap to fullfil the template constraint
    return hex_to_u64_sse ( string );
}
#endif

uint32_t lookup2 [ 65536 ];

void init_lookup2 ( ) {

    for ( int i = 0; i < 0x10000; i++ ) {
        lookup2 [ i ] = ( uint32_t ) -1;
    }

    for ( int i = 0; i < 256; i++ ) {
        char digits [ 3 ];
        sprintf ( digits, "%02x", i );

        uint16_t lvalue;
        memcpy ( &lvalue, digits, 2 );
        lookup2 [ lvalue ] = i;

        char digits_upper [ 3 ];
        digits_upper [ 0 ] = toupper ( digits [ 0 ] );
        digits_upper [ 1 ] = toupper ( digits [ 1 ] );
        digits_upper [ 2 ] = 0;

        if ( ( digits_upper [ 0 ] != digits [ 0 ] ) ||
            ( digits_upper [ 1 ] != digits [ 1 ] ) ) {

            memcpy ( &lvalue, digits_upper, 2 );
            lookup2 [ lvalue ] = i;
        }
    }
}


__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_2bytes_lookup ( const uint8_t *src ) {
    uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ];
    uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ];
    return v1 << 8 | v2;
}
uint32_t hex_2bytes_lookup_inline ( const uint8_t *src ) {
    uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ];
    uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ];
    return v1 << 8 | v2;
}


int table_lookup ( char s_ ) noexcept {
    struct KeyValue { const char key, value; };
    static constexpr std::array<const KeyValue, 22> table { {
        { '0',  0 }, { '1',  1 }, { '2',  2 }, { '3',  3 }, { '4',  4 }, { '5',  5 }, { '6',  6 }, { '7',  7 },
        { '8',  8 }, { '9',  9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 },
        { 'a', 10 }, { 'b', 11 }, { 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 }
    } };
    for ( auto [ k, v ] : table )
        if ( k == s_ )
            return v;
    return -1;
}


int table_lookup00 ( char s_ ) noexcept {
    struct KeyValue { const char key, value; };
    static constexpr std::array<const KeyValue, 16> table { {
        { '0',  0 }, { '1',  1 }, { '2',  2 }, { '3',  3 }, { '4',  4 }, { '5',  5 }, { '6',  6 }, { '7',  7 },
        { '8',  8 }, { '9',  9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }
    } };
    if ( s_ > 94 )
        s_ -= 47;
    for ( auto [ k, v ] : table )
        if ( k == s_ )
            return v;
    return -1;
}

int table_lookup0 ( char s_ ) noexcept {
    if ( s_ > 94 ) s_ -= ( 95 - 48 );
    struct KeyValue { const char key, value; };
    static constexpr KeyValue table [ ] {
        { '0',  0 }, { '1',  1 }, { '2',  2 }, { '3',  3 }, { '4',  4 }, { '5',  5 }, { '6',  6 }, { '7',  7 },
        { '8',  8 }, { '9',  9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }
    };
    const KeyValue * p = & table [ 0 ];
    if ( p [  8 ].key <= s_ ) p +=  8;
    if ( p [  4 ].key <= s_ ) p +=  4;
    if ( p [  2 ].key <= s_ ) p +=  2;
    if ( p [  1 ].key <= s_ ) p +=  1;
    return p->value;
}


int table_lookup1 ( char s_ ) noexcept {
    if ( s_ > 94 ) return s_ - ( 95 - 10 );
    if ( s_ > 47 ) return s_ - ( 48 - 10 );
    return s_ - 30;
}

int table_lookup2 ( char s_ ) noexcept {
    if ( s_ > 94 ) s_ -= ( 95 - 48 );
    constexpr char key [ ] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
    for ( int i = 0; i < 16; ++i )
        if ( key [ i ] == s_ )
            return i;
    return -1;
}

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_table_lookup ( const uint8_t *src ) {
    uint32_t v1 = table_lookup ( src [ 0 ] );
    uint32_t v2 = table_lookup ( src [ 1 ] );
    uint32_t v3 = table_lookup ( src [ 2 ] );
    uint32_t v4 = table_lookup ( src [ 3 ] );
    return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}


frozen::map<char, char, 22> map_lookup {
    { '0',  0 }, { '1',  1 }, { '2',  2 }, { '3',  3 }, { '4',  4 }, { '5',  5 }, { '6',  6 }, { '7',  7 }, { '8',  8 },
    { '9',  9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }, { 'a', 10 }, { 'b', 11 },
    { 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 }
};

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_frozen_map ( const uint8_t *src ) {
    uint32_t v1 = map_lookup.at ( src [ 0 ] );
    uint32_t v2 = map_lookup.at ( src [ 1 ] );
    uint32_t v3 = map_lookup.at ( src [ 2 ] );
    uint32_t v4 = map_lookup.at ( src [ 3 ] );
    return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}


constexpr frozen::unordered_map<char, char, 22> unordered_map_lookup {
    { '0',  0 },
    { '1',  1 },
    { '2',  2 },
    { '3',  3 },
    { '4',  4 },
    { '5',  5 },
    { '6',  6 },
    { '7',  7 },
    { '8',  8 },
    { '9',  9 },
    { 'a', 10 },
    { 'b', 11 },
    { 'c', 12 },
    { 'd', 13 },
    { 'e', 14 },
    { 'f', 15 },
    { 'A', 10 },
    { 'B', 11 },
    { 'C', 12 },
    { 'D', 13 },
    { 'E', 14 },
    { 'F', 15 }
};

__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_frozen_unordered_map ( const uint8_t *src ) {
    uint32_t v1 = unordered_map_lookup.at ( src [ 0 ] );
    uint32_t v2 = unordered_map_lookup.at ( src [ 1 ] );
    uint32_t v3 = unordered_map_lookup.at ( src [ 2 ] );
    uint32_t v4 = unordered_map_lookup.at ( src [ 3 ] );
    return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}

template <uint32_t ( *F )( const uint8_t *src )> void test ( size_t N ) {
    uint8_t *x = ( uint8_t * ) malloc ( sizeof ( uint8_t ) * ( N + 3 ) );
    srand ( 1235 );
    for ( size_t i = 0; i < N + 3; i++ ) {
        int digit = ( rand ( ) % 16 );
        if ( digit < 10 )
            x [ i ] = 48 + digit;
        else
            x [ i ] = 65 + digit - 10;
    }
    for ( size_t i = 0; i < N; i++ ) {
        assert ( hex_to_u32_lookup ( x + i ) == hex_to_u32_math ( x + i ) );
    }
    size_t iterations = 50;
    std::random_device rd;
    std::mt19937 gen ( rd ( ) );
    std::uniform_int_distribution<> dis ( 0, 0xFFFF );
    plf::nanotimer timer;
    timer.start ( );
    uint64_t sum = 0;
    for ( uint32_t i = 0; i < iterations; i++ ) {
        for ( size_t i = 0; i < N; i++ ) {
            sum += F ( x + i );
        }
    }
    uint64_t end = timer.get_elapsed_ms ( );
    free ( x );
    printf ( "sum  = %zu \n", ( size_t ) sum );
    printf ( "time = %zu \n", ( uint64_t ) end );
}

int main ( ) {
    init_lookup2 ( );
    size_t N = 1000 * 1000;
    printf ( "N= %zu \n", N );
    printf ( "empty function (overhead):\n" );
    test<bogus> ( N );
    printf ( "lookup:\n" );
    test<hex_to_u32_lookup> ( N );
    printf ( "lookup (inline):\n" );
    test<hex_to_u32_lookup_inline> ( N );
    printf ( "lookup (alt):\n" );
    test<hex_to_u32_lookup_alt> ( N );
    printf ( "math:\n" );
    test<hex_to_u32_math> ( N );
#ifdef __BMI2__
    printf ( "mula:\n" );
    test<hex_to_u32_mula> ( N );
#endif
    printf ( "lee:\n" );
    test<hex_to_u32_lee> ( N );
    printf ( "aqrit:\n" );
    test<hex_to_u32_aqrit> ( N );
#ifdef __SSE2__
    printf ( "SSE (uint64_t):\n" );
    test<hex_to_u64_sse_wrapper> ( N );
#endif
    printf ( "big lookup:\n" );
    test<hex_2bytes_lookup> ( N );
    printf ( "big lookup (inline):\n" );
    test<hex_2bytes_lookup_inline> ( N );
    printf ( "table_lookup:\n" );
    test<hex_table_lookup> ( N );
    printf ( "frozen_map:\n" );
    test<hex_frozen_map> ( N );
    printf ( "frozen_unordered_map:\n" );
    test<hex_frozen_unordered_map> ( N );
}

	#include <cassert>
	#include <cinttypes>
	#include <cstdint>
	#include <cstdio>
	#include <cstdlib>
	#include <cstring>
	#include <cstring>
	#include <iomanip>
	#include <iostream>
	#include <random>
	#include <string>
	#include <vector>
	#include <intrin.h>
	#include <frozen/map.h>
	#include <frozen/unordered_map.h>

	// This post is relevant: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
	// I don't think this compiles with MSVC, on windows, use Clang/LLVM (or mingw/gcc).

	#include <plf/plf_nanotimer.h> // https://github.com/mattreecebentley/plf_nanotimer

	const signed char digittoval [ 256 ] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
	9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1 };
	// returns a value with the high 16 bits set if not valid
	// otherwise returns the conversion of the 4 hex digits at src into the bottom
	// 16 bits of the 32-bit
	// return register
	//
	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u32_lookup (
	const uint8_t *src ) { // strictly speaking, static inline is a C-ism
	uint32_t v1 =
	digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
	uint32_t v2 = digittoval [ src [ 1 ] ];
	uint32_t v3 = digittoval [ src [ 2 ] ];
	uint32_t v4 = digittoval [ src [ 3 ] ];
	return static_cast< uint32_t >( v1 << 12 \| v2 << 8 \| v3 << 4 \| v4 );
	}

	uint32_t hex_to_u32_lookup_inline (
	const uint8_t *src ) { // strictly speaking, static inline is a C-ism
	uint32_t v1 =
	digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
	uint32_t v2 = digittoval [ src [ 1 ] ];
	uint32_t v3 = digittoval [ src [ 2 ] ];
	uint32_t v4 = digittoval [ src [ 3 ] ];
	return static_cast< uint32_t >( v1 << 12 \| v2 << 8 \| v3 << 4 \| v4 );
	}

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u32_lookup_alt (
	const uint8_t *src ) { // strictly speaking, static inline is a C-ism
	uint32_t v1 =
	digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
	uint32_t v2 = digittoval [ src [ 1 ] ];
	uint32_t v3 = digittoval [ src [ 2 ] ];
	uint32_t v4 = digittoval [ src [ 3 ] ];
	return v1 * 16 * 16 * 16 + v2 * 16 * 16 + v3 * 16 + v4;
	}

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t bogus ( const uint8_t *src ) {
	return 0;
	}



	static inline uint32_t convertone ( uint8_t c ) {
	uint32_t v = ( c & 0xF ) + 9 * ( c >> 6 );
	return v;
	}

	// no error checking
	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u32_math ( const uint8_t *src ) {
	uint32_t v1 = convertone ( src [ 0 ] );
	uint32_t v2 = convertone ( src [ 1 ] );
	uint32_t v3 = convertone ( src [ 2 ] );
	uint32_t v4 = convertone ( src [ 3 ] );
	return static_cast< uint32_t >( v1 << 12 \| v2 << 8 \| v3 << 4 \| v4 );
	}
	#ifdef __BMI2__
	// no error checking
	// http://0x80.pl/notesen/2014-10-09-pext-convert-ascii-hex-to-num.html
	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u32_mula ( const uint8_t *src ) {
	uint32_t val;
	memcpy ( &val, src, 4 );
	const uint32_t input = __builtin_bswap32 ( val );
	const uint32_t letter = input & 0x40404040;
	const uint32_t shift = letter >> 3 \| letter >> 6; // 9 if char ~ [a-fA-F], 0 otherwise
	const uint32_t adjusted = input + shift;
	// gather lower nibbles of each byte
	return _pext_u32 ( adjusted, 0x0f0f0f0f ); // slow on AMD!
	}
	#endif
	// no error checking
	// https://johnnylee-sde.github.io/Fast-hex-number-string-to-int/
	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u32_lee ( const uint8_t *src ) {
	uint32_t val;
	memcpy ( &val, src, 4 );
	val = ( val & 0xf0f0f0f ) + 9 * ( val >> 6 & 0x1010101 );
	val = ( val \| val << 12 ) & 0xff00ff00;
	return ( val >> 24 \| val ) & 0xffff;
	}

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u32_aqrit ( const uint8_t *src ) {
	uint32_t in;
	uint64_t v, x;
	const int64_t magic = INT64_C ( 0x1001001000000000 );
	memcpy ( &in, src, 4 );
	v = in;
	x = ( ( ( 0x00404040 & v ) >> 6 ) * 9 ) + ( v & 0x000F0F0F ); // do 3
	x = ( ( ( uint64_t ) ( ( int64_t ) x * magic ) ) >> 48 ) & ~15; // bswap and pack
	v = ( ( v >> 30 ) * 9 ) + ( ( v >> 24 ) & 0x0F ); // do the 4th
	return ( x \| v );
	}

	#ifdef __SSE2__
	uint64_t hex_to_u64_sse ( const uint8_t* string ) {

	/*
	'0' .. '9' = 0x30 .. 0x39
	'a' .. 'f' = 0x61 .. 0x66
	'A' .. 'F' = 0x41 .. 0x46
	*/

	__m128i input = _mm_loadu_si128 ( ( const __m128i* )string );
	const __m128i mask = _mm_set1_epi8 ( 0x0f );
	const __m128i hi_nibbles = _mm_and_si128 ( _mm_srli_epi32 ( input, 4 ), mask );
	const __m128i lo_nibbles = _mm_and_si128 ( input, mask );

	const int digits = 0x80;
	const int letters = 0x40;
	const int error1 = 0x10;
	const int error2 = 0x20;

	// pretend that input & 0xf0 == 0x30
	const __m128i lo_ascii_09 = _mm_shuffle_epi8 ( _mm_setr_epi8 (
	0 + digits,
	1 + digits,
	2 + digits,
	3 + digits,
	4 + digits,
	5 + digits,
	6 + digits,
	7 + digits,
	8 + digits,
	9 + digits,
	error1, error1, error1, error1, error1, error1 ), lo_nibbles );

	// pretend that input & 0xf0 == 0x40 or 0x60
	const __m128i lo_ascii_af = _mm_shuffle_epi8 ( _mm_setr_epi8 (
	error1,
	10 + letters,
	11 + letters,
	12 + letters,
	13 + letters,
	14 + letters,
	15 + letters,
	error1,
	error1,
	error1,
	error1, error1, error1, error1, error1, error1 ), lo_nibbles );

	// check value of higher nibble
	const __m128i hi_class = _mm_shuffle_epi8 ( _mm_setr_epi8 (
	error2, error2, error2, digits /* 0x30 */,
	letters /* 0x40 /, error2, letters / 0x60 */, error2,
	error2, error2, error2, error2,
	error2, error2, error2, error2 ), hi_nibbles );

	// and find out which preposition was true
	const __m128i ascii_09_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_09 ), hi_class );
	const __m128i ascii_af_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_af ), hi_class );
	const __m128i ascii_09 = _mm_and_si128 ( ascii_09_mask, lo_ascii_09 );
	const __m128i ascii_af = _mm_and_si128 ( ascii_af_mask, lo_ascii_af );

	#if 0
	// MSBs of error vector indicates invalid characters
	const __m128i errorvec = _mm_xor_si128 ( ascii_09_mask \| ascii_af_mask, _mm_set1_epi8 ( -1 ) );
	if ( _mm_movemask_epi8 ( errorvec ) )
	abort ( );
	#endif

	const __m128i result = _mm_and_si128 ( _mm_or_si128 ( ascii_09, ascii_af ), mask );

	// now each byte of result have value 0 .. 15, we're going to merge nibbles:

	const __m128i t0 = result;
	const __m128i t1 = _mm_srli_epi32 ( result, 4 );
	const __m128i t3 = _mm_or_si128 ( t0, t1 );
	const __m128i t4 = _mm_and_si128 ( t3, _mm_set1_epi16 ( 0x00ff ) ); // keep just lower bytes in words
	const __m128i t5 = _mm_packus_epi16 ( t4, t4 ); // now lower part of the reg has 8-byte value

	return _mm_cvtsi128_si64 ( t5 );
	}

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_to_u64_sse_wrapper ( const uint8_t* string ) {
	// wrap to fullfil the template constraint
	return hex_to_u64_sse ( string );
	}
	#endif

	uint32_t lookup2 [ 65536 ];

	void init_lookup2 ( ) {

	for ( int i = 0; i < 0x10000; i++ ) {
	lookup2 [ i ] = ( uint32_t ) -1;
	}

	for ( int i = 0; i < 256; i++ ) {
	char digits [ 3 ];
	sprintf ( digits, "%02x", i );

	uint16_t lvalue;
	memcpy ( &lvalue, digits, 2 );
	lookup2 [ lvalue ] = i;

	char digits_upper [ 3 ];
	digits_upper [ 0 ] = toupper ( digits [ 0 ] );
	digits_upper [ 1 ] = toupper ( digits [ 1 ] );
	digits_upper [ 2 ] = 0;

	if ( ( digits_upper [ 0 ] != digits [ 0 ] ) \|\|
	( digits_upper [ 1 ] != digits [ 1 ] ) ) {

	memcpy ( &lvalue, digits_upper, 2 );
	lookup2 [ lvalue ] = i;
	}
	}
	}


	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_2bytes_lookup ( const uint8_t *src ) {
	uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ];
	uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ];
	return v1 << 8 \| v2;
	}
	uint32_t hex_2bytes_lookup_inline ( const uint8_t *src ) {
	uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ];
	uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ];
	return v1 << 8 \| v2;
	}


	int table_lookup ( char s_ ) noexcept {
	struct KeyValue { const char key, value; };
	static constexpr std::array<const KeyValue, 22> table { {
	{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 },
	{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 },
	{ 'a', 10 }, { 'b', 11 }, { 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 }
	} };
	for ( auto [ k, v ] : table )
	if ( k == s_ )
	return v;
	return -1;
	}


	int table_lookup00 ( char s_ ) noexcept {
	struct KeyValue { const char key, value; };
	static constexpr std::array<const KeyValue, 16> table { {
	{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 },
	{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }
	} };
	if ( s_ > 94 )
	s_ -= 47;
	for ( auto [ k, v ] : table )
	if ( k == s_ )
	return v;
	return -1;
	}

	int table_lookup0 ( char s_ ) noexcept {
	if ( s_ > 94 ) s_ -= ( 95 - 48 );
	struct KeyValue { const char key, value; };
	static constexpr KeyValue table [ ] {
	{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 },
	{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }
	};
	const KeyValue * p = & table [ 0 ];
	if ( p [ 8 ].key <= s_ ) p += 8;
	if ( p [ 4 ].key <= s_ ) p += 4;
	if ( p [ 2 ].key <= s_ ) p += 2;
	if ( p [ 1 ].key <= s_ ) p += 1;
	return p->value;
	}


	int table_lookup1 ( char s_ ) noexcept {
	if ( s_ > 94 ) return s_ - ( 95 - 10 );
	if ( s_ > 47 ) return s_ - ( 48 - 10 );
	return s_ - 30;
	}

	int table_lookup2 ( char s_ ) noexcept {
	if ( s_ > 94 ) s_ -= ( 95 - 48 );
	constexpr char key [ ] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
	for ( int i = 0; i < 16; ++i )
	if ( key [ i ] == s_ )
	return i;
	return -1;
	}

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_table_lookup ( const uint8_t *src ) {
	uint32_t v1 = table_lookup ( src [ 0 ] );
	uint32_t v2 = table_lookup ( src [ 1 ] );
	uint32_t v3 = table_lookup ( src [ 2 ] );
	uint32_t v4 = table_lookup ( src [ 3 ] );
	return static_cast< uint32_t >( v1 << 12 \| v2 << 8 \| v3 << 4 \| v4 );
	}



	frozen::map<char, char, 22> map_lookup {
	{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 }, { '8', 8 },
	{ '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }, { 'a', 10 }, { 'b', 11 },
	{ 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 }
	};

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_frozen_map ( const uint8_t *src ) {
	uint32_t v1 = map_lookup.at ( src [ 0 ] );
	uint32_t v2 = map_lookup.at ( src [ 1 ] );
	uint32_t v3 = map_lookup.at ( src [ 2 ] );
	uint32_t v4 = map_lookup.at ( src [ 3 ] );
	return static_cast< uint32_t >( v1 << 12 \| v2 << 8 \| v3 << 4 \| v4 );
	}


	constexpr frozen::unordered_map<char, char, 22> unordered_map_lookup {
	{ '0', 0 },
	{ '1', 1 },
	{ '2', 2 },
	{ '3', 3 },
	{ '4', 4 },
	{ '5', 5 },
	{ '6', 6 },
	{ '7', 7 },
	{ '8', 8 },
	{ '9', 9 },
	{ 'a', 10 },
	{ 'b', 11 },
	{ 'c', 12 },
	{ 'd', 13 },
	{ 'e', 14 },
	{ 'f', 15 },
	{ 'A', 10 },
	{ 'B', 11 },
	{ 'C', 12 },
	{ 'D', 13 },
	{ 'E', 14 },
	{ 'F', 15 }
	};

	__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
	uint32_t hex_frozen_unordered_map ( const uint8_t *src ) {
	uint32_t v1 = unordered_map_lookup.at ( src [ 0 ] );
	uint32_t v2 = unordered_map_lookup.at ( src [ 1 ] );
	uint32_t v3 = unordered_map_lookup.at ( src [ 2 ] );
	uint32_t v4 = unordered_map_lookup.at ( src [ 3 ] );
	return static_cast< uint32_t >( v1 << 12 \| v2 << 8 \| v3 << 4 \| v4 );
	}

	template <uint32_t ( F )( const uint8_t src )> void test ( size_t N ) {
	uint8_t x = ( uint8_t ) malloc ( sizeof ( uint8_t ) * ( N + 3 ) );
	srand ( 1235 );
	for ( size_t i = 0; i < N + 3; i++ ) {
	int digit = ( rand ( ) % 16 );
	if ( digit < 10 )
	x [ i ] = 48 + digit;
	else
	x [ i ] = 65 + digit - 10;
	}
	for ( size_t i = 0; i < N; i++ ) {
	assert ( hex_to_u32_lookup ( x + i ) == hex_to_u32_math ( x + i ) );
	}
	size_t iterations = 50;
	std::random_device rd;
	std::mt19937 gen ( rd ( ) );
	std::uniform_int_distribution<> dis ( 0, 0xFFFF );
	plf::nanotimer timer;
	timer.start ( );
	uint64_t sum = 0;
	for ( uint32_t i = 0; i < iterations; i++ ) {
	for ( size_t i = 0; i < N; i++ ) {
	sum += F ( x + i );
	}
	}
	uint64_t end = timer.get_elapsed_ms ( );
	free ( x );
	printf ( "sum = %zu \n", ( size_t ) sum );
	printf ( "time = %zu \n", ( uint64_t ) end );
	}

	int main ( ) {
	init_lookup2 ( );
	size_t N = 1000 * 1000;
	printf ( "N= %zu \n", N );
	printf ( "empty function (overhead):\n" );
	test<bogus> ( N );
	printf ( "lookup:\n" );
	test<hex_to_u32_lookup> ( N );
	printf ( "lookup (inline):\n" );
	test<hex_to_u32_lookup_inline> ( N );
	printf ( "lookup (alt):\n" );
	test<hex_to_u32_lookup_alt> ( N );
	printf ( "math:\n" );
	test<hex_to_u32_math> ( N );
	#ifdef __BMI2__
	printf ( "mula:\n" );
	test<hex_to_u32_mula> ( N );
	#endif
	printf ( "lee:\n" );
	test<hex_to_u32_lee> ( N );
	printf ( "aqrit:\n" );
	test<hex_to_u32_aqrit> ( N );
	#ifdef __SSE2__
	printf ( "SSE (uint64_t):\n" );
	test<hex_to_u64_sse_wrapper> ( N );
	#endif
	printf ( "big lookup:\n" );
	test<hex_2bytes_lookup> ( N );
	printf ( "big lookup (inline):\n" );
	test<hex_2bytes_lookup_inline> ( N );
	printf ( "table_lookup:\n" );
	test<hex_table_lookup> ( N );
	printf ( "frozen_map:\n" );
	test<hex_frozen_map> ( N );
	printf ( "frozen_unordered_map:\n" );
	test<hex_frozen_unordered_map> ( N );
	}