Skip to content

Instantly share code, notes, and snippets.

@degski
Created April 22, 2019 01:52
Show Gist options
  • Save degski/a10a66d12971fa5709494af1ec131a32 to your computer and use it in GitHub Desktop.
Save degski/a10a66d12971fa5709494af1ec131a32 to your computer and use it in GitHub Desktop.
test_hexstring_to_int.cpp, followiing this post by Danile Lemire: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include <intrin.h>
#include <frozen/map.h>
#include <frozen/unordered_map.h>
// This post is relevant: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
// I don't think this compiles with MSVC, on windows, use Clang/LLVM (or mingw/gcc).
#include <plf/plf_nanotimer.h> // https://github.com/mattreecebentley/plf_nanotimer
const signed char digittoval [ 256 ] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1 };
// returns a value with the high 16 bits set if not valid
// otherwise returns the conversion of the 4 hex digits at src into the bottom
// 16 bits of the 32-bit
// return register
//
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_lookup (
const uint8_t *src ) { // strictly speaking, static inline is a C-ism
uint32_t v1 =
digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
uint32_t v2 = digittoval [ src [ 1 ] ];
uint32_t v3 = digittoval [ src [ 2 ] ];
uint32_t v4 = digittoval [ src [ 3 ] ];
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
uint32_t hex_to_u32_lookup_inline (
const uint8_t *src ) { // strictly speaking, static inline is a C-ism
uint32_t v1 =
digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
uint32_t v2 = digittoval [ src [ 1 ] ];
uint32_t v3 = digittoval [ src [ 2 ] ];
uint32_t v4 = digittoval [ src [ 3 ] ];
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_lookup_alt (
const uint8_t *src ) { // strictly speaking, static inline is a C-ism
uint32_t v1 =
digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF.
uint32_t v2 = digittoval [ src [ 1 ] ];
uint32_t v3 = digittoval [ src [ 2 ] ];
uint32_t v4 = digittoval [ src [ 3 ] ];
return v1 * 16 * 16 * 16 + v2 * 16 * 16 + v3 * 16 + v4;
}
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t bogus ( const uint8_t *src ) {
return 0;
}
static inline uint32_t convertone ( uint8_t c ) {
uint32_t v = ( c & 0xF ) + 9 * ( c >> 6 );
return v;
}
// no error checking
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_math ( const uint8_t *src ) {
uint32_t v1 = convertone ( src [ 0 ] );
uint32_t v2 = convertone ( src [ 1 ] );
uint32_t v3 = convertone ( src [ 2 ] );
uint32_t v4 = convertone ( src [ 3 ] );
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
#ifdef __BMI2__
// no error checking
// http://0x80.pl/notesen/2014-10-09-pext-convert-ascii-hex-to-num.html
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_mula ( const uint8_t *src ) {
uint32_t val;
memcpy ( &val, src, 4 );
const uint32_t input = __builtin_bswap32 ( val );
const uint32_t letter = input & 0x40404040;
const uint32_t shift = letter >> 3 | letter >> 6; // 9 if char ~ [a-fA-F], 0 otherwise
const uint32_t adjusted = input + shift;
// gather lower nibbles of each byte
return _pext_u32 ( adjusted, 0x0f0f0f0f ); // slow on AMD!
}
#endif
// no error checking
// https://johnnylee-sde.github.io/Fast-hex-number-string-to-int/
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_lee ( const uint8_t *src ) {
uint32_t val;
memcpy ( &val, src, 4 );
val = ( val & 0xf0f0f0f ) + 9 * ( val >> 6 & 0x1010101 );
val = ( val | val << 12 ) & 0xff00ff00;
return ( val >> 24 | val ) & 0xffff;
}
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u32_aqrit ( const uint8_t *src ) {
uint32_t in;
uint64_t v, x;
const int64_t magic = INT64_C ( 0x1001001000000000 );
memcpy ( &in, src, 4 );
v = in;
x = ( ( ( 0x00404040 & v ) >> 6 ) * 9 ) + ( v & 0x000F0F0F ); // do 3
x = ( ( ( uint64_t ) ( ( int64_t ) x * magic ) ) >> 48 ) & ~15; // bswap and pack
v = ( ( v >> 30 ) * 9 ) + ( ( v >> 24 ) & 0x0F ); // do the 4th
return ( x | v );
}
#ifdef __SSE2__
uint64_t hex_to_u64_sse ( const uint8_t* string ) {
/*
'0' .. '9' = 0x30 .. 0x39
'a' .. 'f' = 0x61 .. 0x66
'A' .. 'F' = 0x41 .. 0x46
*/
__m128i input = _mm_loadu_si128 ( ( const __m128i* )string );
const __m128i mask = _mm_set1_epi8 ( 0x0f );
const __m128i hi_nibbles = _mm_and_si128 ( _mm_srli_epi32 ( input, 4 ), mask );
const __m128i lo_nibbles = _mm_and_si128 ( input, mask );
const int digits = 0x80;
const int letters = 0x40;
const int error1 = 0x10;
const int error2 = 0x20;
// pretend that input & 0xf0 == 0x30
const __m128i lo_ascii_09 = _mm_shuffle_epi8 ( _mm_setr_epi8 (
0 + digits,
1 + digits,
2 + digits,
3 + digits,
4 + digits,
5 + digits,
6 + digits,
7 + digits,
8 + digits,
9 + digits,
error1, error1, error1, error1, error1, error1 ), lo_nibbles );
// pretend that input & 0xf0 == 0x40 or 0x60
const __m128i lo_ascii_af = _mm_shuffle_epi8 ( _mm_setr_epi8 (
error1,
10 + letters,
11 + letters,
12 + letters,
13 + letters,
14 + letters,
15 + letters,
error1,
error1,
error1,
error1, error1, error1, error1, error1, error1 ), lo_nibbles );
// check value of higher nibble
const __m128i hi_class = _mm_shuffle_epi8 ( _mm_setr_epi8 (
error2, error2, error2, digits /* 0x30 */,
letters /* 0x40 */, error2, letters /* 0x60 */, error2,
error2, error2, error2, error2,
error2, error2, error2, error2 ), hi_nibbles );
// and find out which preposition was true
const __m128i ascii_09_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_09 ), hi_class );
const __m128i ascii_af_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_af ), hi_class );
const __m128i ascii_09 = _mm_and_si128 ( ascii_09_mask, lo_ascii_09 );
const __m128i ascii_af = _mm_and_si128 ( ascii_af_mask, lo_ascii_af );
#if 0
// MSBs of error vector indicates invalid characters
const __m128i errorvec = _mm_xor_si128 ( ascii_09_mask | ascii_af_mask, _mm_set1_epi8 ( -1 ) );
if ( _mm_movemask_epi8 ( errorvec ) )
abort ( );
#endif
const __m128i result = _mm_and_si128 ( _mm_or_si128 ( ascii_09, ascii_af ), mask );
// now each byte of result have value 0 .. 15, we're going to merge nibbles:
const __m128i t0 = result;
const __m128i t1 = _mm_srli_epi32 ( result, 4 );
const __m128i t3 = _mm_or_si128 ( t0, t1 );
const __m128i t4 = _mm_and_si128 ( t3, _mm_set1_epi16 ( 0x00ff ) ); // keep just lower bytes in words
const __m128i t5 = _mm_packus_epi16 ( t4, t4 ); // now lower part of the reg has 8-byte value
return _mm_cvtsi128_si64 ( t5 );
}
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_to_u64_sse_wrapper ( const uint8_t* string ) {
// wrap to fullfil the template constraint
return hex_to_u64_sse ( string );
}
#endif
uint32_t lookup2 [ 65536 ];
void init_lookup2 ( ) {
for ( int i = 0; i < 0x10000; i++ ) {
lookup2 [ i ] = ( uint32_t ) -1;
}
for ( int i = 0; i < 256; i++ ) {
char digits [ 3 ];
sprintf ( digits, "%02x", i );
uint16_t lvalue;
memcpy ( &lvalue, digits, 2 );
lookup2 [ lvalue ] = i;
char digits_upper [ 3 ];
digits_upper [ 0 ] = toupper ( digits [ 0 ] );
digits_upper [ 1 ] = toupper ( digits [ 1 ] );
digits_upper [ 2 ] = 0;
if ( ( digits_upper [ 0 ] != digits [ 0 ] ) ||
( digits_upper [ 1 ] != digits [ 1 ] ) ) {
memcpy ( &lvalue, digits_upper, 2 );
lookup2 [ lvalue ] = i;
}
}
}
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_2bytes_lookup ( const uint8_t *src ) {
uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ];
uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ];
return v1 << 8 | v2;
}
uint32_t hex_2bytes_lookup_inline ( const uint8_t *src ) {
uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ];
uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ];
return v1 << 8 | v2;
}
int table_lookup ( char s_ ) noexcept {
struct KeyValue { const char key, value; };
static constexpr std::array<const KeyValue, 22> table { {
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 },
{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 },
{ 'a', 10 }, { 'b', 11 }, { 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 }
} };
for ( auto [ k, v ] : table )
if ( k == s_ )
return v;
return -1;
}
int table_lookup00 ( char s_ ) noexcept {
struct KeyValue { const char key, value; };
static constexpr std::array<const KeyValue, 16> table { {
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 },
{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }
} };
if ( s_ > 94 )
s_ -= 47;
for ( auto [ k, v ] : table )
if ( k == s_ )
return v;
return -1;
}
int table_lookup0 ( char s_ ) noexcept {
if ( s_ > 94 ) s_ -= ( 95 - 48 );
struct KeyValue { const char key, value; };
static constexpr KeyValue table [ ] {
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 },
{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }
};
const KeyValue * p = & table [ 0 ];
if ( p [ 8 ].key <= s_ ) p += 8;
if ( p [ 4 ].key <= s_ ) p += 4;
if ( p [ 2 ].key <= s_ ) p += 2;
if ( p [ 1 ].key <= s_ ) p += 1;
return p->value;
}
int table_lookup1 ( char s_ ) noexcept {
if ( s_ > 94 ) return s_ - ( 95 - 10 );
if ( s_ > 47 ) return s_ - ( 48 - 10 );
return s_ - 30;
}
int table_lookup2 ( char s_ ) noexcept {
if ( s_ > 94 ) s_ -= ( 95 - 48 );
constexpr char key [ ] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
for ( int i = 0; i < 16; ++i )
if ( key [ i ] == s_ )
return i;
return -1;
}
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_table_lookup ( const uint8_t *src ) {
uint32_t v1 = table_lookup ( src [ 0 ] );
uint32_t v2 = table_lookup ( src [ 1 ] );
uint32_t v3 = table_lookup ( src [ 2 ] );
uint32_t v4 = table_lookup ( src [ 3 ] );
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
frozen::map<char, char, 22> map_lookup {
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 }, { '8', 8 },
{ '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }, { 'a', 10 }, { 'b', 11 },
{ 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 }
};
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_frozen_map ( const uint8_t *src ) {
uint32_t v1 = map_lookup.at ( src [ 0 ] );
uint32_t v2 = map_lookup.at ( src [ 1 ] );
uint32_t v3 = map_lookup.at ( src [ 2 ] );
uint32_t v4 = map_lookup.at ( src [ 3 ] );
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
constexpr frozen::unordered_map<char, char, 22> unordered_map_lookup {
{ '0', 0 },
{ '1', 1 },
{ '2', 2 },
{ '3', 3 },
{ '4', 4 },
{ '5', 5 },
{ '6', 6 },
{ '7', 7 },
{ '8', 8 },
{ '9', 9 },
{ 'a', 10 },
{ 'b', 11 },
{ 'c', 12 },
{ 'd', 13 },
{ 'e', 14 },
{ 'f', 15 },
{ 'A', 10 },
{ 'B', 11 },
{ 'C', 12 },
{ 'D', 13 },
{ 'E', 14 },
{ 'F', 15 }
};
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem
uint32_t hex_frozen_unordered_map ( const uint8_t *src ) {
uint32_t v1 = unordered_map_lookup.at ( src [ 0 ] );
uint32_t v2 = unordered_map_lookup.at ( src [ 1 ] );
uint32_t v3 = unordered_map_lookup.at ( src [ 2 ] );
uint32_t v4 = unordered_map_lookup.at ( src [ 3 ] );
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 );
}
template <uint32_t ( *F )( const uint8_t *src )> void test ( size_t N ) {
uint8_t *x = ( uint8_t * ) malloc ( sizeof ( uint8_t ) * ( N + 3 ) );
srand ( 1235 );
for ( size_t i = 0; i < N + 3; i++ ) {
int digit = ( rand ( ) % 16 );
if ( digit < 10 )
x [ i ] = 48 + digit;
else
x [ i ] = 65 + digit - 10;
}
for ( size_t i = 0; i < N; i++ ) {
assert ( hex_to_u32_lookup ( x + i ) == hex_to_u32_math ( x + i ) );
}
size_t iterations = 50;
std::random_device rd;
std::mt19937 gen ( rd ( ) );
std::uniform_int_distribution<> dis ( 0, 0xFFFF );
plf::nanotimer timer;
timer.start ( );
uint64_t sum = 0;
for ( uint32_t i = 0; i < iterations; i++ ) {
for ( size_t i = 0; i < N; i++ ) {
sum += F ( x + i );
}
}
uint64_t end = timer.get_elapsed_ms ( );
free ( x );
printf ( "sum = %zu \n", ( size_t ) sum );
printf ( "time = %zu \n", ( uint64_t ) end );
}
int main ( ) {
init_lookup2 ( );
size_t N = 1000 * 1000;
printf ( "N= %zu \n", N );
printf ( "empty function (overhead):\n" );
test<bogus> ( N );
printf ( "lookup:\n" );
test<hex_to_u32_lookup> ( N );
printf ( "lookup (inline):\n" );
test<hex_to_u32_lookup_inline> ( N );
printf ( "lookup (alt):\n" );
test<hex_to_u32_lookup_alt> ( N );
printf ( "math:\n" );
test<hex_to_u32_math> ( N );
#ifdef __BMI2__
printf ( "mula:\n" );
test<hex_to_u32_mula> ( N );
#endif
printf ( "lee:\n" );
test<hex_to_u32_lee> ( N );
printf ( "aqrit:\n" );
test<hex_to_u32_aqrit> ( N );
#ifdef __SSE2__
printf ( "SSE (uint64_t):\n" );
test<hex_to_u64_sse_wrapper> ( N );
#endif
printf ( "big lookup:\n" );
test<hex_2bytes_lookup> ( N );
printf ( "big lookup (inline):\n" );
test<hex_2bytes_lookup_inline> ( N );
printf ( "table_lookup:\n" );
test<hex_table_lookup> ( N );
printf ( "frozen_map:\n" );
test<hex_frozen_map> ( N );
printf ( "frozen_unordered_map:\n" );
test<hex_frozen_unordered_map> ( N );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment