Created
April 22, 2019 01:52
-
-
Save degski/a10a66d12971fa5709494af1ec131a32 to your computer and use it in GitHub Desktop.
test_hexstring_to_int.cpp, followiing this post by Danile Lemire: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cassert> | |
#include <cinttypes> | |
#include <cstdint> | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstring> | |
#include <cstring> | |
#include <iomanip> | |
#include <iostream> | |
#include <random> | |
#include <string> | |
#include <vector> | |
#include <intrin.h> | |
#include <frozen/map.h> | |
#include <frozen/unordered_map.h> | |
// This post is relevant: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ | |
// I don't think this compiles with MSVC, on windows, use Clang/LLVM (or mingw/gcc). | |
#include <plf/plf_nanotimer.h> // https://github.com/mattreecebentley/plf_nanotimer | |
const signed char digittoval [ 256 ] = { | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, | |
9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, | |
-1, -1, -1, -1, -1, -1, -1, -1, -1 }; | |
// returns a value with the high 16 bits set if not valid | |
// otherwise returns the conversion of the 4 hex digits at src into the bottom | |
// 16 bits of the 32-bit | |
// return register | |
// | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u32_lookup ( | |
const uint8_t *src ) { // strictly speaking, static inline is a C-ism | |
uint32_t v1 = | |
digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF. | |
uint32_t v2 = digittoval [ src [ 1 ] ]; | |
uint32_t v3 = digittoval [ src [ 2 ] ]; | |
uint32_t v4 = digittoval [ src [ 3 ] ]; | |
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 ); | |
} | |
uint32_t hex_to_u32_lookup_inline ( | |
const uint8_t *src ) { // strictly speaking, static inline is a C-ism | |
uint32_t v1 = | |
digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF. | |
uint32_t v2 = digittoval [ src [ 1 ] ]; | |
uint32_t v3 = digittoval [ src [ 2 ] ]; | |
uint32_t v4 = digittoval [ src [ 3 ] ]; | |
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 ); | |
} | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u32_lookup_alt ( | |
const uint8_t *src ) { // strictly speaking, static inline is a C-ism | |
uint32_t v1 = | |
digittoval [ src [ 0 ] ]; // uint32_t v1 = -1 becomes uint32_t v1 = 0xFFFFFFFF. | |
uint32_t v2 = digittoval [ src [ 1 ] ]; | |
uint32_t v3 = digittoval [ src [ 2 ] ]; | |
uint32_t v4 = digittoval [ src [ 3 ] ]; | |
return v1 * 16 * 16 * 16 + v2 * 16 * 16 + v3 * 16 + v4; | |
} | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t bogus ( const uint8_t *src ) { | |
return 0; | |
} | |
static inline uint32_t convertone ( uint8_t c ) { | |
uint32_t v = ( c & 0xF ) + 9 * ( c >> 6 ); | |
return v; | |
} | |
// no error checking | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u32_math ( const uint8_t *src ) { | |
uint32_t v1 = convertone ( src [ 0 ] ); | |
uint32_t v2 = convertone ( src [ 1 ] ); | |
uint32_t v3 = convertone ( src [ 2 ] ); | |
uint32_t v4 = convertone ( src [ 3 ] ); | |
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 ); | |
} | |
#ifdef __BMI2__ | |
// no error checking | |
// http://0x80.pl/notesen/2014-10-09-pext-convert-ascii-hex-to-num.html | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u32_mula ( const uint8_t *src ) { | |
uint32_t val; | |
memcpy ( &val, src, 4 ); | |
const uint32_t input = __builtin_bswap32 ( val ); | |
const uint32_t letter = input & 0x40404040; | |
const uint32_t shift = letter >> 3 | letter >> 6; // 9 if char ~ [a-fA-F], 0 otherwise | |
const uint32_t adjusted = input + shift; | |
// gather lower nibbles of each byte | |
return _pext_u32 ( adjusted, 0x0f0f0f0f ); // slow on AMD! | |
} | |
#endif | |
// no error checking | |
// https://johnnylee-sde.github.io/Fast-hex-number-string-to-int/ | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u32_lee ( const uint8_t *src ) { | |
uint32_t val; | |
memcpy ( &val, src, 4 ); | |
val = ( val & 0xf0f0f0f ) + 9 * ( val >> 6 & 0x1010101 ); | |
val = ( val | val << 12 ) & 0xff00ff00; | |
return ( val >> 24 | val ) & 0xffff; | |
} | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u32_aqrit ( const uint8_t *src ) { | |
uint32_t in; | |
uint64_t v, x; | |
const int64_t magic = INT64_C ( 0x1001001000000000 ); | |
memcpy ( &in, src, 4 ); | |
v = in; | |
x = ( ( ( 0x00404040 & v ) >> 6 ) * 9 ) + ( v & 0x000F0F0F ); // do 3 | |
x = ( ( ( uint64_t ) ( ( int64_t ) x * magic ) ) >> 48 ) & ~15; // bswap and pack | |
v = ( ( v >> 30 ) * 9 ) + ( ( v >> 24 ) & 0x0F ); // do the 4th | |
return ( x | v ); | |
} | |
#ifdef __SSE2__ | |
uint64_t hex_to_u64_sse ( const uint8_t* string ) { | |
/* | |
'0' .. '9' = 0x30 .. 0x39 | |
'a' .. 'f' = 0x61 .. 0x66 | |
'A' .. 'F' = 0x41 .. 0x46 | |
*/ | |
__m128i input = _mm_loadu_si128 ( ( const __m128i* )string ); | |
const __m128i mask = _mm_set1_epi8 ( 0x0f ); | |
const __m128i hi_nibbles = _mm_and_si128 ( _mm_srli_epi32 ( input, 4 ), mask ); | |
const __m128i lo_nibbles = _mm_and_si128 ( input, mask ); | |
const int digits = 0x80; | |
const int letters = 0x40; | |
const int error1 = 0x10; | |
const int error2 = 0x20; | |
// pretend that input & 0xf0 == 0x30 | |
const __m128i lo_ascii_09 = _mm_shuffle_epi8 ( _mm_setr_epi8 ( | |
0 + digits, | |
1 + digits, | |
2 + digits, | |
3 + digits, | |
4 + digits, | |
5 + digits, | |
6 + digits, | |
7 + digits, | |
8 + digits, | |
9 + digits, | |
error1, error1, error1, error1, error1, error1 ), lo_nibbles ); | |
// pretend that input & 0xf0 == 0x40 or 0x60 | |
const __m128i lo_ascii_af = _mm_shuffle_epi8 ( _mm_setr_epi8 ( | |
error1, | |
10 + letters, | |
11 + letters, | |
12 + letters, | |
13 + letters, | |
14 + letters, | |
15 + letters, | |
error1, | |
error1, | |
error1, | |
error1, error1, error1, error1, error1, error1 ), lo_nibbles ); | |
// check value of higher nibble | |
const __m128i hi_class = _mm_shuffle_epi8 ( _mm_setr_epi8 ( | |
error2, error2, error2, digits /* 0x30 */, | |
letters /* 0x40 */, error2, letters /* 0x60 */, error2, | |
error2, error2, error2, error2, | |
error2, error2, error2, error2 ), hi_nibbles ); | |
// and find out which preposition was true | |
const __m128i ascii_09_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_09 ), hi_class ); | |
const __m128i ascii_af_mask = _mm_cmpeq_epi8 ( _mm_and_si128 ( hi_class, lo_ascii_af ), hi_class ); | |
const __m128i ascii_09 = _mm_and_si128 ( ascii_09_mask, lo_ascii_09 ); | |
const __m128i ascii_af = _mm_and_si128 ( ascii_af_mask, lo_ascii_af ); | |
#if 0 | |
// MSBs of error vector indicates invalid characters | |
const __m128i errorvec = _mm_xor_si128 ( ascii_09_mask | ascii_af_mask, _mm_set1_epi8 ( -1 ) ); | |
if ( _mm_movemask_epi8 ( errorvec ) ) | |
abort ( ); | |
#endif | |
const __m128i result = _mm_and_si128 ( _mm_or_si128 ( ascii_09, ascii_af ), mask ); | |
// now each byte of result have value 0 .. 15, we're going to merge nibbles: | |
const __m128i t0 = result; | |
const __m128i t1 = _mm_srli_epi32 ( result, 4 ); | |
const __m128i t3 = _mm_or_si128 ( t0, t1 ); | |
const __m128i t4 = _mm_and_si128 ( t3, _mm_set1_epi16 ( 0x00ff ) ); // keep just lower bytes in words | |
const __m128i t5 = _mm_packus_epi16 ( t4, t4 ); // now lower part of the reg has 8-byte value | |
return _mm_cvtsi128_si64 ( t5 ); | |
} | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_to_u64_sse_wrapper ( const uint8_t* string ) { | |
// wrap to fullfil the template constraint | |
return hex_to_u64_sse ( string ); | |
} | |
#endif | |
uint32_t lookup2 [ 65536 ]; | |
void init_lookup2 ( ) { | |
for ( int i = 0; i < 0x10000; i++ ) { | |
lookup2 [ i ] = ( uint32_t ) -1; | |
} | |
for ( int i = 0; i < 256; i++ ) { | |
char digits [ 3 ]; | |
sprintf ( digits, "%02x", i ); | |
uint16_t lvalue; | |
memcpy ( &lvalue, digits, 2 ); | |
lookup2 [ lvalue ] = i; | |
char digits_upper [ 3 ]; | |
digits_upper [ 0 ] = toupper ( digits [ 0 ] ); | |
digits_upper [ 1 ] = toupper ( digits [ 1 ] ); | |
digits_upper [ 2 ] = 0; | |
if ( ( digits_upper [ 0 ] != digits [ 0 ] ) || | |
( digits_upper [ 1 ] != digits [ 1 ] ) ) { | |
memcpy ( &lvalue, digits_upper, 2 ); | |
lookup2 [ lvalue ] = i; | |
} | |
} | |
} | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_2bytes_lookup ( const uint8_t *src ) { | |
uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ]; | |
uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ]; | |
return v1 << 8 | v2; | |
} | |
uint32_t hex_2bytes_lookup_inline ( const uint8_t *src ) { | |
uint32_t v1 = lookup2 [ ( ( uint16_t* ) src ) [ 0 ] ]; | |
uint32_t v2 = lookup2 [ ( ( uint16_t* ) src ) [ 1 ] ]; | |
return v1 << 8 | v2; | |
} | |
int table_lookup ( char s_ ) noexcept { | |
struct KeyValue { const char key, value; }; | |
static constexpr std::array<const KeyValue, 22> table { { | |
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 }, | |
{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }, | |
{ 'a', 10 }, { 'b', 11 }, { 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 } | |
} }; | |
for ( auto [ k, v ] : table ) | |
if ( k == s_ ) | |
return v; | |
return -1; | |
} | |
int table_lookup00 ( char s_ ) noexcept { | |
struct KeyValue { const char key, value; }; | |
static constexpr std::array<const KeyValue, 16> table { { | |
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 }, | |
{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 } | |
} }; | |
if ( s_ > 94 ) | |
s_ -= 47; | |
for ( auto [ k, v ] : table ) | |
if ( k == s_ ) | |
return v; | |
return -1; | |
} | |
int table_lookup0 ( char s_ ) noexcept { | |
if ( s_ > 94 ) s_ -= ( 95 - 48 ); | |
struct KeyValue { const char key, value; }; | |
static constexpr KeyValue table [ ] { | |
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 }, | |
{ '8', 8 }, { '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 } | |
}; | |
const KeyValue * p = & table [ 0 ]; | |
if ( p [ 8 ].key <= s_ ) p += 8; | |
if ( p [ 4 ].key <= s_ ) p += 4; | |
if ( p [ 2 ].key <= s_ ) p += 2; | |
if ( p [ 1 ].key <= s_ ) p += 1; | |
return p->value; | |
} | |
int table_lookup1 ( char s_ ) noexcept { | |
if ( s_ > 94 ) return s_ - ( 95 - 10 ); | |
if ( s_ > 47 ) return s_ - ( 48 - 10 ); | |
return s_ - 30; | |
} | |
int table_lookup2 ( char s_ ) noexcept { | |
if ( s_ > 94 ) s_ -= ( 95 - 48 ); | |
constexpr char key [ ] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; | |
for ( int i = 0; i < 16; ++i ) | |
if ( key [ i ] == s_ ) | |
return i; | |
return -1; | |
} | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_table_lookup ( const uint8_t *src ) { | |
uint32_t v1 = table_lookup ( src [ 0 ] ); | |
uint32_t v2 = table_lookup ( src [ 1 ] ); | |
uint32_t v3 = table_lookup ( src [ 2 ] ); | |
uint32_t v4 = table_lookup ( src [ 3 ] ); | |
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 ); | |
} | |
frozen::map<char, char, 22> map_lookup { | |
{ '0', 0 }, { '1', 1 }, { '2', 2 }, { '3', 3 }, { '4', 4 }, { '5', 5 }, { '6', 6 }, { '7', 7 }, { '8', 8 }, | |
{ '9', 9 }, { 'A', 10 }, { 'B', 11 }, { 'C', 12 }, { 'D', 13 }, { 'E', 14 }, { 'F', 15 }, { 'a', 10 }, { 'b', 11 }, | |
{ 'c', 12 }, { 'd', 13 }, { 'e', 14 }, { 'f', 15 } | |
}; | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_frozen_map ( const uint8_t *src ) { | |
uint32_t v1 = map_lookup.at ( src [ 0 ] ); | |
uint32_t v2 = map_lookup.at ( src [ 1 ] ); | |
uint32_t v3 = map_lookup.at ( src [ 2 ] ); | |
uint32_t v4 = map_lookup.at ( src [ 3 ] ); | |
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 ); | |
} | |
constexpr frozen::unordered_map<char, char, 22> unordered_map_lookup { | |
{ '0', 0 }, | |
{ '1', 1 }, | |
{ '2', 2 }, | |
{ '3', 3 }, | |
{ '4', 4 }, | |
{ '5', 5 }, | |
{ '6', 6 }, | |
{ '7', 7 }, | |
{ '8', 8 }, | |
{ '9', 9 }, | |
{ 'a', 10 }, | |
{ 'b', 11 }, | |
{ 'c', 12 }, | |
{ 'd', 13 }, | |
{ 'e', 14 }, | |
{ 'f', 15 }, | |
{ 'A', 10 }, | |
{ 'B', 11 }, | |
{ 'C', 12 }, | |
{ 'D', 13 }, | |
{ 'E', 14 }, | |
{ 'F', 15 } | |
}; | |
__attribute__ ( ( noinline ) ) // we do not want the compiler to rewrite the problem | |
uint32_t hex_frozen_unordered_map ( const uint8_t *src ) { | |
uint32_t v1 = unordered_map_lookup.at ( src [ 0 ] ); | |
uint32_t v2 = unordered_map_lookup.at ( src [ 1 ] ); | |
uint32_t v3 = unordered_map_lookup.at ( src [ 2 ] ); | |
uint32_t v4 = unordered_map_lookup.at ( src [ 3 ] ); | |
return static_cast< uint32_t >( v1 << 12 | v2 << 8 | v3 << 4 | v4 ); | |
} | |
template <uint32_t ( *F )( const uint8_t *src )> void test ( size_t N ) { | |
uint8_t *x = ( uint8_t * ) malloc ( sizeof ( uint8_t ) * ( N + 3 ) ); | |
srand ( 1235 ); | |
for ( size_t i = 0; i < N + 3; i++ ) { | |
int digit = ( rand ( ) % 16 ); | |
if ( digit < 10 ) | |
x [ i ] = 48 + digit; | |
else | |
x [ i ] = 65 + digit - 10; | |
} | |
for ( size_t i = 0; i < N; i++ ) { | |
assert ( hex_to_u32_lookup ( x + i ) == hex_to_u32_math ( x + i ) ); | |
} | |
size_t iterations = 50; | |
std::random_device rd; | |
std::mt19937 gen ( rd ( ) ); | |
std::uniform_int_distribution<> dis ( 0, 0xFFFF ); | |
plf::nanotimer timer; | |
timer.start ( ); | |
uint64_t sum = 0; | |
for ( uint32_t i = 0; i < iterations; i++ ) { | |
for ( size_t i = 0; i < N; i++ ) { | |
sum += F ( x + i ); | |
} | |
} | |
uint64_t end = timer.get_elapsed_ms ( ); | |
free ( x ); | |
printf ( "sum = %zu \n", ( size_t ) sum ); | |
printf ( "time = %zu \n", ( uint64_t ) end ); | |
} | |
int main ( ) { | |
init_lookup2 ( ); | |
size_t N = 1000 * 1000; | |
printf ( "N= %zu \n", N ); | |
printf ( "empty function (overhead):\n" ); | |
test<bogus> ( N ); | |
printf ( "lookup:\n" ); | |
test<hex_to_u32_lookup> ( N ); | |
printf ( "lookup (inline):\n" ); | |
test<hex_to_u32_lookup_inline> ( N ); | |
printf ( "lookup (alt):\n" ); | |
test<hex_to_u32_lookup_alt> ( N ); | |
printf ( "math:\n" ); | |
test<hex_to_u32_math> ( N ); | |
#ifdef __BMI2__ | |
printf ( "mula:\n" ); | |
test<hex_to_u32_mula> ( N ); | |
#endif | |
printf ( "lee:\n" ); | |
test<hex_to_u32_lee> ( N ); | |
printf ( "aqrit:\n" ); | |
test<hex_to_u32_aqrit> ( N ); | |
#ifdef __SSE2__ | |
printf ( "SSE (uint64_t):\n" ); | |
test<hex_to_u64_sse_wrapper> ( N ); | |
#endif | |
printf ( "big lookup:\n" ); | |
test<hex_2bytes_lookup> ( N ); | |
printf ( "big lookup (inline):\n" ); | |
test<hex_2bytes_lookup_inline> ( N ); | |
printf ( "table_lookup:\n" ); | |
test<hex_table_lookup> ( N ); | |
printf ( "frozen_map:\n" ); | |
test<hex_frozen_map> ( N ); | |
printf ( "frozen_unordered_map:\n" ); | |
test<hex_frozen_unordered_map> ( N ); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment