Skip to content

Instantly share code, notes, and snippets.

@mmozeiko
Last active March 6, 2026 08:57
Show Gist options
  • Select an option

  • Save mmozeiko/68c2d1ce466422b506b2c86e4f603f53 to your computer and use it in GitHub Desktop.

Select an option

Save mmozeiko/68c2d1ce466422b506b2c86e4f603f53 to your computer and use it in GitHub Desktop.
convert 32-bit or 64-bit integer to hex string
#pragma once
#include <stdint.h>
#include <string.h>
// Formats x as 8 ASCII hex digits, most significant digit first, into hex.
// A non-zero uppercase selects 'A'..'F', zero selects 'a'..'f'.
// No terminating NUL is written.
static inline void hex_from_u32(void* hex, uint32_t x, int uppercase); // writes exactly 8 bytes
// Formats x as 16 ASCII hex digits, most significant digit first, into hex.
// A non-zero uppercase selects 'A'..'F', zero selects 'a'..'f'.
// No terminating NUL is written.
static inline void hex_from_u64(void* hex, uint64_t x, int uppercase); // writes exactly 16 bytes
// implementation
// Compile-time backend selection: at most one of HEX_AVX512, HEX_SSSE3,
// HEX_SSE2, HEX_NEON, HEX_WASM, HEX_RVV gets defined, based on the compiler's
// predefined target macros. The first matching branch wins, and the matching
// intrinsics header is included with it.
// Defining HEX_NO_SIMD before including this file skips the whole chain, so
// none of the backend macros is defined and the functions below compile
// their scalar SWAR fallback.
#if !defined(HEX_NO_SIMD)
# if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512VBMI__)
# include <immintrin.h>
# define HEX_AVX512
# elif defined(__AVX2__) // actually only SSSE3 required
# include <tmmintrin.h>
# define HEX_SSSE3
// any 64-bit x86 target without the features above
# elif defined(_M_AMD64) || defined(__x86_64__)
# include <emmintrin.h>
# define HEX_SSE2
# elif defined(_M_ARM64) || defined(__aarch64__)
# include <arm_neon.h>
# define HEX_NEON
# elif defined(__wasm_simd128__)
# include <wasm_simd128.h>
# define HEX_WASM
// RISC-V with the vector extension reported as version 1.0 or newer
# elif defined(__riscv) && (__riscv_v >= 1000000)
# include <riscv_vector.h>
# define HEX_RVV // assumes -march=rva23u64 or -march=rv64gbv_zvbb or -march=rv64gv_zba_zbb_zbs_zvbb
# endif
#endif
// Compiler abstraction layer:
//   HEX_BSWAP32(x) / HEX_BSWAP64(x) - reverse the byte order of a 32-bit /
//                                     64-bit value using the compiler's
//                                     intrinsic or builtin
//   HexUnaligned64                  - packed struct holding one uint64_t, so
//                                     the value can be accessed through a
//                                     pointer that has no alignment guarantee
#if defined(_MSC_VER)
# include <intrin.h>
# define HEX_BSWAP32(x) _byteswap_ulong(x)
# define HEX_BSWAP64(x) _byteswap_uint64(x)
// MSVC spells "packed" as a pack(1) pragma scoped around the declaration
# pragma pack(push, 1)
typedef struct { uint64_t value; } HexUnaligned64;
# pragma pack(pop)
#else
// every non-MSVC compiler is assumed to support the GCC-style builtins and attribute
# define HEX_BSWAP32(x) __builtin_bswap32(x)
# define HEX_BSWAP64(x) __builtin_bswap64(x)
typedef struct __attribute__((packed)) { uint64_t value; } HexUnaligned64;
#endif
// Writes x as exactly 8 ASCII hexadecimal digits at hex, most significant
// digit first. No terminating NUL is written, and every store used below
// tolerates an arbitrarily aligned hex.
//
//   hex       - destination with room for at least 8 bytes
//   x         - value to format
//   uppercase - non-zero selects digits 'A'..'F', zero selects 'a'..'f'
//
// The body is picked at compile time by the HEX_* backend macros; when none
// of them is defined the scalar SWAR code at the end is used.
void hex_from_u32(void* hex, uint32_t x, int uppercase)
{
#if defined(HEX_AVX512)
    __m128i bytes = _mm_cvtsi32_si128(x);
    // each control byte is a bit offset into the 64-bit lane; the byte extracted
    // there carries the wanted nibble in its low 4 bits, highest nibble first
    const __m128i bits = _mm_setr_epi8(28,24,20,16,12,8,4,0, 0,0,0,0,0,0,0,0);
    __m128i nibbles = _mm_multishift_epi64_epi8(bits, bytes);
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    // 16-entry byte permute indexes with the low 4 bits only, so no masking is needed
    __m128i result = _mm_permutexvar_epi8(nibbles, _mm_loadu_si128((const void*)table));
    _mm_storel_epi64(hex, result);
#elif defined(HEX_SSSE3)
    // byte-swap so the most significant byte becomes byte 0 of the vector
    __m128i bytes = _mm_cvtsi32_si128(HEX_BSWAP32(x));
    // interleave (byte >> 4) with byte, then keep the low 4 bits: hi,lo,hi,lo,...
    __m128i nibbles = _mm_unpacklo_epi8(_mm_srli_epi64(bytes, 4), bytes);
    nibbles = _mm_and_si128(nibbles, _mm_set1_epi8(0xf));
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    // table lookup: each nibble indexes the 16-character digit table
    __m128i result = _mm_shuffle_epi8(_mm_loadu_si128((const void*)table), nibbles);
    _mm_storel_epi64(hex, result);
#elif defined(HEX_SSE2)
    __m128i bytes = _mm_cvtsi32_si128(HEX_BSWAP32(x));
    // 4-bit nibbles
    __m128i nibbles = _mm_unpacklo_epi8(_mm_srli_epi64(bytes, 4), bytes);
    nibbles = _mm_and_si128(nibbles, _mm_set1_epi8(0xf));
    // if nibble > 9
    __m128i mask = _mm_cmpgt_epi8(nibbles, _mm_set1_epi8(9));
    // result for 0..9
    __m128i result = _mm_add_epi8(nibbles, _mm_set1_epi8('0'));
    // update for a..f
    const char offset = (uppercase ? 'A' : 'a') - '0' - 10;
    result = _mm_add_epi8(result, _mm_and_si128(mask, _mm_set1_epi8(offset)));
    _mm_storel_epi64(hex, result);
#elif defined(HEX_NEON)
    uint8x8_t bytes = vreinterpret_u8_u32(vcreate_u32(HEX_BSWAP32(x)));
    // interleave high and low nibbles of the four value bytes
    uint8x8_t nibbles = vzip1_u8(vshr_n_u8(bytes, 4), bytes);
    nibbles = vand_u8(nibbles, vdup_n_u8(0xf));
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    uint8x8_t result = vqtbl1_u8(vld1q_u8((const uint8_t*)table), nibbles);
    vst1_u8(hex, result);
#elif defined(HEX_WASM)
    v128_t bytes = wasm_u32x4_make(x, 0, 0, 0);
    // indices 0..15 pick from the shifted vector (high nibbles), 16..31 from the
    // unshifted one (low nibbles); walking 3,2,1,0 puts the top byte first
    v128_t nibbles = wasm_i8x16_shuffle(wasm_u64x2_shr(bytes, 4), bytes,
        3,19, 2,18, 1,17, 0,16, 0,0,0,0,0,0,0,0
    );
    nibbles = wasm_v128_and(nibbles, wasm_i8x16_splat(0xf));
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    v128_t result = wasm_i8x16_swizzle(wasm_v128_load(table), nibbles);
    wasm_v128_store64_lane(hex, result, 0);
#elif defined(HEX_RVV)
    vuint32m1_t x32 = __riscv_vmv_s_x_u32m1(HEX_BSWAP32(x), 1);
    vuint8m1_t xlo = __riscv_vreinterpret_v_u32m1_u8m1(x32);
    vuint8m1_t xhi = __riscv_vsrl_vx_u8m1(xlo, 4, 8);
    // interleave bytes from xhi and xlo
    // (widening xhi + xlo + 0xff * xlo == xhi + 256 * xlo per 16-bit element)
    vuint16m2_t n8 = __riscv_vwmaccu_vx_u16m2(__riscv_vwaddu_vv_u16m2(xhi, xlo, 8), 0xff, xlo, 8);
    vuint16m1_t n16 = __riscv_vget_v_u16m2_u16m1(n8, 0);
    // 4-bit nibbles
    vuint8m1_t nibbles = __riscv_vand_vx_u8m1(__riscv_vreinterpret_v_u16m1_u8m1(n16), 0xf, 8);
    // if nibble > 9
    vbool8_t mask = __riscv_vmsgtu_vx_u8m1_b8(nibbles, 9, 8);
    // result for 0..9
    vuint8m1_t result = __riscv_vadd_vx_u8m1(nibbles, '0', 8);
    // update for a..f
    const char offset = (uppercase ? 'A' : 'a') - '0' - 10;
    result = __riscv_vadd_vx_u8m1_mu(mask, result, result, offset, 8);
    __riscv_vse8_v_u8m1(hex, result, 8);
#else // SWAR
    uint64_t n = x;
    // get nibbles in reversed order                                          // 0x0000'0000'8765'4321
    n = ((n & 0x000000000000ffff) << 32) | ((n >> 16) & 0x000000000000ffff); // 0x0000'4321'0000'8765
    n = ((n & 0x000000ff000000ff) << 16) | ((n >>  8) & 0x000000ff000000ff); // 0x0021'0043'0065'0087
    n = ((n & 0x000f000f000f000f) <<  8) | ((n >>  4) & 0x000f000f000f000f); // 0x0102'0304'0506'0708
    const uint64_t splat = 0x0101010101010101;
    // if nibble >= 10 then addition will overflow in top nibble, shift it down to 0 or 1
    uint64_t mask = ((n + (16 - 10) * splat) >> 4) & splat;
    // result for 0..9
    uint64_t result = n + ('0' * splat);
    // update for a..f
    const char offset = (uppercase ? 'A' : 'a') - '0' - 10;
    result += offset * mask;
    // The first output character sits in the least significant byte of result.
    // On big-endian targets swap the bytes so it still lands first in memory.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    result = HEX_BSWAP64(result);
#endif
    // memcpy is the well-defined way to store into a buffer of unknown
    // alignment and effective type; compilers lower it to a single 8-byte store
    memcpy(hex, &result, sizeof result);
#endif
}
// Writes x as exactly 16 ASCII hexadecimal digits at hex, most significant
// digit first. No terminating NUL is written, and every store used below
// tolerates an arbitrarily aligned hex.
//
//   hex       - destination with room for at least 16 bytes
//   x         - value to format
//   uppercase - non-zero selects digits 'A'..'F', zero selects 'a'..'f'
//
// The body is picked at compile time by the HEX_* backend macros; when none
// of them is defined the scalar SWAR code at the end is used.
void hex_from_u64(void* hex, uint64_t x, int uppercase)
{
#if defined(HEX_AVX512)
    // x is broadcast to both 64-bit lanes: lane 0 extracts nibbles 15..8, lane 1 nibbles 7..0
    __m128i bytes = _mm_set1_epi64x(x);
    // each control byte is a bit offset into its 64-bit lane; the byte extracted
    // there carries the wanted nibble in its low 4 bits
    const __m128i bits = _mm_setr_epi8(60,56,52,48,44,40,36,32, 28,24,20,16,12,8,4,0);
    __m128i nibbles = _mm_multishift_epi64_epi8(bits, bytes);
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    // 16-entry byte permute indexes with the low 4 bits only, so no masking is needed
    __m128i result = _mm_permutexvar_epi8(nibbles, _mm_loadu_si128((const void*)table));
    _mm_storeu_si128(hex, result);
#elif defined(HEX_SSSE3)
    // This path is also selected on 32-bit x86 builds with AVX2 enabled, where
    // _mm_cvtsi64_si128 is not provided by GCC or MSVC; _mm_set_epi64x builds
    // the same vector (value in the low lane, zero in the high lane).
    __m128i bytes = _mm_set_epi64x(0, (long long)HEX_BSWAP64(x));
    // interleave (byte >> 4) with byte, then keep the low 4 bits: hi,lo,hi,lo,...
    __m128i nibbles = _mm_unpacklo_epi8(_mm_srli_epi64(bytes, 4), bytes);
    nibbles = _mm_and_si128(nibbles, _mm_set1_epi8(0xf));
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    // table lookup: each nibble indexes the 16-character digit table
    __m128i result = _mm_shuffle_epi8(_mm_loadu_si128((const void*)table), nibbles);
    _mm_storeu_si128(hex, result);
#elif defined(HEX_SSE2)
    __m128i bytes = _mm_cvtsi64_si128(HEX_BSWAP64(x));
    // 4-bit nibbles
    __m128i nibbles = _mm_unpacklo_epi8(_mm_srli_epi64(bytes, 4), bytes);
    nibbles = _mm_and_si128(nibbles, _mm_set1_epi8(0xf));
    // if nibble > 9
    __m128i mask = _mm_cmpgt_epi8(nibbles, _mm_set1_epi8(9));
    // result for 0..9
    __m128i result = _mm_add_epi8(nibbles, _mm_set1_epi8('0'));
    // update for a..f
    const char offset = (uppercase ? 'A' : 'a') - '0' - 10;
    result = _mm_add_epi8(result, _mm_and_si128(mask, _mm_set1_epi8(offset)));
    _mm_storeu_si128(hex, result);
#elif defined(HEX_NEON)
    uint8x8_t bytes = vreinterpret_u8_u64(vcreate_u64(HEX_BSWAP64(x)));
    // interleave high and low nibbles of all eight value bytes
    uint8x8x2_t nibbles2 = vzip_u8(vshr_n_u8(bytes, 4), bytes);
    uint8x16_t nibbles = vcombine_u8(nibbles2.val[0], nibbles2.val[1]);
    nibbles = vandq_u8(nibbles, vdupq_n_u8(0xf));
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    uint8x16_t result = vqtbl1q_u8(vld1q_u8((const uint8_t*)table), nibbles);
    vst1q_u8(hex, result);
#elif defined(HEX_WASM)
    v128_t bytes = wasm_u64x2_make(x, 0);
    // indices 0..15 pick from the shifted vector (high nibbles), 16..31 from the
    // unshifted one (low nibbles); walking 7..0 puts the top byte first
    v128_t nibbles = wasm_i8x16_shuffle(wasm_u64x2_shr(bytes, 4), bytes,
        7,23, 6,22, 5,21, 4,20, 3,19, 2,18, 1,17, 0,16
    );
    nibbles = wasm_v128_and(nibbles, wasm_i8x16_splat(0xf));
    const char* table = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
    v128_t result = wasm_i8x16_swizzle(wasm_v128_load(table), nibbles);
    wasm_v128_store(hex, result);
#elif defined(HEX_RVV)
    vuint64m1_t x64 = __riscv_vmv_s_x_u64m1(HEX_BSWAP64(x), 1);
    vuint8m1_t xlo = __riscv_vreinterpret_v_u64m1_u8m1(x64);
    vuint8m1_t xhi = __riscv_vsrl_vx_u8m1(xlo, 4, 16);
    // interleave bytes from xhi and xlo
    // (widening xhi + xlo + 0xff * xlo == xhi + 256 * xlo per 16-bit element)
    vuint16m2_t n8 = __riscv_vwmaccu_vx_u16m2(__riscv_vwaddu_vv_u16m2(xhi, xlo, 16), 0xff, xlo, 16);
    vuint16m1_t n16 = __riscv_vget_v_u16m2_u16m1(n8, 0);
    // 4-bit nibbles
    vuint8m1_t nibbles = __riscv_vand_vx_u8m1(__riscv_vreinterpret_v_u16m1_u8m1(n16), 0xf, 16);
    // if nibble > 9
    vbool8_t mask = __riscv_vmsgtu_vx_u8m1_b8(nibbles, 9, 16);
    // result for 0..9
    vuint8m1_t result = __riscv_vadd_vx_u8m1(nibbles, '0', 16);
    // update for a..f
    const char offset = (uppercase ? 'A' : 'a') - '0' - 10;
    result = __riscv_vadd_vx_u8m1_mu(mask, result, result, offset, 16);
    __riscv_vse8_v_u8m1(hex, result, 16);
#else // SWAR
    // each 32-bit half is expanded to 8 characters in its own 64-bit word;
    // the bits of lo above bit 31 are discarded by the first masking step
    uint64_t hi = x >> 32;
    uint64_t lo = x;
    // get nibbles in reversed order
    hi = ((hi & 0x000000000000ffff) << 32) | ((hi >> 16) & 0x000000000000ffff);
    hi = ((hi & 0x000000ff000000ff) << 16) | ((hi >>  8) & 0x000000ff000000ff);
    hi = ((hi & 0x000f000f000f000f) <<  8) | ((hi >>  4) & 0x000f000f000f000f);
    lo = ((lo & 0x000000000000ffff) << 32) | ((lo >> 16) & 0x000000000000ffff);
    lo = ((lo & 0x000000ff000000ff) << 16) | ((lo >>  8) & 0x000000ff000000ff);
    lo = ((lo & 0x000f000f000f000f) <<  8) | ((lo >>  4) & 0x000f000f000f000f);
    const uint64_t splat = 0x0101010101010101;
    // if nibble >= 10 then addition will overflow in top nibble, shift it down to 0 or 1
    uint64_t hmask = ((hi + (16 - 10) * splat) >> 4) & splat;
    uint64_t lmask = ((lo + (16 - 10) * splat) >> 4) & splat;
    // result for 0..9
    hi += '0' * splat;
    lo += '0' * splat;
    // update for a..f
    const char offset = (uppercase ? 'A' : 'a') - '0' - 10;
    hi += offset * hmask;
    lo += offset * lmask;
    // The first character of each half sits in its least significant byte.
    // On big-endian targets swap the bytes so it still lands first in memory.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    hi = HEX_BSWAP64(hi);
    lo = HEX_BSWAP64(lo);
#endif
    // memcpy is the well-defined way to store into a buffer of unknown
    // alignment and effective type; compilers lower each call to one 8-byte store
    unsigned char* out = (unsigned char*)hex;
    memcpy(out + 0, &hi, sizeof hi);
    memcpy(out + 8, &lo, sizeof lo);
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment