Skip to content

Instantly share code, notes, and snippets.

@aqrit
Last active Nov 24, 2021
Embed
What would you like to do?
/*
convert integer to ascii
If we can do 1 mulx per cycle with a latency of 4...
then we should be able to extract 1 digit per cycle...
However, Agner Fog's Instruction_tables.pdf says 4 cycles on Skylake but
Intel's intrinsics Guide says 9 cycles on Skylake, 4 on Haswell.
(Doh!)
Using the div by mul "trick" with a 64*64 = 64 mul is good for 31-bit values.
It makes for simple implementations of itoa(int32_t) and itoa(uint32_t).
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
// about the same speed as khuong_approach (which is faster depends on compile options)
// but pretty good for little effort...
static
void to_string_2x (uint64_t val, char* dst)
{
// assert(val < 10000000000000000);
const uint64_t mask = 0x00FFFFFF00FFFFFF; // mask for swar q:r
const uint32_t magic_div1000 = (0x00FFFFFF / 1000) + 1; // [31:24][23:0]
const uint64_t magic_div10000 = (0x1FFFFFFFFFFF / 10000) + 1; // [63:45][44:21]
uint64_t out0 = 0x3030303030303030; // '0' * 0x0101010101010101
uint64_t out1 = 0x3030303030303030;
uint64_t hi = (val / 100000000) * magic_div10000;
uint64_t lo = (val % 100000000) * magic_div10000;
uint64_t x1 = ((hi >> 45) * magic_div1000) |
((((hi + (1 << 21)) << 11) & (mask << 32)) * 10);
uint64_t x0 = ((lo >> 45) * magic_div1000) |
((((lo + (1 << 21)) << 11) & (mask << 32)) * 10);
// just multiply the remainders by 10 to get the next digit
out1 |= (x1 & ~mask) >> 24;
out0 |= (x0 & ~mask) >> 24;
x1 = (x1 & mask) * 10;
x0 = (x0 & mask) * 10;
out1 |= (x1 & ~mask) >> 16;
out0 |= (x0 & ~mask) >> 16;
x1 = (x1 & mask) * 10;
x0 = (x0 & mask) * 10;
out1 |= (x1 & ~mask) >> 8;
out0 |= (x0 & ~mask) >> 8;
x1 = (x1 & mask) * 10;
x0 = (x0 & mask) * 10;
out1 |= (x1 & ~mask);
out0 |= (x0 & ~mask);
memcpy(&dst[0], &out1, 8);
memcpy(&dst[8], &out0, 8);
//dst[16] = '\0';
}
// similar to `sprintf(out, "%010u", val)`
// note: (x * 5) == ((x << 2) + x) == LEA
// one could fold the `+ 0x30` into the LEA
static void lulz_itoa_linear_u32 (uint32_t val, char* out) {
const uint64_t mask = 0x1FFFFFFFFFFFFFFFULL;
uint32_t lo_bit = val & 1;
uint64_t x = val >> 1;
x *= (mask / 500000000) + 1; out[0] = 0x30 + (x >> 61);
x = (x & mask) * 5; out[1] = 0x30 + (x >> 60);
x = (x & (mask >> 1)) * 5; out[2] = 0x30 + (x >> 59);
x = (x & (mask >> 2)) * 5; out[3] = 0x30 + (x >> 58);
x = (x & (mask >> 3)) * 5; out[4] = 0x30 + (x >> 57);
x = (x & (mask >> 4)) * 5; out[5] = 0x30 + (x >> 56);
x = (x & (mask >> 5)) * 5; out[6] = 0x30 + (x >> 55);
x = (x & (mask >> 6)) * 5; out[7] = 0x30 + (x >> 54);
x = (x & (mask >> 7)) * 5; out[8] = 0x30 + (x >> 53);
x = (x & (mask >> 8)) * 5; out[9] = 0x30 + (x >> 52) + lo_bit;
out[10] = 0;
}
// mulx
static inline
uint64_t mul128(uint64_t* x, uint64_t y) {
// 64:64 = 64*64
__uint128_t v = (__uint128_t)(*x) * y;
*x = (uint64_t)v;
return v >> 64;
}
// similar to `sprintf(out, "%020" PRIu64, x)`
static
void lulz_itoa (uint64_t x, char* out) {
// split the uint64_t into 4 streams (18446`74407`37095`51615)
uint64_t top = x / 10000000000;
uint64_t bottom = x % 10000000000;
uint64_t x3 = top / 100000;
uint64_t x2 = top % 100000;
uint64_t x1 = bottom / 100000;
uint64_t x0 = bottom % 100000;
// divide each value by 10000 but keep the remainder
const uint64_t M = (0xFFFFFFFFFFFFFFFFULL / 10000) + 1;
out[0] = 0x30 + mul128(&x3, M);
out[5] = 0x30 + mul128(&x2, M);
out[10] = 0x30 + mul128(&x1, M);
out[15] = 0x30 + mul128(&x0, M);
// get the rest of the digits by mulitplying the remainder by 10
for (int i = 1; i < 5; i++) {
out[i + 0] = 0x30 + mul128(&x3, 10);
out[i + 5] = 0x30 + mul128(&x2, 10);
out[i + 10] = 0x30 + mul128(&x1, 10);
out[i + 15] = 0x30 + mul128(&x0, 10);
}
out[20] = 0;
}
// similar to `sprintf(out, "%d", val)`
static
void lulz_itoa_linear_i32 (int32_t val, char* out) {
const uint64_t mask = 0x0FFFFFFFFFFFFFFFULL;
uint64_t x = (uint32_t)val;
int i = 0;
if (val == 0) {
out[0] = '0';
out[1] = '\0';
return;
}
if (val < 0) {
*out++ = '-';
x = (x ^ 0x0FFFFFFFF) + 1; // abs
}
x *= ((mask / 1000000000) + 1);
while ((x & ~mask) == 0) { // naive scan to first non-zero digit
x *= 10;
i++;
}
do {
*out++ = 0x30 + (x >> 60);
x = (x & mask) * 10;
} while (++i < 10);
*out = 0;
}
int main (void) {
char ref[24];
char res[24];
/*
for (uint64_t x = 0; x < 0xFFFF; x++) {
sprintf(ref, "%020" PRIu64, x);
lulz_itoa(x, res);
if(memcmp(ref, res, 21)) {
printf("ERROR: %s -> %s\n", ref, res);
break;
}
}
*/
sprintf(ref, "%020" PRIu64, UINT64_C(0));
lulz_itoa(UINT64_C(0), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(1));
lulz_itoa(UINT64_C(1), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFF));
lulz_itoa(UINT64_C(0xFFFFFFFF), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0x100000000));
lulz_itoa(UINT64_C(0x100000000), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0x1000000000000));
lulz_itoa(UINT64_C(0x1000000000000), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFFFFFFFFFF));
lulz_itoa(UINT64_C(0xFFFFFFFFFFFFFFFF), res);
printf("%s -> %s\n", ref, res);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment