Skip to content

Instantly share code, notes, and snippets.

@aqrit
Last active April 20, 2022 00:49
Show Gist options
  • Save aqrit/2997713a00ad043b4bac42e342294259 to your computer and use it in GitHub Desktop.
Save aqrit/2997713a00ad043b4bac42e342294259 to your computer and use it in GitHub Desktop.
/*
convert integer to ascii
see also:
https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/
http://www.numberworld.org/y-cruncher/internals/radix-conversion.html
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
// expected to have performance similar to khuong_approach
static
void to_string_2x (uint64_t val, char* dst)
{
// assert(val < 10000000000000000);
const uint64_t magic_lo = (0x00FFFFFFFFFFFFFF / 10000) + 1;
const uint64_t magic_hi = (0xFFFFFFFFFFFFFFFF / (100000000 >> 4)) + 1;
// chunk it into four dwords (packed in two qwords)
uint64_t hi = val / 100000000;
uint64_t lo = val % 100000000;
lo = ((lo * magic_lo) & 0x00FFFFFF00000000) +
(((lo >> 4) * magic_hi) >> 40) + 0x0000000100000001;
hi = ((hi * magic_lo) & 0x00FFFFFF00000000) +
(((hi >> 4) * magic_hi) >> 40) + 0x0000000100000001;
// uses SWAR to extact two digits at-a-time from our qwords
// (multiplication by `10` is expected to be cheaper than a regular multiply)
const uint64_t mask = 0x00FFFFFF00FFFFFF;
uint64_t out0 = 0x3030303030303030;
uint64_t out1 = 0x3030303030303030;
lo = lo * 10;
hi = hi * 10;
out0 |= (lo & ~mask) >> 24;
out1 |= (hi & ~mask) >> 24;
lo = (lo & mask) * 10;
hi = (hi & mask) * 10;
out0 |= (lo & ~mask) >> 16;
out1 |= (hi & ~mask) >> 16;
lo = (lo & mask) * 10;
hi = (hi & mask) * 10;
out0 |= (lo & ~mask) >> 8;
out1 |= (hi & ~mask) >> 8;
lo = (lo & mask) * 10;
hi = (hi & mask) * 10;
out0 |= (lo & ~mask);
out1 |= (hi & ~mask);
memcpy(&dst[0], &out1, 8);
memcpy(&dst[8], &out0, 8);
//dst[16] = '\0';
}
void bin2str_4digits(uint64_t x, char* buf) {
const uint64_t mask = 0x0FFFFE1FFFF87FFF;
uint64_t y;
// assert(x <= 9999);
x *= 0x000418A051EC0CCD;
y = (x & mask) * 10;
// extract...
y &= ~mask;
x = (uint32_t)((y << 9) + (y >> 21) + (y >> 52) + (x >> 60));
x += 0x30303030;
memcpy(buf, &x, 5);
/*
divide by constant, fixpoint stuff:
max 9999, div 10, (x * 0xCCD) >> 15 // integer part: 10
max 9999, div 100, (x * 0x147B) >> 19 // integer part: 7
max 9999, div 1000, (x * 0x20C5) >> 23 // integer part: 4
However, since we're discarding the integer part...
we can overlap the integer part of a 'lane' with the fractional part of the next lane.
We have plenty of fractional precision anyways... e.g. 26 bits required
to calc but only 14 bits needed to round-trip)
so our magic is: (0x20C5ULL << 37) + (0x147BULL << 18) + 0xCCD;
the mask just cuts out the low 4-bits of the integer part for the result:
mask = ~((0xFULL << 60) + (0xFULL << 37) + (0xF << 15));
we could buy a little wiggle room in the layout.
drop 2 bits off the 1st lane at the cost of a LEA.
drop 3 bits off the 2nd lane at the cost of a +1 in the lane.
if we didn't want the integer part of the 3rd lane then we'd need only
1 bit at the top (for carry).. replace `x *= 10` with `x += (x >> 2)`
for another 3 bits.
*/
}
// similar to `sprintf(out, "%010u", val)`
static inline
void u32toa_baseline(uint32_t x, char *out) {
// `4294967295 / 10000000000` needs 65 fractional bits.
// The leftmost fractional bit is implied -- it is always zero.
uint64_t fractional_part = (uint64_t)x * UINT32_C(0xDBE6FECF);
// Discard excess precision (clear some space above the radix point)
// Only 34 fractional bits needed to "round-trip"
uint64_t r = (fractional_part >> 31) + 1;
// u32 has 10 decimal digits (including leading zeros).
for (int i = 0; i < 10; i++) {
r *= 10; // the integer_part of our fixed-point is now `0..9`
out[i] = (r >> 34) + 0x30; // extract integer_part and add ascii '0'
r &= 0x3FFFFFFFF; // clear integer_part
}
out[10] = 0;
}
static void u32toa_droplsb (uint32_t val, char* out) {
const uint64_t mask = 0x1FFFFFFFFFFFFFFFULL;
uint32_t lo_bit = val & 1;
uint64_t x = val >> 1;
x *= (mask / 500000000) + 1; out[0] = 0x30 + (x >> 61);
x = (x & mask) * 5; out[1] = 0x30 + (x >> 60);
x = (x & (mask >> 1)) * 5; out[2] = 0x30 + (x >> 59);
x = (x & (mask >> 2)) * 5; out[3] = 0x30 + (x >> 58);
x = (x & (mask >> 3)) * 5; out[4] = 0x30 + (x >> 57);
x = (x & (mask >> 4)) * 5; out[5] = 0x30 + (x >> 56);
x = (x & (mask >> 5)) * 5; out[6] = 0x30 + (x >> 55);
x = (x & (mask >> 6)) * 5; out[7] = 0x30 + (x >> 54);
x = (x & (mask >> 7)) * 5; out[8] = 0x30 + (x >> 53);
x = (x & (mask >> 8)) * 5; out[9] = 0x30 + (x >> 52) + lo_bit;
out[10] = 0;
}
// mulx
static inline
uint64_t mul128(uint64_t* x, uint64_t y) {
// 64:64 = 64*64
__uint128_t v = (__uint128_t)(*x) * y;
*x = (uint64_t)v;
return v >> 64;
}
/*
similar to `sprintf(out, "%020" PRIu64, x)`
If we can do 1 mulx per cycle with a latency of 4...
then we should be able to extract 1 digit per cycle...
However, Agner Fog's Instruction_tables.pdf says 4 cycles on Skylake but
Intel's intrinsics Guide says 9 cycles on Skylake, 4 on Haswell.
(Doh!)
*/
static
void lulz_itoa (uint64_t x, char* out) {
// split the uint64_t into 4 streams (18446`74407`37095`51615)
uint64_t top = x / 10000000000;
uint64_t bottom = x % 10000000000;
uint64_t x3 = top / 100000;
uint64_t x2 = top % 100000;
uint64_t x1 = bottom / 100000;
uint64_t x0 = bottom % 100000;
// divide each value by 10000 but keep the remainder
const uint64_t M = (0xFFFFFFFFFFFFFFFFULL / 10000) + 1;
out[0] = 0x30 + mul128(&x3, M);
out[5] = 0x30 + mul128(&x2, M);
out[10] = 0x30 + mul128(&x1, M);
out[15] = 0x30 + mul128(&x0, M);
// get the rest of the digits by mulitplying the remainder by 10
for (int i = 1; i < 5; i++) {
out[i + 0] = 0x30 + mul128(&x3, 10);
out[i + 5] = 0x30 + mul128(&x2, 10);
out[i + 10] = 0x30 + mul128(&x1, 10);
out[i + 15] = 0x30 + mul128(&x0, 10);
}
out[20] = 0;
}
// similar to `sprintf(out, "%d", val)`
static
void lulz_itoa_linear_i32 (int32_t val, char* out) {
const uint64_t mask = 0x0FFFFFFFFFFFFFFFULL;
uint64_t x = (uint32_t)val;
int i = 0;
if (val == 0) {
out[0] = '0';
out[1] = '\0';
return;
}
if (val < 0) {
*out++ = '-';
x = (x ^ 0x0FFFFFFFF) + 1; // abs
}
x *= ((mask / 1000000000) + 1);
while ((x & ~mask) == 0) { // naive scan to first non-zero digit
x *= 10;
i++;
}
do {
*out++ = 0x30 + (x >> 60);
x = (x & mask) * 10;
} while (++i < 10);
*out = 0;
}
int main (void) {
char ref[24];
char res[24];
/*
for (uint64_t x = 0; x < 0xFFFF; x++) {
sprintf(ref, "%020" PRIu64, x);
lulz_itoa(x, res);
if(memcmp(ref, res, 21)) {
printf("ERROR: %s -> %s\n", ref, res);
break;
}
}
*/
sprintf(ref, "%020" PRIu64, UINT64_C(0));
lulz_itoa(UINT64_C(0), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(1));
lulz_itoa(UINT64_C(1), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFF));
lulz_itoa(UINT64_C(0xFFFFFFFF), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0x100000000));
lulz_itoa(UINT64_C(0x100000000), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0x1000000000000));
lulz_itoa(UINT64_C(0x1000000000000), res);
printf("%s -> %s\n", ref, res);
sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFFFFFFFFFF));
lulz_itoa(UINT64_C(0xFFFFFFFFFFFFFFFF), res);
printf("%s -> %s\n", ref, res);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment