Last active
April 20, 2022 00:49
-
-
Save aqrit/2997713a00ad043b4bac42e342294259 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
convert integer to ascii | |
see also: | |
https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/ | |
http://www.numberworld.org/y-cruncher/internals/radix-conversion.html | |
*/ | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <inttypes.h> | |
// expected to have performance similar to khuong_approach | |
static | |
void to_string_2x (uint64_t val, char* dst) | |
{ | |
// assert(val < 10000000000000000); | |
const uint64_t magic_lo = (0x00FFFFFFFFFFFFFF / 10000) + 1; | |
const uint64_t magic_hi = (0xFFFFFFFFFFFFFFFF / (100000000 >> 4)) + 1; | |
// chunk it into four dwords (packed in two qwords) | |
uint64_t hi = val / 100000000; | |
uint64_t lo = val % 100000000; | |
lo = ((lo * magic_lo) & 0x00FFFFFF00000000) + | |
(((lo >> 4) * magic_hi) >> 40) + 0x0000000100000001; | |
hi = ((hi * magic_lo) & 0x00FFFFFF00000000) + | |
(((hi >> 4) * magic_hi) >> 40) + 0x0000000100000001; | |
// uses SWAR to extact two digits at-a-time from our qwords | |
// (multiplication by `10` is expected to be cheaper than a regular multiply) | |
const uint64_t mask = 0x00FFFFFF00FFFFFF; | |
uint64_t out0 = 0x3030303030303030; | |
uint64_t out1 = 0x3030303030303030; | |
lo = lo * 10; | |
hi = hi * 10; | |
out0 |= (lo & ~mask) >> 24; | |
out1 |= (hi & ~mask) >> 24; | |
lo = (lo & mask) * 10; | |
hi = (hi & mask) * 10; | |
out0 |= (lo & ~mask) >> 16; | |
out1 |= (hi & ~mask) >> 16; | |
lo = (lo & mask) * 10; | |
hi = (hi & mask) * 10; | |
out0 |= (lo & ~mask) >> 8; | |
out1 |= (hi & ~mask) >> 8; | |
lo = (lo & mask) * 10; | |
hi = (hi & mask) * 10; | |
out0 |= (lo & ~mask); | |
out1 |= (hi & ~mask); | |
memcpy(&dst[0], &out1, 8); | |
memcpy(&dst[8], &out0, 8); | |
//dst[16] = '\0'; | |
} | |
void bin2str_4digits(uint64_t x, char* buf) { | |
const uint64_t mask = 0x0FFFFE1FFFF87FFF; | |
uint64_t y; | |
// assert(x <= 9999); | |
x *= 0x000418A051EC0CCD; | |
y = (x & mask) * 10; | |
// extract... | |
y &= ~mask; | |
x = (uint32_t)((y << 9) + (y >> 21) + (y >> 52) + (x >> 60)); | |
x += 0x30303030; | |
memcpy(buf, &x, 5); | |
/* | |
divide by constant, fixpoint stuff: | |
max 9999, div 10, (x * 0xCCD) >> 15 // integer part: 10 | |
max 9999, div 100, (x * 0x147B) >> 19 // integer part: 7 | |
max 9999, div 1000, (x * 0x20C5) >> 23 // integer part: 4 | |
However, since we're discarding the integer part... | |
we can overlap the integer part of a 'lane' with the fractional part of the next lane. | |
We have plenty of fractional precision anyways... e.g. 26 bits required | |
to calc but only 14 bits needed to round-trip) | |
so our magic is: (0x20C5ULL << 37) + (0x147BULL << 18) + 0xCCD; | |
the mask just cuts out the low 4-bits of the integer part for the result: | |
mask = ~((0xFULL << 60) + (0xFULL << 37) + (0xF << 15)); | |
we could buy a little wiggle room in the layout. | |
drop 2 bits off the 1st lane at the cost of a LEA. | |
drop 3 bits off the 2nd lane at the cost of a +1 in the lane. | |
if we didn't want the integer part of the 3rd lane then we'd need only | |
1 bit at the top (for carry).. replace `x *= 10` with `x += (x >> 2)` | |
for another 3 bits. | |
*/ | |
} | |
// similar to `sprintf(out, "%010u", val)` | |
static inline | |
void u32toa_baseline(uint32_t x, char *out) { | |
// `4294967295 / 10000000000` needs 65 fractional bits. | |
// The leftmost fractional bit is implied -- it is always zero. | |
uint64_t fractional_part = (uint64_t)x * UINT32_C(0xDBE6FECF); | |
// Discard excess precision (clear some space above the radix point) | |
// Only 34 fractional bits needed to "round-trip" | |
uint64_t r = (fractional_part >> 31) + 1; | |
// u32 has 10 decimal digits (including leading zeros). | |
for (int i = 0; i < 10; i++) { | |
r *= 10; // the integer_part of our fixed-point is now `0..9` | |
out[i] = (r >> 34) + 0x30; // extract integer_part and add ascii '0' | |
r &= 0x3FFFFFFFF; // clear integer_part | |
} | |
out[10] = 0; | |
} | |
static void u32toa_droplsb (uint32_t val, char* out) { | |
const uint64_t mask = 0x1FFFFFFFFFFFFFFFULL; | |
uint32_t lo_bit = val & 1; | |
uint64_t x = val >> 1; | |
x *= (mask / 500000000) + 1; out[0] = 0x30 + (x >> 61); | |
x = (x & mask) * 5; out[1] = 0x30 + (x >> 60); | |
x = (x & (mask >> 1)) * 5; out[2] = 0x30 + (x >> 59); | |
x = (x & (mask >> 2)) * 5; out[3] = 0x30 + (x >> 58); | |
x = (x & (mask >> 3)) * 5; out[4] = 0x30 + (x >> 57); | |
x = (x & (mask >> 4)) * 5; out[5] = 0x30 + (x >> 56); | |
x = (x & (mask >> 5)) * 5; out[6] = 0x30 + (x >> 55); | |
x = (x & (mask >> 6)) * 5; out[7] = 0x30 + (x >> 54); | |
x = (x & (mask >> 7)) * 5; out[8] = 0x30 + (x >> 53); | |
x = (x & (mask >> 8)) * 5; out[9] = 0x30 + (x >> 52) + lo_bit; | |
out[10] = 0; | |
} | |
// mulx | |
static inline | |
uint64_t mul128(uint64_t* x, uint64_t y) { | |
// 64:64 = 64*64 | |
__uint128_t v = (__uint128_t)(*x) * y; | |
*x = (uint64_t)v; | |
return v >> 64; | |
} | |
/* | |
similar to `sprintf(out, "%020" PRIu64, x)` | |
If we can do 1 mulx per cycle with a latency of 4... | |
then we should be able to extract 1 digit per cycle... | |
However, Agner Fog's Instruction_tables.pdf says 4 cycles on Skylake but | |
Intel's intrinsics Guide says 9 cycles on Skylake, 4 on Haswell. | |
(Doh!) | |
*/ | |
static | |
void lulz_itoa (uint64_t x, char* out) { | |
// split the uint64_t into 4 streams (18446`74407`37095`51615) | |
uint64_t top = x / 10000000000; | |
uint64_t bottom = x % 10000000000; | |
uint64_t x3 = top / 100000; | |
uint64_t x2 = top % 100000; | |
uint64_t x1 = bottom / 100000; | |
uint64_t x0 = bottom % 100000; | |
// divide each value by 10000 but keep the remainder | |
const uint64_t M = (0xFFFFFFFFFFFFFFFFULL / 10000) + 1; | |
out[0] = 0x30 + mul128(&x3, M); | |
out[5] = 0x30 + mul128(&x2, M); | |
out[10] = 0x30 + mul128(&x1, M); | |
out[15] = 0x30 + mul128(&x0, M); | |
// get the rest of the digits by mulitplying the remainder by 10 | |
for (int i = 1; i < 5; i++) { | |
out[i + 0] = 0x30 + mul128(&x3, 10); | |
out[i + 5] = 0x30 + mul128(&x2, 10); | |
out[i + 10] = 0x30 + mul128(&x1, 10); | |
out[i + 15] = 0x30 + mul128(&x0, 10); | |
} | |
out[20] = 0; | |
} | |
// similar to `sprintf(out, "%d", val)` | |
static | |
void lulz_itoa_linear_i32 (int32_t val, char* out) { | |
const uint64_t mask = 0x0FFFFFFFFFFFFFFFULL; | |
uint64_t x = (uint32_t)val; | |
int i = 0; | |
if (val == 0) { | |
out[0] = '0'; | |
out[1] = '\0'; | |
return; | |
} | |
if (val < 0) { | |
*out++ = '-'; | |
x = (x ^ 0x0FFFFFFFF) + 1; // abs | |
} | |
x *= ((mask / 1000000000) + 1); | |
while ((x & ~mask) == 0) { // naive scan to first non-zero digit | |
x *= 10; | |
i++; | |
} | |
do { | |
*out++ = 0x30 + (x >> 60); | |
x = (x & mask) * 10; | |
} while (++i < 10); | |
*out = 0; | |
} | |
int main (void) { | |
char ref[24]; | |
char res[24]; | |
/* | |
for (uint64_t x = 0; x < 0xFFFF; x++) { | |
sprintf(ref, "%020" PRIu64, x); | |
lulz_itoa(x, res); | |
if(memcmp(ref, res, 21)) { | |
printf("ERROR: %s -> %s\n", ref, res); | |
break; | |
} | |
} | |
*/ | |
sprintf(ref, "%020" PRIu64, UINT64_C(0)); | |
lulz_itoa(UINT64_C(0), res); | |
printf("%s -> %s\n", ref, res); | |
sprintf(ref, "%020" PRIu64, UINT64_C(1)); | |
lulz_itoa(UINT64_C(1), res); | |
printf("%s -> %s\n", ref, res); | |
sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFF)); | |
lulz_itoa(UINT64_C(0xFFFFFFFF), res); | |
printf("%s -> %s\n", ref, res); | |
sprintf(ref, "%020" PRIu64, UINT64_C(0x100000000)); | |
lulz_itoa(UINT64_C(0x100000000), res); | |
printf("%s -> %s\n", ref, res); | |
sprintf(ref, "%020" PRIu64, UINT64_C(0x1000000000000)); | |
lulz_itoa(UINT64_C(0x1000000000000), res); | |
printf("%s -> %s\n", ref, res); | |
sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFFFFFFFFFF)); | |
lulz_itoa(UINT64_C(0xFFFFFFFFFFFFFFFF), res); | |
printf("%s -> %s\n", ref, res); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment