aqrit/lulz_itoa.c

## lulz_itoa.c
/*
    convert integer to ascii

    see also:
        https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/
        http://www.numberworld.org/y-cruncher/internals/radix-conversion.html
*/

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>


// expected to have performance similar to khuong_approach
static
void to_string_2x (uint64_t val, char* dst)
{
    // assert(val < 10000000000000000);
    const uint64_t magic_lo = (0x00FFFFFFFFFFFFFF / 10000) + 1;
    const uint64_t magic_hi = (0xFFFFFFFFFFFFFFFF / (100000000 >> 4)) + 1;

    // chunk it into four dwords (packed in two qwords)
    uint64_t hi = val / 100000000;
    uint64_t lo = val % 100000000;
    lo = ((lo * magic_lo) & 0x00FFFFFF00000000) +
                (((lo >> 4) * magic_hi) >> 40) + 0x0000000100000001;
    hi = ((hi * magic_lo) & 0x00FFFFFF00000000) +
                (((hi >> 4) * magic_hi) >> 40) + 0x0000000100000001;

    // uses SWAR to extact two digits at-a-time from our qwords
    // (multiplication by `10` is expected to be cheaper than a regular multiply)
    const uint64_t mask = 0x00FFFFFF00FFFFFF;
    uint64_t out0 = 0x3030303030303030;
    uint64_t out1 = 0x3030303030303030;
    lo = lo * 10;
    hi = hi * 10;
    out0 |= (lo & ~mask) >> 24;
    out1 |= (hi & ~mask) >> 24;
    lo = (lo & mask) * 10;
    hi = (hi & mask) * 10;
    out0 |= (lo & ~mask) >> 16;
    out1 |= (hi & ~mask) >> 16;
    lo = (lo & mask) * 10;
    hi = (hi & mask) * 10;
    out0 |= (lo & ~mask) >> 8;
    out1 |= (hi & ~mask) >> 8;
    lo = (lo & mask) * 10;
    hi = (hi & mask) * 10;
    out0 |= (lo & ~mask);
    out1 |= (hi & ~mask);

    memcpy(&dst[0], &out1, 8);
    memcpy(&dst[8], &out0, 8);
    //dst[16] = '\0';
}


void bin2str_4digits(uint64_t x, char* buf) {
    const uint64_t mask = 0x0FFFFE1FFFF87FFF;
    uint64_t y;
    // assert(x <= 9999);

    x *= 0x000418A051EC0CCD;
    y = (x & mask) * 10;

    // extract...
    y &= ~mask;
    x = (uint32_t)((y << 9) + (y >> 21) + (y >> 52) + (x >> 60));
    x += 0x30303030;
    memcpy(buf, &x, 5);

    /*
    divide by constant, fixpoint stuff:
    max 9999, div 10, (x * 0xCCD) >> 15 // integer part: 10
    max 9999, div 100, (x * 0x147B) >> 19 // integer part: 7
    max 9999, div 1000, (x * 0x20C5) >> 23 // integer part: 4

    However, since we're discarding the integer part...
    we can overlap the integer part of a 'lane' with the fractional part of the next lane.
    We have plenty of fractional precision anyways... e.g. 26 bits required
    to calc but only 14 bits needed to round-trip)

    so our magic is: (0x20C5ULL << 37) + (0x147BULL << 18) + 0xCCD;
    the mask just cuts out the low 4-bits of the integer part for the result:
        mask = ~((0xFULL << 60) + (0xFULL << 37) + (0xF << 15));

    we could buy a little wiggle room in the layout.
    drop 2 bits off the 1st lane at the cost of a LEA.
    drop 3 bits off the 2nd lane at the cost of a +1 in the lane.

    if we didn't want the integer part of the 3rd lane then we'd need only
    1 bit at the top (for carry).. replace `x *= 10` with `x += (x >> 2)`
    for another 3 bits.
    */
}


// similar to `sprintf(out, "%010u", val)`
static inline
void u32toa_baseline(uint32_t x, char *out) {

	// `4294967295 / 10000000000` needs 65 fractional bits.
	// The leftmost fractional bit is implied -- it is always zero.
	uint64_t fractional_part = (uint64_t)x * UINT32_C(0xDBE6FECF);

	// Discard excess precision (clear some space above the radix point)
	// Only 34 fractional bits needed to "round-trip"
	uint64_t r = (fractional_part >> 31) + 1;

	// u32 has 10 decimal digits (including leading zeros).
	for (int i = 0; i < 10; i++) {
		r *= 10; // the integer_part of our fixed-point is now `0..9`
		out[i] = (r >> 34) + 0x30; // extract integer_part and add ascii '0'
		r &= 0x3FFFFFFFF; // clear integer_part
	}
	out[10] = 0;
}

static void u32toa_droplsb (uint32_t val, char* out) {
    const uint64_t mask = 0x1FFFFFFFFFFFFFFFULL;
    uint32_t lo_bit = val & 1;
    uint64_t x = val >> 1;

    x *= (mask / 500000000) + 1; out[0] = 0x30 + (x >> 61);
    x = (x & mask) * 5;          out[1] = 0x30 + (x >> 60);
    x = (x & (mask >> 1)) * 5;   out[2] = 0x30 + (x >> 59);
    x = (x & (mask >> 2)) * 5;   out[3] = 0x30 + (x >> 58);
    x = (x & (mask >> 3)) * 5;   out[4] = 0x30 + (x >> 57);
    x = (x & (mask >> 4)) * 5;   out[5] = 0x30 + (x >> 56);
    x = (x & (mask >> 5)) * 5;   out[6] = 0x30 + (x >> 55);
    x = (x & (mask >> 6)) * 5;   out[7] = 0x30 + (x >> 54);
    x = (x & (mask >> 7)) * 5;   out[8] = 0x30 + (x >> 53);
    x = (x & (mask >> 8)) * 5;   out[9] = 0x30 + (x >> 52) + lo_bit;
    out[10] = 0;
}


// mulx
static inline
uint64_t mul128(uint64_t* x, uint64_t y) {
    // 64:64 = 64*64
    __uint128_t v = (__uint128_t)(*x) * y;
    *x = (uint64_t)v;
    return v >> 64;
}

/*
    similar to `sprintf(out, "%020" PRIu64, x)`

    If we can do 1 mulx per cycle with a latency of 4...
    then we should be able to extract 1 digit per cycle...
    However, Agner Fog's Instruction_tables.pdf says 4 cycles on Skylake but
             Intel's intrinsics Guide says 9 cycles on Skylake, 4 on Haswell.
             (Doh!)
*/
static
void lulz_itoa (uint64_t x, char* out) {
    // split the uint64_t into 4 streams (18446`74407`37095`51615)
    uint64_t top = x / 10000000000;
    uint64_t bottom = x % 10000000000;
    uint64_t x3 = top / 100000;
    uint64_t x2 = top % 100000;
    uint64_t x1 = bottom / 100000;
    uint64_t x0 = bottom % 100000;

    // divide each value by 10000 but keep the remainder
    const uint64_t M = (0xFFFFFFFFFFFFFFFFULL / 10000) + 1;
    out[0]  = 0x30 + mul128(&x3, M);
    out[5]  = 0x30 + mul128(&x2, M);
    out[10] = 0x30 + mul128(&x1, M);
    out[15] = 0x30 + mul128(&x0, M);

    // get the rest of the digits by mulitplying the remainder by 10
    for (int i = 1; i < 5; i++) {
        out[i + 0]  = 0x30 + mul128(&x3, 10);
        out[i + 5]  = 0x30 + mul128(&x2, 10);
        out[i + 10] = 0x30 + mul128(&x1, 10);
        out[i + 15] = 0x30 + mul128(&x0, 10);
    }
    out[20] = 0;
}


// similar to `sprintf(out, "%d", val)`
static
void lulz_itoa_linear_i32 (int32_t val, char* out) {
    const uint64_t mask = 0x0FFFFFFFFFFFFFFFULL;
    uint64_t x = (uint32_t)val;
    int i = 0;

    if (val == 0) {
        out[0] = '0';
        out[1] = '\0';
        return;
    }

    if (val < 0) {
        *out++ = '-';
        x = (x ^ 0x0FFFFFFFF) + 1; // abs
    }

    x *= ((mask / 1000000000) + 1);

    while ((x & ~mask) == 0) { // naive scan to first non-zero digit
        x *= 10;
        i++;
    }

    do {
        *out++ = 0x30 + (x >> 60);
        x = (x & mask) * 10;
    } while (++i < 10);

    *out = 0;
}


int main (void) {
    char ref[24];
    char res[24];
    /*
    for (uint64_t x = 0; x < 0xFFFF; x++) {
        sprintf(ref, "%020" PRIu64, x);
        lulz_itoa(x, res);
        if(memcmp(ref, res, 21)) {
            printf("ERROR: %s -> %s\n", ref, res);
            break;
        }
    }
    */

    sprintf(ref, "%020" PRIu64, UINT64_C(0));
    lulz_itoa(UINT64_C(0), res);
    printf("%s -> %s\n", ref, res);

    sprintf(ref, "%020" PRIu64, UINT64_C(1));
    lulz_itoa(UINT64_C(1), res);
    printf("%s -> %s\n", ref, res);

    sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFF));
    lulz_itoa(UINT64_C(0xFFFFFFFF), res);
    printf("%s -> %s\n", ref, res);

    sprintf(ref, "%020" PRIu64, UINT64_C(0x100000000));
    lulz_itoa(UINT64_C(0x100000000), res);
    printf("%s -> %s\n", ref, res);

    sprintf(ref, "%020" PRIu64, UINT64_C(0x1000000000000));
    lulz_itoa(UINT64_C(0x1000000000000), res);
    printf("%s -> %s\n", ref, res);

    sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFFFFFFFFFF));
    lulz_itoa(UINT64_C(0xFFFFFFFFFFFFFFFF), res);
    printf("%s -> %s\n", ref, res);

    return 0;
}
	/*
	convert integer to ascii

	see also:
	https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/
	http://www.numberworld.org/y-cruncher/internals/radix-conversion.html
	*/

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <inttypes.h>


	// expected to have performance similar to khuong_approach
	static
	void to_string_2x (uint64_t val, char* dst)
	{
	// assert(val < 10000000000000000);
	const uint64_t magic_lo = (0x00FFFFFFFFFFFFFF / 10000) + 1;
	const uint64_t magic_hi = (0xFFFFFFFFFFFFFFFF / (100000000 >> 4)) + 1;

	// chunk it into four dwords (packed in two qwords)
	uint64_t hi = val / 100000000;
	uint64_t lo = val % 100000000;
	lo = ((lo * magic_lo) & 0x00FFFFFF00000000) +
	(((lo >> 4) * magic_hi) >> 40) + 0x0000000100000001;
	hi = ((hi * magic_lo) & 0x00FFFFFF00000000) +
	(((hi >> 4) * magic_hi) >> 40) + 0x0000000100000001;

	// uses SWAR to extact two digits at-a-time from our qwords
	// (multiplication by `10` is expected to be cheaper than a regular multiply)
	const uint64_t mask = 0x00FFFFFF00FFFFFF;
	uint64_t out0 = 0x3030303030303030;
	uint64_t out1 = 0x3030303030303030;
	lo = lo * 10;
	hi = hi * 10;
	out0 \|= (lo & ~mask) >> 24;
	out1 \|= (hi & ~mask) >> 24;
	lo = (lo & mask) * 10;
	hi = (hi & mask) * 10;
	out0 \|= (lo & ~mask) >> 16;
	out1 \|= (hi & ~mask) >> 16;
	lo = (lo & mask) * 10;
	hi = (hi & mask) * 10;
	out0 \|= (lo & ~mask) >> 8;
	out1 \|= (hi & ~mask) >> 8;
	lo = (lo & mask) * 10;
	hi = (hi & mask) * 10;
	out0 \|= (lo & ~mask);
	out1 \|= (hi & ~mask);

	memcpy(&dst[0], &out1, 8);
	memcpy(&dst[8], &out0, 8);
	//dst[16] = '\0';
	}


	void bin2str_4digits(uint64_t x, char* buf) {
	const uint64_t mask = 0x0FFFFE1FFFF87FFF;
	uint64_t y;
	// assert(x <= 9999);

	x *= 0x000418A051EC0CCD;
	y = (x & mask) * 10;

	// extract...
	y &= ~mask;
	x = (uint32_t)((y << 9) + (y >> 21) + (y >> 52) + (x >> 60));
	x += 0x30303030;
	memcpy(buf, &x, 5);

	/*
	divide by constant, fixpoint stuff:
	max 9999, div 10, (x * 0xCCD) >> 15 // integer part: 10
	max 9999, div 100, (x * 0x147B) >> 19 // integer part: 7
	max 9999, div 1000, (x * 0x20C5) >> 23 // integer part: 4

	However, since we're discarding the integer part...
	we can overlap the integer part of a 'lane' with the fractional part of the next lane.
	We have plenty of fractional precision anyways... e.g. 26 bits required
	to calc but only 14 bits needed to round-trip)

	so our magic is: (0x20C5ULL << 37) + (0x147BULL << 18) + 0xCCD;
	the mask just cuts out the low 4-bits of the integer part for the result:
	mask = ~((0xFULL << 60) + (0xFULL << 37) + (0xF << 15));

	we could buy a little wiggle room in the layout.
	drop 2 bits off the 1st lane at the cost of a LEA.
	drop 3 bits off the 2nd lane at the cost of a +1 in the lane.

	if we didn't want the integer part of the 3rd lane then we'd need only
	1 bit at the top (for carry).. replace `x *= 10` with `x += (x >> 2)`
	for another 3 bits.
	*/
	}


	// similar to `sprintf(out, "%010u", val)`
	static inline
	void u32toa_baseline(uint32_t x, char *out) {

	// `4294967295 / 10000000000` needs 65 fractional bits.
	// The leftmost fractional bit is implied -- it is always zero.
	uint64_t fractional_part = (uint64_t)x * UINT32_C(0xDBE6FECF);

	// Discard excess precision (clear some space above the radix point)
	// Only 34 fractional bits needed to "round-trip"
	uint64_t r = (fractional_part >> 31) + 1;

	// u32 has 10 decimal digits (including leading zeros).
	for (int i = 0; i < 10; i++) {
	r *= 10; // the integer_part of our fixed-point is now `0..9`
	out[i] = (r >> 34) + 0x30; // extract integer_part and add ascii '0'
	r &= 0x3FFFFFFFF; // clear integer_part
	}
	out[10] = 0;
	}

	static void u32toa_droplsb (uint32_t val, char* out) {
	const uint64_t mask = 0x1FFFFFFFFFFFFFFFULL;
	uint32_t lo_bit = val & 1;
	uint64_t x = val >> 1;

	x *= (mask / 500000000) + 1; out[0] = 0x30 + (x >> 61);
	x = (x & mask) * 5; out[1] = 0x30 + (x >> 60);
	x = (x & (mask >> 1)) * 5; out[2] = 0x30 + (x >> 59);
	x = (x & (mask >> 2)) * 5; out[3] = 0x30 + (x >> 58);
	x = (x & (mask >> 3)) * 5; out[4] = 0x30 + (x >> 57);
	x = (x & (mask >> 4)) * 5; out[5] = 0x30 + (x >> 56);
	x = (x & (mask >> 5)) * 5; out[6] = 0x30 + (x >> 55);
	x = (x & (mask >> 6)) * 5; out[7] = 0x30 + (x >> 54);
	x = (x & (mask >> 7)) * 5; out[8] = 0x30 + (x >> 53);
	x = (x & (mask >> 8)) * 5; out[9] = 0x30 + (x >> 52) + lo_bit;
	out[10] = 0;
	}


	// mulx
	static inline
	uint64_t mul128(uint64_t* x, uint64_t y) {
	// 64:64 = 64*64
	__uint128_t v = (__uint128_t)(x) y;
	*x = (uint64_t)v;
	return v >> 64;
	}

	/*
	similar to `sprintf(out, "%020" PRIu64, x)`

	If we can do 1 mulx per cycle with a latency of 4...
	then we should be able to extract 1 digit per cycle...
	However, Agner Fog's Instruction_tables.pdf says 4 cycles on Skylake but
	Intel's intrinsics Guide says 9 cycles on Skylake, 4 on Haswell.
	(Doh!)
	*/
	static
	void lulz_itoa (uint64_t x, char* out) {
	// split the uint64_t into 4 streams (18446`74407`37095`51615)
	uint64_t top = x / 10000000000;
	uint64_t bottom = x % 10000000000;
	uint64_t x3 = top / 100000;
	uint64_t x2 = top % 100000;
	uint64_t x1 = bottom / 100000;
	uint64_t x0 = bottom % 100000;

	// divide each value by 10000 but keep the remainder
	const uint64_t M = (0xFFFFFFFFFFFFFFFFULL / 10000) + 1;
	out[0] = 0x30 + mul128(&x3, M);
	out[5] = 0x30 + mul128(&x2, M);
	out[10] = 0x30 + mul128(&x1, M);
	out[15] = 0x30 + mul128(&x0, M);

	// get the rest of the digits by mulitplying the remainder by 10
	for (int i = 1; i < 5; i++) {
	out[i + 0] = 0x30 + mul128(&x3, 10);
	out[i + 5] = 0x30 + mul128(&x2, 10);
	out[i + 10] = 0x30 + mul128(&x1, 10);
	out[i + 15] = 0x30 + mul128(&x0, 10);
	}
	out[20] = 0;
	}


	// similar to `sprintf(out, "%d", val)`
	static
	void lulz_itoa_linear_i32 (int32_t val, char* out) {
	const uint64_t mask = 0x0FFFFFFFFFFFFFFFULL;
	uint64_t x = (uint32_t)val;
	int i = 0;

	if (val == 0) {
	out[0] = '0';
	out[1] = '\0';
	return;
	}

	if (val < 0) {
	*out++ = '-';
	x = (x ^ 0x0FFFFFFFF) + 1; // abs
	}

	x *= ((mask / 1000000000) + 1);

	while ((x & ~mask) == 0) { // naive scan to first non-zero digit
	x *= 10;
	i++;
	}

	do {
	*out++ = 0x30 + (x >> 60);
	x = (x & mask) * 10;
	} while (++i < 10);

	*out = 0;
	}


	int main (void) {
	char ref[24];
	char res[24];
	/*
	for (uint64_t x = 0; x < 0xFFFF; x++) {
	sprintf(ref, "%020" PRIu64, x);
	lulz_itoa(x, res);
	if(memcmp(ref, res, 21)) {
	printf("ERROR: %s -> %s\n", ref, res);
	break;
	}
	}
	*/

	sprintf(ref, "%020" PRIu64, UINT64_C(0));
	lulz_itoa(UINT64_C(0), res);
	printf("%s -> %s\n", ref, res);

	sprintf(ref, "%020" PRIu64, UINT64_C(1));
	lulz_itoa(UINT64_C(1), res);
	printf("%s -> %s\n", ref, res);

	sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFF));
	lulz_itoa(UINT64_C(0xFFFFFFFF), res);
	printf("%s -> %s\n", ref, res);

	sprintf(ref, "%020" PRIu64, UINT64_C(0x100000000));
	lulz_itoa(UINT64_C(0x100000000), res);
	printf("%s -> %s\n", ref, res);

	sprintf(ref, "%020" PRIu64, UINT64_C(0x1000000000000));
	lulz_itoa(UINT64_C(0x1000000000000), res);
	printf("%s -> %s\n", ref, res);

	sprintf(ref, "%020" PRIu64, UINT64_C(0xFFFFFFFFFFFFFFFF));
	lulz_itoa(UINT64_C(0xFFFFFFFFFFFFFFFF), res);
	printf("%s -> %s\n", ref, res);

	return 0;
	}