alnsn/fast-uint-to-str-conv.c

## fast-uint-to-str-conv.c
/*
 * Copyright (C) 2016 Alexander Nasonov.
 * (WINT_R macro) Copyright (C) 2005-2016 Mike Pall.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
 */

#include <assert.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <emmintrin.h>
#include <tmmintrin.h>

/* Comment these declarations out to benchmark inlined functions. */
static void fast04u(unsigned int, char *) __attribute__((noinline));
static void slow04u(unsigned int, char *) __attribute__((noinline));
static void fast08u(unsigned int, char *) __attribute__((noinline));
static void slow08u(unsigned int, char *) __attribute__((noinline));

void test(char *buf) __attribute__((noinline));

/*
 * Given an integer u from 0 to 9999, we want to perform 3 divisions
 * by constants 10, 100 and 1000 in parallel and calculate four digits
 * u - u/10*10, u/10 - u/100*10, etc. These digits can be shuffled,
 * converted to ascii and stored in memory as four consecutive bytes.
 *
 * One common approach to constant division is double-width multiplication
 * by a magic constant and shifting high-word to the right by a constant
 * number of bits.
 *
 * Double-width multiplication in xmm register can be done with pmuludq
 * but it operates on two 32-bit words while we need at least three
 * multiplications. For u that fits into 16-bit word, we can try pmaddwd
 * which multiplies eight signed 16-bit words, takes sums of pairs and
 * stores the results in four 32-bit words.
 *
 * The algorithm below uses these magic multiplications:
 *
 * u/10   : u * 26215 / 2^18,
 * u/100  : u * 10486 / 2^20,
 * u/1000 : u * 8389  / 2^23.
 *
 * The shifts are all different but it doesn't matter. Instead of
 * shifting to the right, low bits are masked and values are later
 * multiplied to scale the results by 256.
 */
static inline __m128i d4toa(unsigned int u)
{
	/*
	 * Multiply u by -65536 (to move -u to the high word),
	 * 26215 (magic for u/10), 10486 (magic for u/100) and
	 * 8389 (magic for u/1000).
	 */
	const __m128i first_madd =
	    _mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);

	/*
	 * Zero-out 18 low bits of u*26215, 20 low bits of u*10486
	 * and 23 low bits of u*8389:
	 * [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
	 */
	const __m128i mask =
	    _mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);

	/*
	 * Input value
	 *
	 * [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
	 *
	 * is multiplied to produce 4 scaled digits:
	 *
	 * [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
	 *  (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
	 */
	const __m128i second_madd =
	    _mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);

	/*
	 * Shuffle digits to low bytes and OR with ascii zeroes.
	 * Only low 32-bit word matter, three other words can
	 * have any values.
	 */
	const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
	const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);

	__m128i x;

	//assert(u <= 9999);

	x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
	x = _mm_and_si128(x, mask);
	x = _mm_or_si128(x, _mm_slli_si128(x, 2));
	x = _mm_madd_epi16(x, second_madd);
	x = _mm_shuffle_epi8(x, shuffle);
	x = _mm_or_si128(x, ascii_zero);

	return x;
}

static void fast04u(unsigned int u, char *p)
{
	*(int *)p = _mm_cvtsi128_si32(d4toa(u));
}

static void fast08u(unsigned int u, char *p)
{
	unsigned int v = u / 10000;

	u -= v * 10000;

	*(int *)(p + 0) = _mm_cvtsi128_si32(d4toa(v));
	*(int *)(p + 4) = _mm_cvtsi128_si32(d4toa(u));
}

/* This macro comes from LuaJIT 2.1, it's defined in src/lj_strfmt.c. */
#define WINT_R(x, sh, sc) \
	{ uint32_t d = (x*(((1<<sh)+sc-1)/sc))>>sh; x -= d*sc; *p++ = (char)('0'+d); }

static void slow04u(unsigned int u, char *p)
{
#if defined(WINT_R)
	WINT_R(u, 23, 1000)
	WINT_R(u, 12, 100)
	WINT_R(u, 10, 10)
	*p++ = (char)('0'+u);
#else
	p[3] = '0' + u % 10; u /= 10;
	p[2] = '0' + u % 10; u /= 10;
	p[1] = '0' + u % 10; u /= 10;
	p[0] = '0' + u % 10;
#endif
}

static void slow08u(unsigned int u, char *p)
{
	unsigned int v = u / 10000;

	u -= v * 10000;

#if defined(WINT_R)
	WINT_R(v, 23, 1000)
	WINT_R(v, 12, 100)
	WINT_R(v, 10, 10)
	*p++ = (char)('0'+v);
	WINT_R(u, 23, 1000)
	WINT_R(u, 12, 100)
	WINT_R(u, 10, 10)
	*p++ = (char)('0'+u);
#else
	p[7] = '0' + u % 10; u /= 10;
	p[6] = '0' + u % 10; u /= 10;
	p[5] = '0' + u % 10; u /= 10;
	p[4] = '0' + u % 10; u /= 10;
	p[3] = '0' + u % 10; u /= 10;
	p[2] = '0' + u % 10; u /= 10;
	p[1] = '0' + u % 10; u /= 10;
	p[0] = '0' + u % 10;
#endif
}

#undef WINT_R

void test(char *buf)
{
	char expected[8];
	int i;

	for (i = 0; i <= 9999; i++) {
		sprintf(expected, "%04d", i);
		fast04u(i, buf);
		if (memcmp(buf, expected, 4) != 0)
			errx(EXIT_FAILURE, "%.4s != %s", buf, expected);
	}
}

/*
 * cc -DNDEBUG -O3 -mssse3 [-march=your favourite arch] fast-uint-conv.c
 *
 * ./a.out # quick test
 *
 * time ./a.out  1e9 # benchmark fast08u()
 * time ./a.out -1e9 # benchmark slow08u()
 */
int main(int argc, char *argv[])
{
	long i, n;
	char buf[4];

	if (argc == 1) {
		test(buf);
	} else {
		n = strtod(argv[1], NULL);

		if (n > 0) {
			for (i = n; i != 0; i--)
				fast08u(i, buf);
		} else {
			for (i = -n; i != 0; i--)
				slow08u(i, buf);
		}
	}

	/* Foul the compiler. */
	return (buf[0] ^ buf[1] ^ buf[2] ^ buf[3]) >> 6;
}
	/*
	* Copyright (C) 2016 Alexander Nasonov.
	* (WINT_R macro) Copyright (C) 2005-2016 Mike Pall.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	* [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
	*/

	#include <assert.h>
	#include <err.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include <emmintrin.h>
	#include <tmmintrin.h>

	/* Comment these declarations out to benchmark inlined functions. */
	static void fast04u(unsigned int, char *) __attribute__((noinline));
	static void slow04u(unsigned int, char *) __attribute__((noinline));
	static void fast08u(unsigned int, char *) __attribute__((noinline));
	static void slow08u(unsigned int, char *) __attribute__((noinline));

	void test(char *buf) __attribute__((noinline));

	/*
	* Given an integer u from 0 to 9999, we want to perform 3 divisions
	* by constants 10, 100 and 1000 in parallel and calculate four digits
	* u - u/1010, u/10 - u/10010, etc. These digits can be shuffled,
	* converted to ascii and stored in memory as four consecutive bytes.
	*
	* One common approach to constant division is double-width multiplication
	* by a magic constant and shifting high-word to the right by a constant
	* number of bits.
	*
	* Double-width multiplication in xmm register can be done with pmuludq
	* but it operates on two 32-bit words while we need at least three
	* multiplications. For u that fits into 16-bit word, we can try pmaddwd
	* which multiplies eight signed 16-bit words, takes sums of pairs and
	* stores the results in four 32-bit words.
	*
	* The algorithm below uses these magic multiplications:
	*
	* u/10 : u * 26215 / 2^18,
	* u/100 : u * 10486 / 2^20,
	* u/1000 : u * 8389 / 2^23.
	*
	* The shifts are all different but it doesn't matter. Instead of
	* shifting to the right, low bits are masked and values are later
	* multiplied to scale the results by 256.
	*/
	static inline __m128i d4toa(unsigned int u)
	{
	/*
	* Multiply u by -65536 (to move -u to the high word),
	* 26215 (magic for u/10), 10486 (magic for u/100) and
	* 8389 (magic for u/1000).
	*/
	const __m128i first_madd =
	_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);

	/*
	* Zero-out 18 low bits of u26215, 20 low bits of u10486
	* and 23 low bits of u*8389:
	* [-u, 0, u/104, 0, u/10016, 0, u/1000*128].
	*/
	const __m128i mask =
	_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);

	/*
	* Input value
	*
	* [-u, u/104, u/104, u/10016, u/10016, u/1000128, n/1000128, 0]
	*
	* is multiplied to produce 4 scaled digits:
	*
	* [(-u)-256 - (u/104)1064, 0, (u/104)64 - (u/10016)16*10,
	* (u/10016)16 - (u/1000128)210, (n/1000128)*2]
	*/
	const __m128i second_madd =
	_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);

	/*
	* Shuffle digits to low bytes and OR with ascii zeroes.
	* Only low 32-bit word matter, three other words can
	* have any values.
	*/
	const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
	const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);

	__m128i x;

	//assert(u <= 9999);

	x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
	x = _mm_and_si128(x, mask);
	x = _mm_or_si128(x, _mm_slli_si128(x, 2));
	x = _mm_madd_epi16(x, second_madd);
	x = _mm_shuffle_epi8(x, shuffle);
	x = _mm_or_si128(x, ascii_zero);

	return x;
	}

	static void fast04u(unsigned int u, char *p)
	{
	(int )p = _mm_cvtsi128_si32(d4toa(u));
	}

	static void fast08u(unsigned int u, char *p)
	{
	unsigned int v = u / 10000;

	u -= v * 10000;

	(int )(p + 0) = _mm_cvtsi128_si32(d4toa(v));
	(int )(p + 4) = _mm_cvtsi128_si32(d4toa(u));
	}

	/* This macro comes from LuaJIT 2.1, it's defined in src/lj_strfmt.c. */
	#define WINT_R(x, sh, sc) \
	{ uint32_t d = (x(((1<<sh)+sc-1)/sc))>>sh; x -= dsc; *p++ = (char)('0'+d); }

	static void slow04u(unsigned int u, char *p)
	{
	#if defined(WINT_R)
	WINT_R(u, 23, 1000)
	WINT_R(u, 12, 100)
	WINT_R(u, 10, 10)
	*p++ = (char)('0'+u);
	#else
	p[3] = '0' + u % 10; u /= 10;
	p[2] = '0' + u % 10; u /= 10;
	p[1] = '0' + u % 10; u /= 10;
	p[0] = '0' + u % 10;
	#endif
	}

	static void slow08u(unsigned int u, char *p)
	{
	unsigned int v = u / 10000;

	u -= v * 10000;

	#if defined(WINT_R)
	WINT_R(v, 23, 1000)
	WINT_R(v, 12, 100)
	WINT_R(v, 10, 10)
	*p++ = (char)('0'+v);
	WINT_R(u, 23, 1000)
	WINT_R(u, 12, 100)
	WINT_R(u, 10, 10)
	*p++ = (char)('0'+u);
	#else
	p[7] = '0' + u % 10; u /= 10;
	p[6] = '0' + u % 10; u /= 10;
	p[5] = '0' + u % 10; u /= 10;
	p[4] = '0' + u % 10; u /= 10;
	p[3] = '0' + u % 10; u /= 10;
	p[2] = '0' + u % 10; u /= 10;
	p[1] = '0' + u % 10; u /= 10;
	p[0] = '0' + u % 10;
	#endif
	}

	#undef WINT_R

	void test(char *buf)
	{
	char expected[8];
	int i;

	for (i = 0; i <= 9999; i++) {
	sprintf(expected, "%04d", i);
	fast04u(i, buf);
	if (memcmp(buf, expected, 4) != 0)
	errx(EXIT_FAILURE, "%.4s != %s", buf, expected);
	}
	}

	/*
	* cc -DNDEBUG -O3 -mssse3 [-march=your favourite arch] fast-uint-conv.c
	*
	* ./a.out # quick test
	*
	* time ./a.out 1e9 # benchmark fast08u()
	* time ./a.out -1e9 # benchmark slow08u()
	*/
	int main(int argc, char *argv[])
	{
	long i, n;
	char buf[4];

	if (argc == 1) {
	test(buf);
	} else {
	n = strtod(argv[1], NULL);

	if (n > 0) {
	for (i = n; i != 0; i--)
	fast08u(i, buf);
	} else {
	for (i = -n; i != 0; i--)
	slow08u(i, buf);
	}
	}

	/* Foul the compiler. */
	return (buf[0] ^ buf[1] ^ buf[2] ^ buf[3]) >> 6;
	}