pkhuong/u4.c

## u4.c
#include <assert.h>
#include <err.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <emmintrin.h>
#include <tmmintrin.h>

#include "cycle.h"

static inline __m128i d4toa(unsigned int u)
{
	/*
	 * Multiply u by -65536 (to move -u to the high word),
	 * 26215 (magic for u/10), 10486 (magic for u/100) and
	 * 8389 (magic for u/1000).
	 */
	const __m128i first_madd =
	    _mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);

	/*
	 * Zero-out 18 low bits of u*26215, 20 low bits of u*10486
	 * and 23 low bits of u*8389:
	 * [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
	 */
	const __m128i mask =
	    _mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);

	/*
	 * Input value
	 *
	 * [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
	 *
	 * is multiplied to produce 4 scaled digits:
	 *
	 * [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
	 *  (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
	 */
	const __m128i second_madd =
	    _mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);

	/*
	 * Shuffle digits to low bytes and OR with ascii zeroes.
	 * Only low 32-bit word matter, three other words can
	 * have any values.
	 */
	const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
	const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);

	__m128i x;

	//assert(u <= 9999);

	x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
	x = _mm_and_si128(x, mask);
	x = _mm_or_si128(x, _mm_slli_si128(x, 2));
	x = _mm_madd_epi16(x, second_madd);
	x = _mm_shuffle_epi8(x, shuffle);
	x = _mm_or_si128(x, ascii_zero);

	return x;
}

#define INLINE static inline __attribute__((always_inline))

typedef __uint128_t uint128_t;

/* Powers of 10. */
#define PO2                100ULL
#define PO4              10000ULL
#define PO8          100000000ULL
#define PO10       10000000000ULL
#define PO16 10000000000000000ULL

/* 64 bit's worth of '0' characters (i.e., 8 characters). */
#define ZERO_CHARS 0x3030303030303030ULL

INLINE uint32_t
encode_hundreds(uint32_t hi, uint32_t lo)
{
	/*
	 * Pack everything in a single 32 bit value.
	 *
	 * merged = [ hi 0 lo 0 ]
	 */
        uint32_t merged = hi | (lo << 16);
	/*
	 * Fixed-point multiplication by 103/1024 ~= 1/10.
	 */
        uint32_t tens = (merged * 103UL) >> 10;

	/*
	 * Mask away garbage bits between our digits.
	 *
	 * tens = [ hi/10 0 lo/10 0 ]
	 *
	 * On a platform with more restricted literals (ARM, for
	 * instance), it may make sense to and-not with the middle
	 * bits.
	 */
        tens &= (0xFUL << 16) | 0xFUL;

	/*
	 * x mod 10 = x - 10 * (x div 10).
	 *
	 * (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
	 *
	 * Then insert these values between tens.  Arithmetic instead
	 * of bitwise operation helps the compiler merge this with
	 * later increments by a constant (e.g., ZERO_CHARS).
	 */
	return tens + ((merged - 10UL * tens) << 8);
}

INLINE void *
itoa_ten_thousand(void *out, unsigned int x)
{
	uint32_t buf;
	unsigned int x_div_PO2, x_mod_PO2;
	unsigned int zeros;

	x_div_PO2 = (x * 10486UL) >> 20;
	x_mod_PO2 = x - PO2 * x_div_PO2;
	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
	/*
	 * Count leading (in memory, trailing in register: we're
	 * little endian) zero bytes: count leading zero bits and
	 * round down to 8.
	 */
	zeros = __builtin_ctz(buf) & -8U;
	buf += ZERO_CHARS; /* BCD -> ASCII. */
	buf >>= zeros; /* Shift away leading zero characters */
	memcpy(out, &buf, 4);

	/* zeros is in bits; convert to bytes to find actual length. */
	return (char *)out + 4 - zeros / 8;
}

INLINE void *
itoa_ten_thousand_pad(void *out, unsigned int x)
{
	uint32_t buf;
	unsigned int x_div_PO2, x_mod_PO2;
	unsigned int zeros;

	x_div_PO2 = (x * 10486UL) >> 20;
	x_mod_PO2 = x - PO2 * x_div_PO2;
	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
	buf += ZERO_CHARS; /* BCD -> ASCII. */
	memcpy(out, &buf, 4);
	return (char *)out + 4;
}

static uint32_t fast04u(unsigned int) __attribute__((noinline));
static uint32_t an_4u(unsigned int) __attribute__((noinline));
static uint32_t an_04u(unsigned int) __attribute__((noinline));
static uint32_t nop(unsigned int) __attribute__((noinline));

static uint32_t fast04u(unsigned int u)
{
	uint32_t ret = _mm_cvtsi128_si32(d4toa(u));

	asm("# foo" : "+r"(ret) :: "memory");
	return ret;
}

static uint32_t an_4u(unsigned int u)
{
	uint32_t buf;

	itoa_ten_thousand(&buf, u);
	asm("" : "+r"(buf) :: "memory");
	return buf;
}

static uint32_t an_04u(unsigned int u)
{
	uint32_t buf;

	itoa_ten_thousand_pad(&buf, u);
	asm("" : "+r"(buf) :: "memory");
	return buf;
}

static uint32_t nop(unsigned int u)
{

	asm("" : "+r"(u));
	return u;
}

static void
print_u32(uint32_t buf)
{

	printf("%.4s\n", (char *)&buf);
	return;
}

static int
cmp_u64(const void *vx, const void *vy)
{
	const uint64_t *x = vx;
	const uint64_t *y = vy;

	if (*x == *y) {
		return 0;
	}

	return (*x < *y) ? -1 : 1;
}

static __attribute__((noinline)) void
report(uint64_t *times, size_t n)
{

	qsort(times, n, sizeof(*times), cmp_u64);
	printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
	    times[n / 100], times[n / 10], times[n / 2]);
	return;
}

static inline void
cpuid(void)
{
	uint32_t a, b, c, d;

	asm("cpuid"
	    : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
	    : "0"(0)
	    : "memory");
	return;
}

#define NRUNS 1000UL
#define K 100

int
main()
{
	static uint64_t times[NRUNS];

#define TEST(X) do {				\
		print_u32(fast04u((X)));	\
		print_u32(an_04u((X)));		\
		print_u32(an_4u((X)));		\
	} while (0)

	/* TEST(0); */
	/* TEST(1); */
	/* TEST(12); */
	/* TEST(123); */
	/* TEST(1234); */
	/* TEST(1230); */
	/* TEST(9900); */

	asm("" ::: "memory");
#define TEST_SERIAL(FUN) do {					\
		memset(times, 0, sizeof(times));		\
								\
		for (size_t i = 0; i < NRUNS; i++) {		\
			ticks begin, end;			\
			uint32_t acc = 0;			\
								\
			cpuid();				\
			begin = getticks();			\
			for (size_t j = 0; j < K; j++) {	\
				asm("# foo" ::: "memory");	\
				acc = FUN (acc % 8192);		\
			}					\
								\
			cpuid();				\
			end = getticks();			\
			times[i] = elapsed(end, begin);		\
		}						\
								\
		report(times, NRUNS);				\
	} while (0)

	printf("serial\n");
	TEST_SERIAL(nop);
	TEST_SERIAL(fast04u);
	TEST_SERIAL(an_04u);
	TEST_SERIAL(an_4u);

#define TEST_THROUGHPUT(FUN) do {				\
		static uint32_t dst[10 * (K)];			\
								\
		memset(times, 0, sizeof(times));		\
		memset(dst, 0, sizeof(dst));			\
								\
		for (size_t i = 0; i < NRUNS; i++) {		\
			ticks begin, end;			\
			uint32_t acc = 0;			\
								\
			cpuid();				\
			begin = getticks();			\
			for (size_t j = 0; j < 10 * (K); j++) {	\
				dst[j] = FUN (j);		\
			}					\
								\
			cpuid();				\
			end = getticks();			\
			times[i] = elapsed(end, begin);		\
		}						\
								\
		report(times, NRUNS);				\
	} while (0)

	printf("throughput\n");
	TEST_THROUGHPUT(nop);
	TEST_THROUGHPUT(fast04u);
	TEST_THROUGHPUT(an_04u);
	TEST_THROUGHPUT(an_4u);

	return 0;
}
	#include <assert.h>
	#include <err.h>
	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include <emmintrin.h>
	#include <tmmintrin.h>

	#include "cycle.h"

	static inline __m128i d4toa(unsigned int u)
	{
	/*
	* Multiply u by -65536 (to move -u to the high word),
	* 26215 (magic for u/10), 10486 (magic for u/100) and
	* 8389 (magic for u/1000).
	*/
	const __m128i first_madd =
	_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);

	/*
	* Zero-out 18 low bits of u26215, 20 low bits of u10486
	* and 23 low bits of u*8389:
	* [-u, 0, u/104, 0, u/10016, 0, u/1000*128].
	*/
	const __m128i mask =
	_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);

	/*
	* Input value
	*
	* [-u, u/104, u/104, u/10016, u/10016, u/1000128, n/1000128, 0]
	*
	* is multiplied to produce 4 scaled digits:
	*
	* [(-u)-256 - (u/104)1064, 0, (u/104)64 - (u/10016)16*10,
	* (u/10016)16 - (u/1000128)210, (n/1000128)*2]
	*/
	const __m128i second_madd =
	_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);

	/*
	* Shuffle digits to low bytes and OR with ascii zeroes.
	* Only low 32-bit word matter, three other words can
	* have any values.
	*/
	const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
	const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);

	__m128i x;

	//assert(u <= 9999);

	x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
	x = _mm_and_si128(x, mask);
	x = _mm_or_si128(x, _mm_slli_si128(x, 2));
	x = _mm_madd_epi16(x, second_madd);
	x = _mm_shuffle_epi8(x, shuffle);
	x = _mm_or_si128(x, ascii_zero);

	return x;
	}

	#define INLINE static inline __attribute__((always_inline))

	typedef __uint128_t uint128_t;

	/* Powers of 10. */
	#define PO2 100ULL
	#define PO4 10000ULL
	#define PO8 100000000ULL
	#define PO10 10000000000ULL
	#define PO16 10000000000000000ULL

	/* 64 bit's worth of '0' characters (i.e., 8 characters). */
	#define ZERO_CHARS 0x3030303030303030ULL

	INLINE uint32_t
	encode_hundreds(uint32_t hi, uint32_t lo)
	{
	/*
	* Pack everything in a single 32 bit value.
	*
	* merged = [ hi 0 lo 0 ]
	*/
	uint32_t merged = hi \| (lo << 16);
	/*
	* Fixed-point multiplication by 103/1024 ~= 1/10.
	*/
	uint32_t tens = (merged * 103UL) >> 10;

	/*
	* Mask away garbage bits between our digits.
	*
	* tens = [ hi/10 0 lo/10 0 ]
	*
	* On a platform with more restricted literals (ARM, for
	* instance), it may make sense to and-not with the middle
	* bits.
	*/
	tens &= (0xFUL << 16) \| 0xFUL;

	/*
	* x mod 10 = x - 10 * (x div 10).
	*
	* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
	*
	* Then insert these values between tens. Arithmetic instead
	* of bitwise operation helps the compiler merge this with
	* later increments by a constant (e.g., ZERO_CHARS).
	*/
	return tens + ((merged - 10UL * tens) << 8);
	}

	INLINE void *
	itoa_ten_thousand(void *out, unsigned int x)
	{
	uint32_t buf;
	unsigned int x_div_PO2, x_mod_PO2;
	unsigned int zeros;

	x_div_PO2 = (x * 10486UL) >> 20;
	x_mod_PO2 = x - PO2 * x_div_PO2;
	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
	/*
	* Count leading (in memory, trailing in register: we're
	* little endian) zero bytes: count leading zero bits and
	* round down to 8.
	*/
	zeros = __builtin_ctz(buf) & -8U;
	buf += ZERO_CHARS; /* BCD -> ASCII. */
	buf >>= zeros; /* Shift away leading zero characters */
	memcpy(out, &buf, 4);

	/* zeros is in bits; convert to bytes to find actual length. */
	return (char *)out + 4 - zeros / 8;
	}

	INLINE void *
	itoa_ten_thousand_pad(void *out, unsigned int x)
	{
	uint32_t buf;
	unsigned int x_div_PO2, x_mod_PO2;
	unsigned int zeros;

	x_div_PO2 = (x * 10486UL) >> 20;
	x_mod_PO2 = x - PO2 * x_div_PO2;
	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
	buf += ZERO_CHARS; /* BCD -> ASCII. */
	memcpy(out, &buf, 4);
	return (char *)out + 4;
	}

	static uint32_t fast04u(unsigned int) __attribute__((noinline));
	static uint32_t an_4u(unsigned int) __attribute__((noinline));
	static uint32_t an_04u(unsigned int) __attribute__((noinline));
	static uint32_t nop(unsigned int) __attribute__((noinline));

	static uint32_t fast04u(unsigned int u)
	{
	uint32_t ret = _mm_cvtsi128_si32(d4toa(u));

	asm("# foo" : "+r"(ret) :: "memory");
	return ret;
	}

	static uint32_t an_4u(unsigned int u)
	{
	uint32_t buf;

	itoa_ten_thousand(&buf, u);
	asm("" : "+r"(buf) :: "memory");
	return buf;
	}

	static uint32_t an_04u(unsigned int u)
	{
	uint32_t buf;

	itoa_ten_thousand_pad(&buf, u);
	asm("" : "+r"(buf) :: "memory");
	return buf;
	}

	static uint32_t nop(unsigned int u)
	{

	asm("" : "+r"(u));
	return u;
	}

	static void
	print_u32(uint32_t buf)
	{

	printf("%.4s\n", (char *)&buf);
	return;
	}

	static int
	cmp_u64(const void vx, const void vy)
	{
	const uint64_t *x = vx;
	const uint64_t *y = vy;

	if (x == y) {
	return 0;
	}

	return (x < y) ? -1 : 1;
	}

	static __attribute__((noinline)) void
	report(uint64_t *times, size_t n)
	{

	qsort(times, n, sizeof(*times), cmp_u64);
	printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
	times[n / 100], times[n / 10], times[n / 2]);
	return;
	}

	static inline void
	cpuid(void)
	{
	uint32_t a, b, c, d;

	asm("cpuid"
	: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
	: "0"(0)
	: "memory");
	return;
	}

	#define NRUNS 1000UL
	#define K 100

	int
	main()
	{
	static uint64_t times[NRUNS];

	#define TEST(X) do { \
	print_u32(fast04u((X))); \
	print_u32(an_04u((X))); \
	print_u32(an_4u((X))); \
	} while (0)

	/* TEST(0); */
	/* TEST(1); */
	/* TEST(12); */
	/* TEST(123); */
	/* TEST(1234); */
	/* TEST(1230); */
	/* TEST(9900); */

	asm("" ::: "memory");
	#define TEST_SERIAL(FUN) do { \
	memset(times, 0, sizeof(times)); \
	\
	for (size_t i = 0; i < NRUNS; i++) { \
	ticks begin, end; \
	uint32_t acc = 0; \
	\
	cpuid(); \
	begin = getticks(); \
	for (size_t j = 0; j < K; j++) { \
	asm("# foo" ::: "memory"); \
	acc = FUN (acc % 8192); \
	} \
	\
	cpuid(); \
	end = getticks(); \
	times[i] = elapsed(end, begin); \
	} \
	\
	report(times, NRUNS); \
	} while (0)

	printf("serial\n");
	TEST_SERIAL(nop);
	TEST_SERIAL(fast04u);
	TEST_SERIAL(an_04u);
	TEST_SERIAL(an_4u);

	#define TEST_THROUGHPUT(FUN) do { \
	static uint32_t dst[10 * (K)]; \
	\
	memset(times, 0, sizeof(times)); \
	memset(dst, 0, sizeof(dst)); \
	\
	for (size_t i = 0; i < NRUNS; i++) { \
	ticks begin, end; \
	uint32_t acc = 0; \
	\
	cpuid(); \
	begin = getticks(); \
	for (size_t j = 0; j < 10 * (K); j++) { \
	dst[j] = FUN (j); \
	} \
	\
	cpuid(); \
	end = getticks(); \
	times[i] = elapsed(end, begin); \
	} \
	\
	report(times, NRUNS); \
	} while (0)

	printf("throughput\n");
	TEST_THROUGHPUT(nop);
	TEST_THROUGHPUT(fast04u);
	TEST_THROUGHPUT(an_04u);
	TEST_THROUGHPUT(an_4u);

	return 0;
	}