Skip to content

Instantly share code, notes, and snippets.

@pkhuong
Created December 24, 2017 05:04
Show Gist options
  • Save pkhuong/a545b5dc072ee4050948c1910b58a014 to your computer and use it in GitHub Desktop.
Save pkhuong/a545b5dc072ee4050948c1910b58a014 to your computer and use it in GitHub Desktop.
#include <assert.h>
#include <err.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "cycle.h"
static inline __m128i d4toa(unsigned int u)
{
/*
* Multiply u by -65536 (to move -u to the high word),
* 26215 (magic for u/10), 10486 (magic for u/100) and
* 8389 (magic for u/1000).
*/
const __m128i first_madd =
_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);
/*
* Zero-out 18 low bits of u*26215, 20 low bits of u*10486
* and 23 low bits of u*8389:
* [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
*/
const __m128i mask =
_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);
/*
* Input value
*
* [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
*
* is multiplied to produce 4 scaled digits:
*
* [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
* (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
*/
const __m128i second_madd =
_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);
/*
* Shuffle digits to low bytes and OR with ascii zeroes.
* Only low 32-bit word matter, three other words can
* have any values.
*/
const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);
__m128i x;
//assert(u <= 9999);
x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
x = _mm_and_si128(x, mask);
x = _mm_or_si128(x, _mm_slli_si128(x, 2));
x = _mm_madd_epi16(x, second_madd);
x = _mm_shuffle_epi8(x, shuffle);
x = _mm_or_si128(x, ascii_zero);
return x;
}
#define INLINE static inline __attribute__((always_inline))
typedef __uint128_t uint128_t;
/* Powers of 10. */
#define PO2 100ULL
#define PO4 10000ULL
#define PO8 100000000ULL
#define PO10 10000000000ULL
#define PO16 10000000000000000ULL
/* 64 bit's worth of '0' characters (i.e., 8 characters). */
#define ZERO_CHARS 0x3030303030303030ULL
INLINE uint32_t
encode_hundreds(uint32_t hi, uint32_t lo)
{
/*
* Pack everything in a single 32 bit value.
*
* merged = [ hi 0 lo 0 ]
*/
uint32_t merged = hi | (lo << 16);
/*
* Fixed-point multiplication by 103/1024 ~= 1/10.
*/
uint32_t tens = (merged * 103UL) >> 10;
/*
* Mask away garbage bits between our digits.
*
* tens = [ hi/10 0 lo/10 0 ]
*
* On a platform with more restricted literals (ARM, for
* instance), it may make sense to and-not with the middle
* bits.
*/
tens &= (0xFUL << 16) | 0xFUL;
/*
* x mod 10 = x - 10 * (x div 10).
*
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
*
* Then insert these values between tens. Arithmetic instead
* of bitwise operation helps the compiler merge this with
* later increments by a constant (e.g., ZERO_CHARS).
*/
return tens + ((merged - 10UL * tens) << 8);
}
INLINE void *
itoa_ten_thousand(void *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
/*
* Count leading (in memory, trailing in register: we're
* little endian) zero bytes: count leading zero bits and
* round down to 8.
*/
zeros = __builtin_ctz(buf) & -8U;
buf += ZERO_CHARS; /* BCD -> ASCII. */
buf >>= zeros; /* Shift away leading zero characters */
memcpy(out, &buf, 4);
/* zeros is in bits; convert to bytes to find actual length. */
return (char *)out + 4 - zeros / 8;
}
INLINE void *
itoa_ten_thousand_pad(void *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
buf += ZERO_CHARS; /* BCD -> ASCII. */
memcpy(out, &buf, 4);
return (char *)out + 4;
}
static uint32_t fast04u(unsigned int) __attribute__((noinline));
static uint32_t an_4u(unsigned int) __attribute__((noinline));
static uint32_t an_04u(unsigned int) __attribute__((noinline));
static uint32_t nop(unsigned int) __attribute__((noinline));
static uint32_t fast04u(unsigned int u)
{
uint32_t ret = _mm_cvtsi128_si32(d4toa(u));
asm("# foo" : "+r"(ret) :: "memory");
return ret;
}
static uint32_t an_4u(unsigned int u)
{
uint32_t buf;
itoa_ten_thousand(&buf, u);
asm("" : "+r"(buf) :: "memory");
return buf;
}
static uint32_t an_04u(unsigned int u)
{
uint32_t buf;
itoa_ten_thousand_pad(&buf, u);
asm("" : "+r"(buf) :: "memory");
return buf;
}
static uint32_t nop(unsigned int u)
{
asm("" : "+r"(u));
return u;
}
static void
print_u32(uint32_t buf)
{
printf("%.4s\n", (char *)&buf);
return;
}
static int
cmp_u64(const void *vx, const void *vy)
{
const uint64_t *x = vx;
const uint64_t *y = vy;
if (*x == *y) {
return 0;
}
return (*x < *y) ? -1 : 1;
}
static __attribute__((noinline)) void
report(uint64_t *times, size_t n)
{
qsort(times, n, sizeof(*times), cmp_u64);
printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
times[n / 100], times[n / 10], times[n / 2]);
return;
}
static inline void
cpuid(void)
{
uint32_t a, b, c, d;
asm("cpuid"
: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
: "0"(0)
: "memory");
return;
}
#define NRUNS 1000UL
#define K 100
int
main()
{
static uint64_t times[NRUNS];
#define TEST(X) do { \
print_u32(fast04u((X))); \
print_u32(an_04u((X))); \
print_u32(an_4u((X))); \
} while (0)
/* TEST(0); */
/* TEST(1); */
/* TEST(12); */
/* TEST(123); */
/* TEST(1234); */
/* TEST(1230); */
/* TEST(9900); */
asm("" ::: "memory");
#define TEST_SERIAL(FUN) do { \
memset(times, 0, sizeof(times)); \
\
for (size_t i = 0; i < NRUNS; i++) { \
ticks begin, end; \
uint32_t acc = 0; \
\
cpuid(); \
begin = getticks(); \
for (size_t j = 0; j < K; j++) { \
asm("# foo" ::: "memory"); \
acc = FUN (acc % 8192); \
} \
\
cpuid(); \
end = getticks(); \
times[i] = elapsed(end, begin); \
} \
\
report(times, NRUNS); \
} while (0)
printf("serial\n");
TEST_SERIAL(nop);
TEST_SERIAL(fast04u);
TEST_SERIAL(an_04u);
TEST_SERIAL(an_4u);
#define TEST_THROUGHPUT(FUN) do { \
static uint32_t dst[10 * (K)]; \
\
memset(times, 0, sizeof(times)); \
memset(dst, 0, sizeof(dst)); \
\
for (size_t i = 0; i < NRUNS; i++) { \
ticks begin, end; \
uint32_t acc = 0; \
\
cpuid(); \
begin = getticks(); \
for (size_t j = 0; j < 10 * (K); j++) { \
dst[j] = FUN (j); \
} \
\
cpuid(); \
end = getticks(); \
times[i] = elapsed(end, begin); \
} \
\
report(times, NRUNS); \
} while (0)
printf("throughput\n");
TEST_THROUGHPUT(nop);
TEST_THROUGHPUT(fast04u);
TEST_THROUGHPUT(an_04u);
TEST_THROUGHPUT(an_4u);
return 0;
}
@alnsn
Copy link

alnsn commented Dec 24, 2017

On Intel(R) Xeon(R) CPU E3-1245 v6 @ 3.70GHz (https://ark.intel.com/products/97473/Intel-Xeon-Processor-E3-1245-v6-8M-Cache-3_70-GHz) running NetBSD 8.0 (beta).

$ gcc -O3 -march=corei7-avx -static u4.c
$ ./a.out
serial
124 126 128
2102 2104 2110
2196 2198 2204
2498 2534 2540
throughput
3820 3824 3832
5288 5294 5304
5880 5898 5924
8718 8754 8790

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment