Skip to content

Instantly share code, notes, and snippets.

@pkhuong
Created December 24, 2017 05:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save pkhuong/a545b5dc072ee4050948c1910b58a014 to your computer and use it in GitHub Desktop.
Save pkhuong/a545b5dc072ee4050948c1910b58a014 to your computer and use it in GitHub Desktop.
#include <assert.h>
#include <err.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "cycle.h"
static inline __m128i d4toa(unsigned int u)
{
/*
* Multiply u by -65536 (to move -u to the high word),
* 26215 (magic for u/10), 10486 (magic for u/100) and
* 8389 (magic for u/1000).
*/
const __m128i first_madd =
_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);
/*
* Zero-out 18 low bits of u*26215, 20 low bits of u*10486
* and 23 low bits of u*8389:
* [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
*/
const __m128i mask =
_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);
/*
* Input value
*
* [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
*
* is multiplied to produce 4 scaled digits:
*
* [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
* (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
*/
const __m128i second_madd =
_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);
/*
* Shuffle digits to low bytes and OR with ascii zeroes.
* Only low 32-bit word matter, three other words can
* have any values.
*/
const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);
__m128i x;
//assert(u <= 9999);
x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
x = _mm_and_si128(x, mask);
x = _mm_or_si128(x, _mm_slli_si128(x, 2));
x = _mm_madd_epi16(x, second_madd);
x = _mm_shuffle_epi8(x, shuffle);
x = _mm_or_si128(x, ascii_zero);
return x;
}
#define INLINE static inline __attribute__((always_inline))
typedef __uint128_t uint128_t;
/* Powers of 10. */
#define PO2 100ULL
#define PO4 10000ULL
#define PO8 100000000ULL
#define PO10 10000000000ULL
#define PO16 10000000000000000ULL
/* 64 bit's worth of '0' characters (i.e., 8 characters). */
#define ZERO_CHARS 0x3030303030303030ULL
INLINE uint32_t
encode_hundreds(uint32_t hi, uint32_t lo)
{
/*
* Pack everything in a single 32 bit value.
*
* merged = [ hi 0 lo 0 ]
*/
uint32_t merged = hi | (lo << 16);
/*
* Fixed-point multiplication by 103/1024 ~= 1/10.
*/
uint32_t tens = (merged * 103UL) >> 10;
/*
* Mask away garbage bits between our digits.
*
* tens = [ hi/10 0 lo/10 0 ]
*
* On a platform with more restricted literals (ARM, for
* instance), it may make sense to and-not with the middle
* bits.
*/
tens &= (0xFUL << 16) | 0xFUL;
/*
* x mod 10 = x - 10 * (x div 10).
*
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
*
* Then insert these values between tens. Arithmetic instead
* of bitwise operation helps the compiler merge this with
* later increments by a constant (e.g., ZERO_CHARS).
*/
return tens + ((merged - 10UL * tens) << 8);
}
INLINE void *
itoa_ten_thousand(void *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
/*
* Count leading (in memory, trailing in register: we're
* little endian) zero bytes: count leading zero bits and
* round down to 8.
*/
zeros = __builtin_ctz(buf) & -8U;
buf += ZERO_CHARS; /* BCD -> ASCII. */
buf >>= zeros; /* Shift away leading zero characters */
memcpy(out, &buf, 4);
/* zeros is in bits; convert to bytes to find actual length. */
return (char *)out + 4 - zeros / 8;
}
INLINE void *
itoa_ten_thousand_pad(void *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
buf += ZERO_CHARS; /* BCD -> ASCII. */
memcpy(out, &buf, 4);
return (char *)out + 4;
}
static uint32_t fast04u(unsigned int) __attribute__((noinline));
static uint32_t an_4u(unsigned int) __attribute__((noinline));
static uint32_t an_04u(unsigned int) __attribute__((noinline));
static uint32_t nop(unsigned int) __attribute__((noinline));
static uint32_t fast04u(unsigned int u)
{
uint32_t ret = _mm_cvtsi128_si32(d4toa(u));
asm("# foo" : "+r"(ret) :: "memory");
return ret;
}
static uint32_t an_4u(unsigned int u)
{
uint32_t buf;
itoa_ten_thousand(&buf, u);
asm("" : "+r"(buf) :: "memory");
return buf;
}
static uint32_t an_04u(unsigned int u)
{
uint32_t buf;
itoa_ten_thousand_pad(&buf, u);
asm("" : "+r"(buf) :: "memory");
return buf;
}
static uint32_t nop(unsigned int u)
{
asm("" : "+r"(u));
return u;
}
static void
print_u32(uint32_t buf)
{
printf("%.4s\n", (char *)&buf);
return;
}
static int
cmp_u64(const void *vx, const void *vy)
{
const uint64_t *x = vx;
const uint64_t *y = vy;
if (*x == *y) {
return 0;
}
return (*x < *y) ? -1 : 1;
}
static __attribute__((noinline)) void
report(uint64_t *times, size_t n)
{
qsort(times, n, sizeof(*times), cmp_u64);
printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
times[n / 100], times[n / 10], times[n / 2]);
return;
}
static inline void
cpuid(void)
{
uint32_t a, b, c, d;
asm("cpuid"
: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
: "0"(0)
: "memory");
return;
}
#define NRUNS 1000UL
#define K 100
int
main()
{
static uint64_t times[NRUNS];
#define TEST(X) do { \
print_u32(fast04u((X))); \
print_u32(an_04u((X))); \
print_u32(an_4u((X))); \
} while (0)
/* TEST(0); */
/* TEST(1); */
/* TEST(12); */
/* TEST(123); */
/* TEST(1234); */
/* TEST(1230); */
/* TEST(9900); */
asm("" ::: "memory");
#define TEST_SERIAL(FUN) do { \
memset(times, 0, sizeof(times)); \
\
for (size_t i = 0; i < NRUNS; i++) { \
ticks begin, end; \
uint32_t acc = 0; \
\
cpuid(); \
begin = getticks(); \
for (size_t j = 0; j < K; j++) { \
asm("# foo" ::: "memory"); \
acc = FUN (acc % 8192); \
} \
\
cpuid(); \
end = getticks(); \
times[i] = elapsed(end, begin); \
} \
\
report(times, NRUNS); \
} while (0)
printf("serial\n");
TEST_SERIAL(nop);
TEST_SERIAL(fast04u);
TEST_SERIAL(an_04u);
TEST_SERIAL(an_4u);
#define TEST_THROUGHPUT(FUN) do { \
static uint32_t dst[10 * (K)]; \
\
memset(times, 0, sizeof(times)); \
memset(dst, 0, sizeof(dst)); \
\
for (size_t i = 0; i < NRUNS; i++) { \
ticks begin, end; \
uint32_t acc = 0; \
\
cpuid(); \
begin = getticks(); \
for (size_t j = 0; j < 10 * (K); j++) { \
dst[j] = FUN (j); \
} \
\
cpuid(); \
end = getticks(); \
times[i] = elapsed(end, begin); \
} \
\
report(times, NRUNS); \
} while (0)
printf("throughput\n");
TEST_THROUGHPUT(nop);
TEST_THROUGHPUT(fast04u);
TEST_THROUGHPUT(an_04u);
TEST_THROUGHPUT(an_4u);
return 0;
}
@pkhuong
Copy link
Author

pkhuong commented Dec 24, 2017

On the same E5-2630L v2 @ 2.40GHz

308% ./a.out
serial
122 126 130 # (1st, 10th, 50th percentile) baseline for 100 conversions
1646 1646 1650 # fast04u
1986 1988 1990 # an with 0 left padding
2260 2262 2274 # an with no leading 0
throughput
5154 5154 5156 # baseline cycle count for 1000 independent conversion
7722 7724 7728 # fast04u
6906 6924 6936 # an with leading 0
9454 9670 9688 # an without leading 0

For serial conversions, if leading 0s are OK, fast04u is ~15 cycles per conversion, VS 19 for AN w/ left padding, and 21 if we also remove the left padding.

For independent (high ILP) conversions, fast04u is ~2.6 cycles per conversion, VS 1.8 for AN w/ left padding and 4.3 if we also remove the left padding.

The E5 is pretty old. Newer machines might have improved the throughput ratio for SSE2 VS GPRs. The big win here is relaxing the requirement that there be no leading 0 (which some systems might parse as octal ;)

an_04u:

    .cfi_startproc
    movl    %edi, %eax
    movl    $100, %ecx
    movl    $103, %esi
    imulq   $10486, %rax, %rax
    shrq    $20, %rax
    movl    %eax, %edx
    imull   %ecx, %edx
    subl    %edx, %edi
    sall    $16, %edi
    orl     %eax, %edi
    movl    %edi, %eax
    imulq   %rsi, %rax
    shrq    $10, %rax
    andl    $983055, %eax
    leal    (%rax,%rax,4), %edx
    addl    %edx, %edx
    subl    %edx, %edi
    sall    $8, %edi
    leal    808464432(%rax,%rdi), %eax
    ret

fast04u:

    .cfi_startproc
    vmovdqa .LC1(%rip), %xmm0
    vmovd   %edi, %xmm1
    vpshufb .LC0(%rip), %xmm1, %xmm1
    vpmaddwd        %xmm0, %xmm1, %xmm1
    vpand   .LC2(%rip), %xmm1, %xmm1
    vpslldq $2, %xmm1, %xmm0
    vpor    %xmm0, %xmm1, %xmm1
    vmovdqa .LC3(%rip), %xmm0
    vpmaddwd        %xmm0, %xmm1, %xmm0
    vpshufb .LC4(%rip), %xmm0, %xmm0
    vpor    .LC5(%rip), %xmm0, %xmm0
    vmovd   %xmm0, %eax

an_4u:

    .cfi_startproc
    movl    %edi, %eax
    movl    $100, %ecx
    movl    $103, %esi
    imulq   $10486, %rax, %rax
    shrq    $20, %rax
    movl    %eax, %edx
    imull   %ecx, %edx
    subl    %edx, %edi
    sall    $16, %edi
    orl     %eax, %edi
    movl    %edi, %eax
    imulq   %rsi, %rax
    shrq    $10, %rax
    andl    $983055, %eax
    leal    (%rax,%rax,4), %edx
    addl    %edx, %edx
    subl    %edx, %edi
    sall    $8, %edi
    addl    %edi, %eax
    bsfl    %eax, %ecx
    addl    $808464432, %eax
    andl    $-8, %ecx
    shrl    %cl, %eax
    ret

@alnsn
Copy link

alnsn commented Dec 24, 2017

On Intel(R) Xeon(R) CPU E3-1245 v6 @ 3.70GHz (https://ark.intel.com/products/97473/Intel-Xeon-Processor-E3-1245-v6-8M-Cache-3_70-GHz) running NetBSD 8.0 (beta).

$ gcc -O3 -march=corei7-avx -static u4.c
$ ./a.out
serial
124 126 128
2102 2104 2110
2196 2198 2204
2498 2534 2540
throughput
3820 3824 3832
5288 5294 5304
5880 5898 5924
8718 8754 8790

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment