-
-
Save pkhuong/a545b5dc072ee4050948c1910b58a014 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <err.h> | |
#include <inttypes.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <emmintrin.h> | |
#include <tmmintrin.h> | |
#include "cycle.h" | |
static inline __m128i d4toa(unsigned int u) | |
{ | |
/* | |
* Multiply u by -65536 (to move -u to the high word), | |
* 26215 (magic for u/10), 10486 (magic for u/100) and | |
* 8389 (magic for u/1000). | |
*/ | |
const __m128i first_madd = | |
_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389); | |
/* | |
* Zero-out 18 low bits of u*26215, 20 low bits of u*10486 | |
* and 23 low bits of u*8389: | |
* [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128]. | |
*/ | |
const __m128i mask = | |
_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0); | |
/* | |
* Input value | |
* | |
* [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0] | |
* | |
* is multiplied to produce 4 scaled digits: | |
* | |
* [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10, | |
* (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2] | |
*/ | |
const __m128i second_madd = | |
_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0); | |
/* | |
* Shuffle digits to low bytes and OR with ascii zeroes. | |
* Only low 32-bit word matter, three other words can | |
* have any values. | |
*/ | |
const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501); | |
const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030); | |
__m128i x; | |
//assert(u <= 9999); | |
x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd); | |
x = _mm_and_si128(x, mask); | |
x = _mm_or_si128(x, _mm_slli_si128(x, 2)); | |
x = _mm_madd_epi16(x, second_madd); | |
x = _mm_shuffle_epi8(x, shuffle); | |
x = _mm_or_si128(x, ascii_zero); | |
return x; | |
} | |
#define INLINE static inline __attribute__((always_inline)) | |
typedef __uint128_t uint128_t; | |
/* Powers of 10. */ | |
#define PO2 100ULL | |
#define PO4 10000ULL | |
#define PO8 100000000ULL | |
#define PO10 10000000000ULL | |
#define PO16 10000000000000000ULL | |
/* 64 bit's worth of '0' characters (i.e., 8 characters). */ | |
#define ZERO_CHARS 0x3030303030303030ULL | |
INLINE uint32_t | |
encode_hundreds(uint32_t hi, uint32_t lo) | |
{ | |
/* | |
* Pack everything in a single 32 bit value. | |
* | |
* merged = [ hi 0 lo 0 ] | |
*/ | |
uint32_t merged = hi | (lo << 16); | |
/* | |
* Fixed-point multiplication by 103/1024 ~= 1/10. | |
*/ | |
uint32_t tens = (merged * 103UL) >> 10; | |
/* | |
* Mask away garbage bits between our digits. | |
* | |
* tens = [ hi/10 0 lo/10 0 ] | |
* | |
* On a platform with more restricted literals (ARM, for | |
* instance), it may make sense to and-not with the middle | |
* bits. | |
*/ | |
tens &= (0xFUL << 16) | 0xFUL; | |
/* | |
* x mod 10 = x - 10 * (x div 10). | |
* | |
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ] | |
* | |
* Then insert these values between tens. Arithmetic instead | |
* of bitwise operation helps the compiler merge this with | |
* later increments by a constant (e.g., ZERO_CHARS). | |
*/ | |
return tens + ((merged - 10UL * tens) << 8); | |
} | |
INLINE void * | |
itoa_ten_thousand(void *out, unsigned int x) | |
{ | |
uint32_t buf; | |
unsigned int x_div_PO2, x_mod_PO2; | |
unsigned int zeros; | |
x_div_PO2 = (x * 10486UL) >> 20; | |
x_mod_PO2 = x - PO2 * x_div_PO2; | |
buf = encode_hundreds(x_div_PO2, x_mod_PO2); | |
/* | |
* Count leading (in memory, trailing in register: we're | |
* little endian) zero bytes: count leading zero bits and | |
* round down to 8. | |
*/ | |
zeros = __builtin_ctz(buf) & -8U; | |
buf += ZERO_CHARS; /* BCD -> ASCII. */ | |
buf >>= zeros; /* Shift away leading zero characters */ | |
memcpy(out, &buf, 4); | |
/* zeros is in bits; convert to bytes to find actual length. */ | |
return (char *)out + 4 - zeros / 8; | |
} | |
INLINE void * | |
itoa_ten_thousand_pad(void *out, unsigned int x) | |
{ | |
uint32_t buf; | |
unsigned int x_div_PO2, x_mod_PO2; | |
unsigned int zeros; | |
x_div_PO2 = (x * 10486UL) >> 20; | |
x_mod_PO2 = x - PO2 * x_div_PO2; | |
buf = encode_hundreds(x_div_PO2, x_mod_PO2); | |
buf += ZERO_CHARS; /* BCD -> ASCII. */ | |
memcpy(out, &buf, 4); | |
return (char *)out + 4; | |
} | |
static uint32_t fast04u(unsigned int) __attribute__((noinline)); | |
static uint32_t an_4u(unsigned int) __attribute__((noinline)); | |
static uint32_t an_04u(unsigned int) __attribute__((noinline)); | |
static uint32_t nop(unsigned int) __attribute__((noinline)); | |
static uint32_t fast04u(unsigned int u) | |
{ | |
uint32_t ret = _mm_cvtsi128_si32(d4toa(u)); | |
asm("# foo" : "+r"(ret) :: "memory"); | |
return ret; | |
} | |
static uint32_t an_4u(unsigned int u) | |
{ | |
uint32_t buf; | |
itoa_ten_thousand(&buf, u); | |
asm("" : "+r"(buf) :: "memory"); | |
return buf; | |
} | |
static uint32_t an_04u(unsigned int u) | |
{ | |
uint32_t buf; | |
itoa_ten_thousand_pad(&buf, u); | |
asm("" : "+r"(buf) :: "memory"); | |
return buf; | |
} | |
static uint32_t nop(unsigned int u) | |
{ | |
asm("" : "+r"(u)); | |
return u; | |
} | |
static void | |
print_u32(uint32_t buf) | |
{ | |
printf("%.4s\n", (char *)&buf); | |
return; | |
} | |
static int | |
cmp_u64(const void *vx, const void *vy) | |
{ | |
const uint64_t *x = vx; | |
const uint64_t *y = vy; | |
if (*x == *y) { | |
return 0; | |
} | |
return (*x < *y) ? -1 : 1; | |
} | |
static __attribute__((noinline)) void | |
report(uint64_t *times, size_t n) | |
{ | |
qsort(times, n, sizeof(*times), cmp_u64); | |
printf("%"PRIu64" %"PRIu64" %"PRIu64"\n", | |
times[n / 100], times[n / 10], times[n / 2]); | |
return; | |
} | |
static inline void | |
cpuid(void) | |
{ | |
uint32_t a, b, c, d; | |
asm("cpuid" | |
: "=a"(a), "=b"(b), "=c"(c), "=d"(d) | |
: "0"(0) | |
: "memory"); | |
return; | |
} | |
#define NRUNS 1000UL | |
#define K 100 | |
int | |
main() | |
{ | |
static uint64_t times[NRUNS]; | |
#define TEST(X) do { \ | |
print_u32(fast04u((X))); \ | |
print_u32(an_04u((X))); \ | |
print_u32(an_4u((X))); \ | |
} while (0) | |
/* TEST(0); */ | |
/* TEST(1); */ | |
/* TEST(12); */ | |
/* TEST(123); */ | |
/* TEST(1234); */ | |
/* TEST(1230); */ | |
/* TEST(9900); */ | |
asm("" ::: "memory"); | |
#define TEST_SERIAL(FUN) do { \ | |
memset(times, 0, sizeof(times)); \ | |
\ | |
for (size_t i = 0; i < NRUNS; i++) { \ | |
ticks begin, end; \ | |
uint32_t acc = 0; \ | |
\ | |
cpuid(); \ | |
begin = getticks(); \ | |
for (size_t j = 0; j < K; j++) { \ | |
asm("# foo" ::: "memory"); \ | |
acc = FUN (acc % 8192); \ | |
} \ | |
\ | |
cpuid(); \ | |
end = getticks(); \ | |
times[i] = elapsed(end, begin); \ | |
} \ | |
\ | |
report(times, NRUNS); \ | |
} while (0) | |
printf("serial\n"); | |
TEST_SERIAL(nop); | |
TEST_SERIAL(fast04u); | |
TEST_SERIAL(an_04u); | |
TEST_SERIAL(an_4u); | |
#define TEST_THROUGHPUT(FUN) do { \ | |
static uint32_t dst[10 * (K)]; \ | |
\ | |
memset(times, 0, sizeof(times)); \ | |
memset(dst, 0, sizeof(dst)); \ | |
\ | |
for (size_t i = 0; i < NRUNS; i++) { \ | |
ticks begin, end; \ | |
uint32_t acc = 0; \ | |
\ | |
cpuid(); \ | |
begin = getticks(); \ | |
for (size_t j = 0; j < 10 * (K); j++) { \ | |
dst[j] = FUN (j); \ | |
} \ | |
\ | |
cpuid(); \ | |
end = getticks(); \ | |
times[i] = elapsed(end, begin); \ | |
} \ | |
\ | |
report(times, NRUNS); \ | |
} while (0) | |
printf("throughput\n"); | |
TEST_THROUGHPUT(nop); | |
TEST_THROUGHPUT(fast04u); | |
TEST_THROUGHPUT(an_04u); | |
TEST_THROUGHPUT(an_4u); | |
return 0; | |
} |
On Intel(R) Xeon(R) CPU E3-1245 v6 @ 3.70GHz (https://ark.intel.com/products/97473/Intel-Xeon-Processor-E3-1245-v6-8M-Cache-3_70-GHz) running NetBSD 8.0 (beta).
$ gcc -O3 -march=corei7-avx -static u4.c
$ ./a.out
serial
124 126 128
2102 2104 2110
2196 2198 2204
2498 2534 2540
throughput
3820 3824 3832
5288 5294 5304
5880 5898 5924
8718 8754 8790
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On the same E5-2630L v2 @ 2.40GHz
For serial conversions, if leading 0s are OK, fast04u is ~15 cycles per conversion, VS 19 for AN w/ left padding, and 21 if we also remove the left padding.
For independent (high ILP) conversions, fast04u is ~2.6 cycles per conversion, VS 1.8 for AN w/ left padding and 4.3 if we also remove the left padding.
The E5 is pretty old. Newer machines might have improved the throughput ratio for SSE2 VS GPRs. The big win here is relaxing the requirement that there be no leading 0 (which some systems might parse as octal ;)
an_04u:
fast04u:
an_4u: