Skip to content

Instantly share code, notes, and snippets.

@RhodiumToad
Created September 20, 2019 03:50
Show Gist options
  • Save RhodiumToad/7f4a83e0e157e7500629025e53239c82 to your computer and use it in GitHub Desktop.
Save RhodiumToad/7f4a83e0e157e7500629025e53239c82 to your computer and use it in GitHub Desktop.
/*
* vectorized (requires SSSE3) implementation of binary to decimal conversion.
* This was mainly an intellectual exercise, any performance gain over methods
* that do 2-digit conversions with lookup table are not worth the effort.
*
* Currently works with clang but can be adapted to gcc.
*/
#include <stdint.h>
#include <stdbool.h>
static inline int
leftmost_one_pos64(uint64_t word)
{
return 63 - __builtin_clzll(word);
}
/*
* Note: value must not be 0.
*/
static int
decimalLength(uint64_t value)
{
const uint64_t powers10[19] = {
1U, 10U, 100U,
1000U, 10000U, 100000U,
1000000U, 10000000U, 100000000U,
1000000000U, 10000000000U, 100000000000U,
1000000000000U, 10000000000000U, 100000000000000U,
1000000000000000U, 10000000000000000U, 100000000000000000U,
1000000000000000000U };
const int t = (leftmost_one_pos64(value) + 1)*1233/4096;
return t + (value >= powers10[t]);
}
/*
* 32-bit platforms may need to optimize this.
*/
static inline uint64_t
div1e8(uint64_t v)
{
return v / 100000000;
}
#define VEC128BITS __attribute__((vector_size (16)))
#define UNALIGNED __attribute__((aligned (1)))
typedef uint8_t v16ub VEC128BITS;
typedef uint8_t v16ubu VEC128BITS UNALIGNED;
typedef int16_t v8sw VEC128BITS;
typedef uint16_t v8uw VEC128BITS;
typedef uint32_t v4ul VEC128BITS;
typedef uint64_t v2uq VEC128BITS;
typedef int16_t v8w VEC128BITS;
typedef int32_t v4l VEC128BITS;
typedef long long v2q VEC128BITS;
#define PMULUDQ(a_,b_) __builtin_ia32_pmuludq128(a_,b_)
#define PMADDWD(a_,b_) __builtin_ia32_pmaddwd128(a_,b_)
#define PMULHUW(a_,b_) __builtin_ia32_pmulhuw128(a_,b_)
/* These need different definitions on GCC */
#ifdef __clang__
#define SHUFFLE(a_, ...) __builtin_shufflevector(a_, a_, __VA_ARGS__)
#define SHUFFLE2(a_, b_, ...) __builtin_shufflevector(a_, b_, __VA_ARGS__)
#define PSLLDQ(a_, c_) __builtin_ia32_pslldqi128_byteshift(a_, c_)
#define PALIGNR(a_, b_, c_) __builtin_ia32_palignr128(a_, b_, c_)
#endif
/*
make_4x4digits
Input: two uint32_t each representing 8 decimal digits.
Call these D15_8 and D7_0.
The task: convert them to groups of 4 digits in uint16_t; each result
group has both the raw digit value and 2x that.
127 0
+-----------------+-----------------+-----------------+-----------------+
| | D7_0 | | D15_8 |
+-----------------+-----------------+-----------------+-----------------+
| - | ce(2^45/10000) | - | ce(2^45/10000) |
+-----------------+-----------------+-----------------+-----------------+
PMULUDQ
PSRLQ #13
+-----------------+-----------------+-----------------+-----------------+
| D7_0/10000 | xx | D15_8/10000 | xx |
| | = D7_4 | | | =D15_12 | |
+-----------------+-----------------+-----------------+-----------------+
PSHUFD
+-----------------+-----------------+-----------------+-----------------+
| 0 | D7_4 | 0 | D7_4 | 0 | D15_12 | 0 | D15_12 |
+-----------------+-----------------+-----------------+-----------------+
| 0 | 10000 | 0 | -1 | 0 | 10000 | 0 | -1 |
+-----------------+-----------------+-----------------+-----------------+
PMADDWD
+-----------------+-----------------+-----------------+-----------------+
| 10000 * D7_4 | -D7_4 | 10000*D15_12 | -D15_12 |
+-----------------+-----------------+-----------------+-----------------+
shuffle original input:
+-----------------+-----------------+-----------------+-----------------+
| D7_0 | 0 | D15_8 | 0 |
+-----------------+-----------------+-----------------+-----------------+
| 10000 * D7_4 | -D7_4 | 10000*D15_12 | -D15_12 |
+-----------------+-----------------+-----------------+-----------------+
PSUBD
+-----------------+-----------------+-----------------+-----------------+
| 0 | D3_0 | 0 | D7_4 | 0 | D11_8 | 0 | D15_12 |
+-----------------+-----------------+-----------------+-----------------+
PSLLD #17
POR
+-----------------+-----------------+-----------------+-----------------+
| 2xD3_0 | D3_0 | 2xD7_4 | D7_4 | 2xD11_8| D11_8 |2xD15_12| D15_12 |
+-----------------+-----------------+-----------------+-----------------+
*/
static inline v8uw
make_4x4digits(uint64_t hi8digits, uint64_t lo8digits)
{
v4ul eightdigits;
eightdigits[0] = hi8digits;
eightdigits[1] = 0;
eightdigits[2] = lo8digits;
eightdigits[3] = 0;
const v4ul div1e4const = { 3518437209, 0, 3518437209, 0 };
const v8uw mul1e4const = { -1, 0, 10000, 0, -1, 0, 10000, 0 };
const v4ul qx = (v4ul) ((v2uq) PMULUDQ(eightdigits, div1e4const) >> 13);
const v4ul q0 = SHUFFLE(qx, 1, 1, 3, 3);
const v4ul r0 = (v4ul) PSLLDQ(eightdigits, 4)
- (v4ul) PMADDWD((v8uw) q0, mul1e4const);
v8uw r1 = (v8uw)(r0 | (r0 << 17));
return (v8uw) r1;
}
/*
cvt_4digits
Input: uint16_t D3_0 representing a 4-digit decimal number.
The task: convert to 4 individual byte values 0-9 (binary).
The trick with starting with 2x the input is so that the input value can
be preserved through a MULHUW operation. This works because we know that
9999 * 2 won't overflow an int16_t.
127 0
+-----------------+-----------------+-----------------+-----------------+
| 0 | D3_0*2 | 0 | D3_0*2 | 0 | D3_0*2 | 0 | D3_0*2 |
+-----------------+-----------------+-----------------+-----------------+
| 0 | 0x8000 | 0 |2^16/10 | 0 |2^20/100| 0 |2^23/1e3|
+-----------------+-----------------+-----------------+-----------------+
PMULHUW
+-----------------+-----------------+-----------------+-----------------+
| 0 | D3_0 | 0 | D3_1*2 | 0 |D3_2<<5 | 0 |D3_1<<8 |
+-----------------+-----------------+-----------------+-----------------+
| 0 | 256 | 0 | 128 | 0 | 8 | 0 | 1 |
+-----------------+-----------------+-----------------+-----------------+
PMADDWD
PSHRLD #8
+-----------------+-----------------+-----------------+-----------------+
| 0 | D3_0 | 0 | D3_1 | 0 | D3_2 | 0 | D3 |
+-----------------+-----------------+-----------------+-----------------+
PSLLDQ #6
+-----------------+-----------------+-----------------+-----------------+
| D3_1 | 0 | D3_2 | 0 | D3 | 0 | 0 | 0 |
+-----------------+-----------------+-----------------+-----------------+
POR
+-----------------+-----------------+-----------------+-----------------+
| D3_1 | D3_0 | D3_2 | D3_1 | D3 | D3_2 | 0 | D3 |
+-----------------+-----------------+-----------------+-----------------+
| -10 | 1 | -10 | 1 | -10 | 1 | -10 | 1 |
+-----------------+-----------------+-----------------+-----------------+
PMADDWD
+-----------------+-----------------+-----------------+-----------------+
| D0 | D1 | D2 | D3 |
+-----------------+-----------------+-----------------+-----------------+
*/
const v8sw mul10constb = { 1, -10, 1, -10, 1, -10, 1, -10 };
static inline v8uw
cvt_4digits(uint16_t digits)
{
const v4ul topq0 = { digits*2, digits*2, digits*2, digits*2 };
const v8uw tmulconst = { 8389, 0, 10486, 0, 6554, 0, 0x8000, 0 };
const v8uw tq0 = PMULHUW((v8uw) topq0, tmulconst);
const v8uw tadj = { 1, 0, 8, 0, 128, 0, 256, 0 };
const v8uw tq1 = (v8uw) ((v4ul) PMADDWD(tq0, tadj) >> 8);
const v8uw tq2 = tq1 | (v8uw) PSLLDQ((v2uq) tq1, 6);
const v8uw res4 = (v8uw) PMADDWD(tq2, mul10constb);
return res4;
}
/*
cvt_16digits
Beginning with a vector of 4 4-digit numbers, repeated in pairs of 16-bit
lanes with the high lane doubled, (reversed order, high digits are in the low
lanes), convert to 16 ASCII digits (most-significant in the low-order byte).
127 0
+-----------------+-----------------+-----------------+-----------------+
| 2xD3_0 | D3_0 | 2xD7_4 | D7_4 | 2xD11_8| D11_8 |2xD15_12| D15_12 |
+-----------------+-----------------+-----------------+-----------------+
| 41944 | 8389 | 41944 | 8389 | 41944 | 8389 | 41944 | 8389 |
+-----------------+-----------------+-----------------+-----------------+
PMULHUW (note that 8389 = ceil(2^23/1000), 41944 = ceil(2^22/100))
PSRLW #7
+-----------------+-----------------+-----------------+-----------------+
|D3_0/1e2|D3_0/1e3|D7_4/1e2|D7_4/1e3|D11_8/ |D11_8/ |D15_12/ |D15_12/ |
|=D3_2 |=D3 |=D7_6 |=D7 |=D11_10 |=D11 |=D15_14 |=D15 | Q1
+-----------------+-----------------+-----------------+-----------------+
| 1 | -10 | 1 | -10 | 1 | -10 | 1 | -10 |
+-----------------+-----------------+-----------------+-----------------+
PMADDWD
+-----------------+-----------------+-----------------+-----------------+
| D3_2 - 10*D3 | D7_6 - 10*D7 | D11_10 - 10*D11 | D15_14 - 10*D15 |
| = D2 | = D6 | = D10 | = D14 |
+-----------------+-----------------+-----------------+-----------------+
PSLLD #8
+-----------------+-----------------+-----------------+-----------------+
| 0 | 0 | D2 | 0 | 0 | 0 | D6 | 0 | 0 | 0 |D10 | 0 | 0 | 0 |D14 | 0 | P1
+-----------------+-----------------+-----------------+-----------------+
+-----------------+-----------------+-----------------+-----------------+
| 2xD3_0 | D3_0 | 2xD7_4 | D7_4 | 2xD11_8| D11_8 |2xD15_12| D15_12 |
+-----------------+-----------------+-----------------+-----------------+
| 52429 | 0 | 52429 | 0 | 52429 | 0 | 52429 | 0 |
+-----------------+-----------------+-----------------+-----------------+
PMULHUW (note 52429 = ceil(2^19/10))
PSHRLW #4
+-----------------+-----------------+-----------------+-----------------+
| D3_0/10| 0 | D7_4/10| 0 |D11_8/10| 0 |D15_12/ | 0 |
|=D3_1 | |=D7_5 | |=D11_9 | |=D15_13 | |
+-----------------+-----------------+-----------------+-----------------+
merge alternate lanes with FOURDIGITS:
+-----------------+-----------------+-----------------+-----------------+
| D3_1 | D3_0 | D7_5 | D7_4 | D11_9 | D11_8 | D15_13 | D15_12 | Q2
+-----------------+-----------------+-----------------+-----------------+
| -10 | 1 | -10 | 1 | -10 | 1 | -10 | 1 |
+-----------------+-----------------+-----------------+-----------------+
PMADDWD
+-----------------+-----------------+-----------------+-----------------+
| D3_0 - 10*D3_1 | D7_4 - 10*D7_5 | D11_8 - 10*D11_9| D15_12-10*D15_13|
| = D0 | = D4 | = D8 | = D12 |
+-----------------+-----------------+-----------------+-----------------+
PSLLD #24
+-----------------+-----------------+-----------------+-----------------+
| D0 | 0 | 0 | 0 | D4 | 0 | 0 | 0 | D8 | 0 | 0 | 0 | D12| 0 | 0 | 0 | P2
+-----------------+-----------------+-----------------+-----------------+
+-----------------+-----------------+-----------------+-----------------+
| D3_2 | D3 | D7_6 | D7 | D11_10 | D11 | D15_14 | D15 | Q1
+-----------------+-----------------+-----------------+-----------------+
| -10 | 1 | -10 | 1 | -10 | 1 | -10 | 1 |
+-----------------+-----------------+-----------------+-----------------+
PMULLW
+-----------------+-----------------+-----------------+-----------------+
| D3_1 | 0 | D7_5 | 0 | D11_9 | 0 | D15_13 | 0 | Q2&mask
+-----------------+-----------------+-----------------+-----------------+
PADDW
+-----------------+-----------------+-----------------+-----------------+
| D1 | D3 | D5 | D7 | D9 | D11 | D13 | D15 | P3
+-----------------+-----------------+-----------------+-----------------+
switch to 16 lanes:
+-----------------+-----------------+-----------------+-----------------+
| 0 |D1 | 0 |D3 | 0 |D5 | 0 |D7 | 0 |D9 | 0 |D11| 0 |D13| 0 |D15|
+-----------------+-----------------+-----------------+-----------------+
| 0 | 0 | D2 | 0 | 0 | 0 | D6 | 0 | 0 | 0 |D10 | 0 | 0 | 0 |D14 | 0 | P1
+-----------------+-----------------+-----------------+-----------------+
| D0 | 0 | 0 | 0 | D4 | 0 | 0 | 0 | D8 | 0 | 0 | 0 | D12| 0 | 0 | 0 | P2
+-----------------+-----------------+-----------------+-----------------+
POR
POR
+-----------------+-----------------+-----------------+-----------------+
| D0 |D1 | D2 |D3 | D4 | D5| D6 |D7 | D8 |D9 |D10 |D11| D12|D13| D14|D15|
+-----------------+-----------------+-----------------+-----------------+
*/
static inline v16ub
cvt_16digits(v8uw fourdigits)
{
const v8uw div1e23 = { 8389, 41944, 8389, 41944, 8389, 41944, 8389, 41944 };
const v8uw q1 = ((v8uw) PMULHUW((v8uw) fourdigits, div1e23)) >> 7;
const v8uw div1e1 = { 0, 52429, 0, 52429, 0, 52429, 0, 52429 };
const v8uw q2 = SHUFFLE2( ((v8uw) PMULHUW((v8uw) fourdigits, div1e1) >> 4),
(v8uw)fourdigits,
8, 1, 10, 3, 12, 5, 14, 7 );
const v8sw mul10const = { -10, 1, -10, 1, -10, 1, -10, 1 };
const v4ul p1 = (v4ul) PMADDWD((v8sw) q1, mul10const) << 8;
const v4ul p2 = (v4ul) PMADDWD((v8sw) q2, mul10constb) << 24;
const v8uw mask3 = { 0, 0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff };
const v8sw px1 = (v8sw)(q2 & mask3);
const v8sw px2 = ((v8sw)q1 * mul10constb);
const v8uw p3 = (v8uw)(px1 + px2);
const v16ub res = (v16ub)(p1) | (v16ub)(p2) | (v16ub)(p3);
return res;
}
/*
* my_lltoa_n
*
* Given an unsigned 64-bit input, emit decimal digits into a buffer and
* return the number of valid digits. No null terminator is written. The
* function may write garbage after the valid digits, but will not use more
* than 20 bytes.
*/
int
my_ulltoa_n(uint64_t value, char *a)
{
uint32_t olength;
/*
* 0 is too much of a special case below, so fastpath out.
*/
if (value == 0)
{
*a = '0';
return 1;
}
olength = decimalLength(value);
uint16_t topdigits = 0;
uint64_t hi8digits;
uint64_t lo8digits;
if (olength > 16)
{
const uint64_t q1 = div1e8(value);
const uint16_t q2 = div1e8(q1);
lo8digits = (uint32_t) (value - 100000000 * q1);
hi8digits = (uint32_t) (q1 - 100000000 * q2);
topdigits = q2;
}
else if (olength > 8)
{
const uint64_t q1 = div1e8(value);
lo8digits = (uint32_t) (value - 100000000 * q1);
hi8digits = q1;
}
else
{
lo8digits = value;
hi8digits = 0;
}
const v8uw fourdigits = make_4x4digits(hi8digits, lo8digits);
const v8uw res4 = cvt_4digits(topdigits) | 0x3030;
const v16ub res = cvt_16digits(fourdigits) | 0x30;
// At this point:
// res = d0..d15
// res4 = 0,0,0,d16,0,0,0,d17,0,0,0,d18,0,0,0,d19
uint8_t d19 = res4[0];
uint8_t d18 = res4[2];
uint8_t d17 = res4[4];
uint8_t d16 = res4[6];
switch (olength)
{
default:
case 20:
a[0] = d19;
a[1] = d18;
a[2] = d17;
a[3] = d16;
*(v16ubu *)(a + 4) = res;
return olength;
case 19:
a[0] = d18;
a[1] = d17;
a[2] = d16;
*(v16ubu *)(a + 3) = res;
return olength;
case 18:
a[0] = d17;
a[1] = d16;
*(v16ubu *)(a + 2) = res;
return olength;
case 17:
a[0] = d16;
*(v16ubu *)(a + 1) = res;
return olength;
case 16:
*(v16ubu *)a = res;
break;
case 15: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 1); break;
case 14: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 2); break;
case 13: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 3); break;
case 12: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 4); break;
case 11: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 5); break;
case 10: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 6); break;
case 9: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 7); break;
case 8: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 8); break;
case 7: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 9); break;
case 6: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 10); break;
case 5: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 11); break;
case 4: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 12); break;
case 3: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 13); break;
case 2: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 14); break;
case 1: *(v16ubu *)a = (v16ub) PALIGNR((v2q) res, (v2q) res, 15); break;
}
return olength;
}
/*
* Convert signed 64-bit to null-terminated string.
*
* Caller must ensure buffer is long enough.
*/
void
my_lltoa(int64_t value, char *a)
{
int len;
uint64_t uvalue = value;
if (value < 0)
{
*a++ = '-';
/*
* This works even for INT64_MIN:
*
* uvalue is required to be congruent to value mod UINT64_MAX+1 (the
* conversion is defined behavior). (uint64_t)0 - uvalue is required to
* be whatever value in [0,UINT64_MAX] results from adding or
* subtracting multiples of UINT64_MAX+1 from the result.
*/
uvalue = (uint64_t)0 - uvalue;
}
len = my_ulltoa_n(uvalue, a);
a[len] = 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment