Skip to content

Instantly share code, notes, and snippets.

@alnsn
Last active September 24, 2023 11:04
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save alnsn/83ae6391c66bc1f117b9b6b5fbf2c331 to your computer and use it in GitHub Desktop.
Save alnsn/83ae6391c66bc1f117b9b6b5fbf2c331 to your computer and use it in GitHub Desktop.
/*
* Copyright (C) 2016 Alexander Nasonov.
* (WINT_R macro) Copyright (C) 2005-2016 Mike Pall.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
*/
#include <assert.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include <tmmintrin.h>
/* Comment these declarations out to benchmark inlined functions. */
static void fast04u(unsigned int, char *) __attribute__((noinline));
static void slow04u(unsigned int, char *) __attribute__((noinline));
static void fast08u(unsigned int, char *) __attribute__((noinline));
static void slow08u(unsigned int, char *) __attribute__((noinline));
void test(char *buf) __attribute__((noinline));
/*
* Given an integer u from 0 to 9999, we want to perform 3 divisions
* by constants 10, 100 and 1000 in parallel and calculate four digits
* u - u/10*10, u/10 - u/100*10, etc. These digits can be shuffled,
* converted to ascii and stored in memory as four consecutive bytes.
*
* One common approach to constant division is double-width multiplication
* by a magic constant and shifting high-word to the right by a constant
* number of bits.
*
* Double-width multiplication in xmm register can be done with pmuludq
* but it operates on two 32-bit words while we need at least three
* multiplications. For u that fits into 16-bit word, we can try pmaddwd
* which multiplies eight signed 16-bit words, takes sums of pairs and
* stores the results in four 32-bit words.
*
* The algorithm below uses these magic multiplications:
*
* u/10 : u * 26215 / 2^18,
* u/100 : u * 10486 / 2^20,
* u/1000 : u * 8389 / 2^23.
*
* The shifts are all different but it doesn't matter. Instead of
* shifting to the right, low bits are masked and values are later
* multiplied to scale the results by 256.
*/
static inline __m128i d4toa(unsigned int u)
{
/*
* Multiply u by -65536 (to move -u to the high word),
* 26215 (magic for u/10), 10486 (magic for u/100) and
* 8389 (magic for u/1000).
*/
const __m128i first_madd =
_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);
/*
* Zero-out 18 low bits of u*26215, 20 low bits of u*10486
* and 23 low bits of u*8389:
* [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
*/
const __m128i mask =
_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);
/*
* Input value
*
* [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
*
* is multiplied to produce 4 scaled digits:
*
* [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
* (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
*/
const __m128i second_madd =
_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);
/*
* Shuffle digits to low bytes and OR with ascii zeroes.
* Only low 32-bit word matter, three other words can
* have any values.
*/
const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);
__m128i x;
//assert(u <= 9999);
x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
x = _mm_and_si128(x, mask);
x = _mm_or_si128(x, _mm_slli_si128(x, 2));
x = _mm_madd_epi16(x, second_madd);
x = _mm_shuffle_epi8(x, shuffle);
x = _mm_or_si128(x, ascii_zero);
return x;
}
static void fast04u(unsigned int u, char *p)
{
*(int *)p = _mm_cvtsi128_si32(d4toa(u));
}
static void fast08u(unsigned int u, char *p)
{
unsigned int v = u / 10000;
u -= v * 10000;
*(int *)(p + 0) = _mm_cvtsi128_si32(d4toa(v));
*(int *)(p + 4) = _mm_cvtsi128_si32(d4toa(u));
}
/* This macro comes from LuaJIT 2.1, it's defined in src/lj_strfmt.c. */
#define WINT_R(x, sh, sc) \
{ uint32_t d = (x*(((1<<sh)+sc-1)/sc))>>sh; x -= d*sc; *p++ = (char)('0'+d); }
static void slow04u(unsigned int u, char *p)
{
#if defined(WINT_R)
WINT_R(u, 23, 1000)
WINT_R(u, 12, 100)
WINT_R(u, 10, 10)
*p++ = (char)('0'+u);
#else
p[3] = '0' + u % 10; u /= 10;
p[2] = '0' + u % 10; u /= 10;
p[1] = '0' + u % 10; u /= 10;
p[0] = '0' + u % 10;
#endif
}
static void slow08u(unsigned int u, char *p)
{
unsigned int v = u / 10000;
u -= v * 10000;
#if defined(WINT_R)
WINT_R(v, 23, 1000)
WINT_R(v, 12, 100)
WINT_R(v, 10, 10)
*p++ = (char)('0'+v);
WINT_R(u, 23, 1000)
WINT_R(u, 12, 100)
WINT_R(u, 10, 10)
*p++ = (char)('0'+u);
#else
p[7] = '0' + u % 10; u /= 10;
p[6] = '0' + u % 10; u /= 10;
p[5] = '0' + u % 10; u /= 10;
p[4] = '0' + u % 10; u /= 10;
p[3] = '0' + u % 10; u /= 10;
p[2] = '0' + u % 10; u /= 10;
p[1] = '0' + u % 10; u /= 10;
p[0] = '0' + u % 10;
#endif
}
#undef WINT_R
void test(char *buf)
{
char expected[8];
int i;
for (i = 0; i <= 9999; i++) {
sprintf(expected, "%04d", i);
fast04u(i, buf);
if (memcmp(buf, expected, 4) != 0)
errx(EXIT_FAILURE, "%.4s != %s", buf, expected);
}
}
/*
* cc -DNDEBUG -O3 -mssse3 [-march=your favourite arch] fast-uint-conv.c
*
* ./a.out # quick test
*
* time ./a.out 1e9 # benchmark fast08u()
* time ./a.out -1e9 # benchmark slow08u()
*/
int main(int argc, char *argv[])
{
long i, n;
char buf[4];
if (argc == 1) {
test(buf);
} else {
n = strtod(argv[1], NULL);
if (n > 0) {
for (i = n; i != 0; i--)
fast08u(i, buf);
} else {
for (i = -n; i != 0; i--)
slow08u(i, buf);
}
}
/* Foul the compiler. */
return (buf[0] ^ buf[1] ^ buf[2] ^ buf[3]) >> 6;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment