Skip to content

Instantly share code, notes, and snippets.

@kazuho
Last active September 22, 2023 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kazuho/56891a1a68a0e8d2ae93f56741b0972c to your computer and use it in GitHub Desktop.
Save kazuho/56891a1a68a0e8d2ae93f56741b0972c to your computer and use it in GitHub Desktop.
optimized fizzbuzz in C, using AVX and partial updates
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <immintrin.h>
#include <unistd.h>
#define inline __attribute__((always_inline))
static inline size_t bcd_incr(uint32_t *dest)
{
size_t i;
*dest += 0x1000000;
if (__builtin_expect(*dest < 0x3a000000, 1))
return 0;
*dest -= (uint32_t)0xa000000;
for (i = 1; i < sizeof(*dest); ++i) {
unsigned shift = 8 * (sizeof(*dest) - i - 1);
*dest += (uint32_t)1 << shift;
if (((*dest >> shift) & 0xff) <= '9')
break;
*dest -= (uint32_t)10 << shift;
}
return i;
}
static inline __m256i bcd_to_256(uint32_t n1, uint32_t n2, uint32_t n3, uint32_t n4, size_t digits)
{
static union {
uint32_t u64[4];
char i8[64];
} ret __attribute__((aligned(64)));
ret.u64[3] = n1;
ret.u64[2] = n2;
ret.u64[1] = n3;
ret.u64[0] = n4;
return _mm256_loadu_si256((__m256i *)(ret.i8 + 16 - digits));
}
static void fizzbuzz(int maxval)
{
char outbuf[32000], *outp = outbuf;
uint32_t bcd0 = 0x30303030, bcd1 = 0x30303030, bcd2 = 0x30303030, bcd3 = 0x30303030;
size_t bcdlen = 0;
int n = 0, n_start = 1, regen = 1;
#define FLUSH() \
do { \
write(1, outbuf, outp - outbuf); \
outp = outbuf; \
} while (0);
#define FLUSH_AND_EXIT() \
do { \
FLUSH(); \
return; \
} while (0)
#define PLUS100() \
do { \
size_t place = bcd_incr(&bcd0); \
if (place == 4) { \
regen = 2; \
if ((place += bcd_incr(&bcd1)) == 8) \
if ((place += bcd_incr(&bcd2)) == 12) \
place += bcd_incr(&bcd3); \
} \
if (bcdlen <= place) \
bcdlen = place + 1; \
} while (0)
#define EMIT(s1, s2, s3, do_regen) \
do { \
if (do_regen) { \
if (++n > do_maxval) \
FLUSH_AND_EXIT(); \
static const char s[] __attribute__((aligned(64))) = \
"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" s1 s2 s3 \
"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; \
_mm256_storeu_si256((__m256i *)outp, _mm256_or_si256(bcd256, _mm256_loadu_si256((__m256i *)(s + 31 - bcdlen)))); \
outp += bcdlen + sizeof(s1) - 1; \
if (sizeof(s2) != 1) { \
if (++n > do_maxval) \
FLUSH_AND_EXIT(); \
outp += sizeof(s2) - 1; \
if (sizeof(s3) != 1) { \
if (++n > do_maxval) \
FLUSH_AND_EXIT(); \
outp += sizeof(s3) - 1; \
} \
} \
} else { \
outp += bcdlen + sizeof(s1 s2 s3) - 1; \
*(uint32_t *)(outp - 4 - (sizeof(s1 s2 s3) - 1)) = bcd0; \
n += 1 + (sizeof(s2) != 1) + (sizeof(s3) != 1); \
} \
} while (0)
#define EMIT_30_1(tench, do_regen) \
do { \
EMIT(tench "1\n", "", "", do_regen); \
EMIT(tench "2\n", "fizz\n", "", do_regen); \
EMIT(tench "4\n", "buzz\n", "fizz\n", do_regen); \
EMIT(tench "7\n", "", "", do_regen); \
EMIT(tench "8\n", "fizz\n", "buzz\n", do_regen); \
} while (0)
#define EMIT_30_2(tench, do_regen) \
do { \
EMIT(tench "1\n", "fizz\n", "", do_regen); \
EMIT(tench "3\n", "", "", do_regen); \
EMIT(tench "4\n", "fizzbuzz\n", "", do_regen); \
EMIT(tench "6\n", "", "", do_regen); \
EMIT(tench "7\n", "fizz\n", "", do_regen); \
EMIT(tench "9\n", "buzz\n", "fizz\n", do_regen); \
} while (0)
#define EMIT_30_3(tench, do_regen) \
do { \
EMIT(tench "2\n", "", "", do_regen); \
EMIT(tench "3\n", "fizz\n", "buzz\n", do_regen); \
EMIT(tench "6\n", "fizz\n", "", do_regen); \
EMIT(tench "8\n", "", "", do_regen); \
EMIT(tench "9\n", "fizzbuzz\n", "", do_regen); \
} while (0)
#define EMIT300_1CORE(zero, do_regen) \
do { \
EMIT_30_1(zero, do_regen); \
EMIT_30_2("1", do_regen); \
EMIT_30_3("2", do_regen); \
EMIT_30_1("3", do_regen); \
EMIT_30_2("4", do_regen); \
EMIT_30_3("5", do_regen); \
EMIT_30_1("6", do_regen); \
EMIT_30_2("7", do_regen); \
EMIT_30_3("8", do_regen); \
EMIT_30_1("9", do_regen); \
} while (0)
#define EMIT300_1(zero) \
do { \
__m256i bcd256; \
if (__builtin_expect(regen || do_maxval != INT_MAX, 0)) { \
bcd256 = bcd_to_256(bcd0, bcd1, bcd2, bcd3, bcdlen); \
EMIT300_1CORE(zero, 1); \
} else { \
EMIT300_1CORE(zero, 0); \
} \
PLUS100(); \
} while (0)
#define EMIT300_2CORE(do_regen) \
do { \
EMIT_30_2("0", do_regen); \
EMIT_30_3("1", do_regen); \
EMIT_30_1("2", do_regen); \
EMIT_30_2("3", do_regen); \
EMIT_30_3("4", do_regen); \
EMIT_30_1("5", do_regen); \
EMIT_30_2("6", do_regen); \
EMIT_30_3("7", do_regen); \
EMIT_30_1("8", do_regen); \
EMIT_30_2("9", do_regen); \
} while (0)
#define EMIT300_2() \
do { \
__m256i bcd256; \
if (__builtin_expect(regen || do_maxval != INT_MAX, 0)) { \
bcd256 = bcd_to_256(bcd0, bcd1, bcd2, bcd3, bcdlen); \
EMIT300_2CORE(1); \
} else { \
EMIT300_2CORE(0); \
} \
PLUS100(); \
} while (0)
#define EMIT300_3CORE(do_regen) \
do { \
EMIT_30_3("0", do_regen); \
EMIT_30_1("1", do_regen); \
EMIT_30_2("2", do_regen); \
EMIT_30_3("3", do_regen); \
EMIT_30_1("4", do_regen); \
EMIT_30_2("5", do_regen); \
EMIT_30_3("6", do_regen); \
EMIT_30_1("7", do_regen); \
EMIT_30_2("8", do_regen); \
EMIT_30_3("9", do_regen); \
} while (0)
#define EMIT300_3() \
do { \
__m256i bcd256; \
if (__builtin_expect(regen || do_maxval != INT_MAX, 0)) { \
bcd256 = bcd_to_256(bcd0, bcd1, bcd2, bcd3, bcdlen); \
EMIT300_3CORE(1); \
} else { \
EMIT300_3CORE(0); \
} \
PLUS100(); \
} while (0)
#define EMIT300(zero, _maxval) \
do { \
int do_maxval = _maxval; \
EMIT300_1(zero); \
EMIT300_2(); \
EMIT300_3(); \
} while (0)
EMIT300("", maxval);
while (n + 300 < maxval) {
EMIT300("0", INT_MAX);
if (__builtin_expect(outp - outbuf >= sizeof(outbuf) - 4000, 0)) {
FLUSH();
if (regen != 0 && n >= 100000 && n_start >= 100000)
--regen;
n_start = n + 1;
}
}
EMIT300("0", maxval);
FLUSH();
}
int main(int argc, char **argv)
{
int maxval;
if (argc < 2 || sscanf(argv[1], "%d", &maxval) != 1) {
fprintf(stderr, "usage: %s <max-value>\n", argv[0]);
exit(1);
}
fizzbuzz(maxval);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment