Skip to content

Instantly share code, notes, and snippets.

@sh1boot
Created March 19, 2024 16:43
Show Gist options
  • Save sh1boot/ca35223a67637a83ef5f8689d05255ff to your computer and use it in GitHub Desktop.
Save sh1boot/ca35223a67637a83ef5f8689d05255ff to your computer and use it in GitHub Desktop.
adler32 generic SIMD calculation method
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cstdint>
#define VECLEN 16
#define ADLER_MOD 65521
typedef uint8_t vuint8_t __attribute__((ext_vector_type(VECLEN)));
typedef uint16_t vuint16_t __attribute__((ext_vector_type(VECLEN)));
typedef uint32_t vuint32_t __attribute__((ext_vector_type(VECLEN)));
uint32_t dut(uint32_t sum_in, uint8_t const* buffer, size_t count) {
vuint16_t asum16 = { 0 };
vuint16_t bsum16 = { 0 };
vuint16_t binc16 = { 0 };
size_t head = count % VECLEN;
if (head > 0) {
uint8_t tmp[VECLEN] = { 0 };
memcpy(tmp + VECLEN - head, buffer, head);
vuint8_t in;
memcpy(&in, tmp, VECLEN);
vuint16_t in16 = __builtin_convertvector(in, vuint16_t);
asum16 = in16;
binc16 = in16;
}
for (size_t i = head; i + VECLEN <= count; i += VECLEN) {
vuint8_t in;
memcpy(&in, buffer + i, VECLEN);
vuint16_t in16 = __builtin_convertvector(in, vuint16_t);
auto old = bsum16; // carry possible every iteration
bsum16 += binc16;
for (int i = 0; i < VECLEN; ++i) bsum16[i] -= (bsum16[i] < old[i]) ? ADLER_MOD : 0;
old = asum16; // carry possible once per 256 iterations
asum16 += in16;
for (int i = 0; i < VECLEN; ++i) asum16[i] -= (asum16[i] < old[i]) ? ADLER_MOD : 0;
old = binc16; // carry possible once per 256 iterations, but we need to know which iteration for the cumulative effect on bsum16.
binc16 += in16;
for (int i = 0; i < VECLEN; ++i) binc16[i] -= (binc16[i] < old[i]) ? ADLER_MOD : 0;
}
vuint32_t asum32 = __builtin_convertvector(asum16, vuint32_t);
vuint32_t bsum32 = __builtin_convertvector(bsum16, vuint32_t);
vuint32_t binc32 = __builtin_convertvector(binc16, vuint32_t);
constexpr vuint32_t off = { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 }; // TODO: don't hard-code VECLEN
bsum32 = binc32 * off + bsum32 * VECLEN;
asum16 = __builtin_convertvector(asum32 % ADLER_MOD, vuint16_t);
bsum16 = __builtin_convertvector(bsum32 % ADLER_MOD, vuint16_t);
uint32_t a = sum_in & 0xffff;
uint32_t b = ((sum_in >> 16) + a * (count % ADLER_MOD)) % ADLER_MOD;
for (int i = 0; i < VECLEN; ++i) {
a += asum16[i];
b += bsum16[i];
}
a %= ADLER_MOD;
b %= ADLER_MOD;
return (b << 16) | a;
}
// Reference Adler-32 update, one byte at a time.
//
// sum_in holds the starting checksum (a in the low 16 bits, b in the high
// 16 bits); the result uses the same layout.  Both halves are reduced modulo
// ADLER_MOD after every byte.
uint32_t ref(uint32_t sum_in, uint8_t const* buffer, size_t count) {
    uint64_t lo = sum_in & 0xffff;
    uint64_t hi = sum_in >> 16;
    for (size_t left = count; left != 0; --left) {
        lo += *buffer++ & 255;
        hi = (hi + lo) % ADLER_MOD;  // uses lo before its own reduction
        lo %= ADLER_MOD;
    }
    return (hi << 16) | lo;
}
// Randomised comparison of dut() against ref().
//
// Each round picks a length (rounded down to a multiple of VECLEN), checks
// that both implementations agree on the current buffer, then perturbs one
// byte of the buffer and chains the reference checksum into the next round.
// Reporting stops after ten mismatches.
int main(void) {
    uint8_t testbuf[4096] = { 0 };
    int failures_left = 10;
    uint32_t seed_sum = 1;
    uint32_t buffer_sum = 0;

    int i = 0;
    while (i < 1000000) {
        const size_t raw_len = rand() % 4096 + 1;
        const size_t len = raw_len - raw_len % VECLEN;

        const uint32_t got = dut(seed_sum, testbuf, len);
        const uint32_t want = ref(seed_sum, testbuf, len);
        if (got != want) {
            printf("i:%d, bs: 0x%04x len:%zu(r:%zu) 0x%08x != 0x%08x\n", i, buffer_sum & 0xffff, len, len % VECLEN, got, want);
            failures_left -= 1;
            if (failures_left <= 0) {
                break;
            }
        }

        // Mutate one byte per round so the buffer contents keep changing.
        const int delta = rand() & 255;
        testbuf[i % 4096] += delta;
        buffer_sum += delta;
        seed_sum = want;
        ++i;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment