Skip to content

Instantly share code, notes, and snippets.

@dougallj
Last active September 9, 2022 01:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dougallj/8acf4233e3534f3278bda5fdaa2f4051 to your computer and use it in GitHub Desktop.
AArch64 Neon in-place bit-reversal transposition of a table with 32-bit entries
#include <stdint.h>
#include <stddef.h>
#include <arm_neon.h>
#define RBIT32(x) __builtin_bitreverse32(x)
/*
 * In-place bit-reversal permutation of a table of (1 << table_bits)
 * 32-bit entries: entry i is exchanged with entry bit_reverse(i),
 * where the reversal is over the low table_bits bits.
 *
 * Strategy for table_bits >= 4: view the table as four quarters and
 * work on 4x4 blocks of entries with NEON. A block whose row offset i
 * is its own bit-reversal ("palindrome") is transposed onto itself;
 * otherwise the blocks at i and r = bit_reverse(i) are transposed and
 * exchanged — performed once, when r < i (the r > i partner is skipped).
 */

/* Transpose the self-paired 4x4 block at row offset i onto itself.
   Rows are the four quarters of the table; rows 1 and 2 are deliberately
   taken in swapped order (quarter*2 before quarter*1) because the 2-bit
   bit-reversal exchanges binary 01 and 10. */
static void bitrev4_palindrome(int32_t *table, size_t quarter, size_t i) {
    int32x4_t a = vld1q_s32(&table[i + quarter * 0]);
    int32x4_t b = vld1q_s32(&table[i + quarter * 2]);
    int32x4_t c = vld1q_s32(&table[i + quarter * 1]);
    int32x4_t d = vld1q_s32(&table[i + quarter * 3]);
    /* Two rounds of unzips == 4x4 transpose. */
    int32x4_t a1 = vuzp1q_s32(a, b);
    int32x4_t b1 = vuzp2q_s32(a, b);
    int32x4_t c1 = vuzp1q_s32(c, d);
    int32x4_t d1 = vuzp2q_s32(c, d);
    vst1q_s32(&table[i + quarter * 0], vuzp1q_s32(a1, c1));
    vst1q_s32(&table[i + quarter * 1], vuzp2q_s32(a1, c1));
    vst1q_s32(&table[i + quarter * 2], vuzp1q_s32(b1, d1));
    vst1q_s32(&table[i + quarter * 3], vuzp2q_s32(b1, d1));
}

/* Transpose the 4x4 blocks at row offsets i and r = bit_reverse(i)
   (i != r) and exchange them. The two blocks cover disjoint entries,
   so each block is fully loaded before its destination is stored. */
static void bitrev4_swap(int32_t *table, size_t quarter, size_t i, size_t r) {
    int32x4_t a = vld1q_s32(&table[i + quarter * 0]);
    int32x4_t b = vld1q_s32(&table[i + quarter * 2]);
    int32x4_t c = vld1q_s32(&table[i + quarter * 1]);
    int32x4_t d = vld1q_s32(&table[i + quarter * 3]);
    int32x4_t a1 = vuzp1q_s32(a, b);
    int32x4_t b1 = vuzp2q_s32(a, b);
    int32x4_t c1 = vuzp1q_s32(c, d);
    int32x4_t d1 = vuzp2q_s32(c, d);
    /* Load r's block before overwriting it with i's transpose. */
    a = vld1q_s32(&table[r + quarter * 0]);
    b = vld1q_s32(&table[r + quarter * 2]);
    c = vld1q_s32(&table[r + quarter * 1]);
    d = vld1q_s32(&table[r + quarter * 3]);
    vst1q_s32(&table[r + quarter * 0], vuzp1q_s32(a1, c1));
    vst1q_s32(&table[r + quarter * 1], vuzp2q_s32(a1, c1));
    vst1q_s32(&table[r + quarter * 2], vuzp1q_s32(b1, d1));
    vst1q_s32(&table[r + quarter * 3], vuzp2q_s32(b1, d1));
    a1 = vuzp1q_s32(a, b);
    b1 = vuzp2q_s32(a, b);
    c1 = vuzp1q_s32(c, d);
    d1 = vuzp2q_s32(c, d);
    vst1q_s32(&table[i + quarter * 0], vuzp1q_s32(a1, c1));
    vst1q_s32(&table[i + quarter * 1], vuzp2q_s32(a1, c1));
    vst1q_s32(&table[i + quarter * 2], vuzp1q_s32(b1, d1));
    vst1q_s32(&table[i + quarter * 3], vuzp2q_s32(b1, d1));
}

void bitreverse_in_place(int32_t *table, int table_bits) {
    if (table_bits < 4) {
        /* Tables of 1 or 2 entries are unchanged by bit-reversal. */
        if (table_bits == 2) {
            /* 4 entries: reversal swaps binary 01 and 10. */
            int32_t a = table[1];
            int32_t b = table[2];
            table[1] = b;
            table[2] = a;
        } else if (table_bits == 3) {
            /* 8 entries: permuted to [0,4,2,6],[1,5,3,7] by
               interleaving the two halves. */
            int32x4x2_t data = vld1q_s32_x2(table);
            vst1q_s32(table, vtrn1q_s32(data.val[0], data.val[1]));
            vst1q_s32(table + 4, vtrn2q_s32(data.val[0], data.val[1]));
        }
        return;
    }
    size_t quarter = ((size_t)1 << table_bits) / 4;
    for (size_t i = 0; i < quarter; i += 4) {
        /* r = bit_reverse(i) over table_bits bits (i is a multiple of 4,
           so r lands in [0, quarter) with its low two bits clear). */
        uint32_t r = RBIT32((uint32_t)i) >> (32 - table_bits);
        if (r == i)
            bitrev4_palindrome(table, quarter, i);
        else if (r < i)
            bitrev4_swap(table, quarter, i, (size_t)r);
    }
}
#include <stdint.h>
#include <stddef.h>
#include <arm_neon.h>
/* Transpose the 4x4 block of table entries at row offset i onto itself.
   Used when i is its own bit-reversal, so the block maps to itself.
   Rows are the four quarters of the table, fetched in order 0,2,1,3
   (the 2-bit bit-reversal of 0,1,2,3); two unzip rounds then perform
   the 4x4 transpose. */
static void bitreverse_palindrome(int32_t *table, size_t quarter, size_t i) {
    int32x4_t r0 = vld1q_s32(&table[i + quarter * 0]);
    int32x4_t r1 = vld1q_s32(&table[i + quarter * 2]);
    int32x4_t r2 = vld1q_s32(&table[i + quarter * 1]);
    int32x4_t r3 = vld1q_s32(&table[i + quarter * 3]);
    /* First unzip round: split each row pair into even/odd lanes. */
    int32x4_t even01 = vuzp1q_s32(r0, r1);
    int32x4_t odd01  = vuzp2q_s32(r0, r1);
    int32x4_t even23 = vuzp1q_s32(r2, r3);
    int32x4_t odd23  = vuzp2q_s32(r2, r3);
    /* Second unzip round completes the transpose; store back in the
       same 0,2,1,3 quarter order. */
    vst1q_s32(&table[i + quarter * 0], vuzp1q_s32(even01, even23));
    vst1q_s32(&table[i + quarter * 1], vuzp2q_s32(even01, even23));
    vst1q_s32(&table[i + quarter * 2], vuzp1q_s32(odd01, odd23));
    vst1q_s32(&table[i + quarter * 3], vuzp2q_s32(odd01, odd23));
}
/* Transpose the 4x4 blocks at row offsets i and r and exchange them.
   Used for i != r with r = bit_reverse(i). The two blocks cover
   disjoint table entries, so both are loaded in full before either
   destination is written. */
static void bitreverse_swap(int32_t *table, size_t quarter, size_t i, size_t r) {
    /* Rows of each block, fetched in quarter order 0,2,1,3 (the 2-bit
       bit-reversal of 0,1,2,3). */
    int32x4_t x0 = vld1q_s32(&table[i + quarter * 0]);
    int32x4_t x1 = vld1q_s32(&table[i + quarter * 2]);
    int32x4_t x2 = vld1q_s32(&table[i + quarter * 1]);
    int32x4_t x3 = vld1q_s32(&table[i + quarter * 3]);
    int32x4_t y0 = vld1q_s32(&table[r + quarter * 0]);
    int32x4_t y1 = vld1q_s32(&table[r + quarter * 2]);
    int32x4_t y2 = vld1q_s32(&table[r + quarter * 1]);
    int32x4_t y3 = vld1q_s32(&table[r + quarter * 3]);
    /* Transpose block x (two unzip rounds) and store it at r. */
    int32x4_t xe01 = vuzp1q_s32(x0, x1);
    int32x4_t xo01 = vuzp2q_s32(x0, x1);
    int32x4_t xe23 = vuzp1q_s32(x2, x3);
    int32x4_t xo23 = vuzp2q_s32(x2, x3);
    vst1q_s32(&table[r + quarter * 0], vuzp1q_s32(xe01, xe23));
    vst1q_s32(&table[r + quarter * 1], vuzp2q_s32(xe01, xe23));
    vst1q_s32(&table[r + quarter * 2], vuzp1q_s32(xo01, xo23));
    vst1q_s32(&table[r + quarter * 3], vuzp2q_s32(xo01, xo23));
    /* Transpose block y and store it at i. */
    int32x4_t ye01 = vuzp1q_s32(y0, y1);
    int32x4_t yo01 = vuzp2q_s32(y0, y1);
    int32x4_t ye23 = vuzp1q_s32(y2, y3);
    int32x4_t yo23 = vuzp2q_s32(y2, y3);
    vst1q_s32(&table[i + quarter * 0], vuzp1q_s32(ye01, ye23));
    vst1q_s32(&table[i + quarter * 1], vuzp2q_s32(ye01, ye23));
    vst1q_s32(&table[i + quarter * 2], vuzp1q_s32(yo01, yo23));
    vst1q_s32(&table[i + quarter * 3], vuzp2q_s32(yo01, yo23));
}
/*
 * In-place bit-reversal permutation of a table of (1 << size) 32-bit
 * entries. NOTE(review): despite the name, `size` is the log2 of the
 * entry count (it plays the same role as `table_bits` in the other
 * version in this gist) — confirm with callers.
 *
 * The 4x4 NEON block transposes handle the top two and bottom two
 * index bits; the loops below only enumerate the middle bits, pairing
 * each middle-bit value with its reversal.
 */
void fancy_bitreverse_in_place(int32_t *table, size_t size) {
if (size < 4) {
/* Tables of 1 or 2 entries are unchanged by bit-reversal. */
if (size == 2) {
/* 4 entries: reversal swaps binary 01 and 10. */
int32_t a = table[1];
int32_t b = table[2];
table[1] = b;
table[2] = a;
} else if (size == 3) {
/* 8 entries: interleave the halves into [0,4,2,6],[1,5,3,7]. */
int32x4x2_t data = vld1q_s32_x2(table);
vst1q_s32(table, vtrn1q_s32(data.val[0], data.val[1]));
vst1q_s32(table+4, vtrn2q_s32(data.val[0], data.val[1]));
}
return;
}
size_t quarter = (size_t)1 << (size - 2);
/* Split point of the middle bits; when size is odd there is one extra
   middle bit, handled by the (size & 1) second calls below. */
size_t middle_bit = (size_t)1 << (size / 2);
if (size < 14) {
// bitreverse with lookup-table
/* One nibble per index, most significant nibble first: nibble k of
   this constant is the 4-bit bit-reversal of k (0,8,4,c,2,a,6,e,...). */
uint64_t bitrev = 0x84c2a6e195d3b7f;
for (size_t i = 0; i < middle_bit; i += 4) {
/* ri = bit-reversal of i within the index range, extracted by sliding
   the reversed nibble to the top and shifting it into place. */
size_t ri = ((bitrev << i) & (15ull << 60)) >> (66 - size);
for (size_t j = 0; j < i; j += 4) {
size_t rj = ((bitrev << j) & (15ull << 60)) >> (66 - size);
/* (v, rv) are a bit-reversal pair built from the (i, j) cross terms. */
size_t v = ri + j;
size_t rv = rj + i;
bitreverse_swap(table, quarter, v, rv);
if (size & 1)
bitreverse_swap(table, quarter, v + middle_bit, rv + middle_bit);
}
/* Diagonal case j == i: v is its own reversal. */
uint64_t v = i + ri;
bitreverse_palindrome(table, quarter, v);
if (size & 1)
bitreverse_palindrome(table, quarter, v + middle_bit);
}
return;
}
// big tables (16k entries / 64 KiB or larger)
#ifdef __clang__
// version for fast __builtin_bitreverse32
for (size_t i = 0; i < middle_bit; i += 4) {
/* Reverse i/4 in 32 bits, then shift the reversed bits down so they
   occupy the high middle-bit positions of the index. */
size_t ri = __builtin_bitreverse32(i >> 2) >> (34 - size);
for (size_t j = 0; j < i; j += 4) {
size_t rj = (__builtin_bitreverse32(j >> 2) >> (34 - size));
size_t v = ri + j;
size_t rv = rj + i;
bitreverse_swap(table, quarter, v, rv);
if (size & 1)
bitreverse_swap(table, quarter, v + middle_bit, rv + middle_bit);
}
/* Diagonal case j == i: v is its own reversal. */
uint64_t v = i + ri;
bitreverse_palindrome(table, quarter, v);
if (size & 1)
bitreverse_palindrome(table, quarter, v + middle_bit);
}
#else
// gray code version for fast __builtin_ctz
size_t top = (size_t)1 << (size - 1);
/* ii/jj count iterations; i/j visit indices in gray-code order and
   ri/rj track their bit-reversals incrementally: each step toggles
   bit ctz(counter) in the index and the mirrored bit in the reversal. */
for (size_t ii = 0, i = 0, ri = 0; ii < middle_bit;) {
for (size_t jj = 0, j = 0, rj = 0; jj < ii;) {
size_t v = ri + j;
size_t rv = rj + i;
bitreverse_swap(table, quarter, v, rv);
if (size & 1)
bitreverse_swap(table, quarter, v + middle_bit, rv + middle_bit);
jj += 4;
j ^= 1 << __builtin_ctz(jj);
rj ^= top >> __builtin_ctz(jj);
}
/* Diagonal case j == i: v is its own reversal. */
uint64_t v = i + ri;
bitreverse_palindrome(table, quarter, v);
if (size & 1)
bitreverse_palindrome(table, quarter, v + middle_bit);
ii += 4;
i ^= 1 << __builtin_ctz(ii);
ri ^= top >> __builtin_ctz(ii);
}
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment