-
-
Save dougallj/8acf4233e3534f3278bda5fdaa2f4051 to your computer and use it in GitHub Desktop.
AArch64 Neon in-place bit-reversal transposition of a table with 32-bit entries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <stddef.h> | |
#include <arm_neon.h> | |
#define RBIT32(x) __builtin_bitreverse32(x) | |
void bitreverse_in_place(int32_t *table, int table_bits) { | |
if (table_bits < 4) { | |
if (table_bits == 2) { | |
int32_t a = table[1]; | |
int32_t b = table[2]; | |
table[1] = b; | |
table[2] = a; | |
} else if (table_bits == 3) { | |
int32x4x2_t data = vld1q_s32_x2(table); | |
vst1q_s32(table, vtrn1q_s32(data.val[0], data.val[1])); | |
vst1q_s32(table+4, vtrn2q_s32(data.val[0], data.val[1])); | |
} | |
return; | |
} | |
size_t quarter = ((size_t)1 << table_bits) / 4; | |
for (size_t i = 0; i < quarter; i += 4) { | |
uint32_t r = RBIT32(i) >> (32 - table_bits); | |
if (r == i) { | |
int32x4_t a = vld1q_s32(&table[i + quarter * 0]); | |
int32x4_t b = vld1q_s32(&table[i + quarter * 2]); | |
int32x4_t c = vld1q_s32(&table[i + quarter * 1]); | |
int32x4_t d = vld1q_s32(&table[i + quarter * 3]); | |
int32x4_t a1 = vuzp1q_s32(a, b); | |
int32x4_t b1 = vuzp2q_s32(a, b); | |
int32x4_t c1 = vuzp1q_s32(c, d); | |
int32x4_t d1 = vuzp2q_s32(c, d); | |
int32x4_t a2 = vuzp1q_s32(a1, c1); | |
int32x4_t c2 = vuzp2q_s32(a1, c1); | |
int32x4_t b2 = vuzp1q_s32(b1, d1); | |
int32x4_t d2 = vuzp2q_s32(b1, d1); | |
vst1q_s32(&table[r + quarter * 0], a2); | |
vst1q_s32(&table[r + quarter * 2], b2); | |
vst1q_s32(&table[r + quarter * 1], c2); | |
vst1q_s32(&table[r + quarter * 3], d2); | |
} else if (r < i) { | |
int32x4_t a = vld1q_s32(&table[i + quarter * 0]); | |
int32x4_t b = vld1q_s32(&table[i + quarter * 2]); | |
int32x4_t c = vld1q_s32(&table[i + quarter * 1]); | |
int32x4_t d = vld1q_s32(&table[i + quarter * 3]); | |
int32x4_t a1 = vuzp1q_s32(a, b); | |
int32x4_t b1 = vuzp2q_s32(a, b); | |
int32x4_t c1 = vuzp1q_s32(c, d); | |
int32x4_t d1 = vuzp2q_s32(c, d); | |
int32x4_t a2 = vuzp1q_s32(a1, c1); | |
int32x4_t c2 = vuzp2q_s32(a1, c1); | |
int32x4_t b2 = vuzp1q_s32(b1, d1); | |
int32x4_t d2 = vuzp2q_s32(b1, d1); | |
a = vld1q_s32(&table[r + quarter * 0]); | |
b = vld1q_s32(&table[r + quarter * 2]); | |
c = vld1q_s32(&table[r + quarter * 1]); | |
d = vld1q_s32(&table[r + quarter * 3]); | |
vst1q_s32(&table[r + quarter * 0], a2); | |
vst1q_s32(&table[r + quarter * 2], b2); | |
vst1q_s32(&table[r + quarter * 1], c2); | |
vst1q_s32(&table[r + quarter * 3], d2); | |
a1 = vuzp1q_s32(a, b); | |
b1 = vuzp2q_s32(a, b); | |
c1 = vuzp1q_s32(c, d); | |
d1 = vuzp2q_s32(c, d); | |
a2 = vuzp1q_s32(a1, c1); | |
c2 = vuzp2q_s32(a1, c1); | |
b2 = vuzp1q_s32(b1, d1); | |
d2 = vuzp2q_s32(b1, d1); | |
vst1q_s32(&table[i + quarter * 0], a2); | |
vst1q_s32(&table[i + quarter * 2], b2); | |
vst1q_s32(&table[i + quarter * 1], c2); | |
vst1q_s32(&table[i + quarter * 3], d2); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <stddef.h> | |
#include <arm_neon.h> | |
static void bitreverse_palindrome(int32_t *table, size_t quarter, size_t i) { | |
int32x4_t a = vld1q_s32(&table[i + quarter * 0]); | |
int32x4_t b = vld1q_s32(&table[i + quarter * 2]); | |
int32x4_t c = vld1q_s32(&table[i + quarter * 1]); | |
int32x4_t d = vld1q_s32(&table[i + quarter * 3]); | |
int32x4_t a1 = vuzp1q_s32(a, b); | |
int32x4_t b1 = vuzp2q_s32(a, b); | |
int32x4_t c1 = vuzp1q_s32(c, d); | |
int32x4_t d1 = vuzp2q_s32(c, d); | |
int32x4_t a2 = vuzp1q_s32(a1, c1); | |
int32x4_t c2 = vuzp2q_s32(a1, c1); | |
int32x4_t b2 = vuzp1q_s32(b1, d1); | |
int32x4_t d2 = vuzp2q_s32(b1, d1); | |
vst1q_s32(&table[i + quarter * 0], a2); | |
vst1q_s32(&table[i + quarter * 2], b2); | |
vst1q_s32(&table[i + quarter * 1], c2); | |
vst1q_s32(&table[i + quarter * 3], d2); | |
} | |
static void bitreverse_swap(int32_t *table, size_t quarter, size_t i, size_t r) { | |
int32x4_t a = vld1q_s32(&table[i + quarter * 0]); | |
int32x4_t b = vld1q_s32(&table[i + quarter * 2]); | |
int32x4_t c = vld1q_s32(&table[i + quarter * 1]); | |
int32x4_t d = vld1q_s32(&table[i + quarter * 3]); | |
int32x4_t a1 = vuzp1q_s32(a, b); | |
int32x4_t b1 = vuzp2q_s32(a, b); | |
int32x4_t c1 = vuzp1q_s32(c, d); | |
int32x4_t d1 = vuzp2q_s32(c, d); | |
int32x4_t a2 = vuzp1q_s32(a1, c1); | |
int32x4_t c2 = vuzp2q_s32(a1, c1); | |
int32x4_t b2 = vuzp1q_s32(b1, d1); | |
int32x4_t d2 = vuzp2q_s32(b1, d1); | |
a = vld1q_s32(&table[r + quarter * 0]); | |
b = vld1q_s32(&table[r + quarter * 2]); | |
c = vld1q_s32(&table[r + quarter * 1]); | |
d = vld1q_s32(&table[r + quarter * 3]); | |
vst1q_s32(&table[r + quarter * 0], a2); | |
vst1q_s32(&table[r + quarter * 2], b2); | |
vst1q_s32(&table[r + quarter * 1], c2); | |
vst1q_s32(&table[r + quarter * 3], d2); | |
a1 = vuzp1q_s32(a, b); | |
b1 = vuzp2q_s32(a, b); | |
c1 = vuzp1q_s32(c, d); | |
d1 = vuzp2q_s32(c, d); | |
a2 = vuzp1q_s32(a1, c1); | |
c2 = vuzp2q_s32(a1, c1); | |
b2 = vuzp1q_s32(b1, d1); | |
d2 = vuzp2q_s32(b1, d1); | |
vst1q_s32(&table[i + quarter * 0], a2); | |
vst1q_s32(&table[i + quarter * 2], b2); | |
vst1q_s32(&table[i + quarter * 1], c2); | |
vst1q_s32(&table[i + quarter * 3], d2); | |
} | |
void fancy_bitreverse_in_place(int32_t *table, size_t size) { | |
if (size < 4) { | |
if (size == 2) { | |
int32_t a = table[1]; | |
int32_t b = table[2]; | |
table[1] = b; | |
table[2] = a; | |
} else if (size == 3) { | |
int32x4x2_t data = vld1q_s32_x2(table); | |
vst1q_s32(table, vtrn1q_s32(data.val[0], data.val[1])); | |
vst1q_s32(table+4, vtrn2q_s32(data.val[0], data.val[1])); | |
} | |
return; | |
} | |
size_t quarter = (size_t)1 << (size - 2); | |
size_t middle_bit = (size_t)1 << (size / 2); | |
if (size < 14) { | |
// bitreverse with lookup-table | |
uint64_t bitrev = 0x84c2a6e195d3b7f; | |
for (size_t i = 0; i < middle_bit; i += 4) { | |
size_t ri = ((bitrev << i) & (15ull << 60)) >> (66 - size); | |
for (size_t j = 0; j < i; j += 4) { | |
size_t rj = ((bitrev << j) & (15ull << 60)) >> (66 - size); | |
size_t v = ri + j; | |
size_t rv = rj + i; | |
bitreverse_swap(table, quarter, v, rv); | |
if (size & 1) | |
bitreverse_swap(table, quarter, v + middle_bit, rv + middle_bit); | |
} | |
uint64_t v = i + ri; | |
bitreverse_palindrome(table, quarter, v); | |
if (size & 1) | |
bitreverse_palindrome(table, quarter, v + middle_bit); | |
} | |
return; | |
} | |
// big tables (16k entries/66kb or larger) | |
#ifdef __clang__ | |
// version for fast __builtin_bitreverse32 | |
for (size_t i = 0; i < middle_bit; i += 4) { | |
size_t ri = __builtin_bitreverse32(i >> 2) >> (34 - size); | |
for (size_t j = 0; j < i; j += 4) { | |
size_t rj = (__builtin_bitreverse32(j >> 2) >> (34 - size)); | |
size_t v = ri + j; | |
size_t rv = rj + i; | |
bitreverse_swap(table, quarter, v, rv); | |
if (size & 1) | |
bitreverse_swap(table, quarter, v + middle_bit, rv + middle_bit); | |
} | |
uint64_t v = i + ri; | |
bitreverse_palindrome(table, quarter, v); | |
if (size & 1) | |
bitreverse_palindrome(table, quarter, v + middle_bit); | |
} | |
#else | |
// gray code version for fast __builtin_ctz | |
size_t top = (size_t)1 << (size - 1); | |
for (size_t ii = 0, i = 0, ri = 0; ii < middle_bit;) { | |
for (size_t jj = 0, j = 0, rj = 0; jj < ii;) { | |
size_t v = ri + j; | |
size_t rv = rj + i; | |
bitreverse_swap(table, quarter, v, rv); | |
if (size & 1) | |
bitreverse_swap(table, quarter, v + middle_bit, rv + middle_bit); | |
jj += 4; | |
j ^= 1 << __builtin_ctz(jj); | |
rj ^= top >> __builtin_ctz(jj); | |
} | |
uint64_t v = i + ri; | |
bitreverse_palindrome(table, quarter, v); | |
if (size & 1) | |
bitreverse_palindrome(table, quarter, v + middle_bit); | |
ii += 4; | |
i ^= 1 << __builtin_ctz(ii); | |
ri ^= top >> __builtin_ctz(ii); | |
} | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment