Instantly share code, notes, and snippets.

Embed
What would you like to do?
ARM neon branchless despacer
/*
 * Byte-shuffle lookup table: 256 entries of 8 bytes each, 16-byte aligned.
 *
 * The entry for mask value m starts at offset m*8 and lists, in ascending
 * order, the positions (0..7) of the bits that are set in m; the remaining
 * slots are padded with 0.
 *
 * neon_despace_branchless() loads an entry and uses it as the index vector
 * of vtbl1_u8, which moves the bytes whose mask bit is set to the front of
 * an 8-byte half. The padding slots select byte 0 of that half, so the
 * lanes after the kept bytes hold filler rather than meaningful data.
 */
static const uint8_t __attribute__((aligned(16))) shuffle_arr[256*8] = {
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,
0,1,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,
0,2,0,0,0,0,0,0,
1,2,0,0,0,0,0,0,
0,1,2,0,0,0,0,0,
3,0,0,0,0,0,0,0,
0,3,0,0,0,0,0,0,
1,3,0,0,0,0,0,0,
0,1,3,0,0,0,0,0,
2,3,0,0,0,0,0,0,
0,2,3,0,0,0,0,0,
1,2,3,0,0,0,0,0,
0,1,2,3,0,0,0,0,
4,0,0,0,0,0,0,0,
0,4,0,0,0,0,0,0,
1,4,0,0,0,0,0,0,
0,1,4,0,0,0,0,0,
2,4,0,0,0,0,0,0,
0,2,4,0,0,0,0,0,
1,2,4,0,0,0,0,0,
0,1,2,4,0,0,0,0,
3,4,0,0,0,0,0,0,
0,3,4,0,0,0,0,0,
1,3,4,0,0,0,0,0,
0,1,3,4,0,0,0,0,
2,3,4,0,0,0,0,0,
0,2,3,4,0,0,0,0,
1,2,3,4,0,0,0,0,
0,1,2,3,4,0,0,0,
5,0,0,0,0,0,0,0,
0,5,0,0,0,0,0,0,
1,5,0,0,0,0,0,0,
0,1,5,0,0,0,0,0,
2,5,0,0,0,0,0,0,
0,2,5,0,0,0,0,0,
1,2,5,0,0,0,0,0,
0,1,2,5,0,0,0,0,
3,5,0,0,0,0,0,0,
0,3,5,0,0,0,0,0,
1,3,5,0,0,0,0,0,
0,1,3,5,0,0,0,0,
2,3,5,0,0,0,0,0,
0,2,3,5,0,0,0,0,
1,2,3,5,0,0,0,0,
0,1,2,3,5,0,0,0,
4,5,0,0,0,0,0,0,
0,4,5,0,0,0,0,0,
1,4,5,0,0,0,0,0,
0,1,4,5,0,0,0,0,
2,4,5,0,0,0,0,0,
0,2,4,5,0,0,0,0,
1,2,4,5,0,0,0,0,
0,1,2,4,5,0,0,0,
3,4,5,0,0,0,0,0,
0,3,4,5,0,0,0,0,
1,3,4,5,0,0,0,0,
0,1,3,4,5,0,0,0,
2,3,4,5,0,0,0,0,
0,2,3,4,5,0,0,0,
1,2,3,4,5,0,0,0,
0,1,2,3,4,5,0,0,
6,0,0,0,0,0,0,0,
0,6,0,0,0,0,0,0,
1,6,0,0,0,0,0,0,
0,1,6,0,0,0,0,0,
2,6,0,0,0,0,0,0,
0,2,6,0,0,0,0,0,
1,2,6,0,0,0,0,0,
0,1,2,6,0,0,0,0,
3,6,0,0,0,0,0,0,
0,3,6,0,0,0,0,0,
1,3,6,0,0,0,0,0,
0,1,3,6,0,0,0,0,
2,3,6,0,0,0,0,0,
0,2,3,6,0,0,0,0,
1,2,3,6,0,0,0,0,
0,1,2,3,6,0,0,0,
4,6,0,0,0,0,0,0,
0,4,6,0,0,0,0,0,
1,4,6,0,0,0,0,0,
0,1,4,6,0,0,0,0,
2,4,6,0,0,0,0,0,
0,2,4,6,0,0,0,0,
1,2,4,6,0,0,0,0,
0,1,2,4,6,0,0,0,
3,4,6,0,0,0,0,0,
0,3,4,6,0,0,0,0,
1,3,4,6,0,0,0,0,
0,1,3,4,6,0,0,0,
2,3,4,6,0,0,0,0,
0,2,3,4,6,0,0,0,
1,2,3,4,6,0,0,0,
0,1,2,3,4,6,0,0,
5,6,0,0,0,0,0,0,
0,5,6,0,0,0,0,0,
1,5,6,0,0,0,0,0,
0,1,5,6,0,0,0,0,
2,5,6,0,0,0,0,0,
0,2,5,6,0,0,0,0,
1,2,5,6,0,0,0,0,
0,1,2,5,6,0,0,0,
3,5,6,0,0,0,0,0,
0,3,5,6,0,0,0,0,
1,3,5,6,0,0,0,0,
0,1,3,5,6,0,0,0,
2,3,5,6,0,0,0,0,
0,2,3,5,6,0,0,0,
1,2,3,5,6,0,0,0,
0,1,2,3,5,6,0,0,
4,5,6,0,0,0,0,0,
0,4,5,6,0,0,0,0,
1,4,5,6,0,0,0,0,
0,1,4,5,6,0,0,0,
2,4,5,6,0,0,0,0,
0,2,4,5,6,0,0,0,
1,2,4,5,6,0,0,0,
0,1,2,4,5,6,0,0,
3,4,5,6,0,0,0,0,
0,3,4,5,6,0,0,0,
1,3,4,5,6,0,0,0,
0,1,3,4,5,6,0,0,
2,3,4,5,6,0,0,0,
0,2,3,4,5,6,0,0,
1,2,3,4,5,6,0,0,
0,1,2,3,4,5,6,0,
7,0,0,0,0,0,0,0,
0,7,0,0,0,0,0,0,
1,7,0,0,0,0,0,0,
0,1,7,0,0,0,0,0,
2,7,0,0,0,0,0,0,
0,2,7,0,0,0,0,0,
1,2,7,0,0,0,0,0,
0,1,2,7,0,0,0,0,
3,7,0,0,0,0,0,0,
0,3,7,0,0,0,0,0,
1,3,7,0,0,0,0,0,
0,1,3,7,0,0,0,0,
2,3,7,0,0,0,0,0,
0,2,3,7,0,0,0,0,
1,2,3,7,0,0,0,0,
0,1,2,3,7,0,0,0,
4,7,0,0,0,0,0,0,
0,4,7,0,0,0,0,0,
1,4,7,0,0,0,0,0,
0,1,4,7,0,0,0,0,
2,4,7,0,0,0,0,0,
0,2,4,7,0,0,0,0,
1,2,4,7,0,0,0,0,
0,1,2,4,7,0,0,0,
3,4,7,0,0,0,0,0,
0,3,4,7,0,0,0,0,
1,3,4,7,0,0,0,0,
0,1,3,4,7,0,0,0,
2,3,4,7,0,0,0,0,
0,2,3,4,7,0,0,0,
1,2,3,4,7,0,0,0,
0,1,2,3,4,7,0,0,
5,7,0,0,0,0,0,0,
0,5,7,0,0,0,0,0,
1,5,7,0,0,0,0,0,
0,1,5,7,0,0,0,0,
2,5,7,0,0,0,0,0,
0,2,5,7,0,0,0,0,
1,2,5,7,0,0,0,0,
0,1,2,5,7,0,0,0,
3,5,7,0,0,0,0,0,
0,3,5,7,0,0,0,0,
1,3,5,7,0,0,0,0,
0,1,3,5,7,0,0,0,
2,3,5,7,0,0,0,0,
0,2,3,5,7,0,0,0,
1,2,3,5,7,0,0,0,
0,1,2,3,5,7,0,0,
4,5,7,0,0,0,0,0,
0,4,5,7,0,0,0,0,
1,4,5,7,0,0,0,0,
0,1,4,5,7,0,0,0,
2,4,5,7,0,0,0,0,
0,2,4,5,7,0,0,0,
1,2,4,5,7,0,0,0,
0,1,2,4,5,7,0,0,
3,4,5,7,0,0,0,0,
0,3,4,5,7,0,0,0,
1,3,4,5,7,0,0,0,
0,1,3,4,5,7,0,0,
2,3,4,5,7,0,0,0,
0,2,3,4,5,7,0,0,
1,2,3,4,5,7,0,0,
0,1,2,3,4,5,7,0,
6,7,0,0,0,0,0,0,
0,6,7,0,0,0,0,0,
1,6,7,0,0,0,0,0,
0,1,6,7,0,0,0,0,
2,6,7,0,0,0,0,0,
0,2,6,7,0,0,0,0,
1,2,6,7,0,0,0,0,
0,1,2,6,7,0,0,0,
3,6,7,0,0,0,0,0,
0,3,6,7,0,0,0,0,
1,3,6,7,0,0,0,0,
0,1,3,6,7,0,0,0,
2,3,6,7,0,0,0,0,
0,2,3,6,7,0,0,0,
1,2,3,6,7,0,0,0,
0,1,2,3,6,7,0,0,
4,6,7,0,0,0,0,0,
0,4,6,7,0,0,0,0,
1,4,6,7,0,0,0,0,
0,1,4,6,7,0,0,0,
2,4,6,7,0,0,0,0,
0,2,4,6,7,0,0,0,
1,2,4,6,7,0,0,0,
0,1,2,4,6,7,0,0,
3,4,6,7,0,0,0,0,
0,3,4,6,7,0,0,0,
1,3,4,6,7,0,0,0,
0,1,3,4,6,7,0,0,
2,3,4,6,7,0,0,0,
0,2,3,4,6,7,0,0,
1,2,3,4,6,7,0,0,
0,1,2,3,4,6,7,0,
5,6,7,0,0,0,0,0,
0,5,6,7,0,0,0,0,
1,5,6,7,0,0,0,0,
0,1,5,6,7,0,0,0,
2,5,6,7,0,0,0,0,
0,2,5,6,7,0,0,0,
1,2,5,6,7,0,0,0,
0,1,2,5,6,7,0,0,
3,5,6,7,0,0,0,0,
0,3,5,6,7,0,0,0,
1,3,5,6,7,0,0,0,
0,1,3,5,6,7,0,0,
2,3,5,6,7,0,0,0,
0,2,3,5,6,7,0,0,
1,2,3,5,6,7,0,0,
0,1,2,3,5,6,7,0,
4,5,6,7,0,0,0,0,
0,4,5,6,7,0,0,0,
1,4,5,6,7,0,0,0,
0,1,4,5,6,7,0,0,
2,4,5,6,7,0,0,0,
0,2,4,5,6,7,0,0,
1,2,4,5,6,7,0,0,
0,1,2,4,5,6,7,0,
3,4,5,6,7,0,0,0,
0,3,4,5,6,7,0,0,
1,3,4,5,6,7,0,0,
0,1,3,4,5,6,7,0,
2,3,4,5,6,7,0,0,
0,2,3,4,5,6,7,0,
1,2,3,4,5,6,7,0,
0,1,2,3,4,5,6,7,
};
/*
 * Remove whitespace from a byte buffer in place using ARM NEON.
 *
 * Every byte whose unsigned value is <= ' ' (space, tab, CR, LF and the other
 * control characters) is dropped; all remaining bytes are compacted towards
 * the start of the buffer, preserving their order.
 *
 * bytes   - buffer to compact; modified in place.
 * howmany - number of valid bytes in the buffer.
 *
 * Returns the number of bytes kept. The contents of bytes[ret..howmany) are
 * unspecified afterwards, and no terminator is written.
 */
static inline size_t neon_despace_branchless(char *bytes, size_t howmany) {
    size_t i = 0, pos = 0;
    const size_t chunk_size = 16;
    /* One distinct bit per lane within each 8-byte half, so that summing the
     * selected lanes of a half yields an 8-bit "keep" mask for that half. */
    uint8x16_t bitmask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
    uint8x16_t space = vdupq_n_u8(' ');

    for (; i + chunk_size <= howmany; i += chunk_size) {
        uint8x16_t vec = vld1q_u8((uint8_t*)bytes + i);
        /* 0xFF in every lane that must be kept (unsigned byte > ' '). */
        uint8x16_t cmp = vcgtq_u8(vec, space);
        /* Three pairwise widening adds collapse each 8-byte half into a single
         * 64-bit sum; since the bits are distinct the sum is at most 255. */
        uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(cmp, bitmask))));
        /* Byte lanes 0 and 8 are read as the low byte of each 64-bit sum
         * (NOTE(review): assumes little-endian lane layout - confirm if this
         * is ever built for a big-endian target). */
        uint8_t mlow = vgetq_lane_u8(vreinterpretq_u8_u64(mask), 0);
        uint8_t mhigh = vgetq_lane_u8(vreinterpretq_u8_u64(mask), 8);
        uint8x8_t vlow = vget_low_u8(vec);
        uint8x8_t vhigh = vget_high_u8(vec);
        /* The table entry for a mask lists the kept positions first, so the
         * table lookup packs the kept bytes at the front of each half. */
        uint8x8_t rlow = vtbl1_u8(vlow, vld1_u8(shuffle_arr + mlow*8));
        uint8x8_t rhigh = vtbl1_u8(vhigh, vld1_u8(shuffle_arr + mhigh*8));
        /* Always store a full 8 bytes and advance by the number kept; the
         * filler bytes past the kept ones are overwritten by later stores or
         * end up beyond the returned length. Because pos <= i, these stores
         * never reach past the 16 bytes already loaded into vec. */
        vst1_u8((uint8_t*)bytes + pos, rlow);
        pos += __builtin_popcount(mlow);
        vst1_u8((uint8_t*)bytes + pos, rhigh);
        pos += __builtin_popcount(mhigh);
    }

    /* Scalar tail for the last (howmany % 16) bytes. The index must advance on
     * every iteration, including skipped bytes, and the filter uses the same
     * "<= ' '" criterion as the vector loop so that the result does not depend
     * on where a byte falls relative to a 16-byte boundary. */
    for (; i < howmany; i++) {
        unsigned char c = (unsigned char)bytes[i];
        if (c <= ' ') {
            continue;
        }
        bytes[pos++] = (char)c;
    }
    return pos;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment