Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
branchless ARM neon despacer
static const uint8_t __attribute__((aligned(16))) mask_shuffle[256*8] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0, 0,1,0,0,0,0,0,0,
2,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 1,2,0,0,0,0,0,0, 0,1,2,0,0,0,0,0,
3,0,0,0,0,0,0,0, 0,3,0,0,0,0,0,0, 1,3,0,0,0,0,0,0, 0,1,3,0,0,0,0,0,
2,3,0,0,0,0,0,0, 0,2,3,0,0,0,0,0, 1,2,3,0,0,0,0,0, 0,1,2,3,0,0,0,0,
4,0,0,0,0,0,0,0, 0,4,0,0,0,0,0,0, 1,4,0,0,0,0,0,0, 0,1,4,0,0,0,0,0,
2,4,0,0,0,0,0,0, 0,2,4,0,0,0,0,0, 1,2,4,0,0,0,0,0, 0,1,2,4,0,0,0,0,
3,4,0,0,0,0,0,0, 0,3,4,0,0,0,0,0, 1,3,4,0,0,0,0,0, 0,1,3,4,0,0,0,0,
2,3,4,0,0,0,0,0, 0,2,3,4,0,0,0,0, 1,2,3,4,0,0,0,0, 0,1,2,3,4,0,0,0,
5,0,0,0,0,0,0,0, 0,5,0,0,0,0,0,0, 1,5,0,0,0,0,0,0, 0,1,5,0,0,0,0,0,
2,5,0,0,0,0,0,0, 0,2,5,0,0,0,0,0, 1,2,5,0,0,0,0,0, 0,1,2,5,0,0,0,0,
3,5,0,0,0,0,0,0, 0,3,5,0,0,0,0,0, 1,3,5,0,0,0,0,0, 0,1,3,5,0,0,0,0,
2,3,5,0,0,0,0,0, 0,2,3,5,0,0,0,0, 1,2,3,5,0,0,0,0, 0,1,2,3,5,0,0,0,
4,5,0,0,0,0,0,0, 0,4,5,0,0,0,0,0, 1,4,5,0,0,0,0,0, 0,1,4,5,0,0,0,0,
2,4,5,0,0,0,0,0, 0,2,4,5,0,0,0,0, 1,2,4,5,0,0,0,0, 0,1,2,4,5,0,0,0,
3,4,5,0,0,0,0,0, 0,3,4,5,0,0,0,0, 1,3,4,5,0,0,0,0, 0,1,3,4,5,0,0,0,
2,3,4,5,0,0,0,0, 0,2,3,4,5,0,0,0, 1,2,3,4,5,0,0,0, 0,1,2,3,4,5,0,0,
6,0,0,0,0,0,0,0, 0,6,0,0,0,0,0,0, 1,6,0,0,0,0,0,0, 0,1,6,0,0,0,0,0,
2,6,0,0,0,0,0,0, 0,2,6,0,0,0,0,0, 1,2,6,0,0,0,0,0, 0,1,2,6,0,0,0,0,
3,6,0,0,0,0,0,0, 0,3,6,0,0,0,0,0, 1,3,6,0,0,0,0,0, 0,1,3,6,0,0,0,0,
2,3,6,0,0,0,0,0, 0,2,3,6,0,0,0,0, 1,2,3,6,0,0,0,0, 0,1,2,3,6,0,0,0,
4,6,0,0,0,0,0,0, 0,4,6,0,0,0,0,0, 1,4,6,0,0,0,0,0, 0,1,4,6,0,0,0,0,
2,4,6,0,0,0,0,0, 0,2,4,6,0,0,0,0, 1,2,4,6,0,0,0,0, 0,1,2,4,6,0,0,0,
3,4,6,0,0,0,0,0, 0,3,4,6,0,0,0,0, 1,3,4,6,0,0,0,0, 0,1,3,4,6,0,0,0,
2,3,4,6,0,0,0,0, 0,2,3,4,6,0,0,0, 1,2,3,4,6,0,0,0, 0,1,2,3,4,6,0,0,
5,6,0,0,0,0,0,0, 0,5,6,0,0,0,0,0, 1,5,6,0,0,0,0,0, 0,1,5,6,0,0,0,0,
2,5,6,0,0,0,0,0, 0,2,5,6,0,0,0,0, 1,2,5,6,0,0,0,0, 0,1,2,5,6,0,0,0,
3,5,6,0,0,0,0,0, 0,3,5,6,0,0,0,0, 1,3,5,6,0,0,0,0, 0,1,3,5,6,0,0,0,
2,3,5,6,0,0,0,0, 0,2,3,5,6,0,0,0, 1,2,3,5,6,0,0,0, 0,1,2,3,5,6,0,0,
4,5,6,0,0,0,0,0, 0,4,5,6,0,0,0,0, 1,4,5,6,0,0,0,0, 0,1,4,5,6,0,0,0,
2,4,5,6,0,0,0,0, 0,2,4,5,6,0,0,0, 1,2,4,5,6,0,0,0, 0,1,2,4,5,6,0,0,
3,4,5,6,0,0,0,0, 0,3,4,5,6,0,0,0, 1,3,4,5,6,0,0,0, 0,1,3,4,5,6,0,0,
2,3,4,5,6,0,0,0, 0,2,3,4,5,6,0,0, 1,2,3,4,5,6,0,0, 0,1,2,3,4,5,6,0,
7,0,0,0,0,0,0,0, 0,7,0,0,0,0,0,0, 1,7,0,0,0,0,0,0, 0,1,7,0,0,0,0,0,
2,7,0,0,0,0,0,0, 0,2,7,0,0,0,0,0, 1,2,7,0,0,0,0,0, 0,1,2,7,0,0,0,0,
3,7,0,0,0,0,0,0, 0,3,7,0,0,0,0,0, 1,3,7,0,0,0,0,0, 0,1,3,7,0,0,0,0,
2,3,7,0,0,0,0,0, 0,2,3,7,0,0,0,0, 1,2,3,7,0,0,0,0, 0,1,2,3,7,0,0,0,
4,7,0,0,0,0,0,0, 0,4,7,0,0,0,0,0, 1,4,7,0,0,0,0,0, 0,1,4,7,0,0,0,0,
2,4,7,0,0,0,0,0, 0,2,4,7,0,0,0,0, 1,2,4,7,0,0,0,0, 0,1,2,4,7,0,0,0,
3,4,7,0,0,0,0,0, 0,3,4,7,0,0,0,0, 1,3,4,7,0,0,0,0, 0,1,3,4,7,0,0,0,
2,3,4,7,0,0,0,0, 0,2,3,4,7,0,0,0, 1,2,3,4,7,0,0,0, 0,1,2,3,4,7,0,0,
5,7,0,0,0,0,0,0, 0,5,7,0,0,0,0,0, 1,5,7,0,0,0,0,0, 0,1,5,7,0,0,0,0,
2,5,7,0,0,0,0,0, 0,2,5,7,0,0,0,0, 1,2,5,7,0,0,0,0, 0,1,2,5,7,0,0,0,
3,5,7,0,0,0,0,0, 0,3,5,7,0,0,0,0, 1,3,5,7,0,0,0,0, 0,1,3,5,7,0,0,0,
2,3,5,7,0,0,0,0, 0,2,3,5,7,0,0,0, 1,2,3,5,7,0,0,0, 0,1,2,3,5,7,0,0,
4,5,7,0,0,0,0,0, 0,4,5,7,0,0,0,0, 1,4,5,7,0,0,0,0, 0,1,4,5,7,0,0,0,
2,4,5,7,0,0,0,0, 0,2,4,5,7,0,0,0, 1,2,4,5,7,0,0,0, 0,1,2,4,5,7,0,0,
3,4,5,7,0,0,0,0, 0,3,4,5,7,0,0,0, 1,3,4,5,7,0,0,0, 0,1,3,4,5,7,0,0,
2,3,4,5,7,0,0,0, 0,2,3,4,5,7,0,0, 1,2,3,4,5,7,0,0, 0,1,2,3,4,5,7,0,
6,7,0,0,0,0,0,0, 0,6,7,0,0,0,0,0, 1,6,7,0,0,0,0,0, 0,1,6,7,0,0,0,0,
2,6,7,0,0,0,0,0, 0,2,6,7,0,0,0,0, 1,2,6,7,0,0,0,0, 0,1,2,6,7,0,0,0,
3,6,7,0,0,0,0,0, 0,3,6,7,0,0,0,0, 1,3,6,7,0,0,0,0, 0,1,3,6,7,0,0,0,
2,3,6,7,0,0,0,0, 0,2,3,6,7,0,0,0, 1,2,3,6,7,0,0,0, 0,1,2,3,6,7,0,0,
4,6,7,0,0,0,0,0, 0,4,6,7,0,0,0,0, 1,4,6,7,0,0,0,0, 0,1,4,6,7,0,0,0,
2,4,6,7,0,0,0,0, 0,2,4,6,7,0,0,0, 1,2,4,6,7,0,0,0, 0,1,2,4,6,7,0,0,
3,4,6,7,0,0,0,0, 0,3,4,6,7,0,0,0, 1,3,4,6,7,0,0,0, 0,1,3,4,6,7,0,0,
2,3,4,6,7,0,0,0, 0,2,3,4,6,7,0,0, 1,2,3,4,6,7,0,0, 0,1,2,3,4,6,7,0,
5,6,7,0,0,0,0,0, 0,5,6,7,0,0,0,0, 1,5,6,7,0,0,0,0, 0,1,5,6,7,0,0,0,
2,5,6,7,0,0,0,0, 0,2,5,6,7,0,0,0, 1,2,5,6,7,0,0,0, 0,1,2,5,6,7,0,0,
3,5,6,7,0,0,0,0, 0,3,5,6,7,0,0,0, 1,3,5,6,7,0,0,0, 0,1,3,5,6,7,0,0,
2,3,5,6,7,0,0,0, 0,2,3,5,6,7,0,0, 1,2,3,5,6,7,0,0, 0,1,2,3,5,6,7,0,
4,5,6,7,0,0,0,0, 0,4,5,6,7,0,0,0, 1,4,5,6,7,0,0,0, 0,1,4,5,6,7,0,0,
2,4,5,6,7,0,0,0, 0,2,4,5,6,7,0,0, 1,2,4,5,6,7,0,0, 0,1,2,4,5,6,7,0,
3,4,5,6,7,0,0,0, 0,3,4,5,6,7,0,0, 1,3,4,5,6,7,0,0, 0,1,3,4,5,6,7,0,
2,3,4,5,6,7,0,0, 0,2,3,4,5,6,7,0, 1,2,3,4,5,6,7,0, 0,1,2,3,4,5,6,7,
};
static inline size_t neon_despace_branchless(char *bytes, size_t howmany) {
size_t i = 0, pos = 0;
const size_t chunk_size = 16;
const uint8x16_t bitmask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
const uint8x16_t space = vdupq_n_u8(' ');
for (; i + chunk_size <= howmany; i += chunk_size) {
uint8x16_t vec = vld1q_u8((uint8_t*)bytes + i);
uint8x16_t cmp = vcgtq_u8(vec, space);
uint8x16_t bitmask_applied = vandq_u8(cmp, bitmask);
uint8_t mlow = vaddv_u8(vget_low_u8(bitmask_applied));
uint8_t mhigh = vaddv_u8(vget_high_u8(bitmask_applied));
uint8x8_t low = vtbl1_u8(vget_low_u8(vec), vld1_u8(mask_shuffle + mlow*8));
uint8x8_t high = vtbl1_u8(vget_high_u8(vec), vld1_u8(mask_shuffle + mhigh*8));
vst1_u8((uint8_t*)bytes + pos, low);
pos += __builtin_popcount(mlow);
vst1_u8((uint8_t*)bytes + pos, high);
pos += __builtin_popcount(mhigh);
}
while (i < howmany) {
char c = bytes[i];
if (c == '\r' || c == '\n' || c == ' ') {
continue;
}
bytes[pos++] = c;
}
return pos;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.