ARM NEON substitute for PMOVMSKB: packs four 128-bit predicate results into a single 64-bit mask
/*
 * Collapse four 16-lane byte predicates into one 64-bit movemask (a NEON
 * stand-in for running x86 PMOVMSKB over 64 bytes).
 *
 * p0..p3: predicate vectors whose lanes are expected to be 0x00 (false) or
 *         0xFF (true), e.g. the result of a NEON byte comparison. Only the
 *         single bit selected by `bitmask` for a lane's position is looked
 *         at, so other non-zero lane values are not reliably reported.
 *
 * Returns a value in which bit (16 * j + i) reflects lane i of pj: p0 fills
 * bits 0..15, p1 bits 16..31, p2 bits 32..47 and p3 bits 48..63 (assuming
 * the usual little-endian lane layout).
 *
 * Notes:
 *  - vpaddq_u8 is an AArch64 intrinsic; it is not available on 32-bit ARM.
 *  - Brace-initialising a NEON vector is a GCC/Clang extension. If another
 *    compiler must be supported, load the constant from a uint8_t[16]
 *    array with vld1q_u8 instead.
 */
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
  /* Lane i keeps only bit (i % 8), so the eight lanes of each half carry
   * eight distinct bits. */
  const uint8x16_t bitmask = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };

  uint8x16_t t0 = vandq_u8(p0, bitmask);
  uint8x16_t t1 = vandq_u8(p1, bitmask);
  uint8x16_t t2 = vandq_u8(p2, bitmask);
  uint8x16_t t3 = vandq_u8(p3, bitmask);

  /* Three rounds of pairwise addition fold each group of eight adjacent
   * lanes into one byte. The addends never share a set bit, so each add
   * behaves like a bitwise OR and cannot overflow a byte. */
  uint8x16_t sum0 = vpaddq_u8(t0, t1);   /* 8 partial bytes of p0 | 8 of p1 */
  uint8x16_t sum1 = vpaddq_u8(t2, t3);   /* 8 partial bytes of p2 | 8 of p3 */
  sum0 = vpaddq_u8(sum0, sum1);          /* 4 bytes each for p0, p1, p2, p3 */
  sum0 = vpaddq_u8(sum0, sum0);          /* 2 bytes each; low and high halves identical */

  /* The low 64-bit lane now holds the complete mask. */
  return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment