Created
April 1, 2019 05:58
-
-
Save geofflangdale/99393863c8cae3e83195a5e592e7dc82 to your computer and use it in GitHub Desktop.
ARM NEON substitute for PMOVMSKB that turns 4 _interleaved_ 128-bit predicate results into a single 64-bit value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * ARM NEON stand-in for x86 PMOVMSKB across four interleaved predicate vectors.
 *
 * Collapses four 16-lane byte vectors into a single 64-bit mask. Bit
 * (4*j + k) of the result comes from lane j of input p<k>: bit k of that lane
 * when j is even, bit (k + 4) when j is odd. If every lane is either 0x00 or
 * 0xFF, that bit is simply "lane j of p<k> is set".
 *
 * NOTE(review): the inputs are presumably byte-compare results on data that
 * was de-interleaved four ways (e.g. with vld4q_u8), in which case result
 * bit n corresponds to source byte n -- confirm against the caller.
 *
 * Requires <arm_neon.h>; vpaddq_u8 is an AArch64 intrinsic.
 *
 * @param p0 predicate vector supplying result bits 0, 4, 8, ...
 * @param p1 predicate vector supplying result bits 1, 5, 9, ...
 * @param p2 predicate vector supplying result bits 2, 6, 10, ...
 * @param p3 predicate vector supplying result bits 3, 7, 11, ...
 * @return the 64 gathered bits, lane 0 in the least significant nibble.
 */
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
  /* Per-input bit selectors: even lanes own the low nibble of an output byte,
   * odd lanes own the high nibble, and each input owns one bit per nibble. */
  const uint8x16_t pick_p0 = { 0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10,
                               0x01, 0x10, 0x01, 0x10, 0x01, 0x10, 0x01, 0x10 };
  const uint8x16_t pick_p1 = { 0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20,
                               0x02, 0x20, 0x02, 0x20, 0x02, 0x20, 0x02, 0x20 };
  const uint8x16_t pick_p2 = { 0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40,
                               0x04, 0x40, 0x04, 0x40, 0x04, 0x40, 0x04, 0x40 };
  const uint8x16_t pick_p3 = { 0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80,
                               0x08, 0x80, 0x08, 0x80, 0x08, 0x80, 0x08, 0x80 };

  /* Start with p0's bit, then bit-select each further input's bit on top:
   * vbslq_u8 takes bits from its second operand where the mask is 1 and
   * from its third operand elsewhere. */
  uint8x16_t merged = vandq_u8(p0, pick_p0);
  merged = vbslq_u8(pick_p1, p1, merged);
  merged = vbslq_u8(pick_p2, p2, merged);
  merged = vbslq_u8(pick_p3, p3, merged);

  /* Adjacent lanes occupy disjoint nibbles, so the pairwise add cannot carry
   * and acts as an OR, packing 16 half-filled bytes into the low 8 bytes. */
  const uint8x16_t packed = vpaddq_u8(merged, merged);
  return vgetq_lane_u64(vreinterpretq_u64_u8(packed), 0);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
One less operation, replacing AND+ADDP with SHRN (shift-right and narrow):
It also uses half as many vector constants, which leaves more registers free for other work, or allows the constants to be generated with MOVI instead of being loaded from memory.