NEON intrinsics blurb.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // NEON ASM: | |
| // 8x8 transpose of 16-bit elements held in q0-q7 (assumes one 8-element row per q register on entry; confirm against the caller). Three stages: vswp exchanges 64-bit halves between registers, vtrn.32 transposes 32-bit lanes, vtrn.16 transposes 16-bit lanes. | |
| vswp d1,d8 /* d1 = high half of q0, d8 = low half of q4: exchange them */ | |
| vswp d3,d10 /* exchange high half of q1 with low half of q5 */ | |
| vswp d5,d12 /* exchange high half of q2 with low half of q6 */ | |
| vswp d7,d14 /* exchange high half of q3 with low half of q7 */ | |
| vtrn.32 q0,q2 /* 32-bit stage: transpose 32-bit lanes between each register pair, in place */ | |
| vtrn.32 q1,q3 | |
| vtrn.32 q4,q6 | |
| vtrn.32 q5,q7 | |
| vtrn.16 q0,q1 /* 16-bit stage: transpose 16-bit lanes between adjacent registers, in place */ | |
| vtrn.16 q2,q3 | |
| vtrn.16 q4,q5 | |
| vtrn.16 q6,q7 | |
| // NEON intrinsics producing the same 8x8 transpose as the assembly (inputs i0..i7 and outputs o0..o7 are int16x8_t): | |
| // Stage 1 (mirrors the vswp group): pair the low halves, then the high halves, of inputs k and k+4, viewed as 32-bit lanes. | |
| int32x4_t lo04 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i0), vget_low_s16(i4))); | |
| int32x4_t lo15 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i1), vget_low_s16(i5))); | |
| int32x4_t lo26 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i2), vget_low_s16(i6))); | |
| int32x4_t lo37 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i3), vget_low_s16(i7))); | |
| int32x4_t hi04 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i0), vget_high_s16(i4))); | |
| int32x4_t hi15 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i1), vget_high_s16(i5))); | |
| int32x4_t hi26 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i2), vget_high_s16(i6))); | |
| int32x4_t hi37 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i3), vget_high_s16(i7))); | |
| // Stage 2 (mirrors the vtrn.32 group): each 32-bit transpose is computed once and both result vectors are kept. | |
| int32x4x2_t w02 = vtrnq_s32(lo04, lo26); | |
| int32x4x2_t w13 = vtrnq_s32(lo15, lo37); | |
| int32x4x2_t w46 = vtrnq_s32(hi04, hi26); | |
| int32x4x2_t w57 = vtrnq_s32(hi15, hi37); | |
| // Stage 3 (mirrors the vtrn.16 group): 16-bit transposes of the matching stage-2 halves. | |
| int16x8x2_t h01 = vtrnq_s16(vreinterpretq_s16_s32(w02.val[0]), vreinterpretq_s16_s32(w13.val[0])); | |
| int16x8x2_t h23 = vtrnq_s16(vreinterpretq_s16_s32(w02.val[1]), vreinterpretq_s16_s32(w13.val[1])); | |
| int16x8x2_t h45 = vtrnq_s16(vreinterpretq_s16_s32(w46.val[0]), vreinterpretq_s16_s32(w57.val[0])); | |
| int16x8x2_t h67 = vtrnq_s16(vreinterpretq_s16_s32(w46.val[1]), vreinterpretq_s16_s32(w57.val[1])); | |
| // Unpack the transposed vectors into the eight outputs. | |
| int16x8_t o0 = h01.val[0]; | |
| int16x8_t o1 = h01.val[1]; | |
| int16x8_t o2 = h23.val[0]; | |
| int16x8_t o3 = h23.val[1]; | |
| int16x8_t o4 = h45.val[0]; | |
| int16x8_t o5 = h45.val[1]; | |
| int16x8_t o6 = h67.val[0]; | |
| int16x8_t o7 = h67.val[1]; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment