Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created December 28, 2013 09:59
Embed
What would you like to do?
NEON intrinsics blurb.
// NEON ASM:
// 8x8 transpose
// Works in place on q0-q7. The finest step below is vtrn.16, so the matrix
// elements are 16 bits wide and each q register holds one 8-element row.
// The transpose is done in three stages of decreasing granularity:
// 64-bit halves, then 32-bit lanes, then 16-bit lanes.
//
// Stage 1: exchange 64-bit halves. qN aliases the pair d(2N):d(2N+1), so d1 is
// the upper half of q0 and d8 is the lower half of q4. Each vswp therefore
// swaps the upper half of row k with the lower half of row k+4 (k = 0..3).
vswp d1,d8
vswp d3,d10
vswp d5,d12
vswp d7,d14
// Stage 2: transpose 2x2 blocks of 32-bit lanes between registers that are
// two apart (q0/q2, q1/q3, q4/q6, q5/q7).
vtrn.32 q0,q2
vtrn.32 q1,q3
vtrn.32 q4,q6
vtrn.32 q5,q7
// Stage 3: transpose 2x2 blocks of 16-bit lanes between adjacent registers
// (q0/q1, q2/q3, q4/q5, q6/q7). The result is left in q0-q7.
vtrn.16 q0,q1
vtrn.16 q2,q3
vtrn.16 q4,q5
vtrn.16 q6,q7
// NEON intrinsics to accomplish the exact same thing:
// i0..i7 are the eight int16x8_t input rows; o0..o7 receive the transposed rows.

// Stage 1 (the vswp group): join the low halves of rows k and k+4, and
// likewise the high halves, then view each result as four 32-bit lanes.
const int32x4_t lo04 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i0), vget_low_s16(i4)));
const int32x4_t lo15 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i1), vget_low_s16(i5)));
const int32x4_t lo26 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i2), vget_low_s16(i6)));
const int32x4_t lo37 = vreinterpretq_s32_s16(vcombine_s16(vget_low_s16(i3), vget_low_s16(i7)));
const int32x4_t hi04 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i0), vget_high_s16(i4)));
const int32x4_t hi15 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i1), vget_high_s16(i5)));
const int32x4_t hi26 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i2), vget_high_s16(i6)));
const int32x4_t hi37 = vreinterpretq_s32_s16(vcombine_s16(vget_high_s16(i3), vget_high_s16(i7)));

// Stage 2 (the vtrn.32 group): each call yields both transposed registers,
// so bind the pair once and pick .val[0] / .val[1] from it afterwards.
const int32x4x2_t lo_even = vtrnq_s32(lo04, lo26);
const int32x4x2_t lo_odd  = vtrnq_s32(lo15, lo37);
const int32x4x2_t hi_even = vtrnq_s32(hi04, hi26);
const int32x4x2_t hi_odd  = vtrnq_s32(hi15, hi37);

// Stage 3 (the vtrn.16 group): back to 16-bit lanes, transposing the matching
// halves of the even/odd results against each other.
const int16x8x2_t out01 = vtrnq_s16(vreinterpretq_s16_s32(lo_even.val[0]),
                                    vreinterpretq_s16_s32(lo_odd.val[0]));
const int16x8x2_t out23 = vtrnq_s16(vreinterpretq_s16_s32(lo_even.val[1]),
                                    vreinterpretq_s16_s32(lo_odd.val[1]));
const int16x8x2_t out45 = vtrnq_s16(vreinterpretq_s16_s32(hi_even.val[0]),
                                    vreinterpretq_s16_s32(hi_odd.val[0]));
const int16x8x2_t out67 = vtrnq_s16(vreinterpretq_s16_s32(hi_even.val[1]),
                                    vreinterpretq_s16_s32(hi_odd.val[1]));

// Unpack the register pairs into the eight output rows.
int16x8_t o0 = out01.val[0];
int16x8_t o1 = out01.val[1];
int16x8_t o2 = out23.val[0];
int16x8_t o3 = out23.val[1];
int16x8_t o4 = out45.val[0];
int16x8_t o5 = out45.val[1];
int16x8_t o6 = out67.val[0];
int16x8_t o7 = out67.val[1];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment