Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
This is why I don't like NEON intrinsics. (These two functions do the same thing in the same way using the same instruction set.)
#ifdef __SOME_COMPILER__
// Embedded-asm (armcc-style) variant of mublock_UandD.
// Argument registers (AAPCS): r0 = mask, r1 = ptr, r2 = pitch.
// Loads 8 rows of 4 bytes starting 2 bytes left of ptr, transposes so each
// d-register holds one pixel column across all 8 rows, applies the edge
// filter per column, transposes back and stores the same 8 rows.
__asm__ void bink2_mublock_UandD( U16 const * RADRESTRICT mask, U8 * RADRESTRICT ptr, U32 pitch )
{
sub r3,r1,#2                  // r3 = ptr - 2: leftmost of the 4 columns touched
vld1.32 {d0[]},[r3],r2        // rows 0-3 into lane 0 of d0..d3 (dup fills both
vld1.32 {d1[]},[r3],r2        //   lanes; lane 1 is overwritten by the loads below)
vld1.32 {d2[]},[r3],r2
vld1.32 {d3[]},[r3],r2
vld1.32 {d0[1]},[r3],r2       // rows 4-7 into lane 1 of d0..d3
vld1.32 {d1[1]},[r3],r2
vld1.32 {d2[1]},[r3],r2
vld1.32 {d3[1]},[r3]          // last row: no post-increment needed
vtrn.8 d0,d1                  // 8-bit then 16-bit transposes: afterwards
vtrn.8 d2,d3                  //   d0..d3 each hold one column across all 8
vtrn.16 q0,q1                 //   rows: d0=m2, d1=m1, d2=p0, d3=p1
vld1.16 {d4},[r0] // masks    // four 16-bit per-column filter strengths
vsubl.u8 q9,d2,d1 // q9=diff  // p0 - m1, widened to s16
vmovl.u8 q12,d0               // widen the four columns to 16-bit
vmovl.u8 q13,d1
vrshr.s16 q10,q9,#2 // q10=delta_near  rounded (diff+2)>>2, inner columns
vrshr.s16 q11,q9,#3 // q11=delta_far   rounded (diff+4)>>3, outer columns
vmovl.u8 q14,d2
vmovl.u8 q15,d3
vmls.i16 q12,q11,d4[0]        // m2 -= delta_far  * mask[0]
vmls.i16 q13,q10,d4[1]        // m1 -= delta_near * mask[1]
vmla.i16 q14,q10,d4[2]        // p0 += delta_near * mask[2]
vmla.i16 q15,q11,d4[3]        // p1 += delta_far  * mask[3]
vqmovun.s16 d0,q12            // saturate back down to unsigned bytes
vqmovun.s16 d1,q13
vqmovun.s16 d2,q14
vqmovun.s16 d3,q15
sub r3,r1,#2                  // rewind to the first row
vtrn.16 q0,q1                 // inverse transpose: back to row-major
vtrn.8 d0,d1
vtrn.8 d2,d3
vst1.32 {d0[0]},[r3],r2       // store rows 0-3 from lane 0 ...
vst1.32 {d1[0]},[r3],r2
vst1.32 {d2[0]},[r3],r2
vst1.32 {d3[0]},[r3],r2
vst1.32 {d0[1]},[r3],r2       // ... and rows 4-7 from lane 1
vst1.32 {d1[1]},[r3],r2
vst1.32 {d2[1]},[r3],r2
vst1.32 {d3[1]},[r3]
bx lr
}
#define mublock_UandD bink2_mublock_UandD
#else
// Intrinsics variant of the edge filter: reads 8 rows of 4 bytes starting
// 2 bytes left of ptr, transposes to column-major, filters the four columns
// (m2,m1 | p0,p1) around the edge using the per-column 16-bit masks, then
// transposes back and rewrites the same 8 rows.
static CODEGEN_ATTR void mublock_UandD( U16 const * RADRESTRICT mask, U8 * RADRESTRICT ptr, U32 pitch )
{
uint8x16_t left;   // columns m2 (low half) and m1 (high half), 8 rows each
uint8x16_t right;  // columns p0 (low half) and p1 (high half), 8 rows each
int16x4_t coef;    // four per-column filter strengths
// gather the 8x4 tile and transpose it to column vectors
{
U8 const * RADRESTRICT src = ptr - 2;
uint8x8_t a, b, c, d;
uint8x8x2_t tab, tcd;
uint16x8x2_t tq;
a = vreinterpret_u8_u32(vld1_dup_u32((U32 const *) src)); src += pitch;
b = vreinterpret_u8_u32(vld1_dup_u32((U32 const *) src)); src += pitch;
c = vreinterpret_u8_u32(vld1_dup_u32((U32 const *) src)); src += pitch;
d = vreinterpret_u8_u32(vld1_dup_u32((U32 const *) src)); src += pitch;
a = vreinterpret_u8_u32(vld1_lane_u32((U32 const *) src, vreinterpret_u32_u8(a), 1)); src += pitch;
b = vreinterpret_u8_u32(vld1_lane_u32((U32 const *) src, vreinterpret_u32_u8(b), 1)); src += pitch;
c = vreinterpret_u8_u32(vld1_lane_u32((U32 const *) src, vreinterpret_u32_u8(c), 1)); src += pitch;
d = vreinterpret_u8_u32(vld1_lane_u32((U32 const *) src, vreinterpret_u32_u8(d), 1));
// 8-bit then 16-bit transpose turns the row loads into column vectors
tab = vtrn_u8(a, b);
tcd = vtrn_u8(c, d);
tq = vtrnq_u16(vreinterpretq_u16_u8(vcombine_u8(tab.val[0], tab.val[1])),
               vreinterpretq_u16_u8(vcombine_u8(tcd.val[0], tcd.val[1])));
left  = vreinterpretq_u8_u16(tq.val[0]);
right = vreinterpretq_u8_u16(tq.val[1]);
}
coef = vld1_s16((S16 const *) mask);
// filter the four columns straddling the edge
{
int16x8_t cm2, cm1, cp0, cp1;
int16x8_t edge_diff, near_step, far_step;
cm2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(left)));
cm1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(left)));
cp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(right)));
cp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(right)));
edge_diff = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(right), vget_high_u8(left))); // p0 - m1
near_step = vrshrq_n_s16(edge_diff, 2); // rounded (diff+2)>>2 for the inner columns
far_step  = vrshrq_n_s16(edge_diff, 3); // rounded (diff+4)>>3 for the outer columns
cm2 = vmlsq_lane_s16(cm2, far_step,  coef, 0);
cm1 = vmlsq_lane_s16(cm1, near_step, coef, 1);
cp0 = vmlaq_lane_s16(cp0, near_step, coef, 2);
cp1 = vmlaq_lane_s16(cp1, far_step,  coef, 3);
// saturate back down to unsigned bytes
left  = vcombine_u8(vqmovun_s16(cm2), vqmovun_s16(cm1));
right = vcombine_u8(vqmovun_s16(cp0), vqmovun_s16(cp1));
}
// transpose back to row-major and scatter the 8 rows
{
U8 * RADRESTRICT dst = ptr - 2;
uint16x8x2_t tq;
uint8x8x2_t tab, tcd;
uint32x2_t a, b, c, d;
tq  = vtrnq_u16(vreinterpretq_u16_u8(left), vreinterpretq_u16_u8(right));
tab = vtrn_u8(vget_low_u8(vreinterpretq_u8_u16(tq.val[0])), vget_high_u8(vreinterpretq_u8_u16(tq.val[0])));
tcd = vtrn_u8(vget_low_u8(vreinterpretq_u8_u16(tq.val[1])), vget_high_u8(vreinterpretq_u8_u16(tq.val[1])));
a = vreinterpret_u32_u8(tab.val[0]);
b = vreinterpret_u32_u8(tab.val[1]);
c = vreinterpret_u32_u8(tcd.val[0]);
d = vreinterpret_u32_u8(tcd.val[1]);
vst1_lane_u32((U32 *) dst, a, 0); dst += pitch;
vst1_lane_u32((U32 *) dst, b, 0); dst += pitch;
vst1_lane_u32((U32 *) dst, c, 0); dst += pitch;
vst1_lane_u32((U32 *) dst, d, 0); dst += pitch;
vst1_lane_u32((U32 *) dst, a, 1); dst += pitch;
vst1_lane_u32((U32 *) dst, b, 1); dst += pitch;
vst1_lane_u32((U32 *) dst, c, 1); dst += pitch;
vst1_lane_u32((U32 *) dst, d, 1);
}
}
#endif
@wl3b10s
Copy link

wl3b10s commented Oct 10, 2017

So what? Using intrinsics doesn't seem to reduce the complexity or the length of the code.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment