Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created November 25, 2015 23:55
Embed
What would you like to do?
Reduction variants
// Symbol lookup: derive the index `val` from x and the 16 cumfreq entries
// loaded below, then read that entry's start and width. Several alternative
// reductions are kept under `#if 0` together with their measured throughput;
// only the scalar `#else` branch is compiled.
// NOTE(review): the variants only produce the same `val` if cumfreq[] is
// non-decreasing and cumfreq[0] <= x -- presumably guaranteed by the caller;
// confirm.

// Broadcast x (narrowed to 16 bits by the cast) into every lane.
lzna_simd_vec xvec = vdupq_n_u16((U16) x);
// Load cumfreq[0..7] and cumfreq[8..15].
lzna_simd_vec cumfreq0 = vld1q_u16(&cumfreq[0]);
lzna_simd_vec cumfreq1 = vld1q_u16(&cumfreq[8]);
// do compares
// Each lane of gt0/gt1 is all-ones (0xFFFF, i.e. -1 when viewed as signed)
// where cumfreq > x, and 0 where cumfreq <= x.
lzna_simd_vec gt0 = vcgtq_u16(cumfreq0, xvec);
lzna_simd_vec gt1 = vcgtq_u16(cumfreq1, xvec);
#if 0
// 42.5MB/s on iPad Air 2
// Max-of-lane-index variant: zero the lane indices whose entry is > x, take
// the horizontal max of what survives in each half, then the larger of the two.
lzna_simd_vec lanes0 = { 0, 1, 2, 3, 4, 5, 6, 7 };
lzna_simd_vec lanes1 = { 8, 9,10,11, 12,13,14,15 };
U16 val0 = vmaxvq_u16(vbicq_u16(lanes0, gt0));
U16 val1 = vmaxvq_u16(vbicq_u16(lanes1, gt1));
val = RR_MAX(val0, val1);
#elif 0
// 41.9MB/s on iPad Air 2
// Same idea, but the two 16-bit masks are narrowed to bytes and combined so
// that a single 16-lane horizontal max covers all entries.
uint8x16_t lanes = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
uint8x16_t packed = vcombine_u8(vmovn_u16(gt0), vmovn_u16(gt1));
uint8x16_t masked_lanes = vbicq_u8(lanes, packed); // lanes & ~(cumfreq > x) = lanes & (cumfreq <= x)
val = vmaxvq_u8(masked_lanes);
#elif 0
// 45.8MB/s on iPad Air 2
// Count variant: every set mask lane contributes -1 to the horizontal sum, so
// the result is 15 minus the number of entries that are > x.
// NOTE(review): U16toS16 is defined elsewhere; presumably a lane-wise
// reinterpret from unsigned to signed 16-bit -- confirm.
val = 15 + vaddvq_s16(U16toS16(gt0)) + vaddvq_s16(U16toS16(gt1));
#elif 0
// 44.8MB/s on iPad Air 2
// Same count, but the two masks are added lane-wise first (each lane then
// holds 0, -1 or -2 as signed) so only one horizontal add is needed.
val = 15 + vaddvq_s16(U16toS16(vaddq_u16(gt0, gt1)));
#elif 0
// 38.8MB/s on iPad Air 2
// Same count again, spelled out as explicit pairwise-add steps instead of a
// single across-vector add.
// reduce 16 vals -> 8 in [-2,0]
int32x4_t suma0 = vpaddlq_s16(U16toS16(gt0));
int32x4_t suma1 = vpaddlq_s16(U16toS16(gt1));
// reduce 8 vals -> 4 in [-4,0]
int32x2_t sumb0 = vpadd_s32(vget_low_s32(suma0), vget_high_s32(suma0));
int32x2_t sumb1 = vpadd_s32(vget_low_s32(suma1), vget_high_s32(suma1));
// reduce 4 vals -> 2 in [-8,0]
int32x2_t sumc = vadd_s32(sumb0, sumb1);
// Final step: add the two remaining partial sums and bias by 15.
val = 15 + vget_lane_s32(sumc, 0) + vget_lane_s32(sumc, 1);
#else
// 47.7MB/s on iPad Air 2
// Scalar two-level search (the variant actually compiled).
// Coarse step: count how many of cumfreq[4], cumfreq[8], cumfreq[12] are <= x,
// giving a group number 0..3; multiplying by 4 turns it into the index of the
// first entry of that group.
val = (cumfreq[ 4] > x) ? 0 : 1;
val += (cumfreq[ 8] > x) ? 0 : 1;
val += (cumfreq[12] > x) ? 0 : 1;
val *= 4;
// Fine step: count how many of the remaining three entries in the selected
// group are <= x. The group's first entry (cffine[0]) is not re-tested.
const U16 *cffine = cumfreq + val;
val += (cffine[ 1] > x) ? 0 : 1;
val += (cffine[ 2] > x) ? 0 : 1;
val += (cffine[ 3] > x) ? 0 : 1;
#endif
// Sanity check: the index found must lie in [0, nsyms).
RR_ASSERT( val >= 0 && val < nsyms );
// start is the table entry at val; range is the distance to the next entry.
// NOTE(review): getcumfreq is defined elsewhere; presumably it returns the
// cumulative frequency at index val + 1, including the one-past-the-end
// case -- confirm.
U32 start = cumfreq[val];
U32 range = getcumfreq(val + 1) - start;
// cumfreq0, cumfreq1, gt0, gt1 all used after this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment