Reduction variants
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| lzna_simd_vec xvec = vdupq_n_u16((U16) x); | |
| lzna_simd_vec cumfreq0 = vld1q_u16(&cumfreq[0]); | |
| lzna_simd_vec cumfreq1 = vld1q_u16(&cumfreq[8]); | |
| // Compare all 16 cumfreq entries against x at once: lane i of gt0/gt1 is all-ones where cumfreq[i] > x, zero otherwise. | |
| lzna_simd_vec gt0 = vcgtq_u16(cumfreq0, xvec); | |
| lzna_simd_vec gt1 = vcgtq_u16(cumfreq1, xvec); | |
| #if 0 | |
| // Variant (disabled): clear the lane indices where cumfreq > x, take the horizontal max of each half, then the max of the two. 42.5MB/s on iPad Air 2 | |
| lzna_simd_vec lanes0 = { 0, 1, 2, 3, 4, 5, 6, 7 }; | |
| lzna_simd_vec lanes1 = { 8, 9,10,11, 12,13,14,15 }; | |
| U16 val0 = vmaxvq_u16(vbicq_u16(lanes0, gt0)); | |
| U16 val1 = vmaxvq_u16(vbicq_u16(lanes1, gt1)); | |
| val = RR_MAX(val0, val1); | |
| #elif 0 | |
| // Variant (disabled): narrow both masks to bytes so a single 16-lane u8 horizontal max over the surviving lane indices suffices. 41.9MB/s on iPad Air 2 | |
| uint8x16_t lanes = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }; | |
| uint8x16_t packed = vcombine_u8(vmovn_u16(gt0), vmovn_u16(gt1)); | |
| uint8x16_t masked_lanes = vbicq_u8(lanes, packed); // lanes & ~(cumfreq > x) = lanes & (cumfreq <= x) | |
| val = vmaxvq_u8(masked_lanes); | |
| #elif 0 | |
| // Variant (disabled): viewed as s16, a set mask lane is -1, so 15 plus both horizontal sums equals 15 - count(cumfreq > x). 45.8MB/s on iPad Air 2 | |
| val = 15 + vaddvq_s16(U16toS16(gt0)) + vaddvq_s16(U16toS16(gt1)); | |
| #elif 0 | |
| // Variant (disabled): add the two masks lane-wise first (each lane then in [-2,0] as s16), then do one horizontal sum. 44.8MB/s on iPad Air 2 | |
| val = 15 + vaddvq_s16(U16toS16(vaddq_u16(gt0, gt1))); | |
| #elif 0 | |
| // Variant (disabled): explicit pairwise reduction tree instead of the across-vector add. 38.8MB/s on iPad Air 2 | |
| // reduce 16 vals -> 8 in [-2,0] (pairwise widening add: four s32 sums per input vector) | |
| int32x4_t suma0 = vpaddlq_s16(U16toS16(gt0)); | |
| int32x4_t suma1 = vpaddlq_s16(U16toS16(gt1)); | |
| // reduce 8 vals -> 4 in [-4,0] (pairwise add of each vector's low and high halves) | |
| int32x2_t sumb0 = vpadd_s32(vget_low_s32(suma0), vget_high_s32(suma0)); | |
| int32x2_t sumb1 = vpadd_s32(vget_low_s32(suma1), vget_high_s32(suma1)); | |
| // reduce 4 vals -> 2 in [-8,0]; the last two are added as scalars when forming val | |
| int32x2_t sumc = vadd_s32(sumb0, sumb1); | |
| val = 15 + vget_lane_s32(sumc, 0) + vget_lane_s32(sumc, 1); | |
| #else | |
| // Active variant: scalar two-level search (coarse step picks a group of 4 via cumfreq[4], [8], [12]; fine step tests the group's entries 1..3). Assumes cumfreq is non-decreasing - TODO confirm. 47.7MB/s on iPad Air 2, the fastest figure listed here | |
| val = (cumfreq[ 4] > x) ? 0 : 1; | |
| val += (cumfreq[ 8] > x) ? 0 : 1; | |
| val += (cumfreq[12] > x) ? 0 : 1; | |
| val *= 4; | |
| const U16 *cffine = cumfreq + val; | |
| val += (cffine[ 1] > x) ? 0 : 1; | |
| val += (cffine[ 2] > x) ? 0 : 1; | |
| val += (cffine[ 3] > x) ? 0 : 1; | |
| #endif | |
| RR_ASSERT( val >= 0 && val < nsyms ); | |
| U32 start = cumfreq[val]; | |
| U32 range = getcumfreq(val + 1) - start; | |
| // cumfreq0, cumfreq1, gt0, gt1 all used after this (by code outside this excerpt), so the vector loads and compares above stay live whichever variant is selected |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment