multigetbits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Inclusive prefix sum over the 16 unsigned 8-bit lanes of x.
 *
 * Output lane i holds x[0] + x[1] + ... + x[i], reduced modulo 256
 * (_mm_add_epi8 wraps on overflow). Only SSE2 operations are used.
 *
 * This is a log-step scan: each step adds a copy of the vector moved up by
 * 1, 2, 4 and finally 8 byte lanes. _mm_slli_si128 shifts zero bytes in at
 * the low end, so lanes that have no partner that far down stay unchanged.
 */
static inline __m128i prefix_sum_u8(__m128i x)
{
    // x[0], x[1], x[2], x[3], ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 1));
    // x[0], sum(x[0:2]), sum(x[1:3]), sum(x[2:4]), ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 2));
    // x[0], sum(x[0:2]), sum(x[0:3]), sum(x[0:4]), sum(x[1:5]), sum(x[2:6]), ...
    x = _mm_add_epi8(x, _mm_slli_si128(x, 4));
    // longest group now sums over 8 elements
    x = _mm_add_epi8(x, _mm_slli_si128(x, 8));
    // every lane i now covers x[0..i]
    return x;
}
| static inline __m128i multigetbits8(const uint8_t *in_ptr, uint32_t *pbit_basepos, __m128i field_widths) | |
| { | |
| uint32_t bit_basepos = *pbit_basepos; | |
| // prefix-sum the field widths and advance bit position pointer | |
| __m128i summed_widths = prefix_sum_u8(field_widths); | |
| uint32_t total_width = (uint32_t)_mm_extract_epi16(summed_widths, 7) >> 8; // no PEXTRB before SSE4.1, and this is the only place where SSE4.1+ helps | |
| *pbit_basepos = bit_basepos + total_width; | |
| // determine starting bit position for every lane | |
| // and split into bit-within-byte and byte indices | |
| __m128i basepos_u8 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bit_basepos & 7), _mm_setzero_si128()); | |
| __m128i first_bit_index = _mm_add_epi8(basepos_u8, _mm_slli_si128(summed_widths, 1)); | |
| __m128i first_byte_index = _mm_and_si128(_mm_srli_epi16(first_bit_index, 3), _mm_set1_epi8(0x1f)); // no "shift bytes", sigh. | |
| // source bytes | |
| __m128i src_byte0 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 0)); | |
| __m128i src_byte1 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 1)); | |
| // first/second bytes for every lane | |
| __m128i byte0 = _mm_shuffle_epi8(src_byte0, first_byte_index); | |
| __m128i byte1 = _mm_shuffle_epi8(src_byte1, first_byte_index); | |
| // assemble words | |
| __m128i words0 = _mm_unpacklo_epi8(byte1, byte0); | |
| __m128i words1 = _mm_unpackhi_epi8(byte1, byte0); | |
| // now, need to shift | |
| // ((byte0<<8) | byte1) >> (16 - width - (first_bit_index & 7)) | |
| // we don't have per-lane variable shifts in SSSE3, but we do have PMULHUW, | |
| // and we can do the multiplier table lookup via PSHUFB. | |
| __m128i shift_amt = _mm_add_epi8(_mm_and_si128(first_bit_index, _mm_set1_epi8(7)), field_widths); | |
| __m128i shiftm0_lut = _mm_setr_epi8(0x01,0x02,0x04,0x08, 0x10,0x20,0x40,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00); | |
| __m128i shiftm1_lut = _mm_setr_epi8(0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x01,0x02,0x04,0x08, 0x10,0x20,0x40,0x80); | |
| __m128i shiftm0 = _mm_shuffle_epi8(shiftm0_lut, shift_amt); | |
| __m128i shiftm1 = _mm_shuffle_epi8(shiftm1_lut, shift_amt); | |
| __m128i shift_mul0 = _mm_unpacklo_epi8(shiftm0, shiftm1); | |
| __m128i shift_mul1 = _mm_unpackhi_epi8(shiftm0, shiftm1); | |
| __m128i shifted0 = _mm_mulhi_epu16(words0, shift_mul0); | |
| __m128i shifted1 = _mm_mulhi_epu16(words1, shift_mul1); | |
| // pack the results back into bytes | |
| __m128i byte_mask = _mm_set1_epi16(0xff); | |
| __m128i shifted_bytes = _mm_packus_epi16(_mm_and_si128(shifted0, byte_mask), _mm_and_si128(shifted1, byte_mask)); | |
| // mask by field width, again using a PSHUFB LUT | |
| __m128i width_mask_lut = _mm_setr_epi8(0,1,3,7, 15,31,63,127, -1,-1,-1,-1, -1,-1,-1,-1); | |
| __m128i width_mask = _mm_shuffle_epi8(width_mask_lut, field_widths); | |
| __m128i result = _mm_and_si128(shifted_bytes, width_mask); | |
| return result; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment