// multigetbits8 — extract 16 variable-width bitfields from a bit stream
// in one shot using SSSE3 (PSHUFB) and SSE2 integer intrinsics.
#include <stdint.h>
#include <tmmintrin.h> // SSSE3 (_mm_shuffle_epi8) + everything below it
// Inclusive prefix sum of the 16 unsigned bytes of v, modulo 256.
// Log-step scan: each round adds in a copy of the vector shifted
// toward higher lanes by 1, 2, 4, then 8 bytes, doubling the span
// each lane has summed. After round k, lane i holds
// sum(v[max(0, i - 2^k + 1) .. i]); after all four rounds, lane i
// holds sum(v[0..i]).
static inline __m128i prefix_sum_u8(__m128i v)
{
    __m128i sums = v;
    sums = _mm_add_epi8(_mm_slli_si128(sums, 1), sums); // span 2
    sums = _mm_add_epi8(_mm_slli_si128(sums, 2), sums); // span 4
    sums = _mm_add_epi8(_mm_slli_si128(sums, 4), sums); // span 8
    sums = _mm_add_epi8(_mm_slli_si128(sums, 8), sums); // span 16: full prefix sums
    return sums;
}
// Extract 16 consecutive variable-width bitfields from the bit stream at
// in_ptr, starting at bit *pbit_basepos, returning field i in byte lane i
// of the result. *pbit_basepos is advanced past all 16 fields.
//
// field_widths gives the 16 per-lane widths in bits; the LUTs below only
// cover widths 0..8, so each width must be in that range. Bits are taken
// MSB-first within each byte (see the shift math at "now, need to shift").
//
// Caller contract (implied by the code, not checked here):
//  - the two unaligned loads read 17 bytes starting at
//    in_ptr + (*pbit_basepos >> 3); all of them must be readable.
//  - (start bit & 7) + sum of all 16 widths must stay <= 127 so every
//    per-lane bit index fits in 7 bits; a set bit 7 would make PSHUFB
//    zero the lane, and would also leak into the 0x1f byte-index mask.
static inline __m128i multigetbits8(const uint8_t *in_ptr, uint32_t *pbit_basepos, __m128i field_widths)
{
uint32_t bit_basepos = *pbit_basepos;
// prefix-sum the field widths and advance bit position pointer
__m128i summed_widths = prefix_sum_u8(field_widths);
// Lane 15 of the inclusive prefix sum is the total bit count; grab it as
// the high byte of 16-bit lane 7.
uint32_t total_width = (uint32_t)_mm_extract_epi16(summed_widths, 7) >> 8; // no PEXTRB before SSE4.1, and this is the only place where SSE4.1+ helps
*pbit_basepos = bit_basepos + total_width;
// determine starting bit position for every lane
// and split into bit-within-byte and byte indices
// Broadcast (bit_basepos & 7) to all 16 lanes (PSHUFB with all-zero indices).
__m128i basepos_u8 = _mm_shuffle_epi8(_mm_cvtsi32_si128(bit_basepos & 7), _mm_setzero_si128());
// Shifting the inclusive prefix sum up one lane makes it exclusive, so
// lane i = bit offset of field i's first bit, relative to the byte at
// in_ptr + (bit_basepos >> 3).
__m128i first_bit_index = _mm_add_epi8(basepos_u8, _mm_slli_si128(summed_widths, 1));
// Byte index = bit index >> 3. PSRLW shifts 16-bit lanes, so the low 3
// bits of each odd byte bleed into bits 5..7 of the even byte below it;
// the 0x1f mask clears that contamination (indices themselves are <= 15).
__m128i first_byte_index = _mm_and_si128(_mm_srli_epi16(first_bit_index, 3), _mm_set1_epi8(0x1f)); // no "shift bytes", sigh.
// source bytes: two overlapping unaligned loads, one byte apart, so that
// for any lane, byte k and byte k+1 of the stream are both available.
__m128i src_byte0 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 0));
__m128i src_byte1 = _mm_loadu_si128((const __m128i *) (in_ptr + (bit_basepos >> 3) + 1));
// first/second bytes for every lane
__m128i byte0 = _mm_shuffle_epi8(src_byte0, first_byte_index);
__m128i byte1 = _mm_shuffle_epi8(src_byte1, first_byte_index);
// assemble words: unpack interleaves (byte1, byte0) pairs, so each 16-bit
// lane holds (byte0 << 8) | byte1 — a 16-bit window whose top bit is the
// MSB of the lane's first stream byte.
__m128i words0 = _mm_unpacklo_epi8(byte1, byte0);
__m128i words1 = _mm_unpackhi_epi8(byte1, byte0);
// now, need to shift
// ((byte0<<8) | byte1) >> (16 - width - (first_bit_index & 7))
// we don't have per-lane variable shifts in SSSE3, but we do have PMULHUW,
// and we can do the multiplier table lookup via PSHUFB.
// PMULHUW computes (word * mul) >> 16; with mul = 1 << shift_amt that is
// word >> (16 - shift_amt), i.e. exactly the shift above with
// shift_amt = (first_bit_index & 7) + width, which ranges 0..15.
__m128i shift_amt = _mm_add_epi8(_mm_and_si128(first_bit_index, _mm_set1_epi8(7)), field_widths);
// Two LUTs build the 16-bit multiplier 1 << shift_amt one byte at a time:
// shiftm0 supplies the low byte (nonzero for shift_amt 0..7), shiftm1 the
// high byte (nonzero for shift_amt 8..15).
__m128i shiftm0_lut = _mm_setr_epi8(0x01,0x02,0x04,0x08, 0x10,0x20,0x40,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00);
__m128i shiftm1_lut = _mm_setr_epi8(0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x01,0x02,0x04,0x08, 0x10,0x20,0x40,0x80);
__m128i shiftm0 = _mm_shuffle_epi8(shiftm0_lut, shift_amt);
__m128i shiftm1 = _mm_shuffle_epi8(shiftm1_lut, shift_amt);
__m128i shift_mul0 = _mm_unpacklo_epi8(shiftm0, shiftm1);
__m128i shift_mul1 = _mm_unpackhi_epi8(shiftm0, shiftm1);
__m128i shifted0 = _mm_mulhi_epu16(words0, shift_mul0);
__m128i shifted1 = _mm_mulhi_epu16(words1, shift_mul1);
// pack the results back into bytes (mask first so PACKUSWB never saturates)
__m128i byte_mask = _mm_set1_epi16(0xff);
__m128i shifted_bytes = _mm_packus_epi16(_mm_and_si128(shifted0, byte_mask), _mm_and_si128(shifted1, byte_mask));
// mask by field width, again using a PSHUFB LUT: mask = (1 << width) - 1
// for widths 0..7, all-ones for width 8, clearing stray high bits of the
// 16-bit window that don't belong to the field.
__m128i width_mask_lut = _mm_setr_epi8(0,1,3,7, 15,31,63,127, -1,-1,-1,-1, -1,-1,-1,-1);
__m128i width_mask = _mm_shuffle_epi8(width_mask_lut, field_widths);
__m128i result = _mm_and_si128(shifted_bytes, width_mask);
return result;
}