Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created August 5, 2021 02:14
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rygorous/e045d4687ce6782c0f3d5091459f4728 to your computer and use it in GitHub Desktop.
Save rygorous/e045d4687ce6782c0f3d5091459f4728 to your computer and use it in GitHub Desktop.
multigetbits
static inline __m128i prefix_sum_u8(__m128i x)
{
#if 1
// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
// for us)
x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
#else
// x[0], x[1], x[2], x[3], ...
x = _mm_add_epi8(x, _mm_slli_si128(x, 1));
// x[0], sum(x[0:2]), sum(x[1:3]), sum(x[2:4]), ...
x = _mm_add_epi8(x, _mm_slli_si128(x, 2));
// x[0], sum(x[0:2]), sum(x[0:3]), sum(x[0:4]), sum(x[1:5]), sum(x[2:6]), ...
x = _mm_add_epi8(x, _mm_slli_si128(x, 4));
// longest group now sums over 8 elems
x = _mm_add_epi8(x, _mm_slli_si128(x, 8));
#endif
// and now we're done
return x;
}
// Return the 16 bytes covering bits [bit_basepos, bit_basepos+128) of the
// MSB-first bit stream at in_ptr, i.e. a big-endian funnel shift of the
// stream left by (bit_basepos & 7) bits. Reads 17 bytes starting at
// in_ptr + (bit_basepos >> 3).
static inline __m128i big_endian_load_shift128(const uint8_t *in_ptr, uint32_t bit_basepos)
{
    // Two overlapping unaligned loads one byte apart give every byte and
    // its successor.
    const uint8_t *base = in_ptr + (bit_basepos >> 3);
    __m128i cur = _mm_loadu_si128((const __m128i *)base);
    __m128i nxt = _mm_loadu_si128((const __m128i *)(base + 1));
    // No 128-bit big-endian funnel shift exists, so improvise: pair each
    // byte with its successor into a big-endian 16-bit word (current byte
    // in the high half — note PUNPCKxBW puts its FIRST operand in the low
    // byte), shift every word left by the residual bit count, then keep
    // each word's high byte. That way the bits pulled in across a byte
    // boundary come from the correct neighboring stream byte despite the
    // little-endian lane order.
    __m128i amt = _mm_cvtsi32_si128((int)(bit_basepos & 7));
    __m128i lo_words = _mm_unpacklo_epi8(nxt, cur);
    __m128i hi_words = _mm_unpackhi_epi8(nxt, cur);
    lo_words = _mm_sll_epi16(lo_words, amt);
    hi_words = _mm_sll_epi16(hi_words, amt);
    __m128i lo_bytes = _mm_srli_epi16(lo_words, 8);
    __m128i hi_bytes = _mm_srli_epi16(hi_words, 8);
    return _mm_packus_epi16(lo_bytes, hi_bytes);
}
// individual field_widths in [0,8]
// MSB-first bit packing convention, SSSE3+
//
// Once more, with feeling
// trying to figure out a niftier way to do this that'll also allow me to do
// wider variants that don't suck
//
// Extracts 16 bit fields from the MSB-first bit stream at in_ptr, starting
// *pbit_basepos bits in. Byte lane i of field_widths gives the width (0..8)
// of field i; byte lane i of the result receives that field's bits,
// right-aligned. *pbit_basepos is advanced past all 16 fields.
static inline __m128i multigetbits8b(const uint8_t *in_ptr, uint32_t *pbit_basepos, __m128i field_widths)
{
uint32_t bit_basepos = *pbit_basepos;
// Prefix-sum the field widths and advance bit position pointer.
// Lane i of end_bit_index is the bit offset (relative to bit_basepos) one
// past the end of field i; lane 15 is thus the total bit count consumed.
__m128i end_bit_index = prefix_sum_u8(field_widths);
uint32_t total_width = (uint32_t)_mm_extract_epi16(end_bit_index, 7) >> 8; // no PEXTRB before SSE4.1, and this is the only place where SSE4.1+ helps
*pbit_basepos = bit_basepos + total_width;
// Doing this shift is a bit of a production, but it simplifies the rest:
// afterwards the stream effectively starts at a byte boundary.
__m128i shifted_src_bytes = big_endian_load_shift128(in_ptr, bit_basepos);
// Index of the byte that holds each field's final bit. The 16-bit shift
// drags bits across byte-lane boundaries, hence the AND to clean them up.
__m128i end_byte_index = _mm_and_si128(_mm_srli_epi16(end_bit_index, 3), _mm_set1_epi8(0x1f)); // no "shift bytes", sigh.
// Grab first/second bytes for every lane. For a field ending in byte 0,
// byte0's index underflows to -1, whose sign bit makes PSHUFB produce 0 —
// exactly what we want there.
__m128i byte1 = _mm_shuffle_epi8(shifted_src_bytes, end_byte_index);
__m128i byte0 = _mm_shuffle_epi8(shifted_src_bytes, _mm_sub_epi8(end_byte_index, _mm_set1_epi8(1)));
// Assemble words (byte0 << 8) | byte1: the big-endian byte pair covering
// the last up-to-16 bits of each field. (PUNPCKxBW places its first
// operand in the low byte of each word, so byte1 — the later stream
// byte — lands in the low half.)
__m128i words0 = _mm_unpacklo_epi8(byte1, byte0);
__m128i words1 = _mm_unpackhi_epi8(byte1, byte0);
// Now do a left shift via a multiply by 1 << (end_bit_index & 7),
// putting the end of the bit field at the boundary between the low and high byte
// in every word.
__m128i end_bit_index7 = _mm_and_si128(end_bit_index, _mm_set1_epi8(7));
__m128i left_shift_lut = _mm_setr_epi8(1,2,4,8, 16,32,64,-128, 1,2,4,8, 16,32,64,-128);
__m128i shiftm = _mm_shuffle_epi8(left_shift_lut, end_bit_index7);
// Widen the per-lane shift factors to 16-bit multipliers (high byte 0).
__m128i shift_mul0 = _mm_unpacklo_epi8(shiftm, _mm_setzero_si128());
__m128i shift_mul1 = _mm_unpackhi_epi8(shiftm, _mm_setzero_si128());
__m128i shifted0 = _mm_mullo_epi16(words0, shift_mul0);
__m128i shifted1 = _mm_mullo_epi16(words1, shift_mul1);
// Grab the high byte of the results and pack back into bytes
__m128i shifted_bytes = _mm_packus_epi16(_mm_srli_epi16(shifted0, 8), _mm_srli_epi16(shifted1, 8));
// mask by field width, again using a PSHUFB LUT (width 0 => mask 0)
__m128i width_mask_lut = _mm_setr_epi8(0,1,3,7, 15,31,63,127, -1,-1,-1,-1, -1,-1,-1,-1);
__m128i width_mask = _mm_shuffle_epi8(width_mask_lut, field_widths);
__m128i result = _mm_and_si128(shifted_bytes, width_mask);
return result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment