@rygorous
Created September 6, 2022 03:15
MSB-first -> LSB-first Huff table transpose (x86/SSE2 version)

// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab
struct KrakenMSBHuffTab
{
    U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset
    U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset
};

// NOTE: must match what the ASM inner loops expect (or disable them above)!
struct KrakenHuffElem
{
    U8 len; // code len
    U8 sym; // symbol
};

struct KrakenHuffTab
{
    KrakenHuffElem e[NEWLZ_HUFF_DECODE_TABLE_SIZE];
};

// default layout
static void newLZ_bit_reverse_hufftab(KrakenHuffTab * to_table, const KrakenMSBHuffTab * msbHuff)
{
    SIMPLEPROFILE_SCOPE(huff_bitrev_default_sse2);

    RR_COMPILER_ASSERT( ( NEWLZ_HUFF_DECODE_TABLE_SIZE % 64 ) == 0 );
    UINTa e = NEWLZ_HUFF_DECODE_TABLE_SIZE/8;

    // NOTE(fg):
    // The idea here is to load a square 8x8 group of table entries, transpose
    // them (8x8 matrix transpose), and then store it back.
    //
    // Using bit-reversed indices on both the loads and stores makes the whole
    // process equivalent to a large index bit-reverse.
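    //
    // Assuming newlz_huff_bitreverse reverses an index over
    // log2(NEWLZ_HUFF_DECODE_TABLE_SIZE) bits, the net effect is the scalar
    // reference loop
    //
    //   for (UINTa i = 0; i < NEWLZ_HUFF_DECODE_TABLE_SIZE; i++)
    //   {
    //       UINTa ri = newlz_huff_bitreverse(static_cast<U32>(i));
    //       to_table->e[ri].len = msbHuff->len[i];
    //       to_table->e[ri].sym = msbHuff->sym[i];
    //   }
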
#define interleave_rows(a,b) t = a; a = _mm_unpacklo_epi16(a,b); b = _mm_unpackhi_epi16(t, b)
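
    // interleave_rows(a,b) replaces the pair (a,b) with the low and high 16-bit
    // interleaves of the two rows; three passes pairing rows at distance 4,
    // then 2, then 1 amount to an 8x8 transpose of 16-bit elements.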
    for (UINTa s = 0; s < e; s += 8)
    {
        UINTa rs = newlz_huff_bitreverse(static_cast<U32>(s)); // could use a dedicated table for this but meh
        __m128i r0,r1,r2,r3,r4,r5,r6,r7,t;

        // syms: read rows (bit-reversed index order)
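        // (row j of the 8x8 block is read from slice e*bitrev3(j), hence the
        // 0,4,2,6,1,5,3,7 offsets below; the stores use the same row order)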
        r0 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*0));
        r1 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*4));
        r2 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*2));
        r3 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*6));
        r4 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*1));
        r5 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*5));
        r6 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*3));
        r7 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*7));

        // lens: read rows (bit-reversed index order), interleave with 8-bit packed syms
        r0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*0)), r0);
        r1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*4)), r1);
        r2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*2)), r2);
        r3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*6)), r3);
        r4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*1)), r4);
        r5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*5)), r5);
        r6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*3)), r6);
        r7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*7)), r7);
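
        // each 16-bit lane now holds a (len, sym) byte pair, in the same byte
        // order as KrakenHuffElem in memory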
        // transpose pass 1
        interleave_rows(r0, r4);
        interleave_rows(r1, r5);
        interleave_rows(r2, r6);
        interleave_rows(r3, r7);

        // transpose pass 2
        interleave_rows(r0, r2);
        interleave_rows(r1, r3);
        interleave_rows(r4, r6);
        interleave_rows(r5, r7);

        // transpose pass 3
        interleave_rows(r0, r1);
        interleave_rows(r2, r3);
        interleave_rows(r4, r5);
        interleave_rows(r6, r7);
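
        // r0..r7 now hold the columns of the loaded 8x8 block: lane c of rj
        // is the entry that was loaded into lane j of rc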
#define STORE_ROW(dest, vals) _mm_store_si128((__m128i *) (dest), (vals))

        // store columns (bit-reversed index order)
        STORE_ROW(to_table->e + s + e*0, r0);
        STORE_ROW(to_table->e + s + e*4, r1);
        STORE_ROW(to_table->e + s + e*2, r2);
        STORE_ROW(to_table->e + s + e*6, r3);
        STORE_ROW(to_table->e + s + e*1, r4);
        STORE_ROW(to_table->e + s + e*5, r5);
        STORE_ROW(to_table->e + s + e*3, r6);
        STORE_ROW(to_table->e + s + e*7, r7);
#undef STORE_ROW
    }
#undef interleave_rows
}