Created
September 6, 2022 03:15
-
-
Save rygorous/94b044a7c37ae9119286faeec33ea77b to your computer and use it in GitHub Desktop.
MSB-first -> LSB-first Huff table transpose (x86/SSE2 version)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab | |
struct KrakenMSBHuffTab | |
{ | |
U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset | |
U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset | |
}; | |
// NOTE: must match what the ASM inner loops expect (or disable them above)! | |
struct KrakenHuffElem | |
{ | |
U8 len; // code len | |
U8 sym; // symbol | |
}; | |
struct KrakenHuffTab | |
{ | |
KrakenHuffElem e[NEWLZ_HUFF_DECODE_TABLE_SIZE]; | |
}; | |
// default layout | |
static void newLZ_bit_reverse_hufftab(KrakenHuffTab * to_table, const KrakenMSBHuffTab * msbHuff) | |
{ | |
SIMPLEPROFILE_SCOPE(huff_bitrev_default_sse2); | |
RR_COMPILER_ASSERT( ( NEWLZ_HUFF_DECODE_TABLE_SIZE % 64 ) == 0 ); | |
UINTa e = NEWLZ_HUFF_DECODE_TABLE_SIZE/8; | |
// NOTE(fg): | |
// The idea here is to load a square 8x8 group of table entries, transpose | |
// them (8x8 matrix transpose), and then store it back. | |
// | |
// Using bit-reversed indices on both the loads and stores makes the whole | |
// process equivalent to a large index bit-reverse. | |
#define interleave_rows(a,b) t = a; a = _mm_unpacklo_epi16(a,b); b = _mm_unpackhi_epi16(t, b) | |
for (UINTa s = 0; s < e; s += 8) | |
{ | |
UINTa rs = newlz_huff_bitreverse(static_cast<U32>(s)); // could use a dedicated table for this but meh | |
__m128i r0,r1,r2,r3,r4,r5,r6,r7,t; | |
// syms: read rows (bit-reversed index order) | |
r0 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*0)); | |
r1 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*4)); | |
r2 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*2)); | |
r3 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*6)); | |
r4 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*1)); | |
r5 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*5)); | |
r6 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*3)); | |
r7 = _mm_loadl_epi64((__m128i const *) (msbHuff->sym + rs + e*7)); | |
// lens: read rows (bit-reversed index order), interleave with 8-bit packed syms | |
r0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*0)), r0); | |
r1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*4)), r1); | |
r2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*2)), r2); | |
r3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*6)), r3); | |
r4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*1)), r4); | |
r5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*5)), r5); | |
r6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*3)), r6); | |
r7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *) (msbHuff->len + rs + e*7)), r7); | |
// transpose pass 1 | |
interleave_rows(r0, r4); | |
interleave_rows(r1, r5); | |
interleave_rows(r2, r6); | |
interleave_rows(r3, r7); | |
// transpose pass 2 | |
interleave_rows(r0, r2); | |
interleave_rows(r1, r3); | |
interleave_rows(r4, r6); | |
interleave_rows(r5, r7); | |
// transpose pass 3 | |
interleave_rows(r0, r1); | |
interleave_rows(r2, r3); | |
interleave_rows(r4, r5); | |
interleave_rows(r6, r7); | |
#define STORE_ROW(dest, vals) _mm_store_si128((__m128i *) (dest), (vals)) | |
// store columns (bit-reversed index order) | |
STORE_ROW(to_table->e + s + e*0, r0); | |
STORE_ROW(to_table->e + s + e*4, r1); | |
STORE_ROW(to_table->e + s + e*2, r2); | |
STORE_ROW(to_table->e + s + e*6, r3); | |
STORE_ROW(to_table->e + s + e*1, r4); | |
STORE_ROW(to_table->e + s + e*5, r5); | |
STORE_ROW(to_table->e + s + e*3, r6); | |
STORE_ROW(to_table->e + s + e*7, r7); | |
#undef STORE_ROW | |
} | |
#undef interleave_rows | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment