This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// for this code:
// (see cbloom rant 10-17-15 http://www.cbloom.com/rants.html for context)
//
// this is completely untested, it's just a sketch
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
// Width in bits of the window peeked from the bit stream for every symbol;
// it is also the `len` argument handed to _bextr_u64 by the decoder.
static const uint32_t CODELEN_LIMIT = UINT32_C(12);
/*
 * Load the eight bytes at decode_in[0..7] and return them as one 64-bit
 * value with decode_in[0] in the most significant byte.
 *
 * The bytes are fetched with memcpy instead of through a casted pointer:
 * decode_in then needs no particular alignment, and the uint8_t buffer is
 * never accessed through a uint64_t lvalue (which the aliasing rules forbid).
 * Compilers fold the memcpy into a single load.
 *
 * The unconditional byte swap means the result is big-endian only on a
 * little-endian host.
 *
 * The caller must guarantee that all eight bytes are readable.
 */
static inline uint64_t read_be_u64(uint8_t const *decode_in)
{
    uint64_t raw;

    memcpy(&raw, decode_in, sizeof raw);
    return __builtin_bswap64(raw);
}
| void huffdec(uint8_t * decodeptr, uint8_t const * decode_in, uint8_t const * codelens, uint8_t const * symbols, size_t count) | |
| { | |
| // NB peekpos can get negative on last DECONE! | |
| // (since it's CODELEN_LIMIT bits *past* the actual end) | |
| // not a problem, just something to be aware of | |
| int32_t peekpos = 64 - CODELEN_LIMIT; | |
| while (count--) | |
| { | |
| // refill | |
| // note that the number of bits remaining in the buffer is | |
| // bitcount = peekpos + CODELEN_LIMIT; | |
| uint64_t bits = read_be_u64(decode_in); | |
| uint64_t bytes_advance = ((64 - CODELEN_LIMIT) - peekpos) >> 3; | |
| decode_in += bytes_advance; | |
| peekpos += bytes_advance << 3; | |
| uint64_t peek; | |
| int32_t cl, sym; | |
| #define DECONE() \ | |
| peek = _bextr_u64(bits, peekpos, CODELEN_LIMIT); \ | |
| cl = codelens[peek]; sym = symbols[peek]; \ | |
| peekpos -= cl; \ | |
| *decodeptr++ = (uint8_t)sym; | |
| DECONE(); | |
| DECONE(); | |
| DECONE(); | |
| DECONE(); | |
| #undef DECONE | |
| } | |
| } | |
| ; what I got was this: | |
# Compiler output for the huffdec loop. Register roles, read off the
# instructions below (they match the System V x86-64 argument order of
# huffdec):
#   rdi = decodeptr, rsi = decode_in, rdx = codelens, rcx = symbols, r8 = count
#   r9 = bits, r10 = bytes_advance (reused for cl), r11d = peekpos,
#   rax = BEXTR control word, then peek, then sym
# peekpos = 64 - CODELEN_LIMIT
mov r11d, 52
.align 16, 0x90
.LBB0_2: # %.lr.ph
# =>This Inner Loop Header: Depth=1
# bits = read_be_u64(decode_in)
movbe r9, qword ptr [rsi]
# bytes_advance = (52 - peekpos) >> 3
mov r10d, 52
sub r10d, r11d
shr r10d, 3
# decode_in += bytes_advance; peekpos += bytes_advance << 3
add rsi, r10
lea r11d, dword ptr [r11 + 8*r10]
# DECONE 1. BEXTR's control operand holds the start bit in bits 7:0 and the
# field length in bits 15:8, so the control word is rebuilt before every
# extract: low byte of peekpos OR 3072 (= CODELEN_LIMIT << 8).
movzx eax, r11b
or eax, 3072
bextr rax, r9, rax
# cl = codelens[peek]; sym = symbols[peek]; peekpos -= cl; decodeptr[0] = sym
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi], al
# DECONE 2 (same movzx/or rebuild of the control word)
movzx eax, r11b
or eax, 3072
bextr rax, r9, rax
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 1], al
# DECONE 3
movzx eax, r11b
or eax, 3072
bextr rax, r9, rax
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 2], al
# DECONE 4 (bits is dead afterwards, so peek is extracted into r9 itself)
movzx eax, r11b
or eax, 3072
bextr r9, r9, rax
movzx eax, byte ptr [rdx + r9]
sub r11d, eax
mov al, byte ptr [rcx + r9]
mov byte ptr [rdi + 3], al
# decodeptr += 4; loop while --count != 0
lea rdi, qword ptr [rdi + 4]
dec r8
jne .LBB0_2
; whereas what I wanted was this:
# Desired output for the same loop: r11d carries the complete BEXTR control
# word (peekpos in bits 7:0, CODELEN_LIMIT << 8 = 3072 in bits 15:8), so no
# movzx/or pair is needed in front of each bextr.
# Other registers, as used below:
#   rdi = decodeptr, rsi = decode_in, rdx = codelens, rcx = symbols, r8 = count
#   r9 = bits, r10 = bytes_advance (reused for cl), rax = peek / sym
mov r11d, 52 + 3072
.align 16, 0x90
.LBB0_2: # %.lr.ph
# =>This Inner Loop Header: Depth=1
# bits = read_be_u64(decode_in)
movbe r9, qword ptr [rsi]
# bytes_advance = (52 - peekpos) >> 3
# NOTE(review): r11d now includes the 3072 length field, so this constant
# presumably has to be 52 + 3072 as well for the subtraction to yield
# 52 - peekpos -- confirm.
mov r10d, 52
sub r10d, r11d
shr r10d, 3
# decode_in += bytes_advance; peekpos += bytes_advance << 3
add rsi, r10
lea r11d, dword ptr [r11 + 8*r10]
# DECONE 1: r11 is used directly as the BEXTR control operand
bextr rax, r9, r11
# cl = codelens[peek]; sym = symbols[peek]; peekpos -= cl; decodeptr[0] = sym
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi], al
# DECONE 2
bextr rax, r9, r11
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 1], al
# DECONE 3
bextr rax, r9, r11
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 2], al
# DECONE 4 (bits is dead afterwards, so peek is extracted into r9 itself)
bextr r9, r9, r11
movzx eax, byte ptr [rdx + r9]
sub r11d, eax
mov al, byte ptr [rcx + r9]
mov byte ptr [rdi + 3], al
# decodeptr += 4; loop while --count != 0
lea rdi, qword ptr [rdi + 4]
dec r8
jne .LBB0_2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment