Skip to content

Instantly share code, notes, and snippets.

@rygorous
Last active October 22, 2015 05:26
Embed
What would you like to do?
// for this code:
// (see cbloom rant 10-17-15 http://www.cbloom.com/rants.html for context)
//
// this is completely untested, it's just a sketch
#include <immintrin.h>
#include <stdint.h>
// Width, in bits, of the window DECONE peeks from the bit buffer. The peeked
// value is used directly as the index into codelens[] and symbols[].
static const uint32_t CODELEN_LIMIT = 12;
// Load the 8 bytes starting at decode_in and return them byte-swapped, so
// that decode_in[0] ends up in the most significant byte of the result.
//
// The bytes are copied out with __builtin_memcpy rather than by dereferencing
// a casted uint64_t pointer: decode_in carries no alignment guarantee, and
// reading a uint8_t buffer through a uint64_t lvalue violates strict
// aliasing. Compilers fold the fixed-size copy into a single load.
//
// The byte swap is unconditional, i.e. this assumes a little-endian host.
static inline uint64_t read_be_u64(uint8_t const *decode_in)
{
    uint64_t raw;
    __builtin_memcpy(&raw, decode_in, sizeof(raw));
    return __builtin_bswap64(raw);
}
// Table-driven Huffman decode sketch. Writes 4 * count bytes to decodeptr.
//
//   decodeptr - output buffer; receives one byte per decoded symbol
//   decode_in - input bitstream, consumed most-significant-bit first
//   codelens  - indexed by a CODELEN_LIMIT-bit peek; bits to consume
//   symbols   - indexed by the same peek; the byte to emit
//   count     - number of loop iterations; each one decodes 4 symbols
//
// Every iteration loads a full 8 bytes at the current decode_in, so the
// input must remain readable for 8 bytes from the current read position.
void huffdec(uint8_t * decodeptr, uint8_t const * decode_in, uint8_t const * codelens, uint8_t const * symbols, size_t count)
{
    // peekpos is the bit index (counted from the LSB of `bits`) of the
    // CODELEN_LIMIT-bit window that holds the next code. The number of
    // unread bits in the buffer is therefore peekpos + CODELEN_LIMIT.
    //
    // NB peekpos can get negative after the last DECONE of an iteration
    // (since it's CODELEN_LIMIT bits *past* the actual end of the buffer).
    // Not a problem: the refill below rebases it before the next peek, and
    // the unsigned subtraction still yields the right byte count.
    int32_t peekpos = 64 - CODELEN_LIMIT;
    while (count--)
    {
        // refill: first step over the bytes that are already fully
        // consumed and rebase peekpos onto the new pointer, THEN load.
        // Loading before the advance would pair bits from the old position
        // with a peekpos that is relative to the new one.
        uint64_t bytes_advance = ((64 - CODELEN_LIMIT) - peekpos) >> 3;
        decode_in += bytes_advance;
        peekpos += bytes_advance << 3;
        uint64_t bits = read_be_u64(decode_in);

        // After the refill at least 64 - 7 = 57 bits are unread, which
        // covers four codes assuming every codelens[] entry is at most
        // CODELEN_LIMIT (4 * 12 = 48 bits).
        uint64_t peek;
        int32_t cl, sym;

        // DECONE: peek CODELEN_LIMIT bits, look up code length and symbol,
        // consume `cl` bits by lowering peekpos, and emit the symbol.
#define DECONE() \
        peek = _bextr_u64(bits, peekpos, CODELEN_LIMIT); \
        cl = codelens[peek]; sym = symbols[peek]; \
        peekpos -= cl; \
        *decodeptr++ = (uint8_t)sym;
        DECONE();
        DECONE();
        DECONE();
        DECONE();
#undef DECONE
    }
}
; what I got was this:
# Compiler output for the huffdec loop (Intel syntax).
# Register roles as used below, matching the first five integer arguments of
# the System V AMD64 convention:
#   rdi = decodeptr, rsi = decode_in, rdx = codelens, rcx = symbols,
#   r8  = count, r11d = peekpos, r9 = bits, r10/rax = scratch.
# Only the loop itself is shown; no function entry or zero-count check is
# part of this excerpt.
#
# peekpos = 64 - CODELEN_LIMIT = 52
mov r11d, 52
.align 16, 0x90
.LBB0_2: # %.lr.ph
# =>This Inner Loop Header: Depth=1
# bits = big-endian 64-bit load from the current rsi (issued before the
# pointer advance below)
movbe r9, qword ptr [rsi]
# bytes_advance = (52 - peekpos) >> 3
mov r10d, 52
sub r10d, r11d
shr r10d, 3
# decode_in += bytes_advance; peekpos += 8 * bytes_advance
add rsi, r10
lea r11d, dword ptr [r11 + 8*r10]
# --- symbol 0 ---
# Build the BEXTR control word: start = low byte of peekpos,
# length = 12 in bits 15:8 (3072 = 12 << 8). This movzx/or pair is repeated
# before every BEXTR.
movzx eax, r11b
or eax, 3072
# peek = 12-bit field of bits starting at peekpos
bextr rax, r9, rax
# cl = codelens[peek]
movzx r10d, byte ptr [rdx + rax]
# sym = symbols[peek]
mov al, byte ptr [rcx + rax]
# peekpos -= cl
sub r11d, r10d
mov byte ptr [rdi], al
# --- symbol 1 ---
movzx eax, r11b
or eax, 3072
bextr rax, r9, rax
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 1], al
# --- symbol 2 ---
movzx eax, r11b
or eax, 3072
bextr rax, r9, rax
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 2], al
# --- symbol 3 ---
# bits is not needed after this peek, so r9 is reused for the peek value
movzx eax, r11b
or eax, 3072
bextr r9, r9, rax
movzx eax, byte ptr [rdx + r9]
sub r11d, eax
mov al, byte ptr [rcx + r9]
mov byte ptr [rdi + 3], al
# decodeptr += 4 (one byte was stored per symbol)
lea rdi, qword ptr [rdi + 4]
# loop while --count != 0
dec r8
jne .LBB0_2
; when what I wanted is this:
# Desired code for the huffdec loop (Intel syntax).
# Register roles: rdi = decodeptr, rsi = decode_in, rdx = codelens,
# rcx = symbols, r8 = count, r9 = bits, r10/rax = scratch.
#
# r11d holds peekpos with the BEXTR length field pre-merged:
#   r11d = peekpos + (12 << 8) = peekpos + 3072
# so r11 can be fed to BEXTR directly (start in bits 7:0, length in bits
# 15:8) without a per-symbol movzx/or. Every update of r11d below is an
# add/sub of a small value, which leaves the bias intact.
mov r11d, 52 + 3072
.align 16, 0x90
.LBB0_2: # %.lr.ph
# =>This Inner Loop Header: Depth=1
# bits = big-endian 64-bit load from the current rsi. As in the compiler
# output this listing is modelled on, the load is issued before the pointer
# advance below.
movbe r9, qword ptr [rsi]
# bytes_advance = (52 - peekpos) >> 3.
# r11d carries the +3072 bias, so the constant must carry it too; a plain
# 52 here would compute 52 - peekpos - 3072 and advance rsi by garbage.
mov r10d, 52 + 3072
sub r10d, r11d
shr r10d, 3
# decode_in += bytes_advance; peekpos += 8 * bytes_advance (bias preserved)
add rsi, r10
lea r11d, dword ptr [r11 + 8*r10]
# --- symbol 0 ---
# peek = 12-bit field of bits starting at peekpos
bextr rax, r9, r11
# cl = codelens[peek]
movzx r10d, byte ptr [rdx + rax]
# sym = symbols[peek]
mov al, byte ptr [rcx + rax]
# peekpos -= cl
sub r11d, r10d
mov byte ptr [rdi], al
# --- symbol 1 ---
bextr rax, r9, r11
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 1], al
# --- symbol 2 ---
bextr rax, r9, r11
movzx r10d, byte ptr [rdx + rax]
mov al, byte ptr [rcx + rax]
sub r11d, r10d
mov byte ptr [rdi + 2], al
# --- symbol 3 ---
# bits is not needed after this peek, so r9 is reused for the peek value
bextr r9, r9, r11
movzx eax, byte ptr [rdx + r9]
sub r11d, eax
mov al, byte ptr [rcx + r9]
mov byte ptr [rdi + 3], al
# decodeptr += 4 (one byte was stored per symbol)
lea rdi, qword ptr [rdi + 4]
# loop while --count != 0
dec r8
jne .LBB0_2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment