Skip to content

Instantly share code, notes, and snippets.

@terrelln
Created November 20, 2023 20:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save terrelln/2e14ff1fb197102a08d7823d8044978d to your computer and use it in GitHub Desktop.
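objdump disassembly of huf_decompress.o (HUF_decompress4X2_usingDTable_internal_fast_c_loop and HUF_decompress4X1_usingDTable_internal_fast_c_loop) before the manual unrolling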
0000000000000450 <HUF_decompress4X2_usingDTable_internal_fast_c_loop>:
; {
450: f3 0f 1e fa endbr64
454: 41 57 pushq %r15
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
456: 4c 8d 4f 40 leaq 0x40(%rdi), %r9
; ZSTD_memcpy(&op, &args->op, sizeof(op));
45a: 48 8d 4f 20 leaq 0x20(%rdi), %rcx
; size_t iters = (size_t)(ip[0] - ilimit) / 7;
45e: 49 bf 93 24 49 92 24 49 92 24 movabsq $0x2492492492492493, %r15 # imm = 0x2492492492492493
; {
468: 41 56 pushq %r14
46a: 41 55 pushq %r13
46c: 4d 89 cd movq %r9, %r13
46f: 49 89 c9 movq %rcx, %r9
472: 48 89 f9 movq %rdi, %rcx
475: 41 54 pushq %r12
; size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
477: 49 bc cd cc cc cc cc cc cc cc movabsq $-0x3333333333333333, %r12 # imm = 0xCCCCCCCCCCCCCCCD
; {
481: 55 pushq %rbp
482: 53 pushq %rbx
483: 48 81 ec a8 00 00 00 subq $0xa8, %rsp
; ZSTD_memcpy(&op, &args->op, sizeof(op));
48a: 48 8b 57 28 movq 0x28(%rdi), %rdx
; HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
48e: 4c 8b 57 60 movq 0x60(%rdi), %r10
; {
492: 65 48 8b 04 25 28 00 00 00 movq %gs:0x28, %rax
49b: 48 89 84 24 a0 00 00 00 movq %rax, 0xa0(%rsp)
4a3: 31 c0 xorl %eax, %eax
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
4a5: 48 8b 47 40 movq 0x40(%rdi), %rax
; BYTE const* const ilimit = args->ilimit;
4a9: 4c 8b 77 68 movq 0x68(%rdi), %r14
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
4ad: 4c 8d 5c 24 38 leaq 0x38(%rsp), %r11
; ZSTD_memcpy(&op, &args->op, sizeof(op));
4b2: 48 89 54 24 68 movq %rdx, 0x68(%rsp)
4b7: 48 8d 5c 24 40 leaq 0x40(%rsp), %rbx
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
4bc: 48 89 44 24 20 movq %rax, 0x20(%rsp)
4c1: 48 8b 47 48 movq 0x48(%rdi), %rax
4c5: 48 89 44 24 28 movq %rax, 0x28(%rsp)
4ca: 48 8b 47 50 movq 0x50(%rdi), %rax
4ce: 48 89 44 24 30 movq %rax, 0x30(%rsp)
4d3: 48 8b 47 58 movq 0x58(%rdi), %rax
4d7: 48 89 44 24 38 movq %rax, 0x38(%rsp)
; ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
4dc: 48 8b 07 movq (%rdi), %rax
4df: 48 89 44 24 40 movq %rax, 0x40(%rsp)
4e4: 48 8b 47 08 movq 0x8(%rdi), %rax
4e8: 48 89 44 24 48 movq %rax, 0x48(%rsp)
4ed: 48 8b 47 10 movq 0x10(%rdi), %rax
4f1: 48 89 44 24 50 movq %rax, 0x50(%rsp)
4f6: 48 8b 47 18 movq 0x18(%rdi), %rax
4fa: 48 89 44 24 58 movq %rax, 0x58(%rsp)
; ZSTD_memcpy(&op, &args->op, sizeof(op));
4ff: 48 8b 47 20 movq 0x20(%rdi), %rax
503: 48 89 44 24 60 movq %rax, 0x60(%rsp)
508: 48 8b 47 30 movq 0x30(%rdi), %rax
50c: 48 89 44 24 70 movq %rax, 0x70(%rsp)
511: 48 8b 77 38 movq 0x38(%rdi), %rsi
; oend[1] = op[2];
515: 48 89 84 24 88 00 00 00 movq %rax, 0x88(%rsp)
; oend[3] = args->oend;
51d: 48 8b 47 70 movq 0x70(%rdi), %rax
; ZSTD_memcpy(&op, &args->op, sizeof(op));
521: 48 89 74 24 78 movq %rsi, 0x78(%rsp)
; oend[0] = op[1];
526: 48 89 94 24 80 00 00 00 movq %rdx, 0x80(%rsp)
; oend[2] = op[3];
52e: 48 89 b4 24 90 00 00 00 movq %rsi, 0x90(%rsp)
; oend[3] = args->oend;
536: 48 89 84 24 98 00 00 00 movq %rax, 0x98(%rsp)
<L7>:
; size_t iters = (size_t)(ip[0] - ilimit) / 7;
53e: 4c 8b 44 24 40 movq 0x40(%rsp), %r8
543: 4c 89 c7 movq %r8, %rdi
546: 4c 29 f7 subq %r14, %rdi
549: 48 89 f8 movq %rdi, %rax
54c: 49 f7 e7 mulq %r15
54f: 48 29 d7 subq %rdx, %rdi
552: 48 d1 ef shrq %rdi
555: 48 8d 2c 3a leaq (%rdx,%rdi), %rbp
559: 31 ff xorl %edi, %edi
55b: 48 c1 ed 02 shrq $0x2, %rbp
<L0>:
; size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
55f: 48 8b 94 3c 80 00 00 00 movq 0x80(%rsp,%rdi), %rdx
567: 48 2b 54 3c 60 subq 0x60(%rsp,%rdi), %rdx
56c: 48 89 d0 movq %rdx, %rax
56f: 49 f7 e4 mulq %r12
572: 48 c1 ea 03 shrq $0x3, %rdx
; iters = MIN(iters, oiters);
576: 48 39 d5 cmpq %rdx, %rbp
579: 48 0f 47 ea cmovaq %rdx, %rbp
; for (stream = 0; stream < 4; ++stream) {
57d: 48 83 c7 08 addq $0x8, %rdi
581: 48 83 ff 20 cmpq $0x20, %rdi
585: 75 d8 jne <L0>
; olimit = op[3] + (iters * 5);
587: 48 8d 44 ad 00 leaq (%rbp,%rbp,4), %rax
58c: 48 8d 3c 06 leaq (%rsi,%rax), %rdi
; if (op[3] + 10 > olimit)
590: 48 8d 46 0a leaq 0xa(%rsi), %rax
594: 48 39 c7 cmpq %rax, %rdi
597: 0f 82 5b 01 00 00 jb <L1>
; if (ip[stream] < ip[stream - 1])
59d: 48 8b 44 24 48 movq 0x48(%rsp), %rax
5a2: 4c 39 c0 cmpq %r8, %rax
5a5: 0f 82 4d 01 00 00 jb <L1>
5ab: 48 8b 54 24 50 movq 0x50(%rsp), %rdx
5b0: 48 39 c2 cmpq %rax, %rdx
5b3: 0f 82 3f 01 00 00 jb <L1>
5b9: 48 39 54 24 58 cmpq %rdx, 0x58(%rsp)
5be: 0f 82 34 01 00 00 jb <L1>
5c4: 4c 89 4c 24 18 movq %r9, 0x18(%rsp)
5c9: 4c 89 6c 24 10 movq %r13, 0x10(%rsp)
5ce: 49 89 cd movq %rcx, %r13
<L6>:
; for (symbol = 0; symbol < 5; ++symbol) {
5d1: 48 89 34 24 movq %rsi, (%rsp)
; size_t iters = (size_t)(ip[0] - ilimit) / 7;
5d5: bd 05 00 00 00 movl $0x5, %ebp
5da: 49 89 f8 movq %rdi, %r8
5dd: 4c 89 74 24 08 movq %r14, 0x8(%rsp)
<L3>:
; for (stream = 0; stream < 3; ++stream) {
5e2: 4c 8d 4c 24 20 leaq 0x20(%rsp), %r9
5e7: 48 8d 74 24 60 leaq 0x60(%rsp), %rsi
; size_t iters = (size_t)(ip[0] - ilimit) / 7;
5ec: 4c 89 c8 movq %r9, %rax
<L2>:
; int const index = (int)(bits[stream] >> 53);
5ef: bf 35 00 00 00 movl $0x35, %edi
5f4: c4 e2 c3 f7 10 shrxq %rdi, (%rax), %rdx
; HUF_DEltX2 const entry = dtable[index];
5f9: 49 8d 14 92 leaq (%r10,%rdx,4), %rdx
; for (stream = 0; stream < 3; ++stream) {
5fd: 48 83 c0 08 addq $0x8, %rax
; HUF_DEltX2 const entry = dtable[index];
601: 44 0f b7 32 movzwl (%rdx), %r14d
605: 0f b6 7a 02 movzbl 0x2(%rdx), %edi
; for (stream = 0; stream < 3; ++stream) {
609: 48 83 c6 08 addq $0x8, %rsi
; bits[stream] <<= (entry.nbBits);
60d: c4 e2 c1 f7 78 f8 shlxq %rdi, -0x8(%rax), %rdi
; HUF_DEltX2 const entry = dtable[index];
613: 0f b6 4a 03 movzbl 0x3(%rdx), %ecx
; MEM_write16(op[stream], entry.sequence);
617: 48 8b 56 f8 movq -0x8(%rsi), %rdx
; bits[stream] <<= (entry.nbBits);
61b: 48 89 78 f8 movq %rdi, -0x8(%rax)
; put_unaligned(value, (U16 *)memPtr);
61f: 66 44 89 32 movw %r14w, (%rdx)
; op[stream] += (entry.length);
623: 48 01 ca addq %rcx, %rdx
626: 48 89 56 f8 movq %rdx, -0x8(%rsi)
; for (stream = 0; stream < 3; ++stream) {
62a: 4c 39 d8 cmpq %r11, %rax
62d: 75 c0 jne <L2>
; for (symbol = 0; symbol < 5; ++symbol) {
62f: 83 ed 01 subl $0x1, %ebp
632: 75 ae jne <L3>
; int const index = (int)(bits[3] >> 53);
634: 48 8b 44 24 38 movq 0x38(%rsp), %rax
639: 48 8b 34 24 movq (%rsp), %rsi
63d: 4c 89 c7 movq %r8, %rdi
640: 4c 8b 74 24 08 movq 0x8(%rsp), %r14
645: 48 89 c2 movq %rax, %rdx
648: 48 c1 ea 35 shrq $0x35, %rdx
; HUF_DEltX2 const entry = dtable[index];
64c: 49 8d 14 92 leaq (%r10,%rdx,4), %rdx
650: 0f b6 4a 02 movzbl 0x2(%rdx), %ecx
654: 44 0f b7 02 movzwl (%rdx), %r8d
658: 0f b6 52 03 movzbl 0x3(%rdx), %edx
; bits[3] <<= (entry.nbBits);
65c: c4 e2 f1 f7 c0 shlxq %rcx, %rax, %rax
; put_unaligned(value, (U16 *)memPtr);
661: 66 44 89 06 movw %r8w, (%rsi)
; op[3] += (entry.length);
665: 48 8d 4c 24 40 leaq 0x40(%rsp), %rcx
66a: 48 01 d6 addq %rdx, %rsi
<L5>:
; int const index = (int)(bits[3] >> 53);
66d: 48 89 c2 movq %rax, %rdx
; for (stream = 0; stream < 4; ++stream) {
670: 49 83 c1 08 addq $0x8, %r9
674: 48 83 c1 08 addq $0x8, %rcx
; int const index = (int)(bits[3] >> 53);
678: 48 c1 ea 35 shrq $0x35, %rdx
; HUF_DEltX2 const entry = dtable[index];
67c: 49 8d 14 92 leaq (%r10,%rdx,4), %rdx
680: 44 0f b6 42 02 movzbl 0x2(%rdx), %r8d
685: 0f b7 2a movzwl (%rdx), %ebp
688: 0f b6 52 03 movzbl 0x3(%rdx), %edx
; bits[3] <<= (entry.nbBits);
68c: c4 e2 b9 f7 c0 shlxq %r8, %rax, %rax
691: 48 89 44 24 38 movq %rax, 0x38(%rsp)
; op[3] += (entry.length);
696: 0f b6 c2 movzbl %dl, %eax
; return (unsigned)__builtin_ctzll(val);
699: 31 d2 xorl %edx, %edx
69b: f3 49 0f bc 51 f8 tzcntq -0x8(%r9), %rdx
; put_unaligned(value, (U16 *)memPtr);
6a1: 66 89 2e movw %bp, (%rsi)
; op[3] += (entry.length);
6a4: 48 01 c6 addq %rax, %rsi
; ip[stream] -= nbBytes;
6a7: 48 8b 41 f8 movq -0x8(%rcx), %rax
6ab: 49 89 d0 movq %rdx, %r8
; int const nbBits = ctz & 7;
6ae: 83 e2 07 andl $0x7, %edx
; ip[stream] -= nbBytes;
6b1: 49 c1 e8 03 shrq $0x3, %r8
6b5: 4c 29 c0 subq %r8, %rax
6b8: 48 89 41 f8 movq %rax, -0x8(%rcx)
; bits[stream] = MEM_read64(ip[stream]) | 1;
6bc: 48 8b 00 movq (%rax), %rax
6bf: 48 83 c8 01 orq $0x1, %rax
; bits[stream] <<= nbBits;
6c3: c4 e2 e9 f7 c0 shlxq %rdx, %rax, %rax
6c8: 49 89 41 f8 movq %rax, -0x8(%r9)
; for (stream = 0; stream < 4; ++stream) {
6cc: 49 39 d9 cmpq %rbx, %r9
6cf: 74 07 je <L4>
; int const index = (int)(bits[3] >> 53);
6d1: 48 8b 44 24 38 movq 0x38(%rsp), %rax
6d6: eb 95 jmp <L5>
<L4>:
; } while (op[3] < olimit);
6d8: 48 39 fe cmpq %rdi, %rsi
6db: 0f 82 f0 fe ff ff jb <L6>
6e1: 4c 89 e9 movq %r13, %rcx
6e4: 48 89 74 24 78 movq %rsi, 0x78(%rsp)
6e9: 4c 8b 4c 24 18 movq 0x18(%rsp), %r9
6ee: 4c 8b 6c 24 10 movq 0x10(%rsp), %r13
6f3: e9 46 fe ff ff jmp <L7>
<L1>:
; ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
6f8: 48 8b 44 24 20 movq 0x20(%rsp), %rax
6fd: 48 89 41 40 movq %rax, 0x40(%rcx)
701: 48 8b 44 24 28 movq 0x28(%rsp), %rax
706: 49 89 45 08 movq %rax, 0x8(%r13)
70a: 48 8b 44 24 30 movq 0x30(%rsp), %rax
70f: 49 89 45 10 movq %rax, 0x10(%r13)
713: 48 8b 44 24 38 movq 0x38(%rsp), %rax
718: 49 89 45 18 movq %rax, 0x18(%r13)
; ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
71c: 48 8b 44 24 40 movq 0x40(%rsp), %rax
721: 48 89 01 movq %rax, (%rcx)
724: 48 8b 44 24 48 movq 0x48(%rsp), %rax
729: 48 89 41 08 movq %rax, 0x8(%rcx)
72d: 48 8b 44 24 50 movq 0x50(%rsp), %rax
732: 48 89 41 10 movq %rax, 0x10(%rcx)
736: 48 8b 44 24 58 movq 0x58(%rsp), %rax
73b: 48 89 41 18 movq %rax, 0x18(%rcx)
; ZSTD_memcpy(&args->op, &op, sizeof(op));
73f: 48 8b 44 24 60 movq 0x60(%rsp), %rax
744: 48 89 41 20 movq %rax, 0x20(%rcx)
748: 48 8b 44 24 68 movq 0x68(%rsp), %rax
74d: 49 89 41 08 movq %rax, 0x8(%r9)
751: 48 8b 44 24 70 movq 0x70(%rsp), %rax
756: 49 89 41 10 movq %rax, 0x10(%r9)
75a: 48 8b 44 24 78 movq 0x78(%rsp), %rax
75f: 49 89 41 18 movq %rax, 0x18(%r9)
; }
763: 48 8b 84 24 a0 00 00 00 movq 0xa0(%rsp), %rax
76b: 65 48 2b 04 25 28 00 00 00 subq %gs:0x28, %rax
774: 75 16 jne <L8>
776: 48 81 c4 a8 00 00 00 addq $0xa8, %rsp
77d: 5b popq %rbx
77e: 5d popq %rbp
77f: 41 5c popq %r12
781: 41 5d popq %r13
783: 41 5e popq %r14
785: 41 5f popq %r15
787: e9 00 00 00 00 jmp <L8>
<L8>:
78c: e8 00 00 00 00 callq <L9>
<L9>:
791: 66 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
79c: 0f 1f 40 00 nopl (%rax)
7a0: 90 nop
7a1: 90 nop
7a2: 90 nop
7a3: 90 nop
7a4: 90 nop
7a5: 90 nop
7a6: 90 nop
7a7: 90 nop
7a8: 90 nop
7a9: 90 nop
7aa: 90 nop
7ab: 90 nop
7ac: 90 nop
7ad: 90 nop
7ae: 90 nop
7af: 90 nop
00000000000007b0 <HUF_decompress4X1_usingDTable_internal_fast_c_loop>:
; {
7b0: f3 0f 1e fa endbr64
7b4: 41 56 pushq %r14
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
7b6: 4c 8d 4f 40 leaq 0x40(%rdi), %r9
; ZSTD_memcpy(&op, &args->op, sizeof(op));
7ba: 4c 8d 47 20 leaq 0x20(%rdi), %r8
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
7be: 49 ba 93 24 49 92 24 49 92 24 movabsq $0x2492492492492493, %r10 # imm = 0x2492492492492493
; {
7c8: 41 55 pushq %r13
7ca: 41 54 pushq %r12
7cc: 55 pushq %rbp
7cd: 53 pushq %rbx
; size_t const oiters = (size_t)(oend - op[3]) / 5;
7ce: 48 bb cd cc cc cc cc cc cc cc movabsq $-0x3333333333333333, %rbx # imm = 0xCCCCCCCCCCCCCCCD
; {
7d8: 48 83 ec 70 subq $0x70, %rsp
; ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
7dc: 4c 8b 37 movq (%rdi), %r14
; BYTE* const oend = args->oend;
7df: 48 8b 6f 70 movq 0x70(%rdi), %rbp
; {
7e3: 65 48 8b 04 25 28 00 00 00 movq %gs:0x28, %rax
7ec: 48 89 44 24 68 movq %rax, 0x68(%rsp)
7f1: 31 c0 xorl %eax, %eax
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
7f3: 48 8b 47 40 movq 0x40(%rdi), %rax
; BYTE const* const ilimit = args->ilimit;
7f7: 4c 8b 5f 68 movq 0x68(%rdi), %r11
; ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
7fb: 4c 89 74 24 28 movq %r14, 0x28(%rsp)
; U16 const* const dtable = (U16 const*)args->dt;
800: 48 8b 77 60 movq 0x60(%rdi), %rsi
; size_t const oiters = (size_t)(oend - op[3]) / 5;
804: 48 89 ea movq %rbp, %rdx
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
807: 4d 89 f5 movq %r14, %r13
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
80a: 48 89 44 24 08 movq %rax, 0x8(%rsp)
80f: 49 8b 41 08 movq 0x8(%r9), %rax
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
813: 4d 29 dd subq %r11, %r13
; ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
816: 48 89 44 24 10 movq %rax, 0x10(%rsp)
81b: 49 8b 41 10 movq 0x10(%r9), %rax
81f: 48 89 44 24 18 movq %rax, 0x18(%rsp)
824: 49 8b 41 18 movq 0x18(%r9), %rax
828: 48 89 44 24 20 movq %rax, 0x20(%rsp)
; ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
82d: 48 8b 47 08 movq 0x8(%rdi), %rax
831: 48 89 44 24 30 movq %rax, 0x30(%rsp)
836: 48 8b 47 10 movq 0x10(%rdi), %rax
83a: 48 89 44 24 38 movq %rax, 0x38(%rsp)
83f: 48 8b 47 18 movq 0x18(%rdi), %rax
843: 48 89 44 24 40 movq %rax, 0x40(%rsp)
; ZSTD_memcpy(&op, &args->op, sizeof(op));
848: 48 8b 47 20 movq 0x20(%rdi), %rax
84c: 48 89 44 24 48 movq %rax, 0x48(%rsp)
851: 49 8b 40 08 movq 0x8(%r8), %rax
855: 48 89 44 24 50 movq %rax, 0x50(%rsp)
85a: 49 8b 40 10 movq 0x10(%r8), %rax
85e: 48 89 44 24 58 movq %rax, 0x58(%rsp)
863: 4d 8b 60 18 movq 0x18(%r8), %r12
; size_t const oiters = (size_t)(oend - op[3]) / 5;
867: 4c 29 e2 subq %r12, %rdx
; ZSTD_memcpy(&op, &args->op, sizeof(op));
86a: 4c 89 64 24 60 movq %r12, 0x60(%rsp)
; size_t const oiters = (size_t)(oend - op[3]) / 5;
86f: 48 89 d0 movq %rdx, %rax
872: 48 f7 e3 mulq %rbx
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
875: 4c 89 e8 movq %r13, %rax
; size_t const oiters = (size_t)(oend - op[3]) / 5;
878: 48 c1 ea 02 shrq $0x2, %rdx
87c: 48 89 d1 movq %rdx, %rcx
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
87f: 49 f7 e2 mulq %r10
882: 4c 89 e8 movq %r13, %rax
885: 48 29 d0 subq %rdx, %rax
888: 48 d1 e8 shrq %rax
88b: 48 01 d0 addq %rdx, %rax
88e: 48 c1 e8 02 shrq $0x2, %rax
; size_t const iters = MIN(oiters, iiters);
892: 48 39 c1 cmpq %rax, %rcx
895: 48 0f 47 c8 cmovaq %rax, %rcx
; size_t const symbols = iters * 5;
899: 4c 8d 2c 89 leaq (%rcx,%rcx,4), %r13
; olimit = op[3] + symbols;
89d: 4d 01 e5 addq %r12, %r13
; if (op[3] + 20 > olimit)
8a0: 49 83 c4 14 addq $0x14, %r12
8a4: 4d 39 e5 cmpq %r12, %r13
8a7: 0f 82 0a 01 00 00 jb <L0>
<L5>:
; if (ip[stream] < ip[stream - 1])
8ad: 48 8b 44 24 30 movq 0x30(%rsp), %rax
8b2: 4c 39 f0 cmpq %r14, %rax
8b5: 0f 82 fc 00 00 00 jb <L0>
8bb: 48 8b 54 24 38 movq 0x38(%rsp), %rdx
8c0: 48 39 c2 cmpq %rax, %rdx
8c3: 0f 82 ee 00 00 00 jb <L0>
8c9: 48 39 54 24 40 cmpq %rdx, 0x40(%rsp)
8ce: 0f 82 e3 00 00 00 jb <L0>
<L4>:
; {
8d4: 45 31 e4 xorl %r12d, %r12d
<L2>:
8d7: 31 c0 xorl %eax, %eax
<L1>:
; int const index = (int)(bits[stream] >> 53);
8d9: 48 8b 54 04 08 movq 0x8(%rsp,%rax), %rdx
8de: 48 89 d1 movq %rdx, %rcx
8e1: 48 c1 e9 35 shrq $0x35, %rcx
; int const entry = (int)dtable[index];
8e5: 0f b7 0c 4e movzwl (%rsi,%rcx,2), %ecx
; bits[stream] <<= (entry & 63);
8e9: 41 89 ce movl %ecx, %r14d
; op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
8ec: 0f b6 cd movzbl %ch, %ecx
; bits[stream] <<= (entry & 63);
8ef: c4 e2 89 f7 d2 shlxq %r14, %rdx, %rdx
8f4: 48 89 54 04 08 movq %rdx, 0x8(%rsp,%rax)
; op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
8f9: 48 8b 54 04 48 movq 0x48(%rsp,%rax), %rdx
; for (stream = 0; stream < 4; ++stream) {
8fe: 48 83 c0 08 addq $0x8, %rax
; op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
902: 42 88 0c 22 movb %cl, (%rdx,%r12)
; for (stream = 0; stream < 4; ++stream) {
906: 48 83 f8 20 cmpq $0x20, %rax
90a: 75 cd jne <L1>
; for (symbol = 0; symbol < 5; ++symbol) {
90c: 49 83 c4 01 addq $0x1, %r12
910: 49 83 fc 05 cmpq $0x5, %r12
914: 75 c1 jne <L2>
916: 31 c0 xorl %eax, %eax
<L3>:
; return (unsigned)__builtin_ctzll(val);
918: 31 c9 xorl %ecx, %ecx
91a: f3 48 0f bc 4c 04 08 tzcntq 0x8(%rsp,%rax), %rcx
; ip[stream] -= nbBytes;
921: 48 8b 54 04 28 movq 0x28(%rsp,%rax), %rdx
; op[stream] += 5;
926: 48 83 44 04 48 05 addq $0x5, 0x48(%rsp,%rax)
; ip[stream] -= nbBytes;
92c: 48 89 14 24 movq %rdx, (%rsp)
930: 49 89 cc movq %rcx, %r12
; int const nbBits = ctz & 7;
933: 83 e1 07 andl $0x7, %ecx
; ip[stream] -= nbBytes;
936: 49 c1 ec 03 shrq $0x3, %r12
93a: 4c 29 e2 subq %r12, %rdx
93d: 48 89 54 04 28 movq %rdx, 0x28(%rsp,%rax)
; bits[stream] = MEM_read64(ip[stream]) | 1;
942: 48 8b 12 movq (%rdx), %rdx
945: 48 83 ca 01 orq $0x1, %rdx
; bits[stream] <<= nbBits;
949: c4 e2 f1 f7 d2 shlxq %rcx, %rdx, %rdx
94e: 48 89 54 04 08 movq %rdx, 0x8(%rsp,%rax)
; for (stream = 0; stream < 4; ++stream) {
953: 48 83 c0 08 addq $0x8, %rax
957: 48 83 f8 20 cmpq $0x20, %rax
95b: 75 bb jne <L3>
; } while (op[3] < olimit);
95d: 48 8b 4c 24 60 movq 0x60(%rsp), %rcx
962: 4c 39 e9 cmpq %r13, %rcx
965: 0f 82 69 ff ff ff jb <L4>
; size_t const oiters = (size_t)(oend - op[3]) / 5;
96b: 48 89 ea movq %rbp, %rdx
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
96e: 4c 8b 74 24 28 movq 0x28(%rsp), %r14
; size_t const oiters = (size_t)(oend - op[3]) / 5;
973: 48 29 ca subq %rcx, %rdx
976: 48 89 d0 movq %rdx, %rax
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
979: 4d 89 f5 movq %r14, %r13
; size_t const oiters = (size_t)(oend - op[3]) / 5;
97c: 48 f7 e3 mulq %rbx
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
97f: 4d 29 dd subq %r11, %r13
982: 4c 89 e8 movq %r13, %rax
; size_t const oiters = (size_t)(oend - op[3]) / 5;
985: 48 c1 ea 02 shrq $0x2, %rdx
989: 49 89 d4 movq %rdx, %r12
; size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
98c: 49 f7 e2 mulq %r10
98f: 49 29 d5 subq %rdx, %r13
992: 49 d1 ed shrq %r13
995: 4c 01 ea addq %r13, %rdx
998: 48 c1 ea 02 shrq $0x2, %rdx
; size_t const iters = MIN(oiters, iiters);
99c: 49 39 d4 cmpq %rdx, %r12
99f: 4c 0f 47 e2 cmovaq %rdx, %r12
; size_t const symbols = iters * 5;
9a3: 4f 8d 2c a4 leaq (%r12,%r12,4), %r13
; olimit = op[3] + symbols;
9a7: 49 01 cd addq %rcx, %r13
; if (op[3] + 20 > olimit)
9aa: 48 83 c1 14 addq $0x14, %rcx
9ae: 49 39 cd cmpq %rcx, %r13
9b1: 0f 83 f6 fe ff ff jae <L5>
<L0>:
; ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
9b7: 48 8b 44 24 08 movq 0x8(%rsp), %rax
9bc: 48 89 47 40 movq %rax, 0x40(%rdi)
9c0: 48 8b 44 24 10 movq 0x10(%rsp), %rax
9c5: 49 89 41 08 movq %rax, 0x8(%r9)
9c9: 48 8b 44 24 18 movq 0x18(%rsp), %rax
9ce: 49 89 41 10 movq %rax, 0x10(%r9)
9d2: 48 8b 44 24 20 movq 0x20(%rsp), %rax
9d7: 49 89 41 18 movq %rax, 0x18(%r9)
; ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
9db: 48 8b 44 24 28 movq 0x28(%rsp), %rax
9e0: 48 89 07 movq %rax, (%rdi)
9e3: 48 8b 44 24 30 movq 0x30(%rsp), %rax
9e8: 48 89 47 08 movq %rax, 0x8(%rdi)
9ec: 48 8b 44 24 38 movq 0x38(%rsp), %rax
9f1: 48 89 47 10 movq %rax, 0x10(%rdi)
9f5: 48 8b 44 24 40 movq 0x40(%rsp), %rax
9fa: 48 89 47 18 movq %rax, 0x18(%rdi)
; ZSTD_memcpy(&args->op, &op, sizeof(op));
9fe: 48 8b 44 24 48 movq 0x48(%rsp), %rax
a03: 48 89 47 20 movq %rax, 0x20(%rdi)
a07: 48 8b 44 24 50 movq 0x50(%rsp), %rax
a0c: 49 89 40 08 movq %rax, 0x8(%r8)
a10: 48 8b 44 24 58 movq 0x58(%rsp), %rax
a15: 49 89 40 10 movq %rax, 0x10(%r8)
a19: 48 8b 44 24 60 movq 0x60(%rsp), %rax
a1e: 49 89 40 18 movq %rax, 0x18(%r8)
; }
a22: 48 8b 44 24 68 movq 0x68(%rsp), %rax
a27: 65 48 2b 04 25 28 00 00 00 subq %gs:0x28, %rax
a30: 75 11 jne <L6>
a32: 48 83 c4 70 addq $0x70, %rsp
a36: 5b popq %rbx
a37: 5d popq %rbp
a38: 41 5c popq %r12
a3a: 41 5d popq %r13
a3c: 41 5e popq %r14
a3e: e9 00 00 00 00 jmp <L6>
<L6>:
a43: e8 00 00 00 00 callq <L7>
<L7>:
a48: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax)
a50: 90 nop
a51: 90 nop
a52: 90 nop
a53: 90 nop
a54: 90 nop
a55: 90 nop
a56: 90 nop
a57: 90 nop
a58: 90 nop
a59: 90 nop
a5a: 90 nop
a5b: 90 nop
a5c: 90 nop
a5d: 90 nop
a5e: 90 nop
a5f: 90 nop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment