-
-
Save krisk0/4f80a9ab2d04dd5d25eee21f2ae17fe4 to your computer and use it in GitHub Desktop.
#-----------------------------------------------------------------------
# mul8x2_zen  (x86-64, GAS / AT&T syntax, needs BMI2 MULX and ADX ADCX/ADOX)
#
# Reads eight 64-bit words at 0..56(%rsi) and two 64-bit words at
# 0..8(%rdx), forms MULX partial products and writes ten 64-bit words
# to 0..72(%rdi).
#
# ABI:   presumably System V AMD64 (rdi, rsi, rdx as the first three
#        arguments) -- TODO confirm against the caller.
# Saved: r12-r15, rbx, rbp are parked in xmm5-xmm10 and restored before retq.
# Clobb: rax, rcx, rdx, r8-r11, xmm5-xmm10, flags; vzeroupper zeroes the
#        upper halves of the ymm registers.
# Note:  48(%rdi) is used as a scratch slot before it receives its final
#        value.
# NOTE(review): the arithmetic has not been verified end-to-end; see the
#        NOTE(review) remarks inside the body.
#-----------------------------------------------------------------------
.text | |
.globl mul8x2_zen | |
.type mul8x2_zen, @function | |
mul8x2_zen: | |
# vzeroupper is issued before the legacy-SSE movq spills used below.
vzeroupper | |
# Register spills (r13->xmm10, r15->xmm9, r14->xmm8, r12->xmm7, rbx->xmm6,
# rbp->xmm5) are interleaved with the first multiplies.
movq %r13, %xmm10 | |
# rax keeps the pointer to the 2-word operand; rdx becomes its word 0,
# which MULX uses as the implicit multiplier.
movq %rdx, %rax | |
movq (%rdx), %rdx | |
movq %r15, %xmm9 | |
# rax = word 1 of the 2-word operand.
movq 8(%rax), %rax | |
# Pass 1: word 0 times each word at 0..56(%rsi). In AT&T order MULX writes
# the low half to the first destination and the high half to the second.
mulx (%rsi), %r8, %r9 | |
mulx 8(%rsi), %r10, %r11 | |
movq %r14, %xmm8 | |
mulx 16(%rsi), %r15, %r14 | |
mulx 24(%rsi), %rcx, %r13 | |
movq %r12, %xmm7 | |
movq %rbx, %xmm6 | |
# Word 1 is stashed in 48(%rdi) as scratch so rax can be reused; it is
# reloaded into rdx further down.
movq %rax, 48(%rdi) | |
mulx 32(%rsi), %rax, %r12 | |
movq %rbp, %xmm5 | |
mulx 40(%rsi), %rbx, %rbp | |
# add/adc chain: each high half is added to the next low half. movq and
# mulx leave the flags untouched, so the carry survives the interleaving.
addq %r10, %r9 | |
movq %r8, (%rdi) | |
mulx 48(%rsi), %r8, %r10 | |
adcq %r15, %r11 | |
movq %r9, 8(%rdi) | |
mulx 56(%rsi), %r9, %r15 | |
# rdx = word 1 (from the scratch slot); later MULXs multiply by it.
movq 48(%rdi), %rdx | |
adcq %rcx, %r14 | |
movq %r11, 16(%rdi) | |
mulx (%rsi), %r11, %rcx | |
adcq %r13, %rax | |
adcq %rbx, %r12 | |
mulx 8(%rsi), %r13, %rbx | |
adcq %rbp, %r8 | |
adcq %r10, %r9 | |
mulx 16(%rsi), %r10, %rbp | |
# Last carry of the pass-1 chain is folded into r15.
adcq $0, %r15 | |
# New chain: accumulate directly into the words already stored at
# 8(%rdi) and 16(%rdi).
addq %r11, 8(%rdi) | |
# mov does not alter flags; r11 is zeroed here to capture the carry below.
movq $0, %r11 | |
adcq %r13, 16(%rdi) | |
adcq %rbx, %r14 | |
mulx 24(%rsi), %r13, %rbx | |
adcq %rbp, %rax | |
movq 16(%rdi), %rbp | |
adcq $0, %r11 | |
# xor clears both CF and OF so the ADCX (CF) and ADOX (OF) chains start
# clean; it also zeroes rdx, hence the reload of word 1 on the next line.
xor %edx, %edx | |
movq 48(%rdi), %rdx | |
adox %rbp, %rcx | |
adox %r14, %r10 | |
adox %r13, %rax | |
mulx 32(%rsi), %r14, %rbp | |
adox %rbx, %r12 | |
mulx 40(%rsi), %r13, %rbx | |
adcx %r12, %r11 | |
movq %rcx, 16(%rdi) | |
mulx 48(%rsi), %rcx, %r12 | |
adox %rbp, %r8 | |
adox %rbx, %r9 | |
mulx 56(%rsi), %rbx, %rbp | |
adcx %r13, %r8 | |
# r13 = 0 (flags preserved); used below to add only the pending OF/CF
# into rbp.
movq $0, %r13 | |
adcx %rcx, %r9 | |
adox %r12, %r15 | |
# NOTE(review): rdx still holds word 1 here, so this and the following
# MULXs recompute products that were already formed above with the same
# multiplier -- confirm this is what the generator intends.
mulx (%rsi), %rcx, %r12 | |
adox %r13, %rbp | |
adcx %rbx, %r15 | |
adcx %r13, %rbp | |
mulx 8(%rsi), %r13, %rbx | |
# The 48(%rdi) scratch slot now receives rbp (word 1 is no longer stored
# there) and is read back into r12 three lines below.
movq %rbp, 48(%rdi) | |
movq 16(%rdi), %rbp | |
adcx %r12, %r10 | |
movq 48(%rdi), %r12 | |
adox %rbp, %rcx | |
movq %rcx, 16(%rdi) | |
mulx 16(%rsi), %rcx, %rbp | |
adox %r13, %r10 | |
adcx %rbx, %rax | |
mulx 24(%rsi), %r13, %rbx | |
movq %r10, 24(%rdi) | |
adox %rcx, %rax | |
mulx 32(%rsi), %r10, %rcx | |
adcx %r14, %r11 | |
movq %rax, 32(%rdi) | |
mulx 40(%rsi), %rax, %r14 | |
adox %rbp, %r11 | |
adcx %rbx, %r8 | |
mulx 48(%rsi), %rbx, %rbp | |
adox %r10, %r8 | |
adcx %rcx, %r9 | |
mulx 56(%rsi), %r10, %rcx | |
# NOTE(review): xmm14 is never written in this routine, so rdx receives
# whatever the caller left there -- confirm whether this load is needed.
movq %xmm14, %rdx | |
adox %r9, %rax | |
# r9 = 0 without touching flags; the adcx %r13, %r9 below then yields
# r13 plus the incoming carry.
movq $0, %r9 | |
# NOTE(review): the sum written to r15 here is overwritten by the restore
# on the very next line; only the CF side effect survives -- confirm.
adcx %r14, %r15 | |
# Epilogue restores (xmm9->r15, xmm8->r14, xmm5->rbp, xmm10->r13,
# xmm7->r12, xmm6->rbx) are interleaved with the tail of the ADCX chain
# and the final stores to 32..72(%rdi).
movq %xmm9, %r15 | |
movq %xmm8, %r14 | |
adcx %rbp, %r12 | |
movq %xmm5, %rbp | |
adcx %r13, %r9 | |
movq %xmm10, %r13 | |
movq %rax, 32(%rdi) | |
adcx %r12, %r10 | |
movq %r12, 40(%rdi) | |
movq %xmm7, %r12 | |
movq %r8, 48(%rdi) | |
adcx %rbx, %rcx | |
movq %xmm6, %rbx | |
movq %r9, 56(%rdi) | |
movq %r10, 64(%rdi) | |
movq %rcx, 72(%rdi) | |
retq |
I use movq / adcq so nearly all asm instructions are 4 chars long.
This is generated code, not source code. Source code for 8x8 multiplication is in https://github.com/krisk0/broadwell_multiplication/blob/master/gen_mul8_zen.py . Asm instructions in the file are zero-column aligned, and I won't change this indent.
I will try to follow the other advice where it applies to the 8x8 Ryzen code, and report the results at https://gist.github.com/krisk0/2658fd2fb8d8ce97f2640216768eb035. Thank you, @pcordes.
I use movq / adcq so nearly all asm instructions are 4 chars long.
I wondered if that was intentional. In my opinion that's not a good thing. It makes it harder to distinguish ADC from ADOX or ADCX, which is something you definitely want to do when you're looking for when you can optimize (for code size, or with an immediate) by using ADC.
Use a space instead of a q
if you want the rest of the line to look nice. And use multiple spaces so operands aren't crowding into the instruction mnemonics. https://codereview.stackexchange.com/questions/204902/checking-if-a-number-is-prime-in-nasm-win64-assembly/204965#204965 discusses style, and shows an example of what I think is good style.
-O2
advice reduced binary code size by 11 bytes.
The advice to replace adox
with adc
does not apply to the 8x8 code. I will keep the trick in mind.
adox 16(%rdi), %rcx
slows code down by 2 ticks.
Thank you, @pcordes.
You can save some code-size with
mov $0, %edi
instead of mov $0, %rdi
. GAS can do this for you with as -O2
or gcc -Wa,-O2
(which is unfortunately not the default, unlike NASM, which does that by default). Style: the explicit
q
suffix on some instructions is just clutter, and makes it slightly harder to immediately see the difference between adcx and adc, and between mov and movq xmm. You should still indent your operands to a consistent column with spaces (or a tab). Not sure if there's a good way to use
setc %r11b
instead of movq $0, %r11
/ adc $0, %r11
. Both ways need it zeroed first; x86 is inconvenient that way. So it wouldn't save the mov $0, %r11d.
The
adcx %r13, %r9
near the end of your function can be adc $0, %r13
, avoiding the need to zero R9. You don't use R13 or ADOX again after that, and ADC is compatible with ADCX except for destroying the other flags. As I commented on SO,
adc (mem), reg
(or adcx or adox) can be useful. E.g. after line 71, movq 16(%rdi), %rbp
, the only use of RBP is an adox %rbp, %rcx
. So you could save an instruction and not need RBP there by using adox 16(%rdi), %rcx
. (RBP is dead after that: in your version the next access to it is write-only: the 2nd destination of a MULX.)