Diff of module/icp/asm-x86_64/modes/ghash-x86_64.S (ICP) against OpenSSL's crypto/modes/ghash-x86_64.s.
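Background for readers: the file implements GHASH, GCM's authentication function, whose core operation is multiplication in GF(2^128). As a reference point for the table-driven and PCLMULQDQ/AVX code in the diff below, here is a minimal bit-at-a-time C sketch of that multiply (illustrative only, following NIST SP 800-38D; the names be128 and gf128_mul are ad hoc and appear in neither tree):

#include <stdint.h>

/* 128-bit field element; hi is the most significant qword (big-endian view). */
typedef struct { uint64_t hi, lo; } be128;

/*
 * Z = X * Y in GF(2^128) with the GHASH bit-reflected convention:
 * walk the bits of X MSB-first, conditionally accumulating Y, and
 * shift Y right with reduction by x^128 + x^7 + x^2 + x + 1.
 */
static be128 gf128_mul(be128 x, be128 y)
{
    be128 z = { 0, 0 };
    for (int i = 0; i < 128; i++) {
        uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                : (x.lo >> (127 - i)) & 1;
        if (bit) {
            z.hi ^= y.hi;
            z.lo ^= y.lo;
        }
        uint64_t carry = y.lo & 1;
        y.lo = (y.lo >> 1) | (y.hi << 63);
        y.hi >>= 1;
        if (carry)
            y.hi ^= 0xE100000000000000ULL; /* R = 0xE1 << 120 */
    }
    return z;
}

The removed "4-bit" routines compute the same product a nibble at a time from a 256-byte per-key table; the retained AVX path replaces the bit loop entirely with carry-less multiplication (vpclmulqdq). The diff: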
--- ../openssl/crypto/modes/ghash-x86_64.s 2019-10-19 20:15:29.166647251 +0200
+++ module/icp/asm-x86_64/modes/ghash-x86_64.S 2019-12-09 21:15:50.320635370 +0100
@@ -1,864 +1,103 @@
-.text
-
-
-.globl gcm_gmult_4bit
-.type gcm_gmult_4bit,@function
-.align 16
-gcm_gmult_4bit:
-.cfi_startproc
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- subq $280,%rsp
-.cfi_adjust_cfa_offset 280
-.Lgmult_prologue:
-
- movzbq 15(%rdi),%r8
- leaq .Lrem_4bit(%rip),%r11
- xorq %rax,%rax
- xorq %rbx,%rbx
- movb %r8b,%al
- movb %r8b,%bl
- shlb $4,%al
- movq $14,%rcx
- movq 8(%rsi,%rax,1),%r8
- movq (%rsi,%rax,1),%r9
- andb $0xf0,%bl
- movq %r8,%rdx
- jmp .Loop1
-
-.align 16
-.Loop1:
- shrq $4,%r8
- andq $0xf,%rdx
- movq %r9,%r10
- movb (%rdi,%rcx,1),%al
- shrq $4,%r9
- xorq 8(%rsi,%rbx,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rbx,1),%r9
- movb %al,%bl
- xorq (%r11,%rdx,8),%r9
- movq %r8,%rdx
- shlb $4,%al
- xorq %r10,%r8
- decq %rcx
- js .Lbreak1
-
- shrq $4,%r8
- andq $0xf,%rdx
- movq %r9,%r10
- shrq $4,%r9
- xorq 8(%rsi,%rax,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rax,1),%r9
- andb $0xf0,%bl
- xorq (%r11,%rdx,8),%r9
- movq %r8,%rdx
- xorq %r10,%r8
- jmp .Loop1
-
-.align 16
-.Lbreak1:
- shrq $4,%r8
- andq $0xf,%rdx
- movq %r9,%r10
- shrq $4,%r9
- xorq 8(%rsi,%rax,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rax,1),%r9
- andb $0xf0,%bl
- xorq (%r11,%rdx,8),%r9
- movq %r8,%rdx
- xorq %r10,%r8
-
- shrq $4,%r8
- andq $0xf,%rdx
- movq %r9,%r10
- shrq $4,%r9
- xorq 8(%rsi,%rbx,1),%r8
- shlq $60,%r10
- xorq (%rsi,%rbx,1),%r9
- xorq %r10,%r8
- xorq (%r11,%rdx,8),%r9
-
- bswapq %r8
- bswapq %r9
- movq %r8,8(%rdi)
- movq %r9,(%rdi)
-
- leaq 280+48(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lgmult_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size gcm_gmult_4bit,.-gcm_gmult_4bit
-.globl gcm_ghash_4bit
-.type gcm_ghash_4bit,@function
-.align 16
-gcm_ghash_4bit:
-.cfi_startproc
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- subq $280,%rsp
-.cfi_adjust_cfa_offset 280
-.Lghash_prologue:
- movq %rdx,%r14
- movq %rcx,%r15
- subq $-128,%rsi
- leaq 16+128(%rsp),%rbp
- xorl %edx,%edx
- movq 0+0-128(%rsi),%r8
- movq 0+8-128(%rsi),%rax
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq 16+0-128(%rsi),%r9
- shlb $4,%dl
- movq 16+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,0(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,0(%rbp)
- movq 32+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,0-128(%rbp)
- movq 32+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,1(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,8(%rbp)
- movq 48+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,8-128(%rbp)
- movq 48+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,2(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,16(%rbp)
- movq 64+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,16-128(%rbp)
- movq 64+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,3(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,24(%rbp)
- movq 80+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,24-128(%rbp)
- movq 80+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,4(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,32(%rbp)
- movq 96+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,32-128(%rbp)
- movq 96+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,5(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,40(%rbp)
- movq 112+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,40-128(%rbp)
- movq 112+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,6(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,48(%rbp)
- movq 128+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,48-128(%rbp)
- movq 128+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,7(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,56(%rbp)
- movq 144+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,56-128(%rbp)
- movq 144+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,8(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,64(%rbp)
- movq 160+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,64-128(%rbp)
- movq 160+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,9(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,72(%rbp)
- movq 176+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,72-128(%rbp)
- movq 176+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,10(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,80(%rbp)
- movq 192+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,80-128(%rbp)
- movq 192+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,11(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,88(%rbp)
- movq 208+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,88-128(%rbp)
- movq 208+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,12(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,96(%rbp)
- movq 224+0-128(%rsi),%r8
- shlb $4,%dl
- movq %rax,96-128(%rbp)
- movq 224+8-128(%rsi),%rax
- shlq $60,%r10
- movb %dl,13(%rsp)
- orq %r10,%rbx
- movb %al,%dl
- shrq $4,%rax
- movq %r8,%r10
- shrq $4,%r8
- movq %r9,104(%rbp)
- movq 240+0-128(%rsi),%r9
- shlb $4,%dl
- movq %rbx,104-128(%rbp)
- movq 240+8-128(%rsi),%rbx
- shlq $60,%r10
- movb %dl,14(%rsp)
- orq %r10,%rax
- movb %bl,%dl
- shrq $4,%rbx
- movq %r9,%r10
- shrq $4,%r9
- movq %r8,112(%rbp)
- shlb $4,%dl
- movq %rax,112-128(%rbp)
- shlq $60,%r10
- movb %dl,15(%rsp)
- orq %r10,%rbx
- movq %r9,120(%rbp)
- movq %rbx,120-128(%rbp)
- addq $-128,%rsi
- movq 8(%rdi),%r8
- movq 0(%rdi),%r9
- addq %r14,%r15
- leaq .Lrem_8bit(%rip),%r11
- jmp .Louter_loop
-.align 16
-.Louter_loop:
- xorq (%r14),%r9
- movq 8(%r14),%rdx
- leaq 16(%r14),%r14
- xorq %r8,%rdx
- movq %r9,(%rdi)
- movq %rdx,8(%rdi)
- shrq $32,%rdx
- xorq %rax,%rax
- roll $8,%edx
- movb %dl,%al
- movzbl %dl,%ebx
- shlb $4,%al
- shrl $4,%ebx
- roll $8,%edx
- movq 8(%rsi,%rax,1),%r8
- movq (%rsi,%rax,1),%r9
- movb %dl,%al
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- xorq %r8,%r12
- movq %r9,%r10
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl 8(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl 4(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl 0(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- shrl $4,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r12,2),%r12
- movzbl %dl,%ebx
- shlb $4,%al
- movzbq (%rsp,%rcx,1),%r13
- shrl $4,%ebx
- shlq $48,%r12
- xorq %r8,%r13
- movq %r9,%r10
- xorq %r12,%r9
- shrq $8,%r8
- movzbq %r13b,%r13
- shrq $8,%r9
- xorq -128(%rbp,%rcx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rcx,8),%r9
- roll $8,%edx
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- movb %dl,%al
- xorq %r10,%r8
- movzwq (%r11,%r13,2),%r13
- movzbl %dl,%ecx
- shlb $4,%al
- movzbq (%rsp,%rbx,1),%r12
- andl $240,%ecx
- shlq $48,%r13
- xorq %r8,%r12
- movq %r9,%r10
- xorq %r13,%r9
- shrq $8,%r8
- movzbq %r12b,%r12
- movl -4(%rdi),%edx
- shrq $8,%r9
- xorq -128(%rbp,%rbx,8),%r8
- shlq $56,%r10
- xorq (%rbp,%rbx,8),%r9
- movzwq (%r11,%r12,2),%r12
- xorq 8(%rsi,%rax,1),%r8
- xorq (%rsi,%rax,1),%r9
- shlq $48,%r12
- xorq %r10,%r8
- xorq %r12,%r9
- movzbq %r8b,%r13
- shrq $4,%r8
- movq %r9,%r10
- shlb $4,%r13b
- shrq $4,%r9
- xorq 8(%rsi,%rcx,1),%r8
- movzwq (%r11,%r13,2),%r13
- shlq $60,%r10
- xorq (%rsi,%rcx,1),%r9
- xorq %r10,%r8
- shlq $48,%r13
- bswapq %r8
- xorq %r13,%r9
- bswapq %r9
- cmpq %r15,%r14
- jb .Louter_loop
- movq %r8,8(%rdi)
- movq %r9,(%rdi)
-
- leaq 280+48(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq 0(%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lghash_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size gcm_ghash_4bit,.-gcm_ghash_4bit
-.globl gcm_init_clmul
-.type gcm_init_clmul,@function
-.align 16
-gcm_init_clmul:
-.cfi_startproc
-.L_init_clmul:
- movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
-
-
- pshufd $255,%xmm2,%xmm4
- movdqa %xmm2,%xmm3
- psllq $1,%xmm2
- pxor %xmm5,%xmm5
- psrlq $63,%xmm3
- pcmpgtd %xmm4,%xmm5
- pslldq $8,%xmm3
- por %xmm3,%xmm2
-
-
- pand .L0x1c2_polynomial(%rip),%xmm5
- pxor %xmm5,%xmm2
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
+#
+# (*) comparison is not completely fair, because C results are
+# for vanilla "256B" implementation, while assembler results
+# are for "528B";-)
+# (**) it's mystery [to me] why Core2 result is not same as for
+# Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
+# Goldmont 1.08(+24%)

+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
+#
+# Knights Landing achieves 1.09 cpb.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
+# and modified for ICP. Modifications are kept to a bare minimum to ease later
+# upstream merges.
+#if defined(__x86_64__) && defined(HAVE_AVX)
- pshufd $78,%xmm2,%xmm6
- movdqa %xmm2,%xmm0
- pxor %xmm2,%xmm6
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
- pxor %xmm2,%xmm3
- movdqu %xmm2,0(%rdi)
- pxor %xmm0,%xmm4
- movdqu %xmm0,16(%rdi)
-.byte 102,15,58,15,227,8
- movdqu %xmm4,32(%rdi)
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- movdqa %xmm0,%xmm5
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
+.text
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
- pxor %xmm5,%xmm3
- movdqu %xmm5,48(%rdi)
- pxor %xmm0,%xmm4
- movdqu %xmm0,64(%rdi)
-.byte 102,15,58,15,227,8
- movdqu %xmm4,80(%rdi)
- .byte 0xf3,0xc3
-.cfi_endproc
-.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
@@ -912,400 +151,19 @@
.byte 0xf3,0xc3
.cfi_endproc
.size gcm_gmult_clmul,.-gcm_gmult_clmul
-.globl gcm_ghash_clmul
-.type gcm_ghash_clmul,@function
-.align 32
-gcm_ghash_clmul:
-.cfi_startproc
-.L_ghash_clmul:
- movdqa .Lbswap_mask(%rip),%xmm10
- movdqu (%rdi),%xmm0
- movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm7
-.byte 102,65,15,56,0,194
-
- subq $0x10,%rcx
- jz .Lodd_tail
-
- movdqu 16(%rsi),%xmm6
- movl OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $0x30,%rcx
- jb .Lskip4x
-
- andl $71303168,%eax
- cmpl $4194304,%eax
- je .Lskip4x
-
- subq $0x30,%rcx
- movq $0xA040608020C0E000,%rax
- movdqu 48(%rsi),%xmm14
- movdqu 64(%rsi),%xmm15
-
-
-
-
- movdqu 48(%rdx),%xmm3
- movdqu 32(%rdx),%xmm11
-.byte 102,65,15,56,0,218
-.byte 102,69,15,56,0,218
- movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
-
- movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
- pxor %xmm11,%xmm12
-.byte 102,68,15,58,68,222,0
-.byte 102,68,15,58,68,238,17
-.byte 102,68,15,58,68,231,16
- xorps %xmm11,%xmm3
- xorps %xmm13,%xmm5
- movups 80(%rsi),%xmm7
- xorps %xmm12,%xmm4
-
- movdqu 16(%rdx),%xmm11
- movdqu 0(%rdx),%xmm8
-.byte 102,69,15,56,0,218
-.byte 102,69,15,56,0,194
- movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
- pxor %xmm8,%xmm0
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-.byte 102,69,15,58,68,238,17
-.byte 102,68,15,58,68,231,0
- xorps %xmm11,%xmm3
- xorps %xmm13,%xmm5
-
- leaq 64(%rdx),%rdx
- subq $0x40,%rcx
- jc .Ltail4x
-
- jmp .Lmod4_loop
-.align 32
-.Lmod4_loop:
-.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm4
- movdqu 48(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,65,15,58,68,207,17
- xorps %xmm3,%xmm0
- movdqu 32(%rdx),%xmm3
- movdqa %xmm11,%xmm13
-.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
- xorps %xmm5,%xmm1
- pxor %xmm11,%xmm12
-.byte 102,65,15,56,0,218
- movups 32(%rsi),%xmm7
- xorps %xmm4,%xmm8
-.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
-
- pxor %xmm0,%xmm8
- movdqa %xmm3,%xmm5
- pxor %xmm1,%xmm8
- pxor %xmm3,%xmm4
- movdqa %xmm8,%xmm9
-.byte 102,68,15,58,68,234,17
- pslldq $8,%xmm8
- psrldq $8,%xmm9
- pxor %xmm8,%xmm0
- movdqa .L7_mask(%rip),%xmm8
- pxor %xmm9,%xmm1
-.byte 102,76,15,110,200
-
- pand %xmm0,%xmm8
-.byte 102,69,15,56,0,200
- pxor %xmm0,%xmm9
-.byte 102,68,15,58,68,231,0
- psllq $57,%xmm9
- movdqa %xmm9,%xmm8
- pslldq $8,%xmm9
-.byte 102,15,58,68,222,0
- psrldq $8,%xmm8
- pxor %xmm9,%xmm0
- pxor %xmm8,%xmm1
- movdqu 0(%rdx),%xmm8
-
- movdqa %xmm0,%xmm9
- psrlq $1,%xmm0
-.byte 102,15,58,68,238,17
- xorps %xmm11,%xmm3
- movdqu 16(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,15,58,68,231,16
- xorps %xmm13,%xmm5
- movups 80(%rsi),%xmm7
-.byte 102,69,15,56,0,194
- pxor %xmm9,%xmm1
- pxor %xmm0,%xmm9
- psrlq $5,%xmm0
-
- movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
- pxor %xmm9,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- movdqa %xmm0,%xmm1
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-
-.byte 102,68,15,58,68,231,0
- xorps %xmm13,%xmm5
-
- leaq 64(%rdx),%rdx
- subq $0x40,%rcx
- jnc .Lmod4_loop
-
-.Ltail4x:
-.byte 102,65,15,58,68,199,0
-.byte 102,65,15,58,68,207,17
-.byte 102,68,15,58,68,199,16
- xorps %xmm12,%xmm4
- xorps %xmm3,%xmm0
- xorps %xmm5,%xmm1
- pxor %xmm0,%xmm1
- pxor %xmm4,%xmm8
-
- pxor %xmm1,%xmm8
- pxor %xmm0,%xmm1
-
- movdqa %xmm8,%xmm9
- psrldq $8,%xmm8
- pslldq $8,%xmm9
- pxor %xmm8,%xmm1
- pxor %xmm9,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- addq $0x40,%rcx
- jz .Ldone
- movdqu 32(%rsi),%xmm7
- subq $0x10,%rcx
- jz .Lodd_tail
-.Lskip4x:
-
-
-
-
-
- movdqu (%rdx),%xmm8
- movdqu 16(%rdx),%xmm3
-.byte 102,69,15,56,0,194
-.byte 102,65,15,56,0,218
- pxor %xmm8,%xmm0
-
- movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
-
- leaq 32(%rdx),%rdx
- nop
- subq $0x20,%rcx
- jbe .Leven_tail
- nop
- jmp .Lmod_loop
-
-.align 32
-.Lmod_loop:
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
- pxor %xmm0,%xmm4
-
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
-
- pxor %xmm3,%xmm0
- pxor %xmm5,%xmm1
- movdqu (%rdx),%xmm9
- pxor %xmm0,%xmm8
-.byte 102,69,15,56,0,202
- movdqu 16(%rdx),%xmm3
-
- pxor %xmm1,%xmm8
- pxor %xmm9,%xmm1
- pxor %xmm8,%xmm4
-.byte 102,65,15,56,0,218
- movdqa %xmm4,%xmm8
- psrldq $8,%xmm8
- pslldq $8,%xmm4
- pxor %xmm8,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm3,%xmm5
-
- movdqa %xmm0,%xmm9
- movdqa %xmm0,%xmm8
- psllq $5,%xmm0
- pxor %xmm0,%xmm8
-.byte 102,15,58,68,218,0
- psllq $1,%xmm0
- pxor %xmm8,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm8
- pslldq $8,%xmm0
- psrldq $8,%xmm8
- pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
- pxor %xmm8,%xmm1
- pxor %xmm5,%xmm4
-
- movdqa %xmm0,%xmm9
- psrlq $1,%xmm0
-.byte 102,15,58,68,234,17
- pxor %xmm9,%xmm1
- pxor %xmm0,%xmm9
- psrlq $5,%xmm0
- pxor %xmm9,%xmm0
- leaq 32(%rdx),%rdx
- psrlq $1,%xmm0
-.byte 102,15,58,68,231,0
- pxor %xmm1,%xmm0
-
- subq $0x20,%rcx
- ja .Lmod_loop
-
-.Leven_tail:
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
- pxor %xmm0,%xmm4
-
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
-
- pxor %xmm3,%xmm0
- pxor %xmm5,%xmm1
- pxor %xmm0,%xmm8
- pxor %xmm1,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm8
- psrldq $8,%xmm8
- pslldq $8,%xmm4
- pxor %xmm8,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- testq %rcx,%rcx
- jnz .Ldone
-
-.Lodd_tail:
- movdqu (%rdx),%xmm8
-.byte 102,69,15,56,0,194
- pxor %xmm8,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,223,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
-.Ldone:
-.byte 102,65,15,56,0,194
- movdqu %xmm0,(%rdi)
- .byte 0xf3,0xc3
-.cfi_endproc
-.size gcm_ghash_clmul,.-gcm_ghash_clmul
-.globl gcm_init_avx
-.type gcm_init_avx,@function
+.globl gcm_init_htab_avx
+.type gcm_init_htab_avx,@function
.align 32
-gcm_init_avx:
+gcm_init_htab_avx:
.cfi_startproc
vzeroupper
vmovdqu (%rsi),%xmm2
- vpshufd $78,%xmm2,%xmm2
+ // KCF/ICP stores H in network byte order with the hi qword first
+ // so we need to swap all bytes, not the 2 qwords.
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vpshufb %xmm4,%xmm2,%xmm2
vpshufd $255,%xmm2,%xmm4
@@ -1405,7 +263,8 @@
vzeroupper
.byte 0xf3,0xc3
.cfi_endproc
-.size gcm_init_avx,.-gcm_init_avx
+.size gcm_init_htab_avx,.-gcm_init_htab_avx
+
.globl gcm_gmult_avx
.type gcm_gmult_avx,@function
.align 32
@@ -1845,3 +704,5 @@
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) */
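Note on the one functional change above: as the added comment in gcm_init_htab_avx says, KCF/ICP stores the hash subkey H in network byte order with the high qword first, while the code generated from ghash-x86_64.pl expects the two qwords pre-swapped, so the vpshufd $78 qword swap is replaced by a full 16-byte vpshufb through .Lbswap_mask. A minimal C sketch of the difference (illustrative only; swap_qwords and swap_all_bytes are made-up names, not code from either tree):

#include <stdint.h>
#include <string.h>

/* vpshufd $78: swap the two 64-bit halves; bytes within each half stay put. */
static void swap_qwords(uint8_t h[16])
{
    uint8_t t[8];
    memcpy(t, h, 8);
    memcpy(h, h + 8, 8);
    memcpy(h + 8, t, 8);
}

/* vpshufb with .Lbswap_mask: reverse all 16 bytes end to end. */
static void swap_all_bytes(uint8_t h[16])
{
    for (int i = 0; i < 8; i++) {
        uint8_t t = h[i];
        h[i] = h[15 - i];
        h[15 - i] = t;
    }
}

For a big-endian (network byte order) H, only the full byte reversal produces the little-endian qword layout the multiply code expects, which is why the qword-only swap had to change.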