Last active
December 16, 2019 21:53
-
-
Save AttilaFueloep/c1aa86170adfbf11fdb37a2ffcad8aa3 to your computer and use it in GitHub Desktop.
Diff against OpenSSL's crypto/modes/aesni-gcm-x86_64.s.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- ../openssl/crypto/modes/aesni-gcm-x86_64.s 2019-10-19 20:15:28.989989780 +0200 | |
+++ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S 2019-12-16 19:32:16.699285141 +0100 | |
@@ -1,3 +1,51 @@ | |
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. | |
+# | |
+# Licensed under the Apache License 2.0 (the "License"). You may not use | |
+# this file except in compliance with the License. You can obtain a copy | |
+# in the file LICENSE in the source distribution or at | |
+# https://www.openssl.org/source/license.html | |
+ | |
+# | |
+# ==================================================================== | |
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
+# project. The module is, however, dual licensed under OpenSSL and | |
+# CRYPTOGAMS licenses depending on where you obtain it. For further | |
+# details see http://www.openssl.org/~appro/cryptogams/. | |
+# ==================================================================== | |
+# | |
+# | |
+# AES-NI-CTR+GHASH stitch. | |
+# | |
+# February 2013 | |
+# | |
+# OpenSSL GCM implementation is organized in such way that its | |
+# performance is rather close to the sum of its streamed components, | |
+# in the context parallelized AES-NI CTR and modulo-scheduled | |
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation | |
+# was observed to perform significantly better than the sum of the | |
+# components on contemporary CPUs, the effort was deemed impossible to | |
+# justify. This module is based on combination of Intel submissions, | |
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max | |
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles | |
+# pressure with notable relative improvement, achieving 1.0 cycle per | |
+# byte processed with 128-bit key on Haswell processor, 0.74 - on | |
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled | |
+# measurements for favourable packet size, one divisible by 96. | |
+# Applications using the EVP interface will observe a few percent | |
+# worse performance.] | |
+# | |
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). | |
+# | |
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest | |
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf | |
+ | |
+# Generated once from | |
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl | |
+# and modified for ICP. Modifications are kept at a bare minimum to ease later | |
+# upstream merges. | |
+ | |
+#if defined(__x86_64__) && defined(HAVE_AVX) | |
+ | |
.text | |
.type _aesni_ctr32_ghash_6x,@function | |
@@ -190,7 +238,7 @@ | |
movbeq 0(%r14),%r12 | |
vaesenc %xmm1,%xmm14,%xmm14 | |
vmovups 160-128(%rcx),%xmm1 | |
- cmpl $11,%ebp | |
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. | |
jb .Lenc_tail | |
vaesenc %xmm15,%xmm9,%xmm9 | |
@@ -208,7 +256,8 @@ | |
vmovups 176-128(%rcx),%xmm15 | |
vaesenc %xmm1,%xmm14,%xmm14 | |
vmovups 192-128(%rcx),%xmm1 | |
- je .Lenc_tail | |
+ cmpl $14,%ebp // ICP does not zero key schedule. | |
+ jb .Lenc_tail | |
vaesenc %xmm15,%xmm9,%xmm9 | |
vaesenc %xmm15,%xmm10,%xmm10 | |
@@ -347,7 +396,7 @@ | |
vmovdqu (%r11),%xmm0 | |
leaq 128(%rcx),%rcx | |
leaq 32+32(%r9),%r9 | |
- movl 240-128(%rcx),%ebp | |
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. | |
vpshufb %xmm0,%xmm8,%xmm8 | |
andq %r15,%r14 | |
@@ -418,7 +467,7 @@ | |
_aesni_ctr32_6x: | |
vmovdqu 0-128(%rcx),%xmm4 | |
vmovdqu 32(%r11),%xmm2 | |
- leaq -1(%rbp),%r13 | |
+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. | |
vmovups 16-128(%rcx),%xmm15 | |
leaq 32-128(%rcx),%r12 | |
vpxor %xmm4,%xmm1,%xmm9 | |
@@ -538,7 +587,7 @@ | |
leaq 128(%rcx),%rcx | |
vmovdqu (%r11),%xmm0 | |
andq $-128,%rsp | |
- movl 240-128(%rcx),%ebp | |
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. | |
andq %r15,%r14 | |
andq %rsp,%r15 | |
@@ -770,6 +819,56 @@ | |
.byte 0xf3,0xc3 | |
.cfi_endproc | |
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt | |
+ | |
+/* Some utility routines */ | |
+ | |
+/* | |
+ * clear all fpu registers | |
+ * void clear_fpu_regs_avx(void); | |
+ */ | |
+.globl clear_fpu_regs_avx | |
+.type clear_fpu_regs_avx,@function | |
+.align 32 | |
+clear_fpu_regs_avx: | |
+ vzeroall | |
+ ret | |
+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx | |
+ | |
+/* | |
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); | |
+ * | |
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and | |
+ * stores the result at `dst'. The XOR is performed using FPU registers, | |
+ * so make sure FPU state is saved when running this in the kernel. | |
+ */ | |
+.globl gcm_xor_avx | |
+.type gcm_xor_avx,@function | |
+.align 32 | |
+gcm_xor_avx: | |
+ movdqu (%rdi), %xmm0 | |
+ movdqu (%rsi), %xmm1 | |
+ pxor %xmm1, %xmm0 | |
+ movdqu %xmm0, (%rsi) | |
+ ret | |
+.size gcm_xor_avx,.-gcm_xor_avx | |
+ | |
+/* | |
+ * Toggle a boolean_t value atomically and return the new value. | |
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); | |
+ */ | |
+.globl atomic_toggle_boolean_nv | |
+.type atomic_toggle_boolean_nv,@function | |
+.align 32 | |
+atomic_toggle_boolean_nv: | |
+ xorl %eax, %eax | |
+ lock | |
+ xorl $1, (%rdi) | |
+ jz 1f | |
+ movl $1, %eax | |
+1: | |
+ ret | |
+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv | |
+ | |
.align 64 | |
.Lbswap_mask: | |
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | |
@@ -783,3 +882,5 @@ | |
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 | |
.align 64 | |
+ | |
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment