AttilaFueloep/aesni-gcm-x86_64.S.diff

## aesni-gcm-x86_64.S.diff
--- ../openssl/crypto/modes/aesni-gcm-x86_64.s	2019-10-19 20:15:28.989989780 +0200
+++ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S	2019-12-16 19:32:16.699285141 +0100
@@ -1,3 +1,51 @@
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
+#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
+# and modified for ICP. Modifications are kept at a bare minimum to ease later
+# upstream merges.
+
+#if defined(__x86_64__) && defined(HAVE_AVX)
+
 .text

 .type	_aesni_ctr32_ghash_6x,@function
@@ -190,7 +238,7 @@
 	movbeq	0(%r14),%r12
 	vaesenc	%xmm1,%xmm14,%xmm14
 	vmovups	160-128(%rcx),%xmm1
-	cmpl	$11,%ebp
+	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
 	jb	.Lenc_tail

 	vaesenc	%xmm15,%xmm9,%xmm9
@@ -208,7 +256,8 @@
 	vmovups	176-128(%rcx),%xmm15
 	vaesenc	%xmm1,%xmm14,%xmm14
 	vmovups	192-128(%rcx),%xmm1
-	je	.Lenc_tail
+	cmpl	$14,%ebp	// ICP does not zero key schedule.
+	jb	.Lenc_tail

 	vaesenc	%xmm15,%xmm9,%xmm9
 	vaesenc	%xmm15,%xmm10,%xmm10
@@ -347,7 +396,7 @@
 	vmovdqu	(%r11),%xmm0
 	leaq	128(%rcx),%rcx
 	leaq	32+32(%r9),%r9
-	movl	240-128(%rcx),%ebp
+	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
 	vpshufb	%xmm0,%xmm8,%xmm8

 	andq	%r15,%r14
@@ -418,7 +467,7 @@
 _aesni_ctr32_6x:
 	vmovdqu	0-128(%rcx),%xmm4
 	vmovdqu	32(%r11),%xmm2
-	leaq	-1(%rbp),%r13
+	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
 	vmovups	16-128(%rcx),%xmm15
 	leaq	32-128(%rcx),%r12
 	vpxor	%xmm4,%xmm1,%xmm9
@@ -538,7 +587,7 @@
 	leaq	128(%rcx),%rcx
 	vmovdqu	(%r11),%xmm0
 	andq	$-128,%rsp
-	movl	240-128(%rcx),%ebp
+	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

 	andq	%r15,%r14
 	andq	%rsp,%r15
@@ -770,6 +819,56 @@
 	.byte	0xf3,0xc3
 .cfi_endproc
 .size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * Zero all vector registers (%ymm0-%ymm15, and so %xmm0-%xmm15 too).
+ * void clear_fpu_regs_avx(void);
+ */
+.globl	clear_fpu_regs_avx
+.type	clear_fpu_regs_avx,@function
+.align	32
+clear_fpu_regs_avx:
+	vzeroall		// one instruction clears every YMM register
+	ret
+.size	clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs the 16 bytes at `src' (%rdi) into the 16 bytes at `dst' (%rsi);
+ * neither pointer has to be aligned. Clobbers %xmm0 and %xmm1, so make
+ * sure FPU state is saved when running this in the kernel.
+ */
+.globl  gcm_xor_avx
+.type	gcm_xor_avx,@function
+.align	32
+gcm_xor_avx:
+	movdqu  (%rdi), %xmm0	// xmm0 = 16 bytes at src
+	movdqu  (%rsi), %xmm1	// xmm1 = 16 bytes at dst
+	pxor    %xmm1, %xmm0	// xmm0 = src ^ dst
+	movdqu  %xmm0, (%rsi)	// store the result back to dst
+	ret
+.size	gcm_xor_avx,.-gcm_xor_avx
+
+/*
+ * Toggle a boolean_t value atomically and return the new value (0 or 1).
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ */
+.globl	atomic_toggle_boolean_nv
+.type	atomic_toggle_boolean_nv,@function
+.align	32
+atomic_toggle_boolean_nv:
+	xorl	%eax, %eax	// default return 0; done first, it clobbers ZF
+	lock
+	xorl	$1, (%rdi)	// locked flip of bit 0; ZF set if result is 0
+	jz	1f		// new value is 0: return 0
+	movl	$1, %eax	// new value is non-zero: return 1
+1:
+	ret
+.size	atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
+
 .align	64
 .Lbswap_mask:
 .byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
@@ -783,3 +882,5 @@
 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 .byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) */
	--- ../openssl/crypto/modes/aesni-gcm-x86_64.s 2019-10-19 20:15:28.989989780 +0200
	+++ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S 2019-12-16 19:32:16.699285141 +0100
	@@ -1,3 +1,51 @@
	+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
	+#
	+# Licensed under the Apache License 2.0 (the "License"). You may not use
	+# this file except in compliance with the License. You can obtain a copy
	+# in the file LICENSE in the source distribution or at
	+# https://www.openssl.org/source/license.html
	+
	+#
	+# ====================================================================
	+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	+# project. The module is, however, dual licensed under OpenSSL and
	+# CRYPTOGAMS licenses depending on where you obtain it. For further
	+# details see http://www.openssl.org/~appro/cryptogams/.
	+# ====================================================================
	+#
	+#
	+# AES-NI-CTR+GHASH stitch.
	+#
	+# February 2013
	+#
	+# OpenSSL GCM implementation is organized in such way that its
	+# performance is rather close to the sum of its streamed components,
	+# in the context parallelized AES-NI CTR and modulo-scheduled
	+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
	+# was observed to perform significantly better than the sum of the
	+# components on contemporary CPUs, the effort was deemed impossible to
	+# justify. This module is based on combination of Intel submissions,
	+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
	+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
	+# pressure with notable relative improvement, achieving 1.0 cycle per
	+# byte processed with 128-bit key on Haswell processor, 0.74 - on
	+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
	+# measurements for favourable packet size, one divisible by 96.
	+# Applications using the EVP interface will observe a few percent
	+# worse performance.]
	+#
	+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
	+#
	+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
	+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
	+
	+# Generated once from
	+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
	+# and modified for ICP. Modifications are kept at a bare minimum to ease later
	+# upstream merges.
	+
	+#if defined(__x86_64__) && defined(HAVE_AVX)
	+
	.text

	.type _aesni_ctr32_ghash_6x,@function
	@@ -190,7 +238,7 @@
	movbeq 0(%r14),%r12
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 160-128(%rcx),%xmm1
	- cmpl $11,%ebp
	+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
	jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	@@ -208,7 +256,8 @@
	vmovups 176-128(%rcx),%xmm15
	vaesenc %xmm1,%xmm14,%xmm14
	vmovups 192-128(%rcx),%xmm1
	- je .Lenc_tail
	+ cmpl $14,%ebp // ICP does not zero key schedule.
	+ jb .Lenc_tail

	vaesenc %xmm15,%xmm9,%xmm9
	vaesenc %xmm15,%xmm10,%xmm10
	@@ -347,7 +396,7 @@
	vmovdqu (%r11),%xmm0
	leaq 128(%rcx),%rcx
	leaq 32+32(%r9),%r9
	- movl 240-128(%rcx),%ebp
	+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
	vpshufb %xmm0,%xmm8,%xmm8

	andq %r15,%r14
	@@ -418,7 +467,7 @@
	_aesni_ctr32_6x:
	vmovdqu 0-128(%rcx),%xmm4
	vmovdqu 32(%r11),%xmm2
	- leaq -1(%rbp),%r13
	+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups 16-128(%rcx),%xmm15
	leaq 32-128(%rcx),%r12
	vpxor %xmm4,%xmm1,%xmm9
	@@ -538,7 +587,7 @@
	leaq 128(%rcx),%rcx
	vmovdqu (%r11),%xmm0
	andq $-128,%rsp
	- movl 240-128(%rcx),%ebp
	+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.

	andq %r15,%r14
	andq %rsp,%r15
	@@ -770,6 +819,56 @@
	.byte 0xf3,0xc3
	.cfi_endproc
	.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
	+
	+/* Some utility routines */
	+
	+/*
	+ * Zero all vector registers (%ymm0-%ymm15, and so %xmm0-%xmm15 too).
	+ * void clear_fpu_regs_avx(void);
	+ */
	+.globl clear_fpu_regs_avx
	+.type clear_fpu_regs_avx,@function
	+.align 32
	+clear_fpu_regs_avx:
	+ vzeroall // one instruction clears every YMM register
	+ ret
	+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
	+
	+/*
	+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
	+ *
	+ * XORs the 16 bytes at `src' (%rdi) into the 16 bytes at `dst' (%rsi);
	+ * neither pointer has to be aligned. Clobbers %xmm0 and %xmm1, so make
	+ * sure FPU state is saved when running this in the kernel.
	+ */
	+.globl gcm_xor_avx
	+.type gcm_xor_avx,@function
	+.align 32
	+gcm_xor_avx:
	+ movdqu (%rdi), %xmm0 // xmm0 = 16 bytes at src
	+ movdqu (%rsi), %xmm1 // xmm1 = 16 bytes at dst
	+ pxor %xmm1, %xmm0 // xmm0 = src ^ dst
	+ movdqu %xmm0, (%rsi) // store the result back to dst
	+ ret
	+.size gcm_xor_avx,.-gcm_xor_avx
	+
	+/*
	+ * Toggle a boolean_t value atomically and return the new value (0 or 1).
	+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
	+ */
	+.globl atomic_toggle_boolean_nv
	+.type atomic_toggle_boolean_nv,@function
	+.align 32
	+atomic_toggle_boolean_nv:
	+ xorl %eax, %eax // default return 0; done first, it clobbers ZF
	+ lock
	+ xorl $1, (%rdi) // locked flip of bit 0; ZF set if result is 0
	+ jz 1f // new value is 0: return 0
	+ movl $1, %eax // new value is non-zero: return 1
	+1:
	+ ret
	+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
	+
	.align 64
	.Lbswap_mask:
	.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
	@@ -783,3 +882,5 @@
	.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
	.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
	.align 64
	+
	+#endif /* defined(__x86_64__) && defined(HAVE_AVX) */