Skip to content

Instantly share code, notes, and snippets.

@AttilaFueloep
Last active December 16, 2019 21:53
Show Gist options
  • Save AttilaFueloep/c1aa86170adfbf11fdb37a2ffcad8aa3 to your computer and use it in GitHub Desktop.
Save AttilaFueloep/c1aa86170adfbf11fdb37a2ffcad8aa3 to your computer and use it in GitHub Desktop.
Diff against openssl openssl/crypto/modes/aesni-gcm-x86_64.s.
--- ../openssl/crypto/modes/aesni-gcm-x86_64.s 2019-10-19 20:15:28.989989780 +0200
+++ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S 2019-12-16 19:32:16.699285141 +0100
@@ -1,3 +1,51 @@
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
+#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
+# and modified for ICP. Modifications are kept to a bare minimum to ease later
+# upstream merges.
+
+#if defined(__x86_64__) && defined(HAVE_AVX)
+
.text
.type _aesni_ctr32_ghash_6x,@function
@@ -190,7 +238,7 @@
movbeq 0(%r14),%r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
- cmpl $11,%ebp
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
jb .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
@@ -208,7 +256,8 @@
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
- je .Lenc_tail
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
@@ -347,7 +396,7 @@
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
leaq 32+32(%r9),%r9
- movl 240-128(%rcx),%ebp
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
vpshufb %xmm0,%xmm8,%xmm8
andq %r15,%r14
@@ -418,7 +467,7 @@
_aesni_ctr32_6x:
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
- leaq -1(%rbp),%r13
+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
vmovups 16-128(%rcx),%xmm15
leaq 32-128(%rcx),%r12
vpxor %xmm4,%xmm1,%xmm9
@@ -538,7 +587,7 @@
leaq 128(%rcx),%rcx
vmovdqu (%r11),%xmm0
andq $-128,%rsp
- movl 240-128(%rcx),%ebp
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
andq %r15,%r14
andq %rsp,%r15
@@ -770,6 +819,56 @@
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * Zero the AVX vector registers with vzeroall; no arguments, no return value.
+ * void clear_fpu_regs_avx(void);
+ */
+.globl clear_fpu_regs_avx
+.type clear_fpu_regs_avx,@function
+.align 32
+clear_fpu_regs_avx:
+ vzeroall // zeroes the full width of %ymm0-%ymm15 (64-bit mode)
+ ret
+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ */
+.globl gcm_xor_avx
+.type gcm_xor_avx,@function
+.align 32
+gcm_xor_avx:
+ movdqu (%rdi), %xmm0 // xmm0 = 16 bytes at src (%rdi); unaligned load
+ movdqu (%rsi), %xmm1 // xmm1 = 16 bytes at dst (%rsi); unaligned load
+ pxor %xmm1, %xmm0 // xmm0 = src ^ dst
+ movdqu %xmm0, (%rsi) // write the result back to dst; %xmm0/%xmm1 stay clobbered
+ ret
+.size gcm_xor_avx,.-gcm_xor_avx
+
+/*
+ * Toggle a boolean_t value atomically and return the new value.
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ */
+.globl atomic_toggle_boolean_nv
+.type atomic_toggle_boolean_nv,@function
+.align 32
+atomic_toggle_boolean_nv:
+ xorl %eax, %eax // preset the return value to 0
+ lock // prefix: makes the following read-modify-write atomic
+ xorl $1, (%rdi) // flip bit 0 of the 32-bit word at %rdi; ZF = (new word == 0)
+ jz 1f // new word is zero: return the preset 0
+ movl $1, %eax // new word is nonzero: return 1 (movl leaves flags alone)
+1:
+ ret
+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
+
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
@@ -783,3 +882,5 @@
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment