@thoughtpolice
Created January 28, 2017 04:52
Agner Fog's 64-bit memcpy
;************************* memcpy64.asm ************************************
; Author: Agner Fog
; Date created: 2008-07-19
; Last modified: 2016-11-12
;
; Description:
; Faster version of the standard memcpy function:
; void * A_memcpy(void *dest, const void *src, size_t count);
; Copies 'count' bytes from 'src' to 'dest'
;
; Overriding standard function memcpy:
; The alias ?OVR_memcpy is changed to _memcpy in the object file if
; it is desired to override the standard library function memcpy.
;
; The function uses non-temporal writes to bypass the cache when the size is
; bigger than half the size of the largest-level cache. This limit can be
; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
; C++ prototypes:
; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
; extern "C" void SetMemcpyCacheLimit(); // in memmove64.asm
; extern "C" void SetMemcpyCacheLimit1(); // used internally
;
; Position-independent code is generated if POSITIONINDEPENDENT is defined.
;
; CPU dispatching included for the following instruction sets:
; SSE2, Suppl-SSE3, AVX, AVX512F, AVX512BW.
;
; Copyright (c) 2008-2016 GNU General Public License www.gnu.org/licenses
;******************************************************************************
default rel
global A_memcpy ; Function A_memcpy
global ?OVR_memcpy ; ?OVR removed if standard function memcpy overridden
global memcpySSE2 ; Version for processors with only SSE2
global memcpySSSE3 ; Version for processors with SSSE3
global memcpyU ; Version for processors with fast unaligned read
global memcpyU256 ; Version for processors with fast 256-bit read/write
global memcpyAVX512F ; Version for processors with fast 512-bit read/write
global memcpyAVX512BW ; Version for processors with fast 512-bit read/write
global GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
global SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
global getDispatch
; Imported from instrset64.asm
extern InstructionSet ; Instruction set for CPU dispatcher
; Imported from unalignedisfaster64.asm:
extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
; Imported from cachesize32.asm:
extern DataCacheSize ; Gets size of data cache
; Define prolog for this function
%MACRO PROLOGM 0
%IFDEF WINDOWS
push rsi
push rdi
mov rdi, rcx ; dest
mov r9, rcx ; dest
mov rsi, rdx ; src
mov rcx, r8 ; count
%ELSE ; Unix
mov rcx, rdx ; count
mov r9, rdi ; dest
%ENDIF
%ENDM
; Define return from this function
%MACRO EPILOGM 0
%IFDEF WINDOWS
pop rdi
pop rsi
%ENDIF
mov rax, r9 ; Return value = dest
ret
%ENDM
SECTION .text align=16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Common entry for dispatch
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
; Function entry:
A_memcpy:
?OVR_memcpy:
jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX512BW Version for processors with fast unaligned read and fast 512 bits write
; Requires AVX512BW, BMI2
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; memcpyAVX512BW:
align 8
; Version for size <= 40H. Requires AVX512BW and BMI2
L000: mov rax, -1
bzhi rax, rax, rcx ; set mask k1 to move rcx bytes, at most 40H
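; (e.g. rcx = 5: bzhi zeroes bits 5..63 of -1, so k1 = 1FH and only the
;  low 5 bytes are loaded and stored; rcx = 40H leaves all 64 mask bits set)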
kmovq k1, rax
vmovdqu8 zmm16{k1}{z}, [rsi]
vmovdqu8 [rdi]{k1}, zmm16
; vzeroupper not needed if we use zmm16?
EPILOGM
align 8
; Version for size = 40H - 80H
L010: ; make two partially overlapping blocks
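; (e.g. count = 60H: the first block covers bytes 0 - 3FH and the second
;  covers 20H - 5FH; the 20H-byte overlap is simply written twice)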
vmovdqu64 zmm16, [rsi]
vmovdqu64 zmm17, [rsi+rcx-40H]
vmovdqu64 [rdi], zmm16
vmovdqu64 [rdi+rcx-40H], zmm17
; vzeroupper not needed if we use zmm16?
EPILOGM
; Function entry
; rdi = dest
; rsi = src
; rcx = count
; r9 = dest
align 16
%IFDEF WINDOWS
times 5 nop ; align L200
%ELSE ; Unix
times 13 nop ; align L200
%ENDIF
memcpyAVX512BW: ; global label
memcpyAVX512BW@: ; local label
PROLOGM
cmp rcx, 040H
jbe L000
cmp rcx, 080H
jbe L010
L100: ; count > 80H ; Entry from memcpyAVX512F
vmovdqu64 zmm17, [rsi] ; save first possibly unaligned block to after main loop
vmovdqu64 zmm18, [rsi+rcx-40H] ; save last possibly unaligned block to after main loop
add rdi, rcx ; end of destination
and rdi, -40H ; round down to align by 40H
mov rdx, rdi
sub rdx, r9
add rsi, rdx ; end of main blocks of source
and rdx, -40H ; size of aligned blocks to copy
; Check if count very big
cmp rdx, [CacheBypassLimit]
ja L500 ; Use non-temporal store if count > CacheBypassLimit
neg rdx ; negative index from end of aligned blocks
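; rdx now counts from -(aligned size) up to zero, so [rsi+rdx] and [rdi+rdx]
; walk forward through the blocks and the loop needs no separate compare:
; add rdx, 40H sets ZF exactly when the last aligned block has been copied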
; align ?
L200: ; main loop. Move 40H bytes at a time
vmovdqu64 zmm16, [rsi+rdx]
vmovdqa64 [rdi+rdx], zmm16
add rdx, 40H
jnz L200
L210: ; insert remaining bytes at beginning and end, possibly overlapping main blocks
vmovdqu64 [r9], zmm17
vmovdqu64 [r9+rcx-40H], zmm18
;vzeroupper not needed if we use zmm16-18
EPILOGM
align 16
L500: ; Move 40H bytes at a time, non-temporal
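; vmovntdq stores around the cache (write-combining), so a very large copy
; does not evict the working set; the sfence below makes these weakly ordered
; stores globally visible before the function returns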
neg rdx
L510: vmovdqu64 zmm16, [rsi+rdx]
vmovntdq [rdi+rdx], zmm16
add rdx, 40H
jnz L510
sfence
jmp L210
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX512F Version for processors with fast unaligned read and fast 64 bytes write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function entry
align 16
memcpyAVX512F: ; global label
memcpyAVX512F@: ; local label
PROLOGM
; rdi = dest
; rsi = src
; rcx = count
cmp rcx, 080H
ja L100
cmp rcx, 040H
jae L010
; count < 40H
jmp A1000
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX Version for processors with fast unaligned read and fast 256 bits write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memcpyU256: ; global label
memcpyU256@: ; local label
PROLOGM
cmp rcx, 40H
jb A1000 ; Use simpler code if count < 64
; count >= 64
; Calculate size of first block up to first regular boundary of dest
mov edx, edi
neg edx
and edx, 1FH
jz B3100 ; Skip if dest aligned by 32
; edx = size of first partial block, 1 - 31 bytes
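; (e.g. dest ending in ...1DH: edx = (-1DH) and 1FH = 3, so moving 3 bytes
;  brings rdi up to the next 32-byte boundary)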
test dl, 3
jz B3030
test dl, 1
jz B3020
; move 1 byte
movzx eax, byte [rsi]
mov [rdi], al
inc rsi
inc rdi
B3020: test dl, 2
jz B3030
; move 2 bytes
movzx eax, word [rsi]
mov [rdi], ax
add rsi, 2
add rdi, 2
B3030: test dl, 4
jz B3040
; move 4 bytes
mov eax, [rsi]
mov [rdi], eax
add rsi, 4
add rdi, 4
B3040: test dl, 8
jz B3050
; move 8 bytes
mov rax, [rsi]
mov [rdi], rax
add rsi, 8
add rdi, 8
B3050: test dl, 16
jz B3060
; move 16 bytes
movups xmm0, [rsi]
movaps [rdi], xmm0
add rsi, 16
add rdi, 16
B3060: sub rcx, rdx
B3100: ; Now dest is aligned by 32. Any partial block has been moved
; Set up for loop moving 32 bytes per iteration:
mov rdx, rcx ; Save count
and rcx, -20H ; Round down to nearest multiple of 32
add rsi, rcx ; Point to the end
add rdi, rcx ; Point to the end
sub rdx, rcx ; Remaining data after loop
; Check if count very big
cmp rcx, [CacheBypassLimit]
ja I3100 ; Use non-temporal store if count > CacheBypassLimit
neg rcx ; Negative index from the end
H3100: ; copy -rcx bytes in blocks of 32 bytes.
; Check for false memory dependence: The CPU may falsely assume
; a partial overlap between the written destination and the following
; read source if source is unaligned and
; (src-dest) modulo 4096 is close to 4096
test sil, 1FH
jz H3110 ; aligned
mov eax, esi
sub eax, edi
and eax, 0FFFH ; modulo 4096
cmp eax, 1000H - 200H
ja J3100
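; i.e. take the backward-copy path when dest is less than 200H bytes above
; src modulo 1000H, where the low 12 address bits of a store and a later
; load can collide (4K aliasing)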
align 16
H3110: ; main copy loop, 32 bytes at a time
; rcx has negative index from the end, counting up to zero
vmovups ymm0, [rsi+rcx]
vmovaps [rdi+rcx], ymm0
add rcx, 20H
jnz H3110
vzeroupper ; end of AVX mode
H3120: ; Move the remaining edx bytes (0 - 31):
add rsi, rdx
add rdi, rdx
neg rdx
jz H3500 ; Skip if no more data
; move 16-8-4-2-1 bytes, aligned
cmp edx, -10H
jg H3200
; move 16 bytes
movups xmm0, [rsi+rdx]
movaps [rdi+rdx], xmm0
add rdx, 10H
H3200: cmp edx, -8
jg H3210
; move 8 bytes
movq xmm0, qword [rsi+rdx]
movq qword [rdi+rdx], xmm0
add rdx, 8
jz H3500 ; Early skip if count divisible by 8
H3210: cmp edx, -4
jg H3220
; move 4 bytes
mov eax, [rsi+rdx]
mov [rdi+rdx], eax
add rdx, 4
H3220: cmp edx, -2
jg H3230
; move 2 bytes
movzx eax, word [rsi+rdx]
mov [rdi+rdx], ax
add rdx, 2
H3230: cmp edx, -1
jg H3500
; move 1 byte
movzx eax, byte [rsi+rdx]
mov [rdi+rdx], al
H3500: ; finished
EPILOGM
I3100: ; non-temporal move
neg rcx ; Negative index from the end
align 16
I3110: ; main copy loop, 32 bytes at a time
; rcx has negative index from the end, counting up to zero
vmovups ymm0, [rsi+rcx]
vmovntps [rdi+rcx], ymm0
add rcx, 20H
jnz I3110
sfence
vzeroupper ; end of AVX mode
jmp H3120 ; Move the remaining edx bytes (0 - 31)
align 16
J3100: ; There is a false memory dependence.
; check if src and dest overlap, if not then it is safe
; to copy backwards to avoid false memory dependence
%if 1
; Use this version if you want consistent behavior in the case
; where dest > src and overlap. However, this case is undefined
; anyway because part of src is overwritten before copying
push rdx
mov rax, rsi
sub rax, rdi
cqo
xor rax, rdx
sub rax, rdx ; abs(src-dest)
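; (branchless absolute value: cqo copies the sign of rax into rdx, and
;  xor + sub negates rax only when it was negative; rdx is pushed/popped
;  around this because it still holds the remaining byte count)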
neg rcx ; size
pop rdx ; restore rdx
cmp rax, rcx
jnb J3110
neg rcx ; restore rcx
jmp H3110 ; overlap between src and dest. Can't copy backwards
%else
; save time by not checking the case that is undefined anyway
mov rax, rsi
sub rax, rdi
neg rcx ; size
cmp rax, rcx
jnb J3110 ; OK to copy backwards
; must copy forwards
neg rcx ; restore rcx
jmp H3110 ; copy forwards
%endif
J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
push rsi
push rdi
sub rsi, rcx
sub rdi, rcx
J3120: ; loop backwards
vmovups ymm0, [rsi+rcx-20H]
vmovaps [rdi+rcx-20H], ymm0
sub rcx, 20H
jnz J3120
vzeroupper
pop rdi
pop rsi
jmp H3120
align 16
; count < 64. Move 32-16-8-4-2-1 bytes
; multiple CPU versions (SSSE3 and above)
A1000: add rsi, rcx ; end of src
add rdi, rcx ; end of dest
neg rcx ; negative index from the end
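; the chain below peels off 32, 16, 8, 4, 2 and 1 bytes: each compare of the
; negative index against -N moves N bytes only if at least N bytes remain,
; so any count below 64 is handled with at most one move of each size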
cmp ecx, -20H
jg A1100
; move 32 bytes
; movdqu is faster than 64-bit moves on processors with SSSE3
movups xmm0, [rsi+rcx]
movups xmm1, [rsi+rcx+10H]
movups [rdi+rcx], xmm0
movups [rdi+rcx+10H], xmm1
add rcx, 20H
A1100: cmp ecx, -10H
jg A1200
; move 16 bytes
movups xmm0, [rsi+rcx]
movups [rdi+rcx], xmm0
add rcx, 10H
A1200: cmp ecx, -8
jg A1300
; move 8 bytes
mov rax, qword [rsi+rcx]
mov qword [rdi+rcx], rax
add rcx, 8
A1300: cmp ecx, -4
jg A1400
; move 4 bytes
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
add rcx, 4
jz A1900 ; early out if count divisible by 4
A1400: cmp ecx, -2
jg A1500
; move 2 bytes
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
add rcx, 2
A1500: cmp ecx, -1
jg A1900
; move 1 byte
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
A1900: ; finished
EPILOGM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with fast unaligned read and fast 16 bytes write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memcpyU: ; global label
memcpyU@: ; local label
PROLOGM
cmp rcx, 40H
jb A1000 ; Use simpler code if count < 64
; count >= 64
; Calculate size of first block up to first regular boundary of dest
mov edx, edi
neg edx
and edx, 0FH
jz B2100 ; Skip if dest aligned by 16
; edx = size of first partial block, 1 - 15 bytes
test dl, 3
jz B2030
test dl, 1
jz B2020
; move 1 byte
movzx eax, byte [rsi]
mov [rdi], al
inc rsi
inc rdi
B2020: test dl, 2
jz B2030
; move 2 bytes
movzx eax, word [rsi]
mov [rdi], ax
add rsi, 2
add rdi, 2
B2030: test dl, 4
jz B2040
; move 4 bytes
mov eax, [rsi]
mov [rdi], eax
add rsi, 4
add rdi, 4
B2040: test dl, 8
jz B2050
; move 8 bytes
mov rax, [rsi]
mov [rdi], rax
add rsi, 8
add rdi, 8
B2050: sub rcx, rdx
B2100: ; Now dest is aligned by 16. Any partial block has been moved
; Set up for loop moving 32 bytes per iteration:
mov rdx, rcx ; Save count
and rcx, -20H ; Round down to nearest multiple of 32
add rsi, rcx ; Point to the end
add rdi, rcx ; Point to the end
sub rdx, rcx ; Remaining data after loop
; Check if count very big
cmp rcx, [CacheBypassLimit]
ja I100 ; Use non-temporal store if count > CacheBypassLimit
neg rcx ; Negative index from the end
H100: ; copy -rcx bytes in blocks of 32 bytes.
; Check for false memory dependence: The CPU may falsely assume
; a partial overlap between the written destination and the following
; read source if source is unaligned and
; (src-dest) modulo 4096 is close to 4096
test sil, 0FH
jz H110 ; aligned
mov eax, esi
sub eax, edi
and eax, 0FFFH ; modulo 4096
cmp eax, 1000H - 200H
ja J100
H110: ; main copy loop, 32 bytes at a time
; rcx has negative index from the end, counting up to zero
movups xmm0, [rsi+rcx]
movups xmm1, [rsi+rcx+10H]
movaps [rdi+rcx], xmm0
movaps [rdi+rcx+10H], xmm1
add rcx, 20H
jnz H110
H120: ; Move the remaining edx bytes (0 - 31):
add rsi, rdx
add rdi, rdx
neg rdx
jz H500 ; Skip if no more data
; move 16-8-4-2-1 bytes, aligned
cmp edx, -10H
jg H200
; move 16 bytes
movups xmm0, [rsi+rdx]
movaps [rdi+rdx], xmm0
add rdx, 10H
H200: cmp edx, -8
jg H210
; move 8 bytes
movq xmm0, qword [rsi+rdx]
movq qword [rdi+rdx], xmm0
add rdx, 8
jz H500 ; Early skip if count divisible by 8
H210: cmp edx, -4
jg H220
; move 4 bytes
mov eax, [rsi+rdx]
mov [rdi+rdx], eax
add rdx, 4
H220: cmp edx, -2
jg H230
; move 2 bytes
movzx eax, word [rsi+rdx]
mov [rdi+rdx], ax
add rdx, 2
H230: cmp edx, -1
jg H500
; move 1 byte
movzx eax, byte [rsi+rdx]
mov [rdi+rdx], al
H500: ; finished
EPILOGM
I100: ; non-temporal move
neg rcx ; Negative index from the end
align 16
I110: ; main copy loop, 32 bytes at a time
; rcx has negative index from the end, counting up to zero
movups xmm0, [rsi+rcx]
movups xmm1, [rsi+rcx+10H]
movntps [rdi+rcx], xmm0
movntps [rdi+rcx+10H], xmm1
add rcx, 20H
jnz I110
sfence
jmp H120 ; Move the remaining edx bytes (0 - 31):
align 16
J100: ; There is a false memory dependence.
; check if src and dest overlap, if not then it is safe
; to copy backwards to avoid false memory dependence
%if 1
; Use this version if you want consistent behavior in the case
; where dest > src and overlap. However, this case is undefined
; anyway because part of src is overwritten before copying
push rdx
mov rax, rsi
sub rax, rdi
cqo
xor rax, rdx
sub rax, rdx ; abs(src-dest)
neg rcx ; size
pop rdx ; restore rdx
cmp rax, rcx
jnb J110
neg rcx ; restore rcx
jmp H110 ; overlap between src and dest. Can't copy backwards
%else
; save time by not checking the case that is undefined anyway
mov rax, rsi
sub rax, rdi
neg rcx ; size
cmp rax, rcx
jnb J110 ; OK to copy backwards
; must copy forwards
neg rcx ; restore rcx
jmp H110 ; copy forwards
%endif
J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
push rsi
push rdi
sub rsi, rcx
sub rdi, rcx
J120: ; loop backwards
movups xmm1, [rsi+rcx-20H]
movups xmm0, [rsi+rcx-10H]
movaps [rdi+rcx-20H], xmm1
movaps [rdi+rcx-10H], xmm0
sub rcx, 20H
jnz J120
pop rdi
pop rsi
jmp H120
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSSE3. Aligned read + shift + aligned write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memcpySSSE3: ; global label
memcpySSSE3@: ; local label
PROLOGM
cmp rcx, 40H
jb A1000 ; Use simpler code if count < 64
; count >= 64
; Calculate size of first block up to first regular boundary of dest
mov edx, edi
neg edx
and edx, 0FH
jz B1200 ; Skip if dest aligned by 16
; edx = size of first partial block, 1 - 15 bytes
test dl, 3
jz B1030
test dl, 1
jz B1020
; move 1 byte
movzx eax, byte [rsi]
mov [rdi], al
inc rsi
inc rdi
B1020: test dl, 2
jz B1030
; move 2 bytes
movzx eax, word [rsi]
mov [rdi], ax
add rsi, 2
add rdi, 2
B1030: test dl, 4
jz B1040
; move 4 bytes
mov eax, [rsi]
mov [rdi], eax
add rsi, 4
add rdi, 4
B1040: test dl, 8
jz B1050
; move 8 bytes
mov rax, [rsi]
mov [rdi], rax
add rsi, 8
add rdi, 8
B1050: sub rcx, rdx
B1200: ; Now dest is aligned by 16. Any partial block has been moved
; Find alignment of src modulo 16 at this point:
mov eax, esi
and eax, 0FH
; Set up for loop moving 32 bytes per iteration:
mov edx, ecx ; Save count (lower 32 bits)
and rcx, -20H ; Round down count to nearest multiple of 32
add rsi, rcx ; Point to the end
add rdi, rcx ; Point to the end
sub edx, ecx ; Remaining data after loop (0-31)
sub rsi, rax ; Nearest preceding aligned block of src
; Check if count very big
cmp rcx, [CacheBypassLimit]
ja B1400 ; Use non-temporal store if count > CacheBypassLimit
neg rcx ; Negative index from the end
; Dispatch to different codes depending on src alignment
lea r8, [AlignmentDispatchSSSE3]
jmp near [r8+rax*8]
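; rax = src modulo 16 selects one of the 16 eight-byte code pointers in the
; table, so each source misalignment gets its own specialized copy loop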
B1400: neg rcx
; Dispatch to different codes depending on src alignment
lea r8, [AlignmentDispatchNT]
jmp near [r8+rax*8]
align 16
C100: ; Code for aligned src. SSE2 and SSSE3 versions
; The nice case, src and dest have same alignment.
; Loop. rcx has negative index from the end, counting up to zero
movaps xmm0, [rsi+rcx]
movaps xmm1, [rsi+rcx+10H]
movaps [rdi+rcx], xmm0
movaps [rdi+rcx+10H], xmm1
add rcx, 20H
jnz C100
; Move the remaining edx bytes (0 - 31):
add rsi, rdx
add rdi, rdx
neg rdx
jz C500 ; Skip if no more data
; move 16-8-4-2-1 bytes, aligned
cmp edx, -10H
jg C200
; move 16 bytes
movaps xmm0, [rsi+rdx]
movaps [rdi+rdx], xmm0
add rdx, 10H
C200: cmp edx, -8
jg C210
; move 8 bytes
mov rax, [rsi+rdx]
mov [rdi+rdx], rax
add rdx, 8
jz C500 ; Early skip if count divisible by 8
C210: cmp edx, -4
jg C220
; move 4 bytes
mov eax, [rsi+rdx]
mov [rdi+rdx], eax
add rdx, 4
C220: cmp edx, -2
jg C230
; move 2 bytes
movzx eax, word [rsi+rdx]
mov [rdi+rdx], ax
add rdx, 2
C230: cmp edx, -1
jg C500
; move 1 byte
movzx eax, byte [rsi+rdx]
mov [rdi+rdx], al
C500: ; finished
EPILOGM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSE2. Aligned read + shift + aligned write
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
memcpySSE2: ; global label
memcpySSE2@: ; local label
PROLOGM
cmp rcx, 40H
jae B0100 ; Jump to main code if count >= 64; else fall through to simpler code
; count < 64. Move 32-16-8-4-2-1 bytes
add rsi, rcx ; end of src
add rdi, rcx ; end of dest
neg rcx ; negative index from the end
cmp ecx, -20H
jg A100
; move 32 bytes
; mov r64 is faster than movdqu on Intel Pentium M and Core 1
; movdqu is fast on Nehalem and later
mov rax, [rsi+rcx]
mov rdx, [rsi+rcx+8]
mov [rdi+rcx], rax
mov [rdi+rcx+8], rdx
mov rax, qword [rsi+rcx+10H]
mov rdx, qword [rsi+rcx+18H]
mov qword [rdi+rcx+10H], rax
mov qword [rdi+rcx+18H], rdx
add rcx, 20H
A100: cmp ecx, -10H
jg A200
; move 16 bytes
mov rax, [rsi+rcx]
mov rdx, [rsi+rcx+8]
mov [rdi+rcx], rax
mov [rdi+rcx+8], rdx
add rcx, 10H
A200: cmp ecx, -8
jg A300
; move 8 bytes
mov rax, qword [rsi+rcx]
mov qword [rdi+rcx], rax
add rcx, 8
A300: cmp ecx, -4
jg A400
; move 4 bytes
mov eax, [rsi+rcx]
mov [rdi+rcx], eax
add rcx, 4
jz A900 ; early out if count divisible by 4
A400: cmp ecx, -2
jg A500
; move 2 bytes
movzx eax, word [rsi+rcx]
mov [rdi+rcx], ax
add rcx, 2
A500: cmp ecx, -1
jg A900
; move 1 byte
movzx eax, byte [rsi+rcx]
mov [rdi+rcx], al
A900: ; finished
EPILOGM
B0100: ; count >= 64
; Calculate size of first block up to first regular boundary of dest
mov edx, edi
neg edx
and edx, 0FH
jz B0200 ; Skip if dest aligned by 16
; edx = size of first partial block, 1 - 15 bytes
test dl, 3
jz B0030
test dl, 1
jz B0020
; move 1 byte
movzx eax, byte [rsi]
mov [rdi], al
inc rsi
inc rdi
B0020: test dl, 2
jz B0030
; move 2 bytes
movzx eax, word [rsi]
mov [rdi], ax
add rsi, 2
add rdi, 2
B0030: test dl, 4
jz B0040
; move 4 bytes
mov eax, [rsi]
mov [rdi], eax
add rsi, 4
add rdi, 4
B0040: test dl, 8
jz B0050
; move 8 bytes
mov rax, [rsi]
mov [rdi], rax
add rsi, 8
add rdi, 8
B0050: sub rcx, rdx
B0200: ; Now dest is aligned by 16. Any partial block has been moved
; This part will not always work if count < 64
; Calculate size of first block up to first regular boundary of dest
mov edx, edi
neg edx
and edx, 0FH
jz B300 ; Skip if dest aligned by 16
; rdx = size of first partial block, 1 - 15 bytes
add rsi, rdx
add rdi, rdx
sub rcx, rdx
neg rdx
cmp edx, -8
jg B200
; move 8 bytes
mov rax, [rsi+rdx]
mov [rdi+rdx], rax
add rdx, 8
B200: cmp edx, -4
jg B210
; move 4 bytes
mov eax, [rsi+rdx]
mov [rdi+rdx], eax
add rdx, 4
jz B300 ; early out if aligned by 4
B210: cmp edx, -2
jg B220
; move 2 bytes
movzx eax, word [rsi+rdx]
mov [rdi+rdx], ax
add rdx, 2
B220: cmp edx, -1
jg B300
; move 1 byte
movzx eax, byte [rsi+rdx]
mov [rdi+rdx], al
B300: ; Now dest is aligned by 16. Any partial block has been moved
; Find alignment of src modulo 16 at this point:
mov eax, esi
and eax, 0FH
; Set up for loop moving 32 bytes per iteration:
mov edx, ecx ; Save count (lower 32 bits)
and rcx, -20H ; Round down count to nearest multiple of 32
add rsi, rcx ; Point to the end
add rdi, rcx ; Point to the end
sub edx, ecx ; Remaining data after loop (0-31)
sub rsi, rax ; Nearest preceding aligned block of src
; Check if count very big
cmp rcx, [CacheBypassLimit]
ja B400 ; Use non-temporal store if count > CacheBypassLimit
neg rcx ; Negative index from the end
; Dispatch to different codes depending on src alignment
lea r8, [AlignmentDispatchSSE2]
jmp near [r8+rax*8]
B400: neg rcx
; Dispatch to different codes depending on src alignment
lea r8, [AlignmentDispatchNT]
jmp near [r8+rax*8]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Macros and alignment jump tables
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Macros for each src alignment, SSE2 instruction set:
; Make separate code for each alignment u because the shift instructions
; have the shift count as a constant:
%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; %2 = 1 if non-temporal store desired
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = - (count rounded down to nearest divisible by 32)
; edx = remaining bytes to move after loop
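; Example for %1 = 3: xmm0 holds the 16 aligned bytes that start 3 bytes
; before the true source position; shifting it right by 3 bytes and OR-ing
; in the next block shifted left by 13 reassembles 16 consecutive source
; bytes that can then be stored aligned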
movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
%%L1: ; Loop. rcx has negative index from the end, counting up to zero
movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
movdqa xmm2, [rsi+rcx+20H]
movdqa xmm3, xmm1 ; Copy because used twice
psrldq xmm0, %1 ; shift right
pslldq xmm1, 16-%1 ; shift left
por xmm0, xmm1 ; combine blocks
%IF %2 == 0
movdqa [rdi+rcx], xmm0 ; Save aligned
%ELSE
movntdq [rdi+rcx], xmm0 ; non-temporal save
%ENDIF
movdqa xmm0, xmm2 ; Save for next iteration
psrldq xmm3, %1 ; shift right
pslldq xmm2, 16-%1 ; shift left
por xmm3, xmm2 ; combine blocks
%IF %2 == 0
movdqa [rdi+rcx+10H], xmm3 ; Save aligned
%ELSE
movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
%ENDIF
add rcx, 20H ; Loop through negative values up to zero
jnz %%L1
; Set up for edx remaining bytes
add rsi, rdx
add rdi, rdx
neg rdx
cmp edx, -10H
jg %%L2
; One more 16-bytes block to move
movdqa xmm1, [rsi+rdx+10H]
psrldq xmm0, %1 ; shift right
pslldq xmm1, 16-%1 ; shift left
por xmm0, xmm1 ; combine blocks
%IF %2 == 0
movdqa [rdi+rdx], xmm0 ; Save aligned
%ELSE
movntdq [rdi+rdx], xmm0 ; non-temporal save
%ENDIF
add rdx, 10H
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %2 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
; Special case for u = 4
; %1 = 1 if non-temporal store desired
movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
%%L1: ; Loop. rcx has negative index from the end, counting up to zero
movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
shufps xmm0, xmm0, 00111001B ; Rotate
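; (movss replaced dword 0 with the first dword of the next aligned block;
;  rotating the four dwords one position down then gives bytes 4..19
;  counted from the aligned load address, i.e. 16 consecutive bytes of the
;  misaligned source, ready for an aligned store)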
%IF %1 == 0
movaps [rdi+rcx], xmm0 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm0 ; Non-temporal save
%ENDIF
movaps xmm0, [rsi+rcx+20H]
movss xmm1, xmm0
shufps xmm1, xmm1, 00111001B
%IF %1 == 0
movaps [rdi+rcx+10H], xmm1 ; Save aligned
%ELSE
movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
%ENDIF
add rcx, 20H ; Loop through negative values up to zero
jnz %%L1
; Set up for edx remaining bytes
add rsi, rdx
add rdi, rdx
neg rdx
cmp edx, -10H
jg %%L2
; One more 16-bytes block to move
movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
movss xmm0, xmm1
shufps xmm0, xmm0, 00111001B
%IF %1 == 0
movaps [rdi+rdx], xmm0 ; Save aligned
%ELSE
movntps [rdi+rdx], xmm0 ; Non-temporal save
%ENDIF
add rdx, 10H
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %1 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
; Special case for u = 8
; %1 = 1 if non-temporal store desired
movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
%%L1: ; Loop. rcx has negative index from the end, counting up to zero
movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
shufps xmm0, xmm0, 01001110B ; Rotate
%IF %1 == 0
movaps [rdi+rcx], xmm0 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm0 ; Non-temporal save
%ENDIF
movaps xmm0, [rsi+rcx+20H]
movsd xmm1, xmm0
shufps xmm1, xmm1, 01001110B
%IF %1 == 0
movaps [rdi+rcx+10H], xmm1 ; Save aligned
%ELSE
movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
%ENDIF
add rcx, 20H ; Loop through negative values up to zero
jnz %%L1
; Set up for edx remaining bytes
add rsi, rdx
add rdi, rdx
neg rdx
cmp edx, -10H
jg %%L2
; One more 16-bytes block to move
movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
movsd xmm0, xmm1
shufps xmm0, xmm0, 01001110B
%IF %1 == 0
movaps [rdi+rdx], xmm0 ; Save aligned
%ELSE
movntps [rdi+rdx], xmm0 ; Non-temporal save
%ENDIF
add rdx, 10H
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %1 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
; Special case for u = 12
; %1 = 1 if non-temporal store desired
movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
shufps xmm0, xmm0, 10010011B
%%L1: ; Loop. rcx has negative index from the end, counting up to zero
movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
movaps xmm2, [rsi+rcx+20H]
shufps xmm1, xmm1, 10010011B
shufps xmm2, xmm2, 10010011B
movaps xmm3, xmm2
movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
%IF %1 == 0
movaps [rdi+rcx], xmm1 ; Save aligned
movaps [rdi+rcx+10H], xmm2 ; Save aligned
%ELSE
movntps [rdi+rcx], xmm1 ; Non-temporal save
movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
%ENDIF
movaps xmm0, xmm3 ; Save for next iteration
add rcx, 20H ; Loop through negative values up to zero
jnz %%L1
; Set up for edx remaining bytes
add rsi, rdx
add rdi, rdx
neg rdx
cmp edx, -10H
jg %%L2
; One more 16-bytes block to move
movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
shufps xmm1, xmm1, 10010011B
movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
%IF %1 == 0
movaps [rdi+rdx], xmm1 ; Save aligned
%ELSE
movntps [rdi+rdx], xmm1 ; Non-temporal save
%ENDIF
add rdx, 10H
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
%IF %1 == 1
sfence
%ENDIF
; Move remaining 0 - 15 bytes, unaligned
jmp C200
%ENDMACRO
; Macros for each src alignment, Suppl.SSE3 instruction set:
; Make separate code for each alignment u because the palignr instruction
; has the shift count as a constant:
%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = - (count rounded down to nearest divisible by 32)
; edx = remaining bytes to move after loop
movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
%%L1: ; Loop. rcx has negative index from the end, counting up to zero
movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
movdqa xmm3, [rsi+rcx+20H]
movdqa xmm1, xmm0 ; Save xmm0
movdqa xmm0, xmm3 ; Save for next iteration
palignr xmm3, xmm2, %1 ; Combine parts into aligned block
palignr xmm2, xmm1, %1 ; Combine parts into aligned block
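; palignr concatenates the two registers (first operand in the high half)
; and extracts 16 bytes starting at byte offset %1, which is exactly the
; unaligned source data rebuilt from two aligned loads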
movdqa [rdi+rcx], xmm2 ; Save aligned
movdqa [rdi+rcx+10H], xmm3 ; Save aligned
add rcx, 20H
jnz %%L1
; Set up for edx remaining bytes
add rsi, rdx
add rdi, rdx
neg rdx
cmp edx, -10H
jg %%L2
; One more 16-bytes block to move
movdqa xmm2, [rsi+rdx+10H]
palignr xmm2, xmm0, %1
movdqa [rdi+rdx], xmm2
add rdx, 10H
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
; Move remaining 0 - 15 bytes
jmp C200
%ENDMACRO
; Make 15 instances of SSE2 macro for each value of the alignment u.
; These are pointed to by the jump table AlignmentDispatchSSE2 below
; (alignments and fillers are inserted manually to minimize the number
; of 16-bytes boundaries inside loops)
align 16
D104: MOVE_UNALIGNED_SSE2_4 0
;times 4 nop
D108: MOVE_UNALIGNED_SSE2_8 0
;times 4 nop
D10C: MOVE_UNALIGNED_SSE2_12 0
;times 1 nop
D101: MOVE_UNALIGNED_SSE2 1, 0
D102: MOVE_UNALIGNED_SSE2 2, 0
D103: MOVE_UNALIGNED_SSE2 3, 0
D105: MOVE_UNALIGNED_SSE2 5, 0
D106: MOVE_UNALIGNED_SSE2 6, 0
D107: MOVE_UNALIGNED_SSE2 7, 0
D109: MOVE_UNALIGNED_SSE2 9, 0
;times 1 nop
D10A: MOVE_UNALIGNED_SSE2 0AH, 0
D10B: MOVE_UNALIGNED_SSE2 0BH, 0
D10D: MOVE_UNALIGNED_SSE2 0DH, 0
D10E: MOVE_UNALIGNED_SSE2 0EH, 0
D10F: MOVE_UNALIGNED_SSE2 0FH, 0
; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
; These are pointed to by the jump table AlignmentDispatchSSSE3 below
align 16
E104: MOVE_UNALIGNED_SSSE3 4
E108: MOVE_UNALIGNED_SSSE3 8
E10C: MOVE_UNALIGNED_SSSE3 0CH
E101: MOVE_UNALIGNED_SSSE3 1
E102: MOVE_UNALIGNED_SSSE3 2
E103: MOVE_UNALIGNED_SSSE3 3
E105: MOVE_UNALIGNED_SSSE3 5
E106: MOVE_UNALIGNED_SSSE3 6
E107: MOVE_UNALIGNED_SSSE3 7
E109: MOVE_UNALIGNED_SSSE3 9
times 1 nop
E10A: MOVE_UNALIGNED_SSSE3 0AH
E10B: MOVE_UNALIGNED_SSSE3 0BH
E10D: MOVE_UNALIGNED_SSSE3 0DH
E10E: MOVE_UNALIGNED_SSSE3 0EH
E10F: MOVE_UNALIGNED_SSSE3 0FH
; Codes for non-temporal move. Aligned case first
align 16
F100: ; Non-temporal move, src and dest have same alignment.
; Loop. rcx has negative index from the end, counting up to zero
movaps xmm0, [rsi+rcx] ; Read
movaps xmm1, [rsi+rcx+10H]
movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
movntps [rdi+rcx+10H], xmm1
add rcx, 20H
jnz F100 ; Loop through negative rcx up to zero
; Move the remaining edx bytes (0 - 31):
add rsi, rdx
add rdi, rdx
neg rdx
jz C500 ; Skip if no more data
; Check if we can move one more 16-bytes block
cmp edx, -10H
jg C200
; move 16 bytes, aligned
movaps xmm0, [rsi+rdx]
movntps [rdi+rdx], xmm0
add rdx, 10H
sfence
; move the remaining 0 - 15 bytes
jmp C200
; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
; the alignment u.
; These are pointed to by the jump table AlignmentDispatchNT below
;align 16
F104: MOVE_UNALIGNED_SSE2_4 1
F108: MOVE_UNALIGNED_SSE2_8 1
F10C: MOVE_UNALIGNED_SSE2_12 1
F101: MOVE_UNALIGNED_SSE2 1, 1
F102: MOVE_UNALIGNED_SSE2 2, 1
F103: MOVE_UNALIGNED_SSE2 3, 1
F105: MOVE_UNALIGNED_SSE2 5, 1
F106: MOVE_UNALIGNED_SSE2 6, 1
F107: MOVE_UNALIGNED_SSE2 7, 1
F109: MOVE_UNALIGNED_SSE2 9, 1
F10A: MOVE_UNALIGNED_SSE2 0AH, 1
F10B: MOVE_UNALIGNED_SSE2 0BH, 1
F10D: MOVE_UNALIGNED_SSE2 0DH, 1
F10E: MOVE_UNALIGNED_SSE2 0EH, 1
F10F: MOVE_UNALIGNED_SSE2 0FH, 1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; CPU dispatcher
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
; This part is executed only once
push rbx
push rcx
push rdx
push rsi
push rdi
push r8
; set CacheBypassLimit to half the size of the largest level cache
call GetMemcpyCacheLimit@
mov eax, 1
cpuid ; Get feature flags
lea rbx, [memcpySSE2@]
bt ecx, 9 ; Test bit for SupplSSE3
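; (CPUID leaf 1: ECX bit 9 = Supplemental SSE3)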
jnc Q100
lea rbx, [memcpySSSE3@]
call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
test eax, eax
jz Q100
lea rbx, [memcpyU@]
call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
test eax, eax
jz Q100
lea rbx, [memcpyU256@]
call InstructionSet
cmp eax, 15
jb Q100
lea rbx, [memcpyAVX512F@]
cmp eax, 16
jb Q100
lea rbx, [memcpyAVX512BW@]
Q100: ; Insert appropriate pointer
mov [memcpyDispatch], rbx
mov rax, rbx
pop r8
pop rdi
pop rsi
pop rdx
pop rcx
pop rbx
; Jump according to the replaced function pointer
jmp rax
; extern "C" size_t GetMemcpyCacheLimit();
GetMemcpyCacheLimit:
GetMemcpyCacheLimit@: ; local limit
mov rax, [CacheBypassLimit]
test rax, rax
jnz U200
; Get half the size of the largest level cache
%ifdef WINDOWS
xor ecx, ecx ; 0 means largest level cache
%else
xor edi, edi ; 0 means largest level cache
%endif
call DataCacheSize ; get cache size
shr rax, 1 ; half the size
jnz U100
mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
U100: mov [CacheBypassLimit], rax
U200: ret
; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
SetMemcpyCacheLimit1:
%ifdef WINDOWS
mov rax, rcx
%else
mov rax, rdi
%endif
test rax, rax
jnz U400
; zero, means default
mov [CacheBypassLimit], rax
call GetMemcpyCacheLimit@
U400: mov [CacheBypassLimit], rax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; getDispatch, for testing only
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
getDispatch:
mov rax,[memcpyDispatch]
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; data section. jump tables, dispatch function pointer, cache size
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Data segment must be included in function namespace
SECTION .data
align 16
; Jump tables for alignments 0 - 15:
; The SSE2 and SSSE3 versions of memcpy index their own tables,
; AlignmentDispatchSSE2 and AlignmentDispatchSSSE3 respectively;
; the CPU dispatcher selects which version (and hence which table) is used.
; Code pointer for each alignment for SSE2 instruction set
AlignmentDispatchSSE2:
DQ C100, D101, D102, D103, D104, D105, D106, D107
DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
; Code pointer for each alignment for Suppl-SSE3 instruction set
AlignmentDispatchSSSE3:
DQ C100, E101, E102, E103, E104, E105, E106, E107
DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
; Code pointer for each alignment for non-temporal store
AlignmentDispatchNT:
DQ F100, F101, F102, F103, F104, F105, F106, F107
DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
; Pointer to appropriate version.
; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
; change this to the appropriate version of memcpy, so that
; memcpyCPUDispatch is only executed once:
memcpyDispatch DQ memcpyCPUDispatch
; Bypass cache by using non-temporal moves if count > CacheBypassLimit
; The optimal value of CacheBypassLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache:
CacheBypassLimit: DQ 0
@etale-cohomology

How do I compile this with GCC on Linux?
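
A sketch of one way to build it (untested here, just following the file's own header comments): this is NASM source, so GCC does not compile it directly; assemble it with nasm -f elf64 and let GCC link the resulting object. The extern symbols at the top (InstructionSet, UnalignedIsFaster, Store256BitIsFaster, DataCacheSize) are defined in the other asmlib modules named in the header comments, so assemble those as well or link against the prebuilt asmlib library from Agner Fog's site. A minimal hypothetical caller, using the prototype from the header comment:

    /* main.c -- hypothetical test driver for A_memcpy
     *
     * Build sketch (assumes the other asmlib objects are available):
     *   nasm -f elf64 memcpy64.asm -o memcpy64.o
     *   gcc -o demo main.c memcpy64.o <other asmlib objects or library>
     */
    #include <stdio.h>
    #include <stddef.h>

    /* Prototype taken from the header comment of memcpy64.asm */
    void *A_memcpy(void *dest, const void *src, size_t count);

    int main(void)
    {
        char src[32] = "copied with A_memcpy";
        char dst[32];

        A_memcpy(dst, src, sizeof src);   /* first call runs the CPU dispatcher */
        puts(dst);
        return 0;
    }

The first call goes through memcpyCPUDispatch, which stores the chosen routine in memcpyDispatch, so later calls jump straight to the selected version.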
