Agner Fog's 64-bit memcpy (memcpy64.asm, from asmlib).
Saved from GitHub gist thoughtpolice/26ea25f69715ffde96efa6364c19cf18 (created January 28, 2017 04:52).
;************************* memcpy64.asm ************************************
; Author: Agner Fog
; Date created: 2008-07-19
; Last modified: 2016-11-12
;
; Description:
; Faster version of the standard memcpy function:
; void * A_memcpy(void *dest, const void *src, size_t count);
; Copies 'count' bytes from 'src' to 'dest'
;
; Overriding standard function memcpy:
; The alias ?OVR_memcpy is changed to _memcpy in the object file if
; it is desired to override the standard library function memcpy.
;
; The function uses non-temporal writes to bypass the cache when the size is
; bigger than half the size of the largest-level cache. This limit can be
; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit.
; C++ prototypes:
; extern "C" size_t GetMemcpyCacheLimit();  // in memcpy64.asm
; extern "C" void SetMemcpyCacheLimit();    // in memmove64.asm
; extern "C" void SetMemcpyCacheLimit1();   // used internally
;
; Position-independent code is generated if POSITIONINDEPENDENT is defined.
;
; CPU dispatching included for the following instruction sets:
; SSE2, Suppl-SSE3, AVX, AVX512F, AVX512BW.
;
; Copyright (c) 2008-2016 GNU General Public License www.gnu.org/licenses
;******************************************************************************
default rel                     ; RIP-relative addressing by default (PIC-friendly)

; Exported entry points
global A_memcpy                 ; Dispatching function A_memcpy
global ?OVR_memcpy              ; ?OVR removed if standard function memcpy overridden
global memcpySSE2               ; Version for processors with only SSE2
global memcpySSSE3              ; Version for processors with SSSE3
global memcpyU                  ; Version for processors with fast unaligned read
global memcpyU256               ; Version for processors with fast 256-bit read/write
global memcpyAVX512F            ; Version for processors with fast 512-bit read/write
global memcpyAVX512BW           ; Version for processors with fast 512-bit read/write
global GetMemcpyCacheLimit      ; Get the size limit for bypassing cache when copying with memcpy and memmove
global SetMemcpyCacheLimit1     ; Set the size limit for bypassing cache when copying with memcpy
global getDispatch

; Imported from instrset64.asm
extern InstructionSet           ; Instruction set for CPU dispatcher
; Imported from unalignedisfaster64.asm:
extern UnalignedIsFaster        ; Tells if unaligned read is faster than PALIGNR
extern Store256BitIsFaster      ; Tells if a 256 bit store is faster than two 128 bit stores
; Imported from cachesize32.asm:
extern DataCacheSize            ; Gets size of data cache
; Define prolog for this function.
; Normalizes both supported ABIs to one internal register layout:
;   rdi = dest, rsi = src, rcx = count, r9 = dest (kept for the return value).
; Windows x64 passes args in rcx/rdx/r8 and treats rsi/rdi as callee-saved,
; so they are pushed here (EPILOGM pops them).
; System V already has rdi = dest, rsi = src; only rcx and r9 need setting.
%MACRO PROLOGM 0
%IFDEF WINDOWS
        push    rsi                     ; rsi/rdi are callee-saved on Windows
        push    rdi
        mov     rdi, rcx                ; dest
        mov     r9, rcx                 ; dest, saved for return value
        mov     rsi, rdx                ; src
        mov     rcx, r8                 ; count
%ELSE   ; Unix (System V AMD64)
        mov     rcx, rdx                ; count
        mov     r9, rdi                 ; dest, saved for return value
%ENDIF
%ENDM
; Define return from this function.
; Restores the Windows callee-saved registers pushed by PROLOGM and
; returns dest (held in r9 throughout) in rax, per the memcpy contract.
%MACRO EPILOGM 0
%IFDEF WINDOWS
        pop     rdi
        pop     rsi
%ENDIF
        mov     rax, r9                 ; Return value = dest
        ret
%ENDM
SECTION .text  align=16

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Common entry for dispatch
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
; Function entry:
; Tail-jumps through the memcpyDispatch pointer (set up elsewhere in this
; file by the CPU dispatcher) to the best version for this processor.
A_memcpy:
?OVR_memcpy:
        jmp     qword [memcpyDispatch]  ; Go to appropriate version, depending on instruction set
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX512BW Version for processors with fast unaligned read and fast 512 bits write
; Requires AVX512BW, BMI2
;
; Strategy: count <= 40H uses one masked 64-byte move; 40H < count <= 80H uses
; two partially overlapping 64-byte moves; larger counts run an aligned main
; loop with the possibly-unaligned first/last blocks stored afterwards
; (overlapping stores are safe because the whole source was read first).
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; memcpyAVX512BW:
align 8
; Version for size <= 40H. Requires AVX512BW and BMI2
L000:   mov     rax, -1
        bzhi    rax, rax, rcx           ; set mask k1 to move rcx bytes, at most 40H
        kmovq   k1, rax
        vmovdqu8 zmm16{k1}{z}, [rsi]    ; masked load: only rcx bytes are read
        vmovdqu8 [rdi]{k1}, zmm16       ; masked store: only rcx bytes are written
        ; vzeroupper not needed because only zmm16+ is used (no ymm0-15 state)
        EPILOGM
align 8
; Version for size = 40H - 80H
L010:   ; make two partially overlapping blocks
        vmovdqu64 zmm16, [rsi]
        vmovdqu64 zmm17, [rsi+rcx-40H]
        vmovdqu64 [rdi], zmm16
        vmovdqu64 [rdi+rcx-40H], zmm17
        ; vzeroupper not needed because only zmm16-17 are used
        EPILOGM
; Function entry
; rdi = dest
; rsi = src
; rcx = count
; r9  = dest
align 16
%IFDEF WINDOWS
        times 5 nop                     ; filler to align L200
%ELSE   ; Unix
        times 13 nop                    ; filler to align L200
%ENDIF
memcpyAVX512BW:                         ; global label
memcpyAVX512BW@:                        ; local label
        PROLOGM
        cmp     rcx, 040H
        jbe     L000                    ; small: single masked block
        cmp     rcx, 080H
        jbe     L010                    ; medium: two overlapping blocks
L100:   ; count > 80H ; Entry from memcpyAVX512F
        vmovdqu64 zmm17, [rsi]          ; save first possibly unaligned block until after main loop
        vmovdqu64 zmm18, [rsi+rcx-40H]  ; save last possibly unaligned block until after main loop
        add     rdi, rcx                ; end of destination
        and     rdi, -40H               ; round down to align by 40H
        mov     rdx, rdi
        sub     rdx, r9                 ; rdx = distance from dest to aligned end
        add     rsi, rdx                ; end of main blocks of source
        and     rdx, -40H               ; size of aligned blocks to copy
        ; Check if count very big
        cmp     rdx, [CacheBypassLimit]
        ja      L500                    ; Use non-temporal store if count > CacheBypassLimit
        neg     rdx                     ; negative index from end of aligned blocks
L200:   ; main loop. Move 40H bytes at a time
        vmovdqu64 zmm16, [rsi+rdx]
        vmovdqa64 [rdi+rdx], zmm16      ; destination is 40H-aligned here
        add     rdx, 40H
        jnz     L200
L210:   ; insert remaining bytes at beginning and end, possibly overlapping main blocks
        vmovdqu64 [r9], zmm17
        vmovdqu64 [r9+rcx-40H], zmm18
        ; vzeroupper not needed because only zmm16-18 are used
        EPILOGM
align 16
L500:   ; Move 40H bytes at a time, non-temporal
        neg     rdx                     ; negative index from end of aligned blocks
L510:   vmovdqu64 zmm16, [rsi+rdx]
        vmovntdq [rdi+rdx], zmm16       ; bypass cache for very large copies
        add     rdx, 40H
        jnz     L510
        sfence                          ; order non-temporal stores before return
        jmp     L210
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX512F Version for processors with fast unaligned read and fast 64 bytes write
;
; Shares all large-copy code with memcpyAVX512BW (L010/L100); only counts
; below 40H differ, because the masked vmovdqu8 path needs AVX512BW, so
; small copies fall back to the generic small-copy code at A1000.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function entry
align 16
memcpyAVX512F:                          ; global label
memcpyAVX512F@:                         ; local label
        PROLOGM
        ; rdi = dest
        ; rsi = src
        ; rcx = count
        cmp     rcx, 080H
        ja      L100                    ; count > 80H: shared AVX512 main loop
        cmp     rcx, 040H
        jae     L010                    ; 40H-80H: two overlapping 64-byte blocks
        ; count < 40H
        jmp     A1000                   ; small copy shared with SSE versions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; AVX Version for processors with fast unaligned read and fast 256 bits write
;
; Strategy: move a 1-31 byte head so dest becomes 32-aligned, run a 32-byte
; ymm main loop (temporal, non-temporal, or backwards depending on size and
; a false-memory-dependence check), then move the 0-31 byte tail.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memcpyU256:                             ; global label
memcpyU256@:                            ; local label
        PROLOGM
        cmp     rcx, 40H
        jb      A1000                   ; Use simpler code if count < 64
        ; count >= 64
        ; Calculate size of first block up to first regular boundary of dest
        mov     edx, edi
        neg     edx
        and     edx, 1FH
        jz      B3100                   ; Skip if dest aligned by 32
        ; edx = size of first partial block, 1 - 31 bytes
        ; Move 1, 2, 4, 8, 16 bytes as needed (bits of dl) to align dest
        test    dl, 3
        jz      B3030
        test    dl, 1
        jz      B3020
        ; move 1 byte
        movzx   eax, byte [rsi]
        mov     [rdi], al
        inc     rsi
        inc     rdi
B3020:  test    dl, 2
        jz      B3030
        ; move 2 bytes
        movzx   eax, word [rsi]
        mov     [rdi], ax
        add     rsi, 2
        add     rdi, 2
B3030:  test    dl, 4
        jz      B3040
        ; move 4 bytes
        mov     eax, [rsi]
        mov     [rdi], eax
        add     rsi, 4
        add     rdi, 4
B3040:  test    dl, 8
        jz      B3050
        ; move 8 bytes
        mov     rax, [rsi]
        mov     [rdi], rax
        add     rsi, 8
        add     rdi, 8
B3050:  test    dl, 16
        jz      B3060
        ; move 16 bytes
        movups  xmm0, [rsi]
        movaps  [rdi], xmm0             ; dest is 16-aligned at this point
        add     rsi, 16
        add     rdi, 16
B3060:  sub     rcx, rdx                ; head bytes are done; adjust count
B3100:  ; Now dest is aligned by 32. Any partial block has been moved
        ; Set up for loop moving 32 bytes per iteration:
        mov     rdx, rcx                ; Save count
        and     rcx, -20H               ; Round down to nearest multiple of 32
        add     rsi, rcx                ; Point to the end
        add     rdi, rcx                ; Point to the end
        sub     rdx, rcx                ; Remaining data after loop (0-31)
        ; Check if count very big
        cmp     rcx, [CacheBypassLimit]
        ja      I3100                   ; Use non-temporal store if count > CacheBypassLimit
        neg     rcx                     ; Negative index from the end
H3100:  ; copy -rcx bytes in blocks of 32 bytes.
        ; Check for false memory dependence: The CPU may falsely assume
        ; a partial overlap between the written destination and the following
        ; read source if source is unaligned and
        ; (src-dest) modulo 4096 is close to 4096
        test    sil, 1FH
        jz      H3110                   ; aligned
        mov     eax, esi
        sub     eax, edi
        and     eax, 0FFFH              ; modulo 4096
        cmp     eax, 1000H - 200H
        ja      J3100                   ; near-aliasing: try copying backwards
align 16
H3110:  ; main copy loop, 32 bytes at a time
        ; rcx has negative index from the end, counting up to zero
        vmovups ymm0, [rsi+rcx]
        vmovaps [rdi+rcx], ymm0         ; dest is 32-aligned
        add     rcx, 20H
        jnz     H3110
        vzeroupper                      ; end of AVX mode before legacy SSE tail
H3120:  ; Move the remaining edx bytes (0 - 31):
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        jz      H3500                   ; Skip if no more data
        ; move 16-8-4-2-1 bytes, aligned
        cmp     edx, -10H
        jg      H3200
        ; move 16 bytes
        movups  xmm0, [rsi+rdx]
        movaps  [rdi+rdx], xmm0
        add     rdx, 10H
H3200:  cmp     edx, -8
        jg      H3210
        ; move 8 bytes
        movq    xmm0, qword [rsi+rdx]
        movq    qword [rdi+rdx], xmm0
        add     rdx, 8
        jz      H3500                   ; Early skip if count divisible by 8
                                        ; (was "jz H500": same effect — both labels are bare EPILOGM —
                                        ; but jump kept inside this function for clarity)
H3210:  cmp     edx, -4
        jg      H3220
        ; move 4 bytes
        mov     eax, [rsi+rdx]
        mov     [rdi+rdx], eax
        add     rdx, 4
H3220:  cmp     edx, -2
        jg      H3230
        ; move 2 bytes
        movzx   eax, word [rsi+rdx]
        mov     [rdi+rdx], ax
        add     rdx, 2
H3230:  cmp     edx, -1
        jg      H3500
        ; move 1 byte
        movzx   eax, byte [rsi+rdx]
        mov     [rdi+rdx], al
H3500:  ; finished
        EPILOGM
I3100:  ; non-temporal move, 32 bytes at a time
        neg     rcx                     ; Negative index from the end
align 16
I3110:  ; main copy loop, 32 bytes at a time
        ; rcx has negative index from the end, counting up to zero
        vmovups ymm0, [rsi+rcx]
        vmovntps [rdi+rcx], ymm0        ; bypass cache for very large copies
        add     rcx, 20H
        jnz     I3110
        sfence                          ; order non-temporal stores
        vzeroupper                      ; end of AVX mode
        jmp     H3120                   ; Move the remaining edx bytes (0 - 31)
align 16
J3100:  ; There is a false memory dependence.
        ; check if src and dest overlap, if not then it is safe
        ; to copy backwards to avoid false memory dependence
%if 1
        ; Use this version if you want consistent behavior in the case
        ; where dest > src and overlap. However, this case is undefined
        ; anyway because part of src is overwritten before copying
        push    rdx
        mov     rax, rsi
        sub     rax, rdi
        cqo                             ; rdx = sign of (src-dest)
        xor     rax, rdx
        sub     rax, rdx                ; abs(src-dest)
        neg     rcx                     ; size
        pop     rdx                     ; restore rdx
        cmp     rax, rcx
        jnb     J3110                   ; no overlap: safe to copy backwards
        neg     rcx                     ; restore rcx
        jmp     H3110                   ; overlap between src and dest. Can't copy backwards
%else
        ; save time by not checking the case that is undefined anyway
        mov     rax, rsi
        sub     rax, rdi
        neg     rcx                     ; size
        cmp     rax, rcx
        jnb     J3110                   ; OK to copy backwards
        ; must copy forwards
        neg     rcx                     ; restore rcx
        jmp     H3110                   ; copy forwards
%endif
J3110:  ; copy backwards, rcx = size. rsi, rdi = end of src, dest
        push    rsi
        push    rdi
        sub     rsi, rcx                ; rewind to start of main blocks
        sub     rdi, rcx
J3120:  ; loop backwards, 32 bytes at a time
        vmovups ymm0, [rsi+rcx-20H]
        vmovaps [rdi+rcx-20H], ymm0
        sub     rcx, 20H
        jnz     J3120
        vzeroupper
        pop     rdi                     ; restore end pointers for the tail code
        pop     rsi
        jmp     H3120
align 16
; count < 64. Move 32-16-8-4-2-1 bytes according to the bits of count.
; Shared small-copy tail for multiple CPU versions (SSSE3 and above).
; On entry: rdi = dest, rsi = src, rcx = count, r9 = dest.
A1000:  add     rsi, rcx                ; end of src
        add     rdi, rcx                ; end of dest
        neg     rcx                     ; negative index from the end
        cmp     ecx, -20H
        jg      A1100
        ; move 32 bytes
        ; movdqu is faster than 64-bit moves on processors with SSSE3
        movups  xmm0, [rsi+rcx]
        movups  xmm1, [rsi+rcx+10H]
        movups  [rdi+rcx], xmm0
        movups  [rdi+rcx+10H], xmm1
        add     rcx, 20H
A1100:  cmp     ecx, -10H
        jg      A1200
        ; move 16 bytes
        movups  xmm0, [rsi+rcx]
        movups  [rdi+rcx], xmm0
        add     rcx, 10H
A1200:  cmp     ecx, -8
        jg      A1300
        ; move 8 bytes
        mov     rax, qword [rsi+rcx]
        mov     qword [rdi+rcx], rax
        add     rcx, 8
A1300:  cmp     ecx, -4
        jg      A1400
        ; move 4 bytes
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        add     rcx, 4
        jz      A1900                   ; early out if count divisible by 4
A1400:  cmp     ecx, -2
        jg      A1500
        ; move 2 bytes
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
        add     rcx, 2
A1500:  cmp     ecx, -1
        jg      A1900
        ; move 1 byte
        movzx   eax, byte [rsi+rcx]
        mov     [rdi+rcx], al
A1900:  ; finished
        EPILOGM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with fast unaligned read and fast 16 bytes write
;
; Same structure as memcpyU256 but with 16-byte alignment and xmm pairs:
; align dest by 16, run a 32-byte main loop (temporal, non-temporal, or
; backwards on a false-dependence hazard), then move the 0-31 byte tail.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memcpyU:                                ; global label
memcpyU@:                               ; local label
        PROLOGM
        cmp     rcx, 40H
        jb      A1000                   ; Use simpler code if count < 64
        ; count >= 64
        ; Calculate size of first block up to first regular boundary of dest
        mov     edx, edi
        neg     edx
        and     edx, 0FH
        jz      B2100                   ; Skip if dest aligned by 16
        ; edx = size of first partial block, 1 - 15 bytes
        ; Move 1, 2, 4, 8 bytes as needed (bits of dl) to align dest
        test    dl, 3
        jz      B2030
        test    dl, 1
        jz      B2020
        ; move 1 byte
        movzx   eax, byte [rsi]
        mov     [rdi], al
        inc     rsi
        inc     rdi
B2020:  test    dl, 2
        jz      B2030
        ; move 2 bytes
        movzx   eax, word [rsi]
        mov     [rdi], ax
        add     rsi, 2
        add     rdi, 2
B2030:  test    dl, 4
        jz      B2040
        ; move 4 bytes
        mov     eax, [rsi]
        mov     [rdi], eax
        add     rsi, 4
        add     rdi, 4
B2040:  test    dl, 8
        jz      B2050
        ; move 8 bytes
        mov     rax, [rsi]
        mov     [rdi], rax
        add     rsi, 8
        add     rdi, 8
B2050:  sub     rcx, rdx                ; head bytes are done; adjust count
B2100:  ; Now dest is aligned by 16. Any partial block has been moved
        ; Set up for loop moving 32 bytes per iteration:
        mov     rdx, rcx                ; Save count
        and     rcx, -20H               ; Round down to nearest multiple of 32
        add     rsi, rcx                ; Point to the end
        add     rdi, rcx                ; Point to the end
        sub     rdx, rcx                ; Remaining data after loop (0-31)
        ; Check if count very big
        cmp     rcx, [CacheBypassLimit]
        ja      I100                    ; Use non-temporal store if count > CacheBypassLimit
        neg     rcx                     ; Negative index from the end
H100:   ; copy -rcx bytes in blocks of 32 bytes.
        ; Check for false memory dependence: The CPU may falsely assume
        ; a partial overlap between the written destination and the following
        ; read source if source is unaligned and
        ; (src-dest) modulo 4096 is close to 4096
        test    sil, 0FH
        jz      H110                    ; aligned
        mov     eax, esi
        sub     eax, edi
        and     eax, 0FFFH              ; modulo 4096
        cmp     eax, 1000H - 200H
        ja      J100                    ; near-aliasing: try copying backwards
H110:   ; main copy loop, 32 bytes at a time
        ; rcx has negative index from the end, counting up to zero
        movups  xmm0, [rsi+rcx]
        movups  xmm1, [rsi+rcx+10H]
        movaps  [rdi+rcx], xmm0         ; dest is 16-aligned
        movaps  [rdi+rcx+10H], xmm1
        add     rcx, 20H
        jnz     H110
H120:   ; Move the remaining edx bytes (0 - 31):
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        jz      H500                    ; Skip if no more data
        ; move 16-8-4-2-1 bytes, aligned
        cmp     edx, -10H
        jg      H200
        ; move 16 bytes
        movups  xmm0, [rsi+rdx]
        movaps  [rdi+rdx], xmm0
        add     rdx, 10H
H200:   cmp     edx, -8
        jg      H210
        ; move 8 bytes
        movq    xmm0, qword [rsi+rdx]
        movq    qword [rdi+rdx], xmm0
        add     rdx, 8
        jz      H500                    ; Early skip if count divisible by 8
H210:   cmp     edx, -4
        jg      H220
        ; move 4 bytes
        mov     eax, [rsi+rdx]
        mov     [rdi+rdx], eax
        add     rdx, 4
H220:   cmp     edx, -2
        jg      H230
        ; move 2 bytes
        movzx   eax, word [rsi+rdx]
        mov     [rdi+rdx], ax
        add     rdx, 2
H230:   cmp     edx, -1
        jg      H500
        ; move 1 byte
        movzx   eax, byte [rsi+rdx]
        mov     [rdi+rdx], al
H500:   ; finished
        EPILOGM
I100:   ; non-temporal move, 32 bytes at a time
        neg     rcx                     ; Negative index from the end
align 16
I110:   ; main copy loop, 32 bytes at a time
        ; rcx has negative index from the end, counting up to zero
        movups  xmm0, [rsi+rcx]
        movups  xmm1, [rsi+rcx+10H]
        movntps [rdi+rcx], xmm0         ; bypass cache for very large copies
        movntps [rdi+rcx+10H], xmm1
        add     rcx, 20H
        jnz     I110
        sfence                          ; order non-temporal stores
        jmp     H120                    ; Move the remaining edx bytes (0 - 31)
align 16
J100:   ; There is a false memory dependence.
        ; check if src and dest overlap, if not then it is safe
        ; to copy backwards to avoid false memory dependence
%if 1
        ; Use this version if you want consistent behavior in the case
        ; where dest > src and overlap. However, this case is undefined
        ; anyway because part of src is overwritten before copying
        push    rdx
        mov     rax, rsi
        sub     rax, rdi
        cqo                             ; rdx = sign of (src-dest)
        xor     rax, rdx
        sub     rax, rdx                ; abs(src-dest)
        neg     rcx                     ; size
        pop     rdx                     ; restore rdx
        cmp     rax, rcx
        jnb     J110                    ; no overlap: safe to copy backwards
        neg     rcx                     ; restore rcx
        jmp     H110                    ; overlap between src and dest. Can't copy backwards
%else
        ; save time by not checking the case that is undefined anyway
        mov     rax, rsi
        sub     rax, rdi
        neg     rcx                     ; size
        cmp     rax, rcx
        jnb     J110                    ; OK to copy backwards
        ; must copy forwards
        neg     rcx                     ; restore rcx
        jmp     H110                    ; copy forwards
%endif
J110:   ; copy backwards, rcx = size. rsi, rdi = end of src, dest
        push    rsi
        push    rdi
        sub     rsi, rcx                ; rewind to start of main blocks
        sub     rdi, rcx
J120:   ; loop backwards, 32 bytes at a time
        movups  xmm1, [rsi+rcx-20H]
        movups  xmm0, [rsi+rcx-10H]
        movaps  [rdi+rcx-20H], xmm1
        movaps  [rdi+rcx-10H], xmm0
        sub     rcx, 20H
        jnz     J120
        pop     rdi                     ; restore end pointers for the tail code
        pop     rsi
        jmp     H120
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSSE3. Aligned read + shift + aligned write
;
; After aligning dest by 16, the remaining src misalignment (0-15) selects,
; via the AlignmentDispatch tables, one of 16 code paths that combine
; aligned reads with PALIGNR/shift into aligned writes.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
memcpySSSE3:                            ; global label
memcpySSSE3@:                           ; local label
        PROLOGM
        cmp     rcx, 40H
        jb      A1000                   ; Use simpler code if count < 64
        ; count >= 64
        ; Calculate size of first block up to first regular boundary of dest
        mov     edx, edi
        neg     edx
        and     edx, 0FH
        jz      B1200                   ; Skip if dest aligned by 16
        ; edx = size of first partial block, 1 - 15 bytes
        ; Move 1, 2, 4, 8 bytes as needed (bits of dl) to align dest
        test    dl, 3
        jz      B1030
        test    dl, 1
        jz      B1020
        ; move 1 byte
        movzx   eax, byte [rsi]
        mov     [rdi], al
        inc     rsi
        inc     rdi
B1020:  test    dl, 2
        jz      B1030
        ; move 2 bytes
        movzx   eax, word [rsi]
        mov     [rdi], ax
        add     rsi, 2
        add     rdi, 2
B1030:  test    dl, 4
        jz      B1040
        ; move 4 bytes
        mov     eax, [rsi]
        mov     [rdi], eax
        add     rsi, 4
        add     rdi, 4
B1040:  test    dl, 8
        jz      B1050
        ; move 8 bytes
        mov     rax, [rsi]
        mov     [rdi], rax
        add     rsi, 8
        add     rdi, 8
B1050:  sub     rcx, rdx                ; head bytes are done; adjust count
B1200:  ; Now dest is aligned by 16. Any partial block has been moved
        ; Find alignment of src modulo 16 at this point:
        mov     eax, esi
        and     eax, 0FH
        ; Set up for loop moving 32 bytes per iteration:
        mov     edx, ecx                ; Save count (lower 32 bits)
        and     rcx, -20H               ; Round down count to nearest multiple of 32
        add     rsi, rcx                ; Point to the end
        add     rdi, rcx                ; Point to the end
        sub     edx, ecx                ; Remaining data after loop (0-31)
        sub     rsi, rax                ; Nearest preceding aligned block of src
        ; Check if count very big
        cmp     rcx, [CacheBypassLimit]
        ja      B1400                   ; Use non-temporal store if count > CacheBypassLimit
        neg     rcx                     ; Negative index from the end
        ; Dispatch to different codes depending on src alignment
        lea     r8, [AlignmentDispatchSSSE3]
        jmp     near [r8+rax*8]
B1400:  neg     rcx
        ; Dispatch to different codes depending on src alignment
        lea     r8, [AlignmentDispatchNT]
        jmp     near [r8+rax*8]
align 16
C100:   ; Code for aligned src. SSE2 and SSSE3 versions
        ; The nice case, src and dest have same alignment.
        ; Loop. rcx has negative index from the end, counting up to zero
        movaps  xmm0, [rsi+rcx]
        movaps  xmm1, [rsi+rcx+10H]
        movaps  [rdi+rcx], xmm0
        movaps  [rdi+rcx+10H], xmm1
        add     rcx, 20H
        jnz     C100
        ; Move the remaining edx bytes (0 - 31):
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        jz      C500                    ; Skip if no more data
        ; move 16-8-4-2-1 bytes, aligned
        cmp     edx, -10H
        jg      C200
        ; move 16 bytes
        movaps  xmm0, [rsi+rdx]
        movaps  [rdi+rdx], xmm0
        add     rdx, 10H
C200:   cmp     edx, -8
        jg      C210
        ; move 8 bytes
        mov     rax, [rsi+rdx]
        mov     [rdi+rdx], rax
        add     rdx, 8
        jz      C500                    ; Early skip if count divisible by 8
C210:   cmp     edx, -4
        jg      C220
        ; move 4 bytes
        mov     eax, [rsi+rdx]
        mov     [rdi+rdx], eax
        add     rdx, 4
C220:   cmp     edx, -2
        jg      C230
        ; move 2 bytes
        movzx   eax, word [rsi+rdx]
        mov     [rdi+rdx], ax
        add     rdx, 2
C230:   cmp     edx, -1
        jg      C500
        ; move 1 byte
        movzx   eax, byte [rsi+rdx]
        mov     [rdi+rdx], al
C500:   ; finished
        EPILOGM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Version for processors with SSE2. Aligned read + shift + aligned write
;
; Small counts are moved with general-purpose registers; larger counts
; align dest by 16 and dispatch on src alignment to the PSRLDQ/PSLLDQ
; combining loops (AlignmentDispatchSSE2 / AlignmentDispatchNT).
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
memcpySSE2:                             ; global label
memcpySSE2@:                            ; local label
        PROLOGM
        cmp     rcx, 40H
        jae     B0100                   ; Use more complex code if count >= 64
        ; count < 64. Move 32-16-8-4-2-1 bytes
        add     rsi, rcx                ; end of src
        add     rdi, rcx                ; end of dest
        neg     rcx                     ; negative index from the end
        cmp     ecx, -20H
        jg      A100
        ; move 32 bytes
        ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
        ; movdqu is fast on Nehalem and later
        mov     rax, [rsi+rcx]
        mov     rdx, [rsi+rcx+8]
        mov     [rdi+rcx], rax
        mov     [rdi+rcx+8], rdx
        mov     rax, qword [rsi+rcx+10H]
        mov     rdx, qword [rsi+rcx+18H]
        mov     qword [rdi+rcx+10H], rax
        mov     qword [rdi+rcx+18H], rdx
        add     rcx, 20H
A100:   cmp     ecx, -10H
        jg      A200
        ; move 16 bytes
        mov     rax, [rsi+rcx]
        mov     rdx, [rsi+rcx+8]
        mov     [rdi+rcx], rax
        mov     [rdi+rcx+8], rdx
        add     rcx, 10H
A200:   cmp     ecx, -8
        jg      A300
        ; move 8 bytes
        mov     rax, qword [rsi+rcx]
        mov     qword [rdi+rcx], rax
        add     rcx, 8
A300:   cmp     ecx, -4
        jg      A400
        ; move 4 bytes
        mov     eax, [rsi+rcx]
        mov     [rdi+rcx], eax
        add     rcx, 4
        jz      A900                    ; early out if count divisible by 4
A400:   cmp     ecx, -2
        jg      A500
        ; move 2 bytes
        movzx   eax, word [rsi+rcx]
        mov     [rdi+rcx], ax
        add     rcx, 2
A500:   cmp     ecx, -1
        jg      A900
        ; move 1 byte
        movzx   eax, byte [rsi+rcx]
        mov     [rdi+rcx], al
A900:   ; finished
        EPILOGM
B0100:  ; count >= 64
        ; Calculate size of first block up to first regular boundary of dest
        mov     edx, edi
        neg     edx
        and     edx, 0FH
        jz      B0200                   ; Skip if dest aligned by 16
        ; edx = size of first partial block, 1 - 15 bytes
        ; Move 1, 2, 4, 8 bytes as needed (bits of dl) to align dest
        test    dl, 3
        jz      B0030
        test    dl, 1
        jz      B0020
        ; move 1 byte
        movzx   eax, byte [rsi]
        mov     [rdi], al
        inc     rsi
        inc     rdi
B0020:  test    dl, 2
        jz      B0030
        ; move 2 bytes
        movzx   eax, word [rsi]
        mov     [rdi], ax
        add     rsi, 2
        add     rdi, 2
B0030:  test    dl, 4
        jz      B0040
        ; move 4 bytes
        mov     eax, [rsi]
        mov     [rdi], eax
        add     rsi, 4
        add     rdi, 4
B0040:  test    dl, 8
        jz      B0050
        ; move 8 bytes
        mov     rax, [rsi]
        mov     [rdi], rax
        add     rsi, 8
        add     rdi, 8
B0050:  sub     rcx, rdx                ; head bytes are done; adjust count
B0200:  ; Now dest is aligned by 16. Any partial block has been moved
        ; This part will not always work if count < 64
        ; NOTE(review): dest is already 16-aligned here, so edx computes to 0
        ; and this section falls straight through to B300 — it appears
        ; redundant but is kept unchanged; confirm before removing.
        ; Calculate size of first block up to first regular boundary of dest
        mov     edx, edi
        neg     edx
        and     edx, 0FH
        jz      B300                    ; Skip if dest aligned by 16
        ; rdx = size of first partial block, 1 - 15 bytes
        add     rsi, rdx
        add     rdi, rdx
        sub     rcx, rdx
        neg     rdx
        cmp     edx, -8
        jg      B200
        ; move 8 bytes
        mov     rax, [rsi+rdx]
        mov     [rdi+rdx], rax
        add     rdx, 8
B200:   cmp     edx, -4
        jg      B210
        ; move 4 bytes
        mov     eax, [rsi+rdx]
        mov     [rdi+rdx], eax
        add     rdx, 4
        jz      B300                    ; early out if aligned by 4
B210:   cmp     edx, -2
        jg      B220
        ; move 2 bytes
        movzx   eax, word [rsi+rdx]
        mov     [rdi+rdx], ax
        add     rdx, 2
B220:   cmp     edx, -1
        jg      B300
        ; move 1 byte
        movzx   eax, byte [rsi+rdx]
        mov     [rdi+rdx], al
B300:   ; Now dest is aligned by 16. Any partial block has been moved
        ; Find alignment of src modulo 16 at this point:
        mov     eax, esi
        and     eax, 0FH
        ; Set up for loop moving 32 bytes per iteration:
        mov     edx, ecx                ; Save count (lower 32 bits)
        and     rcx, -20H               ; Round down count to nearest multiple of 32
        add     rsi, rcx                ; Point to the end
        add     rdi, rcx                ; Point to the end
        sub     edx, ecx                ; Remaining data after loop (0-31)
        sub     rsi, rax                ; Nearest preceding aligned block of src
        ; Check if count very big
        cmp     rcx, [CacheBypassLimit]
        ja      B400                    ; Use non-temporal store if count > CacheBypassLimit
        neg     rcx                     ; Negative index from the end
        ; Dispatch to different codes depending on src alignment
        lea     r8, [AlignmentDispatchSSE2]
        jmp     near [r8+rax*8]
B400:   neg     rcx
        ; Dispatch to different codes depending on src alignment
        lea     r8, [AlignmentDispatchNT]
        jmp     near [r8+rax*8]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Macros and alignment jump tables
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Macros for each src alignment, SSE2 instruction set:
; Make separate code for each alignment u because the shift instructions
; have the shift count as a constant:
%MACRO MOVE_UNALIGNED_SSE2 2    ; u, nt
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; %2 = 1 if non-temporal store desired
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = - (count rounded down to nearest divisible by 32)
; edx = remaining bytes to move after loop
        movdqa  xmm0, [rsi+rcx]         ; Read from nearest preceding 16B boundary
%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
        movdqa  xmm1, [rsi+rcx+10H]     ; Read next two blocks aligned
        movdqa  xmm2, [rsi+rcx+20H]
        movdqa  xmm3, xmm1              ; Copy because used twice
        psrldq  xmm0, %1                ; shift right
        pslldq  xmm1, 16-%1             ; shift left
        por     xmm0, xmm1              ; combine blocks
%IF %2 == 0
        movdqa  [rdi+rcx], xmm0         ; Save aligned
%ELSE
        movntdq [rdi+rcx], xmm0         ; non-temporal save
%ENDIF
        movdqa  xmm0, xmm2              ; Save for next iteration
        psrldq  xmm3, %1                ; shift right
        pslldq  xmm2, 16-%1             ; shift left
        por     xmm3, xmm2              ; combine blocks
%IF %2 == 0
        movdqa  [rdi+rcx+10H], xmm3     ; Save aligned
%ELSE
        movntdq [rdi+rcx+10H], xmm3     ; non-temporal save
%ENDIF
        add     rcx, 20H                ; Loop through negative values up to zero
        jnz     %%L1
        ; Set up for edx remaining bytes
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        cmp     edx, -10H
        jg      %%L2
        ; One more 16-bytes block to move
        movdqa  xmm1, [rsi+rdx+10H]
        psrldq  xmm0, %1                ; shift right
        pslldq  xmm1, 16-%1             ; shift left
        por     xmm0, xmm1              ; combine blocks
%IF %2 == 0
        movdqa  [rdi+rdx], xmm0         ; Save aligned
%ELSE
        movntdq [rdi+rdx], xmm0         ; non-temporal save
%ENDIF
        add     rdx, 10H
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %2 == 1
        sfence                          ; order non-temporal stores
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO
%MACRO MOVE_UNALIGNED_SSE2_4 1  ; nt
; Special case of MOVE_UNALIGNED_SSE2 for u = 4:
; MOVSS + SHUFPS rotate is cheaper than the shift/OR combination.
; %1 = 1 if non-temporal store desired
        movaps  xmm0, [rsi+rcx]         ; Read from nearest preceding 16B boundary
%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
        movaps  xmm1, [rsi+rcx+10H]     ; Read next two blocks aligned
        movss   xmm0, xmm1              ; Moves 4 bytes, leaves remaining bytes unchanged
        shufps  xmm0, xmm0, 00111001B   ; Rotate
%IF %1 == 0
        movaps  [rdi+rcx], xmm0         ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm0         ; Non-temporal save
%ENDIF
        movaps  xmm0, [rsi+rcx+20H]
        movss   xmm1, xmm0
        shufps  xmm1, xmm1, 00111001B   ; Rotate
%IF %1 == 0
        movaps  [rdi+rcx+10H], xmm1     ; Save aligned
%ELSE
        movntps [rdi+rcx+10H], xmm1     ; Non-temporal save
%ENDIF
        add     rcx, 20H                ; Loop through negative values up to zero
        jnz     %%L1
        ; Set up for edx remaining bytes
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        cmp     edx, -10H
        jg      %%L2
        ; One more 16-bytes block to move
        movaps  xmm1, [rsi+rdx+10H]     ; Read next block aligned
        movss   xmm0, xmm1
        shufps  xmm0, xmm0, 00111001B   ; Rotate
%IF %1 == 0
        movaps  [rdi+rdx], xmm0         ; Save aligned
%ELSE
        movntps [rdi+rdx], xmm0         ; Non-temporal save
%ENDIF
        add     rdx, 10H
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %1 == 1
        sfence                          ; order non-temporal stores
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO
%MACRO MOVE_UNALIGNED_SSE2_8 1  ; nt
; Special case of MOVE_UNALIGNED_SSE2 for u = 8:
; MOVSD + SHUFPS rotate is cheaper than the shift/OR combination.
; %1 = 1 if non-temporal store desired
        movaps  xmm0, [rsi+rcx]         ; Read from nearest preceding 16B boundary
%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
        movaps  xmm1, [rsi+rcx+10H]     ; Read next two blocks aligned
        movsd   xmm0, xmm1              ; Moves 8 bytes, leaves remaining bytes unchanged
        shufps  xmm0, xmm0, 01001110B   ; Rotate
%IF %1 == 0
        movaps  [rdi+rcx], xmm0         ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm0         ; Non-temporal save
%ENDIF
        movaps  xmm0, [rsi+rcx+20H]
        movsd   xmm1, xmm0
        shufps  xmm1, xmm1, 01001110B   ; Rotate
%IF %1 == 0
        movaps  [rdi+rcx+10H], xmm1     ; Save aligned
%ELSE
        movntps [rdi+rcx+10H], xmm1     ; Non-temporal save
%ENDIF
        add     rcx, 20H                ; Loop through negative values up to zero
        jnz     %%L1
        ; Set up for edx remaining bytes
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        cmp     edx, -10H
        jg      %%L2
        ; One more 16-bytes block to move
        movaps  xmm1, [rsi+rdx+10H]     ; Read next block aligned
        movsd   xmm0, xmm1
        shufps  xmm0, xmm0, 01001110B   ; Rotate
%IF %1 == 0
        movaps  [rdi+rdx], xmm0         ; Save aligned
%ELSE
        movntps [rdi+rdx], xmm0         ; Non-temporal save
%ENDIF
        add     rdx, 10H
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %1 == 1
        sfence                          ; order non-temporal stores
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO
%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
; Special case of MOVE_UNALIGNED_SSE2 for u = 12:
; pre-rotate each block with SHUFPS, then splice 4 bytes with MOVSS.
; %1 = 1 if non-temporal store desired
        movaps  xmm0, [rsi+rcx]         ; Read from nearest preceding 16B boundary
        shufps  xmm0, xmm0, 10010011B   ; Rotate
%%L1:   ; Loop. rcx has negative index from the end, counting up to zero
        movaps  xmm1, [rsi+rcx+10H]     ; Read next two blocks aligned
        movaps  xmm2, [rsi+rcx+20H]
        shufps  xmm1, xmm1, 10010011B   ; Rotate
        shufps  xmm2, xmm2, 10010011B   ; Rotate
        movaps  xmm3, xmm2
        movss   xmm2, xmm1              ; Moves 4 bytes, leaves remaining bytes unchanged
        movss   xmm1, xmm0              ; Moves 4 bytes, leaves remaining bytes unchanged
%IF %1 == 0
        movaps  [rdi+rcx], xmm1         ; Save aligned
        movaps  [rdi+rcx+10H], xmm2     ; Save aligned
%ELSE
        movntps [rdi+rcx], xmm1         ; Non-temporal save
        movntps [rdi+rcx+10H], xmm2     ; Non-temporal save
%ENDIF
        movaps  xmm0, xmm3              ; Save for next iteration
        add     rcx, 20H                ; Loop through negative values up to zero
        jnz     %%L1
        ; Set up for edx remaining bytes
        add     rsi, rdx
        add     rdi, rdx
        neg     rdx
        cmp     edx, -10H
        jg      %%L2
        ; One more 16-bytes block to move
        movaps  xmm1, [rsi+rdx+10H]     ; Read next block aligned
        shufps  xmm1, xmm1, 10010011B   ; Rotate
        movss   xmm1, xmm0              ; Moves 4 bytes, leaves remaining bytes unchanged
%IF %1 == 0
        movaps  [rdi+rdx], xmm1         ; Save aligned
%ELSE
        movntps [rdi+rdx], xmm1         ; Non-temporal save
%ENDIF
        add     rdx, 10H
%%L2:   ; Get src pointer back to misaligned state
        add     rsi, rax
%IF %1 == 1
        sfence                          ; order non-temporal stores
%ENDIF
        ; Move remaining 0 - 15 bytes, unaligned
        jmp     C200
%ENDMACRO
; Macros for each src alignment, Suppl.SSE3 instruction set:
; Make separate code for each alignment u because the palignr instruction
; has the shift count as a constant:
%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
; Move rcx + rdx bytes of data
; Source is misaligned. (src-dest) modulo 16 = %1
; eax = %1
; rsi = src - %1 = nearest preceding 16-bytes boundary
; rdi = dest (aligned)
; rcx = - (count rounded down to nearest divisible by 32)
; edx = remaining bytes to move after loop
movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
%%L1: ; Loop. rcx has negative index from the end, counting up to zero
movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
movdqa xmm3, [rsi+rcx+20H]
movdqa xmm1, xmm0 ; Save xmm0 (tail of the previous block)
movdqa xmm0, xmm3 ; Save for next iteration
palignr xmm3, xmm2, %1 ; Combine parts into aligned block
palignr xmm2, xmm1, %1 ; Combine parts into aligned block
movdqa [rdi+rcx], xmm2 ; Save aligned
movdqa [rdi+rcx+10H], xmm3 ; Save aligned
add rcx, 20H ; Loop through negative values up to zero
jnz %%L1
; Set up for edx remaining bytes
add rsi, rdx
add rdi, rdx
neg rdx ; rdx = negative index from the new end
cmp edx, -10H
jg %%L2 ; Fewer than 16 bytes left: no whole block to do here
; One more 16-bytes block to move
movdqa xmm2, [rsi+rdx+10H] ; Read one more block
palignr xmm2, xmm0, %1 ; Combine with leftover part of previous block
movdqa [rdi+rdx], xmm2 ; Save aligned
add rdx, 10H
%%L2: ; Get src pointer back to misaligned state
add rsi, rax
; Move remaining 0 - 15 bytes
jmp C200 ; Tail handler (defined earlier in this file)
%ENDMACRO
; Make 15 instances of SSE2 macro for each value of the alignment u.
; These are pointed to by the jump table AlignmentDispatchSSE2 below
; (alignments and fillers are inserted manually to minimize the number
; of 16-bytes boundaries inside loops)
; D10x labels: x (hex) = source misalignment u; second macro argument 0 = ordinary
; (cached) stores. u = 4, 8, 12 use the specialized shufps/movsd variants; the
; other u values use the general psrldq/pslldq variant.
align 16
D104: MOVE_UNALIGNED_SSE2_4 0
;times 4 nop
D108: MOVE_UNALIGNED_SSE2_8 0
;times 4 nop
D10C: MOVE_UNALIGNED_SSE2_12 0
;times 1 nop
D101: MOVE_UNALIGNED_SSE2 1, 0
D102: MOVE_UNALIGNED_SSE2 2, 0
D103: MOVE_UNALIGNED_SSE2 3, 0
D105: MOVE_UNALIGNED_SSE2 5, 0
D106: MOVE_UNALIGNED_SSE2 6, 0
D107: MOVE_UNALIGNED_SSE2 7, 0
D109: MOVE_UNALIGNED_SSE2 9, 0
;times 1 nop
D10A: MOVE_UNALIGNED_SSE2 0AH, 0
D10B: MOVE_UNALIGNED_SSE2 0BH, 0
D10D: MOVE_UNALIGNED_SSE2 0DH, 0
D10E: MOVE_UNALIGNED_SSE2 0EH, 0
D10F: MOVE_UNALIGNED_SSE2 0FH, 0
; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
; These are pointed to by the jump table AlignmentDispatchSupSSE3 below
; E10x labels: x (hex) = source misalignment u, baked into palignr's
; immediate shift count (hence one instance per u).
align 16
E104: MOVE_UNALIGNED_SSSE3 4
E108: MOVE_UNALIGNED_SSSE3 8
E10C: MOVE_UNALIGNED_SSSE3 0CH
E101: MOVE_UNALIGNED_SSSE3 1
E102: MOVE_UNALIGNED_SSSE3 2
E103: MOVE_UNALIGNED_SSSE3 3
E105: MOVE_UNALIGNED_SSSE3 5
E106: MOVE_UNALIGNED_SSSE3 6
E107: MOVE_UNALIGNED_SSSE3 7
E109: MOVE_UNALIGNED_SSSE3 9
times 1 nop ; Manual filler to keep the following loop off a 16-byte boundary
E10A: MOVE_UNALIGNED_SSSE3 0AH
E10B: MOVE_UNALIGNED_SSSE3 0BH
E10D: MOVE_UNALIGNED_SSSE3 0DH
E10E: MOVE_UNALIGNED_SSSE3 0EH
E10F: MOVE_UNALIGNED_SSSE3 0FH
; Codes for non-temporal move. Aligned case first
align 16
F100: ; Non-temporal move, src and dest have same alignment.
; Entry: rcx = -(count rounded down to 32), edx = remaining bytes (0-31),
; rsi/rdi index the end of the rounded-down region via negative rcx.
; Loop. rcx has negative index from the end, counting up to zero
movaps xmm0, [rsi+rcx] ; Read
movaps xmm1, [rsi+rcx+10H]
movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
movntps [rdi+rcx+10H], xmm1
add rcx, 20H
jnz F100 ; Loop through negative rcx up to zero
; Move the remaining edx bytes (0 - 31):
add rsi, rdx
add rdi, rdx
neg rdx
jz C500 ; Skip if no more data
; Check if we can move one more 16-bytes block
cmp edx, -10H
; NOTE(review): this jg path and the jz path above leave without the sfence
; below even though movntps stores were issued in the loop - presumably the
; fence is handled at C200/C500 (not visible in this chunk); confirm upstream.
jg C200
; move 16 bytes, aligned
movaps xmm0, [rsi+rdx]
movntps [rdi+rdx], xmm0 ; Last whole block, non-temporal
add rdx, 10H
sfence ; Make non-temporal stores globally visible before the tail copy
; move the remaining 0 - 15 bytes
jmp C200
; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
; the alignment u.
; These are pointed to by the jump table AlignmentDispatchNT below
; F10x labels: x (hex) = source misalignment u; second macro argument 1
; selects non-temporal (movntps) stores.
;align 16
F104: MOVE_UNALIGNED_SSE2_4 1
F108: MOVE_UNALIGNED_SSE2_8 1
F10C: MOVE_UNALIGNED_SSE2_12 1
F101: MOVE_UNALIGNED_SSE2 1, 1
F102: MOVE_UNALIGNED_SSE2 2, 1
F103: MOVE_UNALIGNED_SSE2 3, 1
F105: MOVE_UNALIGNED_SSE2 5, 1
F106: MOVE_UNALIGNED_SSE2 6, 1
F107: MOVE_UNALIGNED_SSE2 7, 1
F109: MOVE_UNALIGNED_SSE2 9, 1
F10A: MOVE_UNALIGNED_SSE2 0AH, 1
F10B: MOVE_UNALIGNED_SSE2 0BH, 1
F10D: MOVE_UNALIGNED_SSE2 0DH, 1
F10E: MOVE_UNALIGNED_SSE2 0EH, 1
F10F: MOVE_UNALIGNED_SSE2 0FH, 1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; CPU dispatcher
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
; This part is executed only once
; Preserve every register that may carry memcpy's arguments (covers both
; Windows rcx/rdx/r8 and System V rdi/rsi/rdx) plus callee-saved rbx,
; which is used below to hold the chosen function pointer.
push rbx
push rcx
push rdx
push rsi
push rdi
push r8
; set CacheBypassLimit to half the size of the largest level cache
call GetMemcpyCacheLimit@
mov eax, 1
cpuid ; Get feature flags (clobbers eax, ebx, ecx, edx)
lea rbx, [memcpySSE2@] ; Default: SSE2 version (baseline for x86-64)
bt ecx, 9 ; Test bit for SupplSSE3 (CPUID.1:ECX bit 9)
jnc Q100
lea rbx, [memcpySSSE3@]
call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
test eax, eax
jz Q100
lea rbx, [memcpyU@]
call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
test eax, eax
jz Q100
lea rbx, [memcpyU256@]
call InstructionSet ; External helper: returns instruction-set level in eax
cmp eax, 15 ; Level 15 - presumably AVX512F; per InstructionSet's numbering
jb Q100
lea rbx, [memcpyAVX512F@]
cmp eax, 16 ; Level 16 - presumably AVX512BW
jb Q100
lea rbx, [memcpyAVX512BW@]
Q100: ; Insert appropriate pointer
mov [memcpyDispatch], rbx ; Subsequent A_memcpy calls go straight to the chosen version
mov rax, rbx
pop r8
pop rdi
pop rsi
pop rdx
pop rcx
pop rbx
; Jump according to the replaced function pointer
jmp rax ; Tail-jump: chosen version sees the caller's original arguments
; extern "C" size_t GetMemcpyCacheLimit();
; Returns (in rax) the byte-count threshold above which non-temporal
; stores are used. Computed lazily on first call as half the size of the
; largest-level cache and memoized in CacheBypassLimit; falls back to
; 4 MB if the cache size cannot be determined.
GetMemcpyCacheLimit:
GetMemcpyCacheLimit@: ; local limit
mov rax, [CacheBypassLimit]
test rax, rax
jnz U200 ; Already computed: return memoized value
; Get half the size of the largest level cache
%ifdef WINDOWS
xor ecx, ecx ; 0 means largest level cache (first arg in rcx, Windows ABI)
%else
xor edi, edi ; 0 means largest level cache (first arg in rdi, System V ABI)
%endif
call DataCacheSize ; get cache size (external helper, result in rax)
shr rax, 1 ; half the size
jnz U100
mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
U100: mov [CacheBypassLimit], rax ; Memoize (benign race if called concurrently)
U200: ret
; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
; Sets CacheBypassLimit to the first argument; an argument of 0 restores
; the default (half the largest-level cache, via GetMemcpyCacheLimit@).
; Returns the value actually stored in rax.
SetMemcpyCacheLimit1:
%ifdef WINDOWS
mov rax, rcx ; First argument (Windows ABI)
%else
mov rax, rdi ; First argument (System V ABI)
%endif
test rax, rax
jnz U400
; zero, means default
mov [CacheBypassLimit], rax ; Clear so GetMemcpyCacheLimit@ recomputes
call GetMemcpyCacheLimit@
U400: mov [CacheBypassLimit], rax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; getDispatch, for testing only
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
getDispatch:
mov rax,[memcpyDispatch] ; Current dispatch target (memcpyCPUDispatch until the first call)
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; data section. jump tables, dispatch function pointer, cache size
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Data segment must be included in function namespace
SECTION .data
align 16
; Jump tables for alignments 0 - 15:
; The CPU dispatcher replaces AlignmentDispatch with
; AlignmentDispatchSSE2 or AlignmentDispatchSupSSE3 if Suppl-SSE3
; is supported.
; Entry i = handler for (src-dest) mod 16 = i; entry 0 (C100) is the
; aligned case, shared by all three tables.
; Code pointer for each alignment for SSE2 instruction set
AlignmentDispatchSSE2:
DQ C100, D101, D102, D103, D104, D105, D106, D107
DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
; Code pointer for each alignment for Suppl-SSE3 instruction set
AlignmentDispatchSSSE3:
DQ C100, E101, E102, E103, E104, E105, E106, E107
DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
; Code pointer for each alignment for non-temporal store
AlignmentDispatchNT:
DQ F100, F101, F102, F103, F104, F105, F106, F107
DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
; Pointer to appropriate version.
; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
; change this to the appropriate version of memcpy, so that
; memcpyCPUDispatch is only executed once:
memcpyDispatch DQ memcpyCPUDispatch
; Bypass cache by using non-temporal moves if count > CacheBypassLimit
; The optimal value of CacheBypassLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache:
; 0 = not yet computed; filled in lazily by GetMemcpyCacheLimit.
CacheBypassLimit: DQ 0
; --- GitHub gist page residue (not part of the original source) commented out
; so the file remains assemblable; the one substantive reader question is kept:
; Q: How do I compile this with GCC on Linux?
; A: This file is NASM syntax (%MACRO, default rel). Assemble it with
;    "nasm -f elf64 memcpy64.asm -o memcpy64.o" and link the object file
;    with gcc; GCC/GAS cannot assemble NASM source directly.