page ,132
title memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
; Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
; memcpy() copies a source memory buffer to a destination buffer.
; Overlapping buffers are not treated specially, so propagation may occur.
; memmove() copies a source memory buffer to a destination buffer.
; Overlapping buffers are treated specially, to avoid propagation.
;
;*******************************************************************************
include ksamd64.inc
subttl "memcpy"
;***
;memcpy - Copy source buffer to destination buffer
;
;Purpose:
; memcpy() copies a source memory buffer to a destination memory buffer.
; This routine does NOT recognize overlapping buffers, and thus can lead
; to propagation.
; For cases where propagation must be avoided, memmove() must be used.
;
; Algorithm:
;
; void * memcpy(void * dst, void * src, size_t count)
; {
; void * ret = dst;
;
; /*
; * copy from lower addresses to higher addresses
; */
; while (count--)
; *dst++ = *src++;
;
; return(ret);
; }
;
;memmove - Copy source buffer to destination buffer
;
;Purpose:
; memmove() copies a source memory buffer to a destination memory buffer.
; This routine recognizes overlapping buffers to avoid propagation.
; For cases where propagation is not a problem, memcpy() can be used.
;
; Algorithm:
;
; void * memmove(void * dst, void * src, size_t count)
; {
; void * ret = dst;
;
; if (dst <= src || dst >= (src + count)) {
; /*
; * Non-Overlapping Buffers
; * copy from lower addresses to higher addresses
; */
; while (count--)
; *dst++ = *src++;
; }
; else {
; /*
; * Overlapping Buffers
; * copy from higher addresses to lower addresses
; */
; dst += count - 1;
; src += count - 1;
;
; while (count--)
; *dst-- = *src--;
; }
;
; return(ret);
; }
;
;
;Entry:
; void *dst = pointer to destination buffer
; const void *src = pointer to source buffer
; size_t count = number of bytes to copy
;
;Exit:
; Returns a pointer to the destination buffer in RAX
;
;Uses:
; RAX, RCX, RDX, R8-R11, XMM0, XMM1
;
;Exceptions:
;*******************************************************************************
extrn __favor:dword
extrn __ImageBase:byte
extrn __memcpy_nt_iters:qword ; defined in cpu_disp.c
__FAVOR_ENFSTRG equ 1
__FAVOR_SMSTRG equ 2
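; These are bit indices into the __favor dword; the bt tests further below
; amount to this hedged C sketch:
;     if (__favor & (1 << __FAVOR_ENFSTRG)) ...  /* enhanced "rep movsb" is fast on this CPU */
;     if (__favor & (1 << __FAVOR_SMSTRG))  ...  /* prefer "rep movsb" for small copies      */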
; Code for copying block using enhanced fast strings.
; This code needs to be in a separate routine because
; it uses non-volatile registers which must be saved
; and restored for exception handling.
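; For reference, a hedged C sketch of what this routine performs; the __movsb
; intrinsic from <intrin.h> emits "rep movsb", while the register shuffling and
; unwind bookkeeping below have no C counterpart (helper name is illustrative):
;
; #include <intrin.h>
; #include <stddef.h>
;
; static void *repmovs_copy(void *dst, const void *src, size_t count)
; {
;     __movsb((unsigned char *)dst, (const unsigned char *)src, count);
;     return dst;                     /* memcpy returns the destination pointer */
; }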
NESTED_ENTRY memcpy_repmovs, _TEXT
push_reg rdi
push_reg rsi
.endprolog
mov rax, r11 ; return original destination pointer
mov rdi, rcx ; move destination pointer to rdi
mov rcx, r8 ; move length to rcx
mov rsi, r10 ; move source pointer to rsi
rep movsb ; copy source to destination buffer
begin_epilogue
pop rsi
pop rdi
ret ; return
NESTED_END memcpy_repmovs, _TEXT
; Main memmove/memcpy routine
public memmove
LEAF_ENTRY_ARG3 memcpy, _TEXT, dst:ptr byte, src:ptr byte, count:dword
OPTION PROLOGUE:NONE, EPILOGUE:NONE
memmove = memcpy
mov r11, rcx ; save destination address
mov r10, rdx ; save source address
cmp r8, 16 ; if 16 bytes or less
jbe MoveBytes16 ; go move them quick
cmp r8, 32 ; check for length <= 32 (we know it's > 16)
jbe Move17to32 ; go handle lengths 17-32 as a special case
sub rdx, rcx ; compute offset to source buffer
jae CopyUp ; if above or equal, go move up
lea rax, [r8+r10] ; else compute src + count
cmp rcx, rax ; is dst below (src + count)?
jb CopyDown ; yes, buffers overlap, go move downward
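;
; The dispatch above amounts to this hedged C sketch (the names mirror labels
; in this file):
;
;     if (count <= 16)  goto MoveBytes16;           /* tiny copies via jump table      */
;     if (count <= 32)  goto Move17to32;            /* two possibly-overlapping stores */
;     if ((size_t)src >= (size_t)dst ||
;         (size_t)src + count <= (size_t)dst)
;         goto CopyUp;                              /* forward copy is safe            */
;     goto CopyDown;                                /* overlapping, copy backward      */
;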
CopyUp:
cmp r8, 128
jbe XmmCopySmall
bt __favor, __FAVOR_ENFSTRG ; check for ENFSTRG (enhanced fast strings)
jnc XmmCopyUp ; If Enhanced Fast String not available, use XMM
jmp memcpy_repmovs
; Handle lengths 17-32 as a special case using XMM registers.
; This allows the regular code to assume that there will always be enough
; bytes for the "deferred" block of 16. Also, any case that can be handled
; with just two stores is handled with just two stores; the regular code
; will always do 3 stores for unaligned moves that have a remainder.
; No assumptions are made here about buffer alignment or overlap.
; We load the entire string to be moved in 2 xmm registers before storing
; anything, so this works for any arrangement of overlapping buffers.
;
; dst is in rcx (can modify) and r11 (must preserve for return value)
; src is in r10 (should preserve for consistency)
; rdx is the offset from the dst to the source, so rcx + rdx is the src
; r8 is the length, and is known to be 17 <= r8 <= 32
;
; When length < 32, the first 16 bytes include some of the last 16 bytes,
; and we will store (32 - length) bytes twice. (E.g. in the worst case
; of length 17 we are storing the middle 15 bytes of the buffer twice).
; This is still much faster than doing logic and branching with 1, 2, 4
; and 8 byte conditional copies.
;
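; A hedged C sketch of the same trick with SSE intrinsics (only valid for
; 17 <= count <= 32; both loads happen before either store, so any overlap
; between src and dst is handled correctly; helper name is illustrative):
;
; #include <emmintrin.h>
;
; static void move_17_to_32(unsigned char *dst, const unsigned char *src, size_t count)
; {
;     __m128i head = _mm_loadu_si128((const __m128i *)src);                /* first 16 bytes */
;     __m128i tail = _mm_loadu_si128((const __m128i *)(src + count - 16)); /* last 16 bytes  */
;     _mm_storeu_si128((__m128i *)dst, head);
;     _mm_storeu_si128((__m128i *)(dst + count - 16), tail);
; }
;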
align 16
Move17to32:
movups xmm0, [rdx] ; load first 16 bytes of src
movups xmm1, (-16)[rdx + r8] ; load last 16 bytes of src
movups [rcx], xmm0 ; store first 16 bytes of dst
movups (-16)[rcx + r8], xmm1 ; store last 16 bytes of dst
mov rax, rcx ; set destination address
ret
;
; Move residual bytes.
;
align 16
MoveBytes16:
mov rax, rcx ; mov destination address to rax
lea r9, OFFSET __ImageBase
IFDEF _VCRUNTIME_BUILD_QSPECTRE
and r8, 1Fh ; bound r8 to 0-31 in speculation scenarios (17-31 is padding)
ENDIF
mov ecx, [(IMAGEREL MoveSmall) + r9 +r8*4]
add rcx, r9
jmp rcx
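;
; The table that follows is a computed jump: count (0..16) indexes an array of
; 32-bit image-relative offsets, which is rebased against __ImageBase and then
; jumped to. A hedged C analogue is simply a dense switch on count:
;
;     switch (count) {              /* count is known to be 0..16 here */
;     case 0:  break;
;     case 1:  dst[0] = src[0]; break;
;     /* ... one dedicated MoveSmallN case per size, up to 16 ... */
;     }
;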
MoveSmall dd IMAGEREL MoveSmall0
dd IMAGEREL MoveSmall1
dd IMAGEREL MoveSmall2
dd IMAGEREL MoveSmall3
dd IMAGEREL MoveSmall4
dd IMAGEREL MoveSmall5
dd IMAGEREL MoveSmall6
dd IMAGEREL MoveSmall7
dd IMAGEREL MoveSmall8
dd IMAGEREL MoveSmall9
dd IMAGEREL MoveSmall10
dd IMAGEREL MoveSmall11
dd IMAGEREL MoveSmall12
dd IMAGEREL MoveSmall13
dd IMAGEREL MoveSmall14
dd IMAGEREL MoveSmall15
dd IMAGEREL MoveSmall16
IFDEF _VCRUNTIME_BUILD_QSPECTRE
dd 15 dup (IMAGEREL MoveSmall0) ; 17 -> 31 padding
ENDIF
align 16
MoveSmall0::
ret
MoveSmall2::
movzx ecx, word ptr [rdx] ; get two bytes from source
mov [rax], cx ; write two bytes to destination
ret
MoveSmall8::
mov rcx, qword ptr [rdx] ; get eight bytes from source
mov [rax], rcx ; write eight bytes to destination
ret
MoveSmall3::
movzx ecx, word ptr [rdx] ; get two bytes from source
movzx r8d, byte ptr 2[rdx] ; get last byte from source
mov [rax], cx ; write two bytes to destination
mov 2[rax], r8b ; write last byte to destination
ret
MoveSmall1::
movzx ecx, byte ptr [rdx] ; get byte from source
mov [rax], cl ; write byte to destination
ret
MoveSmall16::
movdqu xmm0, xmmword ptr [rdx] ; get sixteen bytes from source
movdqu xmmword ptr [rax], xmm0 ; write sixteen bytes to destination
ret
align 16
MoveSmall11::
mov r8, qword ptr [rdx] ; get eight bytes from source
movzx ecx, word ptr 8[rdx] ; get two bytes from source
movzx r9d, byte ptr 10[rdx] ; get last byte from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], cx ; write two bytes to destination
mov 10[rax], r9b ; write last byte to destination
ret
MoveSmall4::
mov ecx, dword ptr [rdx] ; get four bytes from source
mov [rax], ecx ; write four bytes to destination
ret
align 16
MoveSmall5::
mov ecx, dword ptr [rdx] ; get four bytes from source
movzx r8d, byte ptr 4[rdx] ; get last byte from source
mov [rax], ecx ; write four bytes to destination
mov 4[rax], r8b ; write last byte to destination
ret
align 16
MoveSmall6::
mov ecx, dword ptr [rdx] ; get four bytes from source
movzx r8d, word ptr 4[rdx] ; get two bytes from source
mov [rax], ecx ; write four bytes to destination
mov 4[rax], r8w ; write two bytes to destination
ret
align 16
MoveSmall7::
mov ecx, dword ptr [rdx] ; get four bytes from source
movzx r8d, word ptr 4[rdx] ; get two bytes from source
movzx r9d, byte ptr 6[rdx] ; get last byte from source
mov [rax], ecx ; write four bytes to destination
mov 4[rax], r8w ; write two bytes to destination
mov 6[rax], r9b ; write last byte to destination
ret
MoveSmall13::
mov r8, qword ptr [rdx] ; get eight bytes from source
mov ecx, dword ptr 8[rdx] ; get four bytes from source
movzx r9d, byte ptr 12[rdx] ; get last byte from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], ecx ; write four bytes to destination
mov 12[rax], r9b ; write last byte to destination
ret
align 16
MoveSmall9::
mov r8, qword ptr [rdx] ; get eight bytes from source
movzx ecx, byte ptr 8[rdx] ; get last byte from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], cl ; write last byte to destination
ret
align 16
MoveSmall10::
mov r8, qword ptr [rdx] ; get eight bytes from source
movzx ecx, word ptr 8[rdx] ; get two bytes from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], cx ; write two bytes to destination
ret
align 16
MoveSmall12::
mov r8, qword ptr [rdx] ; get eight bytes from source
mov ecx, dword ptr 8[rdx] ; get four bytes from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], ecx ; write four bytes to destination
ret
align 16
MoveSmall14::
mov r8, qword ptr [rdx] ; get eight bytes from source
mov ecx, dword ptr 8[rdx] ; get four bytes from source
movzx r9d, word ptr 12[rdx] ; get two bytes from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], ecx ; write four bytes to destination
mov 12[rax], r9w ; write two bytes to destination
ret
align 16
MoveSmall15::
mov r8, qword ptr [rdx] ; get eight bytes from source
mov ecx, dword ptr 8[rdx] ; get four bytes from source
movzx r9d, word ptr 12[rdx] ; get two bytes from source
movzx r10d, byte ptr 14[rdx] ; get last byte from source
mov [rax], r8 ; write eight bytes to destination
mov 8[rax], ecx ; write four bytes to destination
mov 12[rax], r9w ; write two bytes to destination
mov 14[rax], r10b ; write last byte to destination
ret
;
; Memcpy up using SSE instructions.
;
; Preconditions:
; destination in rcx (destructible) and r11 (must preserve for return value)
; source in r10
; length in r8, must be greater than 16
; offset from dest to src in rdx
; source addr > dest addr or else buffers don't overlap
;
; Aligned stores are much faster on AMD hardware, so start by moving however many
; bytes must be moved so updated dst is 16-byte aligned. We need to copy
; (16 - (dest mod 16)) bytes, but it's faster to just do an unaligned copy of 16
; bytes and then start the aligned loop as usual at ((dest - (dest mod 16)) + 16).
; This results in (dest mod 16) bytes being copied twice. This is a lot faster
; than a bunch of code to copy maybe 1 then maybe 2 then maybe 4 then maybe 8
; bytes to achieve dst alignment.
;
; We know the src address is greater than the dst, but not by how much. In the
; case where the difference is less than 16 we must be careful about the bytes
; that will be stored twice. We must do both loads before either store, or the
; second load of those bytes will get the wrong values. We handle this by
; loading the last 16 bytes that can be stored at an aligned address, but
; deferring the store of those bytes to the remainder code, so it can load the
; remainder before storing the deferred bytes. Since either or both of the two
; loops can be skipped, the preconditions needed by the remainder code must
; also apply to the loops. These conditions are:
; - r8 is the count remaining, not including the deferred bytes
; - [rcx + rdx] and [rcx] as usual point to the src and dst where the
; number of bytes given by r8 should be copied from and to.
; - xmm0 holds the 16 deferred bytes that need to be stored at (-16)[rcx]
;
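; Ignoring the deferred-store handling described above, the alignment step
; amounts to this hedged C sketch (<emmintrin.h>, <stdint.h>):
;
;     _mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((const __m128i *)src)); /* unaligned first 16     */
;     size_t adv = 16 - ((uintptr_t)dst & 15);   /* bytes until dst is 16-byte aligned                   */
;     dst += adv;  src += adv;  count -= adv;    /* the old (dst & 15) low bytes end up copied twice     */
;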
align 16
XmmCopyUp:
movups xmm0, [rcx + rdx] ; load deferred bytes
add r8, rcx ; r8 points 1 byte past end
add rcx, 16 ; update to next block.
test r11b, 15 ; test if destination aligned
jz XmmCopyLargeTest ; go try 128-byte blocks
;
; Move alignment bytes.
;
XmmCopyAlign:
movaps xmm1, xmm0 ; save initial bytes in xmm1
and rcx, -16 ; rcx is 16 bytes past first 16-byte align point
movups xmm0, [rcx + rdx] ; load aligned deferred-store bytes
add rcx, 16 ; update to next block
movups [r11], xmm1 ; now safe to store 16 unaligned at start
;
; See if we can move any 128-byte blocks.
;
XmmCopyLargeTest:
sub r8, rcx ; r8 restored to count remaining
mov r9, r8 ; copy count of bytes remaining
shr r9, 7 ; compute number of 128-byte blocks
jz XmmCopySmallTest ; if z jump around to 2nd loop
movaps (-16)[rcx], xmm0 ; going into 1st loop, ok to store deferred bytes
cmp r9, __memcpy_nt_iters ; threshold defined by cpu_disp.c
jna short XmmCopyLargeInner ; jump into 1st loop
jmp XmmCopyLargeInnerNT ; long enough so non-temporal worth it, jump into nt loop
;
; Move 128-byte blocks
;
align 16
;
; When possible, non-mov instructions are put between a load and store
; so their execution can overlap the store.
; The jnz is likewise moved earlier to come before the last store pair.
; Pairs of loads/stores are used to overlap cache latencies.
; movups and movaps are equally fast on aligned addresses; we use movaps
; to document movs that we *know* are going to be aligned, movups otherwise.
; xmm0 must be preloaded before jumping into this loop, and the last
; store must be deferred (and the bytes to store left in xmm0) for the
; following loop and/or the remainder code.
;
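; Ignoring the deferred final store and the load/store interleaving, each pass
; of the loop below amounts to this hedged C sketch (dst is 16-byte aligned
; here, so the aligned _mm_store_si128 is safe):
;
;     for (size_t b = 0; b < blocks; ++b, dst += 128, src += 128)
;         for (int off = 0; off < 128; off += 16)
;             _mm_store_si128((__m128i *)(dst + off),
;                             _mm_loadu_si128((const __m128i *)(src + off)));
;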
XmmCopyLargeOuter:
movaps (-32)[rcx], xmm0 ; store 7th chunk from prior iteration
movaps (-16)[rcx], xmm1 ; store 8th chunk from prior iteration
XmmCopyLargeInner: ; enter loop here with xmm0 preloaded.
movups xmm0, [rcx + rdx] ; load first 16 byte chunk
movups xmm1, 16[rcx + rdx] ; load 2nd 16 byte chunk
add rcx, 128 ; advance destination address
movaps (-128)[rcx], xmm0 ; store first 16 byte chunk
movaps (-112)[rcx], xmm1 ; store 2nd 16 byte chunk
movups xmm0, (-96)[rcx + rdx] ; load 3rd chunk
movups xmm1, (-80)[rcx + rdx] ; load 4th chunk
dec r9 ; dec block counter (set cc for jnz)
movaps (-96)[rcx], xmm0 ; store 3rd chunk
movaps (-80)[rcx], xmm1 ; store 4th chunk
movups xmm0, (-64)[rcx + rdx] ; load 5th chunk
movups xmm1, (-48)[rcx + rdx] ; load 6th chunk
movaps (-64)[rcx], xmm0 ; store 5th chunk
movaps (-48)[rcx], xmm1 ; store 6th chunk
movups xmm0, (-32)[rcx + rdx] ; load 7th chunk
movups xmm1, (-16)[rcx + rdx] ; load 8th chunk
jnz XmmCopyLargeOuter ; loop if more blocks
XmmCopyFinish: ; non-temporal codepath rejoins here
movaps (-32)[rcx], xmm0 ; store 7th chunk from final iteration
and r8, 127 ; compute remaining byte count
movaps xmm0, xmm1 ; 8th chunk becomes deferred bytes
jmp XmmCopySmallTest
XmmCopySmall:
bt __favor, __FAVOR_SMSTRG ; check if string copy should be used.
jc memcpy_repmovs
movups xmm0, [rcx + rdx] ; load deferred bytes
add rcx, 16
sub r8, 16
;
; See if we have any 16-byte blocks left to move
;
XmmCopySmallTest:
mov r9, r8 ; copy count of bytes remaining
shr r9, 4 ; compute number of 16-byte blocks
jz short XmmCopyTrail ; on z, no 16-byte blocks, skip 2nd loop
align 16
XmmCopySmallLoop:
movups (-16)[rcx], xmm0 ; the first time through this is the
; store of the deferred bytes from above
movups xmm0, [rcx + rdx] ; load a block
add rcx, 16 ; advance dest addr (store is deferred)
dec r9
jnz XmmCopySmallLoop
XmmCopyTrail:
and r8, 15 ; compute remaining byte count
jz short XmmCopyReturn ; if z, no remainder bytes to move
;
; Handle remainder bytes.
;
; As at the start, we are going to do an unaligned copy of 16 bytes which will double-write
; some bytes. We must not touch rcx or xmm0 because they have what we need to store the
; deferred block. We use rax to point to the first byte after the end of the buffer and
; back up from there. Note rax is pointing to an address we must not read or write!
;
lea rax, [rcx+r8] ; make rax point one past the end
movups xmm1, (-16)[rax + rdx] ; load last 16 bytes of source buffer
movups (-16)[rax], xmm1 ; write last 16 bytes, including 16-r8 bytes
; from the last aligned block which we are about to
; overstore with identical values
XmmCopyReturn:
movups (-16)[rcx], xmm0 ; store the last deferred aligned block
mov rax, r11 ; we must return the original destination address
ret ;
;
; Move 128-byte blocks non-temporal
;
align 16
;
; non-temporal is exactly the same as the regular xmm loop above, except the movaps
; stores are movntps and we use prefetchnta. We are prefetching in two places, each
; prefetch gets 64 bytes about half an iteration ahead of time (about 10 instructions
; lead time). When we come to the end of the memcpy, we'll be prefetching bytes
; beyond the buffer we need to copy from, which may not be valid bytes. This is
; safe; a prefetch of an invalid address does not trap, the hardware simply
; treats such prefetches as nops.
;
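; One non-temporal 16-byte copy with prefetch looks like this hedged intrinsic
; sketch (<xmmintrin.h>/<emmintrin.h>); the loop below unrolls it to 128 bytes
; per iteration and issues an sfence when done:
;
;     _mm_prefetch((const char *)(src + 512), _MM_HINT_NTA);   /* pull data ahead of the copy */
;     _mm_stream_si128((__m128i *)dst,                         /* movnt-style store, bypasses */
;                      _mm_loadu_si128((const __m128i *)src)); /* the cache; dst is aligned   */
;     /* ... after the loop: _mm_sfence(); */
;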
XmmCopyLargeOuterNT:
movntps (-32)[rcx], xmm0 ; store 7th chunk from prior iteration
movntps (-16)[rcx], xmm1 ; store 8th chunk from prior iteration
XmmCopyLargeInnerNT: ; enter loop here with xmm0 preloaded.
prefetchnta [rcx + rdx + 512] ; prefetch several cache lines ahead
movups xmm0, [rcx + rdx] ; load first 16 byte chunk
movups xmm1, 16[rcx + rdx] ; load 2nd 16 byte chunk
add rcx, 128 ; advance destination address
movntps (-128)[rcx], xmm0 ; store first 16 byte chunk
movntps (-112)[rcx], xmm1 ; store 2nd 16 byte chunk
movups xmm0, (-96)[rcx + rdx] ; load 3rd chunk
movups xmm1, (-80)[rcx + rdx] ; load 4th chunk
dec r9 ; dec block counter (set cc for jnz)
movntps (-96)[rcx], xmm0 ; store 3rd chunk
movntps (-80)[rcx], xmm1 ; store 4th chunk
movups xmm0, (-64)[rcx + rdx] ; load 5th chunk
movups xmm1, (-48)[rcx + rdx] ; load 6th chunk
prefetchnta [rcx + rdx + 576] ; prefetch several cache lines ahead
movntps (-64)[rcx], xmm0 ; store 5th chunk
movntps (-48)[rcx], xmm1 ; store 6th chunk
movups xmm0, (-32)[rcx + rdx] ; load 7th chunk
movups xmm1, (-16)[rcx + rdx] ; load 8th chunk
jnz XmmCopyLargeOuterNT ; loop if more blocks
sfence
jmp XmmCopyFinish ; rejoin regular memcpy codepath
;
; The source address is less than the destination address.
;
align 16
;
; Move bytes down using SSE registers. The source address is less than
; the destination address and the buffers overlap. We will do everything back-to-front.
;
; Preconditions:
; destination is r11 (must preserve for return value) and rcx
; source in r10 (must preserve for remainder move)
; length in r8, must have been verified to be greater than 16
; offset from dest to src in rdx
; source addr < dest addr and the buffers overlap
;
CopyDown:
add rcx, r8 ; make rcx point one past the end of the dst buffer
movups xmm0, -16[rcx + rdx] ; load deferred bytes
sub rcx, 16 ; reduce dst addr
sub r8, 16 ; r8 -= 16 in case aligned
;
; Aligned stores using movaps or movups are faster on AMD hardware than unaligned
; stores using movups. To achieve 16-byte dest alignment, we do an unaligned move
; of the last 16 bytes of the buffers, then reduce rcx only by the amount necessary
; to achieve alignment. This results in some bytes getting copied twice, unless we're
; already aligned.
;
; We know the src address is less than the dst, but not by exactly how much. In the
; case where the difference is less than 16 we must be careful about the bytes
; that will be stored twice. We must do both loads before either store, or the
; second load of those bytes will get the wrong values. We handle this by
; deferring the store of 16 aligned bytes to the remainder code, so it can load the
; remainder before storing the deferred bytes. Since either or both of the two
; loops can be skipped, the preconditions needed by the remainder code must
; also apply to the loops. These conditions are:
; - r8 is the count remaining, not including the deferred bytes
; - [rcx] points one past the end of the remainder bytes
; - rdx is the offset from the dst to the source
; - xmm0 holds the 16 deferred bytes that need to be stored at [rcx]
;
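; Ignoring the deferred-store bookkeeping, the downward alignment step amounts
; to this hedged C sketch (delta stands for the src-dst offset held in rdx):
;
;     unsigned char *end = dst + count;                  /* one past the end of the dst buffer   */
;     _mm_storeu_si128((__m128i *)(end - 16),            /* unaligned store of the last 16 bytes */
;                      _mm_loadu_si128((const __m128i *)(end - 16 + delta)));
;     end   = (unsigned char *)((uintptr_t)end & ~(uintptr_t)15);  /* round the end down         */
;     count = (size_t)(end - dst);                       /* remaining bytes now end 16-aligned   */
;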
test cl, 15 ; test if dest aligned
jz XmmMovLargeTest ; go try 128-byte blocks
;
; Move alignment bytes.
;
XmmMovAlign:
mov rax, rcx ; save unaligned store address
and rcx, -16 ; rcx is deferred store address
movups xmm1, xmm0 ; copy unaligned last bytes to xmm1
movups xmm0, [rcx + rdx] ; load deferred-store bytes
movups [rax], xmm1 ; now safe to do unaligned store
mov r8, rcx ; easier to recalc r8 using rcx-r11 ...
sub r8, r11 ; ... than calc how much to subtract from r8
;
; See if we can move any 128-byte blocks.
;
XmmMovLargeTest:
mov r9, r8 ; copy count of bytes remaining
shr r9, 7 ; compute number of 128-byte blocks
jz short XmmMovSmallTest ; if z jump around to 2nd loop
movaps [rcx], xmm0 ; going into 1st loop, ok to store deferred bytes
jmp short XmmMovLargeInner ; jump into 1st loop
;
; Move 128-byte blocks
;
align 16
XmmMovLargeOuter:
movaps (128-112)[rcx], xmm0 ; store 7th chunk from prior iteration
movaps (128-128)[rcx], xmm1 ; store 8th chunk from prior iteration
XmmMovLargeInner:
movups xmm0, (-16)[rcx + rdx] ; load first 16 byte chunk
movups xmm1, (-32)[rcx + rdx] ; load 2nd 16 byte chunk
sub rcx, 128 ; reduce destination address
movaps (128-16)[rcx], xmm0 ; store first 16 byte chunk
movaps (128-32)[rcx], xmm1 ; store 2nd 16 byte chunk
movups xmm0, (128-48)[rcx + rdx] ; load 3rd chunk
movups xmm1, (128-64)[rcx + rdx] ; load 4th chunk
dec r9 ; dec block counter (set cc for jnz)
movaps (128-48)[rcx], xmm0 ; store 3rd chunk
movaps (128-64)[rcx], xmm1 ; store 4th chunk
movups xmm0, (128-80)[rcx + rdx] ; load 5th chunk
movups xmm1, (128-96)[rcx + rdx] ; load 6th chunk
movaps (128-80)[rcx], xmm0 ; store 5th chunk
movaps (128-96)[rcx], xmm1 ; store 6th chunk
movups xmm0, (128-112)[rcx + rdx] ; load 7th chunk
movups xmm1, (128-128)[rcx + rdx] ; load 8th chunk
jnz short XmmMovLargeOuter ; loop if more blocks
movaps (128-112)[rcx], xmm0 ; store 7th chunk from final iteration
and r8, 127 ; compute remaining byte count
movaps xmm0, xmm1 ; 8th chunk becomes deferred bytes
;
; See if we have any 16-byte blocks left to move
;
XmmMovSmallTest:
mov r9, r8 ; copy count of bytes remaining
shr r9, 4 ; compute number of 16-byte blocks
jz short XmmMovTrailing ; if z, no 16-byte blocks
align 16
XmmMovSmallLoop:
movups [rcx], xmm0 ; the first time through this is the
; store of the deferred bytes from above
sub rcx, 16 ; reduce dest addr
movups xmm0, [rcx + rdx] ; load a block
dec r9
jnz XmmMovSmallLoop
XmmMovTrailing:
and r8, 15 ; compute remaining byte count
jz short XmmMovReturn ; if z, no residual bytes to move
;
; Handle remainder bytes.
;
; As at the start, we are going to do an unaligned copy of 16 bytes which will double-write
; some bytes. We must not touch rcx or xmm0 because they have what we need to store the
; deferred block. But unlike the copy-up XMM code above, we have r10 and r11, which we can
; just use to copy the lowest 16 bytes.
;
movups xmm1, [r10] ; load lowest 16 bytes, which includes remainder
movups [r11], xmm1 ; store lowest 16 bytes, which includes remainder
XmmMovReturn:
movups [rcx], xmm0 ; store deferred bytes
mov rax, r11 ; we must return destination address
ret
LEAF_END memcpy, _TEXT
end