Skip to content

Instantly share code, notes, and snippets.

@heatd
Created March 5, 2023 17:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save heatd/fe2c9a2d3a4ef04616d481ee6660c722 to your computer and use it in GitHub Desktop.
Save heatd/fe2c9a2d3a4ef04616d481ee6660c722 to your computer and use it in GitHub Desktop.
// SPDX-License-Identifier: GPL-2.0
/* Pad to a 16-byte boundary with single-byte NOPs (0x90) */
#define ALIGN_TEXT .p2align 4, 0x90

/* Allow the including build to pick a different symbol name */
#ifndef MEMCPY_SYM
#define MEMCPY_SYM memcpy
#endif

/*
 * void *MEMCPY_SYM(void *dst, const void *src, size_t n)
 *
 * Syntax: GNU as, AT&T operand order; the file goes through the C
 *         preprocessor first.
 * ABI:    System V AMD64
 * In:     %rdi = dst, %rsi = src, %rdx = n
 * Out:    %rax = dst
 * Clobb:  %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, flags
 * Stack:  none (leaf routine)
 *
 * Overlapping buffers are not supported: several of the small-size paths
 * store the head of the buffer before loading its tail.
 *
 * Size dispatch:
 *     0          return immediately
 *     1 .. 16    one or two (possibly overlapping) loads/stores of 1/2/4/8 bytes
 *    17 .. 32    first 16 bytes + last 16 bytes
 *    33 .. 255   32-byte loop, then one of the tail paths above
 *   256 ..       rep movsb
 */
	.text
	.global MEMCPY_SYM
	.type MEMCPY_SYM, @function
MEMCPY_SYM:
	/* The return value is always the original destination pointer */
	mov	%rdi, %rax

	/* n == 0: nothing to do */
	test	%rdx, %rdx
	jz	.Lout

	/* Deal with [1..16], [17..32], [33..255] and [256..] separately */
	cmp	$16, %rdx
	jbe	.L0_to_16_bytes
	cmp	$32, %rdx
	jbe	.L0_to_32_bytes

	/* Heuristic from FreeBSD */
	/* Seems to check out locally (Kabylake R) */
	cmp	$256, %rdx
	jae	.Lerms

	/* Fallthrough to the 32 byte copy */
	ALIGN_TEXT
.L32_byte_copy:
	/* Invariant: rdx = bytes remaining, always > 32 on entry to the loop */
	/* TODO: Investigate if we need/should interleave loads and stores */
	movq	(%rsi), %rcx
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	%rcx, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	/* We use both lea and sub as to fully utilize execution units */
	lea	32(%rsi), %rsi
	lea	32(%rdi), %rdi
	sub	$32, %rdx
	/*
	 * No zero check here: rdx was > 32 before the subtraction, so at
	 * least one byte is always left for the tail code below.
	 */
	cmp	$32, %rdx
	ja	.L32_byte_copy

	/* Fallthrough to the 0..32 copy with rdx in [1..32] */
	ALIGN_TEXT
.L0_to_32_bytes:
	cmp	$16, %rdx
	jbe	.L0_to_16_bytes
	/* 17..32 bytes: first 16 and last 16; the two halves may overlap */
	movq	(%rsi), %rcx
	movq	8(%rsi), %r8
	movq	%rcx, (%rdi)
	movq	%r8, 8(%rdi)
	movq	-16(%rsi, %rdx), %rcx
	movq	-8(%rsi, %rdx), %r8
	movq	%rcx, -16(%rdi, %rdx)
	movq	%r8, -8(%rdi, %rdx)
	ret

	ALIGN_TEXT
.L0_to_16_bytes:
	cmp	$8, %rdx
	jb	.L4_to_7_bytes
	/* 8..16 bytes: first 8 and last 8; the two halves may overlap */
	movq	(%rsi), %rcx
	movq	-8(%rsi, %rdx), %r8
	movq	%rcx, (%rdi)
	movq	%r8, -8(%rdi, %rdx)
	ret

	ALIGN_TEXT
.L4_to_7_bytes:
	/* Reached with rdx in [1..7] */
	cmp	$4, %rdx
	jb	.L1_to_3_bytes
	/* 4..7 bytes: first 4 and last 4; the two halves may overlap */
	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)
	movl	-4(%rsi, %rdx), %ecx
	movl	%ecx, -4(%rdi, %rdx)
	ret

	ALIGN_TEXT
.L1_to_3_bytes:
	cmp	$1, %rdx
	je	.L1_byte
	/*
	 * 2..3 bytes: first 2 and last 2. Zero-extending loads are used so
	 * the write to ecx does not merge with the previous value of rcx.
	 */
	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)
	movzwl	-2(%rsi, %rdx), %ecx
	movw	%cx, -2(%rdi, %rdx)
	ret
.L1_byte:
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
	ret

	ALIGN_TEXT
.Lerms:
	/*
	 * Large copy: rep movsb takes the count in rcx and copies forward,
	 * relying on the direction flag being clear as the ABI requires at
	 * function entry.
	 */
	mov	%rdx, %rcx
	rep movsb
.Lout:
	ret
	/* Record the symbol size so ELF tools can attribute addresses to it */
	.size MEMCPY_SYM, . - MEMCPY_SYM
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment