-
-
Save heatd/fe2c9a2d3a4ef04616d481ee6660c722 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SPDX-License-Identifier: GPL-2.0 | |
#define ALIGN_TEXT .p2align 4, 0x90

#ifndef MEMCPY_SYM
#define MEMCPY_SYM memcpy
#endif

/*
 * void *MEMCPY_SYM(void *dst, const void *src, size_t n)
 *
 * Syntax: GNU as, AT&T operand order (src, dst); run through the C
 * preprocessor for the macros above.
 *
 * In:     %rdi = dst, %rsi = src, %rdx = n (byte count)
 * Out:    %rax = dst (the original %rdi)
 * Clobb:  %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, flags
 * Stack:  none used
 *
 * Strategy by size:
 *   n == 0        return immediately
 *   1  .. 16      one or two (possibly overlapping) loads/stores
 *   17 .. 32      first 16 bytes + last 16 bytes (may overlap)
 *   33 .. 255     32-byte chunks, then the <= 32 byte tail as above
 *   >= 256        rep movsb
 *
 * NOTE(review): the rep movsb path copies forward only if the direction
 * flag is clear on entry; this routine never touches DF, so it relies on
 * the caller for that - confirm against the target ABI.
 */
        .global MEMCPY_SYM
        .type   MEMCPY_SYM, @function
MEMCPY_SYM:
        /* Set up the return value */
        mov     %rdi, %rax
        /* Test for 0 */
        test    %rdx, %rdx
        jz      .Lout

        /* Deal with [1..16], [17..32], [33..255] and [256..] separately */
        cmp     $16, %rdx
        jbe     .L0_to_16_bytes
        cmp     $32, %rdx
        jbe     .L0_to_32_bytes
        /* Heuristic from FreeBSD */
        /* Seems to check out locally (Kabylake R) */
        cmp     $256, %rdx
        jae     .Lerms
        /* Fallthrough to the 32 byte copy with 33 <= n <= 255 */

        ALIGN_TEXT
.L32_byte_copy:
        /*
         * Invariant: %rdx > 32 on every entry (fallthrough gives 33..255,
         * the back-edge below is taken only when %rdx > 32).
         */
        /* TODO: Investigate if we need/should interleave loads and stores */
        movq    (%rsi), %rcx
        movq    8(%rsi), %r8
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10
        movq    %rcx, (%rdi)
        movq    %r8, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)
        /* We use both lea and sub as to fully utilize execution units */
        lea     32(%rsi), %rsi
        lea     32(%rdi), %rdi
        /*
         * %rdx was > 32, so the remainder is >= 1 here: no zero check is
         * needed, the tail code below always has something to copy.
         */
        sub     $32, %rdx
        cmp     $32, %rdx
        ja      .L32_byte_copy
        /* Fallthrough to the 0..32 copy with 1 <= n <= 32 */

        ALIGN_TEXT
.L0_to_32_bytes:
        /* Reached with 17..32 from the entry dispatch, 1..32 from the loop */
        cmp     $16, %rdx
        jbe     .L0_to_16_bytes
        /* 17..32: first 16 bytes, then the last 16 bytes (may overlap) */
        movq    (%rsi), %rcx
        movq    8(%rsi), %r8
        movq    %rcx, (%rdi)
        movq    %r8, 8(%rdi)
        movq    -16(%rsi, %rdx), %rcx
        movq    -8(%rsi, %rdx), %r8
        movq    %rcx, -16(%rdi, %rdx)
        movq    %r8, -8(%rdi, %rdx)
        ret

        ALIGN_TEXT
.L0_to_16_bytes:
        /* Reached with 1 <= n <= 16 */
        cmp     $8, %rdx
        jb      .L4_to_7_bytes
        /* 8..16: first 8 bytes and last 8 bytes (may overlap) */
        movq    (%rsi), %rcx
        movq    -8(%rsi, %rdx), %r8
        movq    %rcx, (%rdi)
        movq    %r8, -8(%rdi, %rdx)
        ret

        ALIGN_TEXT
.L4_to_7_bytes:
        /* Reached with 1 <= n <= 7; only 4..7 is handled here */
        cmp     $4, %rdx
        jb      .L1_to_3_bytes
        /* 4..7: first 4 bytes and last 4 bytes (may overlap) */
        movl    (%rsi), %ecx
        movl    %ecx, (%rdi)
        movl    -4(%rsi, %rdx), %ecx
        movl    %ecx, -4(%rdi, %rdx)
        ret

        ALIGN_TEXT
.L1_to_3_bytes:
        /* Reached with 1 <= n <= 3 */
        cmp     $1, %rdx
        je      .L1_byte
        /*
         * 2..3: first 2 bytes and last 2 bytes (may overlap).
         * Zero-extending loads avoid merging into the old %rcx value.
         */
        movzwl  (%rsi), %ecx
        movw    %cx, (%rdi)
        movzwl  -2(%rsi, %rdx), %ecx
        movw    %cx, -2(%rdi, %rdx)
        ret
.L1_byte:
        movzbl  (%rsi), %ecx
        movb    %cl, (%rdi)
        ret

        ALIGN_TEXT
.Lerms:
        /* n >= 256: rep movsb takes its count in %rcx */
        mov     %rdx, %rcx
        rep movsb
.Lout:
        ret
        /* Record the symbol size so ELF tools see the function's extent */
        .size   MEMCPY_SYM, . - MEMCPY_SYM
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment