Skip to content

Instantly share code, notes, and snippets.

@alexey-milovidov
Created March 2, 2021 03:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexey-milovidov/2c3b4e532b11bfb1708c89bccc7d6576 to your computer and use it in GitHub Desktop.
Save alexey-milovidov/2c3b4e532b11bfb1708c89bccc7d6576 to your computer and use it in GitHub Desktop.
memcpy.S:
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚──────────────────────────────────────────────────────────────────────────────╝
@fileoverview Cosmopolitan Memory Copying
Of all the functions in the technology industry, none are more
critical than the Kernighan & Ritchie Memory Copy API for the C
Language, 1972 model: more commonly known as memcpy(). It's the
world's most popular function──one all programmers love.
This implementation is the fastest and nearly the tiniest too.
It doesn't break when copying backwards or on misaligned data.
It's so easy that even a child could use it, and they do.
*/
#include "libc/nexgen32e/x86feature.h"
// Ends function definition.
// @cost saves 1-3 lines of code
.macro .endfn name:req bnd vis
.size \name,.-\name
.type \name,@function
.ifnb \bnd
.\bnd \name
.endif
.ifnb \vis
.\vis \name
.endif
.endm
// Ends variable definition.
// @cost saves 1-3 lines of code
.macro .endobj name:req bnd vis
.size \name,.-\name
.type \name,@object
.ifnb \bnd
.\bnd \name
.endif
.ifnb \vis
.\vis \name
.endif
.endm
// Helpers for Cosmopolitan _init() amalgamation magic.
// @param name should be consistent across macros for a module
// @see libc/runtime/_init.S
.macro .initro number:req name:req
.section .initro.\number\().\name,"a",@progbits
.align 8
.endm
.macro .initbss number:req name:req
.section .piro.bss.init.2.\number\().\name,"aw",@nobits
.align 8
.endm
.macro .init.start number:req name:req
.section .init.\number\().\name,"ax",@progbits
\name:
.endm
.macro .init.end number:req name:req bnd=globl vis
.endfn \name,\bnd,\vis
.previous
.endm
// LOOP Instruction Replacement.
// With its mop-Fusion Mexican equivalent.
// Thus avoiding 3x legacy pipeline slowdown.
.macro .loop label:req
.byte 0x83,0xe9,0x01 # sub $1,%ecx
jnz \label
.endm
// TODO(jart): delete
// Loads Effective Address
// Supporting security blankets
.macro ezlea symbol:req reg:req
#if __pic__ + __pie__ + __code_model_medium__ + __code_model_large__ + 0 > 1
// lea \symbol(%rip),%r\reg
mov $\symbol,%e\reg
#else
mov $\symbol,%e\reg
#endif
.endm
// Copies memory.
//
// DEST and SRC must not overlap, unless DEST≤SRC.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi copied to rax
// @mode long
// @asyncsignalsafe
memcpy:
/// memcpy function returns pointer to dst (its first argument), it's required by standard but rarely needed.
/// That's why we also provide a function MemCpy that returns nothing and is one instruction less.
mov %rdi, %rax
// 𝑠𝑙𝑖𝑑𝑒
.align 16
.endfn memcpy,globl
// Copies memory with minimal impact ABI.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @clob flags,rcx,xmm3,xmm4
// @mode long
MemCpy:
push %rbp
mov %rsp, %rbp
mov $.Lmemcpytab_ro.size, %ecx
/// rcx - the size of the static part of memcpytab_ro (a small value)
/// rdx - the number of bytes to copy
/// If the number of bytes is smaller than the size of static table - jump to memcpytab[size * 8]
/// If it is larger - jump to the end of memcpytab
cmp %rcx, %rdx
cmovb %rdx, %rcx
jmp *memcpytab(,%rcx,8)
/// This label is needed only to calculate differencies in the memcpytab_ro
.Lanchorpoint:
/// For large sizes we will branch to ERMS and Non-Temporary stores implementations.
.L32r:
cmp $1024,%rdx
jae .Lerms
.L32:
/// We will save the latest 32 bytes before the loop
vmovdqu -32(%rsi,%rdx),%ymm4
/// rcx will be the number of bytes processed so far
mov $32,%rcx
/// The main AVX loop: copy by 32 byte portions from the beginning up to the latest 32 bytes.
0:
add $32,%rcx
vmovdqu -64(%rsi,%rcx),%ymm3
vmovdqu %ymm3,-64(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
/// Now copy the latest 32 bytes. They can overlap previously copied 32 bytes if the size is not a multiple of 32.
/// Example: when copying 33 bytes, the sequence is the following:
/// 1. Save the latest 32 bytes.
/// 2. Copy first 32 bytes.
/// 3. Store the latest 32 bytes.
vmovdqu %ymm4,-32(%rdi,%rdx)
vxorps %ymm4,%ymm4,%ymm4
vxorps %ymm3,%ymm3,%ymm3
jmp .L0
/// For large sizes we will branch to ERMS and Non-Temporary stores implementations.
.L16r:
cmp $1024,%rdx
jae .Lerms
/// This is very similar to AVX but for SSE
.L16:
movdqu -16(%rsi,%rdx),%xmm4
mov $16,%rcx
0:
add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movdqu %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
movdqu %xmm4,-16(%rdi,%rdx)
pxor %xmm4,%xmm4
pxor %xmm3,%xmm3
jmp .L0
/// 9..16 remaining bytes.
.L8:
/// We will use rbx, so save the previous value on stack.
push %rbx
/// Copy them by two halves to allow overlapping regions.
mov (%rsi),%rcx
mov -8(%rsi,%rdx),%rbx
mov %rcx,(%rdi)
mov %rbx,-8(%rdi,%rdx)
/// Restore the value of rbx and return
1:
pop %rbx
/// Zero bytes remaining, exit from function
.L0:
pop %rbp
ret
/// 5..8 remaining bytes
.L4:
/// We will use rbx, so save the previous value on stack.
push %rbx
/// The current four bytes
mov (%rsi),%ecx
/// The latest four bytes
mov -4(%rsi,%rdx),%ebx
mov %ecx,(%rdi)
mov %ebx,-4(%rdi,%rdx)
/// Restore the value of rbx and return
jmp 1b
/// 3..4 remaining bytes
.L3:
/// We will use rbx, so save the previous value on stack.
push %rbx
/// The current two bytes
mov (%rsi),%cx
/// The latest two bytes
mov -2(%rsi,%rdx),%bx
mov %cx,(%rdi)
mov %bx,-2(%rdi,%rdx)
/// Restore the value of rbx and return
jmp 1b
/// Two remaining bytes
.L2:
mov (%rsi),%cx
mov %cx,(%rdi)
jmp .L0
/// One remaining byte
.L1:
mov (%rsi),%cl
mov %cl,(%rdi)
jmp .L0
/// The case when CPU has "erms" flag. It means "Enhanced REP MOVSB/STOSB",
/// and literally: "rep movsb" is a good way for memcpy.
/// But in fact it's profitable only for sizes larger than 1024 bytes but less than half of L3 cache size.
.Lerms:
/// For large size, go to Non-Temporary stores implementation.
cmp kHalfCache3(%rip),%rdx
ja .Lnts
/// "rep movsb" will clobber these registers, but our function should not. Save them and restore before return.
push %rdi
push %rsi
/// This is where "rep movsb" expect to have its counter.
mov %rdx,%rcx
rep movsb
pop %rsi
pop %rdi
/// Return
jmp .L0
/// The case for non-temporary stores.
/// Non-temporary stores are operations that bypass CPU cache (store directly to memory).
/// They are profitable if size is larger than half of L3 cache size.
/// NOTE: when doing memcpy from multiple threads, most likely it will be better to lower
/// the threshold, because the L3 cache is shared among cores.
.Lnts:
/// Copy the first 16 bytes.
movdqu (%rsi),%xmm3
movdqu %xmm3,(%rdi)
/// Align the address of dest + 16 down to 16 bytes boundary and store to rcx.
lea 16(%rdi),%rcx
and $-16,%rcx
/// Align both source and destination and set size to the number of remaining bytes
sub %rdi,%rcx
add %rcx,%rdi
add %rcx,%rsi
sub %rcx,%rdx
/// rcx will be the number of bytes processed so far
mov $16,%rcx
/// Main loop
0:
add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movntdq %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
/// Copy the latest 16 bytes (that may overlap already copied bytes).
/// We need to make the previous stores visible for this operation.
sfence
movdqu -16(%rsi,%rdx),%xmm3
movdqu %xmm3,-16(%rdi,%rdx)
pxor %xmm3,%xmm3
jmp .L0
.endfn MemCpy,globl,hidden
.initro 300,_init_memcpy
memcpytab_ro:
/// This table is indexed by the number of remaining bytes.
.byte .L0-.Lanchorpoint
.byte .L1-.Lanchorpoint
.byte .L2-.Lanchorpoint
.byte .L3-.Lanchorpoint
.byte .L4-.Lanchorpoint
.byte .L4-.Lanchorpoint
.byte .L4-.Lanchorpoint
.byte .L4-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint
.byte .L8-.Lanchorpoint /// For example, 15 bytes can be copied by two 8-byte unaligned load/stores.
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.byte .L16-.Lanchorpoint
.equ .Lmemcpytab_ro.size,.-memcpytab_ro
.endobj memcpytab_ro
.if .Lmemcpytab_ro.size % 8
.error "moar jmptab"
.endif
.byte .L16-.Lanchorpoint # SSE2
.byte .L16r-.Lanchorpoint # SSE2 + ERMS
.byte .L32-.Lanchorpoint # AVX
.byte .L32r-.Lanchorpoint # AVX + ERMS
.byte 0,0,0,0
.endobj memcpytab_ro, globl
.previous /// This means: "undo" section declaration: https://stackoverflow.com/questions/2416879/what-does-asm-previous-mean
/// It is filled at startup with pointers according to the offsets in memcpytab_ro
.initbss 300,_init_memcpy
memcpytab:
.rept .Lmemcpytab_ro.size
.quad 0 /// Pointers for small sizes.
.endr
.quad 0 /// A pointer to the label for large size.
.endobj memcpytab, globl
.previous
.init.start 300,_init_memcpy
ezlea .Lmemcpytab_ro.size, cx
ezlea .Lanchorpoint, dx
testb X86_HAVE(AVX) + kCpuids(%rip)
call memjmpinit
ret
.init.end 300,_init_memcpy
// Initializes jump table for memset() and memcpy().
//
// @param !ZF if required cpu vector extensions are available
// @param rdi is address of 64-bit jump table
// @param rsi is address of 8-bit jump initializers
// @param rdx is address of indirect branch (Anchor point)
// @param ecx is size of jump table
memjmpinit:
push %rbp
mov %rsp, %rbp
setnz %r8b
shl %r8b
/// Transform memcpytab_ro with byte offsets to pointers.
0: xor %eax,%eax
lodsb
add %rdx,%rax
stosq
.loop 0b
xor %eax,%eax
testb X86_HAVE(ERMS) + kCpuids(%rip)
setnz %al
or %r8b,%al
mov (%rsi,%rax),%al
add %rdx,%rax
stosq
lodsq
pop %rbp
ret
.endfn memjmpinit,globl,hidden
// Globally precomputed CPUID.
//
// This module lets us check CPUID in 0.06ns rather than 51.00ns.
// If every piece of native software linked this module, then the
// world would be a much better place; since all the alternatives
// are quite toilsome.
//
// @see www.felixcloutier.com/x86/cpuid
.initbss 201,_init_kCpuids
kCpuids:
.long 0,0,0,0 # EAX=0 (Basic Processor Info)
.long 0,0,0,0 # EAX=1 (Processor Info)
.long 0,0,0,0 # EAX=2
.long 0,0,0,0 # EAX=7 (Extended Features)
.long 0,0,0,0 # EAX=0x80000001 (NexGen32e)
.long 0,0,0,0 # EAX=0x80000007 (APM)
.long 0,0,0,0 # EAX=16h (CPU Frequency)
.endobj kCpuids,globl
.previous
.init.start 201,_init_kCpuids
push %rbx
push $0
push $0x16
push $0xffffffff80000007
push $0xffffffff80000001
push $7
push $2
push $1
mov %rdi,%r8
xor %eax,%eax
1: xor %ecx,%ecx
cpuid
stosl
xchg %eax,%ebx
stosl
xchg %eax,%ecx
stosl
xchg %eax,%edx
stosl
2: pop %rax
test %eax,%eax # EAX = stacklist->pop()
jz 3f # EAX ≠ 0 (EOL sentinel)
cmp KCPUIDS(0H,EAX)(%r8),%al # EAX ≤ CPUID.0 max leaf
jbe 1b # CPUID too new to probe
add $4*4,%rdi
jmp 2b
3: nop
/*
#if !X86_NEED(AVX2)
testb X86_HAVE(AVX)(%r8)
jz 5f
testb X86_HAVE(OSXSAVE)(%r8)
jz 4f
xor %ecx,%ecx
xgetbv
and $XCR0_SSE|XCR0_AVX,%eax
cmp $XCR0_SSE|XCR0_AVX,%eax
je 5f
4: btr $X86_BIT(AVX),X86_WORD(AVX)(%r8)
btr $X86_BIT(AVX2),X86_WORD(AVX2)(%r8)
#endif*/
5: pop %rbx
retq
.init.end 201,_init_kCpuids
.initbss 202,_init_kHalfCache3
// Half size of level 3 cache in bytes.
kHalfCache3:
.quad 0
.endobj kHalfCache3,globl
.previous
.init.start 202,_init_kHalfCache3
cmpl $3,kCpuids(%rip)
jbe 3f
xor %r8d,%r8d
mov $4,%r8d
1: mov %r8d,%eax
mov %r8d,%ecx
push %rbx
cpuid
mov %ebx,%r9d
pop %rbx
test $31,%al
je 3f
cmp $99,%al
jne 2f
mov %r9d,%eax
mov %r9d,%edx
inc %ecx
shr $12,%r9d
shr $22,%eax
and $0x0fff,%edx
and $0x03ff,%r9d
inc %eax
inc %edx
imul %edx,%eax
imul %ecx,%eax
lea 1(%r9),%ecx
imul %ecx,%eax
jmp 4f
2: inc %r8d
jmp 1b
3: mov $0x00400000,%eax
4: shr %eax
stosq
retq
.init.end 202,_init_kHalfCache3
/* your memcpy() 375 bytes
bionic memcpy() 1,429 bytes
glibc memcpy() 27,216 bytes
musl memcpy() 49 bytes
newlib memcpy() 300 bytes
benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
includes function call overhead (unless marked otherwise)
your memcpy(𝑛) for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 297.000 35.125 35.203 92
1 35.000 35.625 35.016 93
2 27.500 17.438 17.555 185
3 21.000 11.875 12.057 270
4 16.250 8.719 8.809 369
7 5.000 4.946 5.069 641
8 7.375 4.422 4.365 745
15 4.067 2.342 2.336 1391
16 4.188 2.242 2.257 1440 «
31 8.032 1.157 1.147 2835
32 2.031 1.723 1.325 2454
63 1.000 0.589 0.589 5523
64 0.578 0.580 0.577 5630 «
127 0.638 0.377 0.320 10151
128 0.289 0.296 0.307 10605
255 0.404 0.202 0.194 16741
256 0.160 0.165 0.166 19574 «
511 0.159 0.123 0.110 29458
512 0.139 0.098 0.097 33571 «
1023 0.107 0.086 0.074 44111
1024 0.103 0.084 0.082 39489
2047 0.057 0.056 0.057 57450
2048 0.055 0.055 0.055 59269
4095 0.044 0.044 0.044 74051
4096 0.043 0.043 0.043 75300 «
8191 0.036 0.036 0.036 91301
8192 0.036 0.035 0.035 92411
16383 0.033 0.032 0.032 102163
16384 0.034 0.032 0.032 102145 « (L1)/2
32767 0.098 0.081 0.077 42271
32768 0.077 0.077 0.076 42781
65535 0.088 0.075 0.072 44973
65536 0.074 0.072 0.071 45520
131071 0.086 0.075 0.072 44869
131072 0.077 0.073 0.072 45076 « (L2)/2
262143 0.095 0.096 0.095 34116
262144 0.096 0.096 0.095 34160
524287 0.102 0.109 0.111 29359
524288 0.107 0.109 0.108 30033
1048575 0.102 0.103 0.104 31112
1048576 0.101 0.103 0.103 31605
2097151 0.104 0.103 0.109 29929
2097152 0.108 0.110 0.103 31652
4194303 0.192 0.172 0.172 18950
4194304 0.168 0.161 0.160 20311 « (L3)/2
8388607 0.339 0.329 0.344 9461 « RAM
8388608 0.384 0.369 0.341 9545
Bionic memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 347.000 40.625 35.984 90
1 37.000 35.625 36.734 89
2 28.500 18.688 18.383 177
3 11.667 12.375 12.359 263
4 12.250 9.406 9.020 361
7 5.000 5.018 5.118 636
8 11.625 5.828 4.779 681
15 3.533 3.158 2.620 1243
16 4.688 2.742 2.884 1129 «
31 1.903 1.262 1.172 2778
32 1.344 1.113 1.125 2895
63 1.444 0.633 0.591 5513
64 0.766 0.580 0.581 5605 «
127 0.512 0.383 0.318 10229
128 0.461 0.315 0.311 10463
255 0.475 0.216 0.193 16840
256 0.371 0.236 0.199 16397 «
511 0.295 0.144 0.120 27223
512 0.240 0.151 0.126 25937 «
1023 0.142 0.101 0.088 36947
1024 0.126 0.108 0.091 35889
2047 0.088 0.074 0.072 45475
2048 0.089 0.077 0.073 44380
4095 0.081 0.065 0.064 50766
4096 0.068 0.066 0.065 50246 «
8191 0.063 0.061 0.060 54075
8192 0.065 0.061 0.061 53731
16383 0.082 0.066 0.061 53765
16384 0.067 0.063 0.062 52765 « (L1)/2
32767 0.102 0.085 0.085 38406
32768 0.086 0.085 0.085 38473
65535 0.098 0.085 0.085 38292
65536 0.086 0.085 0.085 38369
131071 0.438 0.177 0.089 36716
131072 0.092 0.090 0.093 34880 « (L2)/2
262143 0.306 0.146 0.127 25601
262144 0.126 0.168 0.127 25704
524287 0.213 0.152 0.136 23993
524288 0.132 0.159 0.133 24570
1048575 0.127 0.129 0.130 25117
1048576 0.128 0.129 0.130 25107
2097151 0.127 0.127 0.129 25199
2097152 0.127 0.136 0.134 24274
4194303 0.216 0.192 0.228 14237
4194304 0.351 0.351 0.356 9139 « (L3)/2
8388607 0.323 0.293 0.298 10903 « RAM
8388608 0.365 0.296 0.300 10844
GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 53.000 50.625 50.453 64
1 47.000 49.375 49.141 66
2 23.500 25.062 24.898 131
3 15.667 16.792 16.880 193
4 11.750 12.531 12.957 251
7 7.000 7.125 7.190 452
8 6.125 7.578 6.322 514
15 3.133 3.325 3.372 964
16 3.062 3.117 3.132 1038 «
31 1.645 1.601 1.620 2007
32 1.531 1.559 1.585 2051
63 0.778 0.796 0.802 4056
64 0.766 0.768 0.767 4238 «
127 0.480 0.446 0.448 7259
128 0.445 0.419 0.423 7693
255 0.239 0.239 0.236 13781
256 0.238 0.225 0.225 14466 «
511 0.127 0.133 0.132 24555
512 0.123 0.127 0.128 25377 «
1023 0.079 0.081 0.081 40346
1024 0.075 0.077 0.078 41714
2047 0.053 0.055 0.055 59575
2048 0.053 0.053 0.053 60795
4095 0.042 0.043 0.043 75843
4096 0.042 0.042 0.042 77153
8191 0.035 0.036 0.036 91518
8192 0.035 0.035 0.035 92603
16383 0.032 0.032 0.032 102407
16384 0.033 0.032 0.032 102864 « (L1)/2
32767 0.106 0.082 0.078 41486
32768 0.079 0.078 0.079 41290
65535 0.090 0.077 0.075 43565
65536 0.074 0.074 0.073 44299
131071 0.091 0.078 0.075 43196
131072 0.078 0.076 0.074 43673 « (L2)/2
262143 0.097 0.099 0.098 33192
262144 0.098 0.098 0.098 33193
524287 0.105 0.111 0.111 29212
524288 0.109 0.111 0.111 29211
1048575 0.107 0.108 0.108 30069
1048576 0.106 0.112 0.105 30886
2097151 0.105 0.103 0.103 31621
2097152 0.102 0.103 0.104 31280
4194303 0.180 0.158 0.176 18456
4194304 0.167 0.155 0.154 21098 « (L3)/2
8388607 0.538 0.576 0.557 5834 « RAM
8388608 0.750 0.579 0.552 5893
glibc memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 139.000 90.125 84.891 38
1 83.000 82.125 84.359 39
2 61.500 46.438 45.164 72
3 41.667 32.458 31.245 104
4 32.750 26.156 24.410 133
7 20.143 16.732 16.033 203
8 13.375 8.328 6.908 471
15 8.200 6.408 5.753 565
16 4.438 3.570 3.466 938 «
31 3.258 2.891 2.786 1167
32 2.281 1.801 1.732 1878
63 1.635 1.431 1.374 2367
64 1.109 0.896 0.868 3747 «
127 0.921 0.792 0.779 4176
128 0.508 0.511 0.494 6589
255 0.451 0.407 0.402 8081
256 0.324 0.269 0.260 12498 «
511 0.249 0.218 0.212 15335
512 0.178 0.149 0.146 22297 «
1023 0.138 0.124 0.121 26947
1024 0.087 0.089 0.087 37238
2047 0.084 0.077 0.076 43046
2048 0.066 0.059 0.058 56120
4095 0.058 0.054 0.054 60706
4096 0.050 0.046 0.046 71092 «
8191 0.043 0.042 0.042 78259
8192 0.037 0.037 0.037 87409
16383 0.037 0.036 0.035 92065
16384 0.034 0.034 0.033 97942 « (L1)/2
32767 0.104 0.084 0.080 40572
32768 0.079 0.079 0.079 41055
65535 0.094 0.080 0.076 42885
65536 0.077 0.075 0.075 43423
131071 0.092 0.080 0.078 41498
131072 0.082 0.078 0.077 42350 « (L2)/2
262143 0.100 0.101 0.287 11342
262144 0.099 0.099 0.098 33177
524287 0.106 0.111 0.110 29609
524288 0.107 0.119 0.110 29608
1048575 0.104 0.105 0.106 30626
1048576 0.104 0.111 0.105 30878
2097151 0.103 0.103 0.103 31606
2097152 0.102 0.103 0.103 31644
4194303 0.174 0.160 0.165 19714
4194304 0.166 0.157 0.154 21110 « (L3)/2
8388607 0.537 0.554 0.565 5750 « RAM
8388608 0.701 0.537 0.552 5884
musl memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 97.000 80.625 79.891 41
1 77.000 78.875 78.266 42
2 49.500 44.062 42.102 77
3 33.667 32.792 30.651 106
4 29.750 24.281 24.137 135
7 19.000 16.161 15.734 207
8 12.125 7.766 6.721 484
15 8.867 5.892 5.714 569
16 5.062 3.742 3.458 940
31 3.645 2.915 2.715 1198
32 2.156 1.723 1.663 1956
63 1.540 1.367 1.333 2440
64 1.078 0.873 0.833 3905
127 0.874 0.771 0.737 4415
128 0.617 0.487 0.481 6766
255 0.443 0.390 0.382 8504
256 0.316 0.259 0.259 12545
511 0.245 0.232 0.237 13742
512 0.174 0.159 0.208 15668
1023 0.181 0.193 0.182 17821
1024 0.155 0.123 0.114 28579
2047 0.102 0.092 0.085 38219
2048 0.064 0.073 0.070 46577
4095 0.058 0.067 0.065 50272
4096 0.049 0.055 0.055 59467
8191 0.057 0.052 0.049 66468
8192 0.053 0.050 0.051 63557
16383 0.082 0.065 0.064 50897
16384 0.066 0.065 0.061 53697 « (L1)/2
32767 0.121 0.100 0.114 28555
32768 0.093 0.091 0.114 28615
65535 0.118 0.102 0.142 22858
65536 0.108 0.274 0.097 33432
131071 0.117 0.109 0.109 29905
131072 0.110 0.195 0.113 28692 « (L2)/2
262143 0.283 0.166 0.122 26638
262144 0.130 0.144 0.123 26544
524287 0.210 0.153 0.130 25079
524288 0.126 0.128 0.123 26422
1048575 0.139 0.107 0.106 30803
1048576 0.104 0.105 0.106 30683
2097151 0.103 0.103 0.103 31564
2097152 0.102 0.103 0.103 31531
4194303 0.242 0.158 0.169 19238
4194304 0.166 0.161 0.154 21072 « (L3)/2
8388607 0.533 0.549 0.599 5422 « RAM
8388608 0.768 0.630 0.560 5801
newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 61.000 52.875 53.141 61
1 49.000 49.875 50.328 65
2 24.500 24.812 26.727 122
3 15.667 20.125 16.943 192
4 12.750 15.281 13.090 248
7 7.000 7.375 7.431 438
8 5.875 6.422 6.377 510
15 3.267 3.375 3.447 943
16 10.062 6.945 6.386 509
31 2.548 2.488 2.545 1278
32 3.156 3.207 3.201 1016
63 1.190 1.220 1.229 2646
64 1.578 1.588 1.599 2033
127 0.717 0.690 0.685 4744
128 0.820 0.856 0.857 3795
255 0.357 0.359 0.358 9077
256 0.629 0.461 0.426 7630
511 0.260 0.219 0.204 15947
512 0.330 0.299 0.268 12113
1023 0.269 0.175 0.162 20042
1024 0.315 0.201 0.196 16633
2047 0.349 0.241 0.236 13790
2048 0.332 0.269 0.264 12295
4095 0.349 0.295 0.287 11348
4096 0.361 0.313 0.303 10748
8191 0.361 0.317 0.322 10110
8192 0.369 0.326 0.319 10201
16383 0.321 0.322 0.327 9940
16384 0.309 0.330 0.329 9878 « (L1)/2
32767 0.291 0.303 0.307 10599
32768 0.314 0.304 0.305 10667
65535 0.373 0.311 0.313 10396
65536 0.305 0.750 0.421 7729
131071 0.329 0.427 0.384 8470
131072 0.329 0.388 0.361 9020 « (L2)/2
262143 0.520 0.389 0.425 7646
262144 0.364 0.400 0.368 8843
524287 0.449 0.389 0.389 8353
524288 0.384 0.379 0.384 8466
1048575 0.436 0.397 0.401 8107
1048576 0.431 0.397 0.401 8112
2097151 0.417 0.567 0.434 7498
2097152 0.457 0.503 0.427 7621
4194303 0.328 0.348 0.368 8822
4194304 0.343 0.352 0.352 9221 « (L3)/2
8388607 0.313 0.319 0.326 9957 « RAM
8388608 0.366 0.320 0.328 9910
openbsd memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 73.000 41.375 41.484 78
1 39.000 39.875 41.641 78
2 28.500 20.688 21.227 153
3 27.000 15.875 15.557 209
4 16.750 12.656 12.520 260
7 20.429 10.982 10.292 316
8 8.625 5.234 5.576 583
15 7.267 4.758 4.920 661
16 4.312 2.742 2.747 1183
31 4.613 2.891 2.555 1272
32 2.844 1.520 1.441 2256
63 2.397 1.268 1.328 2449
64 1.547 0.822 0.769 4226
127 1.189 0.782 0.671 4842
128 0.727 0.532 0.460 7066
255 0.631 0.463 0.414 7856
256 0.543 0.374 0.302 10775
511 0.542 0.316 0.276 11785
512 0.354 0.260 0.224 14494
1023 0.267 0.245 0.229 14201
1024 0.251 0.200 0.197 16496
2047 0.214 0.226 0.181 17941
2048 0.189 0.167 0.166 19575
4095 0.200 0.168 0.163 19957
4096 0.165 0.155 0.153 21219
8191 0.158 0.153 0.151 21578
8192 0.153 0.148 0.147 22138
16383 0.173 0.148 0.146 22319
16384 0.153 0.487 0.188 17298 « (L1)/2
32767 0.161 0.151 0.192 16893
32768 0.151 0.314 0.213 15275
65535 0.157 0.154 0.148 21969
65536 0.147 0.145 0.145 22493
131071 0.152 0.151 0.154 21145
131072 0.148 0.229 0.158 20564 « (L2)/2
262143 0.320 0.183 0.162 20031
262144 0.330 0.205 0.167 19503
524287 0.159 0.171 0.163 19913
524288 0.250 0.189 0.162 20120
1048575 0.157 0.164 0.161 20182
1048576 0.155 0.156 0.157 20672
2097151 0.161 0.158 0.157 20644
2097152 0.158 0.157 0.165 19727
4194303 0.327 0.256 0.238 13684
4194304 0.232 0.220 0.236 13749 « (L3)/2
8388607 0.721 0.689 0.586 5549 « RAM
8388608 0.943 0.569 0.593 5481 */
memcpy_test.cpp:
#include <string.h>
#include <iostream>
extern "C" int kCpuids;
extern "C" int kHalfCache3;
extern "C" int memcpytab;
extern "C" int memcpytab_ro;
extern "C" void _init_kCpuids(void *);
extern "C" void _init_kHalfCache3(void *);
extern "C" void _init_memcpy(void *, void *);
__attribute__((__constructor__)) void memcpy_init()
{
_init_kCpuids(&kCpuids);
_init_kHalfCache3(&kHalfCache3);
_init_memcpy(&memcpytab, &memcpytab_ro);
}
__attribute__((__noinline__)) void memcpy_noinline(void * __restrict dst, const void * __restrict src, size_t size)
{
memcpy(dst, src, size);
}
int main(int, char **)
{
constexpr size_t buf_size = 100;
char buf[buf_size]{};
memcpy_noinline(buf, "abc", 3);
size_t bytes_to_copy = 3;
while (bytes_to_copy * 2 < buf_size)
{
memcpy_noinline(&buf[bytes_to_copy], buf, bytes_to_copy);
bytes_to_copy *= 2;
}
std::cerr << buf << "\n";
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment