/*
 * Copyright (C) 2012 The Android Open Source Project
 * Copyright (c) 2022 - 2023 Pedro Falcato
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <string.h>

#include <benchmark/benchmark.h>
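/* Raw wrapper around "rep movsb". On CPUs with ERMS (Enhanced REP MOVSB),
 * the microcoded string copy is competitive with vectorized loops at larger sizes. */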
static void rep_movsb(void* dst, const void* src, size_t n)
{
    __asm__ __volatile__("rep movsb\n\t" : "+D"(dst), "+S"(src), "+c"(n) : : "memory");
}

#define KB 1024
#define MB (1024 * KB)
#define AT_COMMON_SIZES \
    Arg(4)              \
        ->Arg(8)        \
        ->Arg(16)       \
        ->Arg(32)       \
        ->Arg(64)       \
        ->Arg(128)      \
        ->Arg(150)      \
        ->Arg(192)      \
        ->Arg(256)      \
        ->Arg(384)      \
        ->Arg(512)      \
        ->Arg(1 * KB)   \
        ->Arg(8 * KB)   \
        ->Arg(16 * KB)  \
        ->Arg(32 * KB)  \
        ->Arg(64 * KB)
void BM_string_erms(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);

    for (auto _ : state)
    {
        rep_movsb(dst, src, nbytes);
        benchmark::ClobberMemory();
    }

    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}

BENCHMARK(BM_string_erms)->AT_COMMON_SIZES;
void BM_string_memcpy(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);

    for (auto _ : state)
    {
        auto val = memcpy(dst, src, nbytes);
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }

    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}

BENCHMARK(BM_string_memcpy)->AT_COMMON_SIZES;
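/* Routines under test, provided by the assembly implementation below
 * (our_memcpy and our_memmove are weak aliases for __memcpy/__memmove). */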
extern "C" void* our_memcpy(void* dst, const void* src, size_t len); | |
extern "C" void* our_memmove(void* dst, const void* src, size_t len); | |
void BM_string_our_memcpy(benchmark::State& state) | |
{ | |
const auto nbytes = state.range(); | |
char* src = new char[nbytes]; | |
char* dst = new char[nbytes]; | |
memset(src, 'x', nbytes); | |
memset(dst, 'x', nbytes); | |
for (auto _ : state) | |
{ | |
auto val = our_memcpy(dst, src, nbytes); | |
benchmark::DoNotOptimize(val); | |
benchmark::ClobberMemory(); | |
} | |
state.SetBytesProcessed(state.iterations() * nbytes); | |
delete[] src; | |
delete[] dst; | |
} | |
BENCHMARK(BM_string_our_memcpy)->AT_COMMON_SIZES; | |
void BM_string_memmove(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* buf = new char[nbytes + 64];
    memset(buf, 'x', nbytes + 64);

    for (auto _ : state)
    {
        auto val = memmove(buf, buf + 1, nbytes); // Worst-case overlap.
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }

    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] buf;
}

BENCHMARK(BM_string_memmove)->AT_COMMON_SIZES;
void BM_string_our_memmove(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* buf = new char[nbytes + 64];
    memset(buf, 'x', nbytes + 64);

    for (auto _ : state)
    {
        auto val = our_memmove(buf, buf + 1, nbytes); // Worst-case overlap.
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }

    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] buf;
}

BENCHMARK(BM_string_our_memmove)->AT_COMMON_SIZES;

BENCHMARK_MAIN();
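Before trusting the throughput numbers, it is worth confirming that the routines under test actually match libc semantics, including the overlapping case that BM_string_our_memmove exercises. A minimal standalone differential check might look like this (a sketch, not part of the gist; it assumes the same our_memcpy/our_memmove symbols are linked in from the assembly file below):

#include <cassert>
#include <cstddef>
#include <cstring>

extern "C" void* our_memcpy(void* dst, const void* src, size_t len);
extern "C" void* our_memmove(void* dst, const void* src, size_t len);

int main()
{
    char expected[256], actual[256];
    for (size_t n = 0; n <= 200; n++)
    {
        // Reset both buffers to the same known pattern.
        for (size_t i = 0; i < sizeof(expected); i++)
            expected[i] = actual[i] = (char) i;

        // Overlapping move: the same worst case BM_string_memmove exercises.
        memmove(expected, expected + 1, n);
        our_memmove(actual, actual + 1, n);
        assert(memcmp(expected, actual, sizeof(expected)) == 0);

        // Non-overlapping copy of up to 120 bytes.
        size_t m = n > 120 ? 120 : n;
        memcpy(expected + 8, expected + 128, m);
        our_memcpy(actual + 8, actual + 128, m);
        assert(memcmp(expected, actual, sizeof(expected)) == 0);
    }
    return 0;
}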
/*
 * Copyright (c) 2023 Pedro Falcato
 * This file is part of Onyx, and is released under the terms of the MIT License
 * check LICENSE at the root directory for more information
 *
 * SPDX-License-Identifier: MIT
 */
#define RET ret
#define ALIGN_TEXT .p2align 4, 0x90

#ifndef MEMCPY_SYM
#define MEMCPY_SYM memcpy
#endif

#ifndef MEMMOVE_SYM
#define MEMMOVE_SYM memmove
#endif

#ifndef L
#define L(label) .L##label##\suffix
#endif
.macro memcpy_like overlap suffix
    /* Test for 0 */
    test %rdx, %rdx
    jz L(out)
    /* Deal with [0..16], [16..32], [32..512] and [512..] separately */
    cmp $16, %rdx
    jbe L(0_to_16_bytes)
    cmp $32, %rdx
    jbe L(0_to_32_bytes)
.if \overlap == 1
    /* If src < dst and they overlap, do a backwards copy */
    mov %rdi, %r8
    sub %rsi, %r8
    cmp %rdx, %r8
    jb L(copy_backwards)
.endif
    /* Heuristic tested on Kabylake R */
    /* The limit is likely much lower on FSRM but TODO */
    cmp $512, %rdx
    jae L(erms)

    /* Fall through to the 32-byte copy */
    ALIGN_TEXT
L(32_byte_copy):
    movq (%rsi), %rcx
    movq 8(%rsi), %r8
    movq 16(%rsi), %r9
    movq 24(%rsi), %r10
    movq %rcx, (%rdi)
    movq %r8, 8(%rdi)
    movq %r9, 16(%rdi)
    movq %r10, 24(%rdi)
    /* We use both lea and arithmetic insns so as to fully utilize the execution units */
    lea 32(%rsi), %rsi
    lea 32(%rdi), %rdi
    sub $32, %rdx
    jz L(out)
    cmp $32, %rdx
    jae L(32_byte_copy)

    /* Fall through to the 0..32 copy */
    ALIGN_TEXT
    /* This whole code (the part that handles the "tail") relies on being able to do
     * unaligned, overlapping loads and stores. For example, a 2-3 byte copy becomes:
     *     movzwl (%rsi), %ecx
     *     movzwl -2(%rsi, %rdx), %r8d
     *     movw %cx, (%rdi)
     *     movw %r8w, -2(%rdi, %rdx)
     * where rdi is dest, rsi is src, rdx is len. This is much cheaper than branching
     * through a Duff's-device-like dispatch further down.
     *
     * Worth noting that the tail code doesn't need a special backwards version, as we
     * load everything and only then store everything.
     */
L(0_to_32_bytes):
    cmp $16, %rdx
    jbe L(0_to_16_bytes)
    movq (%rsi), %rcx
    movq 8(%rsi), %r8
    movq -16(%rsi, %rdx), %r9
    movq -8(%rsi, %rdx), %r10
    movq %rcx, (%rdi)
    movq %r8, 8(%rdi)
    movq %r9, -16(%rdi, %rdx)
    movq %r10, -8(%rdi, %rdx)
    RET

    ALIGN_TEXT
L(0_to_16_bytes):
    cmp $8, %rdx
    jb L(4_to_7_bytes)
    movq (%rsi), %rcx
    movq -8(%rsi, %rdx), %r8
    movq %rcx, (%rdi)
    movq %r8, -8(%rdi, %rdx)
    RET

    ALIGN_TEXT
L(4_to_7_bytes):
    cmp $4, %rdx
    jb L(1_to_3_bytes)
    movl (%rsi), %ecx
    movl -4(%rsi, %rdx), %r8d
    movl %ecx, (%rdi)
    movl %r8d, -4(%rdi, %rdx)
    RET

    ALIGN_TEXT
L(1_to_3_bytes):
    /* Note: We use movz(b,w)l as it's superior to doing a load to a partial register */
    cmp $1, %rdx
    je L(1_byte)
    movzwl (%rsi), %ecx
    movzwl -2(%rsi, %rdx), %r8d
    movw %cx, (%rdi)
    movw %r8w, -2(%rdi, %rdx)
    RET

L(1_byte):
    movzbl (%rsi), %ecx
    movb %cl, (%rdi)
    RET

    ALIGN_TEXT
L(erms):
    mov %rdx, %rcx
    rep movsb
L(out):
    RET
.if \overlap == 1
L(copy_backwards):
    lea (%rdi, %rdx), %rdi
    lea (%rsi, %rdx), %rsi

    ALIGN_TEXT
    /* Standard 32-byte copy loop, as above, but backwards */
L(32b_backwards):
    movq -8(%rsi), %rcx
    movq -16(%rsi), %r8
    movq -24(%rsi), %r9
    movq -32(%rsi), %r10
    movq %rcx, -8(%rdi)
    movq %r8, -16(%rdi)
    movq %r9, -24(%rdi)
    movq %r10, -32(%rdi)
    /* We use both lea and arithmetic insns so as to fully utilize the execution units */
    lea -32(%rsi), %rsi
    lea -32(%rdi), %rdi
    sub $32, %rdx
    jz L(out)
    cmp $32, %rdx
    jae L(32b_backwards)
    /* Do the tail, but re-adjust the registers first */
    sub %rdx, %rdi
    sub %rdx, %rsi
    jmp L(0_to_32_bytes)
.endif
.endm
    ALIGN_TEXT
.global __memcpy
.type __memcpy, @function
__memcpy:
    /* Set up the return value */
    mov %rdi, %rax
    memcpy_like 0 _memcpy
.size __memcpy, . - __memcpy

    ALIGN_TEXT
.global __memmove
.type __memmove, @function
__memmove:
    /* Set up the return value */
    mov %rdi, %rax
    memcpy_like 1 _memmove
.size __memmove, . - __memmove

.weak our_memcpy
.set our_memcpy, __memcpy
.weak our_memmove
.set our_memmove, __memmove
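The 512-byte cutoff before switching to rep movsb is, per the comment in the macro, a heuristic tuned on a Kaby Lake R part, with a TODO noting the limit is likely much lower on FSRM hardware. If the threshold were chosen at runtime instead, the relevant feature bits could be queried with CPUID; a hedged sketch (the helper names are illustrative, not part of the gist):

#include <cpuid.h>

// ERMS (Enhanced REP MOVSB/STOSB): CPUID.(EAX=7, ECX=0):EBX[bit 9].
static bool cpu_has_erms()
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;
    return ebx & (1u << 9);
}

// FSRM (Fast Short REP MOVSB): CPUID.(EAX=7, ECX=0):EDX[bit 4].
static bool cpu_has_fsrm()
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;
    return edx & (1u << 4);
}

On an FSRM machine one might then lower, or drop entirely, the 512-byte compare before jumping to the rep movsb path.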