heatd/bench.cpp (secret gist)

Created March 10, 2023 02:08
/*
* Copyright (C) 2012 The Android Open Source Project
* Copyright (c) 2022 - 2023 Pedro Falcato
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <string.h>
#include <benchmark/benchmark.h>
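/* Raw "rep movsb" (ERMS) copy: the "+D"/"+S"/"+c" constraints place dst, src and the
 * byte count in rdi/rsi/rcx and mark them as modified; the "memory" clobber stops the
 * compiler from caching memory accesses across the copy. */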
static void rep_movsb(void* dst, const void* src, size_t n)
{
    __asm__ __volatile__("rep movsb\n\t" : "+D"(dst), "+S"(src), "+c"(n) : : "memory");
}
#define KB 1024
#define MB (1024 * KB)
#define AT_COMMON_SIZES \
Arg(4) \
->Arg(8) \
->Arg(16) \
->Arg(32) \
->Arg(64) \
->Arg(128) \
->Arg(150) \
->Arg(192) \
->Arg(256) \
->Arg(384) \
->Arg(512) \
->Arg(1 * KB) \
->Arg(8 * KB) \
->Arg(16 * KB) \
->Arg(32 * KB) \
->Arg(64 * KB)
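/* Size arguments in the style of bionic's string benchmarks: small and odd sizes
 * exercise the tail paths, the larger powers of two exercise the bulk loop, and the
 * 512-byte ERMS cutover used in the assembly below falls inside this range. */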
void BM_string_erms(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);
    for (auto _ : state)
    {
        rep_movsb(dst, src, nbytes);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}
BENCHMARK(BM_string_erms)->AT_COMMON_SIZES;
void BM_string_memcpy(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);
    for (auto _ : state)
    {
        auto val = memcpy(dst, src, nbytes);
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}
BENCHMARK(BM_string_memcpy)->AT_COMMON_SIZES;
extern "C" void* our_memcpy(void* dst, const void* src, size_t len);
extern "C" void* our_memmove(void* dst, const void* src, size_t len);
void BM_string_our_memcpy(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);
    for (auto _ : state)
    {
        auto val = our_memcpy(dst, src, nbytes);
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}
BENCHMARK(BM_string_our_memcpy)->AT_COMMON_SIZES;
void BM_string_memmove(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* buf = new char[nbytes + 64];
    memset(buf, 'x', nbytes + 64);
    for (auto _ : state)
    {
        auto val = memmove(buf, buf + 1, nbytes); // Worst-case overlap.
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] buf;
}
BENCHMARK(BM_string_memmove)->AT_COMMON_SIZES;
void BM_string_our_memmove(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* buf = new char[nbytes + 64];
    memset(buf, 'x', nbytes + 64);
    for (auto _ : state)
    {
        auto val = our_memmove(buf, buf + 1, nbytes); // Worst-case overlap.
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] buf;
}
BENCHMARK(BM_string_our_memmove)->AT_COMMON_SIZES;
BENCHMARK_MAIN();
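/*
 * Build sketch (file names are assumptions, not part of the gist): compile this
 * file together with the assembly that follows, saved e.g. as memcpy_impl.S,
 * and link against Google Benchmark:
 *   g++ -O2 bench.cpp memcpy_impl.S -lbenchmark -lpthread -o bench
 */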
/*
* Copyright (c) 2023 Pedro Falcato
* This file is part of Onyx, and is released under the terms of the MIT License
* check LICENSE at the root directory for more information
*
* SPDX-License-Identifier: MIT
*/
#define RET ret
#define ALIGN_TEXT .p2align 4, 0x90
#ifndef MEMCPY_SYM
#define MEMCPY_SYM memcpy
#endif
#ifndef MEMMOVE_SYM
#define MEMMOVE_SYM memmove
#endif
#ifndef L
#define L(label) .L##label##\suffix
#endif
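/* memcpy_like: shared body for memcpy/memmove. "overlap" controls whether the
 * backwards-copy path for overlapping buffers is emitted; "suffix" is appended to
 * the local labels so the macro can be expanded more than once in one file. */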
.macro memcpy_like overlap suffix
/* Test for 0 */
test %rdx, %rdx
jz L(out)
/* Deal with [0..16], (16..32], (32..512) and [512..] separately */
cmp $16, %rdx
jbe L(0_to_16_bytes)
cmp $32, %rdx
jbe L(0_to_32_bytes)
.if \overlap == 1
/* If src < dst and the regions overlap, do a backwards copy.
 * The unsigned compare (dst - src) < len tests both conditions at once. */
mov %rdi, %r8
sub %rsi, %r8
cmp %rdx, %r8
jb L(copy_backwards)
.endif
/* ERMS threshold: heuristic tuned on a Kaby Lake R part */
/* The threshold is likely much lower on FSRM CPUs, but tuning that is still TODO */
cmp $512, %rdx
jae L(erms)
/* Fallthrough to the 32 byte copy */
ALIGN_TEXT
L(32_byte_copy):
movq (%rsi), %rcx
movq 8(%rsi), %r8
movq 16(%rsi), %r9
movq 24(%rsi), %r10
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, 16(%rdi)
movq %r10, 24(%rdi)
/* We use both lea and arithmetic insns so as to fully utilize the execution units */
lea 32(%rsi), %rsi
lea 32(%rdi), %rdi
sub $32, %rdx
jz L(out)
cmp $32, %rdx
jae L(32_byte_copy)
/* Fallthrough to the 0..32 copy */
ALIGN_TEXT
/* All of this code (the part that handles the "tail") relies on being able to
 * do unaligned, overlapping loads and stores. For example, a 2-3 byte copy becomes:
 * movzwl (%rsi), %ecx
 * movzwl -2(%rsi, %rdx), %r8d
 * movw %cx, (%rdi)
 * movw %r8w, -2(%rdi, %rdx)
 * where rdi is dst, rsi is src and rdx is len. This is much cheaper than branching
 * further down into some Duff's-device-like construct.
 *
 * Worth noting: the tail code doesn't need a special backwards version, as we load
 * everything and only then store everything.
 */
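/* In C-ish pseudocode (illustrative only; load64/store64 are hypothetical
 * unaligned 8-byte load/store helpers), the 17..32 byte case right below is:
 *   u64 a = load64(src), b = load64(src + 8);
 *   u64 c = load64(src + len - 16), d = load64(src + len - 8);
 *   store64(dst, a); store64(dst + 8, b);
 *   store64(dst + len - 16, c); store64(dst + len - 8, d);
 * The two halves may overlap when len < 32; this is fine because every load
 * happens before any store. */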
L(0_to_32_bytes):
cmp $16, %rdx
jbe L(0_to_16_bytes)
movq (%rsi), %rcx
movq 8(%rsi), %r8
movq -16(%rsi, %rdx), %r9
movq -8(%rsi, %rdx), %r10
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, -16(%rdi, %rdx)
movq %r10, -8(%rdi, %rdx)
RET
ALIGN_TEXT
L(0_to_16_bytes):
cmp $8, %rdx
jb L(4_to_7_bytes)
movq (%rsi), %rcx
movq -8(%rsi, %rdx), %r8
movq %rcx, (%rdi)
movq %r8, -8(%rdi, %rdx)
RET
ALIGN_TEXT
L(4_to_7_bytes):
cmp $4, %rdx
jb L(1_to_3_bytes)
movl (%rsi), %ecx
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
RET
ALIGN_TEXT
L(1_to_3_bytes):
/* Note: we use movz(b,w)l as it's superior to loading into a partial register */
cmp $1, %rdx
je L(1_byte)
movzwl (%rsi), %ecx
movzwl -2(%rsi, %rdx), %r8d
movw %cx, (%rdi)
movw %r8w, -2(%rdi, %rdx)
RET
L(1_byte):
movzbl (%rsi), %ecx
movb %cl, (%rdi)
RET
ALIGN_TEXT
L(erms):
mov %rdx, %rcx
rep movsb
L(out):
RET
.if \overlap == 1
L(copy_backwards):
lea (%rdi, %rdx), %rdi
lea (%rsi, %rdx), %rsi
ALIGN_TEXT
/* Standard 32-byte copy loop, as above, but backwards */
L(32b_backwards):
movq -8(%rsi), %rcx
movq -16(%rsi), %r8
movq -24(%rsi), %r9
movq -32(%rsi), %r10
movq %rcx, -8(%rdi)
movq %r8, -16(%rdi)
movq %r9, -24(%rdi)
movq %r10, -32(%rdi)
/* We use both lea and arithmetic insns so as to fully utilize the execution units */
lea -32(%rsi), %rsi
lea -32(%rdi), %rdi
sub $32, %rdx
jz L(out)
cmp $32, %rdx
jae L(32b_backwards)
/* Handle the tail: rdi/rsi currently point just past the remaining bytes, so move
 * them back by the remaining length and reuse the overlap-safe forward tail code */
sub %rdx, %rdi
sub %rdx, %rsi
jmp L(0_to_32_bytes)
.endif
.endm
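/* The macro is expanded twice below: once for memcpy (overlap handling off) and
 * once for memmove (overlap handling on). Each wrapper loads the return value
 * (rax = original dst) before falling through into the shared body. */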
ALIGN_TEXT
.global __memcpy
.type __memcpy, @function
__memcpy:
/* Set up the return value */
mov %rdi, %rax
memcpy_like 0 _memcpy
.size __memcpy, . - __memcpy
ALIGN_TEXT
.global __memmove
.type __memmove, @function
__memmove:
/* Set up the return value */
mov %rdi, %rax
memcpy_like 1 _memmove
.size __memmove, . - __memmove
.weak our_memcpy
.set our_memcpy, __memcpy
.weak our_memmove
.set our_memmove, __memmove
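/* our_memcpy/our_memmove are the weak aliases that the benchmark above declares
 * with extern "C" and calls directly. */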