heatd/bench.cpp (secret gist)

Created March 10, 2023 02:08
/*
* Copyright (C) 2012 The Android Open Source Project
* Copyright (c) 2022 - 2023 Pedro Falcato
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <string.h>
#include <benchmark/benchmark.h>
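/* Raw "rep movsb" (ERMS) copy: the "+D"/"+S"/"+c" constraints place dst, src and the
 * byte count in rdi/rsi/rcx and mark them as modified; the "memory" clobber stops the
 * compiler from caching memory accesses across the copy. */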
static void rep_movsb(void* dst, const void* src, size_t n)
{
    __asm__ __volatile__("rep movsb\n\t" : "+D"(dst), "+S"(src), "+c"(n) : : "memory");
}
#define KB 1024
#define MB (1024 * KB)
#define AT_COMMON_SIZES \
Arg(4) \
->Arg(8) \
->Arg(16) \
->Arg(32) \
->Arg(64) \
->Arg(128) \
->Arg(150) \
->Arg(192) \
->Arg(256) \
->Arg(384) \
->Arg(512) \
->Arg(1 * KB) \
->Arg(8 * KB) \
->Arg(16 * KB) \
->Arg(32 * KB) \
->Arg(64 * KB)
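/* Size arguments in the style of bionic's string benchmarks: small and odd sizes
 * exercise the tail paths, the larger powers of two exercise the bulk loop, and the
 * 512-byte ERMS cutover used in the assembly below falls inside this range. */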
void BM_string_erms(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);
    for (auto _ : state)
    {
        rep_movsb(dst, src, nbytes);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}
BENCHMARK(BM_string_erms)->AT_COMMON_SIZES;
void BM_string_memcpy(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);
    for (auto _ : state)
    {
        auto val = memcpy(dst, src, nbytes);
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}
BENCHMARK(BM_string_memcpy)->AT_COMMON_SIZES;
extern "C" void* our_memcpy(void* dst, const void* src, size_t len);
extern "C" void* our_memmove(void* dst, const void* src, size_t len);
void BM_string_our_memcpy(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* src = new char[nbytes];
    char* dst = new char[nbytes];
    memset(src, 'x', nbytes);
    memset(dst, 'x', nbytes);
    for (auto _ : state)
    {
        auto val = our_memcpy(dst, src, nbytes);
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] src;
    delete[] dst;
}
BENCHMARK(BM_string_our_memcpy)->AT_COMMON_SIZES;
void BM_string_memmove(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* buf = new char[nbytes + 64];
    memset(buf, 'x', nbytes + 64);
    for (auto _ : state)
    {
        auto val = memmove(buf, buf + 1, nbytes); // Worst-case overlap.
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] buf;
}
BENCHMARK(BM_string_memmove)->AT_COMMON_SIZES;
void BM_string_our_memmove(benchmark::State& state)
{
    const auto nbytes = state.range();
    char* buf = new char[nbytes + 64];
    memset(buf, 'x', nbytes + 64);
    for (auto _ : state)
    {
        auto val = our_memmove(buf, buf + 1, nbytes); // Worst-case overlap.
        benchmark::DoNotOptimize(val);
        benchmark::ClobberMemory();
    }
    state.SetBytesProcessed(state.iterations() * nbytes);
    delete[] buf;
}
BENCHMARK(BM_string_our_memmove)->AT_COMMON_SIZES;
BENCHMARK_MAIN();
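/*
 * Build sketch (file names are assumptions, not part of the gist): compile this
 * file together with the assembly that follows, saved e.g. as memcpy_impl.S,
 * and link against Google Benchmark:
 *   g++ -O2 bench.cpp memcpy_impl.S -lbenchmark -lpthread -o bench
 */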
/*
* Copyright (c) 2023 Pedro Falcato
* This file is part of Onyx, and is released under the terms of the MIT License
* check LICENSE at the root directory for more information
*
* SPDX-License-Identifier: MIT
*/
#define RET ret
#define ALIGN_TEXT .p2align 4, 0x90
#ifndef MEMCPY_SYM
#define MEMCPY_SYM memcpy
#endif
#ifndef MEMMOVE_SYM
#define MEMMOVE_SYM memmove
#endif
#ifndef L
#define L(label) .L##label##\suffix
#endif
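/* memcpy_like: shared body for memcpy/memmove. "overlap" controls whether the
 * backwards-copy path for overlapping buffers is emitted; "suffix" is appended to
 * the local labels so the macro can be expanded more than once in one file. */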
.macro memcpy_like overlap suffix
/* Test for 0 */
test %rdx, %rdx
jz L(out)
/* Deal with [0..16], (16..32], (32..512) and [512..] separately */
cmp $16, %rdx
jbe L(0_to_16_bytes)
cmp $32, %rdx
jbe L(0_to_32_bytes)
.if \overlap == 1
/* If src < dst and the regions overlap, do a backwards copy.
 * The unsigned compare (dst - src) < len tests both conditions at once. */
mov %rdi, %r8
sub %rsi, %r8
cmp %rdx, %r8
jb L(copy_backwards)
.endif
/* ERMS threshold: heuristic tuned on a Kaby Lake R part */
/* The threshold is likely much lower on FSRM CPUs, but tuning that is still TODO */
cmp $512, %rdx
jae L(erms)
/* Fallthrough to the 32 byte copy */
ALIGN_TEXT
L(32_byte_copy):
movq (%rsi), %rcx
movq 8(%rsi), %r8
movq 16(%rsi), %r9
movq 24(%rsi), %r10
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, 16(%rdi)
movq %r10, 24(%rdi)
/* We use both lea and arithmetic insns so as to fully utilize the execution units */
lea 32(%rsi), %rsi
lea 32(%rdi), %rdi
sub $32, %rdx
jz L(out)
cmp $32, %rdx
jae L(32_byte_copy)
/* Fallthrough to the 0..32 copy */
ALIGN_TEXT
/* All of this code (the part that handles the "tail") relies on being able to
 * do unaligned, overlapping loads and stores. For example, a 2-3 byte copy becomes:
 * movzwl (%rsi), %ecx
 * movzwl -2(%rsi, %rdx), %r8d
 * movw %cx, (%rdi)
 * movw %r8w, -2(%rdi, %rdx)
 * where rdi is dst, rsi is src and rdx is len. This is much cheaper than branching
 * further down into some Duff's-device-like construct.
 *
 * Worth noting: the tail code doesn't need a special backwards version, as we load
 * everything and only then store everything.
 */
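/* In C-ish pseudocode (illustrative only; load64/store64 are hypothetical
 * unaligned 8-byte load/store helpers), the 17..32 byte case right below is:
 *   u64 a = load64(src), b = load64(src + 8);
 *   u64 c = load64(src + len - 16), d = load64(src + len - 8);
 *   store64(dst, a); store64(dst + 8, b);
 *   store64(dst + len - 16, c); store64(dst + len - 8, d);
 * The two halves may overlap when len < 32; this is fine because every load
 * happens before any store. */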
L(0_to_32_bytes):
cmp $16, %rdx
jbe L(0_to_16_bytes)
movq (%rsi), %rcx
movq 8(%rsi), %r8
movq -16(%rsi, %rdx), %r9
movq -8(%rsi, %rdx), %r10
movq %rcx, (%rdi)
movq %r8, 8(%rdi)
movq %r9, -16(%rdi, %rdx)
movq %r10, -8(%rdi, %rdx)
RET
ALIGN_TEXT
L(0_to_16_bytes):
cmp $8, %rdx
jb L(4_to_7_bytes)
movq (%rsi), %rcx
movq -8(%rsi, %rdx), %r8
movq %rcx, (%rdi)
movq %r8, -8(%rdi, %rdx)
RET
ALIGN_TEXT
L(4_to_7_bytes):
cmp $4, %rdx
jb L(1_to_3_bytes)
movl (%rsi), %ecx
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
RET
ALIGN_TEXT
L(1_to_3_bytes):
/* Note: we use movz(b,w)l as it's superior to loading into a partial register */
cmp $1, %rdx
je L(1_byte)
movzwl (%rsi), %ecx
movzwl -2(%rsi, %rdx), %r8d
movw %cx, (%rdi)
movw %r8w, -2(%rdi, %rdx)
RET
L(1_byte):
movzbl (%rsi), %ecx
movb %cl, (%rdi)
RET
ALIGN_TEXT
L(erms):
mov %rdx, %rcx
rep movsb
L(out):
RET
.if \overlap == 1
L(copy_backwards):
lea (%rdi, %rdx), %rdi
lea (%rsi, %rdx), %rsi
ALIGN_TEXT
/* Standard 32-byte copy loop, as above, but backwards */
L(32b_backwards):
movq -8(%rsi), %rcx
movq -16(%rsi), %r8
movq -24(%rsi), %r9
movq -32(%rsi), %r10
movq %rcx, -8(%rdi)
movq %r8, -16(%rdi)
movq %r9, -24(%rdi)
movq %r10, -32(%rdi)
/* We use both lea and arithmetic insns so as to fully utilize the execution units */
lea -32(%rsi), %rsi
lea -32(%rdi), %rdi
sub $32, %rdx
jz L(out)
cmp $32, %rdx
jae L(32b_backwards)
/* Handle the tail: rdi/rsi currently point just past the remaining bytes, so move
 * them back by the remaining length and reuse the overlap-safe forward tail code */
sub %rdx, %rdi
sub %rdx, %rsi
jmp L(0_to_32_bytes)
.endif
.endm
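/* The macro is expanded twice below: once for memcpy (overlap handling off) and
 * once for memmove (overlap handling on). Each wrapper loads the return value
 * (rax = original dst) before falling through into the shared body. */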
ALIGN_TEXT
.global __memcpy
.type __memcpy, @function
__memcpy:
/* Set up the return value */
mov %rdi, %rax
memcpy_like 0 _memcpy
.size __memcpy, . - __memcpy
ALIGN_TEXT
.global __memmove
.type __memmove, @function
__memmove:
/* Set up the return value */
mov %rdi, %rax
memcpy_like 1 _memmove
.size __memmove, . - __memmove
.weak our_memcpy
.set our_memcpy, __memcpy
.weak our_memmove
.set our_memmove, __memmove
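/* our_memcpy/our_memmove are the weak aliases that the benchmark above declares
 * with extern "C" and calls directly. */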