Skip to content

Instantly share code, notes, and snippets.

@kaixiong
Last active February 22, 2023 18:33
Show Gist options
  • Save kaixiong/ad653c81658332b77cd7456c189544e7 to your computer and use it in GitHub Desktop.
Save kaixiong/ad653c81658332b77cd7456c189544e7 to your computer and use it in GitHub Desktop.
LV vs glibc memset/memcpy
// -*- compile-command: "g++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-benchmark mem-benchmark.cpp $(pkg-config --cflags --libs fmt)" -*-
#include <bit>
#include <chrono>
#include <iostream>
#include <memory>
#include <utility>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fmt/core.h>
#include <fmt/chrono.h>
/// Fills the first @p n bytes at @p dest with the low 8 bits of @p c.
///
/// Full 64-byte blocks are written with eight MMX `movntq` stores each; the
/// remaining (< 64) bytes are written with plain 4-byte and 1-byte stores.
///
/// @param dest  Start of the buffer to fill; no particular alignment is needed.
/// @param c     Fill value; only the low 8 bits are used.
/// @param n     Number of bytes to fill.
/// @return      @p dest, unchanged.
void *mem_set8_mmx2(void *dest, int c, std::size_t n)
{
    auto d {static_cast<unsigned char*>(dest)};
    auto const fill8 {static_cast<unsigned char>(c)};

    // Replicate the byte into all four lanes using unsigned arithmetic, so
    // that values >= 0x80 never shift into the sign bit of an int.
    std::uint32_t const fill32 = static_cast<std::uint32_t>(fill8) * 0x01010101u;

    if (n >= 64) {
        // Build the 64-bit fill pattern in mm0 and broadcast it to mm1..mm7.
        // NOTE(review): the store loop below relies on the compiler leaving
        // the MMX registers untouched between asm statements.
        __asm__ __volatile__
            ("\n\t movd (%0), %%mm0"
             "\n\t movd (%0), %%mm1"
             "\n\t psllq $32, %%mm1"
             "\n\t por %%mm1, %%mm0"
             "\n\t movq %%mm0, %%mm1"
             "\n\t movq %%mm0, %%mm2"
             "\n\t movq %%mm0, %%mm3"
             "\n\t movq %%mm0, %%mm4"
             "\n\t movq %%mm0, %%mm5"
             "\n\t movq %%mm0, %%mm6"
             "\n\t movq %%mm0, %%mm7"
             :: "r" (&fill32) : "memory");

        do {
            __asm__ __volatile__
                ("\n\t movntq %%mm0, (%0)"
                 "\n\t movntq %%mm1, 8(%0)"
                 "\n\t movntq %%mm2, 16(%0)"
                 "\n\t movntq %%mm3, 24(%0)"
                 "\n\t movntq %%mm4, 32(%0)"
                 "\n\t movntq %%mm5, 40(%0)"
                 "\n\t movntq %%mm6, 48(%0)"
                 "\n\t movntq %%mm7, 56(%0)"
                 :: "r" (d) : "memory");
            d += 64;
            n -= 64;
        } while (n >= 64);

        // sfence orders the non-temporal stores before later stores; emms
        // hands the register file back to the x87 FPU. Without emms, any
        // following x87 (e.g. long double) arithmetic yields NaN.
        __asm__ __volatile__
            ("\n\t sfence"
             "\n\t emms"
             ::: "memory");
    }

    // Tail: 4 bytes at a time. memcpy keeps the store valid for any
    // alignment of dest and avoids aliasing the buffer as uint32_t.
    while (n >= sizeof fill32) {
        std::memcpy(d, &fill32, sizeof fill32);
        d += sizeof fill32;
        n -= sizeof fill32;
    }

    // Tail: the last 0-3 bytes.
    for (; n > 0; --n)
        *d++ = fill8;

    return dest;
}
/// Copies @p n bytes from @p src to @p dest.
///
/// Full 64-byte blocks are loaded into mm0..mm7 and written back with MMX
/// `movntq` stores; the remaining (< 64) bytes are copied with plain 4-byte
/// and 1-byte moves.
///
/// @param dest  Destination buffer; no particular alignment is needed.
/// @param src   Source buffer; no particular alignment is needed.
/// @param n     Number of bytes to copy.
/// @return      @p dest, unchanged.
void *mem_copy_mmx2(void* dest, void const* src, std::size_t n)
{
    auto d {static_cast<unsigned char*>(dest)};
    auto s {static_cast<unsigned char const*>(src)};

    if (n >= 64) {
        do {
            __asm__ __volatile__
                (// Prefetching was disabled in the original code; kept for reference.
                 //"\n\t prefetchnta 256(%0)"
                 //"\n\t prefetchnta 320(%0)"
                 "\n\t movq (%0), %%mm0"
                 "\n\t movq 8(%0), %%mm1"
                 "\n\t movq 16(%0), %%mm2"
                 "\n\t movq 24(%0), %%mm3"
                 "\n\t movq 32(%0), %%mm4"
                 "\n\t movq 40(%0), %%mm5"
                 "\n\t movq 48(%0), %%mm6"
                 "\n\t movq 56(%0), %%mm7"
                 "\n\t movntq %%mm0, (%1)"
                 "\n\t movntq %%mm1, 8(%1)"
                 "\n\t movntq %%mm2, 16(%1)"
                 "\n\t movntq %%mm3, 24(%1)"
                 "\n\t movntq %%mm4, 32(%1)"
                 "\n\t movntq %%mm5, 40(%1)"
                 "\n\t movntq %%mm6, 48(%1)"
                 "\n\t movntq %%mm7, 56(%1)"
                 :: "r" (s), "r" (d) : "memory");
            d += 64;
            s += 64;
            n -= 64;
        } while (n >= 64);

        // sfence orders the non-temporal stores before later stores; emms
        // hands the register file back to the x87 FPU. Without emms, any
        // following x87 (e.g. long double) arithmetic yields NaN.
        __asm__ __volatile__
            ("\n\t sfence"
             "\n\t emms"
             ::: "memory");
    }

    // Tail: 4 bytes at a time. memcpy keeps the access valid for any
    // alignment of src/dest and avoids aliasing the buffers as uint32_t.
    constexpr std::size_t word_size {sizeof(std::uint32_t)};
    while (n >= word_size) {
        std::memcpy(d, s, word_size);
        d += word_size;
        s += word_size;
        n -= word_size;
    }

    // Tail: the last 0-3 bytes.
    for (; n > 0; --n)
        *d++ = *s++;

    return dest;
}
/// Times std::memset against mem_set8_mmx2 and prints the elapsed seconds.
///
/// Two rounds are measured: one on the buffer exactly as returned by the
/// allocator, and one on a start address rounded up to a 64-byte boundary.
///
/// @param run_count    Number of fill calls per measurement.
/// @param buffer_size  Number of bytes filled by each call.
void benchmark_set(unsigned int run_count, std::size_t buffer_size)
{
    namespace chrono = std::chrono;
    using benchmark_clock = chrono::high_resolution_clock;

    // Calls `op` run_count times and returns the elapsed wall time in seconds.
    auto const time_runs = [run_count](auto&& op) {
        auto const start_time {benchmark_clock::now()};
        for (unsigned i = 0; i < run_count; i++) {
            op();
        }
        chrono::duration<float> const elapsed {benchmark_clock::now() - start_time};
        return elapsed.count();
    };

    fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);

    // Round 1: whatever alignment the allocator hands out.
    {
        auto target {std::make_unique<std::byte[]>(buffer_size)};
        std::byte* const dest {target.get()};

        fmt::print("Libc : {}s\n",
                   time_runs([&] { std::memset(dest, 0x12, buffer_size); }));
        fmt::print("Custom: {}s\n",
                   time_runs([&] { mem_set8_mmx2(dest, 0x12, buffer_size); }));
    }

    // Round 2: start address rounded up to the next multiple of `alignment`.
    {
        constexpr std::size_t alignment {64};
        static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");

        auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};

        // The final mask maps an already-aligned pointer to offset 0 instead
        // of `alignment`; an offset of `alignment` would run one byte past
        // the alignment-1 bytes of slack allocated above.
        auto const misalignment {reinterpret_cast<std::uintptr_t>(target.get()) & (alignment-1)};
        auto const target_offset {(alignment - misalignment) & (alignment-1)};
        std::byte* const dest {target.get() + target_offset};
        assert(reinterpret_cast<std::uintptr_t>(dest) % alignment == 0);

        fmt::print("Libc (align:{}): {}s\n", alignment,
                   time_runs([&] { std::memset(dest, 0x12, buffer_size); }));
        fmt::print("Custom (align:{}): {}s\n", alignment,
                   time_runs([&] { mem_set8_mmx2(dest, 0x12, buffer_size); }));
    }

    fmt::print("\n");
}
/// Times std::memcpy against mem_copy_mmx2 and prints the elapsed seconds.
///
/// Two rounds are measured: one on the buffers exactly as returned by the
/// allocator, and one on start addresses rounded up to a 64-byte boundary.
///
/// @param run_count    Number of copy calls per measurement.
/// @param buffer_size  Number of bytes copied by each call.
void benchmark_copy(unsigned int run_count, std::size_t buffer_size)
{
    namespace chrono = std::chrono;
    using benchmark_clock = chrono::high_resolution_clock;

    // Calls `op` run_count times and returns the elapsed wall time in seconds.
    auto const time_runs = [run_count](auto&& op) {
        auto const start_time {benchmark_clock::now()};
        for (unsigned i = 0; i < run_count; i++) {
            op();
        }
        chrono::duration<float> const elapsed {benchmark_clock::now() - start_time};
        return elapsed.count();
    };

    fmt::print("Benchmarking copying of {}-byte buffers for {} times.\n", buffer_size, run_count);

    // Round 1: whatever alignment the allocator hands out.
    {
        auto source {std::make_unique<std::byte[]>(buffer_size)};
        auto target {std::make_unique<std::byte[]>(buffer_size)};
        std::byte const* const from {source.get()};
        std::byte* const to {target.get()};

        fmt::print("Libc : {}s\n",
                   time_runs([&] { std::memcpy(to, from, buffer_size); }));
        fmt::print("Custom: {}s\n",
                   time_runs([&] { mem_copy_mmx2(to, from, buffer_size); }));
    }

    // Round 2: start addresses rounded up to the next multiple of `alignment`.
    {
        constexpr std::size_t alignment {64};
        static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");

        auto source {std::make_unique<std::byte[]>(buffer_size + alignment-1)};
        auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};

        // The final mask maps an already-aligned pointer to offset 0 instead
        // of `alignment`; an offset of `alignment` would run one byte past
        // the alignment-1 bytes of slack allocated above.
        auto const source_misalignment {reinterpret_cast<std::uintptr_t>(source.get()) & (alignment-1)};
        auto const target_misalignment {reinterpret_cast<std::uintptr_t>(target.get()) & (alignment-1)};
        auto const source_offset {(alignment - source_misalignment) & (alignment-1)};
        auto const target_offset {(alignment - target_misalignment) & (alignment-1)};

        std::byte const* const from {source.get() + source_offset};
        std::byte* const to {target.get() + target_offset};
        assert(reinterpret_cast<std::uintptr_t>(from) % alignment == 0);
        assert(reinterpret_cast<std::uintptr_t>(to) % alignment == 0);

        fmt::print("Libc (align:{}): {}s\n", alignment,
                   time_runs([&] { std::memcpy(to, from, buffer_size); }));
        fmt::print("Custom (align:{}): {}s\n", alignment,
                   time_runs([&] { mem_copy_mmx2(to, from, buffer_size); }));
    }

    fmt::print("\n");
}
int main()
{
constexpr std::size_t bytes_to_copy {std::size_t(1000000) * 3000};
for (std::size_t buffer_size = 100; buffer_size < bytes_to_copy; buffer_size *= 10) {
std::size_t run_count {bytes_to_copy / buffer_size};
benchmark_set(run_count, buffer_size);
}
for (std::size_t buffer_size = 100; buffer_size < bytes_to_copy; buffer_size *= 10) {
std::size_t run_count {bytes_to_copy / buffer_size};
benchmark_copy(run_count, buffer_size);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment