Last active
February 22, 2023 18:33
-
-
Save kaixiong/ad653c81658332b77cd7456c189544e7 to your computer and use it in GitHub Desktop.
LV vs glibc memset/memcpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "g++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-benchmark mem-benchmark.cpp $(pkg-config --cflags --libs fmt)" -*-
#include <bit> | |
#include <chrono> | |
#include <iostream> | |
#include <memory> | |
#include <utility> | |
#include <cassert> | |
#include <cstddef> | |
#include <cstdint> | |
#include <cstring> | |
#include <fmt/core.h> | |
#include <fmt/chrono.h> | |
/// Fills the first `n` bytes at `dest` with the low 8 bits of `c`.
///
/// Blocks of 64 bytes are written with eight MMX `movntq` stores; the
/// remainder is written four bytes, then one byte, at a time.
///
/// @param dest  Start of the destination buffer (no alignment required).
/// @param c     Fill value; only `c & 0xff` is used.
/// @param n     Number of bytes to write.
/// @return `dest`.
void *mem_set8_mmx2(void *dest, int c, std::size_t n)
{
    auto out {static_cast<unsigned char*>(dest)};

    auto const fill8 {static_cast<unsigned char>(c & 0xff)};
    // Replicate the byte into all four lanes with unsigned arithmetic, so no
    // bit is ever shifted into the sign position of an int.
    std::uint32_t const fill32 {fill8 * std::uint32_t{0x01010101}};

    // Only touch the MMX unit when at least one full 64-byte block exists;
    // short fills then leave the x87/MMX state alone entirely.
    if (n >= 64) {
        // Broadcast the 32-bit pattern to 64 bits in mm0, then copy it to
        // mm1..mm7. Every written register is declared as clobbered.
        __asm__ __volatile__
            ("\n\t movd (%0), %%mm0"
             "\n\t movd (%0), %%mm1"
             "\n\t psllq $32, %%mm1"
             "\n\t por %%mm1, %%mm0"
             "\n\t movq %%mm0, %%mm2"
             "\n\t movq %%mm0, %%mm1"
             "\n\t movq %%mm2, %%mm3"
             "\n\t movq %%mm1, %%mm4"
             "\n\t movq %%mm0, %%mm5"
             "\n\t movq %%mm2, %%mm6"
             "\n\t movq %%mm1, %%mm7"
             :: "r" (&fill32)
             : "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");

        // NOTE(review): this loop still relies on mm0..mm7 surviving between
        // separate asm statements; nothing here tells the compiler they are live.
        do {
            __asm__ __volatile__
                ("\n\t movntq %%mm0, (%0)"
                 "\n\t movntq %%mm1, 8(%0)"
                 "\n\t movntq %%mm2, 16(%0)"
                 "\n\t movntq %%mm3, 24(%0)"
                 "\n\t movntq %%mm4, 32(%0)"
                 "\n\t movntq %%mm5, 40(%0)"
                 "\n\t movntq %%mm6, 48(%0)"
                 "\n\t movntq %%mm7, 56(%0)"
                 :: "r" (out) : "memory");

            out += 64;
            n -= 64;
        } while (n >= 64);

        // sfence orders the non-temporal stores before any later store;
        // emms hands the register file back to the x87 unit, which is
        // otherwise left unusable for floating-point code that follows.
        __asm__ __volatile__
            ("\n\t sfence"
             "\n\t emms"
             ::: "memory");
    }

    // Word-sized tail. A fixed-size memcpy compiles to a plain store but is
    // valid for any alignment of `out`.
    for (; n >= 4; n -= 4, out += 4)
        std::memcpy(out, &fill32, sizeof fill32);

    // Final 0..3 bytes.
    for (; n > 0; --n)
        *out++ = fill8;

    return dest;
}
/// Copies `n` bytes from `src` to `dest`.
///
/// Blocks of 64 bytes are loaded into mm0..mm7 with `movq` and written back
/// with `movntq`; the remainder is copied four bytes, then one byte, at a time.
/// Copying proceeds from low to high addresses.
///
/// @param dest  Start of the destination buffer (no alignment required).
/// @param src   Start of the source buffer (no alignment required).
/// @param n     Number of bytes to copy.
/// @return `dest`.
void *mem_copy_mmx2(void* dest, void const* src, std::size_t n)
{
    auto out {static_cast<unsigned char*>(dest)};
    auto in {static_cast<unsigned char const*>(src)};

    if (n >= 64) {
        do {
            // All eight MMX registers are overwritten per block, so they are
            // listed as clobbers.
            __asm__ __volatile__
                (//"\n\t prefetchnta 256(%0)"
                 //"\n\t prefetchnta 320(%0)"
                 "\n\t movq (%0), %%mm0"
                 "\n\t movq 8(%0), %%mm1"
                 "\n\t movq 16(%0), %%mm2"
                 "\n\t movq 24(%0), %%mm3"
                 "\n\t movq 32(%0), %%mm4"
                 "\n\t movq 40(%0), %%mm5"
                 "\n\t movq 48(%0), %%mm6"
                 "\n\t movq 56(%0), %%mm7"
                 "\n\t movntq %%mm0, (%1)"
                 "\n\t movntq %%mm1, 8(%1)"
                 "\n\t movntq %%mm2, 16(%1)"
                 "\n\t movntq %%mm3, 24(%1)"
                 "\n\t movntq %%mm4, 32(%1)"
                 "\n\t movntq %%mm5, 40(%1)"
                 "\n\t movntq %%mm6, 48(%1)"
                 "\n\t movntq %%mm7, 56(%1)"
                 :: "r" (in), "r" (out)
                 : "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");

            out += 64;
            in += 64;
            n -= 64;
        } while (n >= 64);

        // sfence orders the non-temporal stores before any later store;
        // emms hands the register file back to the x87 unit, which is
        // otherwise left unusable for floating-point code that follows.
        __asm__ __volatile__
            ("\n\t sfence"
             "\n\t emms"
             ::: "memory");
    }

    // Word-sized tail. A fixed-size memcpy compiles to a load/store pair but
    // is valid for any alignment and does not type-pun the buffers.
    for (; n >= 4; n -= 4, out += 4, in += 4)
        std::memcpy(out, in, 4);

    // Final 0..3 bytes.
    for (; n > 0; --n)
        *out++ = *in++;

    return dest;
}
void benchmark_set(unsigned int run_count, std::size_t buffer_size) | |
{ | |
namespace chrono = std::chrono; | |
using chrono::duration_cast; | |
using benchmark_clock = chrono::high_resolution_clock; | |
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count); | |
{ | |
auto target {std::make_unique<std::byte[]>(buffer_size)}; | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
std::memset(target.get(), 0x12, buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Libc : {}\n", duration.count()); | |
} | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set8_mmx2(target.get(), 0x12, buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Custom: {}s\n", duration.count()); | |
} | |
} | |
{ | |
constexpr std::size_t alignment {64}; | |
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2."); | |
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))}; | |
assert(std::uintptr_t(target.get() + target_offset) % alignment == 0); | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
std::memset(target.get() + target_offset, 0x12, buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Libc (align:{}): {}s\n", alignment, duration.count()); | |
} | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set8_mmx2(target.get() + target_offset, 0x12, buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Custom (align:{}): {}s\n", alignment, duration.count()); | |
} | |
} | |
fmt::print("\n"); | |
} | |
void benchmark_copy(unsigned int run_count, std::size_t buffer_size) | |
{ | |
namespace chrono = std::chrono; | |
using chrono::duration_cast; | |
using benchmark_clock = chrono::high_resolution_clock; | |
fmt::print("Benchmarking copying of {}-byte buffers for {} times.\n", buffer_size, run_count); | |
{ | |
auto source {std::make_unique<std::byte[]>(buffer_size)}; | |
auto target {std::make_unique<std::byte[]>(buffer_size)}; | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
std::memcpy(target.get(), source.get(), buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Libc : {}s\n", duration.count()); | |
} | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_copy_mmx2(target.get(), source.get(), buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Custom: {}s\n", duration.count()); | |
} | |
} | |
{ | |
constexpr std::size_t alignment {64}; | |
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2."); | |
auto source {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto source_offset {alignment - (std::uintptr_t(source.get()) & (alignment-1))}; | |
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))}; | |
assert(std::uintptr_t(source.get() + source_offset) % alignment == 0); | |
assert(std::uintptr_t(target.get() + target_offset) % alignment == 0); | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
std::memcpy(target.get() + target_offset, source.get() + source_offset, buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Libc (align:{}): {}s\n", alignment, duration.count()); | |
} | |
{ | |
auto start_time {benchmark_clock::now()}; | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_copy_mmx2(target.get() + target_offset, source.get() + source_offset, buffer_size); | |
} | |
chrono::duration<float> duration {benchmark_clock::now() - start_time}; | |
fmt::print("Custom (align:{}): {}s\n", alignment, duration.count()); | |
} | |
} | |
fmt::print("\n"); | |
} | |
int main() | |
{ | |
constexpr std::size_t bytes_to_copy {std::size_t(1000000) * 3000}; | |
for (std::size_t buffer_size = 100; buffer_size < bytes_to_copy; buffer_size *= 10) { | |
std::size_t run_count {bytes_to_copy / buffer_size}; | |
benchmark_set(run_count, buffer_size); | |
} | |
for (std::size_t buffer_size = 100; buffer_size < bytes_to_copy; buffer_size *= 10) { | |
std::size_t run_count {bytes_to_copy / buffer_size}; | |
benchmark_copy(run_count, buffer_size); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment