Created
February 23, 2023 14:01
-
-
Save kaixiong/be0ba18e70181c18c621298e6053cf73 to your computer and use it in GitHub Desktop.
LV vs SSE2 vs glibc memset/memcpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "g++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-benchmark mem-benchmark.cpp $(pkg-config --cflags --libs fmt)" -*- | |
#include <bit> | |
#include <chrono> | |
#include <iostream> | |
#include <memory> | |
#include <utility> | |
#include <cassert> | |
#include <cstddef> | |
#include <cstdint> | |
#include <cstring> | |
#include <fmt/core.h> | |
#include <fmt/chrono.h> | |
#include <x86intrin.h> | |
namespace chrono = std::chrono; | |
using benchmark_clock = chrono::high_resolution_clock; | |
using Timing = chrono::duration<float>; | |
void* mem_set8_mmx2(void* dest, int c, std::size_t n) | |
{ | |
auto d {static_cast<std::uint32_t*>(dest)}; | |
std::uint32_t const setflag32 {(c & 0xff) | ((c << 8) & 0xff00) | ((c << 16) & 0xff0000) | ((c << 24) & 0xff000000)}; | |
__asm__ __volatile__ | |
("\n\t movd (%0), %%mm0" | |
"\n\t movd (%0), %%mm1" | |
"\n\t psllq $32, %%mm1" | |
"\n\t por %%mm1, %%mm0" | |
"\n\t movq %%mm0, %%mm2" | |
"\n\t movq %%mm0, %%mm1" | |
"\n\t movq %%mm2, %%mm3" | |
"\n\t movq %%mm1, %%mm4" | |
"\n\t movq %%mm0, %%mm5" | |
"\n\t movq %%mm2, %%mm6" | |
"\n\t movq %%mm1, %%mm7" | |
:: "r" (&setflag32) : "memory"); | |
while (n >= 64) { | |
__asm__ __volatile__ | |
("\n\t movntq %%mm0, (%0)" | |
"\n\t movntq %%mm1, 8(%0)" | |
"\n\t movntq %%mm2, 16(%0)" | |
"\n\t movntq %%mm3, 24(%0)" | |
"\n\t movntq %%mm4, 32(%0)" | |
"\n\t movntq %%mm5, 40(%0)" | |
"\n\t movntq %%mm6, 48(%0)" | |
"\n\t movntq %%mm7, 56(%0)" | |
:: "r" (d) : "memory"); | |
d += 16; | |
n -= 64; | |
} | |
auto const setflag8 {std::uint8_t(c & 0xff)}; | |
while (n >= 4) { | |
*d++ = setflag32; | |
n -= 4; | |
} | |
auto dc {reinterpret_cast<uint8_t*>(d)}; | |
while (n--) | |
*dc++ = setflag8; | |
return dest; | |
} | |
void *mem_set8_sse2(void* dest, int c, std::size_t n) | |
{ | |
std::uint8_t const c8 = c & 0xff; | |
std::uint32_t const setflag32 {(c & 0xff) | ((c << 8) & 0xff00) | ((c << 16) & 0xff0000) | ((c << 24) & 0xff000000)}; | |
auto u8_ptr1 {static_cast<std::uint8_t*>(dest)}; | |
auto n0 {16 - (std::uintptr_t(dest) & 15)}; | |
while (n0--) { | |
*u8_ptr1++ = c8; | |
n--; | |
} | |
assert((std::uintptr_t(dest) & 15) == 0); | |
auto m128i_ptr {reinterpret_cast<__m128i*>(u8_ptr1)}; | |
auto copy_x16 {_mm_set1_epi8(c8)}; | |
while (n >= 64) { | |
_mm_storeu_si128(m128i_ptr, copy_x16); | |
_mm_storeu_si128(m128i_ptr+1, copy_x16); | |
_mm_storeu_si128(m128i_ptr+2, copy_x16); | |
_mm_storeu_si128(m128i_ptr+3, copy_x16); | |
m128i_ptr += 4; | |
n -= 64; | |
} | |
auto u32_ptr {reinterpret_cast<std::uint32_t*>(m128i_ptr)}; | |
while (n >= 4) { | |
*u32_ptr++ = setflag32; | |
n -= 4; | |
} | |
auto u8_ptr2 {reinterpret_cast<std::uint8_t*>(u32_ptr)}; | |
while (n--) { | |
*u8_ptr2++ = c8; | |
} | |
return dest; | |
} | |
void *mem_copy_mmx2(void* dest, void const* src, std::size_t n) | |
{ | |
auto d {static_cast<uint32_t*>(dest)}; | |
auto s {static_cast<uint32_t const*>(src)}; | |
while (n >= 64) { | |
__asm__ __volatile__ | |
(//"\n\t prefetchnta 256(%0)" | |
//"\n\t prefetchnta 320(%0)" | |
"\n\t movq (%0), %%mm0" | |
"\n\t movq 8(%0), %%mm1" | |
"\n\t movq 16(%0), %%mm2" | |
"\n\t movq 24(%0), %%mm3" | |
"\n\t movq 32(%0), %%mm4" | |
"\n\t movq 40(%0), %%mm5" | |
"\n\t movq 48(%0), %%mm6" | |
"\n\t movq 56(%0), %%mm7" | |
"\n\t movntq %%mm0, (%1)" | |
"\n\t movntq %%mm1, 8(%1)" | |
"\n\t movntq %%mm2, 16(%1)" | |
"\n\t movntq %%mm3, 24(%1)" | |
"\n\t movntq %%mm4, 32(%1)" | |
"\n\t movntq %%mm5, 40(%1)" | |
"\n\t movntq %%mm6, 48(%1)" | |
"\n\t movntq %%mm7, 56(%1)" | |
:: "r" (s), "r" (d) : "memory"); | |
d += 16; | |
s += 16; | |
n -= 64; | |
} | |
while (n >= 4) { | |
*d++ = *s++; | |
n -= 4; | |
} | |
auto dc {reinterpret_cast<std::uint8_t*>(d)}; | |
auto sc {reinterpret_cast<std::uint8_t const*>(s)}; | |
while (n--) | |
*dc++ = *sc++; | |
return dest; | |
} | |
void *mem_copy_sse2(void* dest, void const* src, std::size_t n) | |
{ | |
auto d_u8_ptr1 {static_cast<std::uint8_t*>(dest)}; | |
auto s_u8_ptr1 {static_cast<std::uint8_t const*>(src)}; | |
auto n0 {16 - (std::uintptr_t(dest) & 15)}; | |
while (n0--) { | |
*d_u8_ptr1++ = *s_u8_ptr1++; | |
n--; | |
} | |
assert((std::uintptr_t(dest) & 15) == 0); | |
auto d_m128i_ptr {reinterpret_cast<__m128i*>(d_u8_ptr1)}; | |
auto s_m128i_ptr {reinterpret_cast<__m128i const*>(s_u8_ptr1)}; | |
while (n >= 64) { | |
_mm_storeu_si128(d_m128i_ptr , s_m128i_ptr[0]); | |
_mm_storeu_si128(d_m128i_ptr+1, s_m128i_ptr[1]); | |
_mm_storeu_si128(d_m128i_ptr+2, s_m128i_ptr[2]); | |
_mm_storeu_si128(d_m128i_ptr+3, s_m128i_ptr[3]); | |
d_m128i_ptr += 4; | |
s_m128i_ptr += 4; | |
n -= 64; | |
} | |
auto d_u32_ptr {reinterpret_cast<std::uint32_t*>(d_m128i_ptr)}; | |
auto s_u32_ptr {reinterpret_cast<std::uint32_t const*>(s_m128i_ptr)}; | |
while (n >= 4) { | |
*d_u32_ptr++ = *s_u32_ptr++; | |
n -= 4; | |
} | |
auto d_u8_ptr2 {reinterpret_cast<std::uint8_t*>(d_u32_ptr)}; | |
auto s_u8_ptr2 {reinterpret_cast<std::uint8_t const*>(s_u32_ptr)}; | |
while (n--) { | |
*d_u8_ptr2++ = *s_u8_ptr2++; | |
} | |
return dest; | |
} | |
template <typename F> | |
Timing measure_runtime(F fn) | |
{ | |
auto start_time {benchmark_clock::now()}; | |
fn(); | |
return {benchmark_clock::now() - start_time}; | |
} | |
std::tuple<Timing, Timing, Timing> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count) | |
{ | |
auto timing1 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
std::memset(target_ptr, 0x12, n); | |
} | |
})}; | |
auto timing2 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set8_mmx2(target_ptr, 0x12, n); | |
} | |
})}; | |
auto timing3 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set8_sse2(target_ptr, 0x12, n); | |
} | |
})}; | |
return {timing1, timing2, timing3}; | |
} | |
void benchmark_set(unsigned int run_count, std::size_t buffer_size) | |
{ | |
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count); | |
{ | |
auto target {std::make_unique<std::byte[]>(buffer_size)}; | |
{ | |
auto target_ptr {target.get()}; | |
auto [timing1, timing2, timing3] = benchmark_set_runner(target_ptr, 0x12, buffer_size, run_count); | |
fmt::print("Libc: {}s\n", timing1.count()); | |
fmt::print("LV : {}s\n", timing2.count()); | |
fmt::print("SSE2: {}s\n", timing3.count()); | |
} | |
} | |
{ | |
constexpr std::size_t alignment {64}; | |
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2."); | |
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))}; | |
auto target_ptr = target.get() + target_offset; | |
assert(std::uintptr_t(target_ptr) % alignment == 0); | |
auto [timing1, timing2, timing3] = benchmark_set_runner(target_ptr, 0x12, buffer_size, run_count); | |
fmt::print("Libc (align:{}): {}s\n", alignment, timing1.count()); | |
fmt::print("LV (align:{}): {}s\n", alignment, timing2.count()); | |
fmt::print("SSE2 (align:{}): {}s\n", alignment, timing3.count()); | |
} | |
fmt::print("\n"); | |
} | |
std::tuple<Timing, Timing, Timing> benchmark_copy_runner(void* target_ptr, void* source_ptr, std::size_t buffer_size, unsigned int run_count) | |
{ | |
auto timing1 {measure_runtime([=]() { | |
for (unsigned i = 0; i < run_count; i++) { | |
std::memcpy(target_ptr, source_ptr, buffer_size); | |
} | |
})}; | |
auto timing2 {measure_runtime([=]() { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_copy_mmx2(target_ptr, source_ptr, buffer_size); | |
} | |
})}; | |
auto timing3 {measure_runtime([=]() { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_copy_sse2(target_ptr, source_ptr, buffer_size); | |
} | |
})}; | |
return {timing1, timing2, timing3}; | |
} | |
void benchmark_copy(unsigned int run_count, std::size_t buffer_size) | |
{ | |
fmt::print("Benchmarking copying of {}-byte buffers for {} times.\n", buffer_size, run_count); | |
{ | |
auto source {std::make_unique<std::byte[]>(buffer_size)}; | |
auto target {std::make_unique<std::byte[]>(buffer_size)}; | |
auto source_ptr {source.get()}; | |
auto target_ptr {target.get()}; | |
auto [timing1, timing2, timing3] = benchmark_copy_runner(target_ptr, source_ptr, buffer_size, run_count); | |
fmt::print("Glibc: {}s\n", timing1.count()); | |
fmt::print("LV : {}s\n", timing2.count()); | |
fmt::print("SSE2 : {}s\n", timing3.count()); | |
} | |
{ | |
constexpr std::size_t alignment {64}; | |
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2."); | |
auto source {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto source_offset {alignment - (std::uintptr_t(source.get()) & (alignment-1))}; | |
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))}; | |
auto source_ptr {source.get() + source_offset}; | |
auto target_ptr {target.get() + target_offset}; | |
assert(std::uintptr_t(source_ptr) % alignment == 0); | |
assert(std::uintptr_t(target_ptr) % alignment == 0); | |
auto [timing1, timing2, timing3] = benchmark_copy_runner(target_ptr, source_ptr, buffer_size, run_count); | |
fmt::print("GLibc (align:{}): {}s\n", alignment, timing1.count()); | |
fmt::print("LV (align:{}): {}s\n", alignment, timing2.count()); | |
fmt::print("SSE2 (align:{}): {}s\n", alignment, timing3.count()); | |
} | |
fmt::print("\n"); | |
} | |
int main() | |
{ | |
constexpr std::size_t bytes_to_copy {std::size_t(1024*1024) * 2048}; | |
for (std::size_t buffer_size = 32; buffer_size < bytes_to_copy; buffer_size *= 2) { | |
std::size_t run_count {bytes_to_copy / buffer_size}; | |
benchmark_set(run_count, buffer_size); | |
} | |
for (std::size_t buffer_size = 32; buffer_size < bytes_to_copy; buffer_size *= 2) { | |
std::size_t run_count {bytes_to_copy / buffer_size}; | |
benchmark_copy(run_count, buffer_size); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment