Skip to content

Instantly share code, notes, and snippets.

@kaixiong
Created February 23, 2023 14:01
Show Gist options
  • Save kaixiong/be0ba18e70181c18c621298e6053cf73 to your computer and use it in GitHub Desktop.
Save kaixiong/be0ba18e70181c18c621298e6053cf73 to your computer and use it in GitHub Desktop.
LV vs SSE2 vs glibc memset/memcpy
// -*- compile-command: "g++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-benchmark mem-benchmark.cpp $(pkg-config --cflags --libs fmt)" -*-
#include <bit>
#include <chrono>
#include <iostream>
#include <memory>
#include <utility>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fmt/core.h>
#include <fmt/chrono.h>
#include <x86intrin.h>
namespace chrono = std::chrono;
using benchmark_clock = chrono::high_resolution_clock;
using Timing = chrono::duration<float>;
void* mem_set8_mmx2(void* dest, int c, std::size_t n)
{
auto d {static_cast<std::uint32_t*>(dest)};
std::uint32_t const setflag32 {(c & 0xff) | ((c << 8) & 0xff00) | ((c << 16) & 0xff0000) | ((c << 24) & 0xff000000)};
__asm__ __volatile__
("\n\t movd (%0), %%mm0"
"\n\t movd (%0), %%mm1"
"\n\t psllq $32, %%mm1"
"\n\t por %%mm1, %%mm0"
"\n\t movq %%mm0, %%mm2"
"\n\t movq %%mm0, %%mm1"
"\n\t movq %%mm2, %%mm3"
"\n\t movq %%mm1, %%mm4"
"\n\t movq %%mm0, %%mm5"
"\n\t movq %%mm2, %%mm6"
"\n\t movq %%mm1, %%mm7"
:: "r" (&setflag32) : "memory");
while (n >= 64) {
__asm__ __volatile__
("\n\t movntq %%mm0, (%0)"
"\n\t movntq %%mm1, 8(%0)"
"\n\t movntq %%mm2, 16(%0)"
"\n\t movntq %%mm3, 24(%0)"
"\n\t movntq %%mm4, 32(%0)"
"\n\t movntq %%mm5, 40(%0)"
"\n\t movntq %%mm6, 48(%0)"
"\n\t movntq %%mm7, 56(%0)"
:: "r" (d) : "memory");
d += 16;
n -= 64;
}
auto const setflag8 {std::uint8_t(c & 0xff)};
while (n >= 4) {
*d++ = setflag32;
n -= 4;
}
auto dc {reinterpret_cast<uint8_t*>(d)};
while (n--)
*dc++ = setflag8;
return dest;
}
void *mem_set8_sse2(void* dest, int c, std::size_t n)
{
std::uint8_t const c8 = c & 0xff;
std::uint32_t const setflag32 {(c & 0xff) | ((c << 8) & 0xff00) | ((c << 16) & 0xff0000) | ((c << 24) & 0xff000000)};
auto u8_ptr1 {static_cast<std::uint8_t*>(dest)};
auto n0 {16 - (std::uintptr_t(dest) & 15)};
while (n0--) {
*u8_ptr1++ = c8;
n--;
}
assert((std::uintptr_t(dest) & 15) == 0);
auto m128i_ptr {reinterpret_cast<__m128i*>(u8_ptr1)};
auto copy_x16 {_mm_set1_epi8(c8)};
while (n >= 64) {
_mm_storeu_si128(m128i_ptr, copy_x16);
_mm_storeu_si128(m128i_ptr+1, copy_x16);
_mm_storeu_si128(m128i_ptr+2, copy_x16);
_mm_storeu_si128(m128i_ptr+3, copy_x16);
m128i_ptr += 4;
n -= 64;
}
auto u32_ptr {reinterpret_cast<std::uint32_t*>(m128i_ptr)};
while (n >= 4) {
*u32_ptr++ = setflag32;
n -= 4;
}
auto u8_ptr2 {reinterpret_cast<std::uint8_t*>(u32_ptr)};
while (n--) {
*u8_ptr2++ = c8;
}
return dest;
}
void *mem_copy_mmx2(void* dest, void const* src, std::size_t n)
{
auto d {static_cast<uint32_t*>(dest)};
auto s {static_cast<uint32_t const*>(src)};
while (n >= 64) {
__asm__ __volatile__
(//"\n\t prefetchnta 256(%0)"
//"\n\t prefetchnta 320(%0)"
"\n\t movq (%0), %%mm0"
"\n\t movq 8(%0), %%mm1"
"\n\t movq 16(%0), %%mm2"
"\n\t movq 24(%0), %%mm3"
"\n\t movq 32(%0), %%mm4"
"\n\t movq 40(%0), %%mm5"
"\n\t movq 48(%0), %%mm6"
"\n\t movq 56(%0), %%mm7"
"\n\t movntq %%mm0, (%1)"
"\n\t movntq %%mm1, 8(%1)"
"\n\t movntq %%mm2, 16(%1)"
"\n\t movntq %%mm3, 24(%1)"
"\n\t movntq %%mm4, 32(%1)"
"\n\t movntq %%mm5, 40(%1)"
"\n\t movntq %%mm6, 48(%1)"
"\n\t movntq %%mm7, 56(%1)"
:: "r" (s), "r" (d) : "memory");
d += 16;
s += 16;
n -= 64;
}
while (n >= 4) {
*d++ = *s++;
n -= 4;
}
auto dc {reinterpret_cast<std::uint8_t*>(d)};
auto sc {reinterpret_cast<std::uint8_t const*>(s)};
while (n--)
*dc++ = *sc++;
return dest;
}
void *mem_copy_sse2(void* dest, void const* src, std::size_t n)
{
auto d_u8_ptr1 {static_cast<std::uint8_t*>(dest)};
auto s_u8_ptr1 {static_cast<std::uint8_t const*>(src)};
auto n0 {16 - (std::uintptr_t(dest) & 15)};
while (n0--) {
*d_u8_ptr1++ = *s_u8_ptr1++;
n--;
}
assert((std::uintptr_t(dest) & 15) == 0);
auto d_m128i_ptr {reinterpret_cast<__m128i*>(d_u8_ptr1)};
auto s_m128i_ptr {reinterpret_cast<__m128i const*>(s_u8_ptr1)};
while (n >= 64) {
_mm_storeu_si128(d_m128i_ptr , s_m128i_ptr[0]);
_mm_storeu_si128(d_m128i_ptr+1, s_m128i_ptr[1]);
_mm_storeu_si128(d_m128i_ptr+2, s_m128i_ptr[2]);
_mm_storeu_si128(d_m128i_ptr+3, s_m128i_ptr[3]);
d_m128i_ptr += 4;
s_m128i_ptr += 4;
n -= 64;
}
auto d_u32_ptr {reinterpret_cast<std::uint32_t*>(d_m128i_ptr)};
auto s_u32_ptr {reinterpret_cast<std::uint32_t const*>(s_m128i_ptr)};
while (n >= 4) {
*d_u32_ptr++ = *s_u32_ptr++;
n -= 4;
}
auto d_u8_ptr2 {reinterpret_cast<std::uint8_t*>(d_u32_ptr)};
auto s_u8_ptr2 {reinterpret_cast<std::uint8_t const*>(s_u32_ptr)};
while (n--) {
*d_u8_ptr2++ = *s_u8_ptr2++;
}
return dest;
}
template <typename F>
Timing measure_runtime(F fn)
{
auto start_time {benchmark_clock::now()};
fn();
return {benchmark_clock::now() - start_time};
}
std::tuple<Timing, Timing, Timing> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count)
{
auto timing1 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
std::memset(target_ptr, 0x12, n);
}
})};
auto timing2 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set8_mmx2(target_ptr, 0x12, n);
}
})};
auto timing3 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set8_sse2(target_ptr, 0x12, n);
}
})};
return {timing1, timing2, timing3};
}
void benchmark_set(unsigned int run_count, std::size_t buffer_size)
{
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);
{
auto target {std::make_unique<std::byte[]>(buffer_size)};
{
auto target_ptr {target.get()};
auto [timing1, timing2, timing3] = benchmark_set_runner(target_ptr, 0x12, buffer_size, run_count);
fmt::print("Libc: {}s\n", timing1.count());
fmt::print("LV : {}s\n", timing2.count());
fmt::print("SSE2: {}s\n", timing3.count());
}
}
{
constexpr std::size_t alignment {64};
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
auto target_ptr = target.get() + target_offset;
assert(std::uintptr_t(target_ptr) % alignment == 0);
auto [timing1, timing2, timing3] = benchmark_set_runner(target_ptr, 0x12, buffer_size, run_count);
fmt::print("Libc (align:{}): {}s\n", alignment, timing1.count());
fmt::print("LV (align:{}): {}s\n", alignment, timing2.count());
fmt::print("SSE2 (align:{}): {}s\n", alignment, timing3.count());
}
fmt::print("\n");
}
std::tuple<Timing, Timing, Timing> benchmark_copy_runner(void* target_ptr, void* source_ptr, std::size_t buffer_size, unsigned int run_count)
{
auto timing1 {measure_runtime([=]() {
for (unsigned i = 0; i < run_count; i++) {
std::memcpy(target_ptr, source_ptr, buffer_size);
}
})};
auto timing2 {measure_runtime([=]() {
for (unsigned i = 0; i < run_count; i++) {
mem_copy_mmx2(target_ptr, source_ptr, buffer_size);
}
})};
auto timing3 {measure_runtime([=]() {
for (unsigned i = 0; i < run_count; i++) {
mem_copy_sse2(target_ptr, source_ptr, buffer_size);
}
})};
return {timing1, timing2, timing3};
}
void benchmark_copy(unsigned int run_count, std::size_t buffer_size)
{
fmt::print("Benchmarking copying of {}-byte buffers for {} times.\n", buffer_size, run_count);
{
auto source {std::make_unique<std::byte[]>(buffer_size)};
auto target {std::make_unique<std::byte[]>(buffer_size)};
auto source_ptr {source.get()};
auto target_ptr {target.get()};
auto [timing1, timing2, timing3] = benchmark_copy_runner(target_ptr, source_ptr, buffer_size, run_count);
fmt::print("Glibc: {}s\n", timing1.count());
fmt::print("LV : {}s\n", timing2.count());
fmt::print("SSE2 : {}s\n", timing3.count());
}
{
constexpr std::size_t alignment {64};
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");
auto source {std::make_unique<std::byte[]>(buffer_size + alignment-1)};
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};
auto source_offset {alignment - (std::uintptr_t(source.get()) & (alignment-1))};
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
auto source_ptr {source.get() + source_offset};
auto target_ptr {target.get() + target_offset};
assert(std::uintptr_t(source_ptr) % alignment == 0);
assert(std::uintptr_t(target_ptr) % alignment == 0);
auto [timing1, timing2, timing3] = benchmark_copy_runner(target_ptr, source_ptr, buffer_size, run_count);
fmt::print("GLibc (align:{}): {}s\n", alignment, timing1.count());
fmt::print("LV (align:{}): {}s\n", alignment, timing2.count());
fmt::print("SSE2 (align:{}): {}s\n", alignment, timing3.count());
}
fmt::print("\n");
}
int main()
{
constexpr std::size_t bytes_to_copy {std::size_t(1024*1024) * 2048};
for (std::size_t buffer_size = 32; buffer_size < bytes_to_copy; buffer_size *= 2) {
std::size_t run_count {bytes_to_copy / buffer_size};
benchmark_set(run_count, buffer_size);
}
for (std::size_t buffer_size = 32; buffer_size < bytes_to_copy; buffer_size *= 2) {
std::size_t run_count {bytes_to_copy / buffer_size};
benchmark_copy(run_count, buffer_size);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment