Skip to content

Instantly share code, notes, and snippets.

@kaixiong
Last active March 4, 2023 11:05
Show Gist options
  • Save kaixiong/89607caddf7f84c5cb45b4fdd10900be to your computer and use it in GitHub Desktop.
Save kaixiong/89607caddf7f84c5cb45b4fdd10900be to your computer and use it in GitHub Desktop.
visual_mem_set{16,32} benchmarks - C vs ASM vs Intrinsics
// -*- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-16 mem-set-benchmark-16.cpp $(pkg-config --cflags --libs fmt)" -*-
#include <array>
#include <bit>
#include <chrono>
#include <iostream>
#include <memory>
#include <utility>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fmt/core.h>
#include <fmt/chrono.h>
#include <x86intrin.h>
namespace chrono = std::chrono;
using benchmark_clock = chrono::high_resolution_clock;
using Timing = chrono::duration<float>;
template <typename F>
Timing measure_runtime(F fn)
{
auto start_time {benchmark_clock::now()};
fn();
return {benchmark_clock::now() - start_time};
}
void *mem_set16_c (void *dest, int c, std::size_t n)
{
uint64_t *d = static_cast<uint64_t*>(dest);
uint16_t copy = c & 0xffff;
uint32_t copy_x2 = copy | (copy << 16);
uint64_t copy_x4 = copy_x2 | (uint64_t(copy_x2) << 32);
while (n >= 32) {
d[0] = copy_x4;
d[1] = copy_x4;
d[2] = copy_x4;
d[3] = copy_x4;
d[4] = copy_x4;
d[5] = copy_x4;
d[6] = copy_x4;
d[7] = copy_x4;
n -= 32;
d += 8;
}
uint16_t* dc = (uint16_t *) d;
while (n--) {
*dc++ = copy;
}
return dest;
}
void *mem_set16_mmx (void *dest, int c, std::size_t n)
{
uint32_t *d = static_cast<uint32_t*>(dest);
uint16_t *dc = static_cast<uint16_t*>(dest);
uint32_t setflag32 = (c & 0xffff) | ((c << 16) & 0xffff0000);
uint16_t setflag16 = c & 0xffff;
__asm __volatile
("\n\t movd (%0), %%mm0"
"\n\t movd (%0), %%mm1"
"\n\t psllq $32, %%mm1"
"\n\t por %%mm1, %%mm0"
"\n\t movq %%mm0, %%mm2"
"\n\t movq %%mm0, %%mm1"
"\n\t movq %%mm2, %%mm3"
"\n\t movq %%mm1, %%mm4"
"\n\t movq %%mm0, %%mm5"
"\n\t movq %%mm2, %%mm6"
"\n\t movq %%mm1, %%mm7"
:: "r" (&setflag32) : "memory");
while (n >= 32) {
__asm __volatile
("\n\t movq %%mm0, (%0)"
"\n\t movq %%mm1, 8(%0)"
"\n\t movq %%mm2, 16(%0)"
"\n\t movq %%mm3, 24(%0)"
"\n\t movq %%mm4, 32(%0)"
"\n\t movq %%mm5, 40(%0)"
"\n\t movq %%mm6, 48(%0)"
"\n\t movq %%mm7, 56(%0)"
:: "r" (d) : "memory");
d += 16;
n -= 32;
}
__asm __volatile
("\n\t emms");
while (n >= 2) {
*d++ = setflag32;
n -= 2;
}
dc = (uint16_t *) d;
while (n--)
*dc++ = setflag16;
return dest;
}
void *mem_set16_sse (void *dest, int c, std::size_t n)
{
// FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?
const uint16_t copy = c & 0xffff;
const uint32_t copy_2x = copy | (copy << 16);
const __m64 copy_4x = _mm_set_pi32 (copy_2x, copy_2x);
__m64 *m64_ptr = (__m64 *) dest;
while (n >= 32) {
_mm_stream_pi (m64_ptr, copy_4x);
_mm_stream_pi (m64_ptr + 1, copy_4x);
_mm_stream_pi (m64_ptr + 2, copy_4x);
_mm_stream_pi (m64_ptr + 3, copy_4x);
_mm_stream_pi (m64_ptr + 4, copy_4x);
_mm_stream_pi (m64_ptr + 5, copy_4x);
_mm_stream_pi (m64_ptr + 6, copy_4x);
_mm_stream_pi (m64_ptr + 7, copy_4x);
m64_ptr += 8;
n -= 32;
}
uint16_t *uint16_ptr = (uint16_t *) m64_ptr;
while (n--) {
*uint16_ptr++ = copy;
}
return dest;
}
void *mem_set16_sse2_stream (void *dest, int c, std::size_t n)
{
const uint16_t copy = c & 0xffff;
const uint32_t copy_2x = copy | (copy << 16);
const __m128i copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x);
__m128i * __restrict__ m128i_ptr = (__m128i *) dest;
while (n >= 64) {
_mm_stream_si128 (m128i_ptr, copy_4x);
_mm_stream_si128 (m128i_ptr + 1, copy_4x);
_mm_stream_si128 (m128i_ptr + 2, copy_4x);
_mm_stream_si128 (m128i_ptr + 3, copy_4x);
_mm_stream_si128 (m128i_ptr + 4, copy_4x);
_mm_stream_si128 (m128i_ptr + 5, copy_4x);
_mm_stream_si128 (m128i_ptr + 6, copy_4x);
_mm_stream_si128 (m128i_ptr + 7, copy_4x);
m128i_ptr += 8;
n -= 64;
}
uint16_t *uint16_ptr = (uint16_t *) m128i_ptr;
while (n--) {
*uint16_ptr++ = copy;
}
return dest;
}
void *mem_set16_sse2_store (void *dest, int c, std::size_t n)
{
const uint16_t copy = c & 0xffff;
const uint32_t copy_2x = copy | (copy << 16);
const __m128i copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x);
__m128i * __restrict__ m128i_ptr = (__m128i *) dest;
while (n >= 64) {
_mm_storeu_si128 (m128i_ptr, copy_4x);
_mm_storeu_si128 (m128i_ptr + 1, copy_4x);
_mm_storeu_si128 (m128i_ptr + 2, copy_4x);
_mm_storeu_si128 (m128i_ptr + 3, copy_4x);
_mm_storeu_si128 (m128i_ptr + 4, copy_4x);
_mm_storeu_si128 (m128i_ptr + 5, copy_4x);
_mm_storeu_si128 (m128i_ptr + 6, copy_4x);
_mm_storeu_si128 (m128i_ptr + 7, copy_4x);
m128i_ptr += 8;
n -= 64;
}
uint16_t *uint16_ptr = (uint16_t *) m128i_ptr;
while (n--) {
*uint16_ptr++ = copy;
}
return dest;
}
std::array<Timing, 5> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count)
{
auto timing1 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set16_c(target_ptr, 0x1234, n);
}
})};
auto timing2 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set16_mmx(target_ptr, 0x1234, n);
}
})};
auto timing3 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set16_sse(target_ptr, 0x1234, n);
}
})};
auto timing4 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set16_sse2_stream(target_ptr, 0x1234, n);
}
})};
auto timing5 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set16_sse2_store(target_ptr, 0x1234, n);
}
})};
return {timing1, timing2, timing3, timing4, timing5};
}
void benchmark_set(unsigned int run_count, std::size_t buffer_size)
{
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);
auto count {buffer_size / sizeof(std::uint16_t)};
{
auto target {std::make_unique<std::byte[]>(buffer_size)};
{
auto target_ptr {target.get()};
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};
auto reference_time {timings[0].count()};
fmt::print("LV C : {:.6f}s ({:.2f}x)\n", timings[0].count(), reference_time / timings[0].count());
fmt::print("LV MMX : {:.6f}s ({:.2f}x)\n", timings[1].count(), reference_time / timings[1].count());
fmt::print("LV SSE : {:.6f}s ({:.2f}x)\n", timings[2].count(), reference_time / timings[2].count());
fmt::print("LV SSE2 1: {:.6f}s ({:.2f}x)\n", timings[3].count(), reference_time / timings[3].count());
fmt::print("LV SSE2 2: {:.6f}s ({:.2f}x)\n", timings[4].count(), reference_time / timings[4].count());
}
}
{
constexpr std::size_t alignment {64};
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
auto target_ptr = target.get() + target_offset;
assert(std::uintptr_t(target_ptr) % alignment == 0);
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};
auto reference_time {timings[0].count()};
fmt::print("LV C (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time / timings[0].count());
fmt::print("LV MMX (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time / timings[1].count());
fmt::print("LV SSE (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time / timings[2].count());
fmt::print("LV SSE2 1 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time / timings[3].count());
fmt::print("LV SSE2 2 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time / timings[4].count());
}
fmt::print("\n");
}
int main()
{
constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024};
constexpr std::size_t max_buffer_size {1 << 26};
for (std::size_t buffer_size = 64; buffer_size <= max_buffer_size; buffer_size *= 2) {
std::size_t run_count {bytes_to_copy / buffer_size};
benchmark_set(run_count, buffer_size);
}
}
// -*- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-32 mem-set-benchmark-32.cpp $(pkg-config --cflags --libs fmt)" -*-
#include <array>
#include <bit>
#include <chrono>
#include <iostream>
#include <memory>
#include <utility>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fmt/core.h>
#include <fmt/chrono.h>
#include <x86intrin.h>
namespace chrono = std::chrono;
using benchmark_clock = chrono::high_resolution_clock;
using Timing = chrono::duration<float>;
template <typename F>
Timing measure_runtime(F fn)
{
auto start_time {benchmark_clock::now()};
fn();
return {benchmark_clock::now() - start_time};
}
void *mem_set32_c (void *dest, int c, std::size_t n)
{
uint64_t *d = static_cast<uint64_t*>(dest);
uint32_t copy = c;
uint64_t copy_x2 = uint64_t(copy) | (uint64_t(copy) << 32);
while (n >= 16) {
d[0] = copy_x2;
d[1] = copy_x2;
d[2] = copy_x2;
d[3] = copy_x2;
d[4] = copy_x2;
d[5] = copy_x2;
d[6] = copy_x2;
d[7] = copy_x2;
n -= 16;
d += 8;
}
uint32_t* dc = (uint32_t *) d;
while (n--) {
*dc++ = copy;
}
return dest;
}
static void *mem_set32_mmx (void *dest, int c, std::size_t n)
{
uint32_t *d = static_cast<uint32_t*>(dest);
uint32_t setflag32 = c;
__asm __volatile
("\n\t movd (%0), %%mm0"
"\n\t movd (%0), %%mm1"
"\n\t psllq $32, %%mm1"
"\n\t por %%mm1, %%mm0"
"\n\t movq %%mm0, %%mm2"
"\n\t movq %%mm0, %%mm1"
"\n\t movq %%mm2, %%mm3"
"\n\t movq %%mm1, %%mm4"
"\n\t movq %%mm0, %%mm5"
"\n\t movq %%mm2, %%mm6"
"\n\t movq %%mm1, %%mm7"
:: "r" (&setflag32) : "memory");
while (n >= 16) {
__asm __volatile
("\n\t movq %%mm0, (%0)"
"\n\t movq %%mm1, 8(%0)"
"\n\t movq %%mm2, 16(%0)"
"\n\t movq %%mm3, 24(%0)"
"\n\t movq %%mm4, 32(%0)"
"\n\t movq %%mm5, 40(%0)"
"\n\t movq %%mm6, 48(%0)"
"\n\t movq %%mm7, 56(%0)"
:: "r" (d) : "memory");
d += 16;
n -= 16;
}
__asm __volatile
("\n\t emms");
while (n--)
*d++ = setflag32;
return dest;
}
static void *mem_set32_mmx2 (void *dest, int c, std::size_t n)
{
uint32_t *d = static_cast<uint32_t*>(dest);
uint32_t setflag32 = c;
__asm __volatile
("\n\t movd (%0), %%mm0"
"\n\t movd (%0), %%mm1"
"\n\t psllq $32, %%mm1"
"\n\t por %%mm1, %%mm0"
"\n\t movq %%mm0, %%mm2"
"\n\t movq %%mm0, %%mm1"
"\n\t movq %%mm2, %%mm3"
"\n\t movq %%mm1, %%mm4"
"\n\t movq %%mm0, %%mm5"
"\n\t movq %%mm2, %%mm6"
"\n\t movq %%mm1, %%mm7"
:: "r" (&setflag32) : "memory");
while (n >= 16) {
__asm __volatile
("\n\t movntq %%mm0, (%0)"
"\n\t movntq %%mm1, 8(%0)"
"\n\t movntq %%mm2, 16(%0)"
"\n\t movntq %%mm3, 24(%0)"
"\n\t movntq %%mm4, 32(%0)"
"\n\t movntq %%mm5, 40(%0)"
"\n\t movntq %%mm6, 48(%0)"
"\n\t movntq %%mm7, 56(%0)"
:: "r" (d) : "memory");
d += 16;
n -= 16;
}
__asm __volatile
("\n\t emms");
while (n--)
*d++ = setflag32;
return dest;
}
static void *mem_set32_sse (void *dest, int c, std::size_t n)
{
const uint32_t copy = c;
const __m64 copy_2x = _mm_set_pi32 (copy, copy);
__m64 *m64_ptr = (__m64 *) dest;
while (n >= 16) {
_mm_stream_pi (m64_ptr, copy_2x);
_mm_stream_pi (m64_ptr + 1, copy_2x);
_mm_stream_pi (m64_ptr + 2, copy_2x);
_mm_stream_pi (m64_ptr + 3, copy_2x);
_mm_stream_pi (m64_ptr + 4, copy_2x);
_mm_stream_pi (m64_ptr + 5, copy_2x);
_mm_stream_pi (m64_ptr + 6, copy_2x);
_mm_stream_pi (m64_ptr + 7, copy_2x);
n -= 16;
m64_ptr += 8;
}
uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
while (n--) {
*uint32_ptr++ = copy;
}
return dest;
}
static void *mem_set32_sse2_stream (void *dest, int c, std::size_t n)
{
const uint32_t copy = c;
const __m128i copy_4x = _mm_set_epi32 (copy, copy, copy, copy);
__m128i *m128i_ptr = (__m128i *) dest;
while (n >= 32) {
_mm_stream_si128 (m128i_ptr, copy_4x);
_mm_stream_si128 (m128i_ptr + 1, copy_4x);
_mm_stream_si128 (m128i_ptr + 2, copy_4x);
_mm_stream_si128 (m128i_ptr + 3, copy_4x);
_mm_stream_si128 (m128i_ptr + 4, copy_4x);
_mm_stream_si128 (m128i_ptr + 5, copy_4x);
_mm_stream_si128 (m128i_ptr + 6, copy_4x);
_mm_stream_si128 (m128i_ptr + 7, copy_4x);
n -= 32;
m128i_ptr += 8;
}
uint32_t *uint32_ptr = (uint32_t *) m128i_ptr;
while (n--) {
*uint32_ptr++ = copy;
}
return dest;
}
static void *mem_set32_sse2_store (void *dest, int c, std::size_t n)
{
const uint32_t copy = c;
const __m128i copy_4x = _mm_set_epi32 (copy, copy, copy, copy);
__m128i *m128i_ptr = (__m128i *) dest;
while (n >= 32) {
_mm_storeu_si128 (m128i_ptr, copy_4x);
_mm_storeu_si128 (m128i_ptr + 1, copy_4x);
_mm_storeu_si128 (m128i_ptr + 2, copy_4x);
_mm_storeu_si128 (m128i_ptr + 3, copy_4x);
_mm_storeu_si128 (m128i_ptr + 4, copy_4x);
_mm_storeu_si128 (m128i_ptr + 5, copy_4x);
_mm_storeu_si128 (m128i_ptr + 6, copy_4x);
_mm_storeu_si128 (m128i_ptr + 7, copy_4x);
n -= 32;
m128i_ptr += 8;
}
uint32_t *uint32_ptr = (uint32_t *) m128i_ptr;
while (n--) {
*uint32_ptr++ = copy;
}
return dest;
}
std::array<Timing, 6> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count)
{
auto timing1 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set32_c(target_ptr, 0x1234, n);
}
})};
auto timing2 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set32_mmx(target_ptr, 0x1234, n);
}
})};
auto timing3 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set32_mmx2(target_ptr, 0x1234, n);
}
})};
auto timing4 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set32_sse(target_ptr, 0x1234, n);
}
})};
auto timing5 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set32_sse2_stream(target_ptr, 0x1234, n);
}
})};
auto timing6 {measure_runtime([=] () {
for (unsigned i = 0; i < run_count; i++) {
mem_set32_sse2_store(target_ptr, 0x1234, n);
}
})};
return {timing1, timing2, timing3, timing4, timing5, timing6};
}
void benchmark_set(unsigned int run_count, std::size_t buffer_size)
{
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);
auto count {buffer_size / sizeof(std::uint32_t)};
{
auto target {std::make_unique<std::byte[]>(buffer_size)};
{
auto target_ptr {target.get()};
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};
auto reference_time {timings[0].count()};
fmt::print("LV C : {:.6}s ({:.2f}x)\n", timings[0].count(), reference_time/timings[0].count());
fmt::print("LV MMX : {:.6}s ({:.2f}x)\n", timings[1].count(), reference_time/timings[1].count());
fmt::print("LV MMX2 : {:.6}s ({:.2f}x)\n", timings[2].count(), reference_time/timings[2].count());
fmt::print("LV SSE : {:.6}s ({:.2f}x)\n", timings[3].count(), reference_time/timings[3].count());
fmt::print("LV SSE2 1: {:.6}s ({:.2f}x)\n", timings[4].count(), reference_time/timings[4].count());
fmt::print("LV SSE2 2: {:.6}s ({:.2f}x)\n", timings[5].count(), reference_time/timings[5].count());
}
}
{
constexpr std::size_t alignment {64};
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
auto target_ptr = target.get() + target_offset;
assert(std::uintptr_t(target_ptr) % alignment == 0);
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};
auto reference_time {timings[0].count()};
fmt::print("LV C (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time/timings[0].count());
fmt::print("LV MMX (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time/timings[1].count());
fmt::print("LV MMX2 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time/timings[2].count());
fmt::print("LV SSE (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time/timings[3].count());
fmt::print("LV SSE2 1 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time/timings[4].count());
fmt::print("LV SSE2 2 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[5].count(), reference_time/timings[5].count());
}
fmt::print("\n");
}
int main()
{
constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024};
constexpr std::size_t max_buffer_size {1 << 26};
for (std::size_t buffer_size = 32; buffer_size <= max_buffer_size; buffer_size *= 2) {
std::size_t run_count {bytes_to_copy / buffer_size};
benchmark_set(run_count, buffer_size);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment