kaixiong/mem-set-benchmark-16.cpp

## mem-set-benchmark-16.cpp
// -*- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-16 mem-set-benchmark-16.cpp  $(pkg-config --cflags --libs fmt)" -*-

#include <array>
#include <bit>
#include <chrono>
#include <iostream>
#include <memory>
#include <utility>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fmt/core.h>
#include <fmt/chrono.h>
#include <x86intrin.h>

namespace chrono = std::chrono;

using benchmark_clock = chrono::high_resolution_clock;
using Timing = chrono::duration<float>;

template <typename F>
Timing measure_runtime(F fn)
{
    auto start_time {benchmark_clock::now()};
    fn();
    return {benchmark_clock::now() - start_time};
}

void *mem_set16_c (void *dest, int c, std::size_t n)
{
	uint64_t *d = static_cast<uint64_t*>(dest);

	uint16_t copy    = c & 0xffff;
	uint32_t copy_x2 = copy | (copy << 16);
	uint64_t copy_x4 = copy_x2 | (uint64_t(copy_x2) << 32);

	while (n >= 32) {
		d[0] = copy_x4;
		d[1] = copy_x4;
		d[2] = copy_x4;
		d[3] = copy_x4;
		d[4] = copy_x4;
		d[5] = copy_x4;
		d[6] = copy_x4;
		d[7] = copy_x4;
		n -= 32;
        d += 8;
	}

    uint16_t* dc = (uint16_t *) d;
    while (n--) {
        *dc++ = copy;
    }

	return dest;
}

void *mem_set16_mmx (void *dest, int c, std::size_t n)
{
	uint32_t *d = static_cast<uint32_t*>(dest);
	uint16_t *dc = static_cast<uint16_t*>(dest);
	uint32_t setflag32 = (c & 0xffff) | ((c << 16) & 0xffff0000);
	uint16_t setflag16 = c & 0xffff;

	__asm __volatile
		("\n\t movd (%0), %%mm0"
		 "\n\t movd (%0), %%mm1"
		 "\n\t psllq $32, %%mm1"
		 "\n\t por %%mm1, %%mm0"
		 "\n\t movq %%mm0, %%mm2"
		 "\n\t movq %%mm0, %%mm1"
		 "\n\t movq %%mm2, %%mm3"
		 "\n\t movq %%mm1, %%mm4"
		 "\n\t movq %%mm0, %%mm5"
		 "\n\t movq %%mm2, %%mm6"
		 "\n\t movq %%mm1, %%mm7"
		 :: "r" (&setflag32) : "memory");

	while (n >= 32) {
		__asm __volatile
			("\n\t movq %%mm0, (%0)"
			 "\n\t movq %%mm1, 8(%0)"
			 "\n\t movq %%mm2, 16(%0)"
			 "\n\t movq %%mm3, 24(%0)"
			 "\n\t movq %%mm4, 32(%0)"
			 "\n\t movq %%mm5, 40(%0)"
			 "\n\t movq %%mm6, 48(%0)"
			 "\n\t movq %%mm7, 56(%0)"
			 :: "r" (d) : "memory");

		d += 16;
		n -= 32;
	}

	__asm __volatile
		("\n\t emms");

	while (n >= 2) {
		*d++ = setflag32;
		n -= 2;
	}

	dc = (uint16_t *) d;

	while (n--)
		*dc++ = setflag16;

	return dest;
}

void *mem_set16_sse (void *dest, int c, std::size_t n)
{
	// FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?

	const uint16_t copy    = c & 0xffff;
	const uint32_t copy_2x = copy | (copy << 16);
	const __m64    copy_4x = _mm_set_pi32 (copy_2x, copy_2x);

	__m64 *m64_ptr = (__m64 *) dest;

	while (n >= 32) {
		_mm_stream_pi (m64_ptr, copy_4x);
		_mm_stream_pi (m64_ptr + 1, copy_4x);
		_mm_stream_pi (m64_ptr + 2, copy_4x);
		_mm_stream_pi (m64_ptr + 3, copy_4x);
		_mm_stream_pi (m64_ptr + 4, copy_4x);
		_mm_stream_pi (m64_ptr + 5, copy_4x);
		_mm_stream_pi (m64_ptr + 6, copy_4x);
		_mm_stream_pi (m64_ptr + 7, copy_4x);
		m64_ptr += 8;
		n -= 32;
	}

	uint16_t *uint16_ptr = (uint16_t *) m64_ptr;
    while (n--) {
        *uint16_ptr++ = copy;
    }

	return dest;
}

void *mem_set16_sse2_stream (void *dest, int c, std::size_t n)
{
	const uint16_t copy    = c & 0xffff;
	const uint32_t copy_2x = copy | (copy << 16);
	const __m128i  copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x);

	__m128i * __restrict__ m128i_ptr = (__m128i *) dest;

	while (n >= 64) {
		_mm_stream_si128 (m128i_ptr, copy_4x);
		_mm_stream_si128 (m128i_ptr + 1, copy_4x);
		_mm_stream_si128 (m128i_ptr + 2, copy_4x);
		_mm_stream_si128 (m128i_ptr + 3, copy_4x);
		_mm_stream_si128 (m128i_ptr + 4, copy_4x);
		_mm_stream_si128 (m128i_ptr + 5, copy_4x);
		_mm_stream_si128 (m128i_ptr + 6, copy_4x);
		_mm_stream_si128 (m128i_ptr + 7, copy_4x);
		m128i_ptr += 8;
		n -= 64;
	}

	uint16_t *uint16_ptr = (uint16_t *) m128i_ptr;
    while (n--) {
        *uint16_ptr++ = copy;
    }

	return dest;
}

void *mem_set16_sse2_store (void *dest, int c, std::size_t n)
{
	const uint16_t copy    = c & 0xffff;
	const uint32_t copy_2x = copy | (copy << 16);
	const __m128i  copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x);

	__m128i * __restrict__ m128i_ptr = (__m128i *) dest;

	while (n >= 64) {
		_mm_storeu_si128 (m128i_ptr, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 1, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 2, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 3, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 4, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 5, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 6, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 7, copy_4x);
		m128i_ptr += 8;
		n -= 64;
	}

	uint16_t *uint16_ptr = (uint16_t *) m128i_ptr;
    while (n--) {
        *uint16_ptr++ = copy;
    }

	return dest;
}

std::array<Timing, 5> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count)
{
    auto timing1 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set16_c(target_ptr, 0x1234, n);
        }
    })};

    auto timing2 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set16_mmx(target_ptr, 0x1234, n);
        }
    })};

    auto timing3 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set16_sse(target_ptr, 0x1234, n);
        }
    })};

    auto timing4 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set16_sse2_stream(target_ptr, 0x1234, n);
        }
    })};

    auto timing5 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set16_sse2_store(target_ptr, 0x1234, n);
        }
    })};

    return {timing1, timing2, timing3, timing4, timing5};
}

void benchmark_set(unsigned int run_count, std::size_t buffer_size)
{
    fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);

    auto count {buffer_size / sizeof(std::uint16_t)};

    {
        auto target {std::make_unique<std::byte[]>(buffer_size)};

        {
            auto target_ptr {target.get()};

            auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};

            auto reference_time {timings[0].count()};
            fmt::print("LV C     : {:.6f}s ({:.2f}x)\n", timings[0].count(), reference_time / timings[0].count());
            fmt::print("LV MMX   : {:.6f}s ({:.2f}x)\n", timings[1].count(), reference_time / timings[1].count());
            fmt::print("LV SSE   : {:.6f}s ({:.2f}x)\n", timings[2].count(), reference_time / timings[2].count());
            fmt::print("LV SSE2 1: {:.6f}s ({:.2f}x)\n", timings[3].count(), reference_time / timings[3].count());
            fmt::print("LV SSE2 2: {:.6f}s ({:.2f}x)\n", timings[4].count(), reference_time / timings[4].count());
        }
    }

    {
        constexpr std::size_t alignment {64};
        static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");

        auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};

        auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
        auto target_ptr = target.get() + target_offset;

        assert(std::uintptr_t(target_ptr) % alignment == 0);

        auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};

        auto reference_time {timings[0].count()};
        fmt::print("LV C      (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time / timings[0].count());
        fmt::print("LV MMX    (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time / timings[1].count());
        fmt::print("LV SSE    (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time / timings[2].count());
        fmt::print("LV SSE2 1 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time / timings[3].count());
        fmt::print("LV SSE2 2 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time / timings[4].count());
    }

    fmt::print("\n");
}

int main()
{
    constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024};
    constexpr std::size_t max_buffer_size {1 << 26};

    for (std::size_t buffer_size = 64; buffer_size <= max_buffer_size; buffer_size *= 2) {
        std::size_t run_count {bytes_to_copy / buffer_size};
        benchmark_set(run_count, buffer_size);
    }
}

## mem-set-benchmark-32.cpp
// -*- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-32 mem-set-benchmark-32.cpp  $(pkg-config --cflags --libs fmt)" -*-

#include <array>
#include <bit>
#include <chrono>
#include <iostream>
#include <memory>
#include <utility>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fmt/core.h>
#include <fmt/chrono.h>
#include <x86intrin.h>

namespace chrono = std::chrono;

using benchmark_clock = chrono::high_resolution_clock;
using Timing = chrono::duration<float>;

template <typename F>
Timing measure_runtime(F fn)
{
    auto start_time {benchmark_clock::now()};
    fn();
    return {benchmark_clock::now() - start_time};
}

void *mem_set32_c (void *dest, int c, std::size_t n)
{
	uint64_t *d = static_cast<uint64_t*>(dest);

	uint32_t copy = c;
	uint64_t copy_x2 = uint64_t(copy) | (uint64_t(copy) << 32);

	while (n >= 16) {
		d[0] = copy_x2;
		d[1] = copy_x2;
		d[2] = copy_x2;
		d[3] = copy_x2;
		d[4] = copy_x2;
		d[5] = copy_x2;
		d[6] = copy_x2;
		d[7] = copy_x2;
		n -= 16;
        d += 8;
	}

    uint32_t* dc = (uint32_t *) d;
    while (n--) {
        *dc++ = copy;
    }

	return dest;
}

static void *mem_set32_mmx (void *dest, int c, std::size_t n)
{
	uint32_t *d = static_cast<uint32_t*>(dest);
	uint32_t setflag32 = c;

	__asm __volatile
		("\n\t movd (%0), %%mm0"
		 "\n\t movd (%0), %%mm1"
		 "\n\t psllq $32, %%mm1"
		 "\n\t por %%mm1, %%mm0"
		 "\n\t movq %%mm0, %%mm2"
		 "\n\t movq %%mm0, %%mm1"
		 "\n\t movq %%mm2, %%mm3"
		 "\n\t movq %%mm1, %%mm4"
		 "\n\t movq %%mm0, %%mm5"
		 "\n\t movq %%mm2, %%mm6"
		 "\n\t movq %%mm1, %%mm7"
		 :: "r" (&setflag32) : "memory");

	while (n >= 16) {
		__asm __volatile
			("\n\t movq %%mm0, (%0)"
			 "\n\t movq %%mm1, 8(%0)"
			 "\n\t movq %%mm2, 16(%0)"
			 "\n\t movq %%mm3, 24(%0)"
			 "\n\t movq %%mm4, 32(%0)"
			 "\n\t movq %%mm5, 40(%0)"
			 "\n\t movq %%mm6, 48(%0)"
			 "\n\t movq %%mm7, 56(%0)"
			 :: "r" (d) : "memory");

		d += 16;
		n -= 16;
	}

	__asm __volatile
		("\n\t emms");

	while (n--)
		*d++ = setflag32;

	return dest;
}

static void *mem_set32_mmx2 (void *dest, int c, std::size_t n)
{
	uint32_t *d = static_cast<uint32_t*>(dest);
	uint32_t setflag32 = c;

	__asm __volatile
		("\n\t movd (%0), %%mm0"
		 "\n\t movd (%0), %%mm1"
		 "\n\t psllq $32, %%mm1"
		 "\n\t por %%mm1, %%mm0"
		 "\n\t movq %%mm0, %%mm2"
		 "\n\t movq %%mm0, %%mm1"
		 "\n\t movq %%mm2, %%mm3"
		 "\n\t movq %%mm1, %%mm4"
		 "\n\t movq %%mm0, %%mm5"
		 "\n\t movq %%mm2, %%mm6"
		 "\n\t movq %%mm1, %%mm7"
		 :: "r" (&setflag32) : "memory");

	while (n >= 16) {
		__asm __volatile
			("\n\t movntq %%mm0, (%0)"
			 "\n\t movntq %%mm1, 8(%0)"
			 "\n\t movntq %%mm2, 16(%0)"
			 "\n\t movntq %%mm3, 24(%0)"
			 "\n\t movntq %%mm4, 32(%0)"
			 "\n\t movntq %%mm5, 40(%0)"
			 "\n\t movntq %%mm6, 48(%0)"
			 "\n\t movntq %%mm7, 56(%0)"
			 :: "r" (d) : "memory");

		d += 16;
		n -= 16;
	}

	__asm __volatile
		("\n\t emms");

	while (n--)
		*d++ = setflag32;

	return dest;
}

static void *mem_set32_sse (void *dest, int c, std::size_t n)
{
	const uint32_t copy = c;
	const __m64    copy_2x = _mm_set_pi32 (copy, copy);

	__m64 *m64_ptr = (__m64 *) dest;

	while (n >= 16) {
		_mm_stream_pi (m64_ptr, copy_2x);
		_mm_stream_pi (m64_ptr + 1, copy_2x);
		_mm_stream_pi (m64_ptr + 2, copy_2x);
		_mm_stream_pi (m64_ptr + 3, copy_2x);
		_mm_stream_pi (m64_ptr + 4, copy_2x);
		_mm_stream_pi (m64_ptr + 5, copy_2x);
		_mm_stream_pi (m64_ptr + 6, copy_2x);
		_mm_stream_pi (m64_ptr + 7, copy_2x);
		n -= 16;
		m64_ptr += 8;
	}

	uint32_t *uint32_ptr = (uint32_t *) m64_ptr;
	while (n--) {
		*uint32_ptr++ = copy;
    }

	return dest;
}

static void *mem_set32_sse2_stream (void *dest, int c, std::size_t n)
{
	const uint32_t copy = c;
	const __m128i  copy_4x = _mm_set_epi32 (copy, copy, copy, copy);

	__m128i *m128i_ptr = (__m128i *) dest;
	while (n >= 32) {
		_mm_stream_si128 (m128i_ptr, copy_4x);
		_mm_stream_si128 (m128i_ptr + 1, copy_4x);
		_mm_stream_si128 (m128i_ptr + 2, copy_4x);
		_mm_stream_si128 (m128i_ptr + 3, copy_4x);
		_mm_stream_si128 (m128i_ptr + 4, copy_4x);
		_mm_stream_si128 (m128i_ptr + 5, copy_4x);
		_mm_stream_si128 (m128i_ptr + 6, copy_4x);
		_mm_stream_si128 (m128i_ptr + 7, copy_4x);
		n -= 32;
		m128i_ptr += 8;
	}

	uint32_t *uint32_ptr = (uint32_t *) m128i_ptr;
	while (n--) {
		*uint32_ptr++ = copy;
    }

	return dest;
}

static void *mem_set32_sse2_store (void *dest, int c, std::size_t n)
{
	const uint32_t copy = c;
	const __m128i  copy_4x = _mm_set_epi32 (copy, copy, copy, copy);

	__m128i *m128i_ptr = (__m128i *) dest;
	while (n >= 32) {
		_mm_storeu_si128 (m128i_ptr, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 1, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 2, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 3, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 4, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 5, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 6, copy_4x);
		_mm_storeu_si128 (m128i_ptr + 7, copy_4x);
		n -= 32;
		m128i_ptr += 8;
	}

	uint32_t *uint32_ptr = (uint32_t *) m128i_ptr;
	while (n--) {
		*uint32_ptr++ = copy;
    }

	return dest;
}

std::array<Timing, 6> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count)
{
    auto timing1 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set32_c(target_ptr, 0x1234, n);
        }
    })};

    auto timing2 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set32_mmx(target_ptr, 0x1234, n);
        }
    })};

    auto timing3 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set32_mmx2(target_ptr, 0x1234, n);
        }
    })};

    auto timing4 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set32_sse(target_ptr, 0x1234, n);
        }
    })};

    auto timing5 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set32_sse2_stream(target_ptr, 0x1234, n);
        }
    })};

    auto timing6 {measure_runtime([=] () {
        for (unsigned i = 0; i < run_count; i++) {
            mem_set32_sse2_store(target_ptr, 0x1234, n);
        }
    })};

    return {timing1, timing2, timing3, timing4, timing5, timing6};
}

void benchmark_set(unsigned int run_count, std::size_t buffer_size)
{
    fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);

    auto count {buffer_size / sizeof(std::uint32_t)};

    {
        auto target {std::make_unique<std::byte[]>(buffer_size)};

        {
            auto target_ptr {target.get()};

            auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};

            auto reference_time {timings[0].count()};
            fmt::print("LV C     : {:.6}s ({:.2f}x)\n", timings[0].count(), reference_time/timings[0].count());
            fmt::print("LV MMX   : {:.6}s ({:.2f}x)\n", timings[1].count(), reference_time/timings[1].count());
            fmt::print("LV MMX2  : {:.6}s ({:.2f}x)\n", timings[2].count(), reference_time/timings[2].count());
            fmt::print("LV SSE   : {:.6}s ({:.2f}x)\n", timings[3].count(), reference_time/timings[3].count());
            fmt::print("LV SSE2 1: {:.6}s ({:.2f}x)\n", timings[4].count(), reference_time/timings[4].count());
            fmt::print("LV SSE2 2: {:.6}s ({:.2f}x)\n", timings[5].count(), reference_time/timings[5].count());
        }
    }

    {
        constexpr std::size_t alignment {64};
        static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");

        auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};

        auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
        auto target_ptr = target.get() + target_offset;

        assert(std::uintptr_t(target_ptr) % alignment == 0);

        auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};

        auto reference_time {timings[0].count()};
        fmt::print("LV C      (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time/timings[0].count());
        fmt::print("LV MMX    (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time/timings[1].count());
        fmt::print("LV MMX2   (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time/timings[2].count());
        fmt::print("LV SSE    (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time/timings[3].count());
        fmt::print("LV SSE2 1 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time/timings[4].count());
        fmt::print("LV SSE2 2 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[5].count(), reference_time/timings[5].count());
    }

    fmt::print("\n");
}

int main()
{
    constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024};
    constexpr std::size_t max_buffer_size {1 << 26};

    for (std::size_t buffer_size = 32; buffer_size <= max_buffer_size; buffer_size *= 2) {
        std::size_t run_count {bytes_to_copy / buffer_size};
        benchmark_set(run_count, buffer_size);
    }
}
	// -- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-16 mem-set-benchmark-16.cpp $(pkg-config --cflags --libs fmt)" --

	#include <array>
	#include <bit>
	#include <chrono>
	#include <iostream>
	#include <memory>
	#include <utility>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <fmt/core.h>
	#include <fmt/chrono.h>
	#include <x86intrin.h>

	namespace chrono = std::chrono;

	using benchmark_clock = chrono::high_resolution_clock;
	using Timing = chrono::duration<float>;

	template <typename F>
	Timing measure_runtime(F fn)
	{
	auto start_time {benchmark_clock::now()};
	fn();
	return {benchmark_clock::now() - start_time};
	}

	void mem_set16_c (void dest, int c, std::size_t n)
	{
	uint64_t d = static_cast<uint64_t>(dest);

	uint16_t copy = c & 0xffff;
	uint32_t copy_x2 = copy \| (copy << 16);
	uint64_t copy_x4 = copy_x2 \| (uint64_t(copy_x2) << 32);

	while (n >= 32) {
	d[0] = copy_x4;
	d[1] = copy_x4;
	d[2] = copy_x4;
	d[3] = copy_x4;
	d[4] = copy_x4;
	d[5] = copy_x4;
	d[6] = copy_x4;
	d[7] = copy_x4;
	n -= 32;
	d += 8;
	}

	uint16_t* dc = (uint16_t *) d;
	while (n--) {
	*dc++ = copy;
	}

	return dest;
	}

	void mem_set16_mmx (void dest, int c, std::size_t n)
	{
	uint32_t d = static_cast<uint32_t>(dest);
	uint16_t dc = static_cast<uint16_t>(dest);
	uint32_t setflag32 = (c & 0xffff) \| ((c << 16) & 0xffff0000);
	uint16_t setflag16 = c & 0xffff;

	__asm __volatile
	("\n\t movd (%0), %%mm0"
	"\n\t movd (%0), %%mm1"
	"\n\t psllq $32, %%mm1"
	"\n\t por %%mm1, %%mm0"
	"\n\t movq %%mm0, %%mm2"
	"\n\t movq %%mm0, %%mm1"
	"\n\t movq %%mm2, %%mm3"
	"\n\t movq %%mm1, %%mm4"
	"\n\t movq %%mm0, %%mm5"
	"\n\t movq %%mm2, %%mm6"
	"\n\t movq %%mm1, %%mm7"
	:: "r" (&setflag32) : "memory");

	while (n >= 32) {
	__asm __volatile
	("\n\t movq %%mm0, (%0)"
	"\n\t movq %%mm1, 8(%0)"
	"\n\t movq %%mm2, 16(%0)"
	"\n\t movq %%mm3, 24(%0)"
	"\n\t movq %%mm4, 32(%0)"
	"\n\t movq %%mm5, 40(%0)"
	"\n\t movq %%mm6, 48(%0)"
	"\n\t movq %%mm7, 56(%0)"
	:: "r" (d) : "memory");

	d += 16;
	n -= 32;
	}

	__asm __volatile
	("\n\t emms");

	while (n >= 2) {
	*d++ = setflag32;
	n -= 2;
	}

	dc = (uint16_t *) d;

	while (n--)
	*dc++ = setflag16;

	return dest;
	}

	void mem_set16_sse (void dest, int c, std::size_t n)
	{
	// FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed?

	const uint16_t copy = c & 0xffff;
	const uint32_t copy_2x = copy \| (copy << 16);
	const __m64 copy_4x = _mm_set_pi32 (copy_2x, copy_2x);

	__m64 m64_ptr = (__m64 ) dest;

	while (n >= 32) {
	_mm_stream_pi (m64_ptr, copy_4x);
	_mm_stream_pi (m64_ptr + 1, copy_4x);
	_mm_stream_pi (m64_ptr + 2, copy_4x);
	_mm_stream_pi (m64_ptr + 3, copy_4x);
	_mm_stream_pi (m64_ptr + 4, copy_4x);
	_mm_stream_pi (m64_ptr + 5, copy_4x);
	_mm_stream_pi (m64_ptr + 6, copy_4x);
	_mm_stream_pi (m64_ptr + 7, copy_4x);
	m64_ptr += 8;
	n -= 32;
	}

	uint16_t uint16_ptr = (uint16_t ) m64_ptr;
	while (n--) {
	*uint16_ptr++ = copy;
	}

	return dest;
	}

	void mem_set16_sse2_stream (void dest, int c, std::size_t n)
	{
	const uint16_t copy = c & 0xffff;
	const uint32_t copy_2x = copy \| (copy << 16);
	const __m128i copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x);

	__m128i * __restrict__ m128i_ptr = (__m128i *) dest;

	while (n >= 64) {
	_mm_stream_si128 (m128i_ptr, copy_4x);
	_mm_stream_si128 (m128i_ptr + 1, copy_4x);
	_mm_stream_si128 (m128i_ptr + 2, copy_4x);
	_mm_stream_si128 (m128i_ptr + 3, copy_4x);
	_mm_stream_si128 (m128i_ptr + 4, copy_4x);
	_mm_stream_si128 (m128i_ptr + 5, copy_4x);
	_mm_stream_si128 (m128i_ptr + 6, copy_4x);
	_mm_stream_si128 (m128i_ptr + 7, copy_4x);
	m128i_ptr += 8;
	n -= 64;
	}

	uint16_t uint16_ptr = (uint16_t ) m128i_ptr;
	while (n--) {
	*uint16_ptr++ = copy;
	}

	return dest;
	}

	void mem_set16_sse2_store (void dest, int c, std::size_t n)
	{
	const uint16_t copy = c & 0xffff;
	const uint32_t copy_2x = copy \| (copy << 16);
	const __m128i copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x);

	__m128i * __restrict__ m128i_ptr = (__m128i *) dest;

	while (n >= 64) {
	_mm_storeu_si128 (m128i_ptr, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 1, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 2, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 3, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 4, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 5, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 6, copy_4x);
	_mm_storeu_si128 (m128i_ptr + 7, copy_4x);
	m128i_ptr += 8;
	n -= 64;
	}

	uint16_t uint16_ptr = (uint16_t ) m128i_ptr;
	while (n--) {
	*uint16_ptr++ = copy;
	}

	return dest;
	}

	std::array<Timing, 5> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count)
	{
	auto timing1 {measure_runtime([=] () {
	for (unsigned i = 0; i < run_count; i++) {
	mem_set16_c(target_ptr, 0x1234, n);
	}
	})};

	auto timing2 {measure_runtime([=] () {
	for (unsigned i = 0; i < run_count; i++) {
	mem_set16_mmx(target_ptr, 0x1234, n);
	}
	})};

	auto timing3 {measure_runtime([=] () {
	for (unsigned i = 0; i < run_count; i++) {
	mem_set16_sse(target_ptr, 0x1234, n);
	}
	})};

	auto timing4 {measure_runtime([=] () {
	for (unsigned i = 0; i < run_count; i++) {
	mem_set16_sse2_stream(target_ptr, 0x1234, n);
	}
	})};

	auto timing5 {measure_runtime([=] () {
	for (unsigned i = 0; i < run_count; i++) {
	mem_set16_sse2_store(target_ptr, 0x1234, n);
	}
	})};

	return {timing1, timing2, timing3, timing4, timing5};
	}

	void benchmark_set(unsigned int run_count, std::size_t buffer_size)
	{
	fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count);

	auto count {buffer_size / sizeof(std::uint16_t)};

	{
	auto target {std::make_unique<std::byte[]>(buffer_size)};

	{
	auto target_ptr {target.get()};

	auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};

	auto reference_time {timings[0].count()};
	fmt::print("LV C : {:.6f}s ({:.2f}x)\n", timings[0].count(), reference_time / timings[0].count());
	fmt::print("LV MMX : {:.6f}s ({:.2f}x)\n", timings[1].count(), reference_time / timings[1].count());
	fmt::print("LV SSE : {:.6f}s ({:.2f}x)\n", timings[2].count(), reference_time / timings[2].count());
	fmt::print("LV SSE2 1: {:.6f}s ({:.2f}x)\n", timings[3].count(), reference_time / timings[3].count());
	fmt::print("LV SSE2 2: {:.6f}s ({:.2f}x)\n", timings[4].count(), reference_time / timings[4].count());
	}
	}

	{
	constexpr std::size_t alignment {64};
	static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2.");

	auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)};

	auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))};
	auto target_ptr = target.get() + target_offset;

	assert(std::uintptr_t(target_ptr) % alignment == 0);

	auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)};

	auto reference_time {timings[0].count()};
	fmt::print("LV C (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time / timings[0].count());
	fmt::print("LV MMX (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time / timings[1].count());
	fmt::print("LV SSE (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time / timings[2].count());
	fmt::print("LV SSE2 1 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time / timings[3].count());
	fmt::print("LV SSE2 2 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time / timings[4].count());
	}

	fmt::print("\n");
	}

	int main()
	{
	constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024};
	constexpr std::size_t max_buffer_size {1 << 26};

	for (std::size_t buffer_size = 64; buffer_size <= max_buffer_size; buffer_size *= 2) {
	std::size_t run_count {bytes_to_copy / buffer_size};
	benchmark_set(run_count, buffer_size);
	}
	}