Last active
March 4, 2023 11:05
-
-
Save kaixiong/89607caddf7f84c5cb45b4fdd10900be to your computer and use it in GitHub Desktop.
visual_mem_set{16,32} benchmarks - C vs ASM vs Intrinsics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-16 mem-set-benchmark-16.cpp $(pkg-config --cflags --libs fmt)" -*- | |
#include <array> | |
#include <bit> | |
#include <chrono> | |
#include <iostream> | |
#include <memory> | |
#include <utility> | |
#include <cassert> | |
#include <cstddef> | |
#include <cstdint> | |
#include <cstring> | |
#include <fmt/core.h> | |
#include <fmt/chrono.h> | |
#include <x86intrin.h> | |
namespace chrono = std::chrono; | |
using benchmark_clock = chrono::high_resolution_clock; | |
using Timing = chrono::duration<float>; | |
template <typename F> | |
Timing measure_runtime(F fn) | |
{ | |
auto start_time {benchmark_clock::now()}; | |
fn(); | |
return {benchmark_clock::now() - start_time}; | |
} | |
void *mem_set16_c (void *dest, int c, std::size_t n) | |
{ | |
uint64_t *d = static_cast<uint64_t*>(dest); | |
uint16_t copy = c & 0xffff; | |
uint32_t copy_x2 = copy | (copy << 16); | |
uint64_t copy_x4 = copy_x2 | (uint64_t(copy_x2) << 32); | |
while (n >= 32) { | |
d[0] = copy_x4; | |
d[1] = copy_x4; | |
d[2] = copy_x4; | |
d[3] = copy_x4; | |
d[4] = copy_x4; | |
d[5] = copy_x4; | |
d[6] = copy_x4; | |
d[7] = copy_x4; | |
n -= 32; | |
d += 8; | |
} | |
uint16_t* dc = (uint16_t *) d; | |
while (n--) { | |
*dc++ = copy; | |
} | |
return dest; | |
} | |
void *mem_set16_mmx (void *dest, int c, std::size_t n) | |
{ | |
uint32_t *d = static_cast<uint32_t*>(dest); | |
uint16_t *dc = static_cast<uint16_t*>(dest); | |
uint32_t setflag32 = (c & 0xffff) | ((c << 16) & 0xffff0000); | |
uint16_t setflag16 = c & 0xffff; | |
__asm __volatile | |
("\n\t movd (%0), %%mm0" | |
"\n\t movd (%0), %%mm1" | |
"\n\t psllq $32, %%mm1" | |
"\n\t por %%mm1, %%mm0" | |
"\n\t movq %%mm0, %%mm2" | |
"\n\t movq %%mm0, %%mm1" | |
"\n\t movq %%mm2, %%mm3" | |
"\n\t movq %%mm1, %%mm4" | |
"\n\t movq %%mm0, %%mm5" | |
"\n\t movq %%mm2, %%mm6" | |
"\n\t movq %%mm1, %%mm7" | |
:: "r" (&setflag32) : "memory"); | |
while (n >= 32) { | |
__asm __volatile | |
("\n\t movq %%mm0, (%0)" | |
"\n\t movq %%mm1, 8(%0)" | |
"\n\t movq %%mm2, 16(%0)" | |
"\n\t movq %%mm3, 24(%0)" | |
"\n\t movq %%mm4, 32(%0)" | |
"\n\t movq %%mm5, 40(%0)" | |
"\n\t movq %%mm6, 48(%0)" | |
"\n\t movq %%mm7, 56(%0)" | |
:: "r" (d) : "memory"); | |
d += 16; | |
n -= 32; | |
} | |
__asm __volatile | |
("\n\t emms"); | |
while (n >= 2) { | |
*d++ = setflag32; | |
n -= 2; | |
} | |
dc = (uint16_t *) d; | |
while (n--) | |
*dc++ = setflag16; | |
return dest; | |
} | |
void *mem_set16_sse (void *dest, int c, std::size_t n) | |
{ | |
// FIXME: Do we need 'dest' to be aligned for this to be performing at optimal speed? | |
const uint16_t copy = c & 0xffff; | |
const uint32_t copy_2x = copy | (copy << 16); | |
const __m64 copy_4x = _mm_set_pi32 (copy_2x, copy_2x); | |
__m64 *m64_ptr = (__m64 *) dest; | |
while (n >= 32) { | |
_mm_stream_pi (m64_ptr, copy_4x); | |
_mm_stream_pi (m64_ptr + 1, copy_4x); | |
_mm_stream_pi (m64_ptr + 2, copy_4x); | |
_mm_stream_pi (m64_ptr + 3, copy_4x); | |
_mm_stream_pi (m64_ptr + 4, copy_4x); | |
_mm_stream_pi (m64_ptr + 5, copy_4x); | |
_mm_stream_pi (m64_ptr + 6, copy_4x); | |
_mm_stream_pi (m64_ptr + 7, copy_4x); | |
m64_ptr += 8; | |
n -= 32; | |
} | |
uint16_t *uint16_ptr = (uint16_t *) m64_ptr; | |
while (n--) { | |
*uint16_ptr++ = copy; | |
} | |
return dest; | |
} | |
void *mem_set16_sse2_stream (void *dest, int c, std::size_t n) | |
{ | |
const uint16_t copy = c & 0xffff; | |
const uint32_t copy_2x = copy | (copy << 16); | |
const __m128i copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x); | |
__m128i * __restrict__ m128i_ptr = (__m128i *) dest; | |
while (n >= 64) { | |
_mm_stream_si128 (m128i_ptr, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 1, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 2, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 3, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 4, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 5, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 6, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 7, copy_4x); | |
m128i_ptr += 8; | |
n -= 64; | |
} | |
uint16_t *uint16_ptr = (uint16_t *) m128i_ptr; | |
while (n--) { | |
*uint16_ptr++ = copy; | |
} | |
return dest; | |
} | |
void *mem_set16_sse2_store (void *dest, int c, std::size_t n) | |
{ | |
const uint16_t copy = c & 0xffff; | |
const uint32_t copy_2x = copy | (copy << 16); | |
const __m128i copy_4x = _mm_set_epi32 (copy_2x, copy_2x, copy_2x, copy_2x); | |
__m128i * __restrict__ m128i_ptr = (__m128i *) dest; | |
while (n >= 64) { | |
_mm_storeu_si128 (m128i_ptr, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 1, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 2, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 3, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 4, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 5, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 6, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 7, copy_4x); | |
m128i_ptr += 8; | |
n -= 64; | |
} | |
uint16_t *uint16_ptr = (uint16_t *) m128i_ptr; | |
while (n--) { | |
*uint16_ptr++ = copy; | |
} | |
return dest; | |
} | |
std::array<Timing, 5> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count) | |
{ | |
auto timing1 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set16_c(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing2 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set16_mmx(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing3 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set16_sse(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing4 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set16_sse2_stream(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing5 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set16_sse2_store(target_ptr, 0x1234, n); | |
} | |
})}; | |
return {timing1, timing2, timing3, timing4, timing5}; | |
} | |
void benchmark_set(unsigned int run_count, std::size_t buffer_size) | |
{ | |
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count); | |
auto count {buffer_size / sizeof(std::uint16_t)}; | |
{ | |
auto target {std::make_unique<std::byte[]>(buffer_size)}; | |
{ | |
auto target_ptr {target.get()}; | |
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)}; | |
auto reference_time {timings[0].count()}; | |
fmt::print("LV C : {:.6f}s ({:.2f}x)\n", timings[0].count(), reference_time / timings[0].count()); | |
fmt::print("LV MMX : {:.6f}s ({:.2f}x)\n", timings[1].count(), reference_time / timings[1].count()); | |
fmt::print("LV SSE : {:.6f}s ({:.2f}x)\n", timings[2].count(), reference_time / timings[2].count()); | |
fmt::print("LV SSE2 1: {:.6f}s ({:.2f}x)\n", timings[3].count(), reference_time / timings[3].count()); | |
fmt::print("LV SSE2 2: {:.6f}s ({:.2f}x)\n", timings[4].count(), reference_time / timings[4].count()); | |
} | |
} | |
{ | |
constexpr std::size_t alignment {64}; | |
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2."); | |
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))}; | |
auto target_ptr = target.get() + target_offset; | |
assert(std::uintptr_t(target_ptr) % alignment == 0); | |
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)}; | |
auto reference_time {timings[0].count()}; | |
fmt::print("LV C (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time / timings[0].count()); | |
fmt::print("LV MMX (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time / timings[1].count()); | |
fmt::print("LV SSE (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time / timings[2].count()); | |
fmt::print("LV SSE2 1 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time / timings[3].count()); | |
fmt::print("LV SSE2 2 (align:{}): {:.6f}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time / timings[4].count()); | |
} | |
fmt::print("\n"); | |
} | |
int main() | |
{ | |
constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024}; | |
constexpr std::size_t max_buffer_size {1 << 26}; | |
for (std::size_t buffer_size = 64; buffer_size <= max_buffer_size; buffer_size *= 2) { | |
std::size_t run_count {bytes_to_copy / buffer_size}; | |
benchmark_set(run_count, buffer_size); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "clang++ -O2 -Wall -std=c++20 -fstrict-aliasing -o mem-set-benchmark-32 mem-set-benchmark-32.cpp $(pkg-config --cflags --libs fmt)" -*- | |
#include <array> | |
#include <bit> | |
#include <chrono> | |
#include <iostream> | |
#include <memory> | |
#include <utility> | |
#include <cassert> | |
#include <cstddef> | |
#include <cstdint> | |
#include <cstring> | |
#include <fmt/core.h> | |
#include <fmt/chrono.h> | |
#include <x86intrin.h> | |
namespace chrono = std::chrono; | |
using benchmark_clock = chrono::high_resolution_clock; | |
using Timing = chrono::duration<float>; | |
template <typename F> | |
Timing measure_runtime(F fn) | |
{ | |
auto start_time {benchmark_clock::now()}; | |
fn(); | |
return {benchmark_clock::now() - start_time}; | |
} | |
void *mem_set32_c (void *dest, int c, std::size_t n) | |
{ | |
uint64_t *d = static_cast<uint64_t*>(dest); | |
uint32_t copy = c; | |
uint64_t copy_x2 = uint64_t(copy) | (uint64_t(copy) << 32); | |
while (n >= 16) { | |
d[0] = copy_x2; | |
d[1] = copy_x2; | |
d[2] = copy_x2; | |
d[3] = copy_x2; | |
d[4] = copy_x2; | |
d[5] = copy_x2; | |
d[6] = copy_x2; | |
d[7] = copy_x2; | |
n -= 16; | |
d += 8; | |
} | |
uint32_t* dc = (uint32_t *) d; | |
while (n--) { | |
*dc++ = copy; | |
} | |
return dest; | |
} | |
static void *mem_set32_mmx (void *dest, int c, std::size_t n) | |
{ | |
uint32_t *d = static_cast<uint32_t*>(dest); | |
uint32_t setflag32 = c; | |
__asm __volatile | |
("\n\t movd (%0), %%mm0" | |
"\n\t movd (%0), %%mm1" | |
"\n\t psllq $32, %%mm1" | |
"\n\t por %%mm1, %%mm0" | |
"\n\t movq %%mm0, %%mm2" | |
"\n\t movq %%mm0, %%mm1" | |
"\n\t movq %%mm2, %%mm3" | |
"\n\t movq %%mm1, %%mm4" | |
"\n\t movq %%mm0, %%mm5" | |
"\n\t movq %%mm2, %%mm6" | |
"\n\t movq %%mm1, %%mm7" | |
:: "r" (&setflag32) : "memory"); | |
while (n >= 16) { | |
__asm __volatile | |
("\n\t movq %%mm0, (%0)" | |
"\n\t movq %%mm1, 8(%0)" | |
"\n\t movq %%mm2, 16(%0)" | |
"\n\t movq %%mm3, 24(%0)" | |
"\n\t movq %%mm4, 32(%0)" | |
"\n\t movq %%mm5, 40(%0)" | |
"\n\t movq %%mm6, 48(%0)" | |
"\n\t movq %%mm7, 56(%0)" | |
:: "r" (d) : "memory"); | |
d += 16; | |
n -= 16; | |
} | |
__asm __volatile | |
("\n\t emms"); | |
while (n--) | |
*d++ = setflag32; | |
return dest; | |
} | |
static void *mem_set32_mmx2 (void *dest, int c, std::size_t n) | |
{ | |
uint32_t *d = static_cast<uint32_t*>(dest); | |
uint32_t setflag32 = c; | |
__asm __volatile | |
("\n\t movd (%0), %%mm0" | |
"\n\t movd (%0), %%mm1" | |
"\n\t psllq $32, %%mm1" | |
"\n\t por %%mm1, %%mm0" | |
"\n\t movq %%mm0, %%mm2" | |
"\n\t movq %%mm0, %%mm1" | |
"\n\t movq %%mm2, %%mm3" | |
"\n\t movq %%mm1, %%mm4" | |
"\n\t movq %%mm0, %%mm5" | |
"\n\t movq %%mm2, %%mm6" | |
"\n\t movq %%mm1, %%mm7" | |
:: "r" (&setflag32) : "memory"); | |
while (n >= 16) { | |
__asm __volatile | |
("\n\t movntq %%mm0, (%0)" | |
"\n\t movntq %%mm1, 8(%0)" | |
"\n\t movntq %%mm2, 16(%0)" | |
"\n\t movntq %%mm3, 24(%0)" | |
"\n\t movntq %%mm4, 32(%0)" | |
"\n\t movntq %%mm5, 40(%0)" | |
"\n\t movntq %%mm6, 48(%0)" | |
"\n\t movntq %%mm7, 56(%0)" | |
:: "r" (d) : "memory"); | |
d += 16; | |
n -= 16; | |
} | |
__asm __volatile | |
("\n\t emms"); | |
while (n--) | |
*d++ = setflag32; | |
return dest; | |
} | |
static void *mem_set32_sse (void *dest, int c, std::size_t n) | |
{ | |
const uint32_t copy = c; | |
const __m64 copy_2x = _mm_set_pi32 (copy, copy); | |
__m64 *m64_ptr = (__m64 *) dest; | |
while (n >= 16) { | |
_mm_stream_pi (m64_ptr, copy_2x); | |
_mm_stream_pi (m64_ptr + 1, copy_2x); | |
_mm_stream_pi (m64_ptr + 2, copy_2x); | |
_mm_stream_pi (m64_ptr + 3, copy_2x); | |
_mm_stream_pi (m64_ptr + 4, copy_2x); | |
_mm_stream_pi (m64_ptr + 5, copy_2x); | |
_mm_stream_pi (m64_ptr + 6, copy_2x); | |
_mm_stream_pi (m64_ptr + 7, copy_2x); | |
n -= 16; | |
m64_ptr += 8; | |
} | |
uint32_t *uint32_ptr = (uint32_t *) m64_ptr; | |
while (n--) { | |
*uint32_ptr++ = copy; | |
} | |
return dest; | |
} | |
static void *mem_set32_sse2_stream (void *dest, int c, std::size_t n) | |
{ | |
const uint32_t copy = c; | |
const __m128i copy_4x = _mm_set_epi32 (copy, copy, copy, copy); | |
__m128i *m128i_ptr = (__m128i *) dest; | |
while (n >= 32) { | |
_mm_stream_si128 (m128i_ptr, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 1, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 2, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 3, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 4, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 5, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 6, copy_4x); | |
_mm_stream_si128 (m128i_ptr + 7, copy_4x); | |
n -= 32; | |
m128i_ptr += 8; | |
} | |
uint32_t *uint32_ptr = (uint32_t *) m128i_ptr; | |
while (n--) { | |
*uint32_ptr++ = copy; | |
} | |
return dest; | |
} | |
static void *mem_set32_sse2_store (void *dest, int c, std::size_t n) | |
{ | |
const uint32_t copy = c; | |
const __m128i copy_4x = _mm_set_epi32 (copy, copy, copy, copy); | |
__m128i *m128i_ptr = (__m128i *) dest; | |
while (n >= 32) { | |
_mm_storeu_si128 (m128i_ptr, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 1, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 2, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 3, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 4, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 5, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 6, copy_4x); | |
_mm_storeu_si128 (m128i_ptr + 7, copy_4x); | |
n -= 32; | |
m128i_ptr += 8; | |
} | |
uint32_t *uint32_ptr = (uint32_t *) m128i_ptr; | |
while (n--) { | |
*uint32_ptr++ = copy; | |
} | |
return dest; | |
} | |
std::array<Timing, 6> benchmark_set_runner(void* target_ptr, int c, std::size_t n, unsigned int run_count) | |
{ | |
auto timing1 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set32_c(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing2 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set32_mmx(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing3 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set32_mmx2(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing4 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set32_sse(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing5 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set32_sse2_stream(target_ptr, 0x1234, n); | |
} | |
})}; | |
auto timing6 {measure_runtime([=] () { | |
for (unsigned i = 0; i < run_count; i++) { | |
mem_set32_sse2_store(target_ptr, 0x1234, n); | |
} | |
})}; | |
return {timing1, timing2, timing3, timing4, timing5, timing6}; | |
} | |
void benchmark_set(unsigned int run_count, std::size_t buffer_size) | |
{ | |
fmt::print("Benchmarking setting of {}-byte buffers for {} times.\n", buffer_size, run_count); | |
auto count {buffer_size / sizeof(std::uint32_t)}; | |
{ | |
auto target {std::make_unique<std::byte[]>(buffer_size)}; | |
{ | |
auto target_ptr {target.get()}; | |
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)}; | |
auto reference_time {timings[0].count()}; | |
fmt::print("LV C : {:.6}s ({:.2f}x)\n", timings[0].count(), reference_time/timings[0].count()); | |
fmt::print("LV MMX : {:.6}s ({:.2f}x)\n", timings[1].count(), reference_time/timings[1].count()); | |
fmt::print("LV MMX2 : {:.6}s ({:.2f}x)\n", timings[2].count(), reference_time/timings[2].count()); | |
fmt::print("LV SSE : {:.6}s ({:.2f}x)\n", timings[3].count(), reference_time/timings[3].count()); | |
fmt::print("LV SSE2 1: {:.6}s ({:.2f}x)\n", timings[4].count(), reference_time/timings[4].count()); | |
fmt::print("LV SSE2 2: {:.6}s ({:.2f}x)\n", timings[5].count(), reference_time/timings[5].count()); | |
} | |
} | |
{ | |
constexpr std::size_t alignment {64}; | |
static_assert(std::has_single_bit(alignment), "Alignment is not a power of 2."); | |
auto target {std::make_unique<std::byte[]>(buffer_size + alignment-1)}; | |
auto target_offset {alignment - (std::uintptr_t(target.get()) & (alignment-1))}; | |
auto target_ptr = target.get() + target_offset; | |
assert(std::uintptr_t(target_ptr) % alignment == 0); | |
auto timings {benchmark_set_runner(target_ptr, 0x12, count, run_count)}; | |
auto reference_time {timings[0].count()}; | |
fmt::print("LV C (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[0].count(), reference_time/timings[0].count()); | |
fmt::print("LV MMX (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[1].count(), reference_time/timings[1].count()); | |
fmt::print("LV MMX2 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[2].count(), reference_time/timings[2].count()); | |
fmt::print("LV SSE (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[3].count(), reference_time/timings[3].count()); | |
fmt::print("LV SSE2 1 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[4].count(), reference_time/timings[4].count()); | |
fmt::print("LV SSE2 2 (align:{}): {:.6}s ({:.2f}x)\n", alignment, timings[5].count(), reference_time/timings[5].count()); | |
} | |
fmt::print("\n"); | |
} | |
int main() | |
{ | |
constexpr std::size_t bytes_to_copy {std::size_t(1024) * 1024 * 1024}; | |
constexpr std::size_t max_buffer_size {1 << 26}; | |
for (std::size_t buffer_size = 32; buffer_size <= max_buffer_size; buffer_size *= 2) { | |
std::size_t run_count {bytes_to_copy / buffer_size}; | |
benchmark_set(run_count, buffer_size); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment