Skip to content

Instantly share code, notes, and snippets.

@janisozaur
Last active October 6, 2021 21:54
Show Gist options
  • Save janisozaur/21e0c6ba82b9b871cdeef8cbdefe615a to your computer and use it in GitHub Desktop.
Save janisozaur/21e0c6ba82b9b871cdeef8cbdefe615a to your computer and use it in GitHub Desktop.
Benchmark comparing SDL2's Blit1to4 implementation against alternative implementations
// g++ benchmark.cpp -lbenchmark -O2 -g -o bench && ./bench
#include <cstdint>
#include <cstring>
#include <random>
#include <benchmark/benchmark.h>
/* SDL-style integer type names, mapped onto the <cstdint> fixed-width types so
 * the code below can be built without the SDL headers. */
#define Uint8 uint8_t
#define Uint32 uint32_t
/* Four 8-bit channels. The field names suggest red/green/blue/alpha
 * (assumption -- no code in this file reads them). */
typedef struct SDL_Color
{
Uint8 r;
Uint8 g;
Uint8 b;
Uint8 a;
} SDL_Color;
/* Alternate spelling of the same type name. */
#define SDL_Colour SDL_Color
/* Palette record: a colour count, a pointer to SDL_Color entries, and
 * version / refcount fields. In the visible code it is only used as the
 * pointee type of SDL_PixelFormat::palette; nothing here reads its fields. */
typedef struct SDL_Palette
{
int ncolors;
SDL_Color *colors;
Uint32 version;
int refcount;
} SDL_Palette;
/**
* Pixel format description. In the visible code it only appears as the
* pointee type of SDL_BlitInfo::src_fmt / dst_fmt; none of the blitters in
* this file dereference it.
*
* \note Everything in the pixel format structure is read-only.
*/
typedef struct SDL_PixelFormat
{
Uint32 format;
SDL_Palette *palette;
Uint8 BitsPerPixel;
Uint8 BytesPerPixel;
Uint8 padding[2];
/* Per-channel mask / loss / shift values (presumably for packing and
 * unpacking channels -- not exercised by this benchmark). */
Uint32 Rmask;
Uint32 Gmask;
Uint32 Bmask;
Uint32 Amask;
Uint8 Rloss;
Uint8 Gloss;
Uint8 Bloss;
Uint8 Aloss;
Uint8 Rshift;
Uint8 Gshift;
Uint8 Bshift;
Uint8 Ashift;
int refcount;
struct SDL_PixelFormat *next;
} SDL_PixelFormat;
/* Parameter bundle handed to every blitter in this file.
 * The blitters below read only: src, src_skip, dst, dst_w, dst_h, dst_skip
 * and table. The remaining fields are never touched by the visible code. */
typedef struct
{
Uint8 *src;         /* first source pixel; read one byte per pixel */
int src_w, src_h;
int src_pitch;
int src_skip;       /* added to the Uint8 source pointer after each row */
Uint8 *dst;         /* first destination pixel; blitters cast this to Uint32 * */
int dst_w, dst_h;   /* pixels per row / number of rows processed */
int dst_pitch;
int dst_skip;       /* divided by 4, then added to the Uint32 destination pointer after each row */
SDL_PixelFormat *src_fmt;
SDL_PixelFormat *dst_fmt;
Uint8 *table;       /* blitters cast this to Uint32 * and index it with a source byte (0..255) */
int flags;
Uint32 colorkey;
Uint8 r, g, b, a;
} SDL_BlitInfo;
/*
 * Expand 1-byte-per-pixel source data into 4-byte-per-pixel destination data
 * through a lookup table: every destination Uint32 is map[source byte].
 *
 * Reads info->src, src_skip, dst, dst_w, dst_h, dst_skip and table.
 * Processes dst_h rows of dst_w pixels each.
 *
 * In this file USE_DUFFS_LOOP is only defined further down, after this
 * function, so the manually 4x-unrolled #else path is the one that gets
 * compiled here (unless the macro is supplied from outside, in which case
 * DUFFS_LOOP would have to be defined before this point as well).
 */
static void
Blit1to4(SDL_BlitInfo * info)
{
#ifndef USE_DUFFS_LOOP
int c;
#endif
int width, height;
Uint8 *src;
Uint32 *map, *dst;
int srcskip, dstskip;
/* Set up some basic variables */
width = info->dst_w;
height = info->dst_h;
src = info->src;
srcskip = info->src_skip;
dst = (Uint32 *) info->dst;
/* dst is stepped in Uint32 units, hence the division by 4
 * (dst_skip is presumably a byte count -- confirm against the caller). */
dstskip = info->dst_skip / 4;
map = (Uint32 *) info->table;
while (height--) {
#ifdef USE_DUFFS_LOOP
/* *INDENT-OFF* */
DUFFS_LOOP(
*dst++ = map[*src++];
, width);
/* *INDENT-ON* */
#else
/* Main part of the row: four table lookups per iteration. */
for (c = width / 4; c; --c) {
*dst++ = map[*src++];
*dst++ = map[*src++];
*dst++ = map[*src++];
*dst++ = map[*src++];
}
/* Remaining 0..3 pixels; the cases fall through on purpose. */
switch (width & 3) {
case 3:
*dst++ = map[*src++];
/* fallthrough */
case 2:
*dst++ = map[*src++];
/* fallthrough */
case 1:
*dst++ = map[*src++];
}
#endif /* USE_DUFFS_LOOP */
/* Step both pointers over the per-row skip amounts. */
src += srcskip;
dst += dstskip;
}
}
/* Defined at this point (after Blit1to4) so that code above compiles its
 * manually unrolled path while code below takes the DUFFS_LOOP path. */
#define USE_DUFFS_LOOP

/*
 * Duff's-device loops: run `pixel_copy_increment` exactly `width` times,
 * unrolled 8x or 4x. The switch jumps into the middle of the do/while body to
 * handle the remainder (width modulo the unroll factor) on the first pass.
 *
 * - `width` is parenthesized in the expansion, so any integer expression may
 *   be passed (e.g. `a | b` or `cond ? x : y`).
 * - A `width` of zero or less executes the statement zero times; without the
 *   `n > 0` guard the `case 0` entry would run a full unrolled pass.
 * - `width` is evaluated twice, so it must not have side effects.
 * - The expansion declares its own `int n`; the `width` expression must not
 *   refer to a variable called `n`.
 */

/* 8-times unrolled loop */
#define DUFFS_LOOP8(pixel_copy_increment, width)                        \
{ int n = ((width)+7)/8;                                                \
    if (n > 0) {                                                        \
        switch ((width) & 7) {                                          \
        case 0: do {    pixel_copy_increment; /* fallthrough */         \
        case 7:         pixel_copy_increment; /* fallthrough */         \
        case 6:         pixel_copy_increment; /* fallthrough */         \
        case 5:         pixel_copy_increment; /* fallthrough */         \
        case 4:         pixel_copy_increment; /* fallthrough */         \
        case 3:         pixel_copy_increment; /* fallthrough */         \
        case 2:         pixel_copy_increment; /* fallthrough */         \
        case 1:         pixel_copy_increment; /* fallthrough */         \
                } while ( --n > 0 );                                    \
        }                                                               \
    }                                                                   \
}

/* 4-times unrolled loop */
#define DUFFS_LOOP4(pixel_copy_increment, width)                        \
{ int n = ((width)+3)/4;                                                \
    if (n > 0) {                                                        \
        switch ((width) & 3) {                                          \
        case 0: do {    pixel_copy_increment; /* fallthrough */         \
        case 3:         pixel_copy_increment; /* fallthrough */         \
        case 2:         pixel_copy_increment; /* fallthrough */         \
        case 1:         pixel_copy_increment; /* fallthrough */         \
                } while (--n > 0);                                      \
        }                                                               \
    }                                                                   \
}

/* Use the 8-times version of the loop by default */
#define DUFFS_LOOP(pixel_copy_increment, width)                         \
    DUFFS_LOOP8(pixel_copy_increment, (width))
/*
 * Same table-lookup expansion as Blit1to4 (each destination Uint32 is
 * map[source byte]), but compiled after USE_DUFFS_LOOP has been defined in
 * this file, so the per-row work is done by the DUFFS_LOOP macro and the
 * #else branch below is not compiled.
 *
 * Reads info->src, src_skip, dst, dst_w, dst_h, dst_skip and table.
 */
static void
Blit1to4_duffs(SDL_BlitInfo * info)
{
#ifndef USE_DUFFS_LOOP
int c;
#endif
int width, height;
Uint8 *src;
Uint32 *map, *dst;
int srcskip, dstskip;
/* Set up some basic variables */
width = info->dst_w;
height = info->dst_h;
src = info->src;
srcskip = info->src_skip;
dst = (Uint32 *) info->dst;
/* dst is stepped in Uint32 units, hence the division by 4. */
dstskip = info->dst_skip / 4;
map = (Uint32 *) info->table;
while (height--) {
#ifdef USE_DUFFS_LOOP
/* *INDENT-OFF* */
/* One lookup-and-store per pixel, repeated `width` times by the macro. */
DUFFS_LOOP(
*dst++ = map[*src++];
, width);
/* *INDENT-ON* */
#else
for (c = width / 4; c; --c) {
*dst++ = map[*src++];
*dst++ = map[*src++];
*dst++ = map[*src++];
*dst++ = map[*src++];
}
switch (width & 3) {
case 3:
*dst++ = map[*src++];
/* fallthrough */
case 2:
*dst++ = map[*src++];
/* fallthrough */
case 1:
*dst++ = map[*src++];
}
#endif /* USE_DUFFS_LOOP */
/* Step both pointers over the per-row skip amounts. */
src += srcskip;
dst += dstskip;
}
}
/*
 * Variant of Blit1to4 that gathers four looked-up pixels into a local
 * 4-element array and writes them to the destination with a single 16-byte
 * memcpy instead of four individual stores. The 0..3 leftover pixels of each
 * row are stored one at a time.
 *
 * Reads info->src, src_skip, dst, dst_w, dst_h, dst_skip and table.
 */
static void
Blit1to4_janis(SDL_BlitInfo * info)
{
int c;
int width, height;
Uint8 *src;
Uint32 *map, *dst;
int srcskip, dstskip;
/* Set up some basic variables */
width = info->dst_w;
height = info->dst_h;
src = info->src;
srcskip = info->src_skip;
dst = (Uint32 *)(info->dst);
/* dst is stepped in Uint32 units, hence the division by 4. */
dstskip = info->dst_skip / 4;
map = (Uint32 *) info->table;
while (height--) {
for (c = width / 4; c; --c) {
/* Stage four lookups locally, then emit them as one block copy. */
uint32_t foo[4];
foo[0] = map[*src++];
foo[1] = map[*src++];
foo[2] = map[*src++];
foo[3] = map[*src++];
memcpy(dst, foo, 4 * sizeof(uint32_t));
dst += 4;
}
/* Remaining 0..3 pixels; the cases fall through on purpose. */
switch (width & 3) {
case 3:
*dst++ = map[*src++];
/* fallthrough */
case 2:
*dst++ = map[*src++];
/* fallthrough */
case 1:
*dst++ = map[*src++];
}
/* Step both pointers over the per-row skip amounts. */
src += srcskip;
dst += dstskip;
}
}
/*
 * Same algorithm as Blit1to4_janis (four lookups staged in a local array,
 * written out with one 16-byte memcpy), except that the src, dst and table
 * pointers are passed through __builtin_assume_aligned(..., 4) first.
 *
 * That builtin is a promise to the compiler, not a check: the caller must
 * really provide pointers that are 4-byte aligned.
 *
 * Reads info->src, src_skip, dst, dst_w, dst_h, dst_skip and table.
 */
static void
Blit1to4_janis_aligned(SDL_BlitInfo * info)
{
int c;
int width, height;
Uint8 *src;
Uint32 *map, *dst;
int srcskip, dstskip;
/* Set up some basic variables */
width = info->dst_w;
height = info->dst_h;
src = (uint8_t*)__builtin_assume_aligned(info->src, 4);
srcskip = info->src_skip;
dst = (Uint32 *)__builtin_assume_aligned(info->dst, 4);
/* dst is stepped in Uint32 units, hence the division by 4. */
dstskip = info->dst_skip / 4;
map = (Uint32 *) __builtin_assume_aligned(info->table, 4);
while (height--) {
for (c = width / 4; c; --c) {
/* Stage four lookups locally, then emit them as one block copy. */
uint32_t foo[4];
foo[0] = map[*src++];
foo[1] = map[*src++];
foo[2] = map[*src++];
foo[3] = map[*src++];
memcpy(dst, foo, 4 * sizeof(uint32_t));
dst += 4;
}
/* Remaining 0..3 pixels; the cases fall through on purpose. */
switch (width & 3) {
case 3:
*dst++ = map[*src++];
/* fallthrough */
case 2:
*dst++ = map[*src++];
/* fallthrough */
case 1:
*dst++ = map[*src++];
}
/* Step both pointers over the per-row skip amounts. */
src += srcskip;
dst += dstskip;
}
}
/*
 * Store-only baseline: walks the destination exactly like
 * Blit1to4_janis_aligned, but the 4-pixel groups are filled from a
 * zero-initialised local array instead of from table lookups, so the main
 * loop performs no source or table reads.
 *
 * Only the 0..3 leftover pixels of each row go through the lookup table.
 * The source pointer is advanced past the pixels covered by the 4-wide loop
 * before that tail runs, so the tail reads the source bytes that belong to
 * those pixels and each new row starts at the matching source offset.
 *
 * The src, dst and table pointers are declared 4-byte aligned via
 * __builtin_assume_aligned; the caller must guarantee that.
 *
 * Reads info->src, src_skip, dst, dst_w, dst_h, dst_skip and table.
 */
static void
Blit_memcpy(SDL_BlitInfo * info)
{
    int c;
    int width, height;
    Uint8 *src;
    Uint32 *map, *dst;
    int srcskip, dstskip;

    /* Set up some basic variables */
    width = info->dst_w;
    height = info->dst_h;
    src = (uint8_t*)__builtin_assume_aligned(info->src, 4);
    srcskip = info->src_skip;
    dst = (Uint32 *)__builtin_assume_aligned(info->dst, 4);
    /* dst is stepped in Uint32 units, hence the division by 4. */
    dstskip = info->dst_skip / 4;
    map = (Uint32 *) __builtin_assume_aligned(info->table, 4);

    while (height--) {
        /* Hot loop: 16 bytes of zeros per iteration, no loads from src/map. */
        for (c = width / 4; c; --c) {
            uint32_t foo[4]{};
            memcpy(dst, foo, 4 * sizeof(uint32_t));
            dst += 4;
        }
        /* Keep src in step with dst: skip the source bytes of the pixels
         * written above (done once per row, outside the hot loop). */
        src += (width / 4) * 4;

        /* Remaining 0..3 pixels; the cases fall through on purpose. */
        switch (width & 3) {
        case 3:
            *dst++ = map[*src++];
            /* fallthrough */
        case 2:
            *dst++ = map[*src++];
            /* fallthrough */
        case 1:
            *dst++ = map[*src++];
        }
        src += srcskip;
        dst += dstskip;
    }
}
/*
 * Store-only baseline: zero-fills dst_h rows of dst_w Uint32 pixels with one
 * memset per row. No source pixel and no table entry is read.
 *
 * After each row the destination pointer moves forward by the row width plus
 * dst_skip / 4 (dst is stepped in Uint32 units).
 *
 * The destination pointer is declared 4-byte aligned via
 * __builtin_assume_aligned; the caller must guarantee that.
 *
 * Reads info->dst, dst_w, dst_h and dst_skip.
 */
static void
Blit_memset(SDL_BlitInfo * info)
{
    const int width = info->dst_w;
    int height = info->dst_h;
    Uint32 *dst = (Uint32 *)__builtin_assume_aligned(info->dst, 4);
    const int dstskip = info->dst_skip / 4;

    while (height--) {
        /* sizeof yields size_t, so the byte count is not computed in int. */
        memset(dst, 0, width * sizeof(*dst));
        dst += dstskip + width;
    }
}
extern Uint8 real_palette[256 * 4];
Uint8 real_palette[256 * 4];
/*
 * Fill the benchmark inputs with pseudo-random bytes.
 *
 * Writes 256 * 4 bytes to sbi->table first, then dst_w * dst_h bytes to
 * sbi->src, each byte being one generator output narrowed to Uint8.
 * The generator is default-constructed on every call, so every call produces
 * the same byte sequence.
 *
 * sbi->table, sbi->src, sbi->dst_w and sbi->dst_h must be set by the caller.
 */
void init(SDL_BlitInfo *sbi)
{
    std::mt19937 rng;

    Uint8 *lut = sbi->table;
    for (int idx = 0; idx != 256 * 4; ++idx) {
        lut[idx] = rng();
    }

    const int pixel_count = sbi->dst_w * sbi->dst_h;
    Uint8 *pixels = sbi->src;
    for (int idx = 0; idx < pixel_count; ++idx) {
        pixels[idx] = rng();
    }
}
/*
 * Benchmark driver for Blit1to4 on a square image whose side length is the
 * benchmark argument. Source is 1 byte per pixel, destination 4 bytes per
 * pixel, no row skip. Throughput is reported per source byte (side * side
 * bytes per iteration).
 */
static void BM_Blit1to4(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);

    for (auto _ : state) {
        Blit1to4(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_Blit1to4)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Benchmark driver for Blit1to4_duffs on a square image whose side length is
 * the benchmark argument. Source is 1 byte per pixel, destination 4 bytes per
 * pixel, no row skip. Throughput is reported per source byte.
 */
static void BM_Blit1to4_duffs(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);

    for (auto _ : state) {
        Blit1to4_duffs(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_Blit1to4_duffs)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Benchmark driver for Blit1to4_janis on a square image whose side length is
 * the benchmark argument. Source is 1 byte per pixel, destination 4 bytes per
 * pixel, no row skip. Throughput is reported per source byte.
 */
static void BM_Blit1to4_janis(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);

    for (auto _ : state) {
        Blit1to4_janis(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_Blit1to4_janis)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Benchmark driver for Blit1to4_janis_aligned on a square image whose side
 * length is the benchmark argument. Source is 1 byte per pixel, destination
 * 4 bytes per pixel, no row skip. Throughput is reported per source byte.
 */
static void BM_Blit1to4_janis_aligned(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);

    for (auto _ : state) {
        Blit1to4_janis_aligned(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_Blit1to4_janis_aligned)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Benchmark driver for Blit1to4_janis_aligned with an all-zero lookup table:
 * after init() has filled the inputs, the 256 * 4 table bytes are cleared,
 * so every lookup yields 0 while the source bytes stay pseudo-random.
 * Square image, side length = benchmark argument, no row skip. Throughput is
 * reported per source byte.
 */
static void BM_Blit1to4_janis_aligned_zero(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);
    memset(sbi.table, 0, 256 * 4);

    for (auto _ : state) {
        Blit1to4_janis_aligned(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_Blit1to4_janis_aligned_zero)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Benchmark driver for the Blit_memset baseline. Same setup as the blit
 * benchmarks (square image, side length = benchmark argument, no row skip),
 * with the lookup table cleared after init(). Throughput is reported per
 * source byte (side * side per iteration), the same accounting the blit
 * benchmarks use.
 */
static void BM_memset(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);
    memset(sbi.table, 0, 256 * 4);

    for (auto _ : state) {
        Blit_memset(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_memset)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Benchmark driver for the Blit_memcpy baseline. Same setup as the blit
 * benchmarks (square image, side length = benchmark argument, no row skip),
 * with the lookup table cleared after init(). Throughput is reported per
 * source byte (side * side per iteration), the same accounting the blit
 * benchmarks use.
 */
static void BM_memcpy(benchmark::State& state) {
    const auto side = state.range(0);
    const auto pixels = side * side;
    Uint8 *const src_buf = new Uint8[pixels];
    Uint8 *const dst_buf = new Uint8[pixels * 4];

    SDL_BlitInfo sbi;
    sbi.src = src_buf;
    sbi.dst = dst_buf;
    sbi.table = real_palette;
    sbi.dst_w = side;
    sbi.dst_h = side;
    sbi.src_skip = 0;
    sbi.dst_skip = 0;
    init(&sbi);
    memset(sbi.table, 0, 256 * 4);

    for (auto _ : state) {
        Blit_memcpy(&sbi);
    }

    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(pixels));

    delete[] src_buf;
    delete[] dst_buf;
}
BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(2<<10)->Arg(4<<10);
/*
 * Program entry point: hands the command line to the benchmark library and
 * runs every benchmark registered through the BENCHMARK(...) lines in this
 * file. No manual RegisterBenchmark() call and no benchmark::Shutdown() call
 * is made.
 */
int main(int argc, char** argv) {
    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
Unable to determine clock rate from sysctl: hw.cpufrequency: No such file or directory
2021-09-18T13:05:20+02:00
Running ./bench
Run on (8 X 24.1206 MHz CPU s)
CPU Caches:
L1 Data 64 KiB (x8)
L1 Instruction 128 KiB (x8)
L2 Unified 4096 KiB (x8)
Load Average: 1.41, 1.67, 1.79
----------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
----------------------------------------------------------------------------------------------
BM_Blit1to4/8 16.0 ns 16.0 ns 41584241 bytes_per_second=3.72337G/s
BM_Blit1to4/64 947 ns 947 ns 744673 bytes_per_second=4.02659G/s
BM_Blit1to4/512 63372 ns 63371 ns 11145 bytes_per_second=3.85257G/s
BM_Blit1to4/1024 252167 ns 252163 ns 2772 bytes_per_second=3.87274G/s
BM_Blit1to4/2048 1003950 ns 1003946 ns 700 bytes_per_second=3.8909G/s
BM_Blit1to4/4096 4061885 ns 4061785 ns 172 bytes_per_second=3.84683G/s
BM_Blit1to4_duffs/8 23.3 ns 23.3 ns 30233532 bytes_per_second=2.5628G/s
BM_Blit1to4_duffs/64 1346 ns 1346 ns 517078 bytes_per_second=2.8335G/s
BM_Blit1to4_duffs/512 88163 ns 88162 ns 7966 bytes_per_second=2.76923G/s
BM_Blit1to4_duffs/1024 368285 ns 368281 ns 1899 bytes_per_second=2.65168G/s
BM_Blit1to4_duffs/2048 1471590 ns 1471584 ns 476 bytes_per_second=2.65445G/s
BM_Blit1to4_duffs/4096 5908542 ns 5908495 ns 111 bytes_per_second=2.6445G/s
BM_Blit1to4_janis/8 16.9 ns 16.9 ns 41453723 bytes_per_second=3.52575G/s
BM_Blit1to4_janis/64 957 ns 957 ns 731239 bytes_per_second=3.98593G/s
BM_Blit1to4_janis/512 63277 ns 63275 ns 11051 bytes_per_second=3.85843G/s
BM_Blit1to4_janis/1024 251139 ns 251137 ns 2786 bytes_per_second=3.88856G/s
BM_Blit1to4_janis/2048 1002765 ns 1002730 ns 699 bytes_per_second=3.89562G/s
BM_Blit1to4_janis/4096 4037985 ns 4037983 ns 173 bytes_per_second=3.86951G/s
BM_Blit1to4_janis_aligned/8 16.6 ns 16.6 ns 42072112 bytes_per_second=3.59699G/s
BM_Blit1to4_janis_aligned/64 956 ns 956 ns 732279 bytes_per_second=3.98882G/s
BM_Blit1to4_janis_aligned/512 63317 ns 63316 ns 11049 bytes_per_second=3.85591G/s
BM_Blit1to4_janis_aligned/1024 251312 ns 251309 ns 2784 bytes_per_second=3.8859G/s
BM_Blit1to4_janis_aligned/2048 998530 ns 998521 ns 701 bytes_per_second=3.91204G/s
BM_Blit1to4_janis_aligned/4096 4034439 ns 4034399 ns 173 bytes_per_second=3.87294G/s
BM_Blit1to4_janis_aligned_zero/8 16.6 ns 16.6 ns 42012268 bytes_per_second=3.58571G/s
BM_Blit1to4_janis_aligned_zero/64 957 ns 957 ns 731743 bytes_per_second=3.98668G/s
BM_Blit1to4_janis_aligned_zero/512 63346 ns 63345 ns 11050 bytes_per_second=3.85412G/s
BM_Blit1to4_janis_aligned_zero/1024 251606 ns 251604 ns 2783 bytes_per_second=3.88135G/s
BM_Blit1to4_janis_aligned_zero/2048 999055 ns 999030 ns 701 bytes_per_second=3.91004G/s
BM_Blit1to4_janis_aligned_zero/4096 4046626 ns 4046387 ns 173 bytes_per_second=3.86147G/s
BM_memset/8 31.0 ns 31.0 ns 22412408 bytes_per_second=1.92244G/s
BM_memset/64 184 ns 184 ns 3805734 bytes_per_second=20.7455G/s
BM_memset/512 16803 ns 16802 ns 41671 bytes_per_second=14.5301G/s
BM_memset/1024 74512 ns 74510 ns 9413 bytes_per_second=13.1065G/s
BM_memset/2048 680728 ns 680727 ns 967 bytes_per_second=5.73835G/s
BM_memset/4096 2963167 ns 2963136 ns 235 bytes_per_second=5.27313G/s
BM_memcpy/8 35.1 ns 35.1 ns 20043523 bytes_per_second=1.69974G/s
BM_memcpy/64 332 ns 332 ns 2111735 bytes_per_second=11.5018G/s
BM_memcpy/512 18137 ns 18137 ns 38616 bytes_per_second=13.461G/s
BM_memcpy/1024 95677 ns 95669 ns 7307 bytes_per_second=10.2077G/s
BM_memcpy/2048 652300 ns 652300 ns 1014 bytes_per_second=5.98843G/s
BM_memcpy/4096 3523019 ns 3523000 ns 198 bytes_per_second=4.43514G/s
Run on (4 X 1500 MHz CPU s)
Load Average: 0.29, 1.51, 1.30
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
----------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
----------------------------------------------------------------------------------------------
BM_Blit1to4/8 108 ns 108 ns 6450524 bytes_per_second=563.704M/s
BM_Blit1to4/64 5695 ns 5690 ns 120728 bytes_per_second=686.452M/s
BM_Blit1to4/512 384858 ns 384585 ns 1818 bytes_per_second=650.052M/s
BM_Blit1to4/1024 1737759 ns 1734079 ns 404 bytes_per_second=576.675M/s
BM_Blit1to4/2048 6973408 ns 6956302 ns 101 bytes_per_second=575.018M/s
BM_Blit1to4/4096 35186953 ns 35090453 ns 18 bytes_per_second=455.964M/s
BM_Blit1to4_duffs/8 108 ns 108 ns 6467720 bytes_per_second=564.11M/s
BM_Blit1to4_duffs/64 5707 ns 5703 ns 117894 bytes_per_second=684.899M/s
BM_Blit1to4_duffs/512 393705 ns 393433 ns 1728 bytes_per_second=635.432M/s
BM_Blit1to4_duffs/1024 1727725 ns 1724378 ns 405 bytes_per_second=579.919M/s
BM_Blit1to4_duffs/2048 6950270 ns 6931980 ns 101 bytes_per_second=577.036M/s
BM_Blit1to4_duffs/4096 34990708 ns 34902436 ns 18 bytes_per_second=458.421M/s
BM_Blit1to4_janis/8 105 ns 105 ns 6676132 bytes_per_second=582.156M/s
BM_Blit1to4_janis/64 5646 ns 5642 ns 118151 bytes_per_second=692.362M/s
BM_Blit1to4_janis/512 392460 ns 392156 ns 1735 bytes_per_second=637.501M/s
BM_Blit1to4_janis/1024 1729658 ns 1726145 ns 405 bytes_per_second=579.326M/s
BM_Blit1to4_janis/2048 6957858 ns 6940140 ns 100 bytes_per_second=576.357M/s
BM_Blit1to4_janis/4096 34863384 ns 34765160 ns 18 bytes_per_second=460.231M/s
BM_Blit1to4_janis_aligned/8 106 ns 106 ns 6633877 bytes_per_second=578.385M/s
BM_Blit1to4_janis_aligned/64 5657 ns 5653 ns 118096 bytes_per_second=690.989M/s
BM_Blit1to4_janis_aligned/512 392742 ns 392469 ns 1736 bytes_per_second=636.992M/s
BM_Blit1to4_janis_aligned/1024 1728761 ns 1725408 ns 405 bytes_per_second=579.573M/s
BM_Blit1to4_janis_aligned/2048 6959338 ns 6939785 ns 100 bytes_per_second=576.387M/s
BM_Blit1to4_janis_aligned/4096 34906249 ns 34815391 ns 18 bytes_per_second=459.567M/s
BM_Blit1to4_janis_aligned_zero/8 106 ns 106 ns 6632627 bytes_per_second=578.382M/s
BM_Blit1to4_janis_aligned_zero/64 5640 ns 5636 ns 119403 bytes_per_second=693.066M/s
BM_Blit1to4_janis_aligned_zero/512 393510 ns 393206 ns 1736 bytes_per_second=635.799M/s
BM_Blit1to4_janis_aligned_zero/1024 1728387 ns 1724770 ns 405 bytes_per_second=579.788M/s
BM_Blit1to4_janis_aligned_zero/2048 6953788 ns 6936679 ns 100 bytes_per_second=576.645M/s
BM_Blit1to4_janis_aligned_zero/4096 34902772 ns 34804518 ns 18 bytes_per_second=459.71M/s
BM_memset/8 78.9 ns 78.8 ns 8874246 bytes_per_second=774.597M/s
BM_memset/64 1714 ns 1713 ns 357216 bytes_per_second=2.22725G/s
BM_memset/512 310023 ns 309789 ns 2113 bytes_per_second=807.001M/s
BM_memset/1024 1387825 ns 1386778 ns 495 bytes_per_second=721.096M/s
BM_memset/2048 5672146 ns 5667385 ns 123 bytes_per_second=705.793M/s
BM_memset/4096 27291624 ns 27257563 ns 24 bytes_per_second=586.993M/s
BM_memcpy/8 88.9 ns 88.8 ns 7874357 bytes_per_second=687.215M/s
BM_memcpy/64 1767 ns 1766 ns 360358 bytes_per_second=2.16045G/s
BM_memcpy/512 298864 ns 298655 ns 2088 bytes_per_second=837.087M/s
BM_memcpy/1024 1384263 ns 1383143 ns 494 bytes_per_second=722.991M/s
BM_memcpy/2048 5701829 ns 5697497 ns 123 bytes_per_second=702.063M/s
BM_memcpy/4096 27279868 ns 27243797 ns 24 bytes_per_second=587.29M/s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment