Skip to content

Instantly share code, notes, and snippets.

@dmah42
Created June 5, 2020 15:10
Show Gist options
  • Save dmah42/f88f3f22dbbbfcb9530187ea206bae1b to your computer and use it in GitHub Desktop.
Save dmah42/f88f3f22dbbbfcb9530187ea206bae1b to your computer and use it in GitHub Desktop.
#include "benchmark/benchmark.h"
#include "cblas.h"
#include <memory>
// Benchmarks a square, column-major single-precision GEMM
// (C = 1 * A * B^T + 1 * C) whose operand buffers are owned by
// std::unique_ptr. state.range(0) supplies the matrix dimension.
static void BM_GEMM_unique_ptr(benchmark::State& state) {
  const auto dim = state.range(0);
  const auto count = dim * dim;
  // Use the array specialization so the buffers are released with delete[];
  // unique_ptr<float> would call plain delete on array storage, which is
  // undefined behaviour. The trailing () value-initializes the elements so
  // sgemm never reads indeterminate floats.
  std::unique_ptr<float[]> A(new float[count]());
  std::unique_ptr<float[]> B(new float[count]());
  std::unique_ptr<float[]> C(new float[count]());
  //openblas_set_num_threads(1);
  for (auto _ : state) {
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans,
                dim,  // M
                dim,  // N
                dim,  // K
                1,    // alpha
                A.get(),
                dim,  // lda
                B.get(),
                dim,  // ldb
                1,    // beta
                C.get(),
                dim);  // ldc
    // Keep the result pointer observable so the call is not optimized away.
    benchmark::DoNotOptimize(C.get());
  }
}
// Benchmarks a square, column-major single-precision GEMM
// (C = 1 * A * B^T + 1 * C) whose operand buffers are plain new[]/delete[]
// allocations. state.range(0) supplies the matrix dimension.
static void BM_GEMM_raw_ptr(benchmark::State& state) {
  const auto dim = state.range(0);
  const auto count = dim * dim;
  // Raw owning pointers are deliberate here: this is the baseline the
  // unique_ptr variant is compared against. The trailing () value-initializes
  // the elements so sgemm never reads indeterminate floats.
  float* A = new float[count]();
  float* B = new float[count]();
  float* C = new float[count]();
  //openblas_set_num_threads(1);
  for (auto _ : state) {
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans,
                dim,  // M
                dim,  // N
                dim,  // K
                1,    // alpha
                A,
                dim,  // lda
                B,
                dim,  // ldb
                1,    // beta
                C,
                dim);  // ldc
    // Keep the result pointer observable so the call is not optimized away.
    benchmark::DoNotOptimize(C);
  }
  // Release in reverse order of allocation.
  delete[] C;
  delete[] B;
  delete[] A;
}
// Register both variants over the same argument range (4 to 1024); each
// function reads the argument via state.range(0) and uses it as the matrix
// dimension.
BENCHMARK(BM_GEMM_unique_ptr)->Range(4, 1024);
BENCHMARK(BM_GEMM_raw_ptr)->Range(4, 1024);
$ clang++-5.0 -std=c++11 ./blas_bm.cc -I ~/git/benchmark/include/ -L ~/git/benchmark/build/src/ -lbenchmark_main -lbenchmark -lopenblas -pthread
$ ./a.out
2020-06-05 16:09:38
Running ./a.out
Run on (4 X 3900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 4096 KiB (x1)
Load Average: 3.57, 3.55, 3.11
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------
BM_GEMM_unique_ptr/4 465 ns 448 ns 1974826
BM_GEMM_unique_ptr/8 321 ns 321 ns 1889871
BM_GEMM_unique_ptr/64 10502 ns 10499 ns 65323
BM_GEMM_unique_ptr/512 3810300 ns 2597048 ns 387
BM_GEMM_unique_ptr/1024 31456758 ns 18933383 ns 49
BM_GEMM_raw_ptr/4 255 ns 254 ns 2978025
BM_GEMM_raw_ptr/8 337 ns 322 ns 2221654
BM_GEMM_raw_ptr/64 10354 ns 10121 ns 72125
BM_GEMM_raw_ptr/512 6574122 ns 2887697 ns 339
BM_GEMM_raw_ptr/1024 22505692 ns 17119469 ns 47
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment