@marty1885 · Created September 2, 2023 10:05
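// Benchmark: fp16 matrix multiplication on a Rockchip NPU through the
// rknn_matmul API versus fp32 cblas_sgemm on the CPU. Prints the average
// seconds per multiplication as CSV for square sizes 256 to 2048.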
#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cstring>
#include <cstdio>  // printf
#include <cstdlib> // malloc, free, abort
#include <array>

#include <cblas.h>
#include <rknn_matmul_api.h>

typedef __fp16 float16;
// Holds one prepared NPU matmul: the RKNN context, its shape/type info,
// the three DMA buffers (A, B, C), and typed views into them.
typedef struct _rknn_matmul_ctx
{
    rknn_context ctx;
    rknn_matmul_info info;
    rknn_matmul_io_attr io_attr;
    rknn_tensor_mem* A;
    rknn_tensor_mem* B;
    rknn_tensor_mem* C;
    float16* a; // fp16 view of A->virt_addr
    float16* b; // fp16 view of B->virt_addr
    float* c;   // fp32 view of C->virt_addr
    int32_t M;
    int32_t K;
    int32_t N;
} RKNNMatMulCtx;
RKNNMatMulCtx* make_matmul(int32_t M, int32_t K, int32_t N)
{
    RKNNMatMulCtx* ctx = (RKNNMatMulCtx*)malloc(sizeof(RKNNMatMulCtx));
    memset(ctx, 0, sizeof(RKNNMatMulCtx));
    ctx->info.M = M;
    ctx->info.K = K;
    ctx->info.N = N;
    ctx->info.type = RKNN_TENSOR_FLOAT16;
    ctx->info.native_layout = 1; // let the NPU use its preferred layouts
    ctx->info.perf_layout = 1;
    int ret = rknn_matmul_create(&ctx->ctx, &ctx->info, &ctx->io_attr);
    if (ret < 0) {
        printf("rknn_matmul_create fail! ret=%d\n", ret);
        abort();
    }
    // Allocate DMA-able buffers for A, B and C at the sizes the runtime
    // reported, then bind them to the context.
    ctx->A = rknn_create_mem(ctx->ctx, ctx->io_attr.A.size);
    ctx->B = rknn_create_mem(ctx->ctx, ctx->io_attr.B.size);
    ctx->C = rknn_create_mem(ctx->ctx, ctx->io_attr.C.size);
    ctx->M = M;
    ctx->K = K;
    ctx->N = N;
    ctx->a = (float16*)ctx->A->virt_addr;
    ctx->b = (float16*)ctx->B->virt_addr;
    ctx->c = (float*)ctx->C->virt_addr;
    rknn_matmul_set_io_mem(ctx->ctx, ctx->A, &ctx->io_attr.A);
    rknn_matmul_set_io_mem(ctx->ctx, ctx->B, &ctx->io_attr.B);
    rknn_matmul_set_io_mem(ctx->ctx, ctx->C, &ctx->io_attr.C);
    return ctx;
}
// Convert a row-major fp32 buffer to fp16 inside an RKNN buffer and
// re-bind it. Note: this fills mem->size / sizeof(float16) elements, so
// `data` must be at least that large.
void set_matrix_data(rknn_matmul_ctx* ctx, rknn_tensor_mem* mem, rknn_matmul_tensor_attr* attr, const float* data)
{
    size_t size = mem->size / sizeof(float16);
    float16* ptr = (float16*)mem->virt_addr;
    for (size_t i = 0; i < size; ++i) {
        ptr[i] = (float16)data[i];
    }
    rknn_matmul_set_io_mem(*ctx, mem, attr);
}
void free_matmul(RKNNMatMulCtx* ctx)
{
    rknn_destroy_mem(ctx->ctx, ctx->A);
    rknn_destroy_mem(ctx->ctx, ctx->B);
    rknn_destroy_mem(ctx->ctx, ctx->C);
    rknn_matmul_destroy(ctx->ctx);
    free(ctx);
}
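// Illustrative helper (not part of the original gist): a one-shot matmul
// that wires the pieces above together. Creating a context per call is
// expensive, which is why the benchmark below keeps one context alive and
// reuses it across runs. Also note that with perf_layout enabled the NPU
// may expect a tiled layout rather than plain row-major, so treat this as
// a sketch of the call sequence rather than a verified kernel.
std::vector<float> matmul_rknn_once(const std::vector<float>& A,
                                    const std::vector<float>& B,
                                    int32_t M, int32_t K, int32_t N)
{
    RKNNMatMulCtx* ctx = make_matmul(M, K, N);
    set_matrix_data(&ctx->ctx, ctx->A, &ctx->io_attr.A, A.data());
    set_matrix_data(&ctx->ctx, ctx->B, &ctx->io_attr.B, B.data());
    rknn_matmul_run(ctx->ctx);
    std::vector<float> C(ctx->c, ctx->c + (size_t)M * N);
    free_matmul(ctx);
    return C;
}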
template <typename T>
std::vector<T> make_random_matrix(size_t M, size_t N) {
    std::vector<T> A(M * N);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<T> dis(0.0, 1.0);
    for (size_t i = 0; i < M * N; ++i) {
        A[i] = dis(gen);
    }
    return A;
}
std::vector<float> matmul_naive(const std::vector<float>& A,
                                const std::vector<float>& B, size_t M, size_t K, size_t N) {
    std::vector<float> C(M * N);
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < N; ++j) {
            float sum = 0.0f;
            for (size_t k = 0; k < K; ++k) {
                sum += A[i * K + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
    return C;
}
std::vector<float> matmul_cblas(const std::vector<float>& A,
                                const std::vector<float>& B, size_t M, size_t K, size_t N) {
    std::vector<float> C(M * N);
    // A is M x K and B is K x N, both row-major, matching matmul_naive,
    // so neither operand is transposed.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0f,
                A.data(), K, B.data(), N, 0.0f, C.data(), N);
    return C;
}
template <typename Func>
float benchmark(size_t M, size_t K, size_t N, size_t repeat, Func func) {
    auto A = make_random_matrix<float>(M, K);
    auto B = make_random_matrix<float>(K, N);
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < repeat; ++i) {
        auto C = func(A, B, M, K, N);
    }
    auto end = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    // Average seconds per run.
    return elapsed / 1000.0f / repeat;
}
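// Optional helper (an addition, not in the gist): convert the average
// seconds per run reported by benchmark() into GFLOP/s, counting the
// usual 2*M*K*N floating-point operations per matmul.
double gflops(size_t M, size_t K, size_t N, float seconds)
{
    return 2.0 * (double)M * K * N / seconds / 1e9;
}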
std::vector<float> bench_all(size_t M, size_t K, size_t N, size_t repeat)
{
    std::vector<float> ret;
    ret.push_back(benchmark(M, K, N, repeat, matmul_cblas));
    auto ctx = make_matmul(M, K, N);
    auto rknn = [ctx, first = true](const std::vector<float>& A, const std::vector<float>& B,
                                    size_t M, size_t K, size_t N) mutable {
        // Upload A and B on the first run only, so steady-state timings
        // measure rknn_matmul_run plus copying C back out. The flag is a
        // per-closure capture rather than a `static`, since a static would
        // stay false across the different sizes benchmarked in main().
        if (first) {
            first = false;
            set_matrix_data(&ctx->ctx, ctx->A, &ctx->io_attr.A, A.data());
            set_matrix_data(&ctx->ctx, ctx->B, &ctx->io_attr.B, B.data());
        }
        rknn_matmul_run(ctx->ctx);
        return std::vector<float>(ctx->c, ctx->c + M * N);
    };
    ret.push_back(benchmark(M, K, N, repeat, rknn));
    free_matmul(ctx); // one context per size; release it once timed
    return ret;
}
int main()
{
    std::vector<size_t> sizes = {256, 512, 1024, 2048};
    size_t repeat = 10;
    std::cout << "size,cblas,rknn" << std::endl;
    for (auto size : sizes) {
        auto ret = bench_all(size, size, size, repeat);
        std::cout << size << "," << ret[0] << "," << ret[1] << std::endl;
    }
    return 0;
}
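// Build sketch (library names and flags are assumptions; adjust for your
// board and SDK install, e.g. an RK3588 with OpenBLAS and librknnrt):
//   g++ -std=c++14 -O2 -o matmul_bench matmul_bench.cpp -lopenblas -lrknnrt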