Skip to content

Instantly share code, notes, and snippets.

@fengyuentau
Last active January 15, 2024 03:15
Show Gist options
  • Save fengyuentau/20af905a3ee0dbf5da93c9befc6a9841 to your computer and use it in GitHub Desktop.
Save fengyuentau/20af905a3ee0dbf5da93c9befc6a9841 to your computer and use it in GitHub Desktop.
OpenCL Benchmark C++
# Build script for the CLBlast GEMM benchmark executable `main` (built from main.cpp).
cmake_minimum_required(VERSION 3.13)
project("CLBlast performance test")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# main.cpp includes CL/opencl.hpp and calls clWaitForEvents()/clReleaseEvent()
# directly, so OpenCL is a hard requirement: fail at configure time if it is missing.
find_package(OpenCL REQUIRED)

# CLBlast is looked up in a machine-specific install prefix; adjust the hint locally.
# NOTE(review): the CLBlast package config may only export the `clblast` target, in
# which case CLBlast_INCLUDE_DIRS / CLBlast_LIBS print as empty -- confirm.
find_package(CLBlast HINTS "/home/opencv-cn/Workspace/others/CLBlast/build/install")
message(STATUS "CLBlast_FOUND=${CLBlast_FOUND}, CLBlast_INCLUDE_DIRS=${CLBlast_INCLUDE_DIRS}, CLBlast_LIBS=${CLBlast_LIBS}")

# main.cpp includes opencv2/opencv.hpp, so OpenCV is a hard requirement as well.
find_package(OpenCV 4.9.0 REQUIRED HINTS "/home/opencv-cn/Workspace/opencv/build/pre-4.9.0/install")
message(STATUS "OpenCV_FOUND=${OpenCV_FOUND}, OpenCV_INCLUDE_DIRS=${OpenCV_INCLUDE_DIRS}, OpenCV_LIBS=${OpenCV_LIBS}")

include_directories("/home/opencv-cn/Workspace/others/CLBlast/build/install/include")
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(main main.cpp)
# Link OpenCL explicitly instead of relying on it arriving transitively through clblast.
# OpenCV_LIBS is a list, so it is passed unquoted to expand into separate arguments.
target_link_libraries(main "clblast" OpenCL::OpenCL ${OpenCV_LIBS})
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

#include "opencv2/opencv.hpp"

// The OpenCL version macros must be defined before the OpenCL headers are included.
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_TARGET_OPENCL_VERSION 120
#include "CL/opencl.hpp"
#include "clblast.h"
/// Matrix shape expressed as a list of dimension sizes.
using Shape = std::vector<int>;

/// Describes one GEMM benchmark case: the shapes of the operands and whether
/// A and/or B are to be treated as transposed.
struct TestGemmParam {
    Shape a;               ///< Shape of operand A.
    Shape b;               ///< Shape of operand B.
    Shape c;               ///< Shape of the optional operand C; empty when not given.
    bool trans_a = false;  ///< True when A is to be treated as transposed.
    bool trans_b = false;  ///< True when B is to be treated as transposed.

    /// Builds a benchmark case.
    ///
    /// Deliberately not `explicit`: tables of cases are written with
    /// copy-list-initialization, e.g. `{ {768, 768}, {768, 768}, {768} }`.
    ///
    /// @param A      Shape of A.
    /// @param B      Shape of B.
    /// @param C      Shape of C; defaults to an empty shape.
    /// @param transA Whether A is transposed; defaults to false.
    /// @param transB Whether B is transposed; defaults to false.
    TestGemmParam(Shape A, Shape B, Shape C = {}, bool transA = false, bool transB = false)
        // The shapes are taken by value, so move them into place instead of copying twice.
        : a(std::move(A)), b(std::move(B)), c(std::move(C)), trans_a(transA), trans_b(transB) {}
};
// Benchmark cases, each given as (shape of A, shape of B[, shape of C]).
// The transposition flags are left at their defaults (false) for every case.
static const TestGemmParam test_configs[] = {
    // Square operands, with a one-element third shape.
    TestGemmParam(Shape{768, 768}, Shape{768, 768}, Shape{768}),
    TestGemmParam(Shape{1024, 1024}, Shape{1024, 1024}, Shape{1024}),
    // Rectangular operands, no third shape.
    TestGemmParam(Shape{50, 768}, Shape{768, 2304}),
    TestGemmParam(Shape{197, 768}, Shape{768, 2304}),
    TestGemmParam(Shape{50, 1024}, Shape{1024, 3072}),
    TestGemmParam(Shape{197, 1024}, Shape{1024, 3072}),
};
int main() {
// OpenCL platform
auto platforms = std::vector<cl::Platform>();
cl::Platform::get(&platforms);
if (platforms.size() == 0) {
std::cerr << "Cannot get OpenCL platforms" << std::endl;
return 1;
}
for (size_t i = 0; i < platforms.size(); i++) {
std::string platform_name;
auto error = platforms[i].getInfo(CL_PLATFORM_NAME, &platform_name);
std::cout << platform_name << std::endl;
}
auto platform = platforms[2];
// OpenCL device
auto devices = std::vector<cl::Device>();
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.size() == 0) {
std::cerr << "Cannot get OpenCL devices" << std::endl;
return 1;
}
auto device = devices[0];
// OpenCL context, queue
auto context = cl::Context(std::vector<cl::Device>{device});
auto queue = cl::CommandQueue(context, device);
for (auto config : test_configs) {
Shape a_shape = config.a;
Shape b_shape = config.b;
Shape c_shape = config.c;
bool trans_a = config.trans_a;
bool trans_b = config.trans_b;
int M = trans_a ? a_shape.back() : a_shape[0],
N = trans_b ? b_shape[0] : b_shape.back(),
K = trans_b ? b_shape.back() : b_shape[0],
lda = a_shape.back(),
ldb = b_shape.back(),
ldc = N;
cv::Mat A(a_shape, CV_32FC1),
B(b_shape, CV_32FC1);
auto C = c_shape.empty() ? cv::Mat::zeros(M, N, CV_32FC1) : cv::Mat(c_shape, CV_32FC1);
auto Y = cv::Mat(std::vector<int>{M, N}, CV_32FC1);
std::memset(Y.ptr<float>(), 0, Y.total() * sizeof(float));
cv::randn(A, 0.f, 1.f);
cv::randn(B, 0.f, 1.f);
if (!c_shape.empty()) {
cv::randn(C, 0.f, 1.f);
}
// Copy cv::Mat to device
auto A_device = cl::Buffer(context, CL_MEM_READ_WRITE, A.total() * sizeof(float));
auto B_device = cl::Buffer(context, CL_MEM_READ_WRITE, B.total() * sizeof(float));
// auto C_device = cl::Buffer(context, CL_MEM_READ_WRITE, C.total() * sizeof(float));
auto Y_device = cl::Buffer(context, CL_MEM_READ_WRITE, Y.total() * sizeof(float));
queue.enqueueWriteBuffer(A_device, CL_TRUE, 0, A.total() * sizeof(float), A.ptr<const float>());
queue.enqueueWriteBuffer(B_device, CL_TRUE, 0, B.total() * sizeof(float), B.ptr<const float>());
// queue.enqueueWriteBuffer(C_device, CL_TRUE, 0, C.total() * sizeof(float), C.ptr<const float>());
queue.enqueueWriteBuffer(Y_device, CL_TRUE, 0, Y.total() * sizeof(float), Y.ptr<const float>());
auto event = cl_event{nullptr};
// Warmup
auto queue_plain = queue();
auto status = clblast::Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo,
clblast::Transpose::kNo,
M, N, K,
1.f, // alpha
A_device(), 0, lda,
B_device(), 0, ldb,
0.f, // beta
Y_device(), 0, ldc,
&queue_plain, &event);
if (status == clblast::StatusCode::kSuccess) {
clWaitForEvents(1, &event);
}
// Benchmark
std::vector<double> times;
cv::TickMeter meter;
for (int i = 0; i < 10; i++) {
meter.reset();
meter.start();
auto status = clblast::Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo,
clblast::Transpose::kNo,
M, N, K,
1.f, // alpha
A_device(), 0, lda,
B_device(), 0, ldb,
0.f, // beta
Y_device(), 0, ldc,
&queue_plain, &event);
if (status == clblast::StatusCode::kSuccess) {
clWaitForEvents(1, &event);
}
meter.stop();
times.push_back(meter.getTimeMilli());
}
meter.reset();
clReleaseEvent(event);
clReleaseMemObject(A_device.get());
clReleaseMemObject(B_device.get());
clReleaseMemObject(Y_device.get());
// Handle results
std::sort(times.begin(), times.end());
double mean = std::accumulate(times.begin(), times.end(), decltype(times)::value_type(0)) / times.size();
double median = (times[4] + times[5]) / 2;
double minimum = times[0];
std::string str_a_shape = cv::format("[%d, %d]", a_shape[0], a_shape[1]);
std::string str_b_shape = cv::format("[%d, %d]", b_shape[0], b_shape[1]);
std::cout << cv::format("A=%s, B=%s, mean=%.2f, median=%.2f, min=%.2f\n", str_a_shape.c_str(), str_b_shape.c_str(), mean, median, minimum);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment