Skip to content

Instantly share code, notes, and snippets.

@mcleary
Created May 14, 2020 09:42
Show Gist options
  • Save mcleary/2ecdf5edbe6d16426655ab1711a3039f to your computer and use it in GitHub Desktop.
Save mcleary/2ecdf5edbe6d16426655ab1711a3039f to your computer and use it in GitHub Desktop.
Simple OpenCL example
#include <cstring>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>
#ifdef __APPLE__
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
#define NUM_GLOBAL_WITEMS 1024
// Prints a timing report for one benchmark version: the raw CPU and GPU
// times plus the speed ratio between them, phrased as "faster" or "slower".
void compareResults(double CPUtime, double GPUtime, int trial) {
    const double ratio = CPUtime / GPUtime;
    std::cout << "VERSION " << trial << " -----------" << std::endl;
    std::cout << "CPU time: " << CPUtime << std::endl;
    std::cout << "GPU time: " << GPUtime << std::endl;
    std::cout << "GPU is ";
    if (ratio > 1) {
        std::cout << ratio << " times faster!" << std::endl;
    } else {
        std::cout << (1 / ratio) << " times slower :(" << std::endl;
    }
}
// Adds two n-element int vectors k times on the CPU and returns the total
// elapsed processor time in seconds (measured with std::clock).
double timeAddVectorsCPU(int n, int k) {
    std::vector<int> lhs(n), rhs(n), sum(n, 0);
    for (int idx = 0; idx < n; ++idx) {
        lhs[idx] = idx;        // lhs = 0, 1, ..., n-1
        rhs[idx] = n - idx;    // rhs = n, n-1, ..., 1
    }
    const std::clock_t begin = std::clock();
    for (int rep = 0; rep < k; ++rep) {
        for (int idx = 0; idx < n; ++idx) {
            sum[idx] = lhs[idx] + rhs[idx];
        }
    }
    return (std::clock() - begin) / (double)CLOCKS_PER_SEC;
}
// Runs the simple "add" kernel a few times to warm up the GPU/driver before
// the timed benchmark runs, so one-time initialization cost is not measured.
// A and B are the input arrays (at least n ints each); results are read into
// a local buffer and discarded.
// NOTE(review): the kernel is launched with NUM_GLOBAL_WITEMS (1024) work
// items regardless of n — assumes n >= 1024 so reads stay in bounds; confirm.
void warmup(cl::Context& context, cl::CommandQueue& queue,
cl::Kernel& add, int A[], int B[], int n) {
std::vector<int> C(n);
// allocate device buffers for the two inputs and the output
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n);
// push write commands to queue (CL_TRUE = blocking writes)
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B);
// bind kernel arguments (order of setArg calls doesn't matter; the
// first parameter is the argument index)
add.setArg(1, buffer_B);
add.setArg(0, buffer_A);
add.setArg(2, buffer_C);
// launch 5 times: 1024 global work items in groups of 32
for (int i = 0; i < 5; i++)
queue.enqueueNDRangeKernel(add, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32));
// blocking read forces the kernels to complete before we return
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data());
queue.finish();
}
// Benchmarks vector addition (C = A + B, n elements, k repetitions) on the
// CPU and on the GPU via two OpenCL kernel variants, then reports timings.
//
// Usage: no argument, or first argument "0" -> human-readable report;
// any other argument -> one CSV line "CPUtime,GPUtime1,GPUtime2".
int main(int argc, char* argv[])
{
    bool verbose;
    if (argc == 1 || std::strcmp(argv[1], "0") == 0)
        verbose = true;
    else
        verbose = false;
    // BUG FIX: removed a leftover debug statement ("verbose = 1;") that
    // unconditionally forced verbose mode and made the CLI switch dead code.

    const int n = 8 * 32 * 512; // size of vectors
    const int k = 1000;         // number of loop iterations
    // const int NUM_GLOBAL_WITEMS = 1024; // number of threads

    // get all platforms (drivers), e.g. NVIDIA
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);
    if (all_platforms.size() == 0) {
        std::cout << " No platforms found. Check OpenCL installation!\n";
        exit(1);
    }
    // BUG FIX: use index 0 — all_platforms[1] reads out of bounds on
    // systems with a single OpenCL platform (the common case).
    cl::Platform default_platform = all_platforms[0];
    std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";

    // get default device (CPUs, GPUs) of the default platform
    std::vector<cl::Device> all_devices;
    default_platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
    if (all_devices.size() == 0) {
        std::cout << " No devices found. Check OpenCL installation!\n";
        exit(1);
    }
    cl::Device default_device = all_devices[0];
    std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";

    cl::Context context({ default_device });
    cl::Program::Sources sources;

    // OpenCL C source, built at runtime. Kernels:
    //   add          — one element per work item
    //   add_looped_1 — each work item handles a contiguous chunk, k times
    //   add_looped_2 — each work item strides through the array, k times
    //   add_single   — one element per work item, repeated k times
    // calculates for each element; C = A + B
    std::string kernel_code =
        " void kernel add(global const int* v1, global const int* v2, global int* v3) {"
        " int ID;"
        " ID = get_global_id(0);"
        " v3[ID] = v1[ID] + v2[ID];"
        " }"
        ""
        " void kernel add_looped_1(global const int* v1, global const int* v2, global int* v3, "
        " const int n, const int k) {"
        " int ID, NUM_GLOBAL_WITEMS, ratio, start, stop;"
        " ID = get_global_id(0);"
        " NUM_GLOBAL_WITEMS = get_global_size(0);"
        ""
        " ratio = (n / NUM_GLOBAL_WITEMS);" // elements per thread
        " start = ratio * ID;"
        " stop = ratio * (ID+1);"
        ""
        " int i, j;" // will the compiler optimize this anyway? probably.
        " for (i=0; i<k; i++) {"
        " for (j=start; j<stop; j++)"
        " v3[j] = v1[j] + v2[j];"
        " }"
        " }"
        ""
        " void kernel add_looped_2(global const int* v1, global const int* v2, global int* v3,"
        " const int n, const int k) {"
        " int ID, NUM_GLOBAL_WITEMS, step;"
        " ID = get_global_id(0);"
        " NUM_GLOBAL_WITEMS = get_global_size(0);"
        " step = (n / NUM_GLOBAL_WITEMS);"
        ""
        " int i,j;"
        " for (i=0; i<k; i++) {"
        " for (j=ID; j<n; j+=step)"
        " v3[j] = v1[j] + v2[j];"
        " }"
        " }"
        ""
        " void kernel add_single(global const int* v1, global const int* v2, global int* v3, "
        " const int k) { "
        " int ID = get_global_id(0);"
        " for (int i=0; i<k; i++)"
        " v3[ID] = v1[ID] + v2[ID];"
        " }";
    sources.push_back({ kernel_code.c_str(), kernel_code.length() });

    cl::Program program(context, sources);
    if (program.build({ default_device }) != CL_SUCCESS) {
        std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
        exit(1);
    }

    // run the CPU code
    // BUG FIX: keep the full double returned by timeAddVectorsCPU; it was
    // previously narrowed to float, losing precision in the comparison.
    double CPUtime = timeAddVectorsCPU(n, k);

    // set up kernels and vectors for GPU code
    cl::CommandQueue queue(context, default_device);
    cl::Kernel add = cl::Kernel(program, "add");
    cl::Kernel add_looped_1 = cl::Kernel(program, "add_looped_1");
    cl::Kernel add_looped_2 = cl::Kernel(program, "add_looped_2");
    cl::Kernel add_single = cl::Kernel(program, "add_single");

    // construct input vectors
    std::vector<int> A(n);
    std::vector<int> B(n);
    std::vector<int> C(n);
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = n - i - 1; // so A[i] + B[i] == n - 1 for every i
    }

    // attempt at warm-up...
    warmup(context, queue, add, A.data(), B.data(), n);
    queue.finish();

    std::clock_t start_time;

    // VERSION 1 ==========================================
    // NOTE(review): this version's timing includes buffer allocation and the
    // host->device writes, while VERSION 2 times only the kernel and the
    // read-back — the two timings measure different scopes.
    double GPUtime1;
    start_time = std::clock();
    // allocate space
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    // push write commands to queue
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A.data());
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B.data());
    // RUN ZE KERNEL
    add_looped_1.setArg(0, buffer_A);
    add_looped_1.setArg(1, buffer_B);
    add_looped_1.setArg(2, buffer_C);
    add_looped_1.setArg(3, n);
    add_looped_1.setArg(4, k);
    queue.enqueueNDRangeKernel(add_looped_1, cl::NullRange,     // kernel, offset
                               cl::NDRange(NUM_GLOBAL_WITEMS), // global number of work items
                               cl::NDRange(32));               // local number (per group)
    // read result from GPU to here; including for the sake of timing
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data());
    queue.finish();
    GPUtime1 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC;

    // VERSION 2 ==========================================
    double GPUtime2;
    cl::Buffer buffer_A2(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_B2(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    cl::Buffer buffer_C2(context, CL_MEM_READ_WRITE, sizeof(int) * n);
    queue.enqueueWriteBuffer(buffer_A2, CL_TRUE, 0, sizeof(int) * n, A.data());
    queue.enqueueWriteBuffer(buffer_B2, CL_TRUE, 0, sizeof(int) * n, B.data());
    start_time = std::clock();
    add_looped_2.setArg(0, buffer_A2);
    add_looped_2.setArg(1, buffer_B2);
    add_looped_2.setArg(2, buffer_C2);
    add_looped_2.setArg(3, n);
    add_looped_2.setArg(4, k);
    queue.enqueueNDRangeKernel(add_looped_2, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32));
    queue.enqueueReadBuffer(buffer_C2, CL_TRUE, 0, sizeof(int) * n, C.data());
    queue.finish();
    GPUtime2 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC;

    // let's compare!
    const int NUM_VERSIONS = 2;
    double GPUtimes[NUM_VERSIONS] = { GPUtime1, GPUtime2 };
    if (verbose) {
        for (int i = 0; i < NUM_VERSIONS; i++)
            compareResults(CPUtime, GPUtimes[i], i + 1);
    }
    else {
        std::cout << CPUtime << ",";
        for (int i = 0; i < NUM_VERSIONS - 1; i++)
            std::cout << GPUtimes[i] << ",";
        std::cout << GPUtimes[NUM_VERSIONS - 1] << std::endl;
    }
    return 0;
}
@leguilc
Copy link

leguilc commented Oct 20, 2020

I started to read your code because of your Timer class and you don't even use it here ;-)
https://gist.github.com/mcleary/b0bf4fa88830ff7c882d

@mcleary
Copy link
Author

mcleary commented Oct 20, 2020

I started to read your code because of your Timer class and you don't even use it here ;-)
https://gist.github.com/mcleary/b0bf4fa88830ff7c882d

That is true, however, this OpenCL sample was meant to be used as single file to test OpenCL compilation and stuff like that. I did the Timer class as a gist for quick reference, but I do use a similar version in my Atmosphere demo here: https://github.com/mcleary/pbr/blob/master/pbr/main.cpp

Another thing, I wrote that Timer class 4 years ago. There was a lot I didn't know at the time on how to use std::chrono

@leguilc
Copy link

leguilc commented Oct 20, 2020 via email

@mcleary
Copy link
Author

mcleary commented Oct 20, 2020

Don't worry, all feedback is welcome.

  • When I opened some classes, I noticed that the style was not the same
    everywhere,

I know that, and I also use clang-format extensively, but I didn't bother to use it in my personal project

  • virtual and override are redundant, only the second one really
    matters. If it overrides, it must be a virtual function

I know that as well but I probably didn't at the time I wrote the code for the first time

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment