// Scraped from a GitHub gist page (page navigation text removed):
//   gist eyalroz/8312f7d417d13ef00f192b38d2df097d, created May 16, 2022 19:44.
#include <math.h>
#include <stdio.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <utility>
#include <vector>

// cl::Error is only declared when the OpenCL C++ bindings are compiled with
// exceptions enabled; the macro must be defined before cl.hpp is included.
#ifndef __CL_ENABLE_EXCEPTIONS
#define __CL_ENABLE_EXCEPTIONS
#endif

#ifdef __APPLE__
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
// OpenCL C source of the vecAdd kernel, handed to the OpenCL compiler at run time.
// Work item `id` computes c[id] = a[id] + b[id]; work items with id >= n do
// nothing, so the host may launch more work items than there are elements.
// Adjacent string literals are concatenated by the compiler, so each kernel
// line is simply its own literal.
const char *kernelSource =
    "\n"
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
    "__kernel void vecAdd( __global double *a, \n"
    " __global double *b, \n"
    " __global double *c, \n"
    " const unsigned int n) \n"
    "{ \n"
    " //Get our global thread ID \n"
    " int id = get_global_id(0); \n"
    " \n"
    " //Make sure we do not go out of bounds \n"
    " if (id < n) \n"
    " c[id] = a[id] + b[id]; \n"
    "} \n"
    "\n";
int main(int argc, char *argv[])
// Length of vectors
unsigned int n = 1000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl::Buffer d_a;
cl::Buffer d_b;
// Device output buffer
cl::Buffer d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = new double[n];
h_b = new double[n];
h_c = new double[n];
// Initialize vectors on host
for(int i = 0; i < n; i++ )
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
cl_int err = CL_SUCCESS;
try {
// Query platforms
std::vector<cl::Platform> platforms;
if (platforms.size() == 0) {
std::cout << "Platform size 0\n";
return -1;
// Get list of devices on default platform and create context
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
cl::Context context(CL_DEVICE_TYPE_GPU, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create command queue for first device
cl::CommandQueue queue(context, devices[0], 0, &err);
// Create device memory buffers
d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
// Bind memory buffers
queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
//Build kernel from source string
cl::Program::Sources source(1,
cl::Program program_ = cl::Program(context, source);;
// Create kernel object
cl::Kernel kernel(program_, "vecAdd", &err);
// Bind kernel arguments to kernel
kernel.setArg(0, d_a);
kernel.setArg(1, d_b);
kernel.setArg(2, d_c);
kernel.setArg(3, n);
// Number of work items in each local work group
cl::NDRange localSize(64);
// Number of total work items - localSize must be devisor
cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
// Enqueue kernel
cl::Event event;
// Block until kernel completion
// Read back d_c
queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
catch (cl::Error err) {
<< "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(int i=0; i<n; i++)
sum += h_c[i];
std::cout<<"final result (divided by n): "<<sum/n<<std::endl;
// Release host memory
return 0;
// (GitHub page footer, not part of the program: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")