Skip to content

Instantly share code, notes, and snippets.

@flisboac
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flisboac/16f87a86e4f559f89ee3 to your computer and use it in GitHub Desktop.
Save flisboac/16f87a86e4f559f89ee3 to your computer and use it in GitHub Desktop.
A simple, yet complex, yet complete, yet big, example of vector multiplication, implemented in both CPU and GPU (using OpenCL).
A simple, yet complex, yet complete, yet big, example of vector multiplication, implemented in both CPU and GPU (using OpenCL).
*.a
*.o
*.so
opencl-example-vecmul
.settings
RM=rm -f
RMDIR=rm -rf
CC=gcc -c -o
CPP=g++ -c -o
SLINK=ar rcu
ELINK=g++ -o
DLINK=g++ -shared -o
SOBJS=opencl-helper.o
DOBJS=
EOBJS=opencl-example-vecmul.o
SLIB=libopencl-helper.a
DLIB=
EXEC=opencl-example-vecmul
SDEPS=
DDEPS=
EDEPS=$(SLIB)
CFLAGS=-Wall -g --std=c99
CPPFLAGS=-Wall -g --std=c++11
LIBS=-lOpenCL
LINKFLAGS=
SFLAGS=$(LINKFLAGS)
EFLAGS=$(LINKFLAGS)
DFLAGS=$(LINKFLAGS)
TARGETS=$(EXEC) $(SLIB)
all: $(TARGETS)
clean:
$(RM) $(SLIB) $(DLIB) $(EXEC) $(SOBJS) $(DOBJS) $(EOBJS)
$(SLIB): $(SOBJS)
$(SLINK) $@ $? $(SFLAGS)
$(DLIB): $(DOBJS) $(DDEPS)
$(DLINK) $@ $? $(DFLAGS) $(LIBS) $(DDEPS)
$(EXEC): $(EOBJS) $(EDEPS)
$(ELINK) $@ $? $(EFLAGS) $(LIBS) $(EDEPS)
.cpp.o:
$(CPP) $@ $? $(CPPFLAGS)
.c.o:
$(CC) $@ $? $(CFLAGS)
// Compile with: g++ --std=c++11 -Wall -lOpenCL -o opencl-example-vecmul opencl-example-vecmul.cpp
#include <limits>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <cstdlib>
#include "opencl-helper.hpp"
#include "opencl-example-vecmul.hpp"
int main(int argc, const char* argv[]) {
Example<cl_float> example;
int ret = example.parse(argc, argv);
if (ret == 0)
ret = example.initialize();
example.print_options(std::cerr);
if (ret == 0)
ret = example.execute();
return ret;
}
template <typename T> Example<T>::Example()
// Options
: _program_name("example")
, _commands("")
, _platform_idx(-1)
, _vector_size(100000)
, _min_value(0)
, _max_value(100)
, _margin(
std::numeric_limits<T>::is_specialized
? std::numeric_limits<T>::is_integer
? 0
: std::numeric_limits<T>::epsilon()
: 0
)
, _verbose(false)
// State
, _program_source()
, _global_size(0)
, _local_size(0)
, _Va(), _Vb(), _Vo()
, _context()
, _device()
, _program()
{}
template <typename T> int Example<T>::parse(int argc, const char** argv) {
_program_name = argv[0];
if (argc > 1) _commands = argv[1];
if (argc > 2)_platform_idx = std::atoi(argv[2]);
if (argc > 3) _vector_size = std::atoi(argv[3]);
if (argc > 4) _min_value = std::atof(argv[4]);
if (argc > 5) _max_value = std::atof(argv[5]);
if (argc > 6) _margin = std::atof(argv[6]);
return 0;
}
template <typename T> void Example<T>::split(std::vector<Example<T>> per_command_examples) {
per_command_examples.resize(_commands.size());
for (size_t i = 0; i < _commands.size(); ++i) {
per_command_examples[i] = Example(const_cast<Example<T>&>(*this));
per_command_examples[i]._commands = _commands[i];
}
}
template <typename T> int Example<T>::initialize() {
int ret = 0;
try {
_context = create_context(1, CL_DEVICE_TYPE_ALL, _platform_idx);
_device = first_device(_context);
_queue = cl::CommandQueue(_context, _device, CL_QUEUE_PROFILING_ENABLE);
auto sizes = optimal_sizes();
_global_size = sizes.first;
_local_size = sizes.second;
if (_global_size == 0 || _local_size == 0) {
std::cerr << "* ERROR: Could not find suitable sizes. Global: " << _global_size << ", Local: " << _local_size << std::endl;
ret = 1;
} else {
_Va.resize(_global_size);
_Vb.resize(_global_size);
_Vo.resize(_global_size);
fill();
size_t pos;
_program_source = ::program_source;
while ((pos = _program_source.find("TYPE")) != std::string::npos) {
_program_source.replace(pos, strlen("TYPE"), TypeInfo<T>::type_name());
}
std::cout << _program_source << std::endl;
_program = compile_program(_context, _program_source.c_str(), _program_source.size(), "-cl-std=CL1.1 -cl-kernel-arg-info -w -g");
}
} catch (cl::Error& ex) {
std::cerr << "* OPENCL EXCEPTION " << ex.err() << ": " << ex.what() << std::endl;
ret = 1;
} catch (Error& ex) {
std::cerr << "* EXCEPTION " << ex.code() << ": " << ex.what() << std::endl;
ret = 1;
}
return ret;
}
template <typename T> void Example<T>::print_options(std::ostream& os) {
os << "* Minimum value: " << _min_value << std::endl;
os << "* Maximum value: " << _max_value << std::endl;
os << "* Margin value: " << _margin << std::endl;
os << "* Vector size: " << _vector_size << std::endl;
os << "* Commands: " << _commands << std::endl;
if (_global_size) os << "* Global Size: " << _global_size << std::endl;
if (_local_size) os << "* Local Size: " << _local_size << std::endl;
}
template <typename T> int Example<T>::execute() {
int ret = 0;
if (_commands.empty()) {
std::cerr << "USAGE: " << _program_name << " <COMMANDS> [PLATFORM_IDX] [VECTOR_SIZE] [MIN_VALUE] [MAX_VALUE] [MARGIN]" << std::endl;
} else {
for (size_t i = 0; i < _commands.size(); ++i) {
switch(_commands[i]) {
case 'p':
std::cout << _program_source << std::endl;
break;
case 'c':
{
copy(_Va, _Vo);
CpuTiming timing;
timing.start();
vecmul(&_Vo[0], &_Vb[0], _Vo.size());
timing.end();
std::cout << "Local-OpenCL statistics:" << std::endl
<< "\tValid: " << verify(_Vo) << std::endl
<< "\tTotal CPU time: " << timing.sec() << "s" << std::endl;
}
break;
case 'g':
try {
copy(_Va, _Vo);
execute_global();
} catch (cl::Error& ex) {
std::cerr << "* [GLOBAL] OPENCL ERROR " << ex.err() << ": " << ex.what() << std::endl;
} catch (Error& ex) {
std::cerr << "* [GLOBAL] ERROR " << ex.code() << ": " << ex.what() << std::endl;
}
break;
case 'l':
{
try {
copy(_Va, _Vo);
execute_local();
} catch (cl::Error& ex) {
std::cerr << "* [LOCAL] OPENCL ERROR " << ex.err() << ": " << ex.what() << std::endl;
} catch (Error& ex) {
std::cerr << "* [LOCAL] ERROR " << ex.code() << ": " << ex.what() << std::endl;
}
}
break;
default:
std::cerr << "* WARNING: Unknown command '" << _commands[i] << "'." << std::endl;
}
}
}
return ret;
}
template <typename T> void Example<T>::fill() {
for (size_t i = 0; i < _vector_size; ++i) {
_Va[i] = _min_value + (static_cast<float>(rand()) / RAND_MAX) * (_max_value - _min_value);
_Vb[i] = _min_value + (static_cast<float>(rand()) / RAND_MAX) * (_max_value - _min_value);
}
}
template <typename T> void Example<T>::copy(const std::vector<T>& from, std::vector<T>& to) {
std::copy(from.begin(), from.begin() + _vector_size, to.begin());
}
template <typename T> bool Example<T>::verify(const std::vector<T>& Vo) {
for (size_t i = 0; i < _vector_size; ++i) {
T correct_value = _Va[i] * _Vb[i];
T value = Vo[i];
if (!within_margin(correct_value, value)) {
return false;
}
}
return true;
}
template <typename T> bool Example<T>::within_margin(T correct_value, T value) {
T diff = correct_value - value;
if (diff < 0) diff = -diff;
if (diff > _margin) {
return false;
}
return true;
}
template <typename T> std::pair<size_t, size_t> Example<T>::optimal_sizes() {
// Utterly fails if architecture/platform doesn't give or optimize workgroup sizes
// in powers of 2.
std::vector<size_t> maxWorkItemSizes = _device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
size_t maxLocalMemSize = _device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
// Limit by maximum work-item size.
size_t size_pow = highest_pow2(_vector_size);
size_t local_size = std::min(size_pow, maxWorkItemSizes[0]);
// Further limit by maximum local memory.
while (local_size * sizeof(T) * 2 > maxLocalMemSize) {
local_size /= 2;
}
size_t global_size = next_multiple_ge_size(_vector_size, local_size);
return std::make_pair(global_size, local_size);
}
template <typename T> void Example<T>::execute_global() {
CpuTiming timing;
timing.start();
cl::Buffer Ma(_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vo[0]);
cl::Buffer Mb(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vb[0]);
std::vector<cl::Event> kernel_events(1);
kernel_events[0] = enqueue_global(Ma, Mb);
cl::Event copy_event = enqueue_read(Ma, _Vo, true, &kernel_events, _vector_size);
timing.end();
bool valid = verify(_Vo);
std::cout << "Global-OpenCL statistics:" << std::endl
<< "\tValid: " << valid << std::endl
<< "\tTotal CPU Time: " << timing.sec() << "s" << std::endl;
}
template <typename T> void Example<T>::execute_local() {
CpuTiming timing;
timing.start();
cl::Buffer Ma(_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vo[0]);
cl::Buffer Mb(_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vb[0]);
std::vector<cl::Event> kernel_events(1);
kernel_events[0] = enqueue_local(Ma, Mb);
cl::Event copy_event = enqueue_read(Ma, _Vo, true, &kernel_events, _vector_size);
timing.end();
bool valid = verify(_Vo);
std::cout << "Local-OpenCL statistics:" << std::endl
<< "\tValid: " << valid << std::endl
<< "\tTotal CPU Time: " << timing.sec() << "s" << std::endl;
}
template <typename T> cl::Event Example<T>::enqueue_read(cl::Buffer& M, std::vector<T>& to, bool sync, std::vector<cl::Event>* dependencies, size_t size) {
cl::Event event;
if (size == 0) size = to.size();
_queue.enqueueReadBuffer(M, sync, 0, size * sizeof(T), &to[0], dependencies, &event);
return event;
}
template <typename T> cl::Event Example<T>::enqueue_write(cl::Buffer& M, std::vector<T>& from, bool sync, std::vector<cl::Event>* dependencies, size_t size) {
cl::Event event;
if (size == 0) size = from.size();
_queue.enqueueWriteBuffer(M, sync, 0, size * sizeof(T), &from[0], dependencies, &event);
return event;
}
template <typename T> cl::Event Example<T>::enqueue_global(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies) {
cl::Event event;
size_t arg_idx = 0;
cl::NDRange range(_global_size);
cl::Kernel kernel(_program, "vecmul_g");
kernel.setArg(arg_idx++, Ma);
kernel.setArg(arg_idx++, Mb);
kernel.setArg(arg_idx++, static_cast<cl_int>(_vector_size));
_queue.enqueueNDRangeKernel(kernel, cl::NullRange, range, cl::NullRange, dependencies, &event);
return event;
}
template <typename T> cl::Event Example<T>::enqueue_local(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies) {
cl::Event event;
size_t arg_idx = 0;
cl::NDRange grange(_global_size);
cl::NDRange lrange(_local_size);
cl::Kernel kernel(_program, "vecmul_l");
kernel.setArg(arg_idx++, Ma);
kernel.setArg(arg_idx++, Mb);
kernel.setArg(arg_idx++, cl::Local(_local_size * sizeof(T))); // cl::__local(_local_size * sizeof(T))
kernel.setArg(arg_idx++, cl::Local(_local_size * sizeof(T)));
kernel.setArg(arg_idx++, static_cast<cl_int>(_vector_size));
_queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange, dependencies, &event);
return event;
}
void vecmul(float* Va, const float* Vb, size_t N) {
for (size_t i = 0; i < N; ++i) {
Va[i] = Va[i] * Vb[i];
}
}
const char* const program_source =
"kernel void vecmul_g(\n"
" global TYPE* Va,\n"
" global TYPE* Vb,\n"
" const int N\n"
") {\n"
" int i = get_global_id(0);\n"
" if (i < N) {\n"
" Va[i] = Va[i] * Vb[i];\n"
" }\n"
"}\n"
"\n"
"kernel void vecmul_l(\n"
" global TYPE* Va,\n"
" global TYPE* Vb,\n"
" local TYPE* Ba,\n"
" local TYPE* Bb,\n"
" const int N\n"
") {\n"
" int i = get_global_id(0);\n"
" if (i < N) {\n"
" int LN = get_group_id(0) *\n"
" get_local_size(0);\n"
" int L = get_local_id(0) + LN;\n"
" Ba[L] = Va[i]; Bb[L] = Vb[i];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" Ba[L] = Ba[L] * Bb[L];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" Va[i] = Ba[L];\n"
" }\n"
"}\n"
;
#ifndef OPENCL_EXAMPLE_VECMUL_HPP_
#define OPENCL_EXAMPLE_VECMUL_HPP_
#include <vector>
#include <iosfwd>
#include <ctime>
#include <string>
#include <cstring>
class CpuTiming;
template <typename T> class Example;
extern const char* const program_source;
extern void vecmul(float* Va, const float* Vb, size_t N);
class CpuTiming {
public:
CpuTiming() : _start_clk(0), _end_clk(0) {}
inline void start() { _start_clk = std::clock(); }
inline void end() { _end_clk = std::clock(); }
inline double sec() { return static_cast<double>(_end_clk - _start_clk) / CLOCKS_PER_SEC; }
private:
clock_t _start_clk;
clock_t _end_clk;
};
template <typename T> struct TypeInfo {
static inline constexpr const char* type_name() { return ""; }
};
template <> struct TypeInfo<cl_int> {
static inline constexpr const char* type_name() { return "int"; }
};
template <> struct TypeInfo<cl_float> {
static inline constexpr const char* type_name() { return "float"; }
};
template <> struct TypeInfo<cl_double> {
static inline constexpr const char* type_name() { return "double"; }
};
template <typename T> class Example {
public:
Example();
Example(const Example<T>& other) = default;
int parse(int argc, const char** argv);
void split(std::vector<Example<T>> per_command_examples);
int initialize();
void print_options(std::ostream& os);
int execute();
private:
void execute_global();
void execute_local();
cl::Event enqueue_read(cl::Buffer& M, std::vector<T>& to, bool sync, std::vector<cl::Event>* dependencies = NULL, size_t size = 0);
cl::Event enqueue_write(cl::Buffer& M, std::vector<T>& from, bool sync, std::vector<cl::Event>* dependencies = NULL, size_t size = 0);
cl::Event enqueue_global(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies = NULL);
cl::Event enqueue_local(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies = NULL);
void fill();
void copy(const std::vector<T>& from, std::vector<T>& to);
bool verify(const std::vector<T>& Vo);
bool within_margin(T correct_value, T value);
std::pair<size_t, size_t> optimal_sizes();
private:
// Options
std::string _program_name;
std::string _commands;
int _platform_idx;
size_t _vector_size;
T _min_value;
T _max_value;
T _margin;
bool _verbose;
// State
std::string _program_source;
size_t _global_size;
size_t _local_size;
std::vector<T> _Va, _Vb, _Vo;
cl::Context _context;
cl::Device _device;
cl::Program _program;
cl::CommandQueue _queue;
};
#endif /* OPENCL_EXAMPLE_VECMUL_HPP_ */
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <sstream>
#include <string>
#include <vector>
#include "opencl-helper.hpp"
size_t highest_pow2(size_t num) {
size_t ret = 1;
while (ret < num) ret <<= 1;
return ret;
}
size_t next_multiple_ge_size(size_t size, size_t dimSize) {
size_t v = 1;
while (v * dimSize < size) ++v;
return dimSize * v;
}
bool filter_devices(cl::Platform platform, std::vector<cl::Device>& output, size_t num_devices, cl_device_type device_type) {
bool foundDevice = false;
std::vector<cl::Device> platformDevices;
platform.getDevices(device_type, &platformDevices);
if (!platformDevices.empty() && platformDevices.size()) {
foundDevice = true;
if (num_devices == 0)
num_devices = platformDevices.size();
output.insert(output.end(), platformDevices.begin(), platformDevices.begin() + std::min(num_devices, platformDevices.size()));
}
return foundDevice;
}
cl::Context create_context(size_t num_devices, cl_device_type device_type, int platform_idx, context_callback_f callback, void* callback_data) {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
int total_devices = (num_devices) ? num_devices : -1;
if (platforms.empty())
throw Error("No OpenCL Platform found, please check your OpenCL installation.");
std::vector<cl::Device> devices;
if (platform_idx < 0) {
for (std::vector<cl::Platform>::iterator it = platforms.begin(); (total_devices != 0) && it != platforms.end(); ++it) {
cl::Platform platform = *it;
if (total_devices > 0) {
size_t previous_size = devices.size();
filter_devices(platform, devices, total_devices, device_type);
total_devices -= devices.size() - previous_size;
} else if (total_devices < 0){
filter_devices(platform, devices, 0, device_type);
}
}
} else if (platform_idx >= static_cast<int>(platforms.size())) {
throw Error("Invalid platform index given.");
} else {
cl::Platform platform = platforms[platform_idx];
filter_devices(platform, devices, num_devices, device_type);
}
if (devices.empty())
throw Error("Could not find available/enough devices.");
cl::Context context(devices, NULL, callback, callback_data);
return context;
}
cl::Program compile_program(cl::Context& context, const char* source, size_t source_size, const char* options) {
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
if (devices.empty())
throw Error("No device available in context.");
cl::Program::Sources sources;
sources.push_back(std::make_pair(source, source_size));
cl::Program program(context, sources);
program.build(devices, options, NULL);
std::stringstream ss;
for (auto& device : devices) {
if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device) != CL_BUILD_SUCCESS) {
ss
<< "Program could not be built for device \"" << device.getInfo<CL_DEVICE_NAME>() << "\"."
<< " Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device)
<< ", Options: " << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(device)
<< ", Log: \"" << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\"."
<< std::endl;
}
}
std::string errors = ss.str();
if (!errors.empty()) {
throw Error(errors);
}
return program;
}
cl::Device first_device(cl::Context& context) {
std::vector<cl::Device> devices;
devices = context.getInfo<CL_CONTEXT_DEVICES>();
return devices[0];
}
#ifndef OPENCL_HELPER_HPP_
#define OPENCL_HELPER_HPP_
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <exception>
typedef void (*context_callback_f)(const char* message, const void* data, size_t data_size, void* user_data);
class Error : public std::exception {
public:
Error() throw() : _code(0) {}
Error(const char* what) throw() : _code(0), _what(what) {}
Error(const std::string& what) throw() : _code(0), _what(what) {}
const char* what() throw() { return _what.c_str(); }
int code() throw() { return _code; }
private:
int _code;
std::string _what;
};
extern size_t highest_pow2(size_t num);
extern size_t next_multiple_ge_size(size_t size, size_t dimSize);
extern bool filter_devices(cl::Platform platform, std::vector<cl::Device>& output, size_t num_devices = 0, cl_device_type device_type = CL_DEVICE_TYPE_ALL);
extern cl::Context create_context(size_t num_devices = 0, cl_device_type device_type = CL_DEVICE_TYPE_ALL, int platform_idx = -1, context_callback_f callback = NULL, void* callback_data = NULL);
extern cl::Device first_device(cl::Context& context);
extern cl::Program compile_program(cl::Context& context, const char* source, size_t source_size, const char* options = NULL);
#endif /* OPENCL_HELPER_HPP_ */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment