flisboac/ opencl-example-vecmul

## opencl-example-vecmul
A simple, yet complex, yet complete, yet big, example of vector multiplication, implemented in both CPU and GPU (using OpenCL).

## .gitignore
*.a
*.o
*.so
opencl-example-vecmul
.settings

## Makefile

RM=rm -f
RMDIR=rm -rf

CC=gcc -c -o
CPP=g++ -c -o
SLINK=ar rcu
ELINK=g++ -o
DLINK=g++ -shared -o


SOBJS=opencl-helper.o
DOBJS=
EOBJS=opencl-example-vecmul.o

SLIB=libopencl-helper.a
DLIB=
EXEC=opencl-example-vecmul

SDEPS=
DDEPS=
EDEPS=$(SLIB)

CFLAGS=-Wall -g --std=c99
CPPFLAGS=-Wall -g --std=c++11

LIBS=-lOpenCL
LINKFLAGS=
SFLAGS=$(LINKFLAGS)
EFLAGS=$(LINKFLAGS)
DFLAGS=$(LINKFLAGS)

TARGETS=$(EXEC) $(SLIB)

all: $(TARGETS)

clean:
	$(RM) $(SLIB) $(DLIB) $(EXEC) $(SOBJS) $(DOBJS) $(EOBJS)


$(SLIB): $(SOBJS)
	$(SLINK) $@ $? $(SFLAGS)


$(DLIB): $(DOBJS) $(DDEPS)
	$(DLINK) $@ $? $(DFLAGS) $(LIBS) $(DDEPS)


$(EXEC): $(EOBJS) $(EDEPS)
	$(ELINK) $@ $? $(EFLAGS) $(LIBS) $(EDEPS)


.cpp.o:
	$(CPP) $@ $? $(CPPFLAGS)


.c.o:
	$(CC) $@ $? $(CFLAGS)

## opencl-example-vecmul.cpp
// Compile with: g++ --std=c++11 -Wall -lOpenCL -o opencl-example-vecmul opencl-example-vecmul.cpp

#include <limits>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <cstdlib>

#include "opencl-helper.hpp"
#include "opencl-example-vecmul.hpp"


int main(int argc, const char* argv[]) {
	Example<cl_float> example;
	int ret = example.parse(argc, argv);
	if (ret == 0)
		ret = example.initialize();
	example.print_options(std::cerr);
	if (ret == 0)
		ret = example.execute();
	return ret;
}


template <typename T> Example<T>::Example()
	// Options
	: _program_name("example")
	, _commands("")
	, _platform_idx(-1)
	, _vector_size(100000)
	, _min_value(0)
	, _max_value(100)
	, _margin(
			std::numeric_limits<T>::is_specialized
				? std::numeric_limits<T>::is_integer
					? 0
					: std::numeric_limits<T>::epsilon()
				: 0
		)
	, _verbose(false)

	// State
	, _program_source()
	, _global_size(0)
	, _local_size(0)
	, _Va(), _Vb(), _Vo()
	, _context()
	, _device()
	, _program()
{}


template <typename T> int Example<T>::parse(int argc, const char** argv) {
	_program_name = argv[0];
	if (argc > 1)    _commands = argv[1];
	if (argc > 2)_platform_idx = std::atoi(argv[2]);
	if (argc > 3) _vector_size = std::atoi(argv[3]);
	if (argc > 4)   _min_value = std::atof(argv[4]);
	if (argc > 5)   _max_value = std::atof(argv[5]);
	if (argc > 6)      _margin = std::atof(argv[6]);

	return 0;
}


template <typename T> void Example<T>::split(std::vector<Example<T>> per_command_examples) {
	per_command_examples.resize(_commands.size());
	for (size_t i = 0; i < _commands.size(); ++i) {
		per_command_examples[i] = Example(const_cast<Example<T>&>(*this));
		per_command_examples[i]._commands = _commands[i];
	}
}


template <typename T> int Example<T>::initialize() {
	int ret = 0;

	try {
		_context = create_context(1, CL_DEVICE_TYPE_ALL, _platform_idx);
		_device = first_device(_context);
		_queue = cl::CommandQueue(_context, _device, CL_QUEUE_PROFILING_ENABLE);
		auto sizes = optimal_sizes();
		_global_size = sizes.first;
		_local_size = sizes.second;

		if (_global_size == 0 || _local_size == 0) {
			std::cerr << "* ERROR: Could not find suitable sizes. Global: " << _global_size << ", Local: " << _local_size << std::endl;
			ret = 1;

		} else {
			_Va.resize(_global_size);
			_Vb.resize(_global_size);
			_Vo.resize(_global_size);
			fill();

			size_t pos;
			_program_source = ::program_source;
			while ((pos = _program_source.find("TYPE")) != std::string::npos) {
				_program_source.replace(pos, strlen("TYPE"), TypeInfo<T>::type_name());
			}
			std::cout << _program_source << std::endl;
			_program = compile_program(_context, _program_source.c_str(), _program_source.size(), "-cl-std=CL1.1 -cl-kernel-arg-info -w -g");
		}

	} catch (cl::Error& ex) {
		std::cerr << "* OPENCL EXCEPTION " << ex.err() << ": " << ex.what() << std::endl;
		ret = 1;

	} catch (Error& ex) {
		std::cerr << "* EXCEPTION " << ex.code() << ": " << ex.what() << std::endl;
		ret = 1;
	}

	return ret;
}


template <typename T> void Example<T>::print_options(std::ostream& os) {
	os << "* Minimum value: " << _min_value << std::endl;
	os << "* Maximum value: " << _max_value << std::endl;
	os << "*  Margin value: " << _margin << std::endl;
	os << "*   Vector size: " << _vector_size << std::endl;
	os << "*      Commands: " << _commands << std::endl;
	if (_global_size) os << "*   Global Size: " << _global_size << std::endl;
	if (_local_size) os << "*    Local Size: " << _local_size << std::endl;
}


template <typename T> int Example<T>::execute() {
	int ret = 0;

	if (_commands.empty()) {
		std::cerr << "USAGE: " << _program_name << " <COMMANDS> [PLATFORM_IDX] [VECTOR_SIZE] [MIN_VALUE] [MAX_VALUE] [MARGIN]" << std::endl;

	} else {
		for (size_t i = 0; i < _commands.size(); ++i) {
			switch(_commands[i]) {
			case 'p':
				std::cout << _program_source << std::endl;
				break;

			case 'c':
				{
					copy(_Va, _Vo);
					CpuTiming timing;
					timing.start();
					vecmul(&_Vo[0], &_Vb[0], _Vo.size());
					timing.end();
					std::cout << "Local-OpenCL statistics:" << std::endl
							<< "\tValid: " << verify(_Vo) << std::endl
							<< "\tTotal CPU time: " << timing.sec() << "s" << std::endl;
				}
				break;

			case 'g':
				try {
					copy(_Va, _Vo);
					execute_global();

				} catch (cl::Error& ex) {
					std::cerr << "* [GLOBAL] OPENCL ERROR " << ex.err() << ": " << ex.what() << std::endl;

				} catch (Error& ex) {
					std::cerr << "* [GLOBAL] ERROR " << ex.code() << ": " << ex.what() << std::endl;
				}
				break;

			case 'l':
				{
					try {
						copy(_Va, _Vo);
						execute_local();

					} catch (cl::Error& ex) {
						std::cerr << "* [LOCAL] OPENCL ERROR " << ex.err() << ": " << ex.what() << std::endl;

					} catch (Error& ex) {
						std::cerr << "* [LOCAL] ERROR " << ex.code() << ": " << ex.what() << std::endl;
					}
				}
				break;

			default:
				std::cerr << "* WARNING: Unknown command '" << _commands[i] << "'." << std::endl;
			}
		}
	}

	return ret;
}


template <typename T> void Example<T>::fill() {
	for (size_t i = 0; i < _vector_size; ++i) {
		_Va[i] = _min_value + (static_cast<float>(rand()) / RAND_MAX) * (_max_value - _min_value);
		_Vb[i] = _min_value + (static_cast<float>(rand()) / RAND_MAX) * (_max_value - _min_value);
	}
}


template <typename T> void Example<T>::copy(const std::vector<T>& from, std::vector<T>& to) {
	std::copy(from.begin(), from.begin() + _vector_size, to.begin());
}


template <typename T> bool Example<T>::verify(const std::vector<T>& Vo) {
	for (size_t i = 0; i < _vector_size; ++i) {
		T correct_value = _Va[i] * _Vb[i];
		T value = Vo[i];
		if (!within_margin(correct_value, value)) {
			return false;
		}
	}
	return true;
}


template <typename T> bool Example<T>::within_margin(T correct_value, T value) {
	T diff = correct_value - value;
	if (diff < 0) diff = -diff;
	if (diff > _margin) {
		return false;
	}
	return true;
}


template <typename T> std::pair<size_t, size_t> Example<T>::optimal_sizes() {
	// Utterly fails if architecture/platform doesn't give or optimize workgroup sizes
	// in powers of 2.
	std::vector<size_t> maxWorkItemSizes = _device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
	size_t maxLocalMemSize = _device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();

	// Limit by maximum work-item size.
	size_t size_pow = highest_pow2(_vector_size);
	size_t local_size = std::min(size_pow, maxWorkItemSizes[0]);

	// Further limit by maximum local memory.
	while (local_size * sizeof(T) * 2 > maxLocalMemSize) {
		local_size /= 2;
	}

	size_t global_size = next_multiple_ge_size(_vector_size, local_size);
	return std::make_pair(global_size, local_size);
}


template <typename T> void Example<T>::execute_global() {
	CpuTiming timing;
	timing.start();

	cl::Buffer Ma(_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vo[0]);
	cl::Buffer Mb(_context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vb[0]);
	std::vector<cl::Event> kernel_events(1);
	kernel_events[0] = enqueue_global(Ma, Mb);
	cl::Event copy_event = enqueue_read(Ma, _Vo, true, &kernel_events, _vector_size);
	timing.end();

	bool valid = verify(_Vo);
	std::cout << "Global-OpenCL statistics:" << std::endl
			<< "\tValid: " << valid << std::endl
			<< "\tTotal CPU Time: " << timing.sec() << "s" << std::endl;
}


template <typename T> void Example<T>::execute_local() {
	CpuTiming timing;
	timing.start();

	cl::Buffer Ma(_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vo[0]);
	cl::Buffer Mb(_context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vb[0]);
	std::vector<cl::Event> kernel_events(1);
	kernel_events[0] = enqueue_local(Ma, Mb);
	cl::Event copy_event = enqueue_read(Ma, _Vo, true, &kernel_events, _vector_size);
	timing.end();

	bool valid = verify(_Vo);
	std::cout << "Local-OpenCL statistics:" << std::endl
			<< "\tValid: " << valid << std::endl
			<< "\tTotal CPU Time: " << timing.sec() << "s" << std::endl;
}


template <typename T> cl::Event Example<T>::enqueue_read(cl::Buffer& M, std::vector<T>& to, bool sync, std::vector<cl::Event>* dependencies, size_t size) {
	cl::Event event;
	if (size == 0) size = to.size();
	_queue.enqueueReadBuffer(M, sync, 0, size * sizeof(T), &to[0], dependencies, &event);
	return event;
}


template <typename T> cl::Event Example<T>::enqueue_write(cl::Buffer& M, std::vector<T>& from, bool sync, std::vector<cl::Event>* dependencies, size_t size) {
	cl::Event event;
	if (size == 0) size = from.size();
	_queue.enqueueWriteBuffer(M, sync, 0, size * sizeof(T), &from[0], dependencies, &event);
	return event;
}


template <typename T> cl::Event Example<T>::enqueue_global(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies) {
	cl::Event event;
	size_t arg_idx = 0;
	cl::NDRange range(_global_size);
	cl::Kernel kernel(_program, "vecmul_g");
	kernel.setArg(arg_idx++, Ma);
	kernel.setArg(arg_idx++, Mb);
	kernel.setArg(arg_idx++, static_cast<cl_int>(_vector_size));
	_queue.enqueueNDRangeKernel(kernel, cl::NullRange, range, cl::NullRange, dependencies, &event);
	return event;
}


template <typename T> cl::Event Example<T>::enqueue_local(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies) {
	cl::Event event;
	size_t arg_idx = 0;
	cl::NDRange grange(_global_size);
	cl::NDRange lrange(_local_size);
	cl::Kernel kernel(_program, "vecmul_l");
	kernel.setArg(arg_idx++, Ma);
	kernel.setArg(arg_idx++, Mb);
	kernel.setArg(arg_idx++, cl::Local(_local_size * sizeof(T))); // cl::__local(_local_size * sizeof(T))
	kernel.setArg(arg_idx++, cl::Local(_local_size * sizeof(T)));
	kernel.setArg(arg_idx++, static_cast<cl_int>(_vector_size));
	_queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange, dependencies, &event);
	return event;
}


void vecmul(float* Va, const float* Vb, size_t N) {
	for (size_t i = 0; i < N; ++i) {
		Va[i] = Va[i] * Vb[i];
	}
}


const char* const program_source =
	"kernel void vecmul_g(\n"
	"  global TYPE* Va,\n"
	"  global TYPE* Vb,\n"
	"  const  int    N\n"
	") {\n"
	"  int i = get_global_id(0);\n"
	"  if (i < N) {\n"
	"    Va[i] = Va[i] * Vb[i];\n"
	"  }\n"
	"}\n"
	"\n"
	"kernel void vecmul_l(\n"
	"  global TYPE* Va,\n"
	"  global TYPE* Vb,\n"
	"  local  TYPE* Ba,\n"
	"  local  TYPE* Bb,\n"
	"  const  int    N\n"
	") {\n"
	"  int i = get_global_id(0);\n"
	"  if (i < N) {\n"
	"    int LN = get_group_id(0) *\n"
	"             get_local_size(0);\n"
	"    int L = get_local_id(0) + LN;\n"
	"    Ba[L] = Va[i]; Bb[L] = Vb[i];\n"
	"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
	"    Ba[L] = Ba[L] * Bb[L];\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"    Va[i] = Ba[L];\n"
	"  }\n"
	"}\n"
	;

## opencl-example-vecmul.hpp
#ifndef OPENCL_EXAMPLE_VECMUL_HPP_
#define OPENCL_EXAMPLE_VECMUL_HPP_

#include <vector>
#include <iosfwd>
#include <ctime>
#include <string>
#include <cstring>


class CpuTiming;
template <typename T> class Example;

extern const char* const program_source;
extern void vecmul(float* Va, const float* Vb, size_t N);

class CpuTiming {
public:
	CpuTiming() : _start_clk(0), _end_clk(0) {}
	inline void start() { _start_clk = std::clock(); }
	inline void end() { _end_clk = std::clock(); }
	inline double sec() { return static_cast<double>(_end_clk - _start_clk) / CLOCKS_PER_SEC; }

private:
	clock_t _start_clk;
	clock_t _end_clk;
};


template <typename T> struct TypeInfo {
	static inline constexpr const char* type_name() { return ""; }
};


template <> struct TypeInfo<cl_int> {
	static inline constexpr const char* type_name() { return "int"; }
};


template <> struct TypeInfo<cl_float> {
	static inline constexpr const char* type_name() { return "float"; }
};


template <> struct TypeInfo<cl_double> {
	static inline constexpr const char* type_name() { return  "double"; }
};


template <typename T> class Example {
public:
	Example();
	Example(const Example<T>& other) = default;
	int parse(int argc, const char** argv);
	void split(std::vector<Example<T>> per_command_examples);
	int initialize();
	void print_options(std::ostream& os);
	int execute();

private:
	void execute_global();
	void execute_local();
	cl::Event enqueue_read(cl::Buffer& M, std::vector<T>& to, bool sync, std::vector<cl::Event>* dependencies = NULL, size_t size = 0);
	cl::Event enqueue_write(cl::Buffer& M, std::vector<T>& from, bool sync, std::vector<cl::Event>* dependencies = NULL, size_t size = 0);
	cl::Event enqueue_global(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies = NULL);
	cl::Event enqueue_local(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies = NULL);

	void fill();
	void copy(const std::vector<T>& from, std::vector<T>& to);
	bool verify(const std::vector<T>& Vo);
	bool within_margin(T correct_value, T value);
	std::pair<size_t, size_t> optimal_sizes();

private:
	// Options
	std::string _program_name;
	std::string _commands;
	int _platform_idx;
	size_t _vector_size;
	T _min_value;
	T _max_value;
	T _margin;
	bool _verbose;

	// State
	std::string _program_source;
	size_t _global_size;
	size_t _local_size;
	std::vector<T> _Va, _Vb, _Vo;
	cl::Context _context;
	cl::Device _device;
	cl::Program _program;
	cl::CommandQueue _queue;
};


#endif /* OPENCL_EXAMPLE_VECMUL_HPP_ */

## opencl-helper.cpp

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <sstream>
#include <string>
#include <vector>

#include "opencl-helper.hpp"


size_t highest_pow2(size_t num) {
	size_t ret = 1;
	while (ret < num) ret <<= 1;
	return ret;
}


size_t next_multiple_ge_size(size_t size, size_t dimSize) {
	size_t v = 1;
	while (v * dimSize < size) ++v;
	return dimSize * v;
}


bool filter_devices(cl::Platform platform, std::vector<cl::Device>& output, size_t num_devices, cl_device_type device_type) {
	bool foundDevice = false;
	std::vector<cl::Device> platformDevices;
	platform.getDevices(device_type, &platformDevices);

	if (!platformDevices.empty() && platformDevices.size()) {
		foundDevice = true;
		if (num_devices == 0)
			num_devices = platformDevices.size();
		output.insert(output.end(), platformDevices.begin(), platformDevices.begin() + std::min(num_devices, platformDevices.size()));
	}

	return foundDevice;
}

cl::Context create_context(size_t num_devices, cl_device_type device_type, int platform_idx, context_callback_f callback, void* callback_data) {
	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	int total_devices = (num_devices) ? num_devices : -1;

	if (platforms.empty())
		throw Error("No OpenCL Platform found, please check your OpenCL installation.");

	std::vector<cl::Device> devices;
	if (platform_idx < 0) {
		for (std::vector<cl::Platform>::iterator it = platforms.begin(); (total_devices != 0) && it != platforms.end(); ++it) {
			cl::Platform platform = *it;

			if (total_devices > 0) {
				size_t previous_size = devices.size();
				filter_devices(platform, devices, total_devices, device_type);
				total_devices -= devices.size() - previous_size;

			} else if (total_devices < 0){
				filter_devices(platform, devices, 0, device_type);
			}
		}

	} else if (platform_idx >= static_cast<int>(platforms.size())) {
		throw Error("Invalid platform index given.");

	} else {
		cl::Platform platform = platforms[platform_idx];
		filter_devices(platform, devices, num_devices, device_type);
	}

	if (devices.empty())
		throw Error("Could not find available/enough devices.");

	cl::Context context(devices, NULL, callback, callback_data);
	return context;
}


cl::Program compile_program(cl::Context& context, const char* source, size_t source_size, const char* options) {
	std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

	if (devices.empty())
		throw Error("No device available in context.");

	cl::Program::Sources sources;
	sources.push_back(std::make_pair(source, source_size));
	cl::Program program(context, sources);
	program.build(devices, options, NULL);
	std::stringstream ss;
	for (auto& device : devices) {
		if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device) != CL_BUILD_SUCCESS) {
			ss
				<< "Program could not be built for device \"" << device.getInfo<CL_DEVICE_NAME>() << "\"."
				<< " Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device)
				<< ", Options: " << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(device)
				<< ", Log: \"" << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\"."
				<< std::endl;
		}
	}
	std::string errors = ss.str();
	if (!errors.empty()) {
		throw Error(errors);
	}

	return program;
}


cl::Device first_device(cl::Context& context) {
	std::vector<cl::Device> devices;
	devices = context.getInfo<CL_CONTEXT_DEVICES>();
	return devices[0];
}


## opencl-helper.hpp
#ifndef OPENCL_HELPER_HPP_
#define OPENCL_HELPER_HPP_

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <exception>

typedef void (*context_callback_f)(const char* message, const void* data, size_t data_size, void* user_data);

class Error : public std::exception {
public:
	Error() throw() : _code(0) {}
	Error(const char* what) throw() : _code(0), _what(what) {}
	Error(const std::string& what) throw() : _code(0), _what(what) {}
	const char* what() throw() { return _what.c_str(); }
	int code() throw() { return _code; }

private:
	int _code;
	std::string _what;
};


extern size_t highest_pow2(size_t num);
extern size_t next_multiple_ge_size(size_t size, size_t dimSize);

extern bool filter_devices(cl::Platform platform, std::vector<cl::Device>& output, size_t num_devices = 0, cl_device_type device_type = CL_DEVICE_TYPE_ALL);
extern cl::Context create_context(size_t num_devices = 0, cl_device_type device_type = CL_DEVICE_TYPE_ALL, int platform_idx = -1, context_callback_f callback = NULL, void* callback_data = NULL);
extern cl::Device first_device(cl::Context& context);
extern cl::Program compile_program(cl::Context& context, const char* source, size_t source_size, const char* options = NULL);


#endif /* OPENCL_HELPER_HPP_ */

	RM=rm -f
	RMDIR=rm -rf

	CC=gcc -c -o
	CPP=g++ -c -o
	SLINK=ar rcu
	ELINK=g++ -o
	DLINK=g++ -shared -o


	SOBJS=opencl-helper.o
	DOBJS=
	EOBJS=opencl-example-vecmul.o

	SLIB=libopencl-helper.a
	DLIB=
	EXEC=opencl-example-vecmul

	SDEPS=
	DDEPS=
	EDEPS=$(SLIB)

	CFLAGS=-Wall -g --std=c99
	CPPFLAGS=-Wall -g --std=c++11

	LIBS=-lOpenCL
	LINKFLAGS=
	SFLAGS=$(LINKFLAGS)
	EFLAGS=$(LINKFLAGS)
	DFLAGS=$(LINKFLAGS)

	TARGETS=$(EXEC) $(SLIB)

	all: $(TARGETS)

	clean:
	$(RM) $(SLIB) $(DLIB) $(EXEC) $(SOBJS) $(DOBJS) $(EOBJS)


	$(SLIB): $(SOBJS)
	$(SLINK) $@ $? $(SFLAGS)


	$(DLIB): $(DOBJS) $(DDEPS)
	$(DLINK) $@ $? $(DFLAGS) $(LIBS) $(DDEPS)


	$(EXEC): $(EOBJS) $(EDEPS)
	$(ELINK) $@ $? $(EFLAGS) $(LIBS) $(EDEPS)


	.cpp.o:
	$(CPP) $@ $? $(CPPFLAGS)


	.c.o:
	$(CC) $@ $? $(CFLAGS)
	// Compile with: g++ --std=c++11 -Wall -lOpenCL -o opencl-example-vecmul opencl-example-vecmul.cpp

	#include <limits>
	#include <iostream>
	#include <sstream>
	#include <iomanip>
	#include <cstdlib>

	#include "opencl-helper.hpp"
	#include "opencl-example-vecmul.hpp"


	int main(int argc, const char* argv[]) {
	Example<cl_float> example;
	int ret = example.parse(argc, argv);
	if (ret == 0)
	ret = example.initialize();
	example.print_options(std::cerr);
	if (ret == 0)
	ret = example.execute();
	return ret;
	}


	template <typename T> Example<T>::Example()
	// Options
	: _program_name("example")
	, _commands("")
	, _platform_idx(-1)
	, _vector_size(100000)
	, _min_value(0)
	, _max_value(100)
	, _margin(
	std::numeric_limits<T>::is_specialized
	? std::numeric_limits<T>::is_integer
	? 0
	: std::numeric_limits<T>::epsilon()
	: 0
	)
	, _verbose(false)

	// State
	, _program_source()
	, _global_size(0)
	, _local_size(0)
	, _Va(), _Vb(), _Vo()
	, _context()
	, _device()
	, _program()
	{}


	template <typename T> int Example<T>::parse(int argc, const char** argv) {
	_program_name = argv[0];
	if (argc > 1) _commands = argv[1];
	if (argc > 2)_platform_idx = std::atoi(argv[2]);
	if (argc > 3) _vector_size = std::atoi(argv[3]);
	if (argc > 4) _min_value = std::atof(argv[4]);
	if (argc > 5) _max_value = std::atof(argv[5]);
	if (argc > 6) _margin = std::atof(argv[6]);

	return 0;
	}


	template <typename T> void Example<T>::split(std::vector<Example<T>> per_command_examples) {
	per_command_examples.resize(_commands.size());
	for (size_t i = 0; i < _commands.size(); ++i) {
	per_command_examples[i] = Example(const_cast<Example<T>&>(*this));
	per_command_examples[i]._commands = _commands[i];
	}
	}


	template <typename T> int Example<T>::initialize() {
	int ret = 0;

	try {
	_context = create_context(1, CL_DEVICE_TYPE_ALL, _platform_idx);
	_device = first_device(_context);
	_queue = cl::CommandQueue(_context, _device, CL_QUEUE_PROFILING_ENABLE);
	auto sizes = optimal_sizes();
	_global_size = sizes.first;
	_local_size = sizes.second;

	if (_global_size == 0 \|\| _local_size == 0) {
	std::cerr << "* ERROR: Could not find suitable sizes. Global: " << _global_size << ", Local: " << _local_size << std::endl;
	ret = 1;

	} else {
	_Va.resize(_global_size);
	_Vb.resize(_global_size);
	_Vo.resize(_global_size);
	fill();

	size_t pos;
	_program_source = ::program_source;
	while ((pos = _program_source.find("TYPE")) != std::string::npos) {
	_program_source.replace(pos, strlen("TYPE"), TypeInfo<T>::type_name());
	}
	std::cout << _program_source << std::endl;
	_program = compile_program(_context, _program_source.c_str(), _program_source.size(), "-cl-std=CL1.1 -cl-kernel-arg-info -w -g");
	}

	} catch (cl::Error& ex) {
	std::cerr << "* OPENCL EXCEPTION " << ex.err() << ": " << ex.what() << std::endl;
	ret = 1;

	} catch (Error& ex) {
	std::cerr << "* EXCEPTION " << ex.code() << ": " << ex.what() << std::endl;
	ret = 1;
	}

	return ret;
	}


	template <typename T> void Example<T>::print_options(std::ostream& os) {
	os << "* Minimum value: " << _min_value << std::endl;
	os << "* Maximum value: " << _max_value << std::endl;
	os << "* Margin value: " << _margin << std::endl;
	os << "* Vector size: " << _vector_size << std::endl;
	os << "* Commands: " << _commands << std::endl;
	if (_global_size) os << "* Global Size: " << _global_size << std::endl;
	if (_local_size) os << "* Local Size: " << _local_size << std::endl;
	}


	template <typename T> int Example<T>::execute() {
	int ret = 0;

	if (_commands.empty()) {
	std::cerr << "USAGE: " << _program_name << " <COMMANDS> [PLATFORM_IDX] [VECTOR_SIZE] [MIN_VALUE] [MAX_VALUE] [MARGIN]" << std::endl;

	} else {
	for (size_t i = 0; i < _commands.size(); ++i) {
	switch(_commands[i]) {
	case 'p':
	std::cout << _program_source << std::endl;
	break;

	case 'c':
	{
	copy(_Va, _Vo);
	CpuTiming timing;
	timing.start();
	vecmul(&_Vo[0], &_Vb[0], _Vo.size());
	timing.end();
	std::cout << "Local-OpenCL statistics:" << std::endl
	<< "\tValid: " << verify(_Vo) << std::endl
	<< "\tTotal CPU time: " << timing.sec() << "s" << std::endl;
	}
	break;

	case 'g':
	try {
	copy(_Va, _Vo);
	execute_global();

	} catch (cl::Error& ex) {
	std::cerr << "* [GLOBAL] OPENCL ERROR " << ex.err() << ": " << ex.what() << std::endl;

	} catch (Error& ex) {
	std::cerr << "* [GLOBAL] ERROR " << ex.code() << ": " << ex.what() << std::endl;
	}
	break;

	case 'l':
	{
	try {
	copy(_Va, _Vo);
	execute_local();

	} catch (cl::Error& ex) {
	std::cerr << "* [LOCAL] OPENCL ERROR " << ex.err() << ": " << ex.what() << std::endl;

	} catch (Error& ex) {
	std::cerr << "* [LOCAL] ERROR " << ex.code() << ": " << ex.what() << std::endl;
	}
	}
	break;

	default:
	std::cerr << "* WARNING: Unknown command '" << _commands[i] << "'." << std::endl;
	}
	}
	}

	return ret;
	}


	template <typename T> void Example<T>::fill() {
	for (size_t i = 0; i < _vector_size; ++i) {
	_Va[i] = _min_value + (static_cast<float>(rand()) / RAND_MAX) * (_max_value - _min_value);
	_Vb[i] = _min_value + (static_cast<float>(rand()) / RAND_MAX) * (_max_value - _min_value);
	}
	}


	template <typename T> void Example<T>::copy(const std::vector<T>& from, std::vector<T>& to) {
	std::copy(from.begin(), from.begin() + _vector_size, to.begin());
	}


	template <typename T> bool Example<T>::verify(const std::vector<T>& Vo) {
	for (size_t i = 0; i < _vector_size; ++i) {
	T correct_value = _Va[i] * _Vb[i];
	T value = Vo[i];
	if (!within_margin(correct_value, value)) {
	return false;
	}
	}
	return true;
	}


	template <typename T> bool Example<T>::within_margin(T correct_value, T value) {
	T diff = correct_value - value;
	if (diff < 0) diff = -diff;
	if (diff > _margin) {
	return false;
	}
	return true;
	}


	template <typename T> std::pair<size_t, size_t> Example<T>::optimal_sizes() {
	// Utterly fails if architecture/platform doesn't give or optimize workgroup sizes
	// in powers of 2.
	std::vector<size_t> maxWorkItemSizes = _device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
	size_t maxLocalMemSize = _device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();

	// Limit by maximum work-item size.
	size_t size_pow = highest_pow2(_vector_size);
	size_t local_size = std::min(size_pow, maxWorkItemSizes[0]);

	// Further limit by maximum local memory.
	while (local_size * sizeof(T) * 2 > maxLocalMemSize) {
	local_size /= 2;
	}

	size_t global_size = next_multiple_ge_size(_vector_size, local_size);
	return std::make_pair(global_size, local_size);
	}


	template <typename T> void Example<T>::execute_global() {
	CpuTiming timing;
	timing.start();

	cl::Buffer Ma(_context, CL_MEM_READ_WRITE \| CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vo[0]);
	cl::Buffer Mb(_context, CL_MEM_READ_ONLY \| CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vb[0]);
	std::vector<cl::Event> kernel_events(1);
	kernel_events[0] = enqueue_global(Ma, Mb);
	cl::Event copy_event = enqueue_read(Ma, _Vo, true, &kernel_events, _vector_size);
	timing.end();

	bool valid = verify(_Vo);
	std::cout << "Global-OpenCL statistics:" << std::endl
	<< "\tValid: " << valid << std::endl
	<< "\tTotal CPU Time: " << timing.sec() << "s" << std::endl;
	}


	template <typename T> void Example<T>::execute_local() {
	CpuTiming timing;
	timing.start();

	cl::Buffer Ma(_context, CL_MEM_READ_WRITE \| CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vo[0]);
	cl::Buffer Mb(_context, CL_MEM_READ_ONLY \| CL_MEM_COPY_HOST_PTR, _vector_size * sizeof(T), &_Vb[0]);
	std::vector<cl::Event> kernel_events(1);
	kernel_events[0] = enqueue_local(Ma, Mb);
	cl::Event copy_event = enqueue_read(Ma, _Vo, true, &kernel_events, _vector_size);
	timing.end();

	bool valid = verify(_Vo);
	std::cout << "Local-OpenCL statistics:" << std::endl
	<< "\tValid: " << valid << std::endl
	<< "\tTotal CPU Time: " << timing.sec() << "s" << std::endl;
	}


	template <typename T> cl::Event Example<T>::enqueue_read(cl::Buffer& M, std::vector<T>& to, bool sync, std::vector<cl::Event>* dependencies, size_t size) {
	cl::Event event;
	if (size == 0) size = to.size();
	_queue.enqueueReadBuffer(M, sync, 0, size * sizeof(T), &to[0], dependencies, &event);
	return event;
	}


	template <typename T> cl::Event Example<T>::enqueue_write(cl::Buffer& M, std::vector<T>& from, bool sync, std::vector<cl::Event>* dependencies, size_t size) {
	cl::Event event;
	if (size == 0) size = from.size();
	_queue.enqueueWriteBuffer(M, sync, 0, size * sizeof(T), &from[0], dependencies, &event);
	return event;
	}


	template <typename T> cl::Event Example<T>::enqueue_global(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies) {
	cl::Event event;
	size_t arg_idx = 0;
	cl::NDRange range(_global_size);
	cl::Kernel kernel(_program, "vecmul_g");
	kernel.setArg(arg_idx++, Ma);
	kernel.setArg(arg_idx++, Mb);
	kernel.setArg(arg_idx++, static_cast<cl_int>(_vector_size));
	_queue.enqueueNDRangeKernel(kernel, cl::NullRange, range, cl::NullRange, dependencies, &event);
	return event;
	}


	template <typename T> cl::Event Example<T>::enqueue_local(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies) {
	cl::Event event;
	size_t arg_idx = 0;
	cl::NDRange grange(_global_size);
	cl::NDRange lrange(_local_size);
	cl::Kernel kernel(_program, "vecmul_l");
	kernel.setArg(arg_idx++, Ma);
	kernel.setArg(arg_idx++, Mb);
	kernel.setArg(arg_idx++, cl::Local(_local_size * sizeof(T))); // cl::__local(_local_size * sizeof(T))
	kernel.setArg(arg_idx++, cl::Local(_local_size * sizeof(T)));
	kernel.setArg(arg_idx++, static_cast<cl_int>(_vector_size));
	_queue.enqueueNDRangeKernel(kernel, cl::NullRange, grange, lrange, dependencies, &event);
	return event;
	}


	void vecmul(float* Va, const float* Vb, size_t N) {
	for (size_t i = 0; i < N; ++i) {
	Va[i] = Va[i] * Vb[i];
	}
	}


	const char* const program_source =
	"kernel void vecmul_g(\n"
	" global TYPE* Va,\n"
	" global TYPE* Vb,\n"
	" const int N\n"
	") {\n"
	" int i = get_global_id(0);\n"
	" if (i < N) {\n"
	" Va[i] = Va[i] * Vb[i];\n"
	" }\n"
	"}\n"
	"\n"
	"kernel void vecmul_l(\n"
	" global TYPE* Va,\n"
	" global TYPE* Vb,\n"
	" local TYPE* Ba,\n"
	" local TYPE* Bb,\n"
	" const int N\n"
	") {\n"
	" int i = get_global_id(0);\n"
	" if (i < N) {\n"
	" int LN = get_group_id(0) *\n"
	" get_local_size(0);\n"
	" int L = get_local_id(0) + LN;\n"
	" Ba[L] = Va[i]; Bb[L] = Vb[i];\n"
	" barrier(CLK_GLOBAL_MEM_FENCE);\n"
	" Ba[L] = Ba[L] * Bb[L];\n"
	" barrier(CLK_LOCAL_MEM_FENCE);\n"
	" Va[i] = Ba[L];\n"
	" }\n"
	"}\n"
	;
	#ifndef OPENCL_EXAMPLE_VECMUL_HPP_
	#define OPENCL_EXAMPLE_VECMUL_HPP_

	#include <vector>
	#include <iosfwd>
	#include <ctime>
	#include <string>
	#include <cstring>


	class CpuTiming;
	template <typename T> class Example;

	extern const char* const program_source;
	extern void vecmul(float* Va, const float* Vb, size_t N);

	class CpuTiming {
	public:
	CpuTiming() : _start_clk(0), _end_clk(0) {}
	inline void start() { _start_clk = std::clock(); }
	inline void end() { _end_clk = std::clock(); }
	inline double sec() { return static_cast<double>(_end_clk - _start_clk) / CLOCKS_PER_SEC; }

	private:
	clock_t _start_clk;
	clock_t _end_clk;
	};


	template <typename T> struct TypeInfo {
	static inline constexpr const char* type_name() { return ""; }
	};


	template <> struct TypeInfo<cl_int> {
	static inline constexpr const char* type_name() { return "int"; }
	};


	template <> struct TypeInfo<cl_float> {
	static inline constexpr const char* type_name() { return "float"; }
	};


	template <> struct TypeInfo<cl_double> {
	static inline constexpr const char* type_name() { return "double"; }
	};


	template <typename T> class Example {
	public:
	Example();
	Example(const Example<T>& other) = default;
	int parse(int argc, const char** argv);
	void split(std::vector<Example<T>> per_command_examples);
	int initialize();
	void print_options(std::ostream& os);
	int execute();

	private:
	void execute_global();
	void execute_local();
	cl::Event enqueue_read(cl::Buffer& M, std::vector<T>& to, bool sync, std::vector<cl::Event>* dependencies = NULL, size_t size = 0);
	cl::Event enqueue_write(cl::Buffer& M, std::vector<T>& from, bool sync, std::vector<cl::Event>* dependencies = NULL, size_t size = 0);
	cl::Event enqueue_global(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies = NULL);
	cl::Event enqueue_local(cl::Buffer& Ma, cl::Buffer& Mb, std::vector<cl::Event>* dependencies = NULL);

	void fill();
	void copy(const std::vector<T>& from, std::vector<T>& to);
	bool verify(const std::vector<T>& Vo);
	bool within_margin(T correct_value, T value);
	std::pair<size_t, size_t> optimal_sizes();

	private:
	// Options
	std::string _program_name;
	std::string _commands;
	int _platform_idx;
	size_t _vector_size;
	T _min_value;
	T _max_value;
	T _margin;
	bool _verbose;

	// State
	std::string _program_source;
	size_t _global_size;
	size_t _local_size;
	std::vector<T> _Va, _Vb, _Vo;
	cl::Context _context;
	cl::Device _device;
	cl::Program _program;
	cl::CommandQueue _queue;
	};



	#endif /* OPENCL_EXAMPLE_VECMUL_HPP_ */

	#define __CL_ENABLE_EXCEPTIONS
	#include <CL/cl.hpp>
	#include <sstream>
	#include <string>
	#include <vector>

	#include "opencl-helper.hpp"



	size_t highest_pow2(size_t num) {
	size_t ret = 1;
	while (ret < num) ret <<= 1;
	return ret;
	}


	size_t next_multiple_ge_size(size_t size, size_t dimSize) {
	size_t v = 1;
	while (v * dimSize < size) ++v;
	return dimSize * v;
	}


	bool filter_devices(cl::Platform platform, std::vector<cl::Device>& output, size_t num_devices, cl_device_type device_type) {
	bool foundDevice = false;
	std::vector<cl::Device> platformDevices;
	platform.getDevices(device_type, &platformDevices);

	if (!platformDevices.empty() && platformDevices.size()) {
	foundDevice = true;
	if (num_devices == 0)
	num_devices = platformDevices.size();
	output.insert(output.end(), platformDevices.begin(), platformDevices.begin() + std::min(num_devices, platformDevices.size()));
	}

	return foundDevice;
	}

	cl::Context create_context(size_t num_devices, cl_device_type device_type, int platform_idx, context_callback_f callback, void* callback_data) {
	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	int total_devices = (num_devices) ? num_devices : -1;

	if (platforms.empty())
	throw Error("No OpenCL Platform found, please check your OpenCL installation.");

	std::vector<cl::Device> devices;
	if (platform_idx < 0) {
	for (std::vector<cl::Platform>::iterator it = platforms.begin(); (total_devices != 0) && it != platforms.end(); ++it) {
	cl::Platform platform = *it;

	if (total_devices > 0) {
	size_t previous_size = devices.size();
	filter_devices(platform, devices, total_devices, device_type);
	total_devices -= devices.size() - previous_size;

	} else if (total_devices < 0){
	filter_devices(platform, devices, 0, device_type);
	}
	}

	} else if (platform_idx >= static_cast<int>(platforms.size())) {
	throw Error("Invalid platform index given.");

	} else {
	cl::Platform platform = platforms[platform_idx];
	filter_devices(platform, devices, num_devices, device_type);
	}

	if (devices.empty())
	throw Error("Could not find available/enough devices.");

	cl::Context context(devices, NULL, callback, callback_data);
	return context;
	}


	cl::Program compile_program(cl::Context& context, const char* source, size_t source_size, const char* options) {
	std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

	if (devices.empty())
	throw Error("No device available in context.");

	cl::Program::Sources sources;
	sources.push_back(std::make_pair(source, source_size));
	cl::Program program(context, sources);
	program.build(devices, options, NULL);
	std::stringstream ss;
	for (auto& device : devices) {
	if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device) != CL_BUILD_SUCCESS) {
	ss
	<< "Program could not be built for device \"" << device.getInfo<CL_DEVICE_NAME>() << "\"."
	<< " Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device)
	<< ", Options: " << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(device)
	<< ", Log: \"" << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\"."
	<< std::endl;
	}
	}
	std::string errors = ss.str();
	if (!errors.empty()) {
	throw Error(errors);
	}

	return program;
	}


	cl::Device first_device(cl::Context& context) {
	std::vector<cl::Device> devices;
	devices = context.getInfo<CL_CONTEXT_DEVICES>();
	return devices[0];
	}
	#ifndef OPENCL_HELPER_HPP_
	#define OPENCL_HELPER_HPP_

	#define __CL_ENABLE_EXCEPTIONS
	#include <CL/cl.hpp>
	#include <exception>

	typedef void (context_callback_f)(const char message, const void* data, size_t data_size, void* user_data);

	class Error : public std::exception {
	public:
	Error() throw() : _code(0) {}
	Error(const char* what) throw() : _code(0), _what(what) {}
	Error(const std::string& what) throw() : _code(0), _what(what) {}
	const char* what() throw() { return _what.c_str(); }
	int code() throw() { return _code; }

	private:
	int _code;
	std::string _what;
	};


	extern size_t highest_pow2(size_t num);
	extern size_t next_multiple_ge_size(size_t size, size_t dimSize);

	extern bool filter_devices(cl::Platform platform, std::vector<cl::Device>& output, size_t num_devices = 0, cl_device_type device_type = CL_DEVICE_TYPE_ALL);
	extern cl::Context create_context(size_t num_devices = 0, cl_device_type device_type = CL_DEVICE_TYPE_ALL, int platform_idx = -1, context_callback_f callback = NULL, void* callback_data = NULL);
	extern cl::Device first_device(cl::Context& context);
	extern cl::Program compile_program(cl::Context& context, const char* source, size_t source_size, const char* options = NULL);



	#endif /* OPENCL_HELPER_HPP_ */