ddemidov/vector_sum.cpp

## vector_sum.cpp
#include <iostream>
#include <vector>
#include <string>
#include <stdexcept>

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>

//---------------------------------------------------------------------------
void precondition(bool cond, const std::string &msg) {
    if (!cond) throw std::runtime_error(msg);
}

//---------------------------------------------------------------------------
cl::Device get_device() {
    // Get list of OpenCL platforms.
    std::vector<cl::Platform> platform;
    cl::Platform::get(&platform);

    precondition(!platform.empty(), "No OpenCL platforms.");

    // Get first available device.
    for(auto p = platform.begin(); p != platform.end(); p++) {
        std::vector<cl::Device> device;
        p->getDevices(CL_DEVICE_TYPE_ALL, &device);

        if (!device.empty()) return device[0];
    }

    precondition(false, "No compute devices.");
}

//---------------------------------------------------------------------------
cl::Program build_program(
        const cl::Context &context,
        const std::vector<cl::Device> &device,
        const std::string &source
        )
{
    cl::Program program(context,
            cl::Program::Sources(1, std::make_pair(source.c_str(), source.size()))
            );
    program.build(device);
    return program;
}

//---------------------------------------------------------------------------
size_t alignup(size_t n, size_t m) {
    // Return least multiple of m that is >= n.
    return n % m ? n - n % m + m : n;
}

//---------------------------------------------------------------------------
int main() {
    const size_t N = 1 << 20;
    const size_t M = 1 << 12;

    try {
        std::vector<cl::Device> device;
        device.push_back(get_device());

        cl::Context context(device);

        std::cout << device[0].getInfo<CL_DEVICE_NAME>() << std::endl;

        // Create command queue.
        cl::CommandQueue queue(context, device[0]);

        // Compile OpenCL program for the device.
        cl::Program program = build_program(context, device,
            "kernel void add(\n"
            "       ulong n,\n"
            "       global const int *a,\n"
            "       global const int *b,\n"
            "       global int *c\n"
            "       )\n"
            "{\n"
            "    for(size_t i = get_global_id(0); i < n; i += get_global_size(0)) {\n"
            "       c[i] = a[i] + b[i];\n"
            "    }\n"
            "}\n"
            );

        cl::Kernel add(program, "add");

        // Prepare input data.
        std::vector<int> a(N, 0);
        std::vector<int> b(N, 1);

        // Allocate device buffers and transfer input data to device.
        cl::Buffer A(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                N * sizeof(int), a.data());

        cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                N * sizeof(int), b.data());

        // The kernel configuration:
        size_t workgroup_size = 256;
        size_t global_size    = 4 * workgroup_size *
            device[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();

        for(size_t i = 0; i < M; ++i) {
            // Set kernel parameters.
            add.setArg(0, static_cast<cl_ulong>(N));
            add.setArg(1, A);
            add.setArg(2, B);
            add.setArg(3, A);

            // Launch the kernel.
            queue.enqueueNDRangeKernel(add, cl::NullRange, global_size, workgroup_size);

            // If the following line is uncommented, then everything works as intended:
            // queue.finish();
        }

        // Get result back to host.
        queue.enqueueReadBuffer(A, CL_TRUE, 0, N * sizeof(int), a.data());

        // Should see "4096 == 4096" here:
        std::cout << M << " == " << a[0] << std::endl;
    } catch (const std::exception &err) {
        std::cerr << err.what() << std::endl;
        return 1;
    }
}
	#include <iostream>
	#include <vector>
	#include <string>
	#include <stdexcept>

	#define __CL_ENABLE_EXCEPTIONS
	#include <CL/cl.hpp>

	//---------------------------------------------------------------------------
	void precondition(bool cond, const std::string &msg) {
	if (!cond) throw std::runtime_error(msg);
	}

	//---------------------------------------------------------------------------
	cl::Device get_device() {
	// Get list of OpenCL platforms.
	std::vector<cl::Platform> platform;
	cl::Platform::get(&platform);

	precondition(!platform.empty(), "No OpenCL platforms.");

	// Get first available device.
	for(auto p = platform.begin(); p != platform.end(); p++) {
	std::vector<cl::Device> device;
	p->getDevices(CL_DEVICE_TYPE_ALL, &device);

	if (!device.empty()) return device[0];
	}

	precondition(false, "No compute devices.");
	}

	//---------------------------------------------------------------------------
	cl::Program build_program(
	const cl::Context &context,
	const std::vector<cl::Device> &device,
	const std::string &source
	)
	{
	cl::Program program(context,
	cl::Program::Sources(1, std::make_pair(source.c_str(), source.size()))
	);
	program.build(device);
	return program;
	}

	//---------------------------------------------------------------------------
	size_t alignup(size_t n, size_t m) {
	// Return least multiple of m that is >= n.
	return n % m ? n - n % m + m : n;
	}

	//---------------------------------------------------------------------------
	int main() {
	const size_t N = 1 << 20;
	const size_t M = 1 << 12;

	try {
	std::vector<cl::Device> device;
	device.push_back(get_device());

	cl::Context context(device);

	std::cout << device[0].getInfo<CL_DEVICE_NAME>() << std::endl;

	// Create command queue.
	cl::CommandQueue queue(context, device[0]);

	// Compile OpenCL program for the device.
	cl::Program program = build_program(context, device,
	"kernel void add(\n"
	" ulong n,\n"
	" global const int *a,\n"
	" global const int *b,\n"
	" global int *c\n"
	" )\n"
	"{\n"
	" for(size_t i = get_global_id(0); i < n; i += get_global_size(0)) {\n"
	" c[i] = a[i] + b[i];\n"
	" }\n"
	"}\n"
	);

	cl::Kernel add(program, "add");

	// Prepare input data.
	std::vector<int> a(N, 0);
	std::vector<int> b(N, 1);

	// Allocate device buffers and transfer input data to device.
	cl::Buffer A(context, CL_MEM_READ_WRITE \| CL_MEM_COPY_HOST_PTR,
	N * sizeof(int), a.data());

	cl::Buffer B(context, CL_MEM_READ_WRITE \| CL_MEM_COPY_HOST_PTR,
	N * sizeof(int), b.data());

	// The kernel configuration:
	size_t workgroup_size = 256;
	size_t global_size = 4 * workgroup_size *
	device[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();

	for(size_t i = 0; i < M; ++i) {
	// Set kernel parameters.
	add.setArg(0, static_cast<cl_ulong>(N));
	add.setArg(1, A);
	add.setArg(2, B);
	add.setArg(3, A);

	// Launch the kernel.
	queue.enqueueNDRangeKernel(add, cl::NullRange, global_size, workgroup_size);

	// If the following line is uncommented, then everything works as intended:
	// queue.finish();
	}

	// Get result back to host.
	queue.enqueueReadBuffer(A, CL_TRUE, 0, N * sizeof(int), a.data());

	// Should see "4096 == 4096" here:
	std::cout << M << " == " << a[0] << std::endl;
	} catch (const std::exception &err) {
	std::cerr << err.what() << std::endl;
	return 1;
	}
	}