Skip to content

Instantly share code, notes, and snippets.

@ddemidov
Last active December 16, 2015 07:19
Show Gist options
  • Save ddemidov/5398174 to your computer and use it in GitHub Desktop.
Save ddemidov/5398174 to your computer and use it in GitHub Desktop.
Bug in AMD OpenCL implementation
#include <iostream>
#include <vector>
#include <string>
#include <stdexcept>
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
//---------------------------------------------------------------------------
void precondition(bool cond, const std::string &msg) {
if (!cond) throw std::runtime_error(msg);
}
//---------------------------------------------------------------------------
cl::Device get_device() {
// Get list of OpenCL platforms.
std::vector<cl::Platform> platform;
cl::Platform::get(&platform);
precondition(!platform.empty(), "No OpenCL platforms.");
// Get first available device.
for(auto p = platform.begin(); p != platform.end(); p++) {
std::vector<cl::Device> device;
p->getDevices(CL_DEVICE_TYPE_ALL, &device);
if (!device.empty()) return device[0];
}
precondition(false, "No compute devices.");
}
//---------------------------------------------------------------------------
cl::Program build_program(
const cl::Context &context,
const std::vector<cl::Device> &device,
const std::string &source
)
{
cl::Program program(context,
cl::Program::Sources(1, std::make_pair(source.c_str(), source.size()))
);
program.build(device);
return program;
}
//---------------------------------------------------------------------------
size_t alignup(size_t n, size_t m) {
// Return least multiple of m that is >= n.
return n % m ? n - n % m + m : n;
}
//---------------------------------------------------------------------------
int main() {
const size_t N = 1 << 20;
const size_t M = 1 << 12;
try {
std::vector<cl::Device> device;
device.push_back(get_device());
cl::Context context(device);
std::cout << device[0].getInfo<CL_DEVICE_NAME>() << std::endl;
// Create command queue.
cl::CommandQueue queue(context, device[0]);
// Compile OpenCL program for the device.
cl::Program program = build_program(context, device,
"kernel void add(\n"
" ulong n,\n"
" global const int *a,\n"
" global const int *b,\n"
" global int *c\n"
" )\n"
"{\n"
" for(size_t i = get_global_id(0); i < n; i += get_global_size(0)) {\n"
" c[i] = a[i] + b[i];\n"
" }\n"
"}\n"
);
cl::Kernel add(program, "add");
// Prepare input data.
std::vector<int> a(N, 0);
std::vector<int> b(N, 1);
// Allocate device buffers and transfer input data to device.
cl::Buffer A(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
N * sizeof(int), a.data());
cl::Buffer B(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
N * sizeof(int), b.data());
// The kernel configuration:
size_t workgroup_size = 256;
size_t global_size = 4 * workgroup_size *
device[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
for(size_t i = 0; i < M; ++i) {
// Set kernel parameters.
add.setArg(0, static_cast<cl_ulong>(N));
add.setArg(1, A);
add.setArg(2, B);
add.setArg(3, A);
// Launch the kernel.
queue.enqueueNDRangeKernel(add, cl::NullRange, global_size, workgroup_size);
// If the following line is uncommented, then everything works as intended:
// queue.finish();
}
// Get result back to host.
queue.enqueueReadBuffer(A, CL_TRUE, 0, N * sizeof(int), a.data());
// Should see "4096 == 4096" here:
std::cout << M << " == " << a[0] << std::endl;
} catch (const std::exception &err) {
std::cerr << err.what() << std::endl;
return 1;
}
}
@ddemidov
Copy link
Author

This simple program, when compiled with

g++ -std=c++0x -o vector_sum vector_sum.cpp -lOpenCL

outputs 4096 == 4096 on NVIDIA and Intel OpenCL implementations. When, however, it is executed on AMD GPUs (the ones I tested are HD 7970 'Tahiti' and HD 7770 'Capeverde'), it may output 4096 == 4081, 4096 == 4082, or something else.

Uncommenting line 113 solves the issue, but should be unnecessary according to the standard. Moving the line outside of the loop (right before buffer read) does not help at all.

Replacing definition of global_size at line 99 with

size_t global_size = alignup(N, workgroup_size);

also helps, but is equally unnecessary.

The current operating system is Gentoo linux, kernel version 3.7.1. ati-drivers package has version 13.1. But I have observed this behavior on several machines for several consecutive versions of ati-drivers (and several linux kernels).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment