@al42and · Created May 23, 2022 13:24
Calling a kernel with a different sub-group size for different vendors
// clang++ sg.cpp -fsycl-device-code-split=per_kernel -fsycl-targets=nvptx64-nvidia-cuda,spir64 -Xsycl-target-backend=nvptx64-nvidia-cuda,spir64 --offload-arch=sm_75 -fsycl -o sg
#include <CL/sycl.hpp>

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

// Kernel name type, templated on the sub-group size so that each
// instantiation is a separate kernel.
template <int subGroupSize> class Kernel;

// Launch a trivial kernel on the given device with the requested
// compile-time sub-group size.
template <int subGroupSize>
void run_kernel(const cl::sycl::device &syclDevice) {
  static const int numThreads = 64;
  std::cout << "Calling " << subGroupSize << std::endl;
  sycl::queue queue = cl::sycl::queue(syclDevice);
  auto buf = sycl::malloc_device<float>(1, queue);
  sycl::event ev = queue.submit([&](sycl::handler &cgh) {
    cgh.parallel_for<Kernel<subGroupSize>>(
        sycl::range<1>{numThreads},
        [=](sycl::id<1> threadId) [[sycl::reqd_sub_group_size(subGroupSize)]] {
          buf[0] = 1;
        });
  });
  ev.wait_and_throw();
  sycl::free(buf, queue); // release the USM allocation
  std::cout << " Done!" << std::endl;
}
int main() {
  std::vector<cl::sycl::device> devices = cl::sycl::device::get_devices();
  for (const auto &dev : devices) {
    std::vector<size_t> subGroupSizes;
    try {
      subGroupSizes = dev.get_info<sycl::info::device::sub_group_sizes>();
    } catch (const sycl::runtime_error &) {
      // Device does not report sub-group sizes; skip it.
      continue;
    }
    if (subGroupSizes[0] == 32) { // NVIDIA
      run_kernel<32>(dev);
    } else if (subGroupSizes[0] == 64) { // AMD
      run_kernel<64>(dev);
    } else { // Intel: can support 32, but 16 works best
      assert(std::find(subGroupSizes.begin(), subGroupSizes.end(), 16) !=
             subGroupSizes.end());
      run_kernel<16>(dev);
    }
  }
}
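
For reference, here is a minimal standalone sketch (not part of the original gist) that only prints what info::device::sub_group_sizes reports for each device, i.e. the same query that drives the per-vendor dispatch in main() above:

// List the sub-group sizes each SYCL device reports.
#include <CL/sycl.hpp>
#include <iostream>

int main() {
  for (const auto &dev : sycl::device::get_devices()) {
    std::cout << dev.get_info<sycl::info::device::name>() << ":";
    try {
      for (size_t sz : dev.get_info<sycl::info::device::sub_group_sizes>())
        std::cout << " " << sz;
    } catch (const sycl::runtime_error &) {
      std::cout << " (query not supported)"; // e.g. some host/CPU backends
    }
    std::cout << std::endl;
  }
}

The reported values vary by hardware and driver, but typically NVIDIA GPUs list only 32, AMD GPUs list 64, and Intel GPUs list something like 8, 16, 32, which is what the branches in main() assume.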