Skip to content

Instantly share code, notes, and snippets.

@al42and
Last active April 18, 2024 16:30
Show Gist options
  • Save al42and/90a0e410f7d140b851eb4d2b9c14d9cd to your computer and use it in GitHub Desktop.
Save al42and/90a0e410f7d140b851eb4d2b9c14d9cd to your computer and use it in GitHub Desktop.
Simple standalone test to see how different versions of a small scan kernel behave
#include <sycl/sycl.hpp>
#if BUILD_ONEDPL
#define ONEDPL_USE_DPCPP_BACKEND 1
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/async>
#include <oneapi/dpl/execution>
#endif
template <int workGroupSize, int nElements_> struct ExclusivePrefixSumGlobal {
constexpr static int nElements = nElements_;
static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input,
int *__restrict__ gm_output) {
static_assert(nElements % workGroupSize == 0,
"This simple scan kernel does not handle padding");
return [=](sycl::nd_item<1> itemIdx) {
const int tid = itemIdx.get_local_id(0);
const sycl::group<1> workGroup = itemIdx.get_group();
sycl::joint_exclusive_scan(workGroup, gm_input, gm_input + nElements,
gm_output, 0, sycl::plus<int>{});
};
}
static void submit(sycl::queue q, const int *dataIn, int *dataOut) {
q.submit([&](sycl::handler &cgh) {
sycl::nd_range<1> range{workGroupSize, workGroupSize};
cgh.parallel_for(range, kernel(cgh, dataIn, dataOut));
});
}
};
template <int workGroupSize, int nElements_> struct ExclusivePrefixSumLocal {
constexpr static int nElements = nElements_;
static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input,
int *__restrict__ gm_output) {
static_assert(nElements % workGroupSize == 0,
"This simple scan kernel does not handle padding");
sycl::local_accessor<int, 1> sm_localBuf(nElements, cgh);
return [=](sycl::nd_item<1> itemIdx) {
const int tid = itemIdx.get_local_id(0);
const sycl::group<1> workGroup = itemIdx.get_group();
int *sm_localBufPtr = sm_localBuf.get_pointer();
for (int elem = tid; elem < nElements; elem += workGroupSize) {
sm_localBufPtr[elem] = gm_input[elem];
}
sycl::joint_exclusive_scan(workGroup, sm_localBufPtr,
sm_localBufPtr + nElements, sm_localBufPtr, 0,
sycl::plus<int>{});
sycl::group_barrier(workGroup);
for (int elem = tid; elem < nElements; elem += workGroupSize) {
gm_output[elem] = sm_localBufPtr[elem];
}
};
}
static void submit(sycl::queue q, const int *dataIn, int *dataOut) {
q.submit([&](sycl::handler &cgh) {
sycl::nd_range<1> range{workGroupSize, workGroupSize};
cgh.parallel_for(range, kernel(cgh, dataIn, dataOut));
});
}
};
template <int nElements_> struct ExclusivePrefixSumOneDPL {
constexpr static int nElements = nElements_;
static void submit(sycl::queue q, const int *dataIn, int *dataOut) {
#if BUILD_ONEDPL
static const auto policy = oneapi::dpl::execution::make_device_policy(q);
oneapi::dpl::experimental::exclusive_scan_async(
policy, dataIn, dataIn + nElements, dataOut, 0, sycl::plus<int>{});
#endif
}
};
template <typename F>
void benchmark(sycl::device device, F &&kernel, const char *name) {
sycl::queue q(device, {sycl::property::queue::in_order()});
std::cout << "Running on " << device.get_info<sycl::info::device::name>()
<< ", version " << name << "\n";
constexpr int size = F::nElements;
int *dataIn = sycl::malloc_device<int>(size, q);
int *dataOut = sycl::malloc_device<int>(size, q);
std::vector<int> dataH(size);
// Warm-up and sanity check
q.fill<int>(dataIn, 1, size);
F::submit(q, dataIn, dataOut);
q.copy<int>(dataOut, dataH.data(), size).wait();
int numFail = 0;
for (size_t i = 0; i < size; i++) {
int reference = i;
numFail += (dataH[i] != reference);
}
if (numFail > 0)
return;
for (int iter = 0; iter < 1000; iter++) {
F::submit(q, dataIn, dataOut);
q.wait();
}
sycl::free(dataIn, q);
sycl::free(dataOut, q);
}
int main() {
for (const auto &device : sycl::device::get_devices()) {
// Creating SYCL queue
sycl::queue q(device, {sycl::property::queue::in_order()});
constexpr size_t size = 8192;
benchmark(device, ExclusivePrefixSumGlobal<256, size>{}, "global (256)");
benchmark(device, ExclusivePrefixSumGlobal<512, size>{}, "global (512)");
benchmark(device, ExclusivePrefixSumGlobal<1024, size>{}, "global (1024)");
benchmark(device, ExclusivePrefixSumLocal<256, size>{}, "local (256)");
benchmark(device, ExclusivePrefixSumLocal<512, size>{}, "local (512)");
benchmark(device, ExclusivePrefixSumLocal<1024, size>{}, "local (1024)");
benchmark(device, ExclusivePrefixSumOneDPL<size>{}, "oneDPL");
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment