al42and/test_scan.cpp

## test_scan.cpp
#include <sycl/sycl.hpp>

#if BUILD_ONEDPL
#define ONEDPL_USE_DPCPP_BACKEND 1
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/async>
#include <oneapi/dpl/execution>
#endif

template <int workGroupSize, int nElements_> struct ExclusivePrefixSumGlobal {
  constexpr static int nElements = nElements_;
  static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input,
                     int *__restrict__ gm_output) {
    static_assert(nElements % workGroupSize == 0,
                  "This simple scan kernel does not handle padding");
    return [=](sycl::nd_item<1> itemIdx) {
      const int tid = itemIdx.get_local_id(0);
      const sycl::group<1> workGroup = itemIdx.get_group();
      sycl::joint_exclusive_scan(workGroup, gm_input, gm_input + nElements,
                                 gm_output, 0, sycl::plus<int>{});
    };
  }
  static void submit(sycl::queue q, const int *dataIn, int *dataOut) {
    q.submit([&](sycl::handler &cgh) {
      sycl::nd_range<1> range{workGroupSize, workGroupSize};
      cgh.parallel_for(range, kernel(cgh, dataIn, dataOut));
    });
  }
};

template <int workGroupSize, int nElements_> struct ExclusivePrefixSumLocal {
  constexpr static int nElements = nElements_;
  static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input,
                     int *__restrict__ gm_output) {
    static_assert(nElements % workGroupSize == 0,
                  "This simple scan kernel does not handle padding");
    sycl::local_accessor<int, 1> sm_localBuf(nElements, cgh);
    return [=](sycl::nd_item<1> itemIdx) {
      const int tid = itemIdx.get_local_id(0);
      const sycl::group<1> workGroup = itemIdx.get_group();
      int *sm_localBufPtr = sm_localBuf.get_pointer();

      for (int elem = tid; elem < nElements; elem += workGroupSize) {
        sm_localBufPtr[elem] = gm_input[elem];
      }
      sycl::joint_exclusive_scan(workGroup, sm_localBufPtr,
                                 sm_localBufPtr + nElements, sm_localBufPtr, 0,
                                 sycl::plus<int>{});
      sycl::group_barrier(workGroup);
      for (int elem = tid; elem < nElements; elem += workGroupSize) {
        gm_output[elem] = sm_localBufPtr[elem];
      }
    };
  }
  static void submit(sycl::queue q, const int *dataIn, int *dataOut) {
    q.submit([&](sycl::handler &cgh) {
      sycl::nd_range<1> range{workGroupSize, workGroupSize};
      cgh.parallel_for(range, kernel(cgh, dataIn, dataOut));
    });
  }
};

template <int nElements_> struct ExclusivePrefixSumOneDPL {
  constexpr static int nElements = nElements_;
  static void submit(sycl::queue q, const int *dataIn, int *dataOut) {
#if BUILD_ONEDPL
    static const auto policy = oneapi::dpl::execution::make_device_policy(q);
    oneapi::dpl::experimental::exclusive_scan_async(
        policy, dataIn, dataIn + nElements, dataOut, 0, sycl::plus<int>{});
#endif
  }
};

template <typename F>
void benchmark(sycl::device device, F &&kernel, const char *name) {
  sycl::queue q(device, {sycl::property::queue::in_order()});
  std::cout << "Running on " << device.get_info<sycl::info::device::name>()
            << ", version " << name << "\n";

  constexpr int size = F::nElements;
  int *dataIn = sycl::malloc_device<int>(size, q);
  int *dataOut = sycl::malloc_device<int>(size, q);
  std::vector<int> dataH(size);

  // Warm-up and sanity check
  q.fill<int>(dataIn, 1, size);
  F::submit(q, dataIn, dataOut);
  q.copy<int>(dataOut, dataH.data(), size).wait();

  int numFail = 0;
  for (size_t i = 0; i < size; i++) {
    int reference = i;
    numFail += (dataH[i] != reference);
  }
  if (numFail > 0)
    return;

  for (int iter = 0; iter < 1000; iter++) {
    F::submit(q, dataIn, dataOut);
    q.wait();
  }

  sycl::free(dataIn, q);
  sycl::free(dataOut, q);
}

int main() {
  for (const auto &device : sycl::device::get_devices()) {
    // Creating SYCL queue
    sycl::queue q(device, {sycl::property::queue::in_order()});

    constexpr size_t size = 8192;

    benchmark(device, ExclusivePrefixSumGlobal<256, size>{}, "global (256)");
    benchmark(device, ExclusivePrefixSumGlobal<512, size>{}, "global (512)");
    benchmark(device, ExclusivePrefixSumGlobal<1024, size>{}, "global (1024)");
    benchmark(device, ExclusivePrefixSumLocal<256, size>{}, "local (256)");
    benchmark(device, ExclusivePrefixSumLocal<512, size>{}, "local (512)");
    benchmark(device, ExclusivePrefixSumLocal<1024, size>{}, "local (1024)");
    benchmark(device, ExclusivePrefixSumOneDPL<size>{}, "oneDPL");
  }
  return 0;
}
	#include <sycl/sycl.hpp>

	#if BUILD_ONEDPL
	#define ONEDPL_USE_DPCPP_BACKEND 1
	#include <oneapi/dpl/algorithm>
	#include <oneapi/dpl/async>
	#include <oneapi/dpl/execution>
	#endif

	template <int workGroupSize, int nElements_> struct ExclusivePrefixSumGlobal {
	constexpr static int nElements = nElements_;
	static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input,
	int *__restrict__ gm_output) {
	static_assert(nElements % workGroupSize == 0,
	"This simple scan kernel does not handle padding");
	return [=](sycl::nd_item<1> itemIdx) {
	const int tid = itemIdx.get_local_id(0);
	const sycl::group<1> workGroup = itemIdx.get_group();
	sycl::joint_exclusive_scan(workGroup, gm_input, gm_input + nElements,
	gm_output, 0, sycl::plus<int>{});
	};
	}
	static void submit(sycl::queue q, const int dataIn, int dataOut) {
	q.submit([&](sycl::handler &cgh) {
	sycl::nd_range<1> range{workGroupSize, workGroupSize};
	cgh.parallel_for(range, kernel(cgh, dataIn, dataOut));
	});
	}
	};

	template <int workGroupSize, int nElements_> struct ExclusivePrefixSumLocal {
	constexpr static int nElements = nElements_;
	static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input,
	int *__restrict__ gm_output) {
	static_assert(nElements % workGroupSize == 0,
	"This simple scan kernel does not handle padding");
	sycl::local_accessor<int, 1> sm_localBuf(nElements, cgh);
	return [=](sycl::nd_item<1> itemIdx) {
	const int tid = itemIdx.get_local_id(0);
	const sycl::group<1> workGroup = itemIdx.get_group();
	int *sm_localBufPtr = sm_localBuf.get_pointer();

	for (int elem = tid; elem < nElements; elem += workGroupSize) {
	sm_localBufPtr[elem] = gm_input[elem];
	}
	sycl::joint_exclusive_scan(workGroup, sm_localBufPtr,
	sm_localBufPtr + nElements, sm_localBufPtr, 0,
	sycl::plus<int>{});
	sycl::group_barrier(workGroup);
	for (int elem = tid; elem < nElements; elem += workGroupSize) {
	gm_output[elem] = sm_localBufPtr[elem];
	}
	};
	}
	static void submit(sycl::queue q, const int dataIn, int dataOut) {
	q.submit([&](sycl::handler &cgh) {
	sycl::nd_range<1> range{workGroupSize, workGroupSize};
	cgh.parallel_for(range, kernel(cgh, dataIn, dataOut));
	});
	}
	};

	template <int nElements_> struct ExclusivePrefixSumOneDPL {
	constexpr static int nElements = nElements_;
	static void submit(sycl::queue q, const int dataIn, int dataOut) {
	#if BUILD_ONEDPL
	static const auto policy = oneapi::dpl::execution::make_device_policy(q);
	oneapi::dpl::experimental::exclusive_scan_async(
	policy, dataIn, dataIn + nElements, dataOut, 0, sycl::plus<int>{});
	#endif
	}
	};

	template <typename F>
	void benchmark(sycl::device device, F &&kernel, const char *name) {
	sycl::queue q(device, {sycl::property::queue::in_order()});
	std::cout << "Running on " << device.get_info<sycl::info::device::name>()
	<< ", version " << name << "\n";

	constexpr int size = F::nElements;
	int *dataIn = sycl::malloc_device<int>(size, q);
	int *dataOut = sycl::malloc_device<int>(size, q);
	std::vector<int> dataH(size);

	// Warm-up and sanity check
	q.fill<int>(dataIn, 1, size);
	F::submit(q, dataIn, dataOut);
	q.copy<int>(dataOut, dataH.data(), size).wait();

	int numFail = 0;
	for (size_t i = 0; i < size; i++) {
	int reference = i;
	numFail += (dataH[i] != reference);
	}
	if (numFail > 0)
	return;

	for (int iter = 0; iter < 1000; iter++) {
	F::submit(q, dataIn, dataOut);
	q.wait();
	}

	sycl::free(dataIn, q);
	sycl::free(dataOut, q);
	}

	int main() {
	for (const auto &device : sycl::device::get_devices()) {
	// Creating SYCL queue
	sycl::queue q(device, {sycl::property::queue::in_order()});

	constexpr size_t size = 8192;

	benchmark(device, ExclusivePrefixSumGlobal<256, size>{}, "global (256)");
	benchmark(device, ExclusivePrefixSumGlobal<512, size>{}, "global (512)");
	benchmark(device, ExclusivePrefixSumGlobal<1024, size>{}, "global (1024)");
	benchmark(device, ExclusivePrefixSumLocal<256, size>{}, "local (256)");
	benchmark(device, ExclusivePrefixSumLocal<512, size>{}, "local (512)");
	benchmark(device, ExclusivePrefixSumLocal<1024, size>{}, "local (1024)");
	benchmark(device, ExclusivePrefixSumOneDPL<size>{}, "oneDPL");
	}
	return 0;
	}