
Andrey Alekseenko (al42and)

al42and / test_scan.cpp
Last active April 18, 2024 16:30
Simple standalone test to see how different versions of a small scan kernel behave
#include <sycl/sycl.hpp>
#if BUILD_ONEDPL
#define ONEDPL_USE_DPCPP_BACKEND 1
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/async>
#include <oneapi/dpl/execution>
#endif
template <int workGroupSize, int nElements_> struct ExclusivePrefixSumGlobal {
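The preview breaks off at the kernel functor. A minimal sketch of the same idea, using the standard SYCL 2020 group algorithm rather than the gist's hand-rolled variants; the buffer size and kernel shape here are assumptions:

#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    constexpr int n = 64; // assumption: one work-group scans the whole small array
    sycl::queue q;
    int *in = sycl::malloc_shared<int>(n, q);
    int *out = sycl::malloc_shared<int>(n, q);
    for (int i = 0; i < n; i++) in[i] = 1;
    q.parallel_for(sycl::nd_range<1>{n, n}, [=](sycl::nd_item<1> item) {
         // Collective call over the whole work-group; an exclusive scan of
         // all-ones yields 0, 1, ..., n-1.
         sycl::joint_exclusive_scan(item.get_group(), in, in + n, out,
                                    sycl::plus<int>{});
     }).wait();
    std::printf("out[10] = %d\n", out[10]); // expect 10
    sycl::free(in, q);
    sycl::free(out, q);
    return 0;
}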
#include <cstdio>
#define FARM_NB_BITS_IN_VEC 128
#include "farm_sve.h"
int main()
{
svfloat32_t a, b, c;
svbool_t m;
for (int i = 0; i < FARM_NB_BITS_IN_VEC / 32; i++)
{
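The loop body is cut off above. A minimal sketch of a complete round-trip through the same emulated-SVE types; the arithmetic is an assumption, while the intrinsic names are standard ACLE ones that farm_sve reimplements in plain C++:

#include <cstdio>
#define FARM_NB_BITS_IN_VEC 128
#include "farm_sve.h"

int main() {
    const int nLanes = FARM_NB_BITS_IN_VEC / 32; // 32-bit lanes per vector
    svbool_t all = svptrue_b32();                // all-lanes-active predicate
    svfloat32_t a = svdup_f32(1.5f);             // broadcast scalar to vector
    svfloat32_t b = svdup_f32(2.5f);
    svfloat32_t c = svadd_f32_z(all, a, b);      // zeroing predicated add
    float out[FARM_NB_BITS_IN_VEC / 32];
    svst1_f32(all, out, c);                      // store back to plain memory
    for (int i = 0; i < nLanes; i++)
        std::printf("c[%d] = %g\n", i, out[i]);  // expect 4 in every lane
    return 0;
}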
al42and / vkfft-5x5x10.cpp
Created April 25, 2023 14:48
VkFFT example failing with current "develop" branch
#include <iostream>
#include <vector>
#include "vkFFT.h"
const float inputdata[500] = {
-3.5, 6.3, 1.2, 0.3, 1.1, -5.7, 5.8, -1.9, -6.3, -1.4, 7.4, 2.4,
-9.9, -7.2, 5.4, 6.1, -1.9, -7.6, 1.4, -3.5, 0.7, 5.6, -4.2, -1.1,
-4.4, -6.3, -7.2, 4.6, -3.0, -0.9, 7.2, 2.5, -3.6, 6.1, -3.2, -2.1,
6.5, -0.4, -9.0, 2.3, 8.4, 4.0, -5.2, -9.0, 4.7, -3.7, -2.0, -9.5,
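For context, a hedged sketch of the VkFFT setup such an example typically performs. Only the configuration skeleton is shown, since the device and queue handles depend on which backend VkFFT was built for; this is not a complete program:

// Sketch only: type, field, and function names are from VkFFT's public API,
// but backend initialization (driver, device, queue) is omitted.
VkFFTConfiguration configuration = {};
configuration.FFTdim = 3;        // three-dimensional transform
configuration.size[0] = 5;       // matches the 5x5x10 grid in the gist name
configuration.size[1] = 5;
configuration.size[2] = 10;
// ... backend handles (e.g. configuration.device) go here ...
VkFFTApplication app = {};
VkFFTResult resFFT = initializeVkFFT(&app, configuration);
// ... then VkFFTAppend(&app, -1, &launchParams) runs the forward transform,
// and deleteVkFFT(&app) releases the plan.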
al42and / 8390.cpp
Created April 5, 2023 14:49
Scheduling a lot of small kernels, CUDA vs oneAPI
#include "nvToolsExt.h"
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>
template <int N> class Kernel;
constexpr int size = 8;
template <int N> void run_kernel(sycl::queue &queue, int *ptr) {
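The preview stops at the launcher's signature. A minimal sketch of the benchmark's apparent shape on the SYCL side, measuring submission overhead by queuing many trivial kernels; the launch count and timing method are assumptions:

#include <sycl/sycl.hpp>
#include <chrono>
#include <iostream>

int main() {
    sycl::queue q{sycl::property::queue::in_order{}};
    constexpr int size = 8;           // same tiny problem size as above
    constexpr int numLaunches = 10000; // assumption
    int *ptr = sycl::malloc_device<int>(size, q);
    q.fill(ptr, 0, size).wait();
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < numLaunches; i++)
        q.parallel_for(sycl::range<1>{size},
                       [=](sycl::id<1> id) { ptr[id[0]] += 1; });
    q.wait(); // a single wait at the end; submissions themselves dominate
    auto t1 = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration<double, std::micro>(t1 - t0).count() /
                     numLaunches
              << " us per launch\n";
    sycl::free(ptr, q);
    return 0;
}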
cmake_minimum_required(VERSION 3.0)
project(HipSyclTest)
set(HIPSYCL_SYCLCC_EXTRA_ARGS -DCOMMON_DEFINE=1)
set(HIPSYCL_SYCLCC_EXTRA_ARGS_DEVICE_ONLY -DGPU_DEFINE=1 -ffast-math)
find_package(hipsycl REQUIRED)
add_executable(HipSyclTest main.cpp module_cpu.cpp module_gpu.cpp)
add_sycl_to_target(TARGET HipSyclTest SOURCES module_gpu.cpp)
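A minimal sketch (a hypothetical module_gpu.cpp) of what those two variables appear to control, going by their names: HIPSYCL_SYCLCC_EXTRA_ARGS reaches every syclcc compilation of the target's SYCL sources, while the _DEVICE_ONLY variant (here GPU_DEFINE and -ffast-math) applies only during the device pass:

// Hypothetical module_gpu.cpp; the function name is an assumption.
#include <cstdio>

void report_defines() {
#ifdef COMMON_DEFINE
    std::printf("COMMON_DEFINE is set\n"); // expected in host and device passes
#endif
#ifdef GPU_DEFINE
    std::printf("GPU_DEFINE is set\n");    // expected in the device pass only
#endif
}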
#include <CL/sycl.hpp>
#include <iostream>
#include <stdio.h>
void run_kernel(const sycl::device &syclDevice) {
constexpr int numThreads = 512;
try {
sycl::queue queue = sycl::queue(syclDevice);
sycl::buffer<int, 1> buffer(numThreads);
queue
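The preview cuts off at the submission. A hedged guess at how such a command group typically continues, writing one value per work-item through an accessor; the kernel name and body are assumptions:

queue.submit([&](sycl::handler &cgh) {
     auto acc = buffer.get_access<sycl::access_mode::write>(cgh);
     cgh.parallel_for<class FillKernel>( // FillKernel is a placeholder name
         sycl::range<1>{numThreads},
         [=](sycl::id<1> id) { acc[id] = static_cast<int>(id[0]); });
 }).wait();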
al42and / sg.cpp
Created May 23, 2022 13:24
Calling a kernel with different subgroup size for different vendors
// clang++ sg.cpp -fsycl-device-code-split=per_kernel -fsycl-targets=nvptx64-nvidia-cuda,spir64 -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_75 -fsycl -o sg
#include <CL/sycl.hpp>
#include <vector>
template <int subGroupSize> class Kernel;
template <int subGroupSize>
void run_kernel(const cl::sycl::device &syclDevice) {
static const int numThreads = 64;
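The preview stops at the launcher. A minimal sketch of the technique the description names, with the required width templated into the kernel and picked per device at run time; the kernel and function names are placeholders, and older DPC++ spells the attribute [[intel::reqd_sub_group_size]]:

#include <CL/sycl.hpp>
#include <algorithm>

template <int subGroupSize> class SgKernel; // placeholder kernel name

template <int subGroupSize> void launch(sycl::queue &q) {
    q.parallel_for<SgKernel<subGroupSize>>(
         sycl::nd_range<1>{64, 64},
         [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(subGroupSize)]] {
             // device code; item.get_sub_group() now has the requested width
         })
        .wait();
}

int main() {
    sycl::queue q;
    const auto sizes =
        q.get_device().get_info<sycl::info::device::sub_group_sizes>();
    // e.g. 32 on NVIDIA; 8/16/32 on typical Intel devices
    if (std::find(sizes.begin(), sizes.end(), 32) != sizes.end())
        launch<32>(q);
    else
        launch<16>(q);
    return 0;
}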
#include <CL/sycl.hpp>
#include <iostream>
using mode = sycl::access_mode;
using sycl::access::fence_space;
using sycl::access::target;
static constexpr int blockSize = 32;
static constexpr int numBlocks = 8;
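The preview ends before any kernel. A hedged sketch of the pattern those aliases and constants usually set up, staging data in local memory behind a work-group barrier; the actual kernel logic is an assumption:

#include <CL/sycl.hpp>

static constexpr int blockSize = 32;
static constexpr int numBlocks = 8;

int main() {
    sycl::queue q;
    sycl::buffer<int, 1> buf{sycl::range<1>{numBlocks * blockSize}};
    q.submit([&](sycl::handler &cgh) {
        auto out = buf.get_access<sycl::access_mode::write>(cgh);
        sycl::accessor<int, 1, sycl::access_mode::read_write,
                       sycl::access::target::local>
            shm{sycl::range<1>{blockSize}, cgh};
        cgh.parallel_for(
            sycl::nd_range<1>{numBlocks * blockSize, blockSize},
            [=](sycl::nd_item<1> item) {
                const int lid = item.get_local_id(0);
                shm[lid] = lid;                                        // stage in local memory
                item.barrier(sycl::access::fence_space::local_space);  // work-group barrier
                out[item.get_global_id(0)] = shm[blockSize - 1 - lid]; // read a neighbour's slot
            });
    });
    q.wait();
    return 0;
}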
#include <iostream>
#include <vector>
#include <CL/sycl.hpp>
using namespace cl;
int main() {
sycl::device dev{sycl::gpu_selector{}};
sycl::queue q{dev, sycl::property_list{sycl::property::queue::in_order{}}};
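A minimal sketch of what the in_order property above buys: back-to-back submissions are implicitly ordered, so no event chaining between kernels is needed; the kernels themselves are assumptions:

#include <CL/sycl.hpp>
#include <iostream>

int main() {
    sycl::queue q{sycl::property_list{sycl::property::queue::in_order{}}};
    int *x = sycl::malloc_shared<int>(1, q);
    *x = 1;
    q.single_task([=] { *x += 1; });  // kernel A
    q.single_task([=] { *x *= 10; }); // kernel B, guaranteed to run after A
    q.wait();
    std::cout << *x << "\n"; // prints 20
    sycl::free(x, q);
    return 0;
}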
#include <CL/sycl.hpp>
#include <iostream>
#ifdef USE_NATIVE
static inline float mySqrt(float x) {
#ifdef SYCL_DEVICE_ONLY
return cl::sycl::native::sqrt(x);
#else
return cl::sycl::sqrt(x);
#endif
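A minimal sketch of how such a helper is typically exercised, comparing the device result against the host's std::sqrt to expose the precision trade-off of the native variant; the test value is an assumption, and mySqrt is the helper above:

#include <CL/sycl.hpp>
#include <cmath>
#include <cstdio>

int main() {
    sycl::queue q;
    float *res = sycl::malloc_shared<float>(1, q);
    const float x = 2.0f;
    q.single_task([=] { res[0] = mySqrt(x); }).wait(); // device-side sqrt
    std::printf("device: %.9g  host: %.9g\n", res[0], std::sqrt(x));
    sycl::free(res, q);
    return 0;
}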