
Andrey Alekseenko (al42and)

al42and / test_scan.cpp
Last active April 18, 2024 16:30
Simple standalone test to see how different versions of a small scan kernel behave
#include <sycl/sycl.hpp>
#if BUILD_ONEDPL
#define ONEDPL_USE_DPCPP_BACKEND 1
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/async>
#include <oneapi/dpl/execution>
#endif
template <int workGroupSize, int nElements_> struct ExclusivePrefixSumGlobal {
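The preview breaks off at the kernel functor. A minimal sketch of the same idea, using the standard SYCL 2020 group algorithm rather than the gist's hand-rolled variants; the buffer size and kernel shape here are assumptions:

#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    constexpr int n = 64; // assumption: one work-group scans the whole small array
    sycl::queue q;
    int *in = sycl::malloc_shared<int>(n, q);
    int *out = sycl::malloc_shared<int>(n, q);
    for (int i = 0; i < n; i++) in[i] = 1;
    q.parallel_for(sycl::nd_range<1>{n, n}, [=](sycl::nd_item<1> item) {
         // Collective call over the whole work-group; an exclusive scan of
         // all-ones yields 0, 1, ..., n-1.
         sycl::joint_exclusive_scan(item.get_group(), in, in + n, out,
                                    sycl::plus<int>{});
     }).wait();
    std::printf("out[10] = %d\n", out[10]); // expect 10
    sycl::free(in, q);
    sycl::free(out, q);
    return 0;
}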
#include <cstdio>
#define FARM_NB_BITS_IN_VEC 128
#include "farm_sve.h"
int main()
{
svfloat32_t a, b, c;
svbool_t m;
for (int i = 0; i < FARM_NB_BITS_IN_VEC / 32; i++)
{
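The loop body is cut off above. A minimal sketch of a complete round-trip through the same emulated-SVE types; the arithmetic is an assumption, while the intrinsic names are standard ACLE ones that farm_sve reimplements in plain C++:

#include <cstdio>
#define FARM_NB_BITS_IN_VEC 128
#include "farm_sve.h"

int main() {
    const int nLanes = FARM_NB_BITS_IN_VEC / 32; // 32-bit lanes per vector
    svbool_t all = svptrue_b32();                // all-lanes-active predicate
    svfloat32_t a = svdup_f32(1.5f);             // broadcast scalar to vector
    svfloat32_t b = svdup_f32(2.5f);
    svfloat32_t c = svadd_f32_z(all, a, b);      // zeroing predicated add
    float out[FARM_NB_BITS_IN_VEC / 32];
    svst1_f32(all, out, c);                      // store back to plain memory
    for (int i = 0; i < nLanes; i++)
        std::printf("c[%d] = %g\n", i, out[i]);  // expect 4 in every lane
    return 0;
}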
al42and / vkfft-5x5x10.cpp
Created April 25, 2023 14:48
VkFFT example failing with current "develop" branch
#include <iostream>
#include <vector>
#include "vkFFT.h"
const float inputdata[500] = {
-3.5, 6.3, 1.2, 0.3, 1.1, -5.7, 5.8, -1.9, -6.3, -1.4, 7.4, 2.4,
-9.9, -7.2, 5.4, 6.1, -1.9, -7.6, 1.4, -3.5, 0.7, 5.6, -4.2, -1.1,
-4.4, -6.3, -7.2, 4.6, -3.0, -0.9, 7.2, 2.5, -3.6, 6.1, -3.2, -2.1,
6.5, -0.4, -9.0, 2.3, 8.4, 4.0, -5.2, -9.0, 4.7, -3.7, -2.0, -9.5,
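For context, a hedged sketch of the VkFFT setup such an example typically performs. Only the configuration skeleton is shown, since the device and queue handles depend on which backend VkFFT was built for; this is not a complete program:

// Sketch only: type, field, and function names are from VkFFT's public API,
// but backend initialization (driver, device, queue) is omitted.
VkFFTConfiguration configuration = {};
configuration.FFTdim = 3;        // three-dimensional transform
configuration.size[0] = 5;       // matches the 5x5x10 grid in the gist name
configuration.size[1] = 5;
configuration.size[2] = 10;
// ... backend handles (e.g. configuration.device) go here ...
VkFFTApplication app = {};
VkFFTResult resFFT = initializeVkFFT(&app, configuration);
// ... then VkFFTAppend(&app, -1, &launchParams) runs the forward transform,
// and deleteVkFFT(&app) releases the plan.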
al42and / 8390.cpp
Created April 5, 2023 14:49
Scheduling a lot of small kernels, CUDA vs oneAPI
#include "nvToolsExt.h"
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>
template <int N> class Kernel;
constexpr int size = 8;
template <int N> void run_kernel(sycl::queue &queue, int *ptr) {
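The preview stops at the launcher's signature. A minimal sketch of the benchmark's apparent shape on the SYCL side, measuring submission overhead by queuing many trivial kernels; the launch count and timing method are assumptions:

#include <sycl/sycl.hpp>
#include <chrono>
#include <iostream>

int main() {
    sycl::queue q{sycl::property::queue::in_order{}};
    constexpr int size = 8;           // same tiny problem size as above
    constexpr int numLaunches = 10000; // assumption
    int *ptr = sycl::malloc_device<int>(size, q);
    q.fill(ptr, 0, size).wait();
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < numLaunches; i++)
        q.parallel_for(sycl::range<1>{size},
                       [=](sycl::id<1> id) { ptr[id[0]] += 1; });
    q.wait(); // a single wait at the end; submissions themselves dominate
    auto t1 = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration<double, std::micro>(t1 - t0).count() /
                     numLaunches
              << " us per launch\n";
    sycl::free(ptr, q);
    return 0;
}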
cmake_minimum_required(VERSION 3.0)
project(HipSyclTest)
set(HIPSYCL_SYCLCC_EXTRA_ARGS -DCOMMON_DEFINE=1)
set(HIPSYCL_SYCLCC_EXTRA_ARGS_DEVICE_ONLY -DGPU_DEFINE=1 -ffast-math)
find_package(hipsycl REQUIRED)
add_executable(HipSyclTest main.cpp module_cpu.cpp module_gpu.cpp)
add_sycl_to_target(TARGET HipSyclTest SOURCES module_gpu.cpp)
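A minimal sketch (a hypothetical module_gpu.cpp) of what those two variables appear to control, going by their names: HIPSYCL_SYCLCC_EXTRA_ARGS reaches every syclcc compilation of the target's SYCL sources, while the _DEVICE_ONLY variant (here GPU_DEFINE and -ffast-math) applies only during the device pass:

// Hypothetical module_gpu.cpp; the function name is an assumption.
#include <cstdio>

void report_defines() {
#ifdef COMMON_DEFINE
    std::printf("COMMON_DEFINE is set\n"); // expected in host and device passes
#endif
#ifdef GPU_DEFINE
    std::printf("GPU_DEFINE is set\n");    // expected in the device pass only
#endif
}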
#include <CL/sycl.hpp>
#include <iostream>
#include <stdio.h>
void run_kernel(const sycl::device &syclDevice) {
constexpr int numThreads = 512;
try {
sycl::queue queue = sycl::queue(syclDevice);
sycl::buffer<int, 1> buffer(numThreads);
queue
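The preview cuts off at the submission. A hedged guess at how such a command group typically continues, writing one value per work-item through an accessor; the kernel name and body are assumptions:

queue.submit([&](sycl::handler &cgh) {
     auto acc = buffer.get_access<sycl::access_mode::write>(cgh);
     cgh.parallel_for<class FillKernel>( // FillKernel is a placeholder name
         sycl::range<1>{numThreads},
         [=](sycl::id<1> id) { acc[id] = static_cast<int>(id[0]); });
 }).wait();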
al42and / sg.cpp
Created May 23, 2022 13:24
Calling a kernel with different subgroup size for different vendors
// clang++ sg.cpp -fsycl-device-code-split=per_kernel -fsycl-targets=nvptx64-nvidia-cuda,spir64 -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_75 -fsycl -o sg
#include <CL/sycl.hpp>
#include <vector>
template <int subGroupSize> class Kernel;
template <int subGroupSize>
void run_kernel(const cl::sycl::device &syclDevice) {
static const int numThreads = 64;
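The preview stops at the launcher. A minimal sketch of the technique the description names, with the required width templated into the kernel and picked per device at run time; the kernel and function names are placeholders, and older DPC++ spells the attribute [[intel::reqd_sub_group_size]]:

#include <CL/sycl.hpp>
#include <algorithm>

template <int subGroupSize> class SgKernel; // placeholder kernel name

template <int subGroupSize> void launch(sycl::queue &q) {
    q.parallel_for<SgKernel<subGroupSize>>(
         sycl::nd_range<1>{64, 64},
         [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(subGroupSize)]] {
             // device code; item.get_sub_group() now has the requested width
         })
        .wait();
}

int main() {
    sycl::queue q;
    const auto sizes =
        q.get_device().get_info<sycl::info::device::sub_group_sizes>();
    // e.g. 32 on NVIDIA; 8/16/32 on typical Intel devices
    if (std::find(sizes.begin(), sizes.end(), 32) != sizes.end())
        launch<32>(q);
    else
        launch<16>(q);
    return 0;
}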
#include <CL/sycl.hpp>
#include <iostream>
using mode = sycl::access_mode;
using sycl::access::fence_space;
using sycl::access::target;
static constexpr int blockSize = 32;
static constexpr int numBlocks = 8;
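The preview ends before any kernel. A hedged sketch of the pattern those aliases and constants usually set up, staging data in local memory behind a work-group barrier; the actual kernel logic is an assumption:

#include <CL/sycl.hpp>

static constexpr int blockSize = 32;
static constexpr int numBlocks = 8;

int main() {
    sycl::queue q;
    sycl::buffer<int, 1> buf{sycl::range<1>{numBlocks * blockSize}};
    q.submit([&](sycl::handler &cgh) {
        auto out = buf.get_access<sycl::access_mode::write>(cgh);
        sycl::accessor<int, 1, sycl::access_mode::read_write,
                       sycl::access::target::local>
            shm{sycl::range<1>{blockSize}, cgh};
        cgh.parallel_for(
            sycl::nd_range<1>{numBlocks * blockSize, blockSize},
            [=](sycl::nd_item<1> item) {
                const int lid = item.get_local_id(0);
                shm[lid] = lid;                                        // stage in local memory
                item.barrier(sycl::access::fence_space::local_space);  // work-group barrier
                out[item.get_global_id(0)] = shm[blockSize - 1 - lid]; // read a neighbour's slot
            });
    });
    q.wait();
    return 0;
}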
#include <iostream>
#include <vector>
#include <CL/sycl.hpp>
using namespace cl;
int main() {
sycl::device dev{sycl::gpu_selector{}};
sycl::queue q{dev, sycl::property_list{sycl::property::queue::in_order{}}};
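A minimal sketch of what the in_order property above buys: back-to-back submissions are implicitly ordered, so no event chaining between kernels is needed; the kernels themselves are assumptions:

#include <CL/sycl.hpp>
#include <iostream>

int main() {
    sycl::queue q{sycl::property_list{sycl::property::queue::in_order{}}};
    int *x = sycl::malloc_shared<int>(1, q);
    *x = 1;
    q.single_task([=] { *x += 1; });  // kernel A
    q.single_task([=] { *x *= 10; }); // kernel B, guaranteed to run after A
    q.wait();
    std::cout << *x << "\n"; // prints 20
    sycl::free(x, q);
    return 0;
}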
#include <CL/sycl.hpp>
#include <iostream>
#ifdef USE_NATIVE
static inline float mySqrt(float x) {
#ifdef SYCL_DEVICE_ONLY
return cl::sycl::native::sqrt(x);
#else
return cl::sycl::sqrt(x);
#endif
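A minimal sketch of how such a helper is typically exercised, comparing the device result against the host's std::sqrt to expose the precision trade-off of the native variant; the test value is an assumption, and mySqrt is the helper above:

#include <CL/sycl.hpp>
#include <cmath>
#include <cstdio>

int main() {
    sycl::queue q;
    float *res = sycl::malloc_shared<float>(1, q);
    const float x = 2.0f;
    q.single_task([=] { res[0] = mySqrt(x); }).wait(); // device-side sqrt
    std::printf("device: %.9g  host: %.9g\n", res[0], std::sqrt(x));
    sycl::free(res, q);
    return 0;
}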