Christoph Klein codecircuit

## check-cuda-event-inter-stream-synchronization.cu
#include <cuda/runtime_api.hpp>
__device__ float pirate_value;

__global__ void pirate_float_non_assembler(char operation, int num_operations) {
	float acc{};
	switch (operation) {
		case 'a':
			for (int i = 0; i < num_operations; ++i) {
				acc += threadIdx.x;
			}

## test-compute-sanitizer-racecheck-with-sync-intrinsic.cu
#include <cstdio>

// compile with: nvcc -g -G -std=c++17 test-compute-sanitizer-racecheck-with-sync-intrinsic.cu -arch sm_75 -o test-compute-sanitizer-racecheck-with-sync-intrinsic
// execute with: compute-sanitizer --tool racecheck ./test-compute-sanitizer-racecheck-with-sync-intrinsic
//
// The example at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-examples-broadcast
// mentions that the threads are "synchronized" in the comment behind the function call to `shfl_sync`.


__managed__ float result[1];

## manual-fourier-transform-example.py
#!/usr/bin/env python

import numpy as np

import matplotlib.pyplot as plt

# Interval bounds and step width. NOTE(review): presumably the sampling grid
# for the transform this script computes -- confirm against the rest of the file.
x0 = -10.0  # lower bound of the interval
x1 = 10.0  # upper bound of the interval
N = 1000  # number of equal-width steps the interval [x0, x1] is divided into
dx = (x1 - x0) / N  # width of a single step

## warp-finds-max-G-elements.cu
// compile with `nvcc -std=c++17 -arch sm_61 --expt-relaxed-constexpr main.cu -o main`
#include <cfloat>
#include <cstdio>
#include <iostream>

__global__ void kernel(float* values, int num_values) {
	const int tid = threadIdx.x;
	const int G = blockDim.x;

	float r = 0;

## CMakeLists.txt
# Build script for the `cuda-clang` executable.
cmake_minimum_required(VERSION 3.18)

# Enable both the C++ and the CUDA language for this project.
project(cuda-clang LANGUAGES CXX CUDA)

# External packages; REQUIRED makes configuration fail if one cannot be found.
find_package(CMakeshift REQUIRED)
# The gsl-lite lookup is currently disabled.
#find_package(gsl-lite REQUIRED)
find_package(cuda-api-wrappers REQUIRED)

# Single executable target built from one CUDA source file.
add_executable(cuda-clang  "cuda-clang.cu")

## dynamic-parallelism-with-thrust-and-multiple-streams.cu
// compile with `nvcc -std=c++17 -rdc=true --extended-lambda dynamic-parallelism-with-thrust-and-multiple-streams.cu -o dpe`
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

struct do_something {
    __device__ do_something(float* ptr) : ptr_(ptr) {}
	#include <cuda/runtime_api.hpp>
	__device__ float pirate_value;

	__global__ void pirate_float_non_assembler(char operation, int num_operations) {
	float acc{};
	switch (operation) {
	case 'a':
	for (int i = 0; i < num_operations; ++i) {
	acc += threadIdx.x;
	}
	#include <cstdio>

	// compile with: nvcc -g -G -std=c++17 test-compute-sanitizer-racecheck-with-sync-intrinsic.cu -arch sm_75 -o test-compute-sanitizer-racecheck-with-sync-intrinsic
	// execute with: compute-sanitizer --tool racecheck ./test-compute-sanitizer-racecheck-with-sync-intrinsic
	//
	// The example at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-examples-broadcast
	// mentions that the threads are "synchronized" in the comment behind the function call to `shfl_sync`.


	__managed__ float result[1];
	#!/usr/bin/env python

	import numpy as np

	import matplotlib.pyplot as plt

	x0 = -10.0
	x1 = 10.0
	N = 1000
	dx = (x1 - x0) / N
	// compile with `nvcc -std=c++17 -arch sm_61 --expt-relaxed-constexpr main.cu -o main`
	#include <cfloat>
	#include <cstdio>
	#include <iostream>

	__global__ void kernel(float* values, int num_values) {
	const int tid = threadIdx.x;
	const int G = blockDim.x;

	float r = 0;
	cmake_minimum_required(VERSION 3.18)

	project(cuda-clang LANGUAGES CXX CUDA)

	find_package(CMakeshift REQUIRED)
	#find_package(gsl-lite REQUIRED)
	find_package(cuda-api-wrappers REQUIRED)

	add_executable(cuda-clang "cuda-clang.cu")
	// compile with `nvcc -std=c++17 -rdc=true --extended-lambda dynamic-parallelism-with-thrust-and-multiple-streams.cu -o dpe`
	#include <iostream>
	#include <thrust/device_vector.h>
	#include <thrust/execution_policy.h>
	#include <thrust/for_each.h>
	#include <thrust/iterator/counting_iterator.h>

	struct do_something {
	__device__ do_something(float* ptr) : ptr_(ptr) {}