codecircuit/test-compute-sanitizer-racecheck-with-sync-intrinsic.cu

## test-compute-sanitizer-racecheck-with-sync-intrinsic.cu
#include <cstdio>

// compile with: nvcc -g -G -std=c++17 test-compute-sanitizer-racecheck-with-sync-intrinsic.cu -arch sm_75 -o test-compute-sanitizer-racecheck-with-sync-intrinsic
// execute with: compute-sanitizer --tool racecheck ./test-compute-sanitizer-racecheck-with-sync-intrinsic
//
// The example at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-examples-broadcast
// mentions that the threads are "synchronized" in the comment behind the function call to `shfl_sync`.


__managed__ float result[1];

__global__ void kernel() {

	constexpr int N = 500;
	__shared__ int sh_mem[N];
	const int tid = threadIdx.x;
	int k = tid;
	for (int i = 0; i < blockDim.x / 16; ++i) {
		// __syncwarp(); // no hazards with this sync statement
		k = __shfl_up_sync(0xffffffff, k, 1);
		if (tid > i) {
			printf("thread %d, i = %d, k = %d\n", tid, i, k);
			sh_mem[k] += k;
		}
	}
	//
	// Use the result to prevent that it is optimized out.
	//
	__syncthreads();
	if (tid == 0) {
		int acc = 0;
		for (int i = 0; i < N; ++i) {
			acc += sh_mem[i];
		}
		*result = acc;
	}
}

int main() {
	kernel<<<1, 32>>>();
	cudaDeviceSynchronize();
}
	#include <cstdio>

	// compile with: nvcc -g -G -std=c++17 test-compute-sanitizer-racecheck-with-sync-intrinsic.cu -arch sm_75 -o test-compute-sanitizer-racecheck-with-sync-intrinsic
	// execute with: compute-sanitizer --tool racecheck ./test-compute-sanitizer-racecheck-with-sync-intrinsic
	//
	// The example at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-examples-broadcast
	// mentions that the threads are "synchronized" in the comment behind the function call to `shfl_sync`.


	__managed__ float result[1];

	__global__ void kernel() {

	constexpr int N = 500;
	__shared__ int sh_mem[N];
	const int tid = threadIdx.x;
	int k = tid;
	for (int i = 0; i < blockDim.x / 16; ++i) {
	// __syncwarp(); // no hazards with this sync statement
	k = __shfl_up_sync(0xffffffff, k, 1);
	if (tid > i) {
	printf("thread %d, i = %d, k = %d\n", tid, i, k);
	sh_mem[k] += k;
	}
	}
	//
	// Use the result to prevent that it is optimized out.
	//
	__syncthreads();
	if (tid == 0) {
	int acc = 0;
	for (int i = 0; i < N; ++i) {
	acc += sh_mem[i];
	}
	*result = acc;
	}
	}

	int main() {
	kernel<<<1, 32>>>();
	cudaDeviceSynchronize();
	}