Last active
October 29, 2021 14:17
-
-
Save codecircuit/0d6b98a6b987ab59c2a94dbf1e6a6118 to your computer and use it in GitHub Desktop.
Does *__sync intrinsics ensure consistent view on shared memory within a warp?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
// compile with: nvcc -g -G -std=c++17 test-compute-sanitizer-racecheck-with-sync-intrinsic.cu -arch sm_75 -o test-compute-sanitizer-racecheck-with-sync-intrinsic | |
// execute with: compute-sanitizer --tool racecheck ./test-compute-sanitizer-racecheck-with-sync-intrinsic | |
// | |
// The example at https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-examples-broadcast | |
// mentions that the threads are "synchronized" in the comment behind the function call to `shfl_sync`. | |
__managed__ float result[1]; | |
__global__ void kernel() { | |
constexpr int N = 500; | |
__shared__ int sh_mem[N]; | |
const int tid = threadIdx.x; | |
int k = tid; | |
for (int i = 0; i < blockDim.x / 16; ++i) { | |
// __syncwarp(); // no hazards with this sync statement | |
k = __shfl_up_sync(0xffffffff, k, 1); | |
if (tid > i) { | |
printf("thread %d, i = %d, k = %d\n", tid, i, k); | |
sh_mem[k] += k; | |
} | |
} | |
// | |
// Use the result to prevent that it is optimized out. | |
// | |
__syncthreads(); | |
if (tid == 0) { | |
int acc = 0; | |
for (int i = 0; i < N; ++i) { | |
acc += sh_mem[i]; | |
} | |
*result = acc; | |
} | |
} | |
int main() { | |
kernel<<<1, 32>>>(); | |
cudaDeviceSynchronize(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment