Skip to content

Instantly share code, notes, and snippets.

@dfukunaga
Last active September 3, 2022 19:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dfukunaga/2be9e00f670a8b5db7dcfadc83248e17 to your computer and use it in GitHub Desktop.
Save dfukunaga/2be9e00f670a8b5db7dcfadc83248e17 to your computer and use it in GitHub Desktop.
CUDA Warp's Sum and Scan
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
/**
 * Warp-wide sum using a butterfly (XOR) shuffle exchange.
 *
 * Each thread loads data[idx], where idx is its flat 1-D global index, and
 * then repeatedly adds the value held by the lane whose id differs in one
 * bit (XOR offsets 1, 2, 4, ... < warpSize). After the loop every lane of
 * the warp holds the sum of all values loaded by that warp. The result is
 * printed from the device; nothing is written back to `data`.
 *
 * Preconditions (not checked here):
 *  - 1-D launch; `data` must hold at least gridDim.x * blockDim.x ints,
 *    because there is no bounds guard on the load.
 *  - blockDim.x should be a multiple of warpSize: the shuffle uses a full
 *    warp mask, and a value read from a non-participating lane is undefined.
 *
 * @param data device pointer to the input values (read only by this kernel).
 */
__global__
void warpSum(int *data) {
    // All 32 lanes take part in every exchange; the *_sync shuffles need the
    // participant mask spelled out (the mask-less __shfl_xor is deprecated
    // and not available when targeting compute capability 7.0+).
    const unsigned fullMask = 0xffffffffu;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int value = data[idx];

#pragma unroll
    for (int i = 1; i < warpSize; i *= 2) {
        // Pair with lane (laneId ^ i); both partners add each other's value,
        // so the partial sums stay identical within each growing group.
        value += __shfl_xor_sync(fullMask, value, i);
    }

    printf("idx=%d\tvalue=%d\n", idx, value);
}
/**
 * Warp-wide inclusive prefix sum (scan) using shuffle-up exchanges.
 *
 * Each thread loads data[idx], where idx is its flat 1-D global index. In
 * step i (i = 1, 2, 4, ... < warpSize) a lane fetches the running value of
 * the lane i positions below it and adds it when such a lane exists
 * (laneId >= i). After the loop, lane k holds the sum of the values loaded
 * by lanes 0..k of its warp. The result is printed from the device; nothing
 * is written back to `data`.
 *
 * Preconditions (not checked here):
 *  - 1-D launch; `data` must hold at least gridDim.x * blockDim.x ints,
 *    because there is no bounds guard on the load.
 *  - blockDim.x should be a multiple of warpSize: the shuffle uses a full
 *    warp mask, and a value read from a non-participating lane is undefined.
 *
 * @param data device pointer to the input values (read only by this kernel).
 */
__global__
void warpScan(int *data) {
    // All 32 lanes take part in every exchange; the *_sync shuffles need the
    // participant mask spelled out (the mask-less __shfl_up is deprecated
    // and not available when targeting compute capability 7.0+).
    const unsigned fullMask = 0xffffffffu;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int value = data[idx];
    int laneId = threadIdx.x % warpSize;

#pragma unroll
    for (int i = 1; i < warpSize; i *= 2) {
        // The shuffle must be executed by every lane named in the mask, so it
        // stays outside the conditional; only the accumulation is predicated.
        int n = __shfl_up_sync(fullMask, value, i);
        // Lanes below i have no source lane i positions down; they keep
        // their current value unchanged.
        if (laneId >= i) value += n;
    }

    printf("idx=%d\tvalue=%d\n", idx, value);
}
/**
 * Demo driver: fills two 32-element device vectors with 0, 1, ..., 31 and
 * runs warpSum and warpScan on them with a single block of one warp each.
 * The kernels print their per-thread results from the device.
 *
 * @return 0 on success, 1 if a kernel launch or execution error is reported.
 */
int main() {
    // One full warp. Named differently from the device built-in `warpSize`
    // so host and device meanings are not confused.
    const int kThreadsPerWarp = 32;
    cudaError_t err;

    // warp sum
    thrust::device_vector<int> A(kThreadsPerWarp);
    thrust::sequence(A.begin(), A.end());
    warpSum<<<1, kThreadsPerWarp>>>(thrust::raw_pointer_cast(A.data()));
    // A launch does not return a status; configuration errors surface here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "warpSum launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // warp scan
    thrust::device_vector<int> B(kThreadsPerWarp);
    thrust::sequence(B.begin(), B.end());
    warpScan<<<1, kThreadsPerWarp>>>(thrust::raw_pointer_cast(B.data()));
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "warpScan launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for both kernels to finish; in-kernel faults are reported here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment