Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
CUDA Warp's Sum and Scan
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
// Warp-wide all-reduce (sum) using a butterfly (XOR) shuffle pattern.
//
// Each thread loads data[idx], then in log2(warpSize) steps exchanges its
// running total with the lane whose id differs in one bit. After the last
// step every lane of the warp holds the sum of all values in that warp, and
// each thread prints its global index together with that sum.
//
// Preconditions (not checked here):
//   - data must hold at least gridDim.x * blockDim.x elements; the load is
//     not bounds-guarded.
//   - blockDim.x should be a multiple of warpSize; the shuffles use a full
//     warp mask, and values read from non-existent lanes are undefined.
//
// Uses __shfl_xor_sync (CUDA 9+); the legacy mask-less __shfl_xor is not
// available on Volta and newer architectures.
__global__
void warpSum(int *data) {
    // Every lane of the warp participates in each exchange.
    const unsigned fullMask = 0xffffffffu;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int value = data[idx];

#pragma unroll
    for (int i = 1; i < warpSize; i *= 2) {
        // Partner lane is (lane ^ i); both partners end up with the same
        // partial sum, so the reduction result is replicated in all lanes.
        value += __shfl_xor_sync(fullMask, value, i);
    }

    printf("idx=%d\tvalue=%d\n", idx, value);
}
// Warp-wide inclusive prefix sum (scan) using shuffle-up.
//
// Each thread loads data[idx]; in step i (1, 2, 4, ...) a lane adds the
// running value held by the lane i positions below it. After the loop, lane k
// holds the sum of the values loaded by lanes 0..k of the same warp, and each
// thread prints its global index together with that prefix sum.
//
// Preconditions (not checked here):
//   - data must hold at least gridDim.x * blockDim.x elements; the load is
//     not bounds-guarded.
//   - blockDim.x should be a multiple of warpSize; the shuffles use a full
//     warp mask, and values read from non-existent lanes are undefined.
//
// Uses __shfl_up_sync (CUDA 9+); the legacy mask-less __shfl_up is not
// available on Volta and newer architectures.
__global__
void warpScan(int *data) {
    // Every lane of the warp participates in each exchange.
    const unsigned fullMask = 0xffffffffu;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int value = data[idx];
    // warpSize is a power of two, so masking is equivalent to the modulo.
    int laneId = threadIdx.x & (warpSize - 1);

#pragma unroll
    for (int i = 1; i < warpSize; i *= 2) {
        // The shuffle must be executed by all lanes, so it stays outside the
        // conditional; only the accumulation is guarded.
        int n = __shfl_up_sync(fullMask, value, i);
        // Lanes below i have no source lane i positions down and must not
        // accumulate the value the shuffle hands back to them.
        if (laneId >= i) value += n;
    }

    printf("idx=%d\tvalue=%d\n", idx, value);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what,
            cudaGetErrorString(status));
    return false;
}

// Demo driver: fills two device vectors with 0, 1, ..., 31 and runs the
// warp-sum and warp-scan kernels on them with a single block of one warp.
// The kernels print their per-thread results via device printf.
// Returns 0 on success and 1 if a kernel launch or execution fails.
int main() {
    // Element count and block size: exactly one warp of threads.
    const int warpSize = 32;

    // warp sum
    thrust::device_vector<int> A(warpSize);
    thrust::sequence(A.begin(), A.end());
    warpSum<<<1, warpSize>>>(thrust::raw_pointer_cast(A.data()));
    // Kernel launches do not return a status; launch-configuration errors
    // have to be fetched explicitly.
    if (!cudaOk(cudaGetLastError(), "warpSum launch")) return 1;

    // warp scan
    thrust::device_vector<int> B(warpSize);
    thrust::sequence(B.begin(), B.end());
    warpScan<<<1, warpSize>>>(thrust::raw_pointer_cast(B.data()));
    if (!cudaOk(cudaGetLastError(), "warpScan launch")) return 1;

    // Wait for both kernels to finish; this also flushes the device printf
    // buffer and surfaces any fault raised while the kernels were running.
    if (!cudaOk(cudaDeviceSynchronize(), "kernel execution")) return 1;

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.