Skip to content

Instantly share code, notes, and snippets.

@allanmac
Created August 6, 2016 18:48
Show Gist options
  • Save allanmac/bf28f61a88be14693e594653e2c280fd to your computer and use it in GitHub Desktop.
Save allanmac/bf28f61a88be14693e594653e2c280fd to your computer and use it in GitHub Desktop.
Inclusive vs. exclusive warp scan
#include <stdio.h>
#include <stdint.h>
#define WARP_SIZE 32
//
//
//
__device__
uint32_t
warp_scan_inc_up_add_shuffled(uint32_t v, uint32_t const width=WARP_SIZE)
{
asm("{ \n\t"
" .reg .u32 t; \n\t"
" .reg .pred p; \n\t");
#pragma unroll
for (uint32_t d = 1; d < width; d *= 2)
asm("shfl.up.b32 t|p, %0, %1, 0x0; \n\t"
"@p add.u32 %0, %0, t; \n\t" : "+r"(v) : "r"(d));
asm("}");
return v;
}
__device__
uint32_t
warp_scan_exc_up_add_shuffled(uint32_t v, uint32_t const width=WARP_SIZE)
{
#if 1
asm("{ \n\t"
" .reg .u32 t; \n\t"
" .reg .pred p; \n\t"
" \n\t"
" shfl.up.b32 t|p, %0, 1, 0x0; \n\t"
" selp.b32 %0, t, 0, p; \n\t" : "+r"(v));
#pragma unroll
for (uint32_t d = 1; d < width; d *= 2)
asm("shfl.up.b32 t|p, %0, %1, 0x0; \n\t"
"@p add.u32 %0, %0, t; \n\t" : "+r"(v) : "r"(d));
asm("}");
return v;
#else
return warp_scan_inc_up_add_shuffled(v,width) - v;
#endif
}
//
//
//
extern "C"
__global__
void
warp_scan_exc_up_add_shuffled_kernel(uint32_t const * const vin, uint32_t * const vout)
{
vout[threadIdx.x] = warp_scan_exc_up_add_shuffled(vin[threadIdx.x]);
}
extern "C"
__global__
void
warp_scan_inc_up_add_shuffled_kernel(uint32_t const * const vin, uint32_t * const vout)
{
vout[threadIdx.x] = warp_scan_inc_up_add_shuffled(vin[threadIdx.x]);
}
//
//
//
void
print_scan(char const * const msg, uint32_t const warp[WARP_SIZE])
{
printf("%6s: ",msg);
for (int ii=0; ii<WARP_SIZE; ii++)
printf("%2d ",warp[ii]);
printf("\n");
}
//
//
//
int
main(int argc, char** argv)
{
//
// warp_scan [device] [0=exclusive] -- otherwise defaults to inclusive
//
const int device = (argc >= 2) ? atoi(argv[1]) : 0;
const bool inclusive = (argc == 3) ? atoi(argv[2]) != 0 : true;
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive");
cudaSetDevice(device);
//
//
//
size_t const size = sizeof(uint32_t) * WARP_SIZE;
uint32_t *vin_d, *vout_d;
cudaMalloc(&vin_d, size);
cudaMalloc(&vout_d,size);
//
//
//
uint32_t const vin_h[32] = { 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 };
cudaMemcpy(vin_d,vin_h,size,cudaMemcpyHostToDevice);
//
// LAUNCH KERNEL
//
if (inclusive)
warp_scan_inc_up_add_shuffled_kernel<<<1,WARP_SIZE>>>(vin_d,vout_d);
else
warp_scan_exc_up_add_shuffled_kernel<<<1,WARP_SIZE>>>(vin_d,vout_d);
cudaDeviceSynchronize();
//
//
//
uint32_t vout_h[32];
cudaMemcpy(vout_h,vout_d,size,cudaMemcpyDeviceToHost);
print_scan("warp",vin_h);
print_scan("scan",vout_h);
//
//
//
cudaFree(vin_d);
cudaFree(vout_d);
cudaDeviceReset();
return 0;
}
//
//
//
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment