Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active December 25, 2015 00:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/6887719 to your computer and use it in GitHub Desktop.
Save allanmac/6887719 to your computer and use it in GitHub Desktop.
The macro at the bottom of "warp_scan.inl" is used to declare an optimal CUDA warp scan primitive without using C++ templates and specialization. The macro supports 32-bit PTX types (u32/s32/f32) and can generate inclusive and exclusive scans over any appropriate PTX two-argument operator (add,sub,min,max,mul,div,rem,etc). See examples below.
#pragma once
//
//
//
#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)
////////////////////////////////////////////////////////////////////////
//
// FOR KEPLER+
//
#if (__CUDA_ARCH__ >= 300)
#undef PXL_WARP_SCAN_SHFL
#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP) \
\
DEVICE_FUNCTION_QUALIFIERS \
_vT \
_excP ## _warp_scan_shuffled_ ## _op (_vT v) \
{ \
asm("{ \n\t" \
" .reg " _opT " t; \n\t" \
" .reg .pred p; \n\t"); \
\
for (unsigned int d = 1; d < WARP_SIZE; d *= 2) \
asm(" shfl.up.b32 t|p, %0, %1, 0x0; \n\t" \
" @p " #_op _opT " %0, t, %0; \n\t" \
: "+" _regC (v) : "r"(d)); \
\
if (_exc) \
asm(" shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t" \
" @!p mov" _opT " %0, %1; \n\t" \
: "+" _regC (v) : _regC (_exc0)); \
\
asm("}"); \
\
return v; \
}
#endif
////////////////////////////////////////////////////////////////////////
//
// FOR ALL ARCHITECTURES
//
#ifdef SQUELCH_REDUNDANT_SHARED_STORES //////////
#define STORE_IF_LT_WARP_MINUS(l) \
if (lid < WARP_SIZE-l) \
v32[0] = v
#else ///////////////////////////////////////////
#define STORE_IF_LT_WARP_MINUS(l) \
v32[0] = v
#endif //////////////////////////////////////////
#define PXL_WARP_SCAN_SHARED(_op,_vT,_exc,_exc0,_excP,_incLd) \
\
DEVICE_FUNCTION_QUALIFIERS \
_vT \
_excP ## _warp_scan_shared_ ## _op \
(_vT v, volatile _vT* const shared32) \
{ \
const unsigned int lid = laneId(); \
volatile _vT* const v32 = shared32 + lid; \
\
if (_exc) \
{ \
if (lid == (WARP_SIZE-1)) \
v32[-31] = _exc0; \
else \
v32[ 1] = v; \
\
v = v32[0]; \
} \
else if (_incLd) \
{ \
v = v32[0]; \
} \
else \
{ \
v32[0] = v; \
} \
\
if (lid >= 1) \
{ \
v = v + v32[-1]; \
\
STORE_IF_LT_WARP_MINUS(2); \
\
if (lid >= 2) \
{ \
v = v + v32[-2]; \
\
STORE_IF_LT_WARP_MINUS(4); \
\
if (lid >= 4) \
{ \
v = v + v32[-4]; \
\
STORE_IF_LT_WARP_MINUS(8); \
\
if (lid >= 8) \
{ \
v = v + v32[-8]; \
\
STORE_IF_LT_WARP_MINUS(16); \
\
if (lid >= 16) \
v = v + v32[-16]; \
} \
} \
} \
} \
\
return v; \
}
//
// GENERATE BOTH SHFL AND SHARED WARP SCAN PRIMITIVES
//
// PXL_WARP_SCAN_SHFL is a noop when not compiling for sm_30+.
//
#define PXL_WARP_SCAN_DECL(_op,_vT,_opT,_regC,_exc,_exc0,_excP,_incLd) \
PXL_WARP_SCAN_SHARED(_op,_vT, _exc,_exc0,_excP,_incLd); \
PXL_WARP_SCAN_SHFL (_op,_vT,_opT,_regC,_exc,_exc0,_excP)
//
// DECLARE WARP SCAN PRIMITIVES IN YOUR .cu SOURCE FILE
//
/*
Examples:
// inclusive "add.u32" scan:
//
// unsigned int
// inc_warp_scan_shuffled_add(unsigned int)
//
// unsigned int
// inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0, inc, true);
// exclusive "add.u32" scan:
//
// unsigned int
// exc_warp_scan_shuffled_add(unsigned int)
//
// unsigned int
// exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true, 0, exc, false);
// exclusive "max.f32" scan:
//
// float
// exc_warp_scan_shuffled_max(float)
//
// float
// exc_warp_scan_shared_max(float, volatile float* const)
//
PXL_WARP_SCAN_DECL(max, float, ".f32", "f", true, 0.0f, exc, false);
*/
@allanmac
Copy link
Author

allanmac commented Oct 8, 2013

Warp scans functions are declared like this:

// inclusive "add.u32" scan:
//
//  unsigned int
//  inc_warp_scan_shuffled_add(unsigned int)
//
//  unsigned int
//  inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//  
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0,    inc, true);

// exclusive "add.u32" scan:
//
//  unsigned int
//  exc_warp_scan_shuffled_add(unsigned int)
// 
//  unsigned int
//  exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//  
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true,  0,    exc, false);

// exclusive "max.f32" scan:
//
//  float
//  exc_warp_scan_shuffled_max(float)
//
//  float
//  exc_warp_scan_shared_max(float, volatile float* const)
//  
PXL_WARP_SCAN_DECL(max, float,        ".f32", "f", true,  0.0f, exc, false);

@allanmac
Copy link
Author

allanmac commented Oct 8, 2013

A simple test for warp_scan.inl:

#include <stdio.h>

//
//
//

#define WARP_SIZE                     32
#define VOLATILE                      volatile

#define KERNEL_QUALIFIERS             extern "C" __global__
#define DEVICE_FUNCTION_QUALIFIERS    __device__
#define DEVICE_INTRINSIC_QUALIFIERS   __device__ __forceinline__

//
//
//

__shared__ struct
{

#if __CUDA_ARCH__ < 300
  unsigned int scratch[WARP_SIZE];
#endif

} VOLATILE shared;

//
//
//

DEVICE_INTRINSIC_QUALIFIERS
unsigned int laneId()
{
  unsigned int id;

  asm("mov.u32 %0, %%laneid;" : "=r"(id));

  return id;
}

//
//
//

#include "warp_scan.inl"

//
//
//

// inclusive "add.u32" scan:
//
// - inc_warp_scan_shuffled_add(unsigned int)
// - inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//  
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0,    inc, false);

// exclusive "add.u32" scan:
//
// - exc_warp_scan_shuffled_add(unsigned int)
// - exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//  
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true,  0,    exc, false);

//
//
//

KERNEL_QUALIFIERS
void inclusivePlusScanKernel(const unsigned int* const vin,
                             unsigned int*       const vout)
{
  unsigned int v = vin[threadIdx.x];

#if __CUDA_ARCH__ >= 300
  v = inc_warp_scan_shuffled_add(v);
#else
  v = inc_warp_scan_shared_add(v,shared.scratch);
#endif

  vout[threadIdx.x] = v;
}

//
//
//

KERNEL_QUALIFIERS
void exclusivePlusScanKernel(const unsigned int* const vin,
                             unsigned int*       const vout)
{
  unsigned int v = vin[threadIdx.x];

#if __CUDA_ARCH__ >= 300
  v = exc_warp_scan_shuffled_add(v);
#else
  v = exc_warp_scan_shared_add(v,shared.scratch);
#endif

  vout[threadIdx.x] = v;
}

//
//
//

void printScan(const char*         const msg,
               const unsigned int* const warp)
{
  printf("%6s:",msg);

  for (int ii=0; ii<WARP_SIZE; ii++)
    printf("%2d ",warp[ii]);

  printf("\n");
}

//
//
//

int main(int argc, char** argv)
{
  // scan [device] [0=exclusive] -- otherwise defaults to inclusive

  const int  device    = (argc >= 2) ? atoi(argv[1])      : 0;
  const bool inclusive = (argc == 3) ? atoi(argv[2]) != 0 : true;

  cudaError err = cudaSetDevice(device);

  if (err != cudaSuccess)
    {
      printf("device (%d) error: (%d) %s\n",device,err,cudaGetErrorString(err));
      exit(err);
    }

  cudaDeviceProp props;
  cudaGetDeviceProperties(&props,device);

  printf("%s (%2d)\n",props.name,props.multiProcessorCount);
  printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive");

  //
  // LAUNCH KERNEL
  //

  unsigned int* vin; 
  unsigned int* vout; 

  cudaMalloc(&vin, sizeof(unsigned int) * WARP_SIZE);
  cudaMalloc(&vout,sizeof(unsigned int) * WARP_SIZE);

  //
  //
  //

  unsigned int win[WARP_SIZE];

  // {
  //   1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  //   1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
  // };

  srand(0);

  for (int ii=0; ii<WARP_SIZE; ii++)
    win[ii] = rand() % 4;

  cudaMemcpy(vin,win,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyHostToDevice);

  //
  //
  //

  if (inclusive)
    inclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
  else
    exclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);

  cudaDeviceSynchronize();

  //
  //
  //

  unsigned int wout[32];

  cudaMemcpy(wout,vout,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyDeviceToHost);

  printScan("warp",win);
  printScan("scan",wout);

  //
  //
  //

  cudaFree(vin);
  cudaFree(vout);

  cudaDeviceReset();

  return 0;
}

@allanmac
Copy link
Author

allanmac commented Oct 8, 2013

Compile example with:

  nvcc -m 32 -Xptxas=-v,-abi=no             \
    -gencode=arch=compute_11,code=sm_11     \
    -gencode=arch=compute_12,code=sm_12     \
    -gencode=arch=compute_20,code=sm_21     \
    -gencode=arch=compute_30,code=sm_30     \
    -gencode=arch=compute_35,code=sm_35     \
    warp_scan.cu -o warp_scan

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment