Last active
December 25, 2015 00:28
-
-
Save allanmac/6887719 to your computer and use it in GitHub Desktop.
The macro at the bottom of "warp_scan.inl" is used to declare an optimal CUDA warp scan primitive without using C++ templates and specialization. The macro supports 32-bit PTX types (u32/s32/f32) and can generate inclusive and exclusive scans over any appropriate PTX two-argument operator (add,sub,min,max,mul,div,rem,etc). See examples below.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
// | |
// | |
// | |
#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP) | |
//////////////////////////////////////////////////////////////////////// | |
// | |
// FOR KEPLER+ | |
// | |
#if (__CUDA_ARCH__ >= 300) | |
#undef PXL_WARP_SCAN_SHFL | |
#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP) \ | |
\ | |
DEVICE_FUNCTION_QUALIFIERS \ | |
_vT \ | |
_excP ## _warp_scan_shuffled_ ## _op (_vT v) \ | |
{ \ | |
asm("{ \n\t" \ | |
" .reg " _opT " t; \n\t" \ | |
" .reg .pred p; \n\t"); \ | |
\ | |
for (unsigned int d = 1; d < WARP_SIZE; d *= 2) \ | |
asm(" shfl.up.b32 t|p, %0, %1, 0x0; \n\t" \ | |
" @p " #_op _opT " %0, t, %0; \n\t" \ | |
: "+" _regC (v) : "r"(d)); \ | |
\ | |
if (_exc) \ | |
asm(" shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t" \ | |
" @!p mov" _opT " %0, %1; \n\t" \ | |
: "+" _regC (v) : _regC (_exc0)); \ | |
\ | |
asm("}"); \ | |
\ | |
return v; \ | |
} | |
#endif | |
//////////////////////////////////////////////////////////////////////// | |
// | |
// FOR ALL ARCHITECTURES | |
// | |
#ifdef SQUELCH_REDUNDANT_SHARED_STORES ////////// | |
#define STORE_IF_LT_WARP_MINUS(l) \ | |
if (lid < WARP_SIZE-l) \ | |
v32[0] = v | |
#else /////////////////////////////////////////// | |
#define STORE_IF_LT_WARP_MINUS(l) \ | |
v32[0] = v | |
#endif ////////////////////////////////////////// | |
#define PXL_WARP_SCAN_SHARED(_op,_vT,_exc,_exc0,_excP,_incLd) \ | |
\ | |
DEVICE_FUNCTION_QUALIFIERS \ | |
_vT \ | |
_excP ## _warp_scan_shared_ ## _op \ | |
(_vT v, volatile _vT* const shared32) \ | |
{ \ | |
const unsigned int lid = laneId(); \ | |
volatile _vT* const v32 = shared32 + lid; \ | |
\ | |
if (_exc) \ | |
{ \ | |
if (lid == (WARP_SIZE-1)) \ | |
v32[-31] = _exc0; \ | |
else \ | |
v32[ 1] = v; \ | |
\ | |
v = v32[0]; \ | |
} \ | |
else if (_incLd) \ | |
{ \ | |
v = v32[0]; \ | |
} \ | |
else \ | |
{ \ | |
v32[0] = v; \ | |
} \ | |
\ | |
if (lid >= 1) \ | |
{ \ | |
v = v + v32[-1]; \ | |
\ | |
STORE_IF_LT_WARP_MINUS(2); \ | |
\ | |
if (lid >= 2) \ | |
{ \ | |
v = v + v32[-2]; \ | |
\ | |
STORE_IF_LT_WARP_MINUS(4); \ | |
\ | |
if (lid >= 4) \ | |
{ \ | |
v = v + v32[-4]; \ | |
\ | |
STORE_IF_LT_WARP_MINUS(8); \ | |
\ | |
if (lid >= 8) \ | |
{ \ | |
v = v + v32[-8]; \ | |
\ | |
STORE_IF_LT_WARP_MINUS(16); \ | |
\ | |
if (lid >= 16) \ | |
v = v + v32[-16]; \ | |
} \ | |
} \ | |
} \ | |
} \ | |
\ | |
return v; \ | |
} | |
// | |
// GENERATE BOTH SHFL AND SHARED WARP SCAN PRIMITIVES | |
// | |
// PXL_WARP_SCAN_SHFL is a noop when not compiling for sm_30+. | |
// | |
#define PXL_WARP_SCAN_DECL(_op,_vT,_opT,_regC,_exc,_exc0,_excP,_incLd) \ | |
PXL_WARP_SCAN_SHARED(_op,_vT, _exc,_exc0,_excP,_incLd); \ | |
PXL_WARP_SCAN_SHFL (_op,_vT,_opT,_regC,_exc,_exc0,_excP) | |
// | |
// DECLARE WARP SCAN PRIMITIVES IN YOUR .cu SOURCE FILE | |
// | |
/* | |
Examples: | |
// inclusive "add.u32" scan: | |
// | |
// unsigned int | |
// inc_warp_scan_shuffled_add(unsigned int) | |
// | |
// unsigned int | |
// inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const) | |
// | |
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0, inc, true); | |
// exclusive "add.u32" scan: | |
// | |
// unsigned int | |
// exc_warp_scan_shuffled_add(unsigned int) | |
// | |
// unsigned int | |
// exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const) | |
// | |
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true, 0, exc, false); | |
// exclusive "max.f32" scan: | |
// | |
// float | |
// exc_warp_scan_shuffled_max(float) | |
// | |
// float | |
// exc_warp_scan_shared_max(float, volatile float* const) | |
// | |
PXL_WARP_SCAN_DECL(max, float, ".f32", "f", true, 0.0f, exc, false); | |
*/ |
A simple test for warp_scan.inl:
#include <stdio.h>
//
//
//
#define WARP_SIZE 32
#define VOLATILE volatile
#define KERNEL_QUALIFIERS extern "C" __global__
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
//
//
//
__shared__ struct
{
#if __CUDA_ARCH__ < 300
unsigned int scratch[WARP_SIZE];
#endif
} VOLATILE shared;
//
//
//
DEVICE_INTRINSIC_QUALIFIERS
unsigned int laneId()
{
unsigned int id;
asm("mov.u32 %0, %%laneid;" : "=r"(id));
return id;
}
//
//
//
#include "warp_scan.inl"
//
//
//
// inclusive "add.u32" scan:
//
// - inc_warp_scan_shuffled_add(unsigned int)
// - inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0, inc, false);
// exclusive "add.u32" scan:
//
// - exc_warp_scan_shuffled_add(unsigned int)
// - exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
//
PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true, 0, exc, false);
//
//
//
KERNEL_QUALIFIERS
void inclusivePlusScanKernel(const unsigned int* const vin,
unsigned int* const vout)
{
unsigned int v = vin[threadIdx.x];
#if __CUDA_ARCH__ >= 300
v = inc_warp_scan_shuffled_add(v);
#else
v = inc_warp_scan_shared_add(v,shared.scratch);
#endif
vout[threadIdx.x] = v;
}
//
//
//
KERNEL_QUALIFIERS
void exclusivePlusScanKernel(const unsigned int* const vin,
unsigned int* const vout)
{
unsigned int v = vin[threadIdx.x];
#if __CUDA_ARCH__ >= 300
v = exc_warp_scan_shuffled_add(v);
#else
v = exc_warp_scan_shared_add(v,shared.scratch);
#endif
vout[threadIdx.x] = v;
}
//
//
//
void printScan(const char* const msg,
const unsigned int* const warp)
{
printf("%6s:",msg);
for (int ii=0; ii<WARP_SIZE; ii++)
printf("%2d ",warp[ii]);
printf("\n");
}
//
//
//
int main(int argc, char** argv)
{
// scan [device] [0=exclusive] -- otherwise defaults to inclusive
const int device = (argc >= 2) ? atoi(argv[1]) : 0;
const bool inclusive = (argc == 3) ? atoi(argv[2]) != 0 : true;
cudaError err = cudaSetDevice(device);
if (err != cudaSuccess)
{
printf("device (%d) error: (%d) %s\n",device,err,cudaGetErrorString(err));
exit(err);
}
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive");
//
// LAUNCH KERNEL
//
unsigned int* vin;
unsigned int* vout;
cudaMalloc(&vin, sizeof(unsigned int) * WARP_SIZE);
cudaMalloc(&vout,sizeof(unsigned int) * WARP_SIZE);
//
//
//
unsigned int win[WARP_SIZE];
// {
// 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
// 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
// };
srand(0);
for (int ii=0; ii<WARP_SIZE; ii++)
win[ii] = rand() % 4;
cudaMemcpy(vin,win,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyHostToDevice);
//
//
//
if (inclusive)
inclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
else
exclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
cudaDeviceSynchronize();
//
//
//
unsigned int wout[32];
cudaMemcpy(wout,vout,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyDeviceToHost);
printScan("warp",win);
printScan("scan",wout);
//
//
//
cudaFree(vin);
cudaFree(vout);
cudaDeviceReset();
return 0;
}
Compile example with:
nvcc -m 32 -Xptxas=-v,-abi=no \
-gencode=arch=compute_11,code=sm_11 \
-gencode=arch=compute_12,code=sm_12 \
-gencode=arch=compute_20,code=sm_21 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_35,code=sm_35 \
warp_scan.cu -o warp_scan
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Warp scans functions are declared like this: