Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active September 24, 2016 12:20
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/6160110 to your computer and use it in GitHub Desktop.
Save allanmac/6160110 to your computer and use it in GitHub Desktop.
Inclusive and exclusive warp-level scan snippets. Evaluating SHFL vs. shared implementations. Also evaluating the simplest transformation of an inclusive scan into an exclusive scan. It's only two ops on sm_3x.
#include <stdio.h>
//
//
//
#define WARP_SIZE 32
#define VOLATILE volatile
#define KERNEL_QUALIFIERS extern "C" __global__
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
//
//
//
VOLATILE __shared__ struct
{
#if __CUDA_ARCH__ < 300
unsigned int scratch[WARP_SIZE];
#endif
} shared;
//
//
//
DEVICE_INTRINSIC_QUALIFIERS
unsigned int laneId()
{
unsigned int id;
asm("mov.u32 %0, %%laneid;" : "=r"(id));
return id;
}
DEVICE_INTRINSIC_QUALIFIERS
unsigned int laneMaskEQ()
{
#if __CUDA_ARCH__ >= 200
unsigned int id;
asm("mov.u32 %0, %%lanemask_eq;" : "=r"(id));
return id;
#else
return 1u << laneId();
#endif
}
//
//
//
/**
* Convert a warp-level inclusive scan to an exclusive scan by
* shifting the lanes to the right and assigning an 'identity' value
* to lane 0.
*
* @param v
*
* @return scan result for lane
*/
#if (__CUDA_ARCH__ >= 300)
DEVICE_FUNCTION_QUALIFIERS
unsigned int
toExclusiveScan(unsigned int v, const unsigned int i)
{
asm("{ \n\t"
" .reg .pred p; \n\t"
" shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t"
" @!p mov.u32 %0, %1; \n\t"
"}"
: "+r"(v) : "r"(i));
return v;
}
#endif
//
//
//
/**
* Warp-level "inclusive plus scan".
*
* PTX from PTX ISA PDF
*
* @param v
*
* @return scan result for lane
*/
DEVICE_FUNCTION_QUALIFIERS
unsigned int
plusScan(unsigned int v, const bool inclusive)
{
#if (__CUDA_ARCH__ >= 300)
asm("{ \n\t"
" .reg .u32 t; \n\t"
" .reg .pred p; \n\t"
" shfl.up.b32 t|p, %0, 0x1, 0x0; \n\t"
" @p add.u32 %0, t, %0; \n\t"
" shfl.up.b32 t|p, %0, 0x2, 0x0; \n\t"
" @p add.u32 %0, t, %0; \n\t"
" shfl.up.b32 t|p, %0, 0x4, 0x0; \n\t"
" @p add.u32 %0, t, %0; \n\t"
" shfl.up.b32 t|p, %0, 0x8, 0x0; \n\t"
" @p add.u32 %0, t, %0; \n\t"
" shfl.up.b32 t|p, %0, 0x10, 0x0; \n\t"
" @p add.u32 %0, t, %0; \n\t"
"}"
: "+r"(v));
if (inclusive)
return v;
else
return toExclusiveScan(v,0u);
#else
/*
//
// uncomment if you want to mask redundant shared stores
//
#define STORE_IF_LT_WARP_MINUS(l) \
if (lid < WARP_SIZE-l) \
scratch[0] = v
*/
#define STORE_IF_LT_WARP_MINUS(l) \
scratch[0] = v
const unsigned int lid = laneId();
volatile unsigned int* scratch = shared.scratch + lid;
if (inclusive)
{
scratch[0] = v;
}
else
{
if (lid == (WARP_SIZE-1))
scratch[-31] = 0u;
else
scratch[1] = v;
}
v = scratch[0];
if (lid >= 1)
{
v = v + scratch[-1];
STORE_IF_LT_WARP_MINUS(2);
if (lid >= 2)
{
v = v + scratch[-2];
STORE_IF_LT_WARP_MINUS(4);
if (lid >= 4)
{
v = v + scratch[-4];
STORE_IF_LT_WARP_MINUS(8);
if (lid >= 8)
{
v = v + scratch[-8];
STORE_IF_LT_WARP_MINUS(16);
if (lid >= 16)
v = v + scratch[-16];
}
}
}
}
return v;
#endif
}
//
//
//
KERNEL_QUALIFIERS
void inclusivePlusScanKernel(const unsigned int* const vin,
unsigned int* const vout)
{
unsigned int v = vin[threadIdx.x];
v = plusScan(v,true);
vout[threadIdx.x] = v;
}
//
//
//
KERNEL_QUALIFIERS
void exclusivePlusScanKernel(const unsigned int* const vin,
unsigned int* const vout)
{
unsigned int v = vin[threadIdx.x];
v = plusScan(v,false);
vout[threadIdx.x] = v;
}
//
//
//
void printScan(const char* const msg,
const unsigned int* const warp)
{
printf("%6s:",msg);
for (int ii=0; ii<WARP_SIZE; ii++)
printf("%2d ",warp[ii]);
printf("\n");
}
//
//
//
int main(int argc, char** argv)
{
// scan [device] [0=exclusive] -- otherwise defaults to inclusive
const int device = (argc >= 2) ? atoi(argv[1]) : 0;
const bool inclusive = (argc == 3) ? atoi(argv[2]) != 0 : true;
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
printf("%s scan ...\n",inclusive ? "inclusive" : "exclusive");
cudaSetDevice(device);
//
// LAUNCH KERNEL
//
unsigned int* vin;
unsigned int* vout;
cudaMalloc(&vin, sizeof(unsigned int) * WARP_SIZE);
cudaMalloc(&vout,sizeof(unsigned int) * WARP_SIZE);
//
//
//
unsigned int win[32] =
{
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
};
cudaMemcpy(vin,win,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyHostToDevice);
//
//
//
if (inclusive)
inclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
else
exclusivePlusScanKernel<<<1,WARP_SIZE>>>(vin,vout);
cudaDeviceSynchronize();
//
//
//
unsigned int wout[32];
cudaMemcpy(wout,vout,sizeof(unsigned int) * WARP_SIZE,cudaMemcpyDeviceToHost);
printScan("warp",win);
printScan("scan",wout);
//
//
//
cudaFree(vin);
cudaFree(vout);
cudaDeviceReset();
return 0;
}
@allanmac
Copy link
Author

allanmac commented Aug 5, 2013

Compiled with:

  nvcc -m 32 -Xptxas=-v,-abi=no             \
    -gencode=arch=compute_11,code=sm_11     \
    -gencode=arch=compute_12,code=sm_12     \
    -gencode=arch=compute_20,code=sm_21     \
    -gencode=arch=compute_30,code=sm_30     \
    -gencode=arch=compute_35,code=sm_35     \
    scan.cu -o scan

@allanmac
Copy link
Author

allanmac commented Aug 5, 2013

The SASS for a warp-level inclusive plus scan with SHFL is:

    code for sm_35
        Function : inclusivePlusScanKernel
    .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                   /* 0x08eca0ecdc10a0a0 */
        /*0008*/                S2R R0, SR_TID.X;                  /* 0x86400000109c0002 */
        /*0010*/                ISCADD R1, R0, c[0x0][0x140], 0x2; /* 0x60c00800281c0006 */
        /*0018*/                LD R1, [R1];                       /* 0xc4000000001c0404 */
        /*0020*/                ISCADD R0, R0, c[0x0][0x144], 0x2; /* 0x60c00800289c0002 */
        /*0028*/                SHFL.UP P0, R2, R1, 0x1, 0x0;      /* 0x78800003809c040a */
        /*0030*/            @P0 IADD R1, R2, R1;                   /* 0xe080000000800806 */
        /*0038*/                SHFL.UP P0, R2, R1, 0x2, 0x0;      /* 0x78800003811c040a */
                                                                   /* 0x08eca0a0dca0eca0 */
        /*0048*/            @P0 IADD R1, R2, R1;                   /* 0xe080000000800806 */
        /*0050*/                SHFL.UP P0, R2, R1, 0x4, 0x0;      /* 0x78800003821c040a */
        /*0058*/            @P0 IADD R1, R2, R1;                   /* 0xe080000000800806 */
        /*0060*/                SHFL.UP P0, R2, R1, 0x8, 0x0;      /* 0x78800003841c040a */
        /*0068*/                IADD R2, R2, R1;                   /* 0xe0800000009c080a */
        /*0070*/                SEL R1, R2, R1, P0;                /* 0xe5000000009c0806 */
        /*0078*/                SHFL.UP P0, R2, R1, 0x10, 0x0;     /* 0x78800003881c040a */
                                                                   /* 0x0800000000b810a0 */
        /*0088*/            @P0 IADD R1, R2, R1;                   /* 0xe080000000800806 */
        /*0090*/                ST [R0], R1;                       /* 0xe4000000001c0004 */
        /*0098*/                EXIT ;                             /* 0x18000000001c003c */
        /*00a0*/                BRA 0xa0;                          /* 0x12007ffffc1c003c */

@allanmac
Copy link
Author

allanmac commented Aug 5, 2013

The SASS for a warp-level inclusive plus scan using shared is:

    code for sm_21
        Function : inclusivePlusScanKernel
    .headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
        /*0000*/        S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
        /*0008*/        SHL R2, R0, 0x2;                       /* 0x6000c00008009c03 */
        /*0010*/        S2R R3, SR_LANEID;                     /* 0x2c0000000000dc04 */
        /*0018*/        IADD R0, R2, c[0x0][0x20];             /* 0x4800400080201c03 */
        /*0020*/        SHL R4, R3, 0x2;                       /* 0x6000c00008311c03 */
        /*0028*/        ISETP.EQ.AND P0, PT, R3, RZ, PT;       /* 0x190e0000fc31dc23 */
        /*0030*/        LD R1, [R0];                           /* 0x8000000000005c85 */
        /*0038*/        SSY 0x108;                             /* 0x6000000320000007 */
        /*0040*/        STS [R4], R1;                          /* 0xc900000000405c85 */
        /*0048*/        LDS R0, [R4];                          /* 0xc100000000401c85 */
        /*0050*/    @P0 NOP.S;                                 /* 0x40000000000001f4 */
        /*0058*/        ISETP.LT.U32.AND P0, PT, R3, 0x2, PT;  /* 0x188ec0000831dc03 */
        /*0060*/        LDS R1, [R4+-0x4];                     /* 0xc103fffff0405c85 */
        /*0068*/        IADD R0, R1, R0;                       /* 0x4800000000101c03 */
        /*0070*/        STS [R4], R0;                          /* 0xc900000000401c85 */
        /*0078*/    @P0 NOP.S;                                 /* 0x40000000000001f4 */
        /*0080*/        ISETP.LT.U32.AND P0, PT, R3, 0x4, PT;  /* 0x188ec0001031dc03 */
        /*0088*/        LDS R1, [R4+-0x8];                     /* 0xc103ffffe0405c85 */
        /*0090*/        IADD R0, R1, R0;                       /* 0x4800000000101c03 */
        /*0098*/        STS [R4], R0;                          /* 0xc900000000401c85 */
        /*00a0*/    @P0 NOP.S;                                 /* 0x40000000000001f4 */
        /*00a8*/        ISETP.LT.U32.AND P0, PT, R3, 0x8, PT;  /* 0x188ec0002031dc03 */
        /*00b0*/        LDS R1, [R4+-0x10];                    /* 0xc103ffffc0405c85 */
        /*00b8*/        IADD R0, R1, R0;                       /* 0x4800000000101c03 */
        /*00c0*/        STS [R4], R0;                          /* 0xc900000000401c85 */
        /*00c8*/    @P0 NOP.S;                                 /* 0x40000000000001f4 */
        /*00d0*/        ISETP.LT.U32.AND P0, PT, R3, 0x10, PT; /* 0x188ec0004031dc03 */
        /*00d8*/        LDS R1, [R4+-0x20];                    /* 0xc103ffff80405c85 */
        /*00e0*/        IADD R0, R1, R0;                       /* 0x4800000000101c03 */
        /*00e8*/        STS [R4], R0;                          /* 0xc900000000401c85 */
        /*00f0*/    @P0 NOP.S;                                 /* 0x40000000000001f4 */
        /*00f8*/        LDS R1, [R4+-0x40];                    /* 0xc103ffff00405c85 */
        /*0100*/        IADD.S R0, R1, R0;                     /* 0x4800000000101c13 */
        /*0108*/        IADD R1, R2, c[0x0][0x24];             /* 0x4800400090205c03 */
        /*0110*/        ST [R1], R0;                           /* 0x9000000000101c85 */
        /*0118*/        EXIT ;                                 /* 0x8000000000001de7 */

@allanmac
Copy link
Author

allanmac commented Aug 5, 2013

inclusive:

> scan 0 1
Tesla K20c (13)
inclusive scan ...
  warp: 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
  scan: 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 

exclusive:

> scan 0 0
Tesla K20c (13)
exclusive scan ...
  warp: 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
  scan: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment