Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active December 14, 2015 01:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/5006316 to your computer and use it in GitHub Desktop.
Save allanmac/5006316 to your computer and use it in GitHub Desktop.
Example kernel used to observe Global Load/Store Efficiency metrics in the Visual Profiler.
#include <stdio.h>
//
//
//
#define TYPE unsigned int
#define REPS 1
#define NUM_BLOCKS_PER_SM 1
#define NUM_THREADS_PER_BLOCK 256
#define RESTRICT __restrict
#define WARP_SIZE 32
//
//
//
#define LD(i) \
TYPE r##i = vin[idx+(i*WARP_SIZE)]
#define ST(i) \
vout[idx+(i*WARP_SIZE)] = r##i
#define REP_1() \
OP(0)
#define REP_2() \
REP_1(); \
OP(1)
#define REP_4() \
REP_2(); \
OP(2); \
OP(3)
#define REP_8() \
REP_4(); \
OP(4); \
OP(5); \
OP(6); \
OP(7)
//
//
//
#if REPS == 1
#define REP_N() REP_1()
#elif REPS == 2
#define REP_N() REP_2()
#elif REPS == 4
#define REP_N() REP_4()
#elif REPS == 8
#define REP_N() REP_8()
#else
#error INVALID REP
#endif
//
//
//
__global__
void
__launch_bounds__(NUM_THREADS_PER_BLOCK,NUM_BLOCKS_PER_SM)
geff(const TYPE* const RESTRICT vin, TYPE* const RESTRICT vout)
{
const unsigned int bidx = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
const unsigned int idx = bidx * REPS * WARP_SIZE + (threadIdx.x & (WARP_SIZE-1));
#undef OP
#define OP(i) LD(i)
REP_N();
#undef OP
#define OP(i) ST(i)
REP_N();
}
//
//
//
int main(int argc, char** argv)
{
cudaError_t err;
int device = (argc == 1) ? 0 : atoi(argv[1]);
cudaDeviceProp props;
err = cudaGetDeviceProperties(&props,device);
if (err)
return -1;
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
cudaSetDevice(device);
//
// LAUNCH KERNEL
//
const unsigned int numThreads = NUM_THREADS_PER_BLOCK;
const unsigned int numBlocks = NUM_BLOCKS_PER_SM * props.multiProcessorCount;
const unsigned int vsize = numBlocks * numThreads * REPS * sizeof(TYPE);
TYPE* vin;
TYPE* vout;
#define LD_OFFSET 0 // sizeof(TYPE) // uncomment for uncoalesced loads
#define ST_OFFSET 0
cudaMalloc(&vin, vsize+LD_OFFSET);
cudaMalloc(&vout,vsize+ST_OFFSET);
//
//
//
geff<<<numBlocks,numThreads>>>(vin +(LD_OFFSET/sizeof(TYPE)),
vout+(ST_OFFSET/sizeof(TYPE)));
err = cudaDeviceSynchronize();
if (err)
printf("Err = %d\n",err);
//
//
//
cudaFree(vin);
cudaFree(vout);
cudaDeviceReset();
return 0;
}
@allanmac
Copy link
Author

Compile with nvcc -m 32 -arch sm_XX -Xptxas=-v geff.cu -o geff.exe

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment