Last active
December 14, 2015 01:29
-
-
Save allanmac/5006316 to your computer and use it in GitHub Desktop.
Example kernel used to observe Global Load/Store Efficiency metrics in the Visual Profiler.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h> // atoi()

//
// Kernel tuning parameters.
//
#define TYPE                  unsigned int
#define REPS                  1      // loads+stores per thread: must be 1, 2, 4 or 8
#define NUM_BLOCKS_PER_SM     1
#define NUM_THREADS_PER_BLOCK 256
#define RESTRICT              __restrict
#define WARP_SIZE             32

//
// LD(i)/ST(i): one load into / store from register r<i> per repetition.
// Each warp touches a contiguous, warp-aligned 32-element span, so the
// accesses coalesce unless the base pointer is deliberately offset.
//
#define LD(i) TYPE r##i = vin[idx+(i*WARP_SIZE)]

#define ST(i) vout[idx+(i*WARP_SIZE)] = r##i

//
// REP_k(): expand OP(0) ... OP(k-1) -- OP is bound to LD or ST
// at the point of use inside the kernel.
//
#define REP_1() OP(0)

#define REP_2() REP_1(); OP(1)

#define REP_4() REP_2(); OP(2); OP(3)

#define REP_8() REP_4(); OP(4); OP(5); OP(6); OP(7)

//
// Select the unrolled repetition count at compile time.
//
#if   REPS == 1
#define REP_N() REP_1()
#elif REPS == 2
#define REP_N() REP_2()
#elif REPS == 4
#define REP_N() REP_4()
#elif REPS == 8
#define REP_N() REP_8()
#else
#error INVALID REP
#endif
//
//
//
//
// Copy REPS elements per thread from vin to vout.
//
// Launch layout: 1-D grid of 1-D blocks of NUM_THREADS_PER_BLOCK threads.
// Each warp handles REPS consecutive warp-aligned 32-element spans, so
// every global load/store is fully coalesced and the Visual Profiler
// should report 100% load/store efficiency -- unless vin/vout were
// byte-offset by the caller (see LD_OFFSET/ST_OFFSET in main).
//
__global__
void
__launch_bounds__(NUM_THREADS_PER_BLOCK,NUM_BLOCKS_PER_SM)
geff(const TYPE* const RESTRICT vin, TYPE* const RESTRICT vout)
{
  // global warp index
  const unsigned int bidx = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
  // this thread's base element: warp base + lane id
  const unsigned int idx  = bidx * REPS * WARP_SIZE + (threadIdx.x & (WARP_SIZE-1));

  // issue all loads ...
#undef  OP
#define OP(i) LD(i)
  REP_N();

  // ... then all stores
#undef  OP
#define OP(i) ST(i)
  REP_N();
}
//
//
//
//
// Select a device (optional first argument, default 0), launch one
// resident wave of geff() blocks, and report any CUDA errors.
// Returns 0 on success, -1 on setup failure.
//
int main(int argc, char** argv)
{
  cudaError_t err;

  const int device = (argc == 1) ? 0 : atoi(argv[1]);

  cudaDeviceProp props;
  err = cudaGetDeviceProperties(&props,device);
  if (err != cudaSuccess)
    {
      fprintf(stderr,"cudaGetDeviceProperties: %s\n",cudaGetErrorString(err));
      return -1;
    }
  printf("%s (%2d)\n",props.name,props.multiProcessorCount);

  err = cudaSetDevice(device);
  if (err != cudaSuccess)
    {
      fprintf(stderr,"cudaSetDevice: %s\n",cudaGetErrorString(err));
      return -1;
    }

  //
  // Allocate one REPS-element span per thread.  The optional byte
  // offsets deliberately misalign the base pointers so the profiler
  // reports <100% global load/store efficiency.
  //
#define LD_OFFSET 0 // sizeof(TYPE) // uncomment for uncoalesced loads
#define ST_OFFSET 0 // sizeof(TYPE) // uncomment for uncoalesced stores

  const unsigned int numThreads = NUM_THREADS_PER_BLOCK;
  const unsigned int numBlocks  = NUM_BLOCKS_PER_SM * props.multiProcessorCount;
  const unsigned int vsize      = numBlocks * numThreads * REPS * sizeof(TYPE);

  TYPE* vin  = 0;
  TYPE* vout = 0;
  if ((cudaMalloc(&vin, vsize+LD_OFFSET) != cudaSuccess) ||
      (cudaMalloc(&vout,vsize+ST_OFFSET) != cudaSuccess))
    {
      fprintf(stderr,"cudaMalloc failed\n");
      return -1;
    }

  //
  // LAUNCH KERNEL
  //
  geff<<<numBlocks,numThreads>>>(vin +(LD_OFFSET/sizeof(TYPE)),
                                 vout+(ST_OFFSET/sizeof(TYPE)));

  err = cudaGetLastError();        // launch-configuration errors
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize(); // asynchronous execution errors
  if (err != cudaSuccess)
    printf("Err = %d (%s)\n",err,cudaGetErrorString(err));

  //
  // CLEAN UP
  //
  cudaFree(vin);
  cudaFree(vout);
  cudaDeviceReset();

  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile with (substitute your GPU's compute capability for sm_XX, e.g. sm_35):
nvcc -m32 -arch=sm_XX -Xptxas=-v geff.cu -o geff.exe