Last active
November 20, 2019 06:22
-
-
Save teju85/baa0ce889ff8b26d27e184342f9eb14b to your computer and use it in GitHub Desktop.
Measure runtimes of commonly used cuda runtime APIs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compiling and running this program: | |
// nvcc -std=c++11 cuda-runtime-api-perf.cu && ./a.out | |
#include <chrono> | |
#include <stdio.h> | |
#include <stdlib.h> | |
using namespace std; | |
#define __CUDA(call) \ | |
do { \ | |
cudaError_t status = call; \ | |
if(status != cudaSuccess) { \ | |
printf("FAIL: call='%s'. Reason:%s\n", #call, \ | |
cudaGetErrorString(status)); \ | |
exit(1); \ | |
} \ | |
} while (0) | |
#define CUDA_NO_TIMING(cnt, func, ...) \ | |
do { \ | |
for (int itr = 0; itr < cnt; ++itr) { \ | |
__CUDA(func(__VA_ARGS__)); \ | |
} \ | |
} while (0) | |
#define TIMEIT_COMMENT(cnt, comment, func, ...) \ | |
do { \ | |
auto s = chrono::high_resolution_clock::now(); \ | |
CUDA_NO_TIMING(cnt, func, __VA_ARGS__); \ | |
auto end = chrono::high_resolution_clock::now(); \ | |
auto diff = chrono::duration_cast<chrono::microseconds>(end - s); \ | |
printf("%-64s -> %12.3lf us %s\n", #func, \ | |
diff.count() / double(cnt), comment); \ | |
fflush(stdout); \ | |
} while (0) | |
#define TIMEIT(cnt, func, ...) TIMEIT_COMMENT(cnt, "", func, __VA_ARGS__) | |
__global__ void dummyKernel(int val) { | |
} | |
static constexpr int COUNT = 25; | |
void deviceManagement() { | |
int devId; | |
TIMEIT(COUNT, cudaGetDevice, &devId); | |
TIMEIT(COUNT, cudaSetDevice, devId); | |
unsigned flags; | |
TIMEIT(COUNT, cudaGetDeviceFlags, &flags); | |
int count; | |
TIMEIT(COUNT, cudaGetDeviceCount, &count); | |
cudaDeviceProp prop; | |
TIMEIT(COUNT, cudaGetDeviceProperties, &prop, devId); | |
int smemSize; | |
TIMEIT(COUNT, cudaDeviceGetAttribute, &smemSize, | |
cudaDevAttrMaxSharedMemoryPerBlock, devId); | |
cudaFuncCache config; | |
TIMEIT(COUNT, cudaDeviceGetCacheConfig, &config); | |
char name[128]; | |
TIMEIT(COUNT, cudaDeviceGetPCIBusId, name, 128, devId); | |
cudaSharedMemConfig sConfig; | |
TIMEIT(COUNT, cudaDeviceGetSharedMemConfig, &sConfig); | |
TIMEIT(COUNT, cudaDeviceSynchronize); | |
} | |
void streamManagement() { | |
cudaStream_t streams[COUNT]; | |
TIMEIT(COUNT, cudaStreamCreate, &streams[itr]); | |
TIMEIT(COUNT, cudaStreamSynchronize, streams[itr]); | |
TIMEIT(COUNT, cudaStreamQuery, streams[itr]); | |
cudaEvent_t event; | |
unsigned flags = 0; | |
__CUDA(cudaEventCreate(&event)); | |
TIMEIT(COUNT, cudaStreamWaitEvent, streams[itr], event, flags); | |
__CUDA(cudaEventDestroy(event)); | |
cudaStreamCaptureStatus capStatus; | |
TIMEIT(COUNT, cudaStreamIsCapturing, streams[itr], &capStatus); | |
#if (CUDART_VERSION >= 10100) | |
unsigned long long pid; | |
TIMEIT(COUNT, cudaStreamGetCaptureInfo, streams[itr], &capStatus, &pid); | |
#endif | |
TIMEIT(COUNT, cudaStreamDestroy, streams[itr]); | |
// check timing the second time | |
TIMEIT_COMMENT(COUNT, "# second time", cudaStreamCreate, &streams[itr]); | |
CUDA_NO_TIMING(COUNT, cudaStreamDestroy, streams[itr]); | |
TIMEIT(COUNT, cudaStreamCreateWithFlags, &streams[itr], 0); | |
TIMEIT(COUNT, cudaStreamGetFlags, streams[itr], &flags); | |
CUDA_NO_TIMING(COUNT, cudaStreamDestroy, streams[itr]); | |
int priority; | |
TIMEIT(COUNT, cudaStreamCreateWithPriority, &streams[itr], 0, 0); | |
TIMEIT(COUNT, cudaStreamGetPriority, streams[itr], &priority); | |
CUDA_NO_TIMING(COUNT, cudaStreamDestroy, streams[itr]); | |
} | |
void eventManagement() { | |
cudaEvent_t events[COUNT]; | |
TIMEIT(COUNT, cudaEventCreate, &events[itr]); | |
TIMEIT(COUNT, cudaEventSynchronize, events[itr]); | |
TIMEIT(COUNT, cudaEventRecord, events[itr], 0); | |
///@todo: causes the following error: | |
// FAIL: call='cudaEventQuery(events[itr])'. Reason:device not ready | |
//TIMEIT(COUNT, cudaEventQuery, events[itr]); | |
float ms; | |
TIMEIT(COUNT, cudaEventElapsedTime, &ms, events[0], events[itr]); | |
TIMEIT(COUNT, cudaEventDestroy, events[itr]); | |
unsigned flags = 0; | |
TIMEIT(COUNT, cudaEventCreateWithFlags, &events[itr], flags); | |
CUDA_NO_TIMING(COUNT, cudaEventDestroy, events[itr]); | |
} | |
void errorManagement() { | |
TIMEIT(COUNT, cudaGetLastError); | |
TIMEIT(COUNT, cudaPeekAtLastError); | |
} | |
void executionControl() { | |
int val = 0; | |
void* args = reinterpret_cast<void*>(&val); | |
cudaStream_t stream; | |
__CUDA(cudaStreamCreate(&stream)); | |
void* fKernel = reinterpret_cast<void*>(&dummyKernel); | |
dim3 blks(1), threads(256); | |
// initial warm-up launch | |
cudaLaunchKernel(fKernel, blks, threads, &args, size_t(0), stream); | |
__CUDA(cudaDeviceSynchronize()); | |
TIMEIT(COUNT, cudaLaunchKernel, fKernel, blks, threads, &args, | |
size_t(0), stream); | |
__CUDA(cudaDeviceSynchronize()); | |
TIMEIT(COUNT, cudaLaunchCooperativeKernel, fKernel, blks, threads, &args, | |
size_t(0), stream); | |
__CUDA(cudaDeviceSynchronize()); | |
{ | |
auto s = chrono::high_resolution_clock::now(); | |
for (int i = 0; i < COUNT; ++i) { | |
dummyKernel<<<blks, threads, 0, stream>>>(val); | |
} | |
auto end = chrono::high_resolution_clock::now(); | |
auto diff = chrono::duration_cast<chrono::microseconds>(end - s); | |
printf("%-64s -> %12.3lf us\n", "<<<...>>>", diff.count() / double(COUNT)); | |
fflush(stdout); | |
__CUDA(cudaGetLastError()); | |
} | |
__CUDA(cudaDeviceSynchronize()); | |
TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferNone", cudaFuncSetCacheConfig, | |
fKernel, cudaFuncCachePreferNone); | |
TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferShared", cudaFuncSetCacheConfig, | |
fKernel, cudaFuncCachePreferShared); | |
TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferL1", cudaFuncSetCacheConfig, | |
fKernel, cudaFuncCachePreferL1); | |
TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferEqual", cudaFuncSetCacheConfig, | |
fKernel, cudaFuncCachePreferEqual); | |
TIMEIT_COMMENT(COUNT, "# cudaSharedMemBankSizeDefault", | |
cudaFuncSetSharedMemConfig, fKernel, | |
cudaSharedMemBankSizeDefault); | |
TIMEIT_COMMENT(COUNT, "# cudaSharedMemBankSizeFourByte", | |
cudaFuncSetSharedMemConfig, fKernel, | |
cudaSharedMemBankSizeFourByte); | |
TIMEIT_COMMENT(COUNT, "# cudaSharedMemBankSizeEightByte", | |
cudaFuncSetSharedMemConfig, fKernel, | |
cudaSharedMemBankSizeEightByte); | |
cudaLaunchParams params; | |
params.args = &args; | |
params.blockDim = dim3(256); | |
params.func = fKernel; | |
params.gridDim = dim3(1); | |
params.sharedMem = 0; | |
params.stream = stream; | |
TIMEIT(COUNT, cudaLaunchCooperativeKernelMultiDevice, ¶ms, 1, 0); | |
__CUDA(cudaDeviceSynchronize()); | |
__CUDA(cudaStreamDestroy(stream)); | |
cudaFuncAttributes fAttr; | |
TIMEIT(COUNT, cudaFuncGetAttributes, &fAttr, fKernel); | |
TIMEIT_COMMENT(COUNT, " # cudaFuncAttributeMaxDynamicSharedMemorySize", | |
cudaFuncSetAttribute, fKernel, | |
cudaFuncAttributeMaxDynamicSharedMemorySize, val); | |
TIMEIT_COMMENT(COUNT, " # cudaFuncAttributePreferredSharedMemoryCarveout", | |
cudaFuncSetAttribute, fKernel, | |
cudaFuncAttributePreferredSharedMemoryCarveout, val); | |
} | |
void occupancy() { | |
int nBlks; | |
TIMEIT(COUNT, cudaOccupancyMaxActiveBlocksPerMultiprocessor, &nBlks, | |
dummyKernel, 256, 0); | |
TIMEIT(COUNT, cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, &nBlks, | |
dummyKernel, 256, 0, 0); | |
} | |
void memoryManagement() { | |
size_t free, total; | |
TIMEIT(COUNT, cudaMemGetInfo, &free, &total); | |
} | |
void unifiedAddressing() { | |
char* ptr; | |
__CUDA(cudaMalloc((void**)&ptr, sizeof(char))); | |
cudaPointerAttributes attr; | |
TIMEIT(COUNT, cudaPointerGetAttributes, &attr, ptr); | |
__CUDA(cudaFree(ptr)); | |
} | |
int main(int argc, char** argv) { | |
cudaFree(nullptr); // just to create context | |
printf("****************** Device Management ******************\n"); | |
deviceManagement(); | |
printf("****************** Error Management ******************\n"); | |
errorManagement(); | |
printf("****************** Stream Management ******************\n"); | |
streamManagement(); | |
printf("****************** Event Management ******************\n"); | |
eventManagement(); | |
printf("****************** Execution Control ******************\n"); | |
executionControl(); | |
printf("****************** Occupancy ******************\n"); | |
occupancy(); | |
printf("****************** Memory Management ******************\n"); | |
memoryManagement(); | |
printf("****************** Unified Addressing *****************\n"); | |
unifiedAddressing(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here's output on a DGX-1 running cuda v10.0 and driver v410.79: