Measure runtimes of commonly used CUDA runtime APIs
// Compiling and running this program:
// nvcc -std=c++11 cuda-runtime-api-perf.cu && ./a.out
#include <chrono>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
#define __CUDA(call)                                          \
  do {                                                        \
    cudaError_t status = call;                                \
    if(status != cudaSuccess) {                               \
      printf("FAIL: call='%s'. Reason:%s\n", #call,           \
             cudaGetErrorString(status));                     \
      exit(1);                                                \
    }                                                         \
  } while (0)
#define CUDA_NO_TIMING(cnt, func, ...)                        \
  do {                                                        \
    for (int itr = 0; itr < cnt; ++itr) {                     \
      __CUDA(func(__VA_ARGS__));                              \
    }                                                         \
  } while (0)
#define TIMEIT_COMMENT(cnt, comment, func, ...)                         \
  do {                                                                  \
    auto s = chrono::high_resolution_clock::now();                      \
    CUDA_NO_TIMING(cnt, func, __VA_ARGS__);                             \
    auto end = chrono::high_resolution_clock::now();                    \
    auto diff = chrono::duration_cast<chrono::microseconds>(end - s);   \
    printf("%-64s -> %12.3lf us %s\n", #func,                           \
           diff.count() / double(cnt), comment);                        \
    fflush(stdout);                                                     \
  } while (0)
#define TIMEIT(cnt, func, ...) TIMEIT_COMMENT(cnt, "", func, __VA_ARGS__)
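// Usage sketch (hypothetical example, not one of the measurements below):
//   TIMEIT(COUNT, cudaSetDevice, 0);
// runs COUNT back-to-back cudaSetDevice(0) calls and prints the mean
// latency per call in microseconds.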
// Intentionally empty kernel: the launch timings below measure only the
// host-side launch overhead, not any device-side work.
__global__ void dummyKernel(int val) {
}
static constexpr int COUNT = 25;
void deviceManagement() {
  int devId;
  TIMEIT(COUNT, cudaGetDevice, &devId);
  TIMEIT(COUNT, cudaSetDevice, devId);
  unsigned flags;
  TIMEIT(COUNT, cudaGetDeviceFlags, &flags);
  int count;
  TIMEIT(COUNT, cudaGetDeviceCount, &count);
  cudaDeviceProp prop;
  TIMEIT(COUNT, cudaGetDeviceProperties, &prop, devId);
  int smemSize;
  TIMEIT(COUNT, cudaDeviceGetAttribute, &smemSize,
         cudaDevAttrMaxSharedMemoryPerBlock, devId);
  cudaFuncCache config;
  TIMEIT(COUNT, cudaDeviceGetCacheConfig, &config);
  char name[128];
  TIMEIT(COUNT, cudaDeviceGetPCIBusId, name, 128, devId);
  cudaSharedMemConfig sConfig;
  TIMEIT(COUNT, cudaDeviceGetSharedMemConfig, &sConfig);
  TIMEIT(COUNT, cudaDeviceSynchronize);
}
void streamManagement() {
  cudaStream_t streams[COUNT];
  TIMEIT(COUNT, cudaStreamCreate, &streams[itr]);
  TIMEIT(COUNT, cudaStreamSynchronize, streams[itr]);
  TIMEIT(COUNT, cudaStreamQuery, streams[itr]);
  cudaEvent_t event;
  unsigned flags = 0;
  __CUDA(cudaEventCreate(&event));
  TIMEIT(COUNT, cudaStreamWaitEvent, streams[itr], event, flags);
  __CUDA(cudaEventDestroy(event));
  cudaStreamCaptureStatus capStatus;
  TIMEIT(COUNT, cudaStreamIsCapturing, streams[itr], &capStatus);
#if (CUDART_VERSION >= 10100)
  unsigned long long pid;
  TIMEIT(COUNT, cudaStreamGetCaptureInfo, streams[itr], &capStatus, &pid);
#endif
  TIMEIT(COUNT, cudaStreamDestroy, streams[itr]);
  // check timing the second time
  TIMEIT_COMMENT(COUNT, "# second time", cudaStreamCreate, &streams[itr]);
  CUDA_NO_TIMING(COUNT, cudaStreamDestroy, streams[itr]);
  TIMEIT(COUNT, cudaStreamCreateWithFlags, &streams[itr], 0);
  TIMEIT(COUNT, cudaStreamGetFlags, streams[itr], &flags);
  CUDA_NO_TIMING(COUNT, cudaStreamDestroy, streams[itr]);
  int priority;
  TIMEIT(COUNT, cudaStreamCreateWithPriority, &streams[itr], 0, 0);
  TIMEIT(COUNT, cudaStreamGetPriority, streams[itr], &priority);
  CUDA_NO_TIMING(COUNT, cudaStreamDestroy, streams[itr]);
}
void eventManagement() {
  cudaEvent_t events[COUNT];
  TIMEIT(COUNT, cudaEventCreate, &events[itr]);
  TIMEIT(COUNT, cudaEventSynchronize, events[itr]);
  TIMEIT(COUNT, cudaEventRecord, events[itr], 0);
  ///@todo: causes the following error:
  // FAIL: call='cudaEventQuery(events[itr])'. Reason:device not ready
  //TIMEIT(COUNT, cudaEventQuery, events[itr]);
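  // (cudaEventQuery returns cudaErrorNotReady until the recorded event has
  //  actually completed on the device; __CUDA treats any non-success status
  //  as a failure, hence the message above.)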
  float ms;
  TIMEIT(COUNT, cudaEventElapsedTime, &ms, events[0], events[itr]);
  TIMEIT(COUNT, cudaEventDestroy, events[itr]);
  unsigned flags = 0;
  TIMEIT(COUNT, cudaEventCreateWithFlags, &events[itr], flags);
  CUDA_NO_TIMING(COUNT, cudaEventDestroy, events[itr]);
}
void errorManagement() {
  TIMEIT(COUNT, cudaGetLastError);
  TIMEIT(COUNT, cudaPeekAtLastError);
}
void executionControl() {
  int val = 0;
  void* args = reinterpret_cast<void*>(&val);
  cudaStream_t stream;
  __CUDA(cudaStreamCreate(&stream));
  void* fKernel = reinterpret_cast<void*>(&dummyKernel);
  dim3 blks(1), threads(256);
  // initial warm-up launch
  cudaLaunchKernel(fKernel, blks, threads, &args, size_t(0), stream);
  __CUDA(cudaDeviceSynchronize());
  TIMEIT(COUNT, cudaLaunchKernel, fKernel, blks, threads, &args,
         size_t(0), stream);
  __CUDA(cudaDeviceSynchronize());
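  // Note: cooperative launches need device support (cudaDevAttrCooperativeLaunch);
  // on devices without it, the launch below may fail and __CUDA will abort.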
  TIMEIT(COUNT, cudaLaunchCooperativeKernel, fKernel, blks, threads, &args,
         size_t(0), stream);
  __CUDA(cudaDeviceSynchronize());
  {
    auto s = chrono::high_resolution_clock::now();
    for (int i = 0; i < COUNT; ++i) {
      dummyKernel<<<blks, threads, 0, stream>>>(val);
    }
    auto end = chrono::high_resolution_clock::now();
    auto diff = chrono::duration_cast<chrono::microseconds>(end - s);
    printf("%-64s -> %12.3lf us\n", "<<<...>>>", diff.count() / double(COUNT));
    fflush(stdout);
    __CUDA(cudaGetLastError());
  }
  __CUDA(cudaDeviceSynchronize());
  TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferNone", cudaFuncSetCacheConfig,
                 fKernel, cudaFuncCachePreferNone);
  TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferShared", cudaFuncSetCacheConfig,
                 fKernel, cudaFuncCachePreferShared);
  TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferL1", cudaFuncSetCacheConfig,
                 fKernel, cudaFuncCachePreferL1);
  TIMEIT_COMMENT(COUNT, "# cudaFuncCachePreferEqual", cudaFuncSetCacheConfig,
                 fKernel, cudaFuncCachePreferEqual);
  TIMEIT_COMMENT(COUNT, "# cudaSharedMemBankSizeDefault",
                 cudaFuncSetSharedMemConfig, fKernel,
                 cudaSharedMemBankSizeDefault);
  TIMEIT_COMMENT(COUNT, "# cudaSharedMemBankSizeFourByte",
                 cudaFuncSetSharedMemConfig, fKernel,
                 cudaSharedMemBankSizeFourByte);
  TIMEIT_COMMENT(COUNT, "# cudaSharedMemBankSizeEightByte",
                 cudaFuncSetSharedMemConfig, fKernel,
                 cudaSharedMemBankSizeEightByte);
  cudaLaunchParams params;
  params.args = &args;
  params.blockDim = dim3(256);
  params.func = fKernel;
  params.gridDim = dim3(1);
  params.sharedMem = 0;
  params.stream = stream;
  TIMEIT(COUNT, cudaLaunchCooperativeKernelMultiDevice, &params, 1, 0);
  __CUDA(cudaDeviceSynchronize());
  __CUDA(cudaStreamDestroy(stream));
  cudaFuncAttributes fAttr;
  TIMEIT(COUNT, cudaFuncGetAttributes, &fAttr, fKernel);
  TIMEIT_COMMENT(COUNT, " # cudaFuncAttributeMaxDynamicSharedMemorySize",
                 cudaFuncSetAttribute, fKernel,
                 cudaFuncAttributeMaxDynamicSharedMemorySize, val);
  TIMEIT_COMMENT(COUNT, " # cudaFuncAttributePreferredSharedMemoryCarveout",
                 cudaFuncSetAttribute, fKernel,
                 cudaFuncAttributePreferredSharedMemoryCarveout, val);
}
void occupancy() {
  int nBlks;
  TIMEIT(COUNT, cudaOccupancyMaxActiveBlocksPerMultiprocessor, &nBlks,
         dummyKernel, 256, 0);
  TIMEIT(COUNT, cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, &nBlks,
         dummyKernel, 256, 0, 0);
}
void memoryManagement() {
  size_t free, total;
  TIMEIT(COUNT, cudaMemGetInfo, &free, &total);
}
void unifiedAddressing() {
  char* ptr;
  __CUDA(cudaMalloc((void**)&ptr, sizeof(char)));
  cudaPointerAttributes attr;
  TIMEIT(COUNT, cudaPointerGetAttributes, &attr, ptr);
  __CUDA(cudaFree(ptr));
}
int main(int argc, char** argv) {
  cudaFree(nullptr); // just to create context
  printf("****************** Device Management ******************\n");
  deviceManagement();
  printf("****************** Error Management ******************\n");
  errorManagement();
  printf("****************** Stream Management ******************\n");
  streamManagement();
  printf("****************** Event Management ******************\n");
  eventManagement();
  printf("****************** Execution Control ******************\n");
  executionControl();
  printf("****************** Occupancy ******************\n");
  occupancy();
  printf("****************** Memory Management ******************\n");
  memoryManagement();
  printf("****************** Unified Addressing *****************\n");
  unifiedAddressing();
  return 0;
}
teju85 commented Nov 20, 2019

Here's the output on a DGX-1 running CUDA v10.0 with driver v410.79:

$ nvcc -std=c++11 cuda-runtime-api-perf.cu && ./a.out
****************** Device Management ******************
cudaGetDevice                                                    ->        0.040 us
cudaSetDevice                                                    ->        0.080 us
cudaGetDeviceFlags                                               ->        0.040 us
cudaGetDeviceCount                                               ->        0.040 us
cudaGetDeviceProperties                                          ->     1183.960 us
cudaDeviceGetAttribute                                           ->        0.040 us
cudaDeviceGetCacheConfig                                         ->        0.120 us
cudaDeviceGetPCIBusId                                            ->        0.240 us
cudaDeviceGetSharedMemConfig                                     ->        0.080 us
cudaDeviceSynchronize                                            ->        4.680 us
****************** Error Management ******************
cudaGetLastError                                                 ->        0.040 us
cudaPeekAtLastError                                              ->        0.000 us
****************** Stream Management ******************
cudaStreamCreate                                                 ->       26.600 us
cudaStreamSynchronize                                            ->        3.200 us
cudaStreamQuery                                                  ->        3.000 us
cudaStreamWaitEvent                                              ->        0.120 us
cudaStreamIsCapturing                                            ->        0.800 us
cudaStreamDestroy                                                ->        5.600 us
cudaStreamCreate                                                 ->        0.920 us   # second time
cudaStreamCreateWithFlags                                        ->        0.800 us
cudaStreamGetFlags                                               ->        0.080 us
cudaStreamCreateWithPriority                                     ->        0.800 us
cudaStreamGetPriority                                            ->        0.080 us
****************** Event Management ******************
cudaEventCreate                                                  ->        0.440 us
cudaEventSynchronize                                             ->        1.840 us
cudaEventRecord                                                  ->        1.720 us
cudaEventElapsedTime                                             ->        3.480 us
cudaEventDestroy                                                 ->        0.240 us
cudaEventCreateWithFlags                                         ->        0.320 us
****************** Execution Control ******************
cudaLaunchKernel                                                 ->        3.560 us
cudaLaunchCooperativeKernel                                      ->        4.960 us
<<<...>>>                                                        ->        3.800 us
cudaFuncSetCacheConfig                                           ->        0.160 us   # cudaFuncCachePreferNone
cudaFuncSetCacheConfig                                           ->        0.120 us   # cudaFuncCachePreferShared
cudaFuncSetCacheConfig                                           ->        0.120 us   # cudaFuncCachePreferL1
cudaFuncSetCacheConfig                                           ->        0.120 us   # cudaFuncCachePreferEqual
cudaFuncSetSharedMemConfig                                       ->        0.120 us   # cudaSharedMemBankSizeDefault
cudaFuncSetSharedMemConfig                                       ->        0.120 us   # cudaSharedMemBankSizeFourByte
cudaFuncSetSharedMemConfig                                       ->        0.120 us   # cudaSharedMemBankSizeEightByte
cudaLaunchCooperativeKernelMultiDevice                           ->        5.000 us
cudaFuncGetAttributes                                            ->        0.760 us
cudaFuncSetAttribute                                             ->        0.160 us    # cudaFuncAttributeMaxDynamicSharedMemorySize
cudaFuncSetAttribute                                             ->        0.120 us    # cudaFuncAttributePreferredSharedMemoryCarveout
****************** Occupancy ******************
cudaOccupancyMaxActiveBlocksPerMultiprocessor                    ->        0.240 us
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags           ->        0.200 us
****************** Memory Management ******************
cudaMemGetInfo                                                   ->      413.320 us
****************** Unified Addressing *****************
cudaPointerGetAttributes                                         ->        0.200 us
