|
#include <sys/time.h> |
|
#include <stdio.h> |
|
#include <cuda_runtime.h> |
|
|
|
/*
 * Returns the current wall-clock time in seconds (microsecond resolution),
 * for host-side interval timing: elapsed = seconds() - t0.
 */
inline double seconds()
{
    struct timeval tp;

    // The timezone argument of gettimeofday() is obsolete per POSIX; pass
    // NULL instead of a struct timezone. (Also drops the unused return
    // value previously stored in a dead local.)
    gettimeofday(&tp, NULL);

    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
|
|
|
// Thin wrappers over the POSIX environment accessors so call sites read
// uniformly; `ow` is the overwrite flag forwarded to setenv(3).
#define SET_ENV(name, value, ow) setenv(name, value, ow)

#define GET_ENV(name) getenv(name)
|
|
|
/*
 * Checks the result of a CUDA runtime call and, on failure, prints the file,
 * line, error code, and error string, then terminates the process.
 *
 * Wrapped in do { } while (0) so that `CHECK(call);` expands to a single
 * statement — the previous bare-brace form left a dangling `;` that broke
 * use inside if/else chains. The exit(1) makes errors fatal; otherwise a
 * sticky CUDA error would silently cascade through every later call.
 */
#define CHECK(call)                                                           \
do                                                                            \
{                                                                             \
    const cudaError_t error = call;                                           \
    if (error != cudaSuccess)                                                 \
    {                                                                         \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                \
        fprintf(stderr, "code: %d, reason: %s\n", error,                      \
                cudaGetErrorString(error));                                   \
        exit(1);                                                              \
    }                                                                         \
} while (0)
|
|
|
/* |
|
* An example of using CUDA callbacks to trigger work on the host after the |
|
* completion of asynchronous work on the device. In this example, n_streams |
|
* CUDA streams are created and 4 kernels are launched asynchronously in each. |
|
* Then, a callback is added at the completion of those asynchronous kernels |
|
* that prints diagnostic information. |
|
*/ |
|
|
|
#define N 10000000 |
|
#define NSTREAM 4 |
|
|
|
/*
 * Host callback run by the CUDA runtime after all preceding work in `stream`
 * completes. `data` points at the integer id of the stream that fired it;
 * `stream` and `status` are unused here.
 */
void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void *data)
{
    int stream_id = *((int *)data);
    printf("callback from stream %d\n", stream_id);
}
|
|
|
/*
 * Busy-work kernel: accumulates tan(0.1)*tan(0.1) N times so the launch
 * occupies the device long enough for stream behavior to be observable.
 * Launched with a 1x1 grid/block by main; prints its result when done.
 */
__global__ void kernel_1()
{
    double sum = 0.0;

    for (int iter = 0; iter < N; iter++)
    {
        sum += tan(0.1) * tan(0.1);
    }

    printf("kernel_1 %lf done\n", sum);
}
|
|
|
/*
 * Busy-work kernel identical to kernel_1 except for its printed label;
 * four distinct kernels let the trace distinguish launches within a stream.
 */
__global__ void kernel_2()
{
    double sum = 0.0;

    for (int iter = 0; iter < N; iter++)
    {
        sum += tan(0.1) * tan(0.1);
    }

    printf("kernel_2 %lf done\n", sum);
}
|
|
|
/*
 * Busy-work kernel identical to kernel_1 except for its printed label.
 */
__global__ void kernel_3()
{
    double sum = 0.0;

    for (int iter = 0; iter < N; iter++)
    {
        sum += tan(0.1) * tan(0.1);
    }

    printf("kernel_3 %lf done\n", sum);
}
|
|
|
/*
 * Busy-work kernel identical to kernel_1 except for its printed label.
 */
__global__ void kernel_4()
{
    double sum = 0.0;

    for (int iter = 0; iter < N; iter++)
    {
        sum += tan(0.1) * tan(0.1);
    }

    printf("kernel_4 %lf done\n", sum);
}
|
|
|
/*
 * Creates n_streams CUDA streams, asynchronously launches four kernels in
 * each, and times four variants that attach 0, 1, 2, or 4 host callbacks
 * per stream, so the cost and ordering of stream callbacks can be compared.
 * Optional argv[1] overrides the stream count (default NSTREAM).
 */
int main(int argc, char **argv)
{
    int n_streams = NSTREAM;
    if (argc > 1) n_streams = atoi(argv[1]);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> %s Starting...\n", argv[0]);
    printf("> Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // check if device supports Hyper-Q (requires SM 3.5+)
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // set up max connections (hardware work queues to the device)
    const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
    SET_ENV(iname, "8", 1);
    //printf ("> %s = %s\n", iname, GET_ENV(iname));
    printf("> with streams = %d\n", n_streams);

    // Allocate an array of stream handles and one integer id per stream
    // (the ids are handed to the callbacks as user data).
    cudaStream_t *streams =
        (cudaStream_t *) malloc(n_streams * sizeof(cudaStream_t));
    // BUG FIX: this was malloc(n_streams) — n_streams *bytes*, overflowed
    // as soon as the ids were written. Allocate n_streams ints.
    int *stream_ids = (int *) malloc(n_streams * sizeof(int));

    if (streams == NULL || stream_ids == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    for (int i = 0; i < n_streams; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
        stream_ids[i] = i;   // set once; every variant below reuses these
    }

    // Single-thread launches: each kernel is deliberately tiny so the four
    // launches per stream can overlap across streams.
    dim3 block(1);
    dim3 grid(1);
    cudaEvent_t start_event, stop_event;
    CHECK(cudaEventCreate(&start_event));
    CHECK(cudaEventCreate(&stop_event));

    // Variant 1: no callbacks — baseline launch cost.
    {
        CHECK(cudaEventRecord(start_event, 0));

        for (int i = 0; i < n_streams; i++)
        {
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
        }

        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("No callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    CHECK(cudaDeviceSynchronize());

    // Variant 2: one callback per stream, after all four kernels.
    {
        CHECK(cudaEventRecord(start_event, 0));

        for (int i = 0; i < n_streams; i++)
        {
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
        }

        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("One callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    CHECK(cudaDeviceSynchronize());

    // Variant 3: two callbacks per stream (after kernel_2 and kernel_4).
    {
        CHECK(cudaEventRecord(start_event, 0));

        for (int i = 0; i < n_streams; i++)
        {
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
        }

        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("Two callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    CHECK(cudaDeviceSynchronize());

    // Variant 4: a callback after every kernel (four per stream).
    {
        CHECK(cudaEventRecord(start_event, 0));

        for (int i = 0; i < n_streams; i++)
        {
            kernel_1<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
            kernel_2<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
            kernel_3<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
        }

        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("Four callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    CHECK(cudaDeviceSynchronize());

    // release timing events and all streams
    CHECK(cudaEventDestroy(start_event));
    CHECK(cudaEventDestroy(stop_event));

    for (int i = 0; i < n_streams; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
    }

    free(streams);
    free(stream_ids);

    /*
     * cudaDeviceReset must be called before exiting in order for profiling and
     * tracing tools such as Nsight and Visual Profiler to show complete traces.
     */
    CHECK(cudaDeviceReset());

    return 0;
}
// View the per-stream timeline with the NVIDIA Visual Profiler (nvvp).