Is concurrency lost by cudaStreamAddCallback?
$ nvcc simpleCallback.cu -O2 -o simpleCallback
$ nvprof -f -o simpleCallback.nvvp ./simpleCallback | grep elapsed
No callback: elapsed time = 1.534s
One callback: elapsed time = 1.498s
Two callback: elapsed time = 3.718s
Four callback: elapsed time = 5.194s

As the number of callbacks per stream increases, the run gets slower...

#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

inline double seconds()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

#define SET_ENV(name, value, ow) setenv(name, value, ow)
#define GET_ENV(name) getenv(name)

#define CHECK(call) \
{ \
    const cudaError_t error = call; \
    if (error != cudaSuccess) \
    { \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
        fprintf(stderr, "code: %d, reason: %s\n", error, \
                cudaGetErrorString(error)); \
    } \
}
/*
* An example of using CUDA callbacks to trigger work on the host after the
* completion of asynchronous work on the device. In this example, n_streams
* CUDA streams are created and 4 kernels are launched asynchronously in each.
* Then, a callback is added at the completion of those asynchronous kernels
* that prints diagnostic information.
*/
#define N 10000000
#define NSTREAM 4

void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void *data)
{
    printf("callback from stream %d\n", *((int *)data));
}

__global__ void kernel_1()
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
    printf("kernel_1 %lf done\n", sum);
}

__global__ void kernel_2()
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
    printf("kernel_2 %lf done\n", sum);
}

__global__ void kernel_3()
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
    printf("kernel_3 %lf done\n", sum);
}

__global__ void kernel_4()
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
    printf("kernel_4 %lf done\n", sum);
}
int main(int argc, char **argv)
{
    int n_streams = NSTREAM;
    if (argc > 1) n_streams = atoi(argv[1]);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> %s Starting...\n", argv[0]);
    printf("> Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // check if the device supports Hyper-Q
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // set up the maximum number of device connections
    const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
    SET_ENV(iname, "8", 1);
    char *ivalue = GET_ENV(iname);
    //printf ("> %s = %s\n", iname, ivalue);
    printf ("> with streams = %d\n", n_streams);

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(cudaStream_t));
    for (int i = 0; i < n_streams; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
    }

    dim3 block(1);
    dim3 grid(1);

    cudaEvent_t start_event, stop_event;
    CHECK(cudaEventCreate(&start_event));
    CHECK(cudaEventCreate(&stop_event));

    //int stream_ids[n_streams];
    int *stream_ids = (int *)malloc(n_streams * sizeof(int));

    {
        CHECK(cudaEventRecord(start_event, 0));
        for (int i = 0; i < n_streams; i++)
        {
            stream_ids[i] = i;
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
        }
        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("No callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    cudaDeviceSynchronize();

    {
        CHECK(cudaEventRecord(start_event, 0));
        for (int i = 0; i < n_streams; i++)
        {
            stream_ids[i] = i;
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
        }
        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("One callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    cudaDeviceSynchronize();

    {
        CHECK(cudaEventRecord(start_event, 0));
        for (int i = 0; i < n_streams; i++)
        {
            stream_ids[i] = i;
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
        }
        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("Two callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    cudaDeviceSynchronize();

    {
        CHECK(cudaEventRecord(start_event, 0));
        for (int i = 0; i < n_streams; i++)
        {
            stream_ids[i] = i;
            kernel_1<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
            kernel_2<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
            kernel_3<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback, (void *)(stream_ids + i), 0));
        }
        CHECK(cudaEventRecord(stop_event, 0));
        CHECK(cudaEventSynchronize(stop_event));

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
        printf("Four callback: elapsed time = %.3fs\n", elapsed_time / 1000.0f);
    }
    cudaDeviceSynchronize();

    // release all streams
    for (int i = 0; i < n_streams; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
    }
    free(streams);
    free(stream_ids);

    /*
     * cudaDeviceReset must be called before exiting in order for profiling and
     * tracing tools such as Nsight and Visual Profiler to show complete traces.
     */
    CHECK(cudaDeviceReset());
    return 0;
}
sonots commented Jul 21, 2017:

NVIDIA Visual Profiler timeline
[screenshot]

sonots commented Jul 21, 2017:

Q. I expected all versions to finish in roughly the same time, but the "Two callback" and "Four callback" cases became slow. In the two-callback case, why did stream13 and stream14 run concurrently while stream15 and stream16 did not?

sonots commented Aug 19, 2017:

A. It seems we also have to create threads on the CPU side, so that each stream is driven by its own host thread.
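
For what it's worth, a minimal sketch of that idea, assuming a C++11 host compiler (nvcc -std=c++11): each stream gets its own host thread that enqueues the kernels and callbacks. launch_streams_with_host_threads is a hypothetical helper invented for illustration; kernel_1..kernel_4, my_callback, and CHECK refer to the code above.

// Sketch only: drive each stream from a dedicated host thread, so that the
// host-side handling of one stream's callback does not hold up launches
// into the other streams. This is an assumption based on the comment above,
// not a verified fix.
#include <thread>
#include <vector>

void launch_streams_with_host_threads(cudaStream_t *streams, int *stream_ids,
                                      int n_streams, dim3 grid, dim3 block)
{
    std::vector<std::thread> workers;

    for (int i = 0; i < n_streams; i++)
    {
        stream_ids[i] = i;
        workers.emplace_back([=]()
        {
            // Same per-stream work as the "Two callback" case above, but
            // enqueued from this stream's own host thread.
            kernel_1<<<grid, block, 0, streams[i]>>>();
            kernel_2<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
            kernel_3<<<grid, block, 0, streams[i]>>>();
            kernel_4<<<grid, block, 0, streams[i]>>>();
            CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                        (void *)(stream_ids + i), 0));
        });
    }

    // Wait until every host thread has finished enqueueing its work; the
    // device-side execution itself remains asynchronous.
    for (auto &t : workers) t.join();
}

Whether this actually restores the concurrency of the no-callback case would still need to be re-checked with nvprof on the same GPU.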
