Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active May 19, 2022 03:08
Show Gist options
  • Save allanmac/049837785a10b7999fce6ca282f62dc6 to your computer and use it in GitHub Desktop.
Save allanmac/049837785a10b7999fce6ca282f62dc6 to your computer and use it in GitHub Desktop.
Concurrent kernel test that demonstrates _different_ kernels running concurrently. Hacked from NVIDIA's example. ck_2.cu has two kernels each requiring half of an sm_50 multiprocessor's shared memory. Kernel "a" is run on 5 out of 6 launches, otherwise kernel "b" is launched. ck_6.cu has six kernels.
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
// in CUDA 3.2.
//
// Devices of compute capability 1.x will run the kernels one after another
// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <stdio.h>
#include <cuda_profiler_api.h>
#include "helper_functions.h"
#include "helper_cuda.h"
// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block_a(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
__global__ void clock_block_b(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N)
{
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x; i < N; i+= blockDim.x)
{
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
__syncthreads();
for (int i=16; i>0; i/=2)
{
if (threadIdx.x < i)
{
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
__syncthreads();
}
d_clocks[0] = s_clocks[0];
}
int main(int argc, char **argv)
{
int nkernels = 8; // number of concurrent kernels
int nstreams = nkernels + 1; // use one more stream than concurrent kernel
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
float kernel_time = 10; // time the kernel should run in ms
float elapsed_time; // timing variables
int cuda_device = 0;
printf("[%s] - Starting...\n", argv[0]);
// get number of kernels if overridden on the command line
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
{
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
nstreams = nkernels + 1;
}
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
if ((deviceProp.concurrentKernels == 0))
{
printf("> GPU does not support concurrent kernel execution\n");
printf(" CUDA kernel runs will be serialized\n");
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// allocate host memory
clock_t *a = 0; // pointer to the array data in host memory
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
// allocate device memory
clock_t *d_a = 0; // pointers to data and init value in the device memory
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// create CUDA event handles
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// the events are used for synchronization only and hence do not need to record timings
// this also makes events not introduce global sync points when recorded which is critical to get overlap
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
for (int i = 0; i < nkernels; i++)
{
checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}
//////////////////////////////////////////////////////////////////////
// time execution with nkernels streams
clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
// the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
cudaEventRecord(start_event, 0);
// queue nkernels in separate streams and record when they are done
for (int i=0; i<nkernels; ++i)
{
if (i%6 > 0)
clock_block_a<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
else
clock_block_b<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
total_clocks += time_clocks;
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
// make the last stream wait for the kernel event to be recorded
checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
}
// queue a sum kernel and a copy back to host in the last stream.
// the commands in this stream get dispatched as soon as all the kernel events have been recorded
sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
// at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
// in this sample we just wait until the GPU is done
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
bool bTestResult = (a[0] > total_clocks);
// release resources
for (int i = 0; i < nkernels; i++)
{
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
free(streams);
free(kernelEvent);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
// Calling cudaProfilerStop causes all profile data to be
// flushed before the application exits
checkCudaErrors(cudaProfilerStop());
if (!bTestResult)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
// in CUDA 3.2.
//
// Devices of compute capability 1.x will run the kernels one after another
// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <stdio.h>
#include <cuda_profiler_api.h>
#include "helper_functions.h"
#include "helper_cuda.h"
// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block_a(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
__global__ void clock_block_b(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
__global__ void clock_block_c(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
__global__ void clock_block_d(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
__global__ void clock_block_e(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
__global__ void clock_block_f(clock_t *d_o, clock_t clock_count)
{
__shared__ unsigned int smem[32768/4];
unsigned int start_clock = (unsigned int) clock();
smem[0] = start_clock;
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N)
{
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x; i < N; i+= blockDim.x)
{
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
__syncthreads();
for (int i=16; i>0; i/=2)
{
if (threadIdx.x < i)
{
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
__syncthreads();
}
d_clocks[0] = s_clocks[0];
}
int main(int argc, char **argv)
{
int nkernels = 8; // number of concurrent kernels
int nstreams = nkernels + 1; // use one more stream than concurrent kernel
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
float kernel_time = 10; // time the kernel should run in ms
float elapsed_time; // timing variables
int cuda_device = 0;
printf("[%s] - Starting...\n", argv[0]);
// get number of kernels if overridden on the command line
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
{
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
nstreams = nkernels + 1;
}
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
if ((deviceProp.concurrentKernels == 0))
{
printf("> GPU does not support concurrent kernel execution\n");
printf(" CUDA kernel runs will be serialized\n");
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// allocate host memory
clock_t *a = 0; // pointer to the array data in host memory
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
// allocate device memory
clock_t *d_a = 0; // pointers to data and init value in the device memory
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// create CUDA event handles
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// the events are used for synchronization only and hence do not need to record timings
// this also makes events not introduce global sync points when recorded which is critical to get overlap
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
for (int i = 0; i < nkernels; i++)
{
checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}
//////////////////////////////////////////////////////////////////////
// time execution with nkernels streams
clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
// the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
cudaEventRecord(start_event, 0);
// queue nkernels in separate streams and record when they are done
for (int i=0; i<nkernels; ++i)
{
switch (i%6)
{
case 0:
clock_block_a<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
break;
case 1:
clock_block_b<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
break;
case 2:
clock_block_c<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
break;
case 3:
clock_block_d<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
break;
case 4:
clock_block_e<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
break;
case 5:
clock_block_f<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
break;
}
total_clocks += time_clocks;
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
// make the last stream wait for the kernel event to be recorded
checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
}
// queue a sum kernel and a copy back to host in the last stream.
// the commands in this stream get dispatched as soon as all the kernel events have been recorded
sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
// at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
// in this sample we just wait until the GPU is done
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
bool bTestResult = (a[0] > total_clocks);
// release resources
for (int i = 0; i < nkernels; i++)
{
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
free(streams);
free(kernelEvent);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
// Calling cudaProfilerStop causes all profile data to be
// flushed before the application exits
checkCudaErrors(cudaProfilerStop());
if (!bTestResult)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
@allanmac
Copy link
Author

nvcc -arch sm_50 -Xptxas=-v,-abi=no -o ck -I . concurrentKernels.cu

ptxas warning : 'option -abi=no' might get deprecated in future
ptxas warning : Stack size for entry function '_Z13clock_block_bPll' cannot be statically determined
ptxas warning : Stack size for entry function '_Z13clock_block_aPll' cannot be statically determined
ptxas warning : Stack size for entry function '_Z3sumPli' cannot be statically determined
ptxas info    : 288 bytes gmem
ptxas info    : Compiling entry function '_Z13clock_block_bPll' for 'sm_50'
ptxas info    : Used 3 registers, 32768 bytes smem, 332 bytes cmem[0]
ptxas info    : Compiling entry function '_Z13clock_block_aPll' for 'sm_50'
ptxas info    : Used 3 registers, 32768 bytes smem, 332 bytes cmem[0]
ptxas info    : Compiling entry function '_Z3sumPli' for 'sm_50'
ptxas info    : Used 5 registers, 128 bytes smem, 332 bytes cmem[0]
concurrentKernels.cu
   Creating library ck.lib and object ck.exp

@allanmac
Copy link
Author

ck --device=1 --nkernels=6
ck_6

ck --device=1 --nkernels=7
ck_7

@allanmac
Copy link
Author

allanmac commented May 16, 2016

Here's the same kernel with 6 different kernels and 600 launches running on a 3 SMM K620:

ck_6_600

At first glance, the scheduling rule appears to be dynamic and a mix of first-fit and round-robin.

I wouldn't be surprised if there is a heuristic hiding in there to drain more of the same kernel type before launching a different kernel on the same SM (for many many reasons). The tri-color striping supports this idea.

@ib198669
Copy link

thanks a lot, but I cannot find the file "helper_functions.h" "helper_cuda.h" ?

@allanmac
Copy link
Author

These kernels are based on the example in the NVIDIA Samples/Examples directory -- the .h files should somewhere in that tree.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment