Concurrent kernel test that demonstrates _different_ kernels running concurrently. Hacked from NVIDIA's concurrentKernels example. ck_2.cu has two kernels, each requiring half of an sm_50 multiprocessor's shared memory. Kernel "a" is run on 5 out of 6 launches; otherwise kernel "b" is launched. ck_6.cu has six such kernels.
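Concretely, the 5-out-of-6 selection in ck_2.cu is just a modulo test on the launch index (a minimal excerpt of the launch loop from the full listing below):

    if (i % 6 > 0)  // true for 5 of every 6 values of i
        clock_block_a<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
    else            // i % 6 == 0, i.e. 1 of every 6 launches
        clock_block_b<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);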
ck_2.cu
/*
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
// in CUDA 3.2.
//
// Devices of compute capability 1.x will run the kernels one after another
// Devices of compute capability 2.0 or higher can overlap the kernels
//

#include <stdio.h>
#include <cuda_profiler_api.h>

#include "helper_functions.h"
#include "helper_cuda.h"

// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block_a(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

__global__ void clock_block_b(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N)
{
    __shared__ clock_t s_clocks[32];

    clock_t my_sum = 0;

    for (int i = threadIdx.x; i < N; i += blockDim.x)
    {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    __syncthreads();

    for (int i = 16; i > 0; i /= 2)
    {
        if (threadIdx.x < i)
        {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

        __syncthreads();
    }

    d_clocks[0] = s_clocks[0];
}

int main(int argc, char **argv)
{
    int nkernels = 8;                        // number of concurrent kernels
    int nstreams = nkernels + 1;             // use one more stream than concurrent kernel
    int nbytes = nkernels * sizeof(clock_t); // number of data bytes
    float kernel_time = 10;                  // time the kernel should run in ms
    float elapsed_time;                      // timing variables
    int cuda_device = 0;

    printf("[%s] - Starting...\n", argv[0]);

    // get number of kernels if overridden on the command line
    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
    {
        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
        nstreams = nkernels + 1;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    cuda_device = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDevice(&cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    if ((deviceProp.concurrentKernels == 0))
    {
        printf("> GPU does not support concurrent kernel execution\n");
        printf("  CUDA kernel runs will be serialized\n");
    }

    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // allocate host memory
    clock_t *a = 0; // pointer to the array data in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));

    // allocate device memory
    clock_t *d_a = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    cudaEvent_t start_event, stop_event;
    checkCudaErrors(cudaEventCreate(&start_event));
    checkCudaErrors(cudaEventCreate(&stop_event));

    // the events are used for synchronization only and hence do not need to record timings
    // this also makes events not introduce global sync points when recorded which is critical to get overlap
    cudaEvent_t *kernelEvent;
    kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));

    for (int i = 0; i < nkernels; i++)
    {
        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
    }

    //////////////////////////////////////////////////////////////////////
    // time execution with nkernels streams
    clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
    // the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif

    cudaEventRecord(start_event, 0);

    // queue nkernels in separate streams and record when they are done
    for (int i = 0; i < nkernels; ++i)
    {
        if (i % 6 > 0)
            clock_block_a<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
        else
            clock_block_b<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);

        total_clocks += time_clocks;
        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));

        // make the last stream wait for the kernel event to be recorded
        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i], 0));
    }

    // queue a sum kernel and a copy back to host in the last stream.
    // the commands in this stream get dispatched as soon as all the kernel events have been recorded
    sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));

    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
    // in this sample we just wait until the GPU is done
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);

    bool bTestResult = (a[0] > total_clocks);

    // release resources
    for (int i = 0; i < nkernels; i++)
    {
        cudaStreamDestroy(streams[i]);
        cudaEventDestroy(kernelEvent[i]);
    }

    // destroy the extra stream (index nstreams-1) used for the sum and copy-back
    cudaStreamDestroy(streams[nstreams - 1]);
    free(streams);
    free(kernelEvent);

    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    // Calling cudaProfilerStop causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaProfilerStop());

    if (!bTestResult)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
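A note on the shared-memory sizing mentioned in the description: first-generation Maxwell (sm_50) exposes 64 KB of shared memory per multiprocessor, and each clock_block kernel statically allocates

    __shared__ unsigned int smem[32768/4];  // 8192 elements * 4 bytes = 32 KB

so at most two of these blocks can be resident on one SM at a time. That cap is what makes the scheduler's placement choices show up clearly in a profiler timeline.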
ck_6.cu
/*
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
// in CUDA 3.2.
//
// Devices of compute capability 1.x will run the kernels one after another
// Devices of compute capability 2.0 or higher can overlap the kernels
//

#include <stdio.h>
#include <cuda_profiler_api.h>

#include "helper_functions.h"
#include "helper_cuda.h"

// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block_a(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

__global__ void clock_block_b(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

__global__ void clock_block_c(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

__global__ void clock_block_d(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

__global__ void clock_block_e(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

__global__ void clock_block_f(clock_t *d_o, clock_t clock_count)
{
    __shared__ unsigned int smem[32768/4];

    unsigned int start_clock = (unsigned int) clock();

    smem[0] = start_clock;

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N)
{
    __shared__ clock_t s_clocks[32];

    clock_t my_sum = 0;

    for (int i = threadIdx.x; i < N; i += blockDim.x)
    {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    __syncthreads();

    for (int i = 16; i > 0; i /= 2)
    {
        if (threadIdx.x < i)
        {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

        __syncthreads();
    }

    d_clocks[0] = s_clocks[0];
}

int main(int argc, char **argv)
{
    int nkernels = 8;                        // number of concurrent kernels
    int nstreams = nkernels + 1;             // use one more stream than concurrent kernel
    int nbytes = nkernels * sizeof(clock_t); // number of data bytes
    float kernel_time = 10;                  // time the kernel should run in ms
    float elapsed_time;                      // timing variables
    int cuda_device = 0;

    printf("[%s] - Starting...\n", argv[0]);

    // get number of kernels if overridden on the command line
    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
    {
        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
        nstreams = nkernels + 1;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    cuda_device = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDevice(&cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    if ((deviceProp.concurrentKernels == 0))
    {
        printf("> GPU does not support concurrent kernel execution\n");
        printf("  CUDA kernel runs will be serialized\n");
    }

    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // allocate host memory
    clock_t *a = 0; // pointer to the array data in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));

    // allocate device memory
    clock_t *d_a = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    cudaEvent_t start_event, stop_event;
    checkCudaErrors(cudaEventCreate(&start_event));
    checkCudaErrors(cudaEventCreate(&stop_event));

    // the events are used for synchronization only and hence do not need to record timings
    // this also makes events not introduce global sync points when recorded which is critical to get overlap
    cudaEvent_t *kernelEvent;
    kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));

    for (int i = 0; i < nkernels; i++)
    {
        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
    }

    //////////////////////////////////////////////////////////////////////
    // time execution with nkernels streams
    clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
    // the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif

    cudaEventRecord(start_event, 0);

    // queue nkernels in separate streams and record when they are done
    for (int i = 0; i < nkernels; ++i)
    {
        switch (i % 6)
        {
            case 0:
                clock_block_a<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
                break;

            case 1:
                clock_block_b<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
                break;

            case 2:
                clock_block_c<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
                break;

            case 3:
                clock_block_d<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
                break;

            case 4:
                clock_block_e<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
                break;

            case 5:
                clock_block_f<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
                break;
        }

        total_clocks += time_clocks;
        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));

        // make the last stream wait for the kernel event to be recorded
        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i], 0));
    }

    // queue a sum kernel and a copy back to host in the last stream.
    // the commands in this stream get dispatched as soon as all the kernel events have been recorded
    sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));

    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
    // in this sample we just wait until the GPU is done
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);

    bool bTestResult = (a[0] > total_clocks);

    // release resources
    for (int i = 0; i < nkernels; i++)
    {
        cudaStreamDestroy(streams[i]);
        cudaEventDestroy(kernelEvent[i]);
    }

    // destroy the extra stream (index nstreams-1) used for the sum and copy-back
    cudaStreamDestroy(streams[nstreams - 1]);
    free(streams);
    free(kernelEvent);

    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    // Calling cudaProfilerStop causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaProfilerStop());

    if (!bTestResult)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
Here's the same test with 6 different kernels and 600 launches running on a 3-SMM K620.
At first glance, the scheduling rule appears to be dynamic and a mix of first-fit and round-robin.
I wouldn't be surprised if there is a heuristic hiding in there that drains more of the same kernel type before launching a different kernel on the same SM (for many reasons). The tri-color striping in the timeline supports this idea.
Thanks a lot, but I cannot find the files "helper_functions.h" and "helper_cuda.h" -- where are they?
These kernels are based on the example in the NVIDIA Samples/Examples directory -- the .h
files should be somewhere in that tree.
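For what it's worth (the paths below are assumptions and vary by toolkit version): older CUDA toolkits install the samples with the helpers under samples/common/inc, and the github.com/NVIDIA/cuda-samples repository keeps them under Common/, so pointing -I at whichever of those you have usually suffices:

    nvcc -arch sm_50 -I /usr/local/cuda/samples/common/inc -o ck ck_2.cu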
Compile with:

    nvcc -arch sm_50 -Xptxas=-v,-abi=no -o ck -I . concurrentKernels.cu
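A hedged reading of the ptxas flags above (worth verifying against your toolkit's documentation):

    -Xptxas=-v       passes -v to ptxas, which prints per-kernel register and
                     shared-memory usage (handy for confirming the 32 KB of smem
                     each clock_block kernel reserves)
    -Xptxas=-abi=no  disabled the ABI in older toolkits to free up registers;
                     newer ptxas versions have dropped the option, so remove it
                     if the build complains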