Skip to content

Instantly share code, notes, and snippets.

@hayunjong83
Created March 5, 2020 07:59
Show Gist options
  • Save hayunjong83/332fd88438dc3628ea2d737ea672cbf1 to your computer and use it in GitHub Desktop.
Save hayunjong83/332fd88438dc3628ea2d737ea672cbf1 to your computer and use it in GitHub Desktop.
CUDA dynamic parallelism example 1) cdpSimplePrint
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <helper_cuda.h>
#include <helper_string.h>
// Device-global counter; cdp_kernel atomically increments it to hand each block a unique id.
__device__ int g_uids = 0;
/**
 * Prints one line describing the calling block. Only thread 0 of the block
 * prints; all threads of the block then meet at a block-wide barrier.
 *
 * @param depth       Nesting level of the block (0 means launched by the host).
 * @param thread      Index of the parent thread that launched this block.
 * @param uid         Unique id of this block.
 * @param parent_uid  Unique id of the block that launched this one.
 *
 * The indentation prefix uses 3 characters per level in a 32-byte buffer,
 * so depth must not exceed 10.
 */
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    const bool is_reporter = (threadIdx.x == 0);

    if (is_reporter && depth == 0)
    {
        printf("BLOCK %d launched by the host\n", uid);
    }
    else if (is_reporter)
    {
        // Build one "|  " group per nesting level, then terminate the string.
        char indent[32];
        char *cursor = indent;
        for (int level = 0; level < depth; ++level)
        {
            *cursor++ = '|';
            *cursor++ = ' ';
            *cursor++ = ' ';
        }
        *cursor = '\0';

        printf("%sBLOCK %d launched by thread %d of block %d\n", indent, uid, thread, parent_uid);
    }

    // Kept outside the branch above so that every thread of the block reaches it.
    __syncthreads();
}
/**
 * Recursive dynamic-parallelism kernel.
 *
 * Each block obtains a unique id from the global counter g_uids, prints a
 * line via print_info, and then, unless the next level would reach
 * max_depth, every thread launches a child grid with the same grid and
 * block dimensions (x only) as the current launch.
 *
 * @param max_depth   Number of nesting levels to generate.
 * @param depth       Nesting level of this launch (0 for the host launch).
 * @param thread      Index of the parent thread that issued this launch.
 * @param parent_uid  Unique id of the parent block.
 *
 * Uses one int of static shared memory per block.
 */
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // Block-wide id, written by thread 0 and read by all threads after the barrier.
    __shared__ int s_block_uid;

    if (threadIdx.x == 0)
    {
        s_block_uid = atomicAdd(&g_uids, 1);
    }
    __syncthreads();

    print_info(depth, thread, s_block_uid, parent_uid);

    // Recurse only while the child level stays below the requested maximum.
    const int child_depth = depth + 1;
    if (child_depth < max_depth)
    {
        cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, child_depth, threadIdx.x, s_block_uid);
    }
}
/**
 * Host entry point of the dynamic-parallelism print sample.
 *
 * Parses the optional "depth" command-line flag (1..8, default 2), checks
 * that the selected GPU has compute capability 3.5 or higher, prints how
 * many blocks will be launched, and then launches cdp_kernel with 2 blocks
 * of 2 threads.
 *
 * Exits with EXIT_SUCCESS on completion or when help is requested,
 * EXIT_FAILURE on an invalid depth, and EXIT_WAIVED when the GPU is below
 * compute capability 3.5.
 */
int main(int argc, char **argv)
{
    printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

    // Recursion depth used when no "depth" flag is supplied.
    int max_depth = 2;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
    {
        max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

        // Enforce the documented range; the upper bound also keeps the
        // indentation prefix built in print_info inside its 32-byte buffer.
        if (max_depth < 1 || max_depth > 8)
        {
            printf("depth parameter has to be between 1 and 8\n");
            exit(EXIT_FAILURE);
        }
    }

    int device = -1;
    cudaDeviceProp deviceProp;
    device = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

    // Device-side kernel launches require compute capability 3.5 or higher.
    if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
    {
        printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
        exit(EXIT_WAIVED);
    }

    printf("***************************************************************************\n");
    printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
    printf("launch 2 blocks of 2 threads each. The GPU will do that recursively\n");
    printf("until it reaches max_depth=%d\n\n", max_depth);
    printf("In total 2");

    // Each level has 4x the blocks of the previous one: every block holds
    // 2 threads and every thread launches 2 child blocks.
    int num_blocks = 2, sum = 2;
    for (int i = 1; i < max_depth; ++i)
    {
        num_blocks *= 4;
        printf("+%d", num_blocks);
        sum += num_blocks;
    }
    printf("=%d blocks are launched!!!! (%d from the GPU)\n", sum, sum - 2);
    printf("*****************************************************************************\n\n");

    // Set the device-runtime sync depth limit to max_depth, and report a
    // failure here instead of letting it surface after the kernel launch.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth));

    printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
    cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
    checkCudaErrors(cudaGetLastError());      // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize()); // errors raised while the kernels ran

    exit(EXIT_SUCCESS);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment