@hayunjong83
Last active March 6, 2020 07:27
quick sort using CUDA dynamic parallelism
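The program below sorts an array of unsigned integers entirely on the GPU: each cdp_simple_quicksort launch partitions its sub-array around the middle element and then launches one single-thread child grid per partition on its own non-blocking stream, falling back to an in-place selection sort once the recursion depth reaches MAX_DEPTH or a sub-array has no more than SELECTION_SORT elements. The structure follows NVIDIA's cdpSimpleQuicksort sample, so helper_cuda.h and helper_string.h are the CUDA samples' common headers. Dynamic parallelism needs relocatable device code and the device runtime library; a typical build command (file name and include path are assumptions) is `nvcc -arch=sm_35 -rdc=true -I<cuda-samples>/common/inc cdp_quicksort.cu -o cdp_quicksort -lcudadevrt`.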
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <helper_cuda.h>
#include <helper_string.h>
#define MAX_DEPTH 16
#define SELECTION_SORT 32
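// MAX_DEPTH caps how deeply child kernels may be nested before the recursion
// stops; SELECTION_SORT is the sub-array size at or below which the kernel
// falls back to an in-place selection sort instead of launching more kernels.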
__device__ void selection_sort(unsigned int *data, int left, int right)
{
    for (int i = left; i <= right; ++i)
    {
        unsigned int min_val = data[i];
        int min_idx = i;

        // Find the smallest value in the range [i, right]
        for (int j = i + 1; j <= right; ++j)
        {
            unsigned int val_j = data[j];
            if (val_j < min_val)
            {
                min_idx = j;
                min_val = val_j;
            }
        }

        // Swap the values
        if (i != min_idx)
        {
            data[min_idx] = data[i];
            data[i] = min_val;
        }
    }
}
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
    if (depth >= MAX_DEPTH || right - left <= SELECTION_SORT)
    {
        selection_sort(data, left, right);
        return;
    }

    unsigned int *lptr = data + left;
    unsigned int *rptr = data + right;
    unsigned int pivot = data[(left + right) / 2];

    // Do the partitioning
    while (lptr <= rptr)
    {
        // Find the next left- and right- values to swap
        unsigned int lval = *lptr;
        unsigned int rval = *rptr;

        // Move the left pointer as long as the pointed element is smaller than the pivot
        while (lval < pivot)
        {
            lptr++;
            lval = *lptr;
        }

        // Move the right pointer as long as the pointed element is larger than the pivot
        while (rval > pivot)
        {
            rptr--;
            rval = *rptr;
        }

        // If the swap points are valid, do the swap
        if (lptr <= rptr)
        {
            *lptr++ = rval;
            *rptr-- = lval;
        }
    }
    // The recursive part: launch a child kernel for each partition
    int nright = rptr - data;
    int nleft = lptr - data;

    // Launch a new block to sort the left part
    if (left < (rptr - data))
    {
        cudaStream_t s;
        cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
        cdp_simple_quicksort<<<1, 1, 0, s>>>(data, left, nright, depth + 1);
        cudaStreamDestroy(s);
    }

    // Launch a new block to sort the right part
    if ((lptr - data) < right)
    {
        cudaStream_t s1;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cdp_simple_quicksort<<<1, 1, 0, s1>>>(data, nleft, right, depth + 1);
        cudaStreamDestroy(s1);
    }
}
// Call the quicksort kernel from the host
void run_qsort(unsigned int *data, unsigned int nitems)
{
    // Allow device-side synchronization for nested launches up to MAX_DEPTH deep
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));

    int left = 0;
    int right = nitems - 1;
    std::cout << "Launching kernel on the GPU" << std::endl;
    cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0);
    checkCudaErrors(cudaDeviceSynchronize());
}
// Initialize data on host
void initialize_data(unsigned int *dst, unsigned int nitems)
{
    srand(2047);
    for (unsigned int i = 0; i < nitems; i++)
        dst[i] = rand() % nitems;
}
// Verify the results
void check_results(int n, unsigned int *results_d, bool verbose)
{
    unsigned int *results_h = new unsigned int[n];
    checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned int), cudaMemcpyDeviceToHost));

    for (int i = 1; i < n; ++i)
        if (results_h[i - 1] > results_h[i])
        {
            std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1]
                      << " greater than " << results_h[i] << std::endl;
            exit(EXIT_FAILURE);
        }

    if (verbose)
        for (int i = 0; i < n; ++i)
            std::cout << "Results[" << i << "] : " << results_h[i] << std::endl;

    std::cout << "OK" << std::endl;
    delete[] results_h;
}
int main(int argc, char **argv)
{
    int num_items = 128;
    bool verbose = false;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "v"))
    {
        verbose = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
    {
        num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");
        if (num_items < 1)
        {
            std::cerr << "ERROR: num_items has to be at least 1" << std::endl;
            exit(EXIT_FAILURE);
        }
    }

    // Dynamic parallelism requires compute capability 3.5 or higher
    int device = -1;
    cudaDeviceProp deviceProp;
    device = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));

    if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
    {
        printf("GPU %d - %s does not support CUDA Dynamic Parallelism\nExiting.\n", device, deviceProp.name);
        exit(EXIT_WAIVED);
    }
    // Create the input data
    unsigned int *h_data = 0;
    unsigned int *d_data = 0;

    std::cout << "Initializing data:" << std::endl;
    h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int));
    initialize_data(h_data, num_items);

    if (verbose)
    {
        for (int i = 0; i < num_items; i++)
            std::cout << "DATA [" << i << "]: " << h_data[i] << std::endl;
    }

    checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
    checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Execute, verify, and clean up
    std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
    run_qsort(d_data, num_items);

    std::cout << "Validating results: ";
    check_results(num_items, d_data, verbose);

    free(h_data);
    checkCudaErrors(cudaFree(d_data));

    exit(EXIT_SUCCESS);
}
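Example run, assuming the binary is named cdp_quicksort: `./cdp_quicksort num_items=1024 -v` sorts 1024 random values and prints both the generated input and the verified output, while `./cdp_quicksort -h` prints the usage message.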