Last active
June 17, 2022 17:29
-
-
Save allanmac/93868ee6be78f9d1e9f5704e6e62db43 to your computer and use it in GitHub Desktop.
Benchmark CUB Radix Sort with uniformly random data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Build: | |
// | |
// nvcc -lcurand --generate-code arch=compute_50,code=compute_50 --generate-code arch=compute_75,code=compute_75 -D CUB_SORT_TYPE=uint32_t -o sort_cub_32 cub_sort.cu | |
// nvcc -lcurand --generate-code arch=compute_50,code=compute_50 --generate-code arch=compute_75,code=compute_75 -D CUB_SORT_TYPE=uint64_t -o sort_cub_64 cub_sort.cu | |
// | |
#define THRUST_IGNORE_CUB_VERSION_CHECK | |
#include <curand.h> | |
#include <cub/cub.cuh> | |
// | |
// | |
// | |
#include <stdbool.h> | |
static | |
void | |
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort) | |
{ | |
if (code != cudaSuccess) | |
{ | |
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line); | |
if (abort) | |
{ | |
cudaDeviceReset(); | |
exit(code); | |
} | |
} | |
} | |
#define cuda(...) { cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true); } | |
// | |
// | |
// | |
#ifndef CUB_SORT_TYPE | |
#define CUB_SORT_TYPE uint64_t | |
#endif | |
#define CUB_SORT_WARMUP 100 | |
#define CUB_SORT_TRIALS 1000 | |
// | |
// | |
// | |
static | |
void | |
sort(uint32_t count, | |
CUB_SORT_TYPE * vin_d, | |
CUB_SORT_TYPE * vout_d, | |
void * tmp, | |
size_t & tmp_size, | |
cudaEvent_t start, | |
cudaEvent_t end, | |
float * min_ms, | |
float * max_ms, | |
float * elapsed_ms) | |
{ | |
cuda(EventRecord(start,0)); | |
cub::DeviceRadixSort::SortKeys(tmp,tmp_size,vin_d,vout_d,count); | |
cuda(EventRecord(end,0)); | |
cuda(EventSynchronize(end)); | |
float t_ms; | |
cuda(EventElapsedTime(&t_ms,start,end)); | |
*min_ms = min(*min_ms,t_ms); | |
*max_ms = max(*max_ms,t_ms); | |
*elapsed_ms += t_ms; | |
} | |
// | |
// | |
// | |
static | |
void | |
bench(const struct cudaDeviceProp* const props, | |
const uint32_t count, | |
const uint32_t warmup, | |
const uint32_t trials) | |
{ | |
// | |
// allocate | |
// | |
size_t const vin_size = sizeof(CUB_SORT_TYPE) * count; | |
CUB_SORT_TYPE * vin_d; | |
CUB_SORT_TYPE * vout_d; | |
cuda(Malloc(&vin_d, vin_size)); | |
cuda(Malloc(&vout_d,vin_size)); | |
// | |
// fill with random values | |
// | |
curandGenerator_t prng; | |
curandCreateGenerator(&prng,CURAND_RNG_PSEUDO_XORWOW); | |
curandSetPseudoRandomGeneratorSeed(prng,0xCAFEBABE); | |
if (sizeof(CUB_SORT_TYPE) == sizeof(unsigned int)) { | |
curandGenerate(prng,(unsigned int*)vin_d,count); | |
} else if (sizeof(CUB_SORT_TYPE) == sizeof(unsigned long long)) { | |
curandGenerateLongLong(prng,(unsigned long long*)vin_d,count); | |
} else { | |
exit(EXIT_FAILURE); | |
} | |
// | |
// size and allocate the temp array | |
// | |
void * tmp; | |
size_t tmp_size = 0; | |
cub::DeviceRadixSort::SortKeys(NULL,tmp_size,vin_d,vout_d,count); | |
cuda(Malloc(&tmp,tmp_size)); | |
// | |
// benchmark | |
// | |
cudaEvent_t start, end; | |
cuda(EventCreate(&start)); | |
cuda(EventCreate(&end)); | |
float min_ms = FLT_MAX; | |
float max_ms = 0.0f; | |
float elapsed_ms = 0.0f; | |
for (int ii=0; ii<warmup; ii++) | |
{ | |
sort(count,vin_d,vout_d,tmp,tmp_size,start,end, | |
&min_ms, | |
&max_ms, | |
&elapsed_ms); | |
} | |
min_ms = FLT_MAX; | |
max_ms = 0.0f; | |
elapsed_ms = 0.0f; | |
for (int ii=0; ii<trials; ii++) | |
{ | |
sort(count,vin_d,vout_d,tmp,tmp_size,start,end, | |
&min_ms, | |
&max_ms, | |
&elapsed_ms); | |
} | |
cuda(EventDestroy(start)); | |
cuda(EventDestroy(end)); | |
// | |
// | |
// | |
cuda(Free(tmp)); | |
cuda(Free(vout_d)); | |
cuda(Free(vin_d)); | |
// | |
// | |
// | |
#define STRINGIFY2(s) #s | |
#define STRINGIFY(s) STRINGIFY2(s) | |
fprintf(stdout,"%s, %u, %u.%u.%u.%u, %s, %u, %u, %u, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f\n", | |
props->name, | |
props->multiProcessorCount, | |
CUB_MAJOR_VERSION, | |
CUB_MINOR_VERSION, | |
CUB_SUBMINOR_VERSION, | |
CUB_PATCH_NUMBER, | |
STRINGIFY(CUB_SORT_TYPE), | |
count, | |
warmup, | |
trials, | |
elapsed_ms, | |
(double)elapsed_ms / trials, | |
(double)min_ms, | |
(double)max_ms, | |
(double)count * trials / (1000.0 * elapsed_ms), | |
(double)count / (1000.0 * min_ms)); | |
} | |
// | |
// | |
// | |
int | |
main(int argc, char** argv) | |
{ | |
const int32_t device = (argc == 1) ? 0 : atoi(argv[1]); | |
struct cudaDeviceProp props; | |
cuda(GetDeviceProperties(&props,device)); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
cuda(SetDevice(device)); | |
// | |
// Usage: | |
// | |
// $ cub_sort_xx [ count_lo [ count_hi [ count_step [ trials [ warmup ] ] ] ] ] | |
// | |
const uint32_t count_lo = argc <= 2 ? 131072 : strtoul(argv[2],NULL,0); | |
const uint32_t count_hi = argc <= 3 ? 8388608 : strtoul(argv[3],NULL,0); | |
const uint32_t count_step = argc <= 4 ? 131072 : strtoul(argv[4],NULL,0); | |
const uint32_t trials = argc <= 5 ? CUB_SORT_TRIALS : strtoul(argv[5],NULL,0); | |
const uint32_t warmup = argc <= 6 ? CUB_SORT_WARMUP : strtoul(argv[6],NULL,0); | |
// | |
// LABELS | |
// | |
fprintf(stdout, | |
"Device, " | |
"Multiprocessors, " | |
"CUB, " | |
"Type, " | |
"Keys, " | |
"Warmup, " | |
"Trials, " | |
"Total Msecs, " | |
"Avg. Msecs, " | |
"Min Msecs, " | |
"Max Msecs, " | |
"Avg. Mkeys/s, " | |
"Max. Mkeys/s\n"); | |
// | |
// SORT | |
// | |
for (uint32_t count=count_lo; count<=count_hi; count+=count_step) | |
{ | |
bench(&props,count,warmup,trials); | |
} | |
// | |
// RESET | |
// | |
cuda(DeviceReset()); | |
return 0; | |
} |
Author
allanmac
commented
Jun 17, 2022
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment