Created
February 10, 2023 18:55
-
-
Save Robadob/5e3a068267f8a351991bef16d138dd7c to your computer and use it in GitHub Desktop.
Generate all 2^32 random uniform floats and place them on a histogram.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <limits>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
/**
 * Error check macros for safe CUDA API calling.
 * Wrap all calls to CUDA API functions with CUDA_CALL() to catch errors on failure
 * e.g. CUDA_CALL(cudaFree(myPtr));
 * CUDA_CHECK() can also be used to perform error checking after kernel launches and async methods
 * e.g. CUDA_CHECK()
 */
// In debug builds CUDA_CHECK() fully synchronises the device, so asynchronous kernel
// errors surface at the offending call site; in release builds it only peeks at the
// last error (non-destructive read) without stalling the pipeline.
// NOTE(review): "D_DEBUG" looks like a typo (perhaps DEBUG was intended) — confirm
// against the project's build flags.
#if defined(_DEBUG) || defined(D_DEBUG)
#define CUDA_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define CUDA_CHECK() { gpuAssert(cudaDeviceSynchronize(), __FILE__, __LINE__); }
#else
#define CUDA_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define CUDA_CHECK() { gpuAssert(cudaPeekAtLastError(), __FILE__, __LINE__); }
#endif
/**
 * Abort the process with a diagnostic message if a CUDA API call failed.
 *
 * @param code Return status of a CUDA runtime API call.
 * @param file Source file of the call site (normally __FILE__ via CUDA_CALL/CUDA_CHECK).
 * @param line Source line of the call site; pass a negative value when no
 *             meaningful line number is available.
 *
 * Exits with EXIT_FAILURE on any error; returns normally on cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        if (line >= 0) {
            fprintf(stderr, "CUDA Error: %s(%d): %s\n", file, line, cudaGetErrorString(code));
        }
        else {
            // No line number available: omit it rather than printing a bogus one.
            // (Original code printed the identical message in both branches.)
            fprintf(stderr, "CUDA Error: %s: %s\n", file, cudaGetErrorString(code));
        }
        exit(EXIT_FAILURE);
    }
}
// Global accumulators in device memory, zero-initialised at module load.
// d_histogram: 1024 equal-width bins over [0, 1), plus a guard bin [1024] that
// would only be hit if a generated value were exactly 1.0f.
__device__ uint32_t d_histogram[1025] = {};
__device__ uint32_t d_is_0 = 0; // count of draws that were exactly 0.0f
__device__ uint32_t d_is_1 = 0; // count of draws that were exactly 1.0f
/**
 * Map a raw 32-bit RNG draw onto a uniform float in [0, 1).
 *
 * curand naturally generates the range (0, 1]; this transform yields [0, 1) instead.
 * Technique borrowed from PyTorch:
 * https://github.com/pytorch/pytorch/blob/059aa34b124916dfd761f3cbdb5fa97d7a01fc93/aten/src/ATen/native/cuda/Distributions.cu#L71-L77
 *
 * @param rng Raw 32-bit random value.
 * @return A float in [0, 1) with 2^24 equally-likely distinct values.
 */
__forceinline__ __device__ float uniform_float(const uint32_t rng) {
    // A float mantissa holds std::numeric_limits<float>::digits (24) bits of
    // precision; keep exactly that many low bits, then scale by 2^-24.
    constexpr int kMantissaBits = std::numeric_limits<float>::digits;
    constexpr uint32_t kMask = (static_cast<uint32_t>(1) << kMantissaBits) - 1;
    constexpr float kScale = 1.0f / static_cast<float>(static_cast<uint32_t>(1) << kMantissaBits);
    const uint32_t mantissa = rng & kMask;
    return static_cast<float>(mantissa) * kScale;
}
/**
 * Exhaustively evaluate uniform_float() over all 2^32 inputs and accumulate a
 * 1024-bin histogram of the results into d_histogram, plus exact-0/exact-1
 * counters into d_is_0/d_is_1.
 *
 * Launch configuration contract: must be launched as <<<1024, 1024>>> so that
 * tid << 12 with 2^12 iterations per thread covers the full 32-bit input space
 * exactly once, and so blockDim.x matches the 1024 histogram bins that each
 * thread initialises/flushes via histogram[threadIdx.x].
 */
__global__ void uniform_float_histogram() {
    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Block-local histogram in shared memory to keep atomic contention off
    // global memory; bin [1024] is a guard for a hypothetical f == 1.0f.
    __shared__ uint32_t histogram[1025];
    __shared__ uint32_t is_0;
    __shared__ uint32_t is_1;
    // Init shared mem (shared memory is uninitialised by default).
    if (threadIdx.x == 0) {
        is_0 = 0;
        is_1 = 0;
        histogram[1024] = 0;
    }
    histogram[threadIdx.x] = 0;
    __syncthreads();
    // Each thread evaluates 2^12 consecutive inputs starting at tid * 2^12.
    const uint32_t start = tid << 12;
    for (uint32_t i = 0; i < (1 << 12); ++i) {
        const float f = uniform_float(start + i);
        // Histogram: f in [0, 1) maps to bins 0..1023.
        atomicInc(histogram + static_cast<uint32_t>(f * 1024), UINT_MAX);
        // Check 0 and 1
        if (f == 0.0f)
            atomicInc(&is_0, UINT_MAX);
        if (f == 1.0f)
            atomicInc(&is_1, UINT_MAX); // BUG FIX: previously incremented is_0, so d_is_1 could never be non-zero.
    }
    __syncthreads();
    // Merge block-local results into the global accumulators.
    if (threadIdx.x == 0) {
        atomicAdd(&d_is_0, is_0);
        atomicAdd(&d_is_1, is_1);
        atomicAdd(d_histogram + 1024, histogram[1024]);
    }
    atomicAdd(d_histogram + threadIdx.x, histogram[threadIdx.x]);
}
/**
 * Driver: exhaustively histogram uniform_float() over all 2^32 inputs.
 * Prints exact-0/exact-1 counts and min/max bin sizes to stdout, and writes
 * the 1024-bin histogram to uniform_float_hist.csv (one count per line).
 */
int main()
{
    // Launch 2^10 (1024) blocks, each with 2^10 (1024) threads — this exact
    // configuration is required by uniform_float_histogram (see its contract).
    uniform_float_histogram<<<1024, 1024>>>();
    CUDA_CHECK();
    CUDA_CALL(cudaDeviceSynchronize());
    // Copy results back
    uint32_t histogram[1025] = {};
    uint32_t is_0 = 0;
    uint32_t is_1 = 0;
    // BUG FIX: this copy was previously unchecked, silently dropping any error.
    CUDA_CALL(cudaMemcpyFromSymbol(histogram, d_histogram, sizeof(uint32_t) * 1025));
    CUDA_CALL(cudaMemcpyFromSymbol(&is_0, d_is_0, sizeof(uint32_t)));
    CUDA_CALL(cudaMemcpyFromSymbol(&is_1, d_is_1, sizeof(uint32_t)));
    // Summarise results
    printf("Found %u zeros\n", is_0);
    printf("Found %u ones\n", is_1);
    uint32_t is_min = std::numeric_limits<uint32_t>::max();
    uint32_t is_max = std::numeric_limits<uint32_t>::min();
    // Only the 1024 real bins; [1024] is the f == 1.0f guard bin.
    for (int i = 0; i < 1024; ++i) {
        is_min = std::min(is_min, histogram[i]);
        is_max = std::max(is_max, histogram[i]);
    }
    printf("Min Bin Size %u\n", is_min);
    printf("Max Bin Size %u\n", is_max);
    // Print hist to csv
    std::ofstream file;
    file.open("uniform_float_hist.csv", std::ofstream::out | std::ofstream::trunc);
    for (int i = 0; i < 1024; ++i) {
        file << histogram[i] << "\n";
    }
    file.close();
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment