@Robadob
Created February 10, 2023 18:55
Generate all 2^32 random uniform floats and place them on a histogram.
#include <algorithm>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdint>
#include <climits>  // UINT_MAX, used as the atomicInc() wrap bound
#include <fstream>
#include <limits>
/**
* Error check function for safe CUDA API calling
* Wrap all calls to CUDA API functions with CUDA_CALL() to catch errors on failure
* e.g. CUDA_CALL(cudaFree(myPtr));
* CUDA_CHECK() can also be used to perform error checking after kernel launches and async methods
* e.g. CUDA_CHECK()
*/
#if defined(_DEBUG) || defined(D_DEBUG)
#define CUDA_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define CUDA_CHECK() { gpuAssert(cudaDeviceSynchronize(), __FILE__, __LINE__); }
#else
#define CUDA_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define CUDA_CHECK() { gpuAssert(cudaPeekAtLastError(), __FILE__, __LINE__); }
#endif
inline void gpuAssert(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s(%d): %s\n", file, line, cudaGetErrorString(code));
        exit(EXIT_FAILURE);
    }
}
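// Device-global results: 1024 bins covering [0, 1) in steps of 1/1024, a spare bin at
// index 1024 that would only be hit by an exact 1.0f, and counters for exact 0/1 draws.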
__device__ uint32_t d_histogram[1025] = {};
__device__ uint32_t d_is_0 = 0;
__device__ uint32_t d_is_1 = 0;
__forceinline__ __device__ float uniform_float(const uint32_t rng) {
    // curand naturally generates the range (0, 1], we want [0, 1)
    // https://github.com/pytorch/pytorch/blob/059aa34b124916dfd761f3cbdb5fa97d7a01fc93/aten/src/ATen/native/cuda/Distributions.cu#L71-L77
    constexpr auto MASK = static_cast<uint32_t>((static_cast<uint64_t>(1) << std::numeric_limits<float>::digits) - 1);
    constexpr auto DIVISOR = static_cast<float>(1) / (static_cast<uint32_t>(1) << std::numeric_limits<float>::digits);
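    // MASK keeps the low 24 bits (numeric_limits<float>::digits == 24, the mantissa width)
    // and DIVISOR is 2^-24, so the result is one of 2^24 evenly spaced values in [0, 1 - 2^-24].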
    return (rng & MASK) * DIVISOR;
}
__global__ void uniform_float_histogram() {
    const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ uint32_t histogram[1025];
    __shared__ uint32_t is_0;
    __shared__ uint32_t is_1;
    // Init shared mem
    if (threadIdx.x == 0) {
        is_0 = 0;
        is_1 = 0;
        histogram[1024] = 0;
    }
    histogram[threadIdx.x] = 0;
    __syncthreads();
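    // 1024 blocks x 1024 threads = 2^20 threads, each consuming 2^12 consecutive inputs,
    // so every 32-bit value is passed through uniform_float() exactly once.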
    // Each thread calculates 2^12 random numbers and adds them to a histogram in shared mem
    const uint32_t start = tid << 12;
    for (uint32_t i = 0; i < (1 << 12); ++i) {
        const float f = uniform_float(start + i);
        // Histogram
        atomicInc(histogram + static_cast<uint32_t>(f * 1024), UINT_MAX);
        // Check 0 and 1
        if (f == 0.0f)
            atomicInc(&is_0, UINT_MAX);
        if (f == 1.0f)
            atomicInc(&is_1, UINT_MAX);
    }
    __syncthreads();
    // Copy hist to global mem
    if (threadIdx.x == 0) {
        atomicAdd(&d_is_0, is_0);
        atomicAdd(&d_is_1, is_1);
        atomicAdd(d_histogram + 1024, histogram[1024]);
    }
    atomicAdd(d_histogram + threadIdx.x, histogram[threadIdx.x]);
}
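// Expected result (derived from the mapping above, not measured): each 24-bit masked value
// occurs 2^8 = 256 times across the 2^32 inputs, so every 1/1024-wide bin should hold exactly
// 2^14 * 2^8 = 2^22 = 4194304 counts, f == 0.0f occurs 256 times, and f == 1.0f never occurs.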
int main()
{
    // Launch 2^10 (1024) blocks, each with 2^10 (1024) threads
    uniform_float_histogram<<<1024, 1024>>>();
    CUDA_CHECK();
    CUDA_CALL(cudaDeviceSynchronize());
    // Copy results back
    uint32_t histogram[1025] = {};
    uint32_t is_0 = 0;
    uint32_t is_1 = 0;
    CUDA_CALL(cudaMemcpyFromSymbol(histogram, d_histogram, sizeof(uint32_t) * 1025));
    CUDA_CALL(cudaMemcpyFromSymbol(&is_0, d_is_0, sizeof(uint32_t)));
    CUDA_CALL(cudaMemcpyFromSymbol(&is_1, d_is_1, sizeof(uint32_t)));
    // Summarise results
    printf("Found %u zeros\n", is_0);
    printf("Found %u ones\n", is_1);
    uint32_t is_min = std::numeric_limits<uint32_t>::max();
    uint32_t is_max = std::numeric_limits<uint32_t>::min();
    for (int i = 0; i < 1024; ++i) {
        is_min = std::min(is_min, histogram[i]);
        is_max = std::max(is_max, histogram[i]);
    }
    printf("Min Bin Size %u\n", is_min);
    printf("Max Bin Size %u\n", is_max);
    // Print hist to csv
    std::ofstream file;
    file.open("uniform_float_hist.csv", std::ofstream::out | std::ofstream::trunc);
    for (int i = 0; i < 1024; ++i) {
        file << histogram[i] << "\n";
    }
    file.close();
    return 0;
}
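
To build and run (a typical invocation, assuming the source is saved as uniform_float_hist.cu and nvcc from a recent CUDA toolkit is on the path; add an -arch flag for your GPU if required):

nvcc -O3 -o uniform_float_hist uniform_float_hist.cu
./uniform_float_hist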