Last active
August 9, 2023 14:55
-
-
Save akiross/17e722c5bea92bd2c310324eac643df6 to your computer and use it in GitHub Desktop.
Approximate Pi using usual Monte-Carlo simulation, in CUDA (with bonus Python snippet!).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Approximation of Pi using a simple, and not optimized, CUDA program | |
// Copyleft Alessandro Re | |
// | |
// GCC 6.x not supported by CUDA 8, I used compat version | |
// | |
// nvcc -std=c++11 -ccbin=gcc5 pigreco.cu -c | |
// g++5 pigreco.o -lcudart -L/usr/local/cuda/lib64 -o pigreco | |
// | |
// This code is basically equivalent to the following Python code: | |
// | |
// def pigreco(NUM): | |
// from random import random as rand | |
// def sqrad(): | |
// x, y = rand(), rand() | |
// return x*x + y*y | |
// return 4 * sum(1 - int(test()) for _ in range(NUM)) / NUM | |
// | |
// Python version takes, on this machine, 3.5 seconds to compute 10M tests | |
// CUDA version takes, on this machine, 1.6 seconds to compute 20.48G tests | |
// | |
#include <iostream> | |
#include <limits> | |
#include <cuda.h> | |
#include <curand_kernel.h> | |
using std::cout; | |
using std::endl; | |
typedef unsigned long long Count; | |
typedef std::numeric_limits<double> DblLim; | |
const Count WARP_SIZE = 32; // Warp size | |
const Count NBLOCKS = 640; // Number of total cuda cores on my GPU | |
const Count ITERATIONS = 1000000; // Number of points to generate (each thread) | |
// This kernel is | |
__global__ void picount(Count *totals) { | |
// Define some shared memory: all threads in this block | |
__shared__ Count counter[WARP_SIZE]; | |
// Unique ID of the thread | |
int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
// Initialize RNG | |
curandState_t rng; | |
curand_init(clock64(), tid, 0, &rng); | |
// Initialize the counter | |
counter[threadIdx.x] = 0; | |
// Computation loop | |
for (int i = 0; i < ITERATIONS; i++) { | |
float x = curand_uniform(&rng); // Random x position in [0,1] | |
float y = curand_uniform(&rng); // Random y position in [0,1] | |
counter[threadIdx.x] += 1 - int(x * x + y * y); // Hit test | |
} | |
// The first thread in *every block* should sum the results | |
if (threadIdx.x == 0) { | |
// Reset count for this block | |
totals[blockIdx.x] = 0; | |
// Accumulate results | |
for (int i = 0; i < WARP_SIZE; i++) { | |
totals[blockIdx.x] += counter[i]; | |
} | |
} | |
} | |
int main(int argc, char **argv) { | |
int numDev; | |
cudaGetDeviceCount(&numDev); | |
if (numDev < 1) { | |
cout << "CUDA device missing! Do you need to use optirun?\n"; | |
return 1; | |
} | |
cout << "Starting simulation with " << NBLOCKS << " blocks, " << WARP_SIZE << " threads, and " << ITERATIONS << " iterations\n"; | |
// Allocate host and device memory to store the counters | |
Count *hOut, *dOut; | |
hOut = new Count[NBLOCKS]; // Host memory | |
cudaMalloc(&dOut, sizeof(Count) * NBLOCKS); // Device memory | |
// Launch kernel | |
picount<<<NBLOCKS, WARP_SIZE>>>(dOut); | |
// Copy back memory used on device and free | |
cudaMemcpy(hOut, dOut, sizeof(Count) * NBLOCKS, cudaMemcpyDeviceToHost); | |
cudaFree(dOut); | |
// Compute total hits | |
Count total = 0; | |
for (int i = 0; i < NBLOCKS; i++) { | |
total += hOut[i]; | |
} | |
Count tests = NBLOCKS * ITERATIONS * WARP_SIZE; | |
cout << "Approximated PI using " << tests << " random tests\n"; | |
// Set maximum precision for decimal printing | |
cout.precision(DblLim::max_digits10); | |
cout << "PI ~= " << 4.0 * (double)total/(double)tests << endl; | |
return 0; | |
} |
This was very helpful. Thank you. There is a slight bug in your code, ideally you would sync before the first block attempts to add everything up.
I will put down the equivalent Python code:
# %%
from numba import cuda, types
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
import numpy as np
# %%
@cuda.jit
def sim_pi_cuda(out, rng_state, iterations):
grid_idx = cuda.grid(1)
warp_size = 32
counts = cuda.shared.array(warp_size, dtype=types.uint64)
counts[cuda.threadIdx.x] = 0
for _ in range(iterations):
x = xoroshiro128p_uniform_float32(rng_state, grid_idx)
y = xoroshiro128p_uniform_float32(rng_state, grid_idx)
counts[cuda.threadIdx.x] += 1 - int(x*x + y*y)
cuda.syncthreads()
if cuda.threadIdx.x == 0:
out[cuda.blockIdx.x] = 0
for i in range(warp_size):
out[cuda.blockIdx.x] += counts[i]
# %%
def sim_pi(iterations):
blocks = 10496
warp_size = 32
out = cuda.device_array(blocks, dtype=np.uint64)
seed = np.random.randint(0, 2**64, dtype=np.uint64)
rng_state = create_xoroshiro128p_states(warp_size * blocks, seed=seed)
sim_pi_cuda[blocks, warp_size](out, rng_state, iterations)
data = out.copy_to_host()
data.sort()
return (4.0 * data.sum()) / (iterations*blocks*warp_size)
# %%
N = 1_000_000_00
sim_pi(N)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice! And thanks for sharing this! I'm learning cuda and this is helpful!