/* Persistent threads: the idea is to launch just enough threads to fill
   the machine; the CUDA occupancy calculator can tell the correct number
   of threads. Launching a few too many is not a problem, as the extra
   threads exit immediately. The following code assumes a warp width and
   block width of 32. */
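/* Host-side sketch (not part of the original listing): one way to pick
   "just enough threads", using the runtime occupancy API in place of the
   occupancy calculator spreadsheet mentioned above. BLOCKDIM_Y = 4 warps
   per block is an assumed value; the original leaves it unspecified. */
#define BLOCKDIM_Y 4

__global__ void kernel();   // defined below

static void launchPersistent()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocksPerSM, kernel, 32 * BLOCKDIM_Y, 0);

    // Fill every multiprocessor exactly; any extra warps fail their first
    // pool check and exit, so slight over-subscription is harmless.
    kernel<<<dim3(blocksPerSM * prop.multiProcessorCount, 1),
             dim3(32, BLOCKDIM_Y)>>>();
}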
// global variables
const int B = 3*32;                    // example batch size
__device__ int globalPoolRayCount;     // total ray count; set by the host before launch
__device__ int globalPoolNextRay = 0;  // index of the next unfetched ray
__global__ void kernel()
{
    // variables shared by the entire warp, placed in shared memory;
    // the listing assumes warp-synchronous execution within each warp
    __shared__ volatile int nextRayArray[BLOCKDIM_Y];
    __shared__ volatile int rayCountArray[BLOCKDIM_Y];
    volatile int& localPoolNextRay = nextRayArray[threadIdx.y];
    volatile int& localPoolRayCount = rayCountArray[threadIdx.y];
    if (threadIdx.x == 0)
        localPoolRayCount = 0;  // __shared__ cannot take an initializer

    while (true)
    {
        // lane 0 fetches a batch of B rays from the global to the local pool
        if (localPoolRayCount == 0 && threadIdx.x == 0)
        {
            localPoolNextRay = atomicAdd(&globalPoolNextRay, B);
            localPoolRayCount = B;
        }
        // each lane takes one ray from the local pool
        int myRayIndex = localPoolNextRay + threadIdx.x;
        if (myRayIndex >= globalPoolRayCount)
            return;
        if (threadIdx.x == 0)
        {
            localPoolNextRay += 32;
            localPoolRayCount -= 32;
        }
        // init and execute; these must not exit the kernel
        fetchAndInitRay(myRayIndex);
        trace();
    }
}
// The use of a small local pool is beneficial because it reduces pressure
// on the atomic counter (globalPoolNextRay).
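/* Host-side pool setup (a sketch; assumes the __device__ qualifiers added
   to the counters above so they are addressable from the host). With
   B = 3*32, each warp touches the global counter once per three 32-ray
   batches, i.e. one atomicAdd per 96 rays rather than one per 32. */
static void resetGlobalPool(int rayCount)
{
    int zero = 0;
    cudaMemcpyToSymbol(globalPoolNextRay, &zero, sizeof(int));
    cudaMemcpyToSymbol(globalPoolRayCount, &rayCount, sizeof(int));
}

/* Typical usage: resetGlobalPool(numRays); launchPersistent(); */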