/* Persistent threads: the idea is to launch just enough threads to fill
   the machine; the CUDA occupancy calculator can tell the correct number
   of threads. Launching a few too many is not a problem, as the extra
   threads exit immediately. The following code assumes a warp width and
   block width of 32. */
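/* Host-side sketch (not part of the original listing): one way to pick
   "just enough threads", using the runtime occupancy API in place of the
   occupancy calculator spreadsheet mentioned above. BLOCKDIM_Y = 4 warps
   per block is an assumed value; the original leaves it unspecified. */
#define BLOCKDIM_Y 4

__global__ void kernel();   // defined below

static void launchPersistent()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocksPerSM, kernel, 32 * BLOCKDIM_Y, 0);

    // Fill every multiprocessor exactly; any extra warps fail their first
    // pool check and exit, so slight over-subscription is harmless.
    kernel<<<dim3(blocksPerSM * prop.multiProcessorCount, 1),
             dim3(32, BLOCKDIM_Y)>>>();
}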
// global variables
const int B = 3*32;                    // example batch size
__device__ int globalPoolRayCount;     // total ray count; set by the host before launch
__device__ int globalPoolNextRay = 0;  // index of the next unfetched ray
__global__ void kernel()
{
    // variables shared by the entire warp, placed in shared memory;
    // the listing assumes warp-synchronous execution within each warp
    __shared__ volatile int nextRayArray[BLOCKDIM_Y];
    __shared__ volatile int rayCountArray[BLOCKDIM_Y];
    volatile int& localPoolNextRay = nextRayArray[threadIdx.y];
    volatile int& localPoolRayCount = rayCountArray[threadIdx.y];
    if (threadIdx.x == 0)
        localPoolRayCount = 0;  // __shared__ cannot take an initializer

    while (true)
    {
        // lane 0 fetches a batch of B rays from the global to the local pool
        if (localPoolRayCount == 0 && threadIdx.x == 0)
        {
            localPoolNextRay = atomicAdd(&globalPoolNextRay, B);
            localPoolRayCount = B;
        }
        // each lane takes one ray from the local pool
        int myRayIndex = localPoolNextRay + threadIdx.x;
        if (myRayIndex >= globalPoolRayCount)
            return;
        if (threadIdx.x == 0)
        {
            localPoolNextRay += 32;
            localPoolRayCount -= 32;
        }
        // init and execute; these must not exit the kernel
        fetchAndInitRay(myRayIndex);
        trace();
    }
}
// The use of a small local pool is beneficial because it reduces pressure
// on the atomic counter (globalPoolNextRay).
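/* Host-side pool setup (a sketch; assumes the __device__ qualifiers added
   to the counters above so they are addressable from the host). With
   B = 3*32, each warp touches the global counter once per three 32-ray
   batches, i.e. one atomicAdd per 96 rays rather than one per 32. */
static void resetGlobalPool(int rayCount)
{
    int zero = 0;
    cudaMemcpyToSymbol(globalPoolNextRay, &zero, sizeof(int));
    cudaMemcpyToSymbol(globalPoolRayCount, &rayCount, sizeof(int));
}

/* Typical usage: resetGlobalPool(numRays); launchPersistent(); */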