Skip to content

Instantly share code, notes, and snippets.

@wh5a
Last active April 9, 2021 20:56
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save wh5a/4500706 to your computer and use it in GitHub Desktop.
Save wh5a/4500706 to your computer and use it in GitHub Desktop.
CUDA Work Efficient Parallel Scan.
// MP 5 Scan
// Given a list (lst) of length n
// Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... + lst[n-1]}
// Due Tuesday, January 22, 2013 at 11:59 p.m. PST
#include <wb.h>
#define BLOCK_SIZE 512 //@@ You can change this
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
return -1; \
} \
} while(0)
__global__ void fixup(float *input, float *aux, int len) {
unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
if (blockIdx.x) {
if (start + t < len)
input[start + t] += aux[blockIdx.x - 1];
if (start + BLOCK_SIZE + t < len)
input[start + BLOCK_SIZE + t] += aux[blockIdx.x - 1];
}
}
__global__ void scan(float * input, float * output, float *aux, int len) {
// Load a segment of the input vector into shared memory
__shared__ float scan_array[BLOCK_SIZE << 1];
unsigned int t = threadIdx.x, start = 2 * blockIdx.x * BLOCK_SIZE;
if (start + t < len)
scan_array[t] = input[start + t];
else
scan_array[t] = 0;
if (start + BLOCK_SIZE + t < len)
scan_array[BLOCK_SIZE + t] = input[start + BLOCK_SIZE + t];
else
scan_array[BLOCK_SIZE + t] = 0;
__syncthreads();
// Reduction
int stride;
for (stride = 1; stride <= BLOCK_SIZE; stride <<= 1) {
int index = (t + 1) * stride * 2 - 1;
if (index < 2 * BLOCK_SIZE)
scan_array[index] += scan_array[index - stride];
__syncthreads();
}
// Post reduction
for (stride = BLOCK_SIZE >> 1; stride; stride >>= 1) {
int index = (t + 1) * stride * 2 - 1;
if (index + stride < 2 * BLOCK_SIZE)
scan_array[index + stride] += scan_array[index];
__syncthreads();
}
if (start + t < len)
output[start + t] = scan_array[t];
if (start + BLOCK_SIZE + t < len)
output[start + BLOCK_SIZE + t] = scan_array[BLOCK_SIZE + t];
if (aux && t == 0)
aux[blockIdx.x] = scan_array[2 * BLOCK_SIZE - 1];
}
int main(int argc, char ** argv) {
wbArg_t args;
float * hostInput; // The input 1D list
float * hostOutput; // The output list
float * deviceInput;
float * deviceOutput;
float *deviceAuxArray, *deviceAuxScannedArray;
int numElements; // number of elements in the list
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numElements);
cudaHostAlloc(&hostOutput, numElements * sizeof(float), cudaHostAllocDefault);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The number of input elements in the input is ", numElements);
wbTime_start(GPU, "Allocating GPU memory.");
wbCheck(cudaMalloc((void**)&deviceInput, numElements*sizeof(float)));
wbCheck(cudaMalloc((void**)&deviceOutput, numElements*sizeof(float)));
// XXX the size is fixed for ease of implementation.
cudaMalloc(&deviceAuxArray, (BLOCK_SIZE << 1) * sizeof(float));
cudaMalloc(&deviceAuxScannedArray, (BLOCK_SIZE << 1) * sizeof(float));
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Clearing output memory.");
wbCheck(cudaMemset(deviceOutput, 0, numElements*sizeof(float)));
wbTime_stop(GPU, "Clearing output memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
wbCheck(cudaMemcpy(deviceInput, hostInput, numElements*sizeof(float), cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
int numBlocks = ceil((float)numElements/(BLOCK_SIZE<<1));
dim3 dimGrid(numBlocks, 1, 1);
dim3 dimBlock(BLOCK_SIZE, 1, 1);
wbLog(TRACE, "The number of blocks is ", numBlocks);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Modify this to complete the functionality of the scan
//@@ on the deivce
scan<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, deviceAuxArray, numElements);
cudaDeviceSynchronize();
scan<<<dim3(1,1,1), dimBlock>>>(deviceAuxArray, deviceAuxScannedArray, NULL, BLOCK_SIZE << 1);
cudaDeviceSynchronize();
fixup<<<dimGrid, dimBlock>>>(deviceOutput, deviceAuxScannedArray, numElements);
cudaDeviceSynchronize();
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements*sizeof(float), cudaMemcpyDeviceToHost));
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
cudaFree(deviceInput);
cudaFree(deviceOutput);
cudaFree(deviceAuxArray);
cudaFree(deviceAuxScannedArray);
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostOutput, numElements);
free(hostInput);
cudaFreeHost(hostOutput);
return 0;
}
@gasiort
Copy link

gasiort commented Mar 5, 2016

I think solution is incorrect for more than 1 thread blocks. You should save intermediate sums in auxiliary_array between up-sweep and down-sweep phases, NOT after down-sweep.
Check it out here: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html "39.2.4 Arrays of Arbitrary Size"

@ramakarl
Copy link

ramakarl commented Sep 7, 2016

Yes, Tom-Demijohn is right, there are a few bugs in the above code.
The main issue is this line in fixup() is incorrect (#22 and #24)
input[start + t] += aux[blockIdx.x - 1];
There should be no -1 here.
More generally, a limitation of the above is that it only works when there is one thread block on the auxiliary array. Since the auxiliary scan step (second kernel launch) throws away its last element sums, the auxiliary cannot be fixed up. This means the maximum number of elements that can be scanned for above code is BLOCK_SIZE^2. On CUDA hardware, with a limit of 512 threads/block, the maximum is 262,144 (not even a million).
The solution is to create an auxiliary-of-the-auxiliary. This would allow BLOCK_SIZE^3 elements, or 134 million on CUDA hardware.
I've provided the answer here, but doing this is not trivial. The code above returns proper prefix sums (where the first element of the output equals the first element of the input), but any auxiliary scans must do zero-offset scans (where the first element of output is zero, and all others are shifted down one), or the fixup of the auxiliaries will be incorrect.
Therefore, I've created kernels which enable or disable zero-offset scans selectively at each level, and added another layer of scan & auxiliary so that the maximum limit is BLOCK_SIZE^3.
Differences from the above code are marked with "<----"

extern "C" __global__ void prefixFixup ( uint *input, uint *aux, int len) 
{
       unsigned int t = threadIdx.x;
    unsigned int start = t + 2 * blockIdx.x * SCAN_BLOCKSIZE;   
    if (start < len)                    input[start] += aux[blockIdx.x] ;              // <------------------- fixed code
    if (start + SCAN_BLOCKSIZE < len)   input[start + SCAN_BLOCKSIZE] += aux[blockIdx.x];
}

extern "C" __global__ void prefixSum ( uint* input, uint* output, uint* aux, int len, int zeroff )   //<---- support zero-offsets
{
    __shared__ uint scan_array[SCAN_BLOCKSIZE << 1];    
    unsigned int t1 = threadIdx.x + 2 * blockIdx.x * SCAN_BLOCKSIZE;    // <--- store t1,t2 for efficiency
    unsigned int t2 = t1 + SCAN_BLOCKSIZE;

    // Pre-load into shared memory
    scan_array[threadIdx.x] = (t1<len) ? input[t1] : 0.0f;
    scan_array[threadIdx.x + SCAN_BLOCKSIZE] = (t2<len) ? input[t2] : 0.0f;
    __syncthreads();

    // Reduction
    int stride;
    for (stride = 1; stride <= SCAN_BLOCKSIZE; stride <<= 1) {
       int index = (threadIdx.x + 1) * stride * 2 - 1;
       if (index < 2 * SCAN_BLOCKSIZE)
          scan_array[index] += scan_array[index - stride];
       __syncthreads();
    }

    // Post reduction
    for (stride = SCAN_BLOCKSIZE >> 1; stride > 0; stride >>= 1) {
       int index = (threadIdx.x + 1) * stride * 2 - 1;
       if (index + stride < 2 * SCAN_BLOCKSIZE)
          scan_array[index + stride] += scan_array[index];
       __syncthreads();
    }
    __syncthreads();

    // Output values & aux
    if (t1+zeroff < len)    output[t1+zeroff] = scan_array[threadIdx.x];          // <---- support zero-offsets
    if (t2+zeroff < len)    output[t2+zeroff] = (threadIdx.x==SCAN_BLOCKSIZE-1 && zeroff) ? 0 : scan_array[threadIdx.x + SCAN_BLOCKSIZE];   
    if ( threadIdx.x == 0 ) {
        if ( zeroff ) output[0] = 0;                    // <---- support zero-offsets
        if (aux) aux[blockIdx.x] = scan_array[2 * SCAN_BLOCKSIZE - 1];              
    }       
}

Host code (three layers):

          // CUDA Prefix Sums - Analysis and fixes by Rama Karl (www.ramakarl.com)

        int numElem = 512000;     // input number of elements
                int zero_offsets = 0;          // set to 0 if you want classic prefix sums, or 1 if you want zero-offset sums
        int naux = SCAN_BLOCKSIZE << 1;
        int len1 = numElem / naux ;     
        int blks1 = int ( numElem / naux ) + 1;
        int blks2 = int ( blks1 / naux ) + 1;
        int threads = SCAN_BLOCKSIZE;
        int zon=1;    // used for upper layers

               // Throw error if number of elements exceeds BLOCKSIZE^3
        if ( numElem > SCAN_BLOCKSIZE*ulong(SCAN_BLOCKSIZE)*SCAN_BLOCKSIZE) {
            nvprintf ( "ERROR: Number of elements exceeds prefix sum max. Adjust SCAN_BLOCKSIZE.\n" );
        }
                // Prefix sum primary array, save last elements of blocks into aux1
        void* argsA[5] = {&inArray, &outArray, &auxArray1, &numElem, &zero_offsets };   // <--- zero-offset mode option
        cudaCheck ( cuLaunchKernel ( m_Func[FUNC_PREFIXSUM], blks1, 1, 1, threads, 1, 1, 0, NULL, argsA, NULL ), "cuPrefixSumA" );
        cudaCheck ( cuCtxSynchronize(), "cuCtxSynchronize" );

                // Prefix sum of aux1, save last elements of blocks into aux2
        void* argsB[5] = { &auxArray1, &auxScan1, &auxArray2, &len1, &zon };    // <--- upper level scans must use zero-offset mode
        cudaCheck ( cuLaunchKernel ( m_Func[FUNC_PREFIXSUM], blks2, 1, 1, threads, 1, 1, 0, NULL, argsB, NULL ), "cuPrefixSumB" );
        cudaCheck ( cuCtxSynchronize(), "cuCtxSynchronize" );

                // Prefix sum of aux2, don't save the last elems. Implies aux2 must be <BLOCK_SIZE elements, or total cnt limited to BLOCK_SIZE^3
        CUdeviceptr nptr = {0};
        void* argsC[5] = { &auxArray2, &auxScan2, &nptr, &blks2, &zon };
        cudaCheck ( cuLaunchKernel ( m_Func[FUNC_PREFIXSUM], 1, 1, 1, threads, 1, 1, 0, NULL, argsC, NULL ), "cuPrefixSumB" );
        cudaCheck ( cuCtxSynchronize(), "cuCtxSynchronize" );           

                // Add-in the aux2 elements back into aux1 results
        void* argsD[4] = { &auxScan1, &auxScan2, &len1, &zon };
        cudaCheck ( cuLaunchKernel ( m_Func[FUNC_PREFIXFIXUP], blks2, 1, 1, threads, 1, 1, 0, NULL, argsD, NULL ), "cuPrefixFixupC" );
        cudaCheck ( cuCtxSynchronize(), "cuCtxSynchronize" );       

        // Add-in the aux1 results back into primary results
        void* argsE[4] = { &outArray, &auxScan1, &numElem, &zero_offsets };   // <--- zero-offset mode option
        cudaCheck ( cuLaunchKernel ( m_Func[FUNC_PREFIXFIXUP], blks1, 1, 1, threads, 1, 1, 0, NULL, argsE, NULL ), "cuPrefixFixupC" );
        cudaCheck ( cuCtxSynchronize(), "cuCtxSynchronize" );   

This has been tested with up to several million elements, in both classic and zero-offset mode.

Note: Zero-offset scans are useful in applications like stream compaction and radix-sort, where the input counts represent the # of elements in a bin, and you want the scan to return the offsets of the bins.
bin counts = {8, 4, 5, 3, 2}
offsets scan = {0, 8, 12, 17, 20} // <-- note offset(0)=0, and scan elements are shifted right.
classic scan = {8, 12, 17, 20, 22}
If you attempted to use classic-scan directly for bin offsets, you would incorrectly reserve 4 slots for the first bin instead of 8.

@Corillian
Copy link

The code above by @rchoetzlein contains an error in how it calculates the block counts. Specifically:

int naux = SCAN_BLOCKSIZE << 1;

should be:

int naux = SCAN_BLOCKSIZE;

@TriLoo
Copy link

TriLoo commented Aug 4, 2017

Is this kernel only fit to ONE Block? Because it used shared memory and no other operations work on the post reductioin results. Meanwhile, the number of elements in vector is limited to 512 float ? Is it correct?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment