@sandeepkumar-skb
Last active January 29, 2021 23:29
#include <cooperative_groups.h>
#include <algorithm>
#include <cuda.h>
#include <stdio.h>

using namespace cooperative_groups;
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
// Grid-stride loop: each thread accumulates a partial sum of the input,
// reading four ints at a time through int4 (assumes n is a multiple of 4).
__device__ int thread_sum(int *input, int n)
{
    int sum = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n / 4;
         i += blockDim.x * gridDim.x)
    {
        int4 in = ((int4*)input)[i];
        sum += in.x + in.y + in.z + in.w;
    }
    return sum;
}
__device__ int reduce_sum(thread_group g, int *temp, int val)
{
    int lane = g.thread_rank();

    // Each iteration halves the number of active threads.
    // Each active thread (lane < i) adds the partial sum stored at
    // temp[lane + i] into its own running value.
    for (int i = g.size() / 2; i > 0; i /= 2)
    {
        temp[lane] = val;
        g.sync(); // wait for all threads to store
        if (lane < i) val += temp[lane + i];
        g.sync(); // wait for all threads to load
    }
    return val; // note: only thread 0 returns the full sum
}
__global__ void sum_kernel_block(int *sum, int *input, int n)
{
    int my_sum = thread_sum(input, n);

    extern __shared__ int temp[];
    auto g = this_thread_block();
    int block_sum = reduce_sum(g, temp, my_sum);

    // One atomic add per block accumulates the block sums into the final result.
    if (g.thread_rank() == 0) atomicAdd(sum, block_sum);
}
int main()
{
    int n = 1 << 24;
    int blockSize = 256;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedBytes = blockSize * sizeof(int);

    int *sum, *data;
    gpuErrchk(cudaMallocManaged(&sum, sizeof(int)));
    gpuErrchk(cudaMallocManaged(&data, n * sizeof(int)));
    std::fill_n(data, n, 1); // initialize data
    gpuErrchk(cudaMemset(sum, 0, sizeof(int)));

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start));
    sum_kernel_block<<<nBlocks, blockSize, sharedBytes>>>(sum, data, n);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Effective time: %.3f ms\n", milliseconds);
    printf("Final Result: %d Expected Result: %d\n", *sum, n);

    cudaFree(sum);
    cudaFree(data);
    return 0;
}
@sandeepkumar-skb (Author) commented:
nvcc -std=c++11 cooperative_groups.cu -o cg.out && ./cg.out
Effective time: 24.086 ms
Final Result: 16777216 Expected Result: 16777216
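
As a follow-up, the same reduction can also be expressed with statically sized 32-thread tiles and register shuffles instead of shared memory. The sketch below is not part of the original gist: tile_reduce_sum and sum_kernel_tiled are illustrative names, it reuses thread_sum from the file above, and it assumes CUDA 9+ cooperative groups.

    // Reduce within a 32-thread tile using register shuffles; no shared memory needed.
    __device__ int tile_reduce_sum(thread_block_tile<32> g, int val)
    {
        // Each iteration halves the number of active lanes in the tile.
        for (int i = g.size() / 2; i > 0; i /= 2)
            val += g.shfl_down(val, i);
        return val; // only lane 0 holds the full tile sum
    }

    __global__ void sum_kernel_tiled(int *sum, int *input, int n)
    {
        int my_sum = thread_sum(input, n);
        auto tile = tiled_partition<32>(this_thread_block());
        int tile_sum = tile_reduce_sum(tile, my_sum);
        if (tile.thread_rank() == 0) atomicAdd(sum, tile_sum);
    }

It would be launched as sum_kernel_tiled<<<nBlocks, blockSize>>>(sum, data, n) with no dynamic shared memory. Each 32-thread tile issues one atomicAdd instead of one per block, trading more atomics for no shared-memory traffic or block-wide syncs; timing for this variant is not measured here.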
