@sandeepkumar-skb
Last active January 29, 2021 23:29
#include <cooperative_groups.h>
#include <algorithm>
#include <cuda.h>
#include <stdio.h>

using namespace cooperative_groups;
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
// Grid-stride loop: each thread accumulates a partial sum of the input,
// reading four ints at a time through int4 (assumes n is a multiple of 4).
__device__ int thread_sum(int *input, int n)
{
    int sum = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n / 4;
         i += blockDim.x * gridDim.x)
    {
        int4 in = ((int4*)input)[i];
        sum += in.x + in.y + in.z + in.w;
    }
    return sum;
}
__device__ int reduce_sum(thread_group g, int *temp, int val)
{
    int lane = g.thread_rank();

    // Each iteration halves the number of active threads.
    // Each active thread (lane < i) adds the partial sum stored at
    // temp[lane + i] into its own running value.
    for (int i = g.size() / 2; i > 0; i /= 2)
    {
        temp[lane] = val;
        g.sync(); // wait for all threads to store
        if (lane < i) val += temp[lane + i];
        g.sync(); // wait for all threads to load
    }
    return val; // note: only thread 0 returns the full sum
}
__global__ void sum_kernel_block(int *sum, int *input, int n)
{
    int my_sum = thread_sum(input, n);

    extern __shared__ int temp[];
    auto g = this_thread_block();
    int block_sum = reduce_sum(g, temp, my_sum);

    // One atomic add per block accumulates the block sums into the final result.
    if (g.thread_rank() == 0) atomicAdd(sum, block_sum);
}
int main()
{
    int n = 1 << 24;
    int blockSize = 256;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedBytes = blockSize * sizeof(int);

    int *sum, *data;
    gpuErrchk(cudaMallocManaged(&sum, sizeof(int)));
    gpuErrchk(cudaMallocManaged(&data, n * sizeof(int)));
    std::fill_n(data, n, 1); // initialize data
    gpuErrchk(cudaMemset(sum, 0, sizeof(int)));

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start));
    sum_kernel_block<<<nBlocks, blockSize, sharedBytes>>>(sum, data, n);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Effective time: %.3f ms\n", milliseconds);
    printf("Final Result: %d Expected Result: %d\n", *sum, n);

    cudaFree(sum);
    cudaFree(data);
    return 0;
}
@sandeepkumar-skb (Author) commented:
nvcc -std=c++11 cooperative_groups.cu -o cg.out && ./cg.out
Effective time: 24.086 ms
Final Result: 16777216 Expected Result: 16777216
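
As a follow-up, the same reduction can also be expressed with statically sized 32-thread tiles and register shuffles instead of shared memory. The sketch below is not part of the original gist: tile_reduce_sum and sum_kernel_tiled are illustrative names, it reuses thread_sum from the file above, and it assumes CUDA 9+ cooperative groups.

    // Reduce within a 32-thread tile using register shuffles; no shared memory needed.
    __device__ int tile_reduce_sum(thread_block_tile<32> g, int val)
    {
        // Each iteration halves the number of active lanes in the tile.
        for (int i = g.size() / 2; i > 0; i /= 2)
            val += g.shfl_down(val, i);
        return val; // only lane 0 holds the full tile sum
    }

    __global__ void sum_kernel_tiled(int *sum, int *input, int n)
    {
        int my_sum = thread_sum(input, n);
        auto tile = tiled_partition<32>(this_thread_block());
        int tile_sum = tile_reduce_sum(tile, my_sum);
        if (tile.thread_rank() == 0) atomicAdd(sum, tile_sum);
    }

It would be launched as sum_kernel_tiled<<<nBlocks, blockSize>>>(sum, data, n) with no dynamic shared memory. Each 32-thread tile issues one atomicAdd instead of one per block, trading more atomics for no shared-memory traffic or block-wide syncs; timing for this variant is not measured here.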
