Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Created February 17, 2021 05:22
Show Gist options
  • Save sandeepkumar-skb/2fff5f85c2772c0955ad9ae6149c998f to your computer and use it in GitHub Desktop.
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#define BLOCK_SIZE 256
// Report a CUDA runtime error and abort the process.
// Intended to be called through the gpuErrchk macro, which supplies the
// failing file and line automatically.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
// Diagnostics go to stderr so they are not interleaved with (or lost in)
// redirected stdout output.
fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
// Wrap every CUDA runtime call: gpuErrchk(cudaMalloc(...));
// do { } while (0) makes the expansion a single statement, so the macro
// composes safely with brace-less if/else (the bare { } version breaks
// `if (c) gpuErrchk(x); else ...` because of the trailing semicolon).
#define gpuErrchk(ans) \
do { \
gpuAssert((ans), __FILE__, __LINE__); \
} while (0)
// Reference (golden-model) reduction: writes the sum of the first n
// elements of data into *sum. Accumulates in a local so *sum is stored
// exactly once, and the input array is left untouched.
void cpu_all_reduce(int* sum, int* data, int n){
int acc = 0;
for (int* p = data; p != data + n; ++p){
acc += *p;
}
*sum = acc;
}
// Tree reduction in global memory: each block reduces its contiguous
// segment of `data` IN PLACE (the input is destroyed), then thread 0 of
// each block adds the block's partial sum into *sum with one atomicAdd.
// Expected launch: 1-D grid and block, gridDim.x*blockDim.x >= n,
// blockDim.x a power of two, *sum zeroed by the caller beforehand.
__global__
void gpu_all_reduce_global(int* sum, int* data, int n){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for(int s=blockDim.x/2; s>0; s/=2){
// Guard the far end of the pair: with a ceil-div grid, the last block
// can be partial, and without `idx + s < n` those threads would read
// and write past data[n-1].
if(threadIdx.x < s && idx + s < n){
data[idx] += data[idx+s];
}
// Barrier stays outside the divergent branch so every thread in the
// block reaches it on every iteration.
__syncthreads();
}
// After the loop, data[block base] holds the block's partial sum.
// `idx < n` keeps fully-out-of-range blocks from contributing garbage.
if (threadIdx.x == 0 && idx < n){
atomicAdd(sum, data[idx]);
}
}
// Fill the first `size` elements of `data` with `value`.
// `value` defaults to 2 so existing two-argument call sites keep their
// original behavior.
void init(int *data, int size, int value = 2){
for (int i=0; i<size; ++i){
data[i] = value;
}
}
// Driver: fills a 2^24-element array with 2s, times a CPU reduction and a
// GPU block-tree reduction over the same data, and checks they agree.
int main(){
int n = 1 << 24;
// Execution configuration: 1-D grid, ceil-div so every element is covered.
dim3 blockSize (BLOCK_SIZE, 1, 1);
dim3 nBlocks ((n + BLOCK_SIZE -1)/ BLOCK_SIZE, 1, 1);
// CPU golden-model result (plain stack int; no need for heap allocation).
int cpu_sum = 0;
// Managed allocations are accessible from both host and device.
int *input_data;
int *gpu_sum;
gpuErrchk(cudaMallocManaged(&gpu_sum, sizeof(int)));
gpuErrchk(cudaMallocManaged(&input_data, n*sizeof(int)));
init(input_data, n);
gpuErrchk(cudaMemset(gpu_sum, 0, sizeof(int)));
// CPU: warm up once, then time ONLY the reduction. (The original code
// recorded cpu_end after an extra init() call, inflating the CPU time.)
// cpu_all_reduce does not modify input_data, so no re-init is needed.
cpu_all_reduce(&cpu_sum, input_data, n);
std::chrono::high_resolution_clock::time_point cpu_start = std::chrono::high_resolution_clock::now();
cpu_all_reduce(&cpu_sum, input_data, n);
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> cpu_span = cpu_end - cpu_start;
// GPU warmup. The kernel reduces in place, so the array must be
// re-initialized (and the accumulator re-zeroed) before the timed run.
gpu_all_reduce_global<<<nBlocks, blockSize>>>(gpu_sum, input_data, n);
gpuErrchk(cudaGetLastError());      // launch-configuration errors
gpuErrchk(cudaDeviceSynchronize()); // asynchronous execution errors
cudaEvent_t gpu_start, gpu_stop;
gpuErrchk(cudaEventCreate(&gpu_start));
gpuErrchk(cudaEventCreate(&gpu_stop));
init(input_data, n);
gpuErrchk(cudaMemset(gpu_sum, 0, sizeof(int)));
gpuErrchk(cudaEventRecord(gpu_start));
gpu_all_reduce_global<<<nBlocks, blockSize>>>(gpu_sum, input_data, n);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaEventRecord(gpu_stop));
// Blocks until the stop event (and thus the kernel) completes, so
// reading *gpu_sum on the host below is safe.
gpuErrchk(cudaEventSynchronize(gpu_stop));
float milliseconds = 0;
gpuErrchk(cudaEventElapsedTime(&milliseconds, gpu_start, gpu_stop));
if (*gpu_sum == cpu_sum){
std::cout << "cpu sum: " << cpu_sum << std::endl;
std::cout << "gpu sum: " << *gpu_sum << std::endl;
std::cout << "cpu time: " << cpu_span.count()*1000 << "ms" << std::endl;
std::cout << "gpu time: " << milliseconds << "ms" << std::endl;
}
else{
std::cout << "GPU and CPU results don't Match!!" << std::endl;
}
// Release events and managed memory (events were leaked originally).
gpuErrchk(cudaEventDestroy(gpu_start));
gpuErrchk(cudaEventDestroy(gpu_stop));
cudaFree(gpu_sum);
cudaFree(input_data);
return 0;
}
@sandeepkumar-skb
Copy link
Author

╰─ nvcc reduce.cu -o reduce.o && ./reduce.o
cpu sum: 33554432
gpu sum: 33554432
cpu time: 74.9788ms
gpu time: 19.9832ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment