Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Created February 17, 2021 05:22
Show Gist options
  • Save sandeepkumar-skb/2fff5f85c2772c0955ad9ae6149c998f to your computer and use it in GitHub Desktop.
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#define BLOCK_SIZE 256
// Report a CUDA runtime error and abort the process.
// Intended to be called through the gpuErrchk macro, which supplies the
// failing file and line automatically.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess){
// Diagnostics go to stderr so they are not interleaved with (or lost in)
// redirected stdout output.
fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
// Wrap every CUDA runtime call: gpuErrchk(cudaMalloc(...));
// do { } while (0) makes the expansion a single statement, so the macro
// composes safely with brace-less if/else (the bare { } version breaks
// `if (c) gpuErrchk(x); else ...` because of the trailing semicolon).
#define gpuErrchk(ans) \
do { \
gpuAssert((ans), __FILE__, __LINE__); \
} while (0)
// Reference (golden-model) reduction: writes the sum of the first n
// elements of data into *sum. Accumulates in a local so *sum is stored
// exactly once, and the input array is left untouched.
void cpu_all_reduce(int* sum, int* data, int n){
int acc = 0;
for (int* p = data; p != data + n; ++p){
acc += *p;
}
*sum = acc;
}
// Tree reduction in global memory: each block reduces its contiguous
// segment of `data` IN PLACE (the input is destroyed), then thread 0 of
// each block adds the block's partial sum into *sum with one atomicAdd.
// Expected launch: 1-D grid and block, gridDim.x*blockDim.x >= n,
// blockDim.x a power of two, *sum zeroed by the caller beforehand.
__global__
void gpu_all_reduce_global(int* sum, int* data, int n){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for(int s=blockDim.x/2; s>0; s/=2){
// Guard the far end of the pair: with a ceil-div grid, the last block
// can be partial, and without `idx + s < n` those threads would read
// and write past data[n-1].
if(threadIdx.x < s && idx + s < n){
data[idx] += data[idx+s];
}
// Barrier stays outside the divergent branch so every thread in the
// block reaches it on every iteration.
__syncthreads();
}
// After the loop, data[block base] holds the block's partial sum.
// `idx < n` keeps fully-out-of-range blocks from contributing garbage.
if (threadIdx.x == 0 && idx < n){
atomicAdd(sum, data[idx]);
}
}
// Fill the first `size` elements of `data` with `value`.
// `value` defaults to 2 so existing two-argument call sites keep their
// original behavior.
void init(int *data, int size, int value = 2){
for (int i=0; i<size; ++i){
data[i] = value;
}
}
// Driver: fills a 2^24-element array with 2s, times a CPU reduction and a
// GPU block-tree reduction over the same data, and checks they agree.
int main(){
int n = 1 << 24;
// Execution configuration: 1-D grid, ceil-div so every element is covered.
dim3 blockSize (BLOCK_SIZE, 1, 1);
dim3 nBlocks ((n + BLOCK_SIZE -1)/ BLOCK_SIZE, 1, 1);
// CPU golden-model result (plain stack int; no need for heap allocation).
int cpu_sum = 0;
// Managed allocations are accessible from both host and device.
int *input_data;
int *gpu_sum;
gpuErrchk(cudaMallocManaged(&gpu_sum, sizeof(int)));
gpuErrchk(cudaMallocManaged(&input_data, n*sizeof(int)));
init(input_data, n);
gpuErrchk(cudaMemset(gpu_sum, 0, sizeof(int)));
// CPU: warm up once, then time ONLY the reduction. (The original code
// recorded cpu_end after an extra init() call, inflating the CPU time.)
// cpu_all_reduce does not modify input_data, so no re-init is needed.
cpu_all_reduce(&cpu_sum, input_data, n);
std::chrono::high_resolution_clock::time_point cpu_start = std::chrono::high_resolution_clock::now();
cpu_all_reduce(&cpu_sum, input_data, n);
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> cpu_span = cpu_end - cpu_start;
// GPU warmup. The kernel reduces in place, so the array must be
// re-initialized (and the accumulator re-zeroed) before the timed run.
gpu_all_reduce_global<<<nBlocks, blockSize>>>(gpu_sum, input_data, n);
gpuErrchk(cudaGetLastError());      // launch-configuration errors
gpuErrchk(cudaDeviceSynchronize()); // asynchronous execution errors
cudaEvent_t gpu_start, gpu_stop;
gpuErrchk(cudaEventCreate(&gpu_start));
gpuErrchk(cudaEventCreate(&gpu_stop));
init(input_data, n);
gpuErrchk(cudaMemset(gpu_sum, 0, sizeof(int)));
gpuErrchk(cudaEventRecord(gpu_start));
gpu_all_reduce_global<<<nBlocks, blockSize>>>(gpu_sum, input_data, n);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaEventRecord(gpu_stop));
// Blocks until the stop event (and thus the kernel) completes, so
// reading *gpu_sum on the host below is safe.
gpuErrchk(cudaEventSynchronize(gpu_stop));
float milliseconds = 0;
gpuErrchk(cudaEventElapsedTime(&milliseconds, gpu_start, gpu_stop));
if (*gpu_sum == cpu_sum){
std::cout << "cpu sum: " << cpu_sum << std::endl;
std::cout << "gpu sum: " << *gpu_sum << std::endl;
std::cout << "cpu time: " << cpu_span.count()*1000 << "ms" << std::endl;
std::cout << "gpu time: " << milliseconds << "ms" << std::endl;
}
else{
std::cout << "GPU and CPU results don't Match!!" << std::endl;
}
// Release events and managed memory (events were leaked originally).
gpuErrchk(cudaEventDestroy(gpu_start));
gpuErrchk(cudaEventDestroy(gpu_stop));
cudaFree(gpu_sum);
cudaFree(input_data);
return 0;
}
@sandeepkumar-skb
Copy link
Author

╰─ nvcc reduce.cu -o reduce.o && ./reduce.o
cpu sum: 33554432
gpu sum: 33554432
cpu time: 74.9788ms
gpu time: 19.9832ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment