Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Last active August 2, 2020 01:57
Show Gist options
  • Save sandeepkumar-skb/914f7f0678ffac88843ab16963c03fe9 to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/914f7f0678ffac88843ab16963c03fe9 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>
// Sequential host-side reduction.
//
// Adds up the first `n` ints of `data` and stores the total in `*sum`,
// overwriting its previous value. When `n` is zero or negative no element
// is read and `*sum` is set to 0.
void cpu_all_reduce(int* sum, int* data, int n){
    int total = 0;
    const int* cursor = data;
    // Count down the remaining elements while walking the input pointer.
    for (int remaining = n; remaining > 0; --remaining){
        total += *cursor++;
    }
    *sum = total;
}
// Sums the first `n` ints of `data` and atomically adds the total into `*sum`.
//
// `*sum` is accumulated into, not overwritten, so the caller must initialise
// it (normally to 0) before the launch.
//
// Launch layout: 1-D grid of 1-D blocks (only the .x dimensions are used).
// Any grid/block size gives the same result, because each thread starts at
// its global index and advances by the total number of launched threads.
// Shared memory: one statically allocated int per block.
// A zero or negative `n` reads nothing and leaves `*sum` unchanged.
__global__
void gpu_all_reduce(int* sum, int* data, int n){
    // Per-block accumulator: threads combine their partial sums here so that
    // only one atomic per block reaches the single global address.
    __shared__ int block_total;
    if (threadIdx.x == 0){
        block_total = 0;
    }
    __syncthreads();

    // Index arithmetic is done in size_t so that `i += stride` cannot
    // overflow a 32-bit int for large `n` or large grids.
    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t first = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

    int temp = 0;
    for (size_t i = first; i < count; i += stride){
        temp += data[i];
    }

    atomicAdd(&block_total, temp);
    // Every thread of the block must have contributed before the total is published.
    __syncthreads();
    if (threadIdx.x == 0){
        atomicAdd(sum, block_total);
    }
}
// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the call that produced `status`.
static void check_cuda(cudaError_t status, const char* what){
    if (status != cudaSuccess){
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Sums 2^24 ones on the CPU and on the GPU, compares the two results and
// prints both sums (plus wall-clock timings when they agree).
// Returns EXIT_SUCCESS when the sums match and EXIT_FAILURE otherwise.
int main(){
    typedef std::chrono::high_resolution_clock clock_type;
    typedef std::chrono::duration<double, std::milli> millis;

    const int n = 1 << 24;

    // execution configuration: one thread per element
    const int blockSize = 256;
    const int nBlocks = (n + blockSize - 1) / blockSize;

    // cpu variables for golden model
    std::vector<int> cpu_data(n, 1);
    int cpu_sum = 0;

    // variables for cuda model (managed memory, reachable from host and device)
    int* gpu_sum = nullptr;
    int* gpu_data = nullptr;
    check_cuda(cudaMallocManaged(&gpu_sum, sizeof(int)), "cudaMallocManaged(gpu_sum)");
    check_cuda(cudaMallocManaged(&gpu_data, n * sizeof(int)), "cudaMallocManaged(gpu_data)");
    std::fill_n(gpu_data, n, 1); //initialize data
    // The kernel accumulates into *gpu_sum, so it has to start at zero.
    check_cuda(cudaMemset(gpu_sum, 0, sizeof(int)), "cudaMemset(gpu_sum)");

    const clock_type::time_point cpu_start = clock_type::now();
    cpu_all_reduce(&cpu_sum, cpu_data.data(), n);
    const clock_type::time_point cpu_end = clock_type::now();

    // NOTE(review): gpu_data was last written by the host (std::fill_n above),
    // so this interval presumably also covers moving the managed pages to the
    // device, not only the kernel itself - confirm with a profiler.
    const clock_type::time_point gpu_start = clock_type::now();
    gpu_all_reduce<<<nBlocks, blockSize>>>(gpu_sum, gpu_data, n);
    // A bad launch configuration is only reported through cudaGetLastError().
    check_cuda(cudaGetLastError(), "gpu_all_reduce launch");
    // Errors raised while the kernel runs surface at the next synchronizing call.
    check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    const clock_type::time_point gpu_end = clock_type::now();

    const millis cpu_span = std::chrono::duration_cast<millis>(cpu_end - cpu_start);
    const millis gpu_span = std::chrono::duration_cast<millis>(gpu_end - gpu_start);

    // Safe to read on the host: the device was synchronized above.
    const int gpu_result = *gpu_sum;
    const bool matched = (gpu_result == cpu_sum);

    if (matched){
        std::cout << "cpu sum: " << cpu_sum << std::endl;
        std::cout << "gpu sum: " << gpu_result << std::endl;
        std::cout << "cpu time: " << cpu_span.count() << "ms" << std::endl;
        std::cout << "gpu time: " << gpu_span.count() << "ms" << std::endl;
    }
    else{
        std::cout << "GPU and CPU results don't Match!!" << std::endl;
        std::cout << "cpu sum: " << cpu_sum << std::endl;
        std::cout << "gpu sum: " << gpu_result << std::endl;
    }

    check_cuda(cudaFree(gpu_sum), "cudaFree(gpu_sum)");
    check_cuda(cudaFree(gpu_data), "cudaFree(gpu_data)");

    // Report a mismatch through the exit code so scripts can detect it.
    return matched ? EXIT_SUCCESS : EXIT_FAILURE;
}
@sandeepkumar-skb
Copy link
Author

sandeepkumar-skb commented Aug 1, 2020

GPU: TitanRTX
Driver: 440.100
CUDA: 10.2
CPU: Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz
Compile: nvcc -Xcompiler "-std=c++11" all_reduce_basic.cu -o all_reduce_basic
Run: ./all_reduce_basic

cpu sum: 16777216
gpu sum: 16777216
cpu time: 37.0161ms
gpu time: 30.3922ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment