Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Created August 2, 2020 01:57
Show Gist options
  • Save sandeepkumar-skb/a86790eade82fbecb7677ec12ad4cbe5 to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/a86790eade82fbecb7677ec12ad4cbe5 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <chrono>
void cpu_all_reduce(int* sum, int* data, int n){
int temp_sum = 0;
for (int i=0; i<n; ++i){
temp_sum += data[i];
}
*sum = temp_sum;
}
__device__
void warpReduce(volatile int* shmem, int tid){
shmem[tid] += shmem[tid+32];
shmem[tid] += shmem[tid+16];
shmem[tid] += shmem[tid+8];
shmem[tid] += shmem[tid+4];
shmem[tid] += shmem[tid+2];
shmem[tid] += shmem[tid+1];
}
__global__
void gpu_all_reduce(int* sum, int* data, int n, int shmemSize){
extern __shared__ int shmem [];
int tid = threadIdx.x;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int temp = 0;
for (int i =idx; i<n; i += stride){
shmem[tid] = data[i];// + data[i+(blockDim.x+1)/2];
__syncthreads();
for (int s_id=(shmemSize+1)/2; s_id > 32; s_id/=2){
if (tid < s_id){
shmem[tid] += shmem[tid+s_id];
}
__syncthreads();
}
if (tid < 32) {
warpReduce(shmem, tid);
}
if (tid == 0)
temp += shmem[0];
}
if (tid == 0)
atomicAdd(sum, temp);
}
void init(int* data, int size){
for (int i=0; i<size; ++i){
data[i] = i;
}
}
int main(){
int n = 1 << 24;
// execution configuration
int blockSize = 256;
int nBlocks = (n + blockSize -1)/ blockSize;
int sharedBytes = blockSize*sizeof(int);
// cpu variables for golden model
int *cpu_data = new int[n];
int *cpu_sum = new int;
*cpu_sum = 0;
// variables for cuda model
int *gpu_sum, *gpu_data;
cudaMallocManaged(&gpu_sum, sizeof(int));
cudaMallocManaged(&gpu_data, n * sizeof(int));
std::fill_n(gpu_data, n, 1); //initialize data
std::fill_n(cpu_data, n, 1); //initialize data
//init(gpu_data, n);
//init(cpu_data, n);
cudaMemset(gpu_sum, 0, sizeof(int));
std::chrono::high_resolution_clock::time_point cpu_start = std::chrono::high_resolution_clock::now();
cpu_all_reduce(cpu_sum, cpu_data, n);
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
std::chrono::high_resolution_clock::time_point gpu_start = std::chrono::high_resolution_clock::now();
gpu_all_reduce<<<nBlocks, blockSize, sharedBytes>>>(gpu_sum, gpu_data, n, blockSize);
cudaDeviceSynchronize();
std::chrono::high_resolution_clock::time_point gpu_end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> cpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(cpu_end - cpu_start);
std::chrono::duration<double> gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
if (*gpu_sum == *cpu_sum){
std::cout << "cpu sum: " << *cpu_sum << std::endl;
std::cout << "gpu sum: " << *gpu_sum << std::endl;
std::cout << "cpu time: " << cpu_span.count()*1000 << "ms" << std::endl;
std::cout << "gpu time: " << gpu_span.count()*1000 << "ms" << std::endl;
}
else{
std::cout << "GPU and CPU results don't Match!!" << std::endl;
std::cout << "cpu sum: " << *cpu_sum << std::endl;
std::cout << "gpu sum: " << *gpu_sum << std::endl;
}
cudaFree(gpu_sum);
cudaFree(gpu_data);
delete cpu_sum;
delete[] cpu_data;
return 0;
}
@sandeepkumar-skb
Copy link
Author

This implementation uses shared memory and accesses the neighboring addresses in a coalesced fashion.
Also when the tid < 32, it don't need any more __syncthreads(); Instead we directly compute it.

GPU: TitanRTX
Driver: 440.100
CUDA: 10.2
CPU: Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz
Compile: nvcc -Xcompiler "-std=c++11" all_reduce_shmem.cu -o all_reduce_shmem
Run: ./all_reduce_shmem

cpu sum: 16777216
gpu sum: 16777216
cpu time: 38.1562ms
gpu time: 27.6263ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment