Skip to content

Instantly share code, notes, and snippets.

@denizyuret
Created February 4, 2021 11:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save denizyuret/7e249e21915406f9069bcc0da09fcf0e to your computer and use it in GitHub Desktop.
// Warp-level tail of the block reduction: threads 0..31 cooperatively fold
// buffer entries x[0..63] down into x[0].
// Preconditions: called by exactly the first warp (tid < 32) of the block,
// and x must hold at least 64 floats.
// `volatile` stops the compiler from caching shared-memory values in
// registers between steps, but on Volta+ (independent thread scheduling)
// that alone no longer guarantees lockstep execution, so an explicit
// __syncwarp() separates each partner-read from the following store.
__device__ void _sum_32_20_0(volatile float *x, int i) {
float ai, xi;
ai=x[i]; xi=x[i+32]; __syncwarp(); x[i]=ai+xi; __syncwarp();
ai=x[i]; xi=x[i+16]; __syncwarp(); x[i]=ai+xi; __syncwarp();
ai=x[i]; xi=x[i+ 8]; __syncwarp(); x[i]=ai+xi; __syncwarp();
ai=x[i]; xi=x[i+ 4]; __syncwarp(); x[i]=ai+xi; __syncwarp();
ai=x[i]; xi=x[i+ 2]; __syncwarp(); x[i]=ai+xi; __syncwarp();
ai=x[i]; xi=x[i+ 1]; __syncwarp(); x[i]=ai+xi;
}
// Stage-1 reduction kernel: each block sums its grid-stride share of
// x[0..n-1] and writes one partial sum to y[blockIdx.x].
// Preconditions: must be launched with exactly 128 threads per block (the
// shared buffer and the reduction tree are hard-coded to 128), and y must
// hold at least gridDim.x floats.
__global__ void _sum_32_20_1(int n, float *x, float *y) {
__shared__ float buffer[128]; // one partial sum per thread in the block
int i_start = threadIdx.x+blockIdx.x*blockDim.x; // flat global thread index
int i_end = n; // one past the last element
int i_step = blockDim.x*gridDim.x; // total number of threads in the grid
int tid = threadIdx.x;
float ai, xi;
// Grid-stride loop: each thread accumulates its strided slice of x,
// so any n (including n == 0) is handled by any launch configuration.
ai = 0.0f;
for(int i=i_start; i<i_end; i+=i_step) {
xi=x[i]; ai=ai+xi;
}
buffer[tid] = ai;
__syncthreads();
// Shared-memory tree reduction from 128 entries down to 64.  The barrier
// must stay inside the loop: every level's stores have to be visible to
// the whole block before the next level reads them.  It is outside the
// `if` because __syncthreads() must be reached by all threads.
for(int stride=128/2; stride>32; stride>>=1) {
if(tid < stride) {
ai=buffer[tid]; xi=buffer[stride+tid]; buffer[tid]=ai+xi;
}
__syncthreads();
}
// Final 64 -> 1 step is done warp-level by the first warp.
if(tid<32) {
_sum_32_20_0(buffer,tid);
}
__syncthreads();
if(tid==0) { // the first thread publishes the block's partial sum
y[blockIdx.x]=buffer[0];
}
}
// Stage-2 reduction kernel: a single 128-thread block folds the 128
// per-block partial sums in y down to one total, stored in z[0].
// Preconditions: launched as <<<1,128>>> with y holding 128 floats.
__global__ void _sum_32_20_2(float *y,float *z) {
__shared__ float partial[128];
int t = threadIdx.x;
// Stage the partial sums into shared memory, one per thread.
partial[t] = y[t];
__syncthreads();
// Tree reduction 128 -> 64; barrier kept outside the divergent branch.
for(int half = 128/2; half > 32; half >>= 1) {
if(t < half) {
float lhs = partial[t];
float rhs = partial[half + t];
partial[t] = lhs + rhs;
}
__syncthreads();
}
// First warp finishes the last 64 entries warp-synchronously.
if(t < 32) {
_sum_32_20_0(partial, t);
}
__syncthreads();
if(t == 0) {
z[0] = partial[0];
}
}
extern "C" {
// Host entry point: sums the n floats in device array x and returns the
// total to the host.  Device scratch buffers are allocated once on first
// call and reused forever after (intentionally never freed).
// NOTE(review): CUDA API return codes and launch errors are not checked
// anywhere in this path — a failed cudaMalloc or launch yields garbage.
float sum_32_20(int n, float *x) {
static float *blockSums; // one partial sum per block (128 floats)
static float *total;     // single final sum
float result;
if (blockSums == NULL) cudaMalloc(&blockSums, 128*sizeof(float));
if (total == NULL) cudaMalloc(&total, sizeof(float));
_sum_32_20_1<<<128,128>>>(n,x,blockSums);
_sum_32_20_2<<<1,128>>>(blockSums,total);
// Blocking copy in the default stream; also serves as the synchronization
// point before the host reads the result.
cudaMemcpy(&result,total,sizeof(float),cudaMemcpyDeviceToHost);
return result;
}
// Host entry point: stream-ordered variant of sum_32_20.  Enqueues both
// reduction kernels and the result copy on STR, then blocks until the
// result is on the host.  Scratch buffers are allocated once and reused.
//
// Fix: the original used a plain cudaMemcpy, which executes in the legacy
// default stream.  A stream created with cudaStreamNonBlocking does NOT
// synchronize with the legacy default stream, so the copy could read z
// before _sum_32_20_2 finished.  cudaMemcpyAsync on STR followed by
// cudaStreamSynchronize(STR) orders the copy after the kernels for any
// stream type.
// NOTE(review): API return codes are still unchecked, matching sum_32_20.
float sum_32_20_stream(int n, float *x, cudaStream_t STR) {
float r;
static float *y;
static float *z;
if (y == NULL) cudaMalloc(&y, 128*sizeof(float)); // sum for each block
if (z == NULL) cudaMalloc(&z, sizeof(float)); // final sum
_sum_32_20_1<<<128,128,0,STR>>>(n,x,y);
_sum_32_20_2<<<1,128,0,STR>>>(y,z);
cudaMemcpyAsync(&r,z,sizeof(float),cudaMemcpyDeviceToHost,STR);
cudaStreamSynchronize(STR); // ensure r is valid before returning
return r;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment