Created
February 4, 2021 11:21
-
-
Save denizyuret/7e249e21915406f9069bcc0da09fcf0e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Warp-level tail of the shared-memory tree reduction: folds x[i..i+63]
// into x[i] for the 32 threads of the first warp (callers invoke this with
// i = tid for tid < 32, so the entire warp participates).
// `volatile` forces every access to go through shared memory instead of
// registers. That alone is NOT enough on Volta+ (independent thread
// scheduling): lanes are no longer guaranteed to run in lockstep, so each
// step must be separated by __syncwarp() before a lane reads a value
// written by another lane in the previous step.
__device__ void _sum_32_20_0(volatile float *x, int i) {
  float ai, xi;
  ai = x[i]; xi = x[i+32]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+16]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 8]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 4]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 2]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 1]; x[i] = ai + xi;
}
// Stage 1 of a two-stage sum reduction: each block reduces its grid-stride
// slice of x[0..n) and writes one partial sum to y[blockIdx.x].
// Precondition: launched with blockDim.x == 128 — both the shared buffer
// size and the initial tree stride of 64 hard-code this.
__global__ void _sum_32_20_1(int n, float *x, float *y) {
  __shared__ float buffer[128];  // one partial sum slot per thread
  int i_start = threadIdx.x + blockIdx.x*blockDim.x;  // this thread's first element
  int i_end   = n;
  int i_step  = blockDim.x*gridDim.x;  // grid stride: total threads in the launch
  int tid     = threadIdx.x;

  // Serial accumulation over this thread's grid-stride slice of x.
  float ai = 0;
  for (int i = i_start; i < i_end; i += i_step)
    ai += x[i];
  buffer[tid] = ai;
  __syncthreads();

  // Shared-memory tree reduction 128 -> 64 partials. The barrier must stay
  // INSIDE the loop: each halving reads values written by other threads in
  // the previous halving, so moving it outside would introduce a race.
  for (int stride = 128/2; stride > 32; stride >>= 1) {
    if (tid < stride)
      buffer[tid] += buffer[stride + tid];
    __syncthreads();
  }
  // The first warp finishes the 64 -> 1 reduction warp-synchronously.
  if (tid < 32)
    _sum_32_20_0(buffer, tid);
  __syncthreads();
  if (tid == 0)  // thread 0 publishes this block's partial sum
    y[blockIdx.x] = buffer[0];
}
// Stage 2 of the reduction: a single 128-thread block sums the 128
// per-block partials in y down to the final scalar z[0].
// Precondition: launched as <<<1,128>>> with y holding exactly 128 floats.
__global__ void _sum_32_20_2(float *y, float *z) {
  __shared__ float buffer[128];
  int tid = threadIdx.x;
  buffer[tid] = y[tid];  // load one partial per thread
  __syncthreads();
  // Tree reduction 128 -> 64; the in-loop barrier orders each halving
  // against the previous one's writes.
  for (int stride = 128/2; stride > 32; stride >>= 1) {
    if (tid < stride)
      buffer[tid] += buffer[stride + tid];
    __syncthreads();
  }
  // First warp completes the 64 -> 1 reduction.
  if (tid < 32)
    _sum_32_20_0(buffer, tid);
  __syncthreads();
  if (tid == 0)
    z[0] = buffer[0];
}
extern "C" {
// Sums the n floats in device array x; returns the result on the host.
// The two intermediate device buffers are allocated once and cached in
// function-local statics for reuse across calls (intentionally never
// freed; this also makes these functions non-reentrant / not thread-safe).
float sum_32_20(int n, float *x) {
  float r;
  static float *y = NULL;  // 128 per-block partial sums
  static float *z = NULL;  // final scalar
  if (y == NULL) cudaMalloc(&y, 128*sizeof(float));
  if (z == NULL) cudaMalloc(&z, sizeof(float));
  _sum_32_20_1<<<128,128>>>(n, x, y);  // stage 1: per-block partials
  _sum_32_20_2<<<1,128>>>(y, z);       // stage 2: partials -> scalar
  // Blocking copy on the default stream; it is ordered after the two
  // kernel launches above, so r is valid when it returns.
  cudaMemcpy(&r, z, sizeof(float), cudaMemcpyDeviceToHost);
  return r;
}
// Same as sum_32_20 but runs both kernels on stream STR. The copy-back
// must also be stream-ordered: a plain cudaMemcpy on the legacy default
// stream is only guaranteed to wait for STR when STR is a blocking
// stream, so a non-blocking STR would let the host read z before the
// kernels finish. Use cudaMemcpyAsync on STR plus an explicit sync.
float sum_32_20_stream(int n, float *x, cudaStream_t STR) {
  float r;
  static float *y = NULL;  // 128 per-block partial sums
  static float *z = NULL;  // final scalar
  if (y == NULL) cudaMalloc(&y, 128*sizeof(float));
  if (z == NULL) cudaMalloc(&z, sizeof(float));
  _sum_32_20_1<<<128,128,0,STR>>>(n, x, y);
  _sum_32_20_2<<<1,128,0,STR>>>(y, z);
  cudaMemcpyAsync(&r, z, sizeof(float), cudaMemcpyDeviceToHost, STR);
  cudaStreamSynchronize(STR);  // ensure r is populated before returning
  return r;
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment