Created
February 4, 2021 11:21
-
-
Save denizyuret/7e249e21915406f9069bcc0da09fcf0e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Warp-level tail of the shared-memory tree reduction: folds x[i..i+63]
// into x[i] for the 32 threads of the first warp (callers invoke this with
// i = tid for tid < 32, so the entire warp participates).
// `volatile` forces every access to go through shared memory instead of
// registers. That alone is NOT enough on Volta+ (independent thread
// scheduling): lanes are no longer guaranteed to run in lockstep, so each
// step must be separated by __syncwarp() before a lane reads a value
// written by another lane in the previous step.
__device__ void _sum_32_20_0(volatile float *x, int i) {
  float ai, xi;
  ai = x[i]; xi = x[i+32]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+16]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 8]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 4]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 2]; x[i] = ai + xi; __syncwarp();
  ai = x[i]; xi = x[i+ 1]; x[i] = ai + xi;
}
// Stage 1 of a two-stage sum reduction: each block reduces its grid-stride
// slice of x[0..n) and writes one partial sum to y[blockIdx.x].
// Precondition: launched with blockDim.x == 128 — both the shared buffer
// size and the initial tree stride of 64 hard-code this.
__global__ void _sum_32_20_1(int n, float *x, float *y) {
  __shared__ float buffer[128];  // one partial sum slot per thread
  int i_start = threadIdx.x + blockIdx.x*blockDim.x;  // this thread's first element
  int i_end   = n;
  int i_step  = blockDim.x*gridDim.x;  // grid stride: total threads in the launch
  int tid     = threadIdx.x;

  // Serial accumulation over this thread's grid-stride slice of x.
  float ai = 0;
  for (int i = i_start; i < i_end; i += i_step)
    ai += x[i];
  buffer[tid] = ai;
  __syncthreads();

  // Shared-memory tree reduction 128 -> 64 partials. The barrier must stay
  // INSIDE the loop: each halving reads values written by other threads in
  // the previous halving, so moving it outside would introduce a race.
  for (int stride = 128/2; stride > 32; stride >>= 1) {
    if (tid < stride)
      buffer[tid] += buffer[stride + tid];
    __syncthreads();
  }
  // The first warp finishes the 64 -> 1 reduction warp-synchronously.
  if (tid < 32)
    _sum_32_20_0(buffer, tid);
  __syncthreads();
  if (tid == 0)  // thread 0 publishes this block's partial sum
    y[blockIdx.x] = buffer[0];
}
// Stage 2 of the reduction: a single 128-thread block sums the 128
// per-block partials in y down to the final scalar z[0].
// Precondition: launched as <<<1,128>>> with y holding exactly 128 floats.
__global__ void _sum_32_20_2(float *y, float *z) {
  __shared__ float buffer[128];
  int tid = threadIdx.x;
  buffer[tid] = y[tid];  // load one partial per thread
  __syncthreads();
  // Tree reduction 128 -> 64; the in-loop barrier orders each halving
  // against the previous one's writes.
  for (int stride = 128/2; stride > 32; stride >>= 1) {
    if (tid < stride)
      buffer[tid] += buffer[stride + tid];
    __syncthreads();
  }
  // First warp completes the 64 -> 1 reduction.
  if (tid < 32)
    _sum_32_20_0(buffer, tid);
  __syncthreads();
  if (tid == 0)
    z[0] = buffer[0];
}
extern "C" {
// Sums the n floats in device array x; returns the result on the host.
// The two intermediate device buffers are allocated once and cached in
// function-local statics for reuse across calls (intentionally never
// freed; this also makes these functions non-reentrant / not thread-safe).
float sum_32_20(int n, float *x) {
  float r;
  static float *y = NULL;  // 128 per-block partial sums
  static float *z = NULL;  // final scalar
  if (y == NULL) cudaMalloc(&y, 128*sizeof(float));
  if (z == NULL) cudaMalloc(&z, sizeof(float));
  _sum_32_20_1<<<128,128>>>(n, x, y);  // stage 1: per-block partials
  _sum_32_20_2<<<1,128>>>(y, z);       // stage 2: partials -> scalar
  // Blocking copy on the default stream; it is ordered after the two
  // kernel launches above, so r is valid when it returns.
  cudaMemcpy(&r, z, sizeof(float), cudaMemcpyDeviceToHost);
  return r;
}
// Same as sum_32_20 but runs both kernels on stream STR. The copy-back
// must also be stream-ordered: a plain cudaMemcpy on the legacy default
// stream is only guaranteed to wait for STR when STR is a blocking
// stream, so a non-blocking STR would let the host read z before the
// kernels finish. Use cudaMemcpyAsync on STR plus an explicit sync.
float sum_32_20_stream(int n, float *x, cudaStream_t STR) {
  float r;
  static float *y = NULL;  // 128 per-block partial sums
  static float *z = NULL;  // final scalar
  if (y == NULL) cudaMalloc(&y, 128*sizeof(float));
  if (z == NULL) cudaMalloc(&z, sizeof(float));
  _sum_32_20_1<<<128,128,0,STR>>>(n, x, y);
  _sum_32_20_2<<<1,128,0,STR>>>(y, z);
  cudaMemcpyAsync(&r, z, sizeof(float), cudaMemcpyDeviceToHost, STR);
  cudaStreamSynchronize(STR);  // ensure r is populated before returning
  return r;
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment