Register-hungry CUDA code
#define ELEM_DEGREE 4

enum Direction { X, Y, Z };
enum Transpose { TR, NOTR };

// One 1-D contraction of the sum-factorization algorithm: each thread keeps the
// row of the 1-D shape matrix it needs in registers (myphi) and contracts it
// with the source array along direction `dir`. With `inplace` set, dst and src
// alias the same buffer, so all reads must finish before any thread writes.
template <Direction dir, unsigned int n,
          bool add, bool inplace, typename Number>
__device__ void reduce(Number *__restrict__ dst, const Number *__restrict__ src,
                       const Number *myphi)
{
  const unsigned int q = threadIdx.x;   // new index
  const unsigned int i = threadIdx.y;   // two unchanged
  const unsigned int j = threadIdx.z%n; // indices

  Number tmp = 0;
#pragma unroll
  for(int k = 0; k < n; ++k) {
    const unsigned int srcidx =
      (dir==X) ? (k + n*(i + n*j))
      : (dir==Y) ? (i + n*(k + n*j))
      : (i + n*(j + n*k));
    tmp += myphi[k] * (inplace ? dst[srcidx] : src[srcidx]);
  }

  if(inplace) __syncthreads();

  const unsigned int dstidx =
    (dir==X) ? (q + n*(i + n*j))
    : (dir==Y) ? (i + n*(q + n*j))
    : (i + n*(j + n*q));

  if(add)
    dst[dstidx] += tmp;
  else
    dst[dstidx] = tmp;
}
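// -----------------------------------------------------------------------------
// For reference only (not used by the kernels below): a host-side sketch of what
// one reduce<X, n, /*add=*/false, /*inplace=*/false> call computes across the
// whole n*n*n block. Thread (q,i,j) holds row q of the 1-D shape matrix in its
// register array myphi, so collectively the block performs
//   dst[q + n*(i + n*j)] = sum_k phi[q][k] * src[k + n*(i + n*j)]
// The names below (reduce_x_reference, phi) are illustrative only.
// -----------------------------------------------------------------------------
template <int n, typename Number>
void reduce_x_reference(Number *dst, const Number *src, const Number phi[][n])
{
  for(int j = 0; j < n; ++j)
    for(int i = 0; i < n; ++i)
      for(int q = 0; q < n; ++q) {
        Number tmp = 0;
        for(int k = 0; k < n; ++k)
          tmp += phi[q][k] * src[k + n*(i + n*j)];
        dst[q + n*(i + n*j)] = tmp;
      }
}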
// ROWLENGTH, shape_values, shape_gradient and GpuArray are assumed to be defined
// elsewhere in the project (ROWLENGTH as the padded per-cell row length,
// shape_values/shape_gradient as the 1-D shape-function tables).
template <int dim, int n, typename Number>
class Phi {
  const static int npts = n*n*n;
  const int tid = threadIdx.x + n*(threadIdx.y + n*threadIdx.z);
  const unsigned int cell = blockIdx.x + gridDim.x*blockIdx.y;
  const int ncells;
public:
  __device__ Phi(int nc)
    : ncells(nc) { }

  __device__ unsigned int global_q() const { return ROWLENGTH*cell + tid; }

  __device__ void read_values(Number *__restrict__ values, const Number *__restrict__ src,
                              const unsigned int *__restrict__ loc2glob)
  {
    values[tid] = __ldg(&src[loc2glob[ROWLENGTH*cell + tid]]);
    __syncthreads();
  }

  __device__ void write_values(Number *__restrict__ dst, const Number *__restrict__ values,
                               const unsigned int *__restrict__ loc2glob)
  {
    dst[loc2glob[cell*ROWLENGTH + tid]] += values[tid];
  }
  // Transform the reference-cell gradient at this thread's quadrature point
  // with the (inverse) Jacobian stored in jac.
  __device__ void get_grad(GpuArray<dim,Number> &grad, const Number *__restrict__ gradients,
                           const Number *__restrict__ jac) const
  {
#pragma unroll
    for(int d1 = 0; d1 < dim; d1++) {
      Number tmp = 0;
#pragma unroll
      for(int d2 = 0; d2 < dim; d2++) {
        tmp += jac[((dim*d2+d1)*ncells + cell)*ROWLENGTH + tid]*gradients[d2*npts + tid];
      }
      grad[d1] = tmp;
    }
  }

  // Transform back with the transposed Jacobian and multiply by the quadrature
  // weight times the Jacobian determinant (JxW).
  __device__ void submit_grad(Number *__restrict__ gradients, const GpuArray<dim,Number> &grad,
                              const Number *__restrict__ jac,
                              const Number *__restrict__ jxw) const
  {
#pragma unroll
    for(int d1 = 0; d1 < dim; d1++) {
      Number tmp = 0;
#pragma unroll
      for(int d2 = 0; d2 < dim; d2++) {
        tmp += jac[((dim*d1+d2)*ncells + cell)*ROWLENGTH + tid]*grad[d2];
      }
      gradients[d1*npts + tid] = tmp*jxw[cell*ROWLENGTH + tid];
    }
    __syncthreads();
  }
  __device__ void evaluate(Number *__restrict__ gradients, const Number *__restrict__ values)
  {
    ///////////////////////////////////////////////////////////////
    // Stage PHI and DPHI in registers:
    Number my_phi[n];
    Number my_dphi[n];
#pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i]  = shape_values[threadIdx.x + n*i];
      my_dphi[i] = shape_gradient[threadIdx.x + n*i];
    }
    ///////////////////////////////////////////////////////////////
    // reduce along x / i / q - direction
    reduce<X,n,false,false> (&gradients[0],      values, my_dphi);
    reduce<X,n,false,false> (&gradients[1*npts], values, my_phi);
    reduce<X,n,false,false> (&gradients[2*npts], values, my_phi);
    __syncthreads();
    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],      &gradients[0],      my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_phi);
    __syncthreads();
    // reduce along z / k / s - direction
    reduce<Z,n,false,true> (&gradients[0],      &gradients[0],      my_phi);
    reduce<Z,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_phi);
    reduce<Z,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_dphi);
    __syncthreads();
    // now we have the gradients at the quadrature points;
    // no further synch is necessary since each thread only works on local data.
  }
  __device__ void integrate(Number *__restrict__ gradients, Number *__restrict__ values)
  {
    ///////////////////////////////////////////////////////////////
    // Stage transpose of PHI and DPHI in registers:
    Number my_phi[n];
    Number my_dphi[n];
#pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i]  = shape_values[threadIdx.x*n + i];
      my_dphi[i] = shape_gradient[threadIdx.x*n + i];
    }
    ///////////////////////////////////////////////////////////////
    __syncthreads();
    // reduce along x / i / q - direction
    reduce<X,n,false,true> (&gradients[0],      &gradients[0],      my_dphi);
    reduce<X,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_phi);
    reduce<X,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_phi);
    __syncthreads();
    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],      &gradients[0],      my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_phi);
    __syncthreads();
    // reduce along z / k / s - direction, accumulating the three components
    reduce<Z,n,false,false> (values, &gradients[0],      my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values, &gradients[1*npts], my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values, &gradients[2*npts], my_dphi);
    __syncthreads();
    // no further synch is necessary since each thread only works on local data.
  }
};
template <int dim, unsigned int n, typename Number>
__global__ void kernel_grad(Number *__restrict__ dst, const Number *__restrict__ src,
                            const unsigned int *__restrict__ loc2glob,
                            const Number *__restrict__ coeff, const Number *__restrict__ jac,
                            const Number *__restrict__ jxw, const unsigned int n_cells)
{
  const unsigned int nqpts = n*n*n;
  __shared__ Number values[nqpts];
  __shared__ Number gradients[3*nqpts];
  const unsigned int cell = blockIdx.x + gridDim.x*blockIdx.y;

  if(cell < n_cells) {
    // USER CODE BEGIN HERE
    Phi<dim,n,Number> phi(n_cells);
    phi.read_values(values,src,loc2glob);
    phi.evaluate(gradients,values);

    GpuArray<dim,Number> grad;
    phi.get_grad(grad,gradients,jac);
    grad *= coeff[phi.global_q()];
    phi.submit_grad(gradients,grad,jac,jxw);

    phi.integrate(gradients,values);
    phi.write_values(dst,values,loc2glob);
    // END USER CODE
  }
}
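// -----------------------------------------------------------------------------
// A minimal launch sketch for the kernel above (assumptions only: Number =
// double, dim = 3, n = ELEM_DEGREE + 1 points per direction, one n*n*n thread
// block per cell, and a 2-D grid so that blockIdx.x + gridDim.x*blockIdx.y
// covers n_cells; the name launch_kernel_grad and the grid cap are illustrative).
// -----------------------------------------------------------------------------
template <typename Number>
void launch_kernel_grad(Number *dst, const Number *src,
                        const unsigned int *loc2glob, const Number *coeff,
                        const Number *jac, const Number *jxw,
                        unsigned int n_cells)
{
  constexpr unsigned int n = ELEM_DEGREE + 1;
  const unsigned int grid_x = 4096;                       // arbitrary cap on gridDim.x
  const unsigned int grid_y = (n_cells + grid_x - 1)/grid_x;
  dim3 block(n, n, n);
  dim3 grid(grid_x, grid_y);  // excess blocks are filtered by the cell < n_cells guard
  kernel_grad<3, n, Number><<<grid, block>>>(dst, src, loc2glob, coeff, jac, jxw, n_cells);
}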
// ---------------------------------------------------------------------------
// Second variant: ELEM_DEGREE, Direction, Transpose and reduce() are the same
// as above; here Phi carries the per-cell pointers itself and the kernel
// receives the geometry through a single GpuData struct.
// ---------------------------------------------------------------------------
template <int dim, int n, typename Number>
class Phi {
  typedef typename MatrixFreeGpu<dim,Number>::GpuData GData;

  // Per-cell pointers into the global geometry arrays, set up in the constructor.
  const unsigned int *__restrict__ loc2glob;
  const Number *__restrict__ jxw;
  const Number *__restrict__ jac;
  Number *__restrict__ values;
  Number *__restrict__ gradients;

  const static int npts = n*n*n;
  const int tid = threadIdx.x + n*(threadIdx.y + n*threadIdx.z);
  const unsigned int cell;
  const int ncells;

  Number my_phi[n];
  Number my_dphi[n];
public:
  __device__ Phi(Number *__restrict__ v, Number *__restrict__ g, const GData *data, unsigned int c)
    : values(v), gradients(g),
      loc2glob(data->loc2glob + data->rowlength*c),
      jxw(data->JxW + data->rowlength*c),
      jac(data->inv_jac + data->rowlength*c),
      cell(c),
      ncells(data->n_cells)
  { }

  __device__ unsigned int global_q() const { return ROWLENGTH*cell + tid; }

  __device__ void read_values(const Number *__restrict__ src)
  {
    values[tid] = __ldg(&src[loc2glob[tid]]);
    __syncthreads();
  }

  __device__ void write_values(Number *__restrict__ dst) const
  {
    dst[loc2glob[tid]] += values[tid];
  }
  // Transform the reference-cell gradient at this thread's quadrature point
  // with the (inverse) Jacobian.
  __device__ GpuArray<dim,Number> get_grad() const
  {
    GpuArray<dim,Number> grad;
#pragma unroll
    for(int d1 = 0; d1 < dim; d1++) {
      Number tmp = 0;
#pragma unroll
      for(int d2 = 0; d2 < dim; d2++) {
        tmp += jac[((dim*d2+d1)*ncells)*ROWLENGTH + tid]*gradients[d2*npts + tid];
      }
      grad[d1] = tmp;
    }
    return grad;
  }

  // Transform back with the transposed Jacobian and multiply by JxW.
  __device__ void submit_grad(const GpuArray<dim,Number> &grad)
  {
#pragma unroll
    for(int d1 = 0; d1 < dim; d1++) {
      Number tmp = 0;
#pragma unroll
      for(int d2 = 0; d2 < dim; d2++) {
        tmp += jac[((dim*d1+d2)*ncells)*ROWLENGTH + tid]*grad[d2];
      }
      gradients[d1*npts + tid] = tmp*jxw[tid];
    }
    __syncthreads();
  }
  __device__ void evaluate()
  {
    ///////////////////////////////////////////////////////////////
    // Stage PHI and DPHI in registers:
#pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i]  = shape_values[threadIdx.x + n*i];
      my_dphi[i] = shape_gradient[threadIdx.x + n*i];
    }
    ///////////////////////////////////////////////////////////////
    // reduce along x / i / q - direction
    reduce<X,n,false,false> (&gradients[0],      values, my_dphi);
    reduce<X,n,false,false> (&gradients[1*npts], values, my_phi);
    reduce<X,n,false,false> (&gradients[2*npts], values, my_phi);
    __syncthreads();
    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],      &gradients[0],      my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_phi);
    __syncthreads();
    // reduce along z / k / s - direction
    reduce<Z,n,false,true> (&gradients[0],      &gradients[0],      my_phi);
    reduce<Z,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_phi);
    reduce<Z,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_dphi);
    __syncthreads();
    // now we have the gradients at the quadrature points;
    // no further synch is necessary since each thread only works on local data.
  }

  __device__ void integrate()
  {
    ///////////////////////////////////////////////////////////////
    // Stage transpose of PHI and DPHI in registers:
#pragma unroll
    for(int i = 0; i < n; i++) {
      my_phi[i]  = shape_values[threadIdx.x*n + i];
      my_dphi[i] = shape_gradient[threadIdx.x*n + i];
    }
    ///////////////////////////////////////////////////////////////
    __syncthreads();
    // reduce along x / i / q - direction
    reduce<X,n,false,true> (&gradients[0],      &gradients[0],      my_dphi);
    reduce<X,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_phi);
    reduce<X,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_phi);
    __syncthreads();
    // reduce along y / j / r - direction
    reduce<Y,n,false,true> (&gradients[0],      &gradients[0],      my_phi);
    reduce<Y,n,false,true> (&gradients[1*npts], &gradients[1*npts], my_dphi);
    reduce<Y,n,false,true> (&gradients[2*npts], &gradients[2*npts], my_phi);
    __syncthreads();
    // reduce along z / k / s - direction, accumulating the three components
    reduce<Z,n,false,false> (values, &gradients[0],      my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values, &gradients[1*npts], my_phi);
    __syncthreads();
    reduce<Z,n,true,false>  (values, &gradients[2*npts], my_dphi);
    __syncthreads();
    // no further synch is necessary since each thread only works on local data.
  }
};
template <int dim, unsigned int n, typename Number>
__global__ void kernel_grad(Number *__restrict__ dst, const Number *__restrict__ src,
                            const typename MatrixFreeGpu<dim,Number>::GpuData gpu_data,
                            const Number *__restrict__ coeff)
{
  const unsigned int nqpts = n*n*n;
  __shared__ Number values[nqpts];
  __shared__ Number gradients[3*nqpts];
  const unsigned int cell = blockIdx.x + gridDim.x*blockIdx.y;

  if(cell < gpu_data.n_cells) {
    // USER CODE BEGIN HERE
    Phi<dim,n,Number> phi(values,gradients,&gpu_data,cell);
    phi.read_values(src);
    phi.evaluate();
    phi.submit_grad(coeff[phi.global_q()] * phi.get_grad());
    phi.integrate();
    phi.write_values(dst);
    // END USER CODE
  }
}
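// -----------------------------------------------------------------------------
// A minimal launch sketch for this second variant (assumptions only: Number =
// double, dim = 3, n = ELEM_DEGREE + 1, and a MatrixFreeGpu<3,double>::GpuData
// object whose loc2glob/JxW/inv_jac/rowlength/n_cells members have already been
// filled in on the host; the name launch_kernel_grad_v2 is illustrative).
// -----------------------------------------------------------------------------
void launch_kernel_grad_v2(double *dst, const double *src, const double *coeff,
                           const MatrixFreeGpu<3,double>::GpuData &gpu_data)
{
  constexpr unsigned int n = ELEM_DEGREE + 1;
  const unsigned int grid_x = 4096;                       // arbitrary cap on gridDim.x
  const unsigned int grid_y = (gpu_data.n_cells + grid_x - 1)/grid_x;
  dim3 block(n, n, n);
  dim3 grid(grid_x, grid_y);  // excess blocks are filtered by the cell < n_cells guard
  kernel_grad<3, n, double><<<grid, block>>>(dst, src, gpu_data, coeff);
}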