Skip to content

Instantly share code, notes, and snippets.

@cwli24
Created January 23, 2018 04:58
Show Gist options
  • Save cwli24/2ea97d693482143373c02442e66f12a7 to your computer and use it in GitHub Desktop.
[ECE408] Parallelizing CNN computation using CUDA
#ifndef MXNET_OPERATOR_NEW_FORWARD_CUH_
#define MXNET_OPERATOR_NEW_FORWARD_CUH_
#define TILE_WIDTH 24 // has to be a multiple of this to get correctness = 0.8562
//#define TILE_WIDTH 8
#include <mxnet/base.h>
namespace mxnet
{
namespace op
{
// Forward convolution kernel: one thread block per (batch element, output map) pair.
//
// Launch contract (see the host wrapper in this file):
//   gridDim  = (B, M, 1)            -- blockIdx.x = batch index b, blockIdx.y = output map m
//   blockDim = (TILE_WIDTH, TILE_WIDTH, 1)
//   dynamic shared memory = C*(K*K + H*W)*sizeof(float), laid out as:
//     [0, C*K*K)             -> k_s : the m-th filter bank (C x K x K weights)
//     [C*K*K, C*(K*K + H*W)) -> x_s : the b-th input image (all C channels)
//
// Each thread computes one output pixel y[b][m][h][w] as the valid (no-padding,
// stride-1) cross-correlation of the staged input with the staged filter bank.
//
// NOTE(review): the entire b-th input image is staged in shared memory, so
// C*(K*K + H*W) floats must fit within the device's shared-memory-per-block
// limit -- confirm for the layer sizes this operator is used with.
__global__ void forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const int H, const int W, const int K) {
    const int H_out = H - K + 1; // output height of a valid convolution
    const int W_out = W - K + 1; // output width of a valid convolution

    /* dynamic shared memory: filter weights first, then the input image */
    extern __shared__ float shared_mem[];
    float *k_s = &shared_mem[0];
    float *x_s = &shared_mem[C * K * K];

#define y4d(i3,i2,i1,i0) y[(i3) * (M * H_out * W_out) + (i2)*(H_out * W_out) + (i1)*(W_out) + i0]
#define x4d(i3,i2,i1,i0) x[(i3) * (C * H * W) + (i2)*(H * W) + (i1)*(W) + i0]
#define k4d(i3,i2,i1,i0) k[(i3) * (C * K * K) + (i2)*(K * K) + (i1)*(K) + i0]

    const int b = blockIdx.x;  // batch element handled by this block
    const int m = blockIdx.y;  // output feature map handled by this block
    const int h = threadIdx.x; // output row this thread computes
    const int w = threadIdx.y; // output column this thread computes

    /* Cooperatively stage the m-th filter bank and the b-th input image.
     * Both loads are strided loops over the whole block, so correctness no
     * longer depends on C*K*K (or C*H*W) fitting inside one
     * TILE_WIDTH*TILE_WIDTH block of threads.  (The previous version loaded
     * k_s with a single guarded store and silently dropped every weight past
     * index TILE_WIDTH*TILE_WIDTH-1 whenever C*K*K exceeded the block size.) */
    const int tid = h * TILE_WIDTH + w;
    const int nthreads = TILE_WIDTH * TILE_WIDTH;
    for (int i = tid; i < C * K * K; i += nthreads)
        k_s[i] = k[m * (C * K * K) + i];
    for (int i = tid; i < C * H * W; i += nthreads)
        x_s[i] = x[b * (C * H * W) + i];
    // Barrier is outside any divergent branch: every thread in the block
    // reaches it before anyone reads the staged data.
    __syncthreads();

    /* Guard the compute/store so blocks larger than the output map
     * (H_out or W_out < TILE_WIDTH) stay correct instead of writing past
     * the output tensor.  The shared-memory reads below stay in bounds
     * because x_s holds the full C*H*W image and h+p <= H-1, w+q <= W-1. */
    if (h < H_out && w < W_out) {
        float dotsum = 0.0f;
        for (int c = 0; c < C; c++) {
            for (int p = 0; p < K; p++) {
                for (int q = 0; q < K; q++) {
                    dotsum += x_s[c * (H * W) + (h + p) * W + (w + q)] * k_s[c * (K * K) + p * K + q];
                }
            }
        }
        y4d(b, m, h, w) = dotsum;
    }

#undef y4d
#undef x4d
#undef k4d
}
/*
   This function is called by new-inl.h
   Any code you write should be executed by this function.
   For ECE408, we only expect the float version of the operator to be called,
   so here we specialize with only floats.

   Launches forward_kernel with one block per (batch element, output map)
   pair; each TILE_WIDTH x TILE_WIDTH block computes one full output map.
*/
template<>
void forward<gpu, float>(mshadow::Tensor<gpu, 4, float> &y, const mshadow::Tensor<gpu, 4, float> &x, const mshadow::Tensor<gpu, 4, float> &w) {
    // Launch against MXNet's stream for this tensor to keep MXNet happy.
    cudaStream_t s = y.stream_->stream_;

    // Extract the tensor dimensions into B, M, C, H, W, K.
    const int B = x.shape_[0]; // number of batches
    const int M = y.shape_[1]; // number of output feature maps
    const int C = x.shape_[1]; // number of input channels
    const int H = x.shape_[2]; // height of each input channel
    const int W = x.shape_[3]; // width of each input channel
    const int K = w.shape_[3]; // height and width of each (square) weight kernel

    // NOTE(review): this launch assumes H-K+1 <= TILE_WIDTH and
    // W-K+1 <= TILE_WIDTH, i.e. one block covers a whole output map --
    // confirm for the layer shapes this operator is used with.
    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 gridDim(B, M, 1);

    // Dynamic shared memory: the filter bank (C*K*K floats) plus one full
    // input image (C*H*W floats) -- must match the layout forward_kernel expects.
    const size_t smem_bytes = C * (K * K + H * W) * sizeof(float);
    forward_kernel<<<gridDim, blockDim, smem_bytes, s>>>(y.dptr_, x.dptr_, w.dptr_, B, M, C, H, W, K);

    // cudaGetLastError() surfaces launch-configuration failures immediately
    // (e.g. the dynamic shared-memory request exceeding the per-block limit);
    // the synchronize then surfaces any asynchronous execution errors.
    MSHADOW_CUDA_CALL(cudaGetLastError());
    MSHADOW_CUDA_CALL(cudaDeviceSynchronize());
}
/*
   This tells mxnet how to do an op when it's not a float.
   This is not used in the ECE408 project.
*/
// Generic fallback: deliberately unimplemented. Only the float
// specialization above is expected to be instantiated; any other dtype
// traps with a descriptive assertion at runtime.
template<typename gpu, typename DType>
void forward(mshadow::Tensor<gpu, 4, DType> &y, const mshadow::Tensor<gpu, 4, DType> &x, const mshadow::Tensor<gpu, 4, DType> &w) {
assert( 0 && "No forward implementation for other datatypes needed for ECE408");
}
}
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment