odashi/cudnn_convolution_forward.cu

## cudnn_convolution_forward.cu
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <vector>

#include <cuda.h>
#include <cudnn.h>

#define CUDA_CALL(f) { \
  cudaError_t err = (f); \
  if (err != cudaSuccess) { \
    std::cout \
        << "    Error occurred: " << err << std::endl; \
    std::exit(1); \
  } \
}

#define CUDNN_CALL(f) { \
  cudnnStatus_t err = (f); \
  if (err != CUDNN_STATUS_SUCCESS) { \
    std::cout \
        << "    Error occurred: " << err << std::endl; \
    std::exit(1); \
  } \
}

__global__ void dev_const(float *px, float k) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  px[tid] = k;
}

__global__ void dev_iota(float *px) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  px[tid] = tid;
}

void print(const float *data, int n, int c, int h, int w) {
  std::vector<float> buffer(1 << 20);
  CUDA_CALL(cudaMemcpy(
        buffer.data(), data,
        n * c * h * w * sizeof(float),
        cudaMemcpyDeviceToHost));
  int a = 0;
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < c; ++j) {
      std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
      for (int k = 0; k < h; ++k) {
        for (int l = 0; l < w; ++l) {
          std::cout << std::setw(4) << std::right << buffer[a];
          ++a;
        }
        std::cout << std::endl;
      }
    }
  }
  std::cout << std::endl;
}

int main() {
  cudnnHandle_t cudnn;
  CUDNN_CALL(cudnnCreate(&cudnn));

  // input
  const int in_n = 1;
  const int in_c = 1;
  const int in_h = 5;
  const int in_w = 5;
  std::cout << "in_n: " << in_n << std::endl;
  std::cout << "in_c: " << in_c << std::endl;
  std::cout << "in_h: " << in_h << std::endl;
  std::cout << "in_w: " << in_w << std::endl;
  std::cout << std::endl;

  cudnnTensorDescriptor_t in_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
        in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        in_n, in_c, in_h, in_w));

  float *in_data;
  CUDA_CALL(cudaMalloc(
        &in_data, in_n * in_c * in_h * in_w * sizeof(float)));

  // filter
  const int filt_k = 1;
  const int filt_c = 1;
  const int filt_h = 2;
  const int filt_w = 2;
  std::cout << "filt_k: " << filt_k << std::endl;
  std::cout << "filt_c: " << filt_c << std::endl;
  std::cout << "filt_h: " << filt_h << std::endl;
  std::cout << "filt_w: " << filt_w << std::endl;
  std::cout << std::endl;

  cudnnFilterDescriptor_t filt_desc;
  CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
  CUDNN_CALL(cudnnSetFilter4dDescriptor(
        filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
        filt_k, filt_c, filt_h, filt_w));

  float *filt_data;
  CUDA_CALL(cudaMalloc(
      &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));

  // convolution
  const int pad_h = 1;
  const int pad_w = 1;
  const int str_h = 1;
  const int str_w = 1;
  const int dil_h = 1;
  const int dil_w = 1;
  std::cout << "pad_h: " << pad_h << std::endl;
  std::cout << "pad_w: " << pad_w << std::endl;
  std::cout << "str_h: " << str_h << std::endl;
  std::cout << "str_w: " << str_w << std::endl;
  std::cout << "dil_h: " << dil_h << std::endl;
  std::cout << "dil_w: " << dil_w << std::endl;
  std::cout << std::endl;

  cudnnConvolutionDescriptor_t conv_desc;
  CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
  CUDNN_CALL(cudnnSetConvolution2dDescriptor(
        conv_desc,
        pad_h, pad_w, str_h, str_w, dil_h, dil_w,
        CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

  // output
  int out_n;
  int out_c;
  int out_h;
  int out_w;

  CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, in_desc, filt_desc,
        &out_n, &out_c, &out_h, &out_w));

  std::cout << "out_n: " << out_n << std::endl;
  std::cout << "out_c: " << out_c << std::endl;
  std::cout << "out_h: " << out_h << std::endl;
  std::cout << "out_w: " << out_w << std::endl;
  std::cout << std::endl;

  cudnnTensorDescriptor_t out_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
        out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        out_n, out_c, out_h, out_w));

  float *out_data;
  CUDA_CALL(cudaMalloc(
        &out_data, out_n * out_c * out_h * out_w * sizeof(float)));

  // algorithm
  cudnnConvolutionFwdAlgo_t algo;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
        cudnn,
        in_desc, filt_desc, conv_desc, out_desc,
        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));

  std::cout << "Convolution algorithm: " << algo << std::endl;
  std::cout << std::endl;

  // workspace
  size_t ws_size;
  CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));

  float *ws_data;
  CUDA_CALL(cudaMalloc(&ws_data, ws_size));

  std::cout << "Workspace size: " << ws_size << std::endl;
  std::cout << std::endl;

  // perform
  float alpha = 1.f;
  float beta = 0.f;
  dev_iota<<<in_w * in_h, in_n * in_c>>>(in_data);
  dev_const<<<filt_w * filt_h, filt_k * filt_c>>>(filt_data, 1.f);
  CUDNN_CALL(cudnnConvolutionForward(
      cudnn,
      &alpha, in_desc, in_data, filt_desc, filt_data,
      conv_desc, algo, ws_data, ws_size,
      &beta, out_desc, out_data));

  // results
  std::cout << "in_data:" << std::endl;
  print(in_data, in_n, in_c, in_h, in_w);

  std::cout << "filt_data:" << std::endl;
  print(filt_data, filt_k, filt_c, filt_h, filt_w);

  std::cout << "out_data:" << std::endl;
  print(out_data, out_n, out_c, out_h, out_w);

  // finalizing
  CUDA_CALL(cudaFree(ws_data));
  CUDA_CALL(cudaFree(out_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
  CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
  CUDA_CALL(cudaFree(filt_data));
  CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
  CUDA_CALL(cudaFree(in_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
  CUDNN_CALL(cudnnDestroy(cudnn));
  return 0;
}
	#include <iomanip>
	#include <iostream>
	#include <cstdlib>
	#include <vector>

	#include <cuda.h>
	#include <cudnn.h>

	#define CUDA_CALL(f) { \
	cudaError_t err = (f); \
	if (err != cudaSuccess) { \
	std::cout \
	<< " Error occurred: " << err << std::endl; \
	std::exit(1); \
	} \
	}

	#define CUDNN_CALL(f) { \
	cudnnStatus_t err = (f); \
	if (err != CUDNN_STATUS_SUCCESS) { \
	std::cout \
	<< " Error occurred: " << err << std::endl; \
	std::exit(1); \
	} \
	}

	__global__ void dev_const(float *px, float k) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	px[tid] = k;
	}

	__global__ void dev_iota(float *px) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	px[tid] = tid;
	}

	void print(const float *data, int n, int c, int h, int w) {
	std::vector<float> buffer(1 << 20);
	CUDA_CALL(cudaMemcpy(
	buffer.data(), data,
	n * c * h * w * sizeof(float),
	cudaMemcpyDeviceToHost));
	int a = 0;
	for (int i = 0; i < n; ++i) {
	for (int j = 0; j < c; ++j) {
	std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
	for (int k = 0; k < h; ++k) {
	for (int l = 0; l < w; ++l) {
	std::cout << std::setw(4) << std::right << buffer[a];
	++a;
	}
	std::cout << std::endl;
	}
	}
	}
	std::cout << std::endl;
	}

	int main() {
	cudnnHandle_t cudnn;
	CUDNN_CALL(cudnnCreate(&cudnn));

	// input
	const int in_n = 1;
	const int in_c = 1;
	const int in_h = 5;
	const int in_w = 5;
	std::cout << "in_n: " << in_n << std::endl;
	std::cout << "in_c: " << in_c << std::endl;
	std::cout << "in_h: " << in_h << std::endl;
	std::cout << "in_w: " << in_w << std::endl;
	std::cout << std::endl;

	cudnnTensorDescriptor_t in_desc;
	CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
	CUDNN_CALL(cudnnSetTensor4dDescriptor(
	in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
	in_n, in_c, in_h, in_w));

	float *in_data;
	CUDA_CALL(cudaMalloc(
	&in_data, in_n * in_c * in_h * in_w * sizeof(float)));

	// filter
	const int filt_k = 1;
	const int filt_c = 1;
	const int filt_h = 2;
	const int filt_w = 2;
	std::cout << "filt_k: " << filt_k << std::endl;
	std::cout << "filt_c: " << filt_c << std::endl;
	std::cout << "filt_h: " << filt_h << std::endl;
	std::cout << "filt_w: " << filt_w << std::endl;
	std::cout << std::endl;

	cudnnFilterDescriptor_t filt_desc;
	CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
	CUDNN_CALL(cudnnSetFilter4dDescriptor(
	filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
	filt_k, filt_c, filt_h, filt_w));

	float *filt_data;
	CUDA_CALL(cudaMalloc(
	&filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));

	// convolution
	const int pad_h = 1;
	const int pad_w = 1;
	const int str_h = 1;
	const int str_w = 1;
	const int dil_h = 1;
	const int dil_w = 1;
	std::cout << "pad_h: " << pad_h << std::endl;
	std::cout << "pad_w: " << pad_w << std::endl;
	std::cout << "str_h: " << str_h << std::endl;
	std::cout << "str_w: " << str_w << std::endl;
	std::cout << "dil_h: " << dil_h << std::endl;
	std::cout << "dil_w: " << dil_w << std::endl;
	std::cout << std::endl;

	cudnnConvolutionDescriptor_t conv_desc;
	CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
	CUDNN_CALL(cudnnSetConvolution2dDescriptor(
	conv_desc,
	pad_h, pad_w, str_h, str_w, dil_h, dil_w,
	CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

	// output
	int out_n;
	int out_c;
	int out_h;
	int out_w;

	CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
	conv_desc, in_desc, filt_desc,
	&out_n, &out_c, &out_h, &out_w));

	std::cout << "out_n: " << out_n << std::endl;
	std::cout << "out_c: " << out_c << std::endl;
	std::cout << "out_h: " << out_h << std::endl;
	std::cout << "out_w: " << out_w << std::endl;
	std::cout << std::endl;

	cudnnTensorDescriptor_t out_desc;
	CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
	CUDNN_CALL(cudnnSetTensor4dDescriptor(
	out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
	out_n, out_c, out_h, out_w));

	float *out_data;
	CUDA_CALL(cudaMalloc(
	&out_data, out_n * out_c * out_h * out_w * sizeof(float)));

	// algorithm
	cudnnConvolutionFwdAlgo_t algo;
	CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
	cudnn,
	in_desc, filt_desc, conv_desc, out_desc,
	CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));

	std::cout << "Convolution algorithm: " << algo << std::endl;
	std::cout << std::endl;

	// workspace
	size_t ws_size;
	CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
	cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));

	float *ws_data;
	CUDA_CALL(cudaMalloc(&ws_data, ws_size));

	std::cout << "Workspace size: " << ws_size << std::endl;
	std::cout << std::endl;

	// perform
	float alpha = 1.f;
	float beta = 0.f;
	dev_iota<<<in_w * in_h, in_n * in_c>>>(in_data);
	dev_const<<<filt_w * filt_h, filt_k * filt_c>>>(filt_data, 1.f);
	CUDNN_CALL(cudnnConvolutionForward(
	cudnn,
	&alpha, in_desc, in_data, filt_desc, filt_data,
	conv_desc, algo, ws_data, ws_size,
	&beta, out_desc, out_data));

	// results
	std::cout << "in_data:" << std::endl;
	print(in_data, in_n, in_c, in_h, in_w);

	std::cout << "filt_data:" << std::endl;
	print(filt_data, filt_k, filt_c, filt_h, filt_w);

	std::cout << "out_data:" << std::endl;
	print(out_data, out_n, out_c, out_h, out_w);

	// finalizing
	CUDA_CALL(cudaFree(ws_data));
	CUDA_CALL(cudaFree(out_data));
	CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
	CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
	CUDA_CALL(cudaFree(filt_data));
	CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
	CUDA_CALL(cudaFree(in_data));
	CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
	CUDNN_CALL(cudnnDestroy(cudnn));
	return 0;
	}