Created
June 26, 2016 08:28
-
-
Save chungexcy/415ba6a87ec9fb1bb54f501f51f92e2e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifdef USE_CUDNN | |
#include <algorithm> | |
#include <vector> | |
#include "caffe/layers/cudnn_conv_layer.hpp" | |
namespace caffe { | |
cudnnConvolutionFwdAlgo_t | |
GetCuDNNFwdAlgo(ConvolutionParameter_CuDNNFwdAlgorithm algo) { | |
switch (algo) { | |
case ConvolutionParameter_CuDNNFwdAlgorithm_IMPLICIT_GEMM: | |
return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; | |
case ConvolutionParameter_CuDNNFwdAlgorithm_IMPLICIT_PRECOMP_GEMM: | |
return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; | |
case ConvolutionParameter_CuDNNFwdAlgorithm_GEMM: | |
return CUDNN_CONVOLUTION_FWD_ALGO_GEMM; | |
case ConvolutionParameter_CuDNNFwdAlgorithm_DIRECT: | |
return CUDNN_CONVOLUTION_FWD_ALGO_DIRECT; | |
case ConvolutionParameter_CuDNNFwdAlgorithm_FFT: | |
return CUDNN_CONVOLUTION_FWD_ALGO_FFT; | |
case ConvolutionParameter_CuDNNFwdAlgorithm_FFT_TILING: | |
return CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; | |
#if CUDNN_VERSION_MIN(5, 0, 0) | |
case ConvolutionParameter_CuDNNFwdAlgorithm_WINOGRAD: | |
return CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; | |
#endif | |
default: | |
return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; | |
} | |
} | |
cudnnConvolutionBwdDataAlgo_t | |
GetCuDNNBwdDataAlgo(ConvolutionParameter_CuDNNBwdDataAlgorithm algo) { | |
switch (algo) { | |
case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_ALGO_0: | |
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; | |
case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_ALGO_1: | |
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; | |
case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_FFT: | |
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT; | |
case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_FFT_TILING: | |
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING; | |
#if CUDNN_VERSION_MIN(5, 0, 0) | |
case ConvolutionParameter_CuDNNBwdDataAlgorithm_BWD_DATA_WINOGRAD: | |
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD; | |
#endif | |
default: | |
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; | |
} | |
} | |
cudnnConvolutionBwdFilterAlgo_t | |
GetCuDNNBwdFilterAlgo(ConvolutionParameter_CuDNNBwdFilterAlgorithm algo) { | |
switch (algo) { | |
case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_0: | |
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; | |
case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_1: | |
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; | |
case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_FFT: | |
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT; | |
case ConvolutionParameter_CuDNNBwdFilterAlgorithm_BWD_FILTER_ALGO_3: | |
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3; | |
default: | |
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; | |
} | |
} | |
// Set to three for the benefit of the backward pass, which | |
// can use separate streams for calculating the gradient w.r.t. | |
// bias, filter weights, and bottom data for each group independently | |
#define CUDNN_STREAMS_PER_GROUP 3 | |
/** | |
* TODO(dox) explain cuDNN interface | |
*/ | |
template <typename Dtype> | |
void CuDNNConvolutionLayer<Dtype>::LayerSetUp( | |
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { | |
ConvolutionLayer<Dtype>::LayerSetUp(bottom, top); | |
// Initialize CUDA streams and cuDNN. | |
stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; | |
handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; | |
// Initialize algorithm arrays | |
fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()]; | |
bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()]; | |
bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()]; | |
// initialize size arrays | |
workspace_fwd_sizes_ = new size_t[bottom.size()]; | |
workspace_bwd_filter_sizes_ = new size_t[bottom.size()]; | |
workspace_bwd_data_sizes_ = new size_t[bottom.size()]; | |
// workspace data | |
workspaceSizeInBytes = 0; | |
workspaceData = NULL; | |
workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP]; | |
for (size_t i = 0; i < bottom.size(); ++i) { | |
// initialize all to default algorithms | |
fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)6; | |
bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0; | |
bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0; | |
// default algorithms don't require workspace | |
workspace_fwd_sizes_[i] = 0; | |
workspace_bwd_data_sizes_[i] = 0; | |
workspace_bwd_filter_sizes_[i] = 0; | |
} | |
//printf("ok\n"); | |
for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { | |
CUDA_CHECK(cudaStreamCreate(&stream_[g])); | |
CUDNN_CHECK(cudnnCreate(&handle_[g])); | |
CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); | |
workspace[g] = NULL; | |
} | |
// Set the indexing parameters. | |
bias_offset_ = (this->num_output_ / this->group_); | |
// Create filter descriptor. | |
const int* kernel_shape_data = this->kernel_shape_.cpu_data(); | |
const int kernel_h = kernel_shape_data[0]; | |
const int kernel_w = kernel_shape_data[1]; | |
cudnn::createFilterDesc<Dtype>(&filter_desc_, | |
this->num_output_ / this->group_, this->channels_ / this->group_, | |
kernel_h, kernel_w); | |
// Create tensor descriptor(s) for data and corresponding convolution(s). | |
for (int i = 0; i < bottom.size(); i++) { | |
cudnnTensorDescriptor_t bottom_desc; | |
cudnn::createTensor4dDesc<Dtype>(&bottom_desc); | |
bottom_descs_.push_back(bottom_desc); | |
cudnnTensorDescriptor_t top_desc; | |
cudnn::createTensor4dDesc<Dtype>(&top_desc); | |
top_descs_.push_back(top_desc); | |
cudnnConvolutionDescriptor_t conv_desc; | |
cudnn::createConvolutionDesc<Dtype>(&conv_desc); | |
conv_descs_.push_back(conv_desc); | |
} | |
// Tensor descriptor for bias. | |
if (this->bias_term_) { | |
cudnn::createTensor4dDesc<Dtype>(&bias_desc_); | |
} | |
handles_setup_ = true; | |
} | |
template <typename Dtype> | |
void CuDNNConvolutionLayer<Dtype>::Reshape( | |
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { | |
ConvolutionLayer<Dtype>::Reshape(bottom, top); | |
CHECK_EQ(2, this->num_spatial_axes_) | |
<< "CuDNNConvolution input must have 2 spatial axes " | |
<< "(e.g., height and width). " | |
<< "Use 'engine: CAFFE' for general ND convolution."; | |
bottom_offset_ = this->bottom_dim_ / this->group_; | |
top_offset_ = this->top_dim_ / this->group_; | |
const int height = bottom[0]->shape(this->channel_axis_ + 1); | |
const int width = bottom[0]->shape(this->channel_axis_ + 2); | |
const int height_out = top[0]->shape(this->channel_axis_ + 1); | |
const int width_out = top[0]->shape(this->channel_axis_ + 2); | |
const int* pad_data = this->pad_.cpu_data(); | |
const int pad_h = pad_data[0]; | |
const int pad_w = pad_data[1]; | |
const int* stride_data = this->stride_.cpu_data(); | |
const int stride_h = stride_data[0]; | |
const int stride_w = stride_data[1]; | |
// Specify workspace limit for kernels directly until we have a | |
// planning strategy and a rewrite of Caffe's GPU memory mangagement | |
size_t workspace_limit_bytes = 8*1024*1024; | |
for (int i = 0; i < bottom.size(); i++) { | |
cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i], | |
this->num_, | |
this->channels_ / this->group_, height, width, | |
this->channels_ * height * width, | |
height * width, width, 1); | |
cudnn::setTensor4dDesc<Dtype>(&top_descs_[i], | |
this->num_, | |
this->num_output_ / this->group_, height_out, width_out, | |
this->num_output_ * this->out_spatial_dim_, | |
this->out_spatial_dim_, width_out, 1); | |
cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i], | |
filter_desc_, pad_h, pad_w, | |
stride_h, stride_w); | |
cudnnConvolutionFwdAlgoPerf_t perfResults[1]; | |
int innt[1]; | |
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(handle_[0], | |
bottom_descs_[i], | |
filter_desc_, | |
conv_descs_[i], | |
top_descs_[i], | |
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, | |
innt, | |
perfResults)); | |
fwd_algo_[i]=perfResults[0].algo; | |
printf("Autotuning SC Forward: Time: %3.5f Memory: %8d Algorithm: %d\n", | |
float(perfResults[0].time), int(perfResults[0].memory), int(perfResults[0].algo)); | |
/* | |
if (!this->layer_param_.convolution_param().has_cudnnfwdalgo()) { | |
// choose forward and backward algorithms + workspace(s) | |
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0], | |
bottom_descs_[i], | |
filter_desc_, | |
conv_descs_[i], | |
top_descs_[i], | |
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, | |
workspace_limit_bytes, | |
&fwd_algo_[i])); | |
} else { | |
fwd_algo_[i] = GetCuDNNFwdAlgo( | |
this->layer_param_.convolution_param().cudnnfwdalgo()); | |
}*/ | |
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0], | |
bottom_descs_[i], | |
filter_desc_, | |
conv_descs_[i], | |
top_descs_[i], | |
fwd_algo_[i], | |
&(workspace_fwd_sizes_[i]))); | |
// choose backward algorithm for filter | |
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0], | |
bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, | |
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, | |
workspace_limit_bytes, &bwd_filter_algo_[i]) ); | |
// get workspace for backwards filter algorithm | |
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0], | |
bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, | |
bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i])); | |
// choose backward algo for data | |
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0], | |
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], | |
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, | |
workspace_limit_bytes, &bwd_data_algo_[i])); | |
// get workspace size | |
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0], | |
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], | |
bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); | |
} | |
// reduce over all workspace sizes to get a maximum to allocate / reallocate | |
size_t total_workspace_fwd = 0; | |
size_t total_workspace_bwd_data = 0; | |
size_t total_workspace_bwd_filter = 0; | |
for (size_t i = 0; i < bottom.size(); i++) { | |
total_workspace_fwd = std::max(total_workspace_fwd, | |
workspace_fwd_sizes_[i]); | |
total_workspace_bwd_data = std::max(total_workspace_bwd_data, | |
workspace_bwd_data_sizes_[i]); | |
total_workspace_bwd_filter = std::max(total_workspace_bwd_filter, | |
workspace_bwd_filter_sizes_[i]); | |
} | |
// get max over all operations | |
size_t max_workspace = std::max(total_workspace_fwd, | |
total_workspace_bwd_data); | |
max_workspace = std::max(max_workspace, total_workspace_bwd_filter); | |
// ensure all groups have enough workspace | |
size_t total_max_workspace = max_workspace * | |
(this->group_ * CUDNN_STREAMS_PER_GROUP); | |
// this is the total amount of storage needed over all groups + streams | |
if (total_max_workspace > workspaceSizeInBytes) { | |
DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; | |
workspaceSizeInBytes = total_max_workspace; | |
// free the existing workspace and allocate a new (larger) one | |
cudaFree(this->workspaceData); | |
cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes); | |
if (err != cudaSuccess) { | |
// force zero memory path | |
for (int i = 0; i < bottom.size(); i++) { | |
workspace_fwd_sizes_[i] = 0; | |
workspace_bwd_filter_sizes_[i] = 0; | |
workspace_bwd_data_sizes_[i] = 0; | |
fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; | |
bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; | |
bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; | |
} | |
// NULL out all workspace pointers | |
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { | |
workspace[g] = NULL; | |
} | |
// NULL out underlying data | |
workspaceData = NULL; | |
workspaceSizeInBytes = 0; | |
} | |
// if we succeed in the allocation, set pointer aliases for workspaces | |
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { | |
workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace; | |
} | |
} | |
// Tensor descriptor for bias. | |
if (this->bias_term_) { | |
cudnn::setTensor4dDesc<Dtype>(&bias_desc_, | |
1, this->num_output_ / this->group_, 1, 1); | |
} | |
} | |
template <typename Dtype> | |
CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() { | |
// Check that handles have been setup before destroying. | |
if (!handles_setup_) { return; } | |
for (int i = 0; i < bottom_descs_.size(); i++) { | |
cudnnDestroyTensorDescriptor(bottom_descs_[i]); | |
cudnnDestroyTensorDescriptor(top_descs_[i]); | |
cudnnDestroyConvolutionDescriptor(conv_descs_[i]); | |
} | |
if (this->bias_term_) { | |
cudnnDestroyTensorDescriptor(bias_desc_); | |
} | |
cudnnDestroyFilterDescriptor(filter_desc_); | |
for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { | |
cudaStreamDestroy(stream_[g]); | |
cudnnDestroy(handle_[g]); | |
} | |
cudaFree(workspaceData); | |
delete [] stream_; | |
delete [] handle_; | |
delete [] fwd_algo_; | |
delete [] bwd_filter_algo_; | |
delete [] bwd_data_algo_; | |
delete [] workspace_fwd_sizes_; | |
delete [] workspace_bwd_data_sizes_; | |
delete [] workspace_bwd_filter_sizes_; | |
} | |
INSTANTIATE_CLASS(CuDNNConvolutionLayer); | |
} // namespace caffe | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment