Guide for merging accelerated convolution to Caffe.
====
*Yuanjun Xiong*
---
[TOC]
Adding this feature accelerates minibatch-based convolution in Caffe by unrolling several images at a time and feeding them to one large matrix multiplication instead of many small ones. Typically it brings about a 1.33x speedup.
# 0. The Full Change List
All changes are wrapped in [this commit](https://github.com/yjxiong/caffe/commit/9e8e069f11a3382bc34a595fe5bd2814f4f9993d) on GitHub. Please refer to that page for the full list of modified files.
# 1. Manual merging
The zip file attached to the email contains only the modified files.
Here is a list of the changes:
* In file `include/caffe/util/im2col.hpp`:
add the following function declarations
```
template <typename Dtype>
void bu_im2col_gpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_col, const int batch_size);

template <typename Dtype>
void bu_im2col_gpu_rot(const Dtype* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_col, const int batch_size);

template <typename Dtype>
void bu_col2im_gpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int psize, const int pad,
    const int stride, Dtype* data_im, const int batch_size);

template <typename Dtype>
void bu_col2im_gpu_rot(const Dtype* data_col, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_im, const int batch_size);

template <typename Dtype>
void cu_im2mat_gpu(const Dtype* data_im, const int channels,
    const int height, const int width, Dtype* data_mat,
    const int batch_size);
```
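These mirror stock Caffe's `im2col_gpu`/`col2im_gpu` with a trailing `batch_size` argument: one call now unrolls a whole group of images, so the column buffer must grow by the same factor. A small helper, purely for illustration (it is not part of the patch), computing the expected element count under the assumption that the batched buffer simply stacks `batch_size` single-image im2col buffers:
```
#include <cstddef>

// Element count of the batched column buffer written by bu_im2col_gpu,
// assuming it stacks batch_size single-image im2col buffers back to back.
inline std::size_t bu_col_buffer_count(int channels, int height, int width,
    int ksize, int pad, int stride, int batch_size) {
  // Standard Caffe output-size arithmetic for convolution.
  const int height_col = (height + 2 * pad - ksize) / stride + 1;
  const int width_col = (width + 2 * pad - ksize) / stride + 1;
  return static_cast<std::size_t>(batch_size) * channels * ksize * ksize *
      height_col * width_col;
}
```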
* In file `include/caffe/util/math_functions.hpp`
add the following function declaration (`geam` computes `C = alpha * op(A) + beta * op(B)`, mirroring the cuBLAS `geam` extension):
```
template <typename Dtype>
void caffe_gpu_geam(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N,
    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
    Dtype* C);
```
* In file `include/caffe/vision_layers.hpp`, in the declaration of class `ConvolutionLayer`, add the following members:
```
int mem_group_size;         // number of images unrolled per group
Blob<Dtype> bias_buffer_;   // scratch blob (presumably for batched bias addition)
Blob<Dtype> trans_buffer_;  // scratch blob (presumably for layout transposition)
```
* In file `src/caffe/layers/conv_layer.cpp`, change the implementation of the method
```
void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top)
```
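The guide does not reproduce the new `SetUp` body. As a rough sketch of the likely additions (an assumption on my part, not the patch's exact code), it reads the group size from the layer's prototxt parameter and enlarges the unrolling buffer by that factor:
```
// Sketch only. Inside ConvolutionLayer<Dtype>::SetUp, after the stock
// shape computation (height_out, width_out, etc.):

// Read the new field added to ConvolutionParameter (see the proto change
// below) and never exceed the minibatch size.
mem_group_size = this->layer_param_.convolution_param().mem_group_size();
if (mem_group_size > num_) {
  mem_group_size = num_;
}

// The column buffer now holds mem_group_size unrolled images at once;
// this particular Reshape layout is a guess for illustration.
col_buffer_.Reshape(mem_group_size,
    channels_ * kernel_size_ * kernel_size_, height_out, width_out);
```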
* In file `src/caffe/layers/conv_layer.cu`, rewrite the implementations of two methods
```
template <typename Dtype>
Dtype ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top)
```
and
```
template <typename Dtype>
void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const bool propagate_down, vector<Blob<Dtype>*>* bottom)
```
* In file `src/caffe/util/im2col.cu` add the following kernel functions
```
template <typename Dtype>
__global__ void bu_im2col_gpu_kernel(const int n, const Dtype* data_im,
    const int height, const int width, const int ksize, const int pad,
    const int stride, const int height_col, const int width_col,
    Dtype* data_col, const int data_im_size, const int data_col_size,
    const int batch_size)

template <typename Dtype>
__global__ void bu_im2col_gpu_kernel_rot(const int n, const Dtype* data_im,
    const int height, const int width, const int ksize, const int pad,
    const int stride, const int height_col, const int width_col,
    Dtype* data_col, const int data_im_size, const int data_col_size,
    const int batch_size)

template <typename Dtype>
__global__ void bu_col2im_gpu_kernel(const int n, const Dtype* data_col,
    const int height, const int width, const int channels, const int ksize,
    const int pad, const int stride, const int height_col, const int width_col,
    Dtype* data_im, const int batch_size)

template <typename Dtype>
__global__ void bu_col2im_gpu_rot_kernel(const int n, const Dtype* data_col,
    const int height, const int width, const int channels, const int ksize,
    const int pad, const int stride, const int height_col, const int width_col,
    Dtype* data_im, const int batch_size)

template <typename Dtype>
__global__ void cu_im2mat_gpu_kernel(const int n, const Dtype* data_im,
    const int height, const int width, const int channels,
    Dtype* data_mat, const int batch_size)
```
implement the functions
```
template <typename Dtype>
void bu_im2col_gpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_col, const int batch_size)

template <typename Dtype>
void bu_im2col_gpu_rot(const Dtype* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_col, const int batch_size)

template <typename Dtype>
void bu_col2im_gpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int psize, const int pad,
    const int stride, Dtype* data_im, const int batch_size)

template <typename Dtype>
void bu_col2im_gpu_rot(const Dtype* data_col, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, Dtype* data_im, const int batch_size)

template <typename Dtype>
void cu_im2mat_gpu(const Dtype* data_im, const int channels,
    const int height, const int width, Dtype* data_mat,
    const int batch_size)
```
and explicitly instantiate them
```
// Explicit instantiation
template void bu_im2col_gpu<float>(const float* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, float* data_col, const int batch_size);
template void bu_im2col_gpu<double>(const double* data_im, const int channels,
    const int height, const int width, const int ksize, const int pad,
    const int stride, double* data_col, const int batch_size);
```
(Repeat likewise for the other functions.)
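For intuition, here is a minimal sketch of how such a batched kernel can extend stock Caffe's `im2col_gpu_kernel`: each thread unrolls one output location of one input channel, exactly as in stock Caffe, and an extra inner loop walks the images of the group using the per-image strides `data_im_size` and `data_col_size`. The body below is my reconstruction for illustration only, not the patch's exact code (in particular, the `_rot` variants store a rotated layout that this sketch does not reproduce):
```
template <typename Dtype>
__global__ void bu_im2col_gpu_kernel(const int n, const Dtype* data_im,
    const int height, const int width, const int ksize, const int pad,
    const int stride, const int height_col, const int width_col,
    Dtype* data_col, const int data_im_size, const int data_col_size,
    const int batch_size) {
  // CUDA_KERNEL_LOOP is Caffe's standard grid-stride loop macro;
  // n = channels * height_col * width_col, i.e. one thread per output
  // location per channel, shared by all images of the group.
  CUDA_KERNEL_LOOP(index, n) {
    const int w_out = index % width_col;
    const int h_out = (index / width_col) % height_col;
    const int channel_in = index / width_col / height_col;
    const int channel_out = channel_in * ksize * ksize;
    const int h_in = h_out * stride - pad;
    const int w_in = w_out * stride - pad;
    for (int b = 0; b < batch_size; ++b) {
      // Shift to image b of the group via the per-image buffer sizes.
      const Dtype* im = data_im + b * data_im_size
          + (channel_in * height + h_in) * width + w_in;
      Dtype* col = data_col + b * data_col_size
          + (channel_out * height_col + h_out) * width_col + w_out;
      // Copy the ksize x ksize patch, zero-padding out-of-range pixels.
      for (int i = 0; i < ksize; ++i) {
        for (int j = 0; j < ksize; ++j) {
          const int h = h_in + i;
          const int w = w_in + j;
          *col = (h >= 0 && w >= 0 && h < height && w < width) ?
              im[i * width + j] : 0;
          col += height_col * width_col;
        }
      }
    }
  }
}
```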
* In file `src/caffe/util/math_functions.cpp` implement and explicitly instantiate the two template specializations (a hedged sketch of one possible implementation follows):
```
template <>
void caffe_gpu_geam<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N,
    const float alpha, const float* A, const float* B, const float beta,
    float* C);
```
```
template <>
void caffe_gpu_geam<double>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N,
    const double alpha, const double* A, const double* B, const double beta,
    double* C);
```
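A minimal sketch of the `float` specialization, assuming it wraps cuBLAS's `cublasSgeam` (which computes `C = alpha * op(A) + beta * op(B)`) the same way Caffe's `caffe_gpu_gemm` wraps `cublasSgemm`, i.e. by swapping dimensions to bridge row-major and column-major storage. This is an illustration, not the patch's exact code:
```
#include <cublas_v2.h>

// Sketch only: a row-major M x N matrix is a column-major N x M matrix,
// so swap the dimension arguments and adjust the leading dimensions.
template <>
void caffe_gpu_geam<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N,
    const float alpha, const float* A, const float* B, const float beta,
    float* C) {
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  // Leading dimensions of the column-major views of A and B.
  const int lda = (TransA == CblasNoTrans) ? N : M;
  const int ldb = (TransB == CblasNoTrans) ? N : M;
  // cublasSgeam computes C = alpha * op(A) + beta * op(B).
  CUBLAS_CHECK(cublasSgeam(Caffe::cublas_handle(), cuTransA, cuTransB,
      N, M, &alpha, A, lda, &beta, B, ldb, C, N));
}
```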
* Finally, change the protobuf definition `src/caffe/proto/caffe.proto`.
In message `ConvolutionParameter` add one field
```
optional uint32 mem_group_size = 9 [default = 1];
```
If field number 9 is already occupied, just assign any unused number instead.
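Once the field exists, the group size can be chosen per layer in the network definition. A hypothetical example in the 2014-era prototxt syntax (layer name and values are illustrative only):
```
layers {
  name: "conv1"
  type: CONVOLUTION
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    # hypothetical value: larger groups trade GPU memory for speed
    mem_group_size: 4
  }
}
```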
# 2. Auto patching with Git
If your code is tracked by Git, you can download the Git patch I generated: [changes.patch](https://dl.dropboxusercontent.com/u/101055055/changes.patch).
Assuming you are on the `master` branch, run the following in your repo
```
git commit -a -m "before patching"
git checkout -b patching_batch
```
to create a temporary branch for patching. Then run
```
git am -3 < changes.patch
```
to apply the patch. The `-3` flag requests a three-way merge, which preserves your own changes while bringing in the new feature from the patch.
If no conflicts are detected, run
```
git checkout master
git merge patching_batch
```
to merge the changes into the `master` branch.
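If the three-way merge does stop on conflicts, the standard `git am` recovery steps apply (these are ordinary Git commands, not part of the original guide):
```
git status              # list the conflicted files
# edit the files to resolve the conflicts, then
git add <resolved-files>
git am --continue
# or abandon the patch and return to the pre-patch state
git am --abort
```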