vbkaisetsu/col2im_gpu_caffe.cc

## col2im_gpu_caffe.cc
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD

#include <CL/cl2.hpp>

// Returns the OpenCL platforms that cl::Platform::get reports on this machine.
std::vector<cl::Platform> get_all_platforms() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  return platforms;
}

// Returns every device (CL_DEVICE_TYPE_ALL) of the platform at index
// `platform_id` in the list produced by get_all_platforms().
//
// Throws std::out_of_range when `platform_id` does not name an existing
// platform, rather than indexing past the end of the platform list.
std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) {
  const auto all_pfs = get_all_platforms();
  std::vector<cl::Device> ret;
  // at() is bounds-checked; operator[] would be undefined behaviour on a
  // machine that exposes fewer platforms than requested.
  all_pfs.at(platform_id).getDevices(CL_DEVICE_TYPE_ALL, &ret);
  return ret;
}

// Returns the device at index `device_id` of the platform at index
// `platform_id`.
//
// Throws std::out_of_range when `device_id` does not name an existing device
// of that platform, rather than indexing past the end of the device list.
cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
  const auto all_devs = get_all_devices(platform_id);
  // Bounds-checked access: a platform may expose no devices at all.
  return all_devs.at(device_id);
}

// Returns ceil(size / num_threads): how many groups of `num_threads` items
// are needed to cover `size` items.
//
// Written as quotient plus a remainder correction so the result is right for
// every `size`; the form `(size + num_threads - 1) / num_threads` wraps around
// when `size` is close to UINT32_MAX. `num_threads` must be non-zero.
std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
  return size / num_threads + (size % num_threads != 0 ? 1u : 0u);
}

// Binds the convolution geometry and both buffers to `kernel` (arguments
// 0..16, in the order of the kernel's parameter list) and enqueues it on
// `queue`.
//
// The launch uses 16x16 work-groups. The global range is `width` by
// `height * channels`, each rounded up to a whole number of work-groups.
void col2im(
    cl::Kernel &kernel, cl::CommandQueue &queue,
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
    cl::Buffer &im_buffer, const std::uint32_t im_offset) {
  // Number of kernel positions along one axis. Yields 1 when the dilated
  // kernel does not fit into the padded image.
  const auto positions_along = [](std::uint32_t extent, std::uint32_t pad,
                                  std::uint32_t taps, std::uint32_t dilation,
                                  std::uint32_t stride) -> std::uint32_t {
    const std::uint32_t padded = extent + 2 * pad;
    const std::uint32_t footprint = dilation * (taps - 1) + 1;
    return (padded >= footprint) ? (padded - footprint) / stride + 1 : 1;
  };
  const std::uint32_t output_h = positions_along(height, pad_h, kernel_h, dilation_h, stride_h);
  const std::uint32_t output_w = positions_along(width, pad_w, kernel_w, dilation_w, stride_w);

  const std::uint32_t groups_x = calc_num_blocks(width, 16);
  const std::uint32_t groups_y = calc_num_blocks(height * channels, 16);

  // Image and col geometry.
  kernel.setArg(0, height);
  kernel.setArg(1, width);
  kernel.setArg(2, channels);
  kernel.setArg(3, output_h);
  kernel.setArg(4, output_w);
  // Convolution parameters.
  kernel.setArg(5, kernel_h);
  kernel.setArg(6, kernel_w);
  kernel.setArg(7, pad_h);
  kernel.setArg(8, pad_w);
  kernel.setArg(9, stride_h);
  kernel.setArg(10, stride_w);
  kernel.setArg(11, dilation_h);
  kernel.setArg(12, dilation_w);
  // Source and destination buffers with their element offsets.
  kernel.setArg(13, col_buffer);
  kernel.setArg(14, col_offset);
  kernel.setArg(15, im_buffer);
  kernel.setArg(16, im_offset);

  const cl::NDRange global_range(groups_x * 16, groups_y * 16, 1);
  const cl::NDRange local_range(16, 16, 1);
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range);
}

/*
 * OpenCL C source of the col2im kernel, written in reference to caffe:
 * https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
 *
 * One work-item handles one image pixel: global id 0 selects the column,
 * global id 1 encodes row and channel (row = id % input_h,
 * channel = id / input_h). Inside the kernel x_x / x_y are the pixel's
 * coordinates in the padded image. Work-items whose column or channel falls
 * outside the image do nothing.
 *
 * Each active work-item sums the col_buffer entries of every
 * (output position, kernel tap) pair that maps onto its pixel and stores the
 * sum into im_buffer with a plain assignment, so the previous content of
 * that element is overwritten.
 *
 * The parameter order must match the setArg() indices used by col2im().
 */

std::string kernel_string = R"EOS(
kernel void col2im_kernel(
    const int input_h, const int input_w, const int channels,
    const int output_h, const int output_w,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const global float *col_buffer, const int col_offset,
    __global float *im_buffer, const int im_offset) {

  const int x_x = get_global_id(0) + pad_w;
  const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
  const int channel = ((int) get_global_id(1)) / input_h;

  const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
  const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;

  const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
  const int x_channel_shift = channel * input_h * input_w + im_offset;

  const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
  const int t_y_end = min(x_y / stride_h + 1, output_h);
  const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
  const int t_x_end = min(x_x / stride_w + 1, output_w);

  if (x_x < input_w + pad_w && channel < channels) {

    float val = 0;

    for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
      for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
        int w_y = x_y - t_y * stride_h;
        int w_x = x_x - t_x * stride_w;
        if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
          w_y /= dilation_h;
          w_x /= dilation_w;
          val += col_buffer[col_channel_shift
                              + (w_x + w_y * kernel_w) * output_h * output_w
                              + t_y * output_w
                              + t_x];
        }
      }
    }
    im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
  }
}
)EOS";

int main() {
  cl::Device device(get_device(0, 0));
  cl::Context context({ device });
  cl::CommandQueue queue(context, device, 0);

  cl::Program program(context, kernel_string);

  try {
    program.build({device});
  } catch (...) {
    std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
    return 1;
  }

  cl::Kernel col2im_kernel(program, "col2im_kernel");

  const std::uint32_t im_height = 512;
  const std::uint32_t im_width = 512;
  const std::uint32_t w_height = 16;
  const std::uint32_t w_width = 16;
  const std::uint32_t channels = 1;

  const std::uint32_t pad_h = 8;
  const std::uint32_t pad_w = 8;
  const std::uint32_t stride_h = 1;
  const std::uint32_t stride_w = 1;
  const std::uint32_t dilation_h = 1;
  const std::uint32_t dilation_w = 1;

  const std::uint32_t n_iters = 500;

  std::cout << "col2im (GPU caffe)" <<std::endl;
  std::cout << "=====================" << std::endl;
  std::cout << "im_height:  " << im_height << std::endl;
  std::cout << "im_width:   " << im_width << std::endl;
  std::cout << "w_height:   " << w_height << std::endl;
  std::cout << "w_width:    " << w_width << std::endl;
  std::cout << "channels:   " << channels << std::endl;
  std::cout << "pad_h:      " << pad_h << std::endl;
  std::cout << "pad_w:      " << pad_w << std::endl;
  std::cout << "stride_h:   " << stride_h << std::endl;
  std::cout << "stride_w:   " << stride_w << std::endl;
  std::cout << "dilation_h: " << dilation_h << std::endl;
  std::cout << "dilation_w: " << dilation_w << std::endl;
  std::cout << "n_iters:    " << n_iters << std::endl;
  std::cout << "=====================" << std::endl;

  const std::uint32_t x0 = im_height + 2 * pad_h;
  const std::uint32_t x1 = im_width + 2 * pad_w;
  const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
  const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;

  const std::uint32_t im_size = im_height * im_width * channels;
  const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;

  std::vector<float> im(im_size);
  std::vector<float> col(col_size);

  std::mt19937 rng(12345);
  std::uniform_real_distribution<float> dist(-1, 1);
  auto gen = [&dist, &rng]() {
    return dist(rng);
  };
  std::generate(col.begin(), col.end(), gen);

  cl::Buffer col_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr);
  cl::Buffer im_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr);

  float *mapped_col_buffer = static_cast<float *>(
    queue.enqueueMapBuffer(
      col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0));
  std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size);
  queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer);

  auto start = std::chrono::system_clock::now();
  for (auto i = 0; i < n_iters; ++i) {
    queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size);
    col2im(col2im_kernel, queue,
      channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0);
  }
  auto end = std::chrono::system_clock::now();

  float *mapped_im_buffer = static_cast<float *>(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0));
  std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size);
  queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer);

  std::size_t hash_val = 0;
  std::hash<float> h;
  for (auto v : im) {
    hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
  }

  auto diff = end - start;
  std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl;
  std::cout << "Hash = " << hash_val <<std::endl;

//   for (auto x : im) {
//     std::cout << x << ", ";
//   }
//   std::cout << std::endl;

  return 0;
}

## col2im_gpu_gcdlcm.cc
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD

#include <CL/cl2.hpp>

// Returns the OpenCL platforms that cl::Platform::get reports on this machine.
std::vector<cl::Platform> get_all_platforms() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  return platforms;
}

// Returns every device (CL_DEVICE_TYPE_ALL) of the platform at index
// `platform_id` in the list produced by get_all_platforms().
//
// Throws std::out_of_range when `platform_id` does not name an existing
// platform, rather than indexing past the end of the platform list.
std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) {
  const auto all_pfs = get_all_platforms();
  std::vector<cl::Device> ret;
  // at() is bounds-checked; operator[] would be undefined behaviour on a
  // machine that exposes fewer platforms than requested.
  all_pfs.at(platform_id).getDevices(CL_DEVICE_TYPE_ALL, &ret);
  return ret;
}

// Returns the device at index `device_id` of the platform at index
// `platform_id`.
//
// Throws std::out_of_range when `device_id` does not name an existing device
// of that platform, rather than indexing past the end of the device list.
cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
  const auto all_devs = get_all_devices(platform_id);
  // Bounds-checked access: a platform may expose no devices at all.
  return all_devs.at(device_id);
}

// Returns ceil(size / num_threads): how many groups of `num_threads` items
// are needed to cover `size` items.
//
// Written as quotient plus a remainder correction so the result is right for
// every `size`; the form `(size + num_threads - 1) / num_threads` wraps around
// when `size` is close to UINT32_MAX. `num_threads` must be non-zero.
std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
  return size / num_threads + (size % num_threads != 0 ? 1u : 0u);
}

// Extended Euclidean algorithm.
//
// On return `r` is the gcd of `a` and `b`, and `p`, `q` are Bezout
// coefficients satisfying p * a + q * b == r.
//
// `b == 0` is handled explicitly (gcd(a, 0) == a, with p = 1, q = 0) so the
// first `a % b` can never divide by zero.
void euclid_gcd(std::int32_t a, std::int32_t b, std::int32_t &p, std::int32_t &q, std::int32_t &r) {
  if (b == 0) {
    p = 1;
    q = 0;
    r = a;
    return;
  }

  // (p, q) are the coefficients of the current `b`, (p_prev, q_prev) those of
  // the current `a`, both expressed in terms of the original inputs.
  p = 0;
  q = 1;
  std::int32_t p_prev = 1;
  std::int32_t q_prev = 0;
  for (;;) {
    const std::int32_t quotient = a / b;
    const std::int32_t remainder = a % b;
    if (remainder == 0) {
      break;
    }
    // remainder = a - quotient * b, so its coefficients follow the same rule.
    const std::int32_t p_next = p_prev - p * quotient;
    const std::int32_t q_next = q_prev - q * quotient;
    p_prev = p;
    q_prev = q;
    p = p_next;
    q = q_next;
    a = b;
    b = remainder;
  }
  r = b;
}

// Binds the convolution geometry, the per-axis Bezout coefficients / gcd of
// (stride, dilation) and both buffers to `kernel` (arguments 0..22, in the
// order of the kernel's parameter list) and enqueues it on `queue`.
//
// The launch uses 16x16 work-groups. The global range covers
// (width - 1) / gcd_w + 1 columns by ((height - 1) / gcd_h + 1) * channels
// rows, each rounded up to a whole number of work-groups.
void col2im(
    cl::Kernel &kernel, cl::CommandQueue &queue,
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
    cl::Buffer &im_buffer, const std::uint32_t im_offset) {

  // Vertical axis: coefficients with bezout_stride * stride +
  // bezout_dilation * dilation == common.
  std::int32_t bezout_stride_h = 0;
  std::int32_t bezout_dilation_h = 0;
  std::int32_t common_h = 0;
  euclid_gcd(stride_h, dilation_h, bezout_stride_h, bezout_dilation_h, common_h);

  // Horizontal axis.
  std::int32_t bezout_stride_w = 0;
  std::int32_t bezout_dilation_w = 0;
  std::int32_t common_w = 0;
  euclid_gcd(stride_w, dilation_w, bezout_stride_w, bezout_dilation_w, common_w);

  // Number of kernel positions along one axis. Yields 1 when the dilated
  // kernel does not fit into the padded image.
  const auto positions_along = [](std::uint32_t extent, std::uint32_t pad,
                                  std::uint32_t taps, std::uint32_t dilation,
                                  std::uint32_t stride) -> std::uint32_t {
    const std::uint32_t padded = extent + 2 * pad;
    const std::uint32_t footprint = dilation * (taps - 1) + 1;
    return (padded >= footprint) ? (padded - footprint) / stride + 1 : 1;
  };
  const std::uint32_t output_h = positions_along(height, pad_h, kernel_h, dilation_h, stride_h);
  const std::uint32_t output_w = positions_along(width, pad_w, kernel_w, dilation_w, stride_w);

  const std::uint32_t groups_x = calc_num_blocks((width - 1) / common_w + 1, 16);
  const std::uint32_t groups_y = calc_num_blocks(((height - 1) / common_h + 1) * channels, 16);

  // Image and col geometry.
  kernel.setArg(0, height);
  kernel.setArg(1, width);
  kernel.setArg(2, channels);
  kernel.setArg(3, output_h);
  kernel.setArg(4, output_w);
  // Convolution parameters.
  kernel.setArg(5, kernel_h);
  kernel.setArg(6, kernel_w);
  kernel.setArg(7, pad_h);
  kernel.setArg(8, pad_w);
  kernel.setArg(9, stride_h);
  kernel.setArg(10, stride_w);
  kernel.setArg(11, dilation_h);
  kernel.setArg(12, dilation_w);
  // Bezout coefficients and gcds computed above.
  kernel.setArg(13, bezout_stride_h);
  kernel.setArg(14, bezout_stride_w);
  kernel.setArg(15, bezout_dilation_h);
  kernel.setArg(16, bezout_dilation_w);
  kernel.setArg(17, common_h);
  kernel.setArg(18, common_w);
  // Source and destination buffers with their element offsets.
  kernel.setArg(19, col_buffer);
  kernel.setArg(20, col_offset);
  kernel.setArg(21, im_buffer);
  kernel.setArg(22, im_offset);

  const cl::NDRange global_range(groups_x * 16, groups_y * 16, 1);
  const cl::NDRange local_range(16, 16, 1);
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range);
}

/*
 * OpenCL C source of the GCD-based col2im kernel.
 *
 * Work-items are laid out on the grid of padded image coordinates that are
 * multiples of gcd(stride, dilation): the padded coordinate handled along an
 * axis is gcd_scale_* * gcd_*. Global id 0 selects the grid column, global
 * id 1 encodes grid row and channel. Pixels that are not on this grid are
 * never written by the kernel.
 *
 * For a grid pixel, the Bezout coefficients passed by the host
 * (stride_bez_*, dilation_bez_*) give one (output position, kernel tap) pair
 * that maps onto it; the loops enumerate the remaining pairs by stepping a
 * parameter t in increments of stride * dilation / gcd. The accumulated sum
 * is stored into im_buffer with a plain assignment.
 *
 * The parameter order must match the setArg() indices used by col2im().
 */
std::string kernel_string = R"EOS(
// Rounds x up to the next multiple of step (step > 0), also for negative x.
inline int grid_ceil(const int x, const int step) {
  return x > 0 ? ((x - 1) / step + 1) * step : x / step * step;
}

kernel void col2im_kernel(
    const int input_h, const int input_w, const int channels,
    const int output_h, const int output_w,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int stride_bez_h, const int stride_bez_w,
    const int dilation_bez_h, const int dilation_bez_w,
    const int gcd_h, const int gcd_w,
    const global float *col_buffer, const int col_offset,
    __global float *im_buffer, const int im_offset) {

  const int input_h_scale = (input_h - 1) / gcd_h + 1;
  // ceil(pad / gcd) is the first grid index inside the image. The rounding
  // form used here is also correct for pad == 0, where (pad - 1) / gcd + 1
  // would skip row/column 0 whenever gcd > 1.
  const int gcd_scale_w = get_global_id(0) + (pad_w + gcd_w - 1) / gcd_w;
  const int gcd_scale_h = ((int) get_global_id(1)) % input_h_scale + (pad_h + gcd_h - 1) / gcd_h;
  const int x_x = gcd_scale_w * gcd_w - pad_w;
  const int x_y = gcd_scale_h * gcd_h - pad_h;
  const int channel = ((int) get_global_id(1)) / input_h_scale;

  const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
  const int x_channel_shift = channel * input_h * input_w + im_offset;

  const int t_y_step = stride_h * dilation_h / gcd_h;
  const int t_y_begin = grid_ceil(max(-stride_bez_h * gcd_scale_h * stride_h,
                                      (dilation_bez_h * gcd_scale_h - kernel_h + 1) * dilation_h),
                                  t_y_step);
  const int t_y_end = min((output_h - stride_bez_h * gcd_scale_h) * stride_h,
                          (dilation_bez_h * gcd_scale_h + 1) * dilation_h);
  const int t_x_step = stride_w * dilation_w / gcd_w;
  const int t_x_begin = grid_ceil(max(-stride_bez_w * gcd_scale_w * stride_w,
                                      (dilation_bez_w * gcd_scale_w - kernel_w + 1) * dilation_w),
                                  t_x_step);
  const int t_x_end = min((output_w - stride_bez_w * gcd_scale_w) * stride_w,
                          (dilation_bez_w * gcd_scale_w + 1) * dilation_w);
  // The last grid row can lie past the image when gcd_h > 1, so the row needs
  // the same bound check as the column; otherwise the store below would land
  // in the next channel or beyond the buffer.
  if (x_x < input_w && x_y < input_h && channel < channels) {
    float val = 0;
    for (int t_y = t_y_begin; t_y < t_y_end; t_y += t_y_step) {
      for (int t_x = t_x_begin; t_x < t_x_end; t_x += t_x_step) {
        const int w_y = -t_y / dilation_h + dilation_bez_h * gcd_scale_h;
        const int y_y = t_y / stride_h + stride_bez_h * gcd_scale_h;
        const int w_x = -t_x / dilation_w + dilation_bez_w * gcd_scale_w;
        const int y_x = t_x / stride_w + stride_bez_w * gcd_scale_w;
        val += col_buffer[col_channel_shift
                            + (w_x + w_y * kernel_w) * output_h * output_w
                            + y_y * output_w
                            + y_x];
      }
    }
    im_buffer[x_channel_shift + x_y * input_w + x_x] = val;
  }
}
)EOS";

int main() {
  cl::Device device(get_device(0, 0));
  cl::Context context({ device });
  cl::CommandQueue queue(context, device, 0);

  cl::Program program(context, kernel_string);

  try {
    program.build({device});
  } catch (...) {
    std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
    return 1;
  }

  cl::Kernel col2im_kernel(program, "col2im_kernel");

  const std::uint32_t im_height = 512;
  const std::uint32_t im_width = 512;
  const std::uint32_t w_height = 16;
  const std::uint32_t w_width = 16;
  const std::uint32_t channels = 1;

  const std::uint32_t pad_h = 8;
  const std::uint32_t pad_w = 8;
  const std::uint32_t stride_h = 1;
  const std::uint32_t stride_w = 1;
  const std::uint32_t dilation_h = 1;
  const std::uint32_t dilation_w = 1;

  const std::uint32_t n_iters = 500;

  std::cout << "col2im (GPU GCD/LCM)" <<std::endl;
  std::cout << "=====================" << std::endl;
  std::cout << "im_height:  " << im_height << std::endl;
  std::cout << "im_width:   " << im_width << std::endl;
  std::cout << "w_height:   " << w_height << std::endl;
  std::cout << "w_width:    " << w_width << std::endl;
  std::cout << "channels:   " << channels << std::endl;
  std::cout << "pad_h:      " << pad_h << std::endl;
  std::cout << "pad_w:      " << pad_w << std::endl;
  std::cout << "stride_h:   " << stride_h << std::endl;
  std::cout << "stride_w:   " << stride_w << std::endl;
  std::cout << "dilation_h: " << dilation_h << std::endl;
  std::cout << "dilation_w: " << dilation_w << std::endl;
  std::cout << "n_iters:    " << n_iters << std::endl;
  std::cout << "=====================" << std::endl;

  const std::uint32_t x0 = im_height + 2 * pad_h;
  const std::uint32_t x1 = im_width + 2 * pad_w;
  const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
  const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;

  const std::uint32_t im_size = im_height * im_width * channels;
  const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;

  std::vector<float> im(im_size);
  std::vector<float> col(col_size);

  std::mt19937 rng(12345);
  std::uniform_real_distribution<float> dist(-1, 1);
  auto gen = [&dist, &rng]() {
    return dist(rng);
  };
  std::generate(col.begin(), col.end(), gen);

  cl::Buffer col_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr);
  cl::Buffer im_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr);

  float *mapped_col_buffer = static_cast<float *>(
    queue.enqueueMapBuffer(
      col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0));
  std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size);
  queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer);

  auto start = std::chrono::system_clock::now();
  for (auto i = 0; i < n_iters; ++i) {
    queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size);
    col2im(col2im_kernel, queue,
      channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0);
  }
  auto end = std::chrono::system_clock::now();

  float *mapped_im_buffer = static_cast<float *>(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0));
  std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size);
  queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer);

  std::size_t hash_val = 0;
  std::hash<float> h;
  for (auto v : im) {
    hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
  }

  auto diff = end - start;
  std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl;
  std::cout << "Hash = " << hash_val <<std::endl;

//   for (auto x : im) {
//     std::cout << x << ", ";
//   }
//   std::cout << std::endl;

  return 0;
}

## col2im_naive.cc
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>

// Reference (CPU) col2im: adds every entry of the column matrix `col` onto
// the image pixel it was taken from.
//
// `col` is read starting at `col_offset` and indexed as
// [channel][kernel tap][output row][output column]; `im` is written starting
// at `im_offset` and indexed as [channel][row][column]. Contributions are
// accumulated with +=, so `im` must be cleared by the caller beforehand.
// Taps that fall into the padding area are skipped.
void col2im(
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const std::vector<float> &col, const std::uint32_t col_offset,
    std::vector<float> &im, const std::uint32_t im_offset) {

  // Number of kernel positions along one axis. Yields 1 when the dilated
  // kernel does not fit into the padded image.
  const auto positions_along = [](std::uint32_t extent, std::uint32_t pad,
                                  std::uint32_t taps, std::uint32_t dilation,
                                  std::uint32_t stride) -> std::uint32_t {
    const std::uint32_t padded = extent + 2 * pad;
    const std::uint32_t footprint = dilation * (taps - 1) + 1;
    return (padded >= footprint) ? (padded - footprint) / stride + 1 : 1;
  };
  const std::uint32_t output_h = positions_along(height, pad_h, kernel_h, dilation_h, stride_h);
  const std::uint32_t output_w = positions_along(width, pad_w, kernel_w, dilation_w, stride_w);

  for (std::uint32_t c = 0; c < channels; ++c) {
    const int col_base = c * kernel_w * kernel_h * output_h * output_w + col_offset;
    const int im_base = c * height * width + im_offset;
    for (std::int32_t out_row = 0; out_row < output_h; ++out_row) {
      for (std::int32_t out_col = 0; out_col < output_w; ++out_col) {
        for (std::uint32_t tap_row = 0; tap_row < kernel_h; ++tap_row) {
          const std::int32_t im_row = -pad_h + out_row * stride_h + tap_row * dilation_h;
          // A row in the padding makes the whole tap row irrelevant.
          if (im_row < 0 || im_row >= height) {
            continue;
          }
          for (std::uint32_t tap_col = 0; tap_col < kernel_w; ++tap_col) {
            const std::int32_t im_col = -pad_w + out_col * stride_w + tap_col * dilation_w;
            if (im_col < 0 || im_col >= width) {
              continue;
            }
            const float contribution =
                col[col_base
                    + (tap_col + tap_row * kernel_w) * output_h * output_w
                    + out_row * output_w
                    + out_col];
            im[im_base + im_row * width + im_col] += contribution;
          }
        }
      }
    }
  }
}

// Benchmark driver for the naive CPU col2im.
//
// Fills a `col` matrix with pseudo-random values (fixed seed), runs
// "clear image + col2im" `n_iters` times, and prints the elapsed time
// together with a hash of the resulting image.
int main() {

  const std::uint32_t im_height = 512;
  const std::uint32_t im_width = 512;
  const std::uint32_t w_height = 16;
  const std::uint32_t w_width = 16;
  const std::uint32_t channels = 1;

  const std::uint32_t pad_h = 8;
  const std::uint32_t pad_w = 8;
  const std::uint32_t stride_h = 1;
  const std::uint32_t stride_w = 1;
  const std::uint32_t dilation_h = 1;
  const std::uint32_t dilation_w = 1;

  const std::uint32_t n_iters = 1;

  std::cout << "col2im (Naive)" <<std::endl;
  std::cout << "=====================" << std::endl;
  std::cout << "im_height:  " << im_height << std::endl;
  std::cout << "im_width:   " << im_width << std::endl;
  std::cout << "w_height:   " << w_height << std::endl;
  std::cout << "w_width:    " << w_width << std::endl;
  std::cout << "channels:   " << channels << std::endl;
  std::cout << "pad_h:      " << pad_h << std::endl;
  std::cout << "pad_w:      " << pad_w << std::endl;
  std::cout << "stride_h:   " << stride_h << std::endl;
  std::cout << "stride_w:   " << stride_w << std::endl;
  std::cout << "dilation_h: " << dilation_h << std::endl;
  std::cout << "dilation_w: " << dilation_w << std::endl;
  std::cout << "n_iters:    " << n_iters << std::endl;
  std::cout << "=====================" << std::endl;

  // Padded image extent (x*) and dilated kernel extent (w*) per axis.
  const std::uint32_t x0 = im_height + 2 * pad_h;
  const std::uint32_t x1 = im_width + 2 * pad_w;
  const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
  const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;

  const std::uint32_t im_size = im_height * im_width * channels;
  const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;

  std::vector<float> im(im_size);
  std::vector<float> col(col_size);

  // Fixed seed so every run works on the same input.
  std::mt19937 rng(12345);
  std::uniform_real_distribution<float> dist(-1, 1);
  auto gen = [&dist, &rng]() {
    return dist(rng);
  };
  std::generate(col.begin(), col.end(), gen);

  // steady_clock is monotonic, which is what interval measurement needs;
  // system_clock may jump when the wall clock is adjusted.
  const auto start = std::chrono::steady_clock::now();
  for (std::uint32_t i = 0; i < n_iters; ++i) {
    // col2im accumulates with +=, so the image has to start from zero.
    std::fill(im.begin(), im.end(), 0);
    col2im(
      channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col, 0, im, 0);
  }
  const auto end = std::chrono::steady_clock::now();

  // Fold all pixels into a single value for printing.
  std::size_t hash_val = 0;
  std::hash<float> h;
  for (auto v : im) {
    hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
  }

  auto diff = end - start;
  std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl;
  std::cout << "Hash = " << hash_val <<std::endl;

  // Uncomment to dump the whole image:
//   for (auto x : im) {
//     std::cout << x << ", ";
//   }
//   std::cout << std::endl;

  return 0;
}
	#include <algorithm>
	#include <chrono>
	#include <functional>
	#include <iostream>
	#include <random>

	#define CL_HPP_ENABLE_EXCEPTIONS
	#define CL_HPP_MINIMUM_OPENCL_VERSION 120
	#define CL_HPP_TARGET_OPENCL_VERSION 120
	#define CL_HPP_CL_1_2_DEFAULT_BUILD

	#include <CL/cl2.hpp>

	// Returns the OpenCL platforms that cl::Platform::get reports on this machine.
	std::vector<cl::Platform> get_all_platforms() {
	  std::vector<cl::Platform> platforms;
	  cl::Platform::get(&platforms);
	  return platforms;
	}

	// Returns every device (CL_DEVICE_TYPE_ALL) of the platform at index
	// `platform_id` in the list produced by get_all_platforms().
	//
	// Throws std::out_of_range when `platform_id` does not name an existing
	// platform, rather than indexing past the end of the platform list.
	std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) {
	  const auto all_pfs = get_all_platforms();
	  std::vector<cl::Device> ret;
	  // at() is bounds-checked; operator[] would be undefined behaviour on a
	  // machine that exposes fewer platforms than requested.
	  all_pfs.at(platform_id).getDevices(CL_DEVICE_TYPE_ALL, &ret);
	  return ret;
	}

	// Returns the device at index `device_id` of the platform at index
	// `platform_id`.
	//
	// Throws std::out_of_range when `device_id` does not name an existing device
	// of that platform, rather than indexing past the end of the device list.
	cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
	  const auto all_devs = get_all_devices(platform_id);
	  // Bounds-checked access: a platform may expose no devices at all.
	  return all_devs.at(device_id);
	}

	// Returns ceil(size / num_threads): how many groups of `num_threads` items
	// are needed to cover `size` items.
	//
	// Written as quotient plus a remainder correction so the result is right for
	// every `size`; the form `(size + num_threads - 1) / num_threads` wraps around
	// when `size` is close to UINT32_MAX. `num_threads` must be non-zero.
	std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
	  return size / num_threads + (size % num_threads != 0 ? 1u : 0u);
	}

	// Binds the convolution geometry and both buffers to `kernel` (arguments
	// 0..16, in the order of the kernel's parameter list) and enqueues it on
	// `queue`.
	//
	// The launch uses 16x16 work-groups. The global range is `width` by
	// `height * channels`, each rounded up to a whole number of work-groups.
	void col2im(
	    cl::Kernel &kernel, cl::CommandQueue &queue,
	    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
	    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
	    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
	    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
	    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
	    cl::Buffer &im_buffer, const std::uint32_t im_offset) {
	  // Number of kernel positions along one axis. Yields 1 when the dilated
	  // kernel does not fit into the padded image.
	  const auto positions_along = [](std::uint32_t extent, std::uint32_t pad,
	                                  std::uint32_t taps, std::uint32_t dilation,
	                                  std::uint32_t stride) -> std::uint32_t {
	    const std::uint32_t padded = extent + 2 * pad;
	    const std::uint32_t footprint = dilation * (taps - 1) + 1;
	    return (padded >= footprint) ? (padded - footprint) / stride + 1 : 1;
	  };
	  const std::uint32_t output_h = positions_along(height, pad_h, kernel_h, dilation_h, stride_h);
	  const std::uint32_t output_w = positions_along(width, pad_w, kernel_w, dilation_w, stride_w);

	  const std::uint32_t groups_x = calc_num_blocks(width, 16);
	  const std::uint32_t groups_y = calc_num_blocks(height * channels, 16);

	  // Image and col geometry.
	  kernel.setArg(0, height);
	  kernel.setArg(1, width);
	  kernel.setArg(2, channels);
	  kernel.setArg(3, output_h);
	  kernel.setArg(4, output_w);
	  // Convolution parameters.
	  kernel.setArg(5, kernel_h);
	  kernel.setArg(6, kernel_w);
	  kernel.setArg(7, pad_h);
	  kernel.setArg(8, pad_w);
	  kernel.setArg(9, stride_h);
	  kernel.setArg(10, stride_w);
	  kernel.setArg(11, dilation_h);
	  kernel.setArg(12, dilation_w);
	  // Source and destination buffers with their element offsets.
	  kernel.setArg(13, col_buffer);
	  kernel.setArg(14, col_offset);
	  kernel.setArg(15, im_buffer);
	  kernel.setArg(16, im_offset);

	  const cl::NDRange global_range(groups_x * 16, groups_y * 16, 1);
	  const cl::NDRange local_range(16, 16, 1);
	  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range);
	}

	/*
	 * OpenCL C source of the col2im kernel, written in reference to caffe:
	 * https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
	 *
	 * One work-item handles one image pixel: global id 0 selects the column,
	 * global id 1 encodes row and channel (row = id % input_h,
	 * channel = id / input_h). Inside the kernel x_x / x_y are the pixel's
	 * coordinates in the padded image. Work-items whose column or channel falls
	 * outside the image do nothing.
	 *
	 * Each active work-item sums the col_buffer entries of every
	 * (output position, kernel tap) pair that maps onto its pixel and stores the
	 * sum into im_buffer with a plain assignment, so the previous content of
	 * that element is overwritten.
	 *
	 * The parameter order must match the setArg() indices used by col2im().
	 */

	std::string kernel_string = R"EOS(
	kernel void col2im_kernel(
	const int input_h, const int input_w, const int channels,
	const int output_h, const int output_w,
	const int kernel_h, const int kernel_w,
	const int pad_h, const int pad_w,
	const int stride_h, const int stride_w,
	const int dilation_h, const int dilation_w,
	const global float *col_buffer, const int col_offset,
	__global float *im_buffer, const int im_offset) {

	const int x_x = get_global_id(0) + pad_w;
	const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
	const int channel = ((int) get_global_id(1)) / input_h;

	const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
	const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;

	const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
	const int x_channel_shift = channel * input_h * input_w + im_offset;

	const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
	const int t_y_end = min(x_y / stride_h + 1, output_h);
	const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
	const int t_x_end = min(x_x / stride_w + 1, output_w);

	if (x_x < input_w + pad_w && channel < channels) {

	float val = 0;

	for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
	for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
	int w_y = x_y - t_y * stride_h;
	int w_x = x_x - t_x * stride_w;
	if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
	w_y /= dilation_h;
	w_x /= dilation_w;
	val += col_buffer[col_channel_shift
	+ (w_x + w_y * kernel_w) * output_h * output_w
	+ t_y * output_w
	+ t_x];
	}
	}
	}
	im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
	}
	}
	)EOS";

	int main() {
	cl::Device device(get_device(0, 0));
	cl::Context context({ device });
	cl::CommandQueue queue(context, device, 0);

	cl::Program program(context, kernel_string);

	try {
	program.build({device});
	} catch (...) {
	std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
	return 1;
	}

	cl::Kernel col2im_kernel(program, "col2im_kernel");

	const std::uint32_t im_height = 512;
	const std::uint32_t im_width = 512;
	const std::uint32_t w_height = 16;
	const std::uint32_t w_width = 16;
	const std::uint32_t channels = 1;

	const std::uint32_t pad_h = 8;
	const std::uint32_t pad_w = 8;
	const std::uint32_t stride_h = 1;
	const std::uint32_t stride_w = 1;
	const std::uint32_t dilation_h = 1;
	const std::uint32_t dilation_w = 1;

	const std::uint32_t n_iters = 500;

	std::cout << "col2im (GPU caffe)" <<std::endl;
	std::cout << "=====================" << std::endl;
	std::cout << "im_height: " << im_height << std::endl;
	std::cout << "im_width: " << im_width << std::endl;
	std::cout << "w_height: " << w_height << std::endl;
	std::cout << "w_width: " << w_width << std::endl;
	std::cout << "channels: " << channels << std::endl;
	std::cout << "pad_h: " << pad_h << std::endl;
	std::cout << "pad_w: " << pad_w << std::endl;
	std::cout << "stride_h: " << stride_h << std::endl;
	std::cout << "stride_w: " << stride_w << std::endl;
	std::cout << "dilation_h: " << dilation_h << std::endl;
	std::cout << "dilation_w: " << dilation_w << std::endl;
	std::cout << "n_iters: " << n_iters << std::endl;
	std::cout << "=====================" << std::endl;

	const std::uint32_t x0 = im_height + 2 * pad_h;
	const std::uint32_t x1 = im_width + 2 * pad_w;
	const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
	const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;

	const std::uint32_t im_size = im_height * im_width * channels;
	const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;

	std::vector<float> im(im_size);
	std::vector<float> col(col_size);

	std::mt19937 rng(12345);
	std::uniform_real_distribution<float> dist(-1, 1);
	auto gen = [&dist, &rng]() {
	return dist(rng);
	};
	std::generate(col.begin(), col.end(), gen);

	cl::Buffer col_buffer(context, CL_MEM_READ_WRITE \| CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr);
	cl::Buffer im_buffer(context, CL_MEM_READ_WRITE \| CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr);

	float mapped_col_buffer = static_cast<float >(
	queue.enqueueMapBuffer(
	col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0));
	std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size);
	queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer);

	auto start = std::chrono::system_clock::now();
	for (auto i = 0; i < n_iters; ++i) {
	queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size);
	col2im(col2im_kernel, queue,
	channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0);
	}
	auto end = std::chrono::system_clock::now();

	float mapped_im_buffer = static_cast<float >(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0));
	std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size);
	queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer);

	std::size_t hash_val = 0;
	std::hash<float> h;
	for (auto v : im) {
	hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
	}

	auto diff = end - start;
	std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl;
	std::cout << "Hash = " << hash_val <<std::endl;

	// for (auto x : im) {
	// std::cout << x << ", ";
	// }
	// std::cout << std::endl;

	return 0;
	}