Skip to content

Instantly share code, notes, and snippets.

@vbkaisetsu
Last active July 31, 2018 11:32
Show Gist options
  • Save vbkaisetsu/a98299df827f9a5245635f646c1d94be to your computer and use it in GitHub Desktop.
Save vbkaisetsu/a98299df827f9a5245635f646c1d94be to your computer and use it in GitHub Desktop.
col2im implementations
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD
#include <CL/cl2.hpp>
// Queries the OpenCL runtime for its platforms and returns them as a vector.
std::vector<cl::Platform> get_all_platforms() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  return platforms;
}
// Returns every device (CL_DEVICE_TYPE_ALL) of the platform at index
// `platform_id` in the list produced by get_all_platforms().
//
// Throws std::out_of_range when `platform_id` is not a valid index, instead
// of reading past the end of the platform list.
std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) {
  const auto all_pfs = get_all_platforms();
  std::vector<cl::Device> ret;
  // at() performs the bounds check that operator[] would skip.
  all_pfs.at(platform_id).getDevices(CL_DEVICE_TYPE_ALL, &ret);
  return ret;
}
// Returns the device at index `device_id` on the platform at index
// `platform_id`.
//
// Throws std::out_of_range when `device_id` is not a valid index into the
// platform's device list, instead of reading past the end of that list.
cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
  const auto all_devs = get_all_devices(platform_id);
  return all_devs.at(device_id);
}
// Returns ceil(size / num_threads): the number of blocks of `num_threads`
// items needed to cover `size` items.
//
// Computed as quotient plus a remainder correction so that the result is
// still correct when `size` is close to the maximum 32-bit value (the
// `size + num_threads - 1` form wraps around there).
// Precondition: num_threads != 0.
std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
  const std::uint32_t full_blocks = size / num_threads;
  const std::uint32_t partial_block = (size % num_threads != 0) ? 1u : 0u;
  return full_blocks + partial_block;
}
// Binds all arguments of the col2im kernel and enqueues one launch of it on
// `queue`.
//
// The column-buffer extent along each axis is derived from the image size,
// padding, kernel size, stride and dilation (falling back to 1 when the
// padded image is smaller than the dilated kernel). The launch uses 16x16
// work-groups: dimension 0 covers `width`, dimension 1 covers
// `height * channels`, each rounded up to a multiple of 16.
void col2im(
    cl::Kernel &kernel, cl::CommandQueue &queue,
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
    cl::Buffer &im_buffer, const std::uint32_t im_offset) {
  // Edge length of the square work-group.
  const std::uint32_t local_edge = 16;

  // Column-buffer height.
  const std::uint32_t padded_h = height + 2 * pad_h;
  const std::uint32_t extent_h = dilation_h * (kernel_h - 1) + 1;
  std::uint32_t output_h = 1;
  if (padded_h >= extent_h) {
    output_h = (padded_h - extent_h) / stride_h + 1;
  }

  // Column-buffer width.
  const std::uint32_t padded_w = width + 2 * pad_w;
  const std::uint32_t extent_w = dilation_w * (kernel_w - 1) + 1;
  std::uint32_t output_w = 1;
  if (padded_w >= extent_w) {
    output_w = (padded_w - extent_w) / stride_w + 1;
  }

  // Arguments are bound in the order of the kernel's parameter list.
  std::uint32_t arg_index = 0;
  kernel.setArg(arg_index++, height);
  kernel.setArg(arg_index++, width);
  kernel.setArg(arg_index++, channels);
  kernel.setArg(arg_index++, output_h);
  kernel.setArg(arg_index++, output_w);
  kernel.setArg(arg_index++, kernel_h);
  kernel.setArg(arg_index++, kernel_w);
  kernel.setArg(arg_index++, pad_h);
  kernel.setArg(arg_index++, pad_w);
  kernel.setArg(arg_index++, stride_h);
  kernel.setArg(arg_index++, stride_w);
  kernel.setArg(arg_index++, dilation_h);
  kernel.setArg(arg_index++, dilation_w);
  kernel.setArg(arg_index++, col_buffer);
  kernel.setArg(arg_index++, col_offset);
  kernel.setArg(arg_index++, im_buffer);
  kernel.setArg(arg_index++, im_offset);

  const std::uint32_t blocks_x = calc_num_blocks(width, local_edge);
  const std::uint32_t blocks_y = calc_num_blocks(height * channels, local_edge);
  const cl::NDRange global_range(blocks_x * local_edge, blocks_y * local_edge, 1);
  const cl::NDRange local_range(local_edge, local_edge, 1);
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range);
}
/*
* OpenCL C source of the col2im kernel, written in reference to caffe:
* https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
*
* Work-item mapping, as used by the kernel body:
*  - global id 0 selects the image column (x_x holds it shifted by pad_w);
*  - global id 1 encodes row and channel as channel * input_h + row
*    (x_y holds the row shifted by pad_h).
* Work-items whose column lies outside input_w or whose channel lies outside
* `channels` write nothing. Every other work-item sums the col_buffer entries
* selected by the t_y / t_x loops (skipping offsets that are not multiples of
* the dilation) and stores the sum into its im_buffer element; the element is
* overwritten, not accumulated into.
*
* The parameter order must match the setArg() indices used by the host code.
*/
std::string kernel_string = R"EOS(
kernel void col2im_kernel(
const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const global float *col_buffer, const int col_offset,
__global float *im_buffer, const int im_offset) {
const int x_x = get_global_id(0) + pad_w;
const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
const int channel = ((int) get_global_id(1)) / input_h;
const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
const int x_channel_shift = channel * input_h * input_w + im_offset;
const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
const int t_y_end = min(x_y / stride_h + 1, output_h);
const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
const int t_x_end = min(x_x / stride_w + 1, output_w);
if (x_x < input_w + pad_w && channel < channels) {
float val = 0;
for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
int w_y = x_y - t_y * stride_h;
int w_x = x_x - t_x * stride_w;
if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
w_y /= dilation_h;
w_x /= dilation_w;
val += col_buffer[col_channel_shift
+ (w_x + w_y * kernel_w) * output_h * output_w
+ t_y * output_w
+ t_x];
}
}
}
im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
}
}
)EOS";
int main() {
cl::Device device(get_device(0, 0));
cl::Context context({ device });
cl::CommandQueue queue(context, device, 0);
cl::Program program(context, kernel_string);
try {
program.build({device});
} catch (...) {
std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
return 1;
}
cl::Kernel col2im_kernel(program, "col2im_kernel");
const std::uint32_t im_height = 512;
const std::uint32_t im_width = 512;
const std::uint32_t w_height = 16;
const std::uint32_t w_width = 16;
const std::uint32_t channels = 1;
const std::uint32_t pad_h = 8;
const std::uint32_t pad_w = 8;
const std::uint32_t stride_h = 1;
const std::uint32_t stride_w = 1;
const std::uint32_t dilation_h = 1;
const std::uint32_t dilation_w = 1;
const std::uint32_t n_iters = 500;
std::cout << "col2im (GPU caffe)" <<std::endl;
std::cout << "=====================" << std::endl;
std::cout << "im_height: " << im_height << std::endl;
std::cout << "im_width: " << im_width << std::endl;
std::cout << "w_height: " << w_height << std::endl;
std::cout << "w_width: " << w_width << std::endl;
std::cout << "channels: " << channels << std::endl;
std::cout << "pad_h: " << pad_h << std::endl;
std::cout << "pad_w: " << pad_w << std::endl;
std::cout << "stride_h: " << stride_h << std::endl;
std::cout << "stride_w: " << stride_w << std::endl;
std::cout << "dilation_h: " << dilation_h << std::endl;
std::cout << "dilation_w: " << dilation_w << std::endl;
std::cout << "n_iters: " << n_iters << std::endl;
std::cout << "=====================" << std::endl;
const std::uint32_t x0 = im_height + 2 * pad_h;
const std::uint32_t x1 = im_width + 2 * pad_w;
const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;
const std::uint32_t im_size = im_height * im_width * channels;
const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;
std::vector<float> im(im_size);
std::vector<float> col(col_size);
std::mt19937 rng(12345);
std::uniform_real_distribution<float> dist(-1, 1);
auto gen = [&dist, &rng]() {
return dist(rng);
};
std::generate(col.begin(), col.end(), gen);
cl::Buffer col_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr);
cl::Buffer im_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr);
float *mapped_col_buffer = static_cast<float *>(
queue.enqueueMapBuffer(
col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0));
std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size);
queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer);
auto start = std::chrono::system_clock::now();
for (auto i = 0; i < n_iters; ++i) {
queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size);
col2im(col2im_kernel, queue,
channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0);
}
auto end = std::chrono::system_clock::now();
float *mapped_im_buffer = static_cast<float *>(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0));
std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size);
queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer);
std::size_t hash_val = 0;
std::hash<float> h;
for (auto v : im) {
hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
}
auto diff = end - start;
std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl;
std::cout << "Hash = " << hash_val <<std::endl;
// for (auto x : im) {
// std::cout << x << ", ";
// }
// std::cout << std::endl;
return 0;
}
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD
#include <CL/cl2.hpp>
// Asks the OpenCL runtime for its platform list and hands it back by value.
std::vector<cl::Platform> get_all_platforms() {
  std::vector<cl::Platform> found;
  cl::Platform::get(&found);
  return found;
}
// Lists all devices (CL_DEVICE_TYPE_ALL) of the platform stored at index
// `platform_id` in the result of get_all_platforms().
//
// An invalid `platform_id` raises std::out_of_range rather than indexing
// past the end of the platform list.
std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) {
  const auto all_pfs = get_all_platforms();
  std::vector<cl::Device> ret;
  // Bounds-checked access: operator[] would be undefined for a bad index.
  all_pfs.at(platform_id).getDevices(CL_DEVICE_TYPE_ALL, &ret);
  return ret;
}
// Picks the device at index `device_id` from the devices of the platform at
// index `platform_id`.
//
// An invalid `device_id` raises std::out_of_range rather than indexing past
// the end of the device list.
cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
  const auto all_devs = get_all_devices(platform_id);
  return all_devs.at(device_id);
}
// Ceiling division: how many blocks of `num_threads` items cover `size` items.
//
// Uses quotient + remainder correction instead of `size + num_threads - 1`,
// which wraps around for `size` values near the 32-bit maximum and then
// reports too few blocks.
// Precondition: num_threads != 0.
std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
  return size / num_threads + ((size % num_threads != 0) ? 1u : 0u);
}
// Extended Euclidean algorithm.
//
// On return, `r` holds the last non-zero remainder of the Euclidean
// algorithm applied to (a, b) (their gcd for positive inputs), and `p`, `q`
// are coefficients satisfying  a * p + b * q == r  for the original a and b.
//
// When b == 0 there is nothing to divide by; the result is r = a with
// p = 1, q = 0 (previously this case divided by zero).
void euclid_gcd(std::int32_t a, std::int32_t b, std::int32_t &p, std::int32_t &q, std::int32_t &r) {
  if (b == 0) {
    p = 1;
    q = 0;
    r = a;
    return;
  }
  // Loop invariant, with a0 / b0 denoting the original arguments:
  //   a == prev_p * a0 + prev_q * b0
  //   b == p      * a0 + q      * b0
  std::int32_t prev_p = 1;
  std::int32_t prev_q = 0;
  p = 0;
  q = 1;
  for (;;) {
    const std::int32_t quotient = a / b;
    const std::int32_t remainder = a % b;
    if (remainder == 0) {
      break;
    }
    // Coefficients of `remainder` == a - quotient * b.
    const std::int32_t next_p = prev_p - p * quotient;
    const std::int32_t next_q = prev_q - q * quotient;
    prev_p = p;
    prev_q = q;
    p = next_p;
    q = next_q;
    a = b;
    b = remainder;
  }
  r = b;
}
// Binds all arguments of the GCD-based col2im kernel and enqueues one launch
// of it on `queue`.
//
// For each axis, euclid_gcd() is run on (stride, dilation); its two
// coefficients and the resulting gcd are forwarded to the kernel. The launch
// uses 16x16 work-groups: dimension 0 covers (width - 1) / gcd_w + 1 items,
// dimension 1 covers ((height - 1) / gcd_h + 1) * channels items, each
// rounded up to a multiple of 16.
void col2im(
    cl::Kernel &kernel, cl::CommandQueue &queue,
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
    cl::Buffer &im_buffer, const std::uint32_t im_offset) {
  // Edge length of the square work-group.
  const std::uint32_t local_edge = 16;

  // Vertical axis: coefficients and gcd of (stride_h, dilation_h).
  std::int32_t stride_bez_h = 0;
  std::int32_t dilation_bez_h = 0;
  std::int32_t gcd_h = 0;
  euclid_gcd(stride_h, dilation_h, stride_bez_h, dilation_bez_h, gcd_h);

  // Horizontal axis: coefficients and gcd of (stride_w, dilation_w).
  std::int32_t stride_bez_w = 0;
  std::int32_t dilation_bez_w = 0;
  std::int32_t gcd_w = 0;
  euclid_gcd(stride_w, dilation_w, stride_bez_w, dilation_bez_w, gcd_w);

  // Column-buffer height (1 when the padded image is smaller than the
  // dilated kernel).
  const std::uint32_t padded_h = height + 2 * pad_h;
  const std::uint32_t extent_h = dilation_h * (kernel_h - 1) + 1;
  std::uint32_t output_h = 1;
  if (padded_h >= extent_h) {
    output_h = (padded_h - extent_h) / stride_h + 1;
  }

  // Column-buffer width, same rule.
  const std::uint32_t padded_w = width + 2 * pad_w;
  const std::uint32_t extent_w = dilation_w * (kernel_w - 1) + 1;
  std::uint32_t output_w = 1;
  if (padded_w >= extent_w) {
    output_w = (padded_w - extent_w) / stride_w + 1;
  }

  // Arguments are bound in the order of the kernel's parameter list.
  std::uint32_t arg_index = 0;
  kernel.setArg(arg_index++, height);
  kernel.setArg(arg_index++, width);
  kernel.setArg(arg_index++, channels);
  kernel.setArg(arg_index++, output_h);
  kernel.setArg(arg_index++, output_w);
  kernel.setArg(arg_index++, kernel_h);
  kernel.setArg(arg_index++, kernel_w);
  kernel.setArg(arg_index++, pad_h);
  kernel.setArg(arg_index++, pad_w);
  kernel.setArg(arg_index++, stride_h);
  kernel.setArg(arg_index++, stride_w);
  kernel.setArg(arg_index++, dilation_h);
  kernel.setArg(arg_index++, dilation_w);
  kernel.setArg(arg_index++, stride_bez_h);
  kernel.setArg(arg_index++, stride_bez_w);
  kernel.setArg(arg_index++, dilation_bez_h);
  kernel.setArg(arg_index++, dilation_bez_w);
  kernel.setArg(arg_index++, gcd_h);
  kernel.setArg(arg_index++, gcd_w);
  kernel.setArg(arg_index++, col_buffer);
  kernel.setArg(arg_index++, col_offset);
  kernel.setArg(arg_index++, im_buffer);
  kernel.setArg(arg_index++, im_offset);

  const std::uint32_t blocks_x = calc_num_blocks((width - 1) / gcd_w + 1, local_edge);
  const std::uint32_t blocks_y = calc_num_blocks(((height - 1) / gcd_h + 1) * channels, local_edge);
  const cl::NDRange global_range(blocks_x * local_edge, blocks_y * local_edge, 1);
  const cl::NDRange local_range(local_edge, local_edge, 1);
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range);
}
/*
* OpenCL C source of the GCD-based col2im kernel.
*
* Only image positions whose padded coordinate (x + pad) is a multiple of
* gcd(stride, dilation) are visited: gcd_scale_{w,h} is that multiple, and
* x_x / x_y are the corresponding unpadded image coordinates. Global id 0
* selects the column group; global id 1 encodes the row group and the
* channel as channel * input_h_scale + row group.
*
* For each visited pixel the t_y / t_x loops step through the
* (output position, kernel offset) pairs that map onto it, using the
* coefficients computed on the host by euclid_gcd(), and the sum of the
* matching col_buffer entries is stored (not accumulated) into im_buffer.
*
* Two details matter for gcd > 1:
*  - the first visited multiple is ceil(pad / gcd), computed as
*    (pad + gcd - 1) / gcd so that pad == 0 yields 0 and pixel 0 is visited;
*  - the last row group can land at or beyond input_h, so the store is
*    guarded by x_y < input_h as well as x_x < input_w.
*
* The parameter order must match the setArg() indices used by the host code.
*/
std::string kernel_string = R"EOS(
inline int grid_ceil(const int x, const int step) {
return x > 0 ? ((x - 1) / step + 1) * step : x / step * step;
}
kernel void col2im_kernel(
const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int stride_bez_h, const int stride_bez_w,
const int dilation_bez_h, const int dilation_bez_w,
const int gcd_h, const int gcd_w,
const global float *col_buffer, const int col_offset,
__global float *im_buffer, const int im_offset) {
const int input_h_scale = (input_h - 1) / gcd_h + 1;
const int gcd_scale_w = get_global_id(0) + (pad_w + gcd_w - 1) / gcd_w;
const int gcd_scale_h = ((int) get_global_id(1)) % input_h_scale + (pad_h + gcd_h - 1) / gcd_h;
const int x_x = gcd_scale_w * gcd_w - pad_w;
const int x_y = gcd_scale_h * gcd_h - pad_h;
const int channel = ((int) get_global_id(1)) / input_h_scale;
const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
const int x_channel_shift = channel * input_h * input_w + im_offset;
const int t_y_step = stride_h * dilation_h / gcd_h;
const int t_y_begin = grid_ceil(max(-stride_bez_h * gcd_scale_h * stride_h,
(dilation_bez_h * gcd_scale_h - kernel_h + 1) * dilation_h),
t_y_step);
const int t_y_end = min((output_h - stride_bez_h * gcd_scale_h) * stride_h,
(dilation_bez_h * gcd_scale_h + 1) * dilation_h);
const int t_x_step = stride_w * dilation_w / gcd_w;
const int t_x_begin = grid_ceil(max(-stride_bez_w * gcd_scale_w * stride_w,
(dilation_bez_w * gcd_scale_w - kernel_w + 1) * dilation_w),
t_x_step);
const int t_x_end = min((output_w - stride_bez_w * gcd_scale_w) * stride_w,
(dilation_bez_w * gcd_scale_w + 1) * dilation_w);
if (x_x < input_w && x_y < input_h && channel < channels) {
float val = 0;
for (int t_y = t_y_begin; t_y < t_y_end; t_y += t_y_step) {
for (int t_x = t_x_begin; t_x < t_x_end; t_x += t_x_step) {
const int w_y = -t_y / dilation_h + dilation_bez_h * gcd_scale_h;
const int y_y = t_y / stride_h + stride_bez_h * gcd_scale_h;
const int w_x = -t_x / dilation_w + dilation_bez_w * gcd_scale_w;
const int y_x = t_x / stride_w + stride_bez_w * gcd_scale_w;
val += col_buffer[col_channel_shift
+ (w_x + w_y * kernel_w) * output_h * output_w
+ y_y * output_w
+ y_x];
}
}
im_buffer[x_channel_shift + x_y * input_w + x_x] = val;
}
}
)EOS";
int main() {
cl::Device device(get_device(0, 0));
cl::Context context({ device });
cl::CommandQueue queue(context, device, 0);
cl::Program program(context, kernel_string);
try {
program.build({device});
} catch (...) {
std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
return 1;
}
cl::Kernel col2im_kernel(program, "col2im_kernel");
const std::uint32_t im_height = 512;
const std::uint32_t im_width = 512;
const std::uint32_t w_height = 16;
const std::uint32_t w_width = 16;
const std::uint32_t channels = 1;
const std::uint32_t pad_h = 8;
const std::uint32_t pad_w = 8;
const std::uint32_t stride_h = 1;
const std::uint32_t stride_w = 1;
const std::uint32_t dilation_h = 1;
const std::uint32_t dilation_w = 1;
const std::uint32_t n_iters = 500;
std::cout << "col2im (GPU GCD/LCM)" <<std::endl;
std::cout << "=====================" << std::endl;
std::cout << "im_height: " << im_height << std::endl;
std::cout << "im_width: " << im_width << std::endl;
std::cout << "w_height: " << w_height << std::endl;
std::cout << "w_width: " << w_width << std::endl;
std::cout << "channels: " << channels << std::endl;
std::cout << "pad_h: " << pad_h << std::endl;
std::cout << "pad_w: " << pad_w << std::endl;
std::cout << "stride_h: " << stride_h << std::endl;
std::cout << "stride_w: " << stride_w << std::endl;
std::cout << "dilation_h: " << dilation_h << std::endl;
std::cout << "dilation_w: " << dilation_w << std::endl;
std::cout << "n_iters: " << n_iters << std::endl;
std::cout << "=====================" << std::endl;
const std::uint32_t x0 = im_height + 2 * pad_h;
const std::uint32_t x1 = im_width + 2 * pad_w;
const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;
const std::uint32_t im_size = im_height * im_width * channels;
const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;
std::vector<float> im(im_size);
std::vector<float> col(col_size);
std::mt19937 rng(12345);
std::uniform_real_distribution<float> dist(-1, 1);
auto gen = [&dist, &rng]() {
return dist(rng);
};
std::generate(col.begin(), col.end(), gen);
cl::Buffer col_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr);
cl::Buffer im_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr);
float *mapped_col_buffer = static_cast<float *>(
queue.enqueueMapBuffer(
col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0));
std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size);
queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer);
auto start = std::chrono::system_clock::now();
for (auto i = 0; i < n_iters; ++i) {
queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size);
col2im(col2im_kernel, queue,
channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0);
}
auto end = std::chrono::system_clock::now();
float *mapped_im_buffer = static_cast<float *>(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0));
std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size);
queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer);
std::size_t hash_val = 0;
std::hash<float> h;
for (auto v : im) {
hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
}
auto diff = end - start;
std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl;
std::cout << "Hash = " << hash_val <<std::endl;
// for (auto x : im) {
// std::cout << x << ", ";
// }
// std::cout << std::endl;
return 0;
}
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <random>
// Naive CPU col2im: scatters every column-buffer entry back onto the image
// pixel it was taken from and accumulates (+=) into `im`.
//
// For each channel, output position (row, column) and kernel offset
// (row, column), the image coordinate is
//   -pad + output_index * stride + kernel_index * dilation
// along each axis; entries whose coordinate falls outside the image are
// skipped. `im` is not cleared here, so the caller provides its initial
// contents. `col_offset` / `im_offset` are added to every index into `col` /
// `im` respectively.
void col2im(
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const std::vector<float> &col, const std::uint32_t col_offset,
    std::vector<float> &im, const std::uint32_t im_offset) {
  // Column-buffer height (1 when the padded image is smaller than the
  // dilated kernel).
  const std::uint32_t padded_h = height + 2 * pad_h;
  const std::uint32_t extent_h = dilation_h * (kernel_h - 1) + 1;
  std::uint32_t output_h = 1;
  if (padded_h >= extent_h) {
    output_h = (padded_h - extent_h) / stride_h + 1;
  }

  // Column-buffer width, same rule.
  const std::uint32_t padded_w = width + 2 * pad_w;
  const std::uint32_t extent_w = dilation_w * (kernel_w - 1) + 1;
  std::uint32_t output_w = 1;
  if (padded_w >= extent_w) {
    output_w = (padded_w - extent_w) / stride_w + 1;
  }

  // Number of column entries per kernel offset.
  const std::uint32_t plane = output_h * output_w;

  for (std::uint32_t channel = 0; channel < channels; ++channel) {
    const int col_base = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
    const int im_base = channel * height * width + im_offset;
    for (std::int32_t out_row = 0; out_row < output_h; ++out_row) {
      for (std::int32_t out_col = 0; out_col < output_w; ++out_col) {
        for (std::uint32_t k_row = 0; k_row < kernel_h; ++k_row) {
          const std::int32_t im_row = -pad_h + out_row * stride_h + k_row * dilation_h;
          // The row does not depend on the kernel column, so an outside row
          // rejects the whole inner loop.
          if (im_row < 0 || im_row >= height) {
            continue;
          }
          for (std::uint32_t k_col = 0; k_col < kernel_w; ++k_col) {
            const std::int32_t im_col = -pad_w + out_col * stride_w + k_col * dilation_w;
            if (im_col < 0 || im_col >= width) {
              continue;
            }
            const float contribution = col[col_base
                                           + (k_col + k_row * kernel_w) * plane
                                           + out_row * output_w
                                           + out_col];
            im[im_base + im_row * width + im_col] += contribution;
          }
        }
      }
    }
  }
}
// Benchmark driver for the naive CPU col2im.
//
// Fills a column buffer with pseudo-random values (fixed seed), runs col2im
// `n_iters` times on a zeroed image and prints the elapsed wall-clock time
// together with a hash of the resulting image.
int main() {
  const std::uint32_t im_height = 512;
  const std::uint32_t im_width = 512;
  const std::uint32_t w_height = 16;
  const std::uint32_t w_width = 16;
  const std::uint32_t channels = 1;
  const std::uint32_t pad_h = 8;
  const std::uint32_t pad_w = 8;
  const std::uint32_t stride_h = 1;
  const std::uint32_t stride_w = 1;
  const std::uint32_t dilation_h = 1;
  const std::uint32_t dilation_w = 1;
  const std::uint32_t n_iters = 1;

  std::cout << "col2im (Naive)" << '\n';
  std::cout << "=====================" << '\n';
  std::cout << "im_height: " << im_height << '\n';
  std::cout << "im_width: " << im_width << '\n';
  std::cout << "w_height: " << w_height << '\n';
  std::cout << "w_width: " << w_width << '\n';
  std::cout << "channels: " << channels << '\n';
  std::cout << "pad_h: " << pad_h << '\n';
  std::cout << "pad_w: " << pad_w << '\n';
  std::cout << "stride_h: " << stride_h << '\n';
  std::cout << "stride_w: " << stride_w << '\n';
  std::cout << "dilation_h: " << dilation_h << '\n';
  std::cout << "dilation_w: " << dilation_w << '\n';
  std::cout << "n_iters: " << n_iters << '\n';
  // Flush once so the parameter dump is visible before the benchmark runs.
  std::cout << "=====================" << std::endl;

  // Padded image extent and dilated kernel extent along each axis.
  const std::uint32_t x0 = im_height + 2 * pad_h;
  const std::uint32_t x1 = im_width + 2 * pad_w;
  const std::uint32_t w0 = (w_height - 1) * dilation_h + 1;
  const std::uint32_t w1 = (w_width - 1) * dilation_w + 1;
  const std::uint32_t im_size = im_height * im_width * channels;
  const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels;

  std::vector<float> im(im_size);
  std::vector<float> col(col_size);
  std::mt19937 rng(12345);
  std::uniform_real_distribution<float> dist(-1, 1);
  auto gen = [&dist, &rng]() {
    return dist(rng);
  };
  std::generate(col.begin(), col.end(), gen);

  // steady_clock is monotonic, which is what interval measurement needs.
  const auto start = std::chrono::steady_clock::now();
  for (std::uint32_t i = 0; i < n_iters; ++i) {
    // col2im accumulates into `im`, so it is cleared before every run.
    std::fill(im.begin(), im.end(), 0);
    col2im(
        channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col, 0, im, 0);
  }
  const auto end = std::chrono::steady_clock::now();

  // Fold the whole image into a single value that is printed below.
  std::size_t hash_val = 0;
  std::hash<float> h;
  for (auto v : im) {
    hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2);
  }

  const auto diff = end - start;
  std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" << std::endl;
  std::cout << "Hash = " << hash_val << std::endl;
  // for (auto x : im) {
  // std::cout << x << ", ";
  // }
  // std::cout << std::endl;
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment