Last active
July 31, 2018 11:32
-
-
Save vbkaisetsu/a98299df827f9a5245635f646c1d94be to your computer and use it in GitHub Desktop.
col2im implementations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <chrono> | |
#include <functional> | |
#include <iostream> | |
#include <random> | |
#define CL_HPP_ENABLE_EXCEPTIONS | |
#define CL_HPP_MINIMUM_OPENCL_VERSION 120 | |
#define CL_HPP_TARGET_OPENCL_VERSION 120 | |
#define CL_HPP_CL_1_2_DEFAULT_BUILD | |
#include <CL/cl2.hpp> | |
/// Returns the OpenCL platforms reported by cl::Platform::get.
///
/// @return The vector filled in by cl::Platform::get.
std::vector<cl::Platform> get_all_platforms() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  return platforms;
}
std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) { | |
const auto all_pfs = get_all_platforms(); | |
std::vector<cl::Device> ret; | |
all_pfs[platform_id].getDevices(CL_DEVICE_TYPE_ALL, &ret); | |
return ret; | |
} | |
/// Returns one device, selected by platform index and device index.
///
/// @param platform_id Index of the platform, forwarded to get_all_devices().
/// @param device_id   Index into the device list of that platform.
/// @return A copy of the selected cl::Device handle.
/// @throws std::out_of_range if device_id is not a valid device index.
cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
  const auto all_devs = get_all_devices(platform_id);
  // Bounds-checked access: an out-of-range index would otherwise be
  // undefined behaviour.
  return all_devs.at(device_id);
}
/// Returns how many blocks of `num_threads` elements are needed to cover
/// `size` elements, i.e. ceil(size / num_threads).
///
/// @param size        Number of elements to cover.
/// @param num_threads Elements per block; must be non-zero.
/// @return The rounded-up quotient.
std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
  // Quotient plus a remainder test instead of (size + num_threads - 1) /
  // num_threads: the sum wraps around for sizes close to UINT32_MAX.
  return size / num_threads + (size % num_threads != 0 ? 1u : 0u);
}
/// Sets the 17 arguments of the caffe-style col2im kernel and enqueues it.
///
/// The launch uses 16x16 work-groups. Dimension 0 covers `width`, dimension 1
/// covers `height * channels`; both are rounded up to a multiple of 16. This
/// function only enqueues the kernel and does not wait for it.
///
/// @param kernel      Kernel object whose arguments are (re)assigned here.
/// @param queue       Command queue the kernel is enqueued on.
/// @param channels    Number of image channels.
/// @param height      Image height.
/// @param width       Image width.
/// @param kernel_h    Window height.
/// @param kernel_w    Window width.
/// @param pad_h       Padding along the height axis.
/// @param pad_w       Padding along the width axis.
/// @param stride_h    Window stride along the height axis; must be non-zero.
/// @param stride_w    Window stride along the width axis; must be non-zero.
/// @param dilation_h  Window dilation along the height axis.
/// @param dilation_w  Window dilation along the width axis.
/// @param col_buffer  Buffer holding the column data (kernel argument 13).
/// @param col_offset  Element offset into col_buffer (kernel argument 14).
/// @param im_buffer   Buffer receiving the image (kernel argument 15).
/// @param im_offset   Element offset into im_buffer (kernel argument 16).
void col2im(
    cl::Kernel &kernel, cl::CommandQueue &queue,
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
    cl::Buffer &im_buffer, const std::uint32_t im_offset) {
  // Edge length of the square work-group used for the launch.
  const std::uint32_t local_size = 16;

  // Number of window positions along one axis. Falls back to 1 when the
  // dilated window is larger than the padded extent.
  const auto output_extent = [](std::uint32_t extent, std::uint32_t pad, std::uint32_t window,
                                std::uint32_t stride, std::uint32_t dilation) -> std::uint32_t {
    const std::uint32_t padded = extent + 2 * pad;
    const std::uint32_t dilated_window = dilation * (window - 1) + 1;
    return (padded >= dilated_window) ? (padded - dilated_window) / stride + 1 : 1;
  };
  const std::uint32_t output_h = output_extent(height, pad_h, kernel_h, stride_h, dilation_h);
  const std::uint32_t output_w = output_extent(width, pad_w, kernel_w, stride_w, dilation_w);

  // Work-group counts per dimension.
  const std::uint32_t g1 = calc_num_blocks(width, local_size);
  const std::uint32_t g2 = calc_num_blocks(height * channels, local_size);

  // Argument order must match the parameter list of col2im_kernel.
  kernel.setArg(0, height);
  kernel.setArg(1, width);
  kernel.setArg(2, channels);
  kernel.setArg(3, output_h);
  kernel.setArg(4, output_w);
  kernel.setArg(5, kernel_h);
  kernel.setArg(6, kernel_w);
  kernel.setArg(7, pad_h);
  kernel.setArg(8, pad_w);
  kernel.setArg(9, stride_h);
  kernel.setArg(10, stride_w);
  kernel.setArg(11, dilation_h);
  kernel.setArg(12, dilation_w);
  kernel.setArg(13, col_buffer);
  kernel.setArg(14, col_offset);
  kernel.setArg(15, im_buffer);
  kernel.setArg(16, im_offset);

  queue.enqueueNDRangeKernel(
      kernel, cl::NullRange,
      cl::NDRange(g1 * local_size, g2 * local_size, 1),
      cl::NDRange(local_size, local_size, 1));
}
/*
 * OpenCL C source of the col2im kernel, written in reference to caffe:
 * https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
 *
 * Each work-item handles one image pixel: global id 0 selects the column,
 * global id 1 encodes (channel, row) as channel * input_h + row. The
 * work-item sums every column-buffer entry that maps onto its pixel and
 * stores the sum into im_buffer (plain assignment, not accumulation).
 * Work-items whose column or channel falls outside the image do nothing.
 */
std::string kernel_string = R"EOS(
kernel void col2im_kernel(
    const int input_h, const int input_w, const int channels,
    const int output_h, const int output_w,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const global float *col_buffer, const int col_offset,
    __global float *im_buffer, const int im_offset) {
  const int x_x = get_global_id(0) + pad_w;
  const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
  const int channel = ((int) get_global_id(1)) / input_h;
  const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
  const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
  const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
  const int x_channel_shift = channel * input_h * input_w + im_offset;
  const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
  const int t_y_end = min(x_y / stride_h + 1, output_h);
  const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
  const int t_x_end = min(x_x / stride_w + 1, output_w);
  if (x_x < input_w + pad_w && channel < channels) {
    float val = 0;
    for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
      for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
        int w_y = x_y - t_y * stride_h;
        int w_x = x_x - t_x * stride_w;
        if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
          w_y /= dilation_h;
          w_x /= dilation_w;
          val += col_buffer[col_channel_shift
                            + (w_x + w_y * kernel_w) * output_h * output_w
                            + t_y * output_w
                            + t_x];
        }
      }
    }
    im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
  }
}
)EOS";
int main() { | |
cl::Device device(get_device(0, 0)); | |
cl::Context context({ device }); | |
cl::CommandQueue queue(context, device, 0); | |
cl::Program program(context, kernel_string); | |
try { | |
program.build({device}); | |
} catch (...) { | |
std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl; | |
return 1; | |
} | |
cl::Kernel col2im_kernel(program, "col2im_kernel"); | |
const std::uint32_t im_height = 512; | |
const std::uint32_t im_width = 512; | |
const std::uint32_t w_height = 16; | |
const std::uint32_t w_width = 16; | |
const std::uint32_t channels = 1; | |
const std::uint32_t pad_h = 8; | |
const std::uint32_t pad_w = 8; | |
const std::uint32_t stride_h = 1; | |
const std::uint32_t stride_w = 1; | |
const std::uint32_t dilation_h = 1; | |
const std::uint32_t dilation_w = 1; | |
const std::uint32_t n_iters = 500; | |
std::cout << "col2im (GPU caffe)" <<std::endl; | |
std::cout << "=====================" << std::endl; | |
std::cout << "im_height: " << im_height << std::endl; | |
std::cout << "im_width: " << im_width << std::endl; | |
std::cout << "w_height: " << w_height << std::endl; | |
std::cout << "w_width: " << w_width << std::endl; | |
std::cout << "channels: " << channels << std::endl; | |
std::cout << "pad_h: " << pad_h << std::endl; | |
std::cout << "pad_w: " << pad_w << std::endl; | |
std::cout << "stride_h: " << stride_h << std::endl; | |
std::cout << "stride_w: " << stride_w << std::endl; | |
std::cout << "dilation_h: " << dilation_h << std::endl; | |
std::cout << "dilation_w: " << dilation_w << std::endl; | |
std::cout << "n_iters: " << n_iters << std::endl; | |
std::cout << "=====================" << std::endl; | |
const std::uint32_t x0 = im_height + 2 * pad_h; | |
const std::uint32_t x1 = im_width + 2 * pad_w; | |
const std::uint32_t w0 = (w_height - 1) * dilation_h + 1; | |
const std::uint32_t w1 = (w_width - 1) * dilation_w + 1; | |
const std::uint32_t im_size = im_height * im_width * channels; | |
const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels; | |
std::vector<float> im(im_size); | |
std::vector<float> col(col_size); | |
std::mt19937 rng(12345); | |
std::uniform_real_distribution<float> dist(-1, 1); | |
auto gen = [&dist, &rng]() { | |
return dist(rng); | |
}; | |
std::generate(col.begin(), col.end(), gen); | |
cl::Buffer col_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr); | |
cl::Buffer im_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr); | |
float *mapped_col_buffer = static_cast<float *>( | |
queue.enqueueMapBuffer( | |
col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0)); | |
std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size); | |
queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer); | |
auto start = std::chrono::system_clock::now(); | |
for (auto i = 0; i < n_iters; ++i) { | |
queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size); | |
col2im(col2im_kernel, queue, | |
channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0); | |
} | |
auto end = std::chrono::system_clock::now(); | |
float *mapped_im_buffer = static_cast<float *>(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0)); | |
std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size); | |
queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer); | |
std::size_t hash_val = 0; | |
std::hash<float> h; | |
for (auto v : im) { | |
hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2); | |
} | |
auto diff = end - start; | |
std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl; | |
std::cout << "Hash = " << hash_val <<std::endl; | |
// for (auto x : im) { | |
// std::cout << x << ", "; | |
// } | |
// std::cout << std::endl; | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <chrono> | |
#include <functional> | |
#include <iostream> | |
#include <random> | |
#define CL_HPP_ENABLE_EXCEPTIONS | |
#define CL_HPP_MINIMUM_OPENCL_VERSION 120 | |
#define CL_HPP_TARGET_OPENCL_VERSION 120 | |
#define CL_HPP_CL_1_2_DEFAULT_BUILD | |
#include <CL/cl2.hpp> | |
/// Returns the OpenCL platforms reported by cl::Platform::get.
///
/// @return The vector filled in by cl::Platform::get.
std::vector<cl::Platform> get_all_platforms() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  return platforms;
}
std::vector<cl::Device> get_all_devices(std::uint32_t platform_id) { | |
const auto all_pfs = get_all_platforms(); | |
std::vector<cl::Device> ret; | |
all_pfs[platform_id].getDevices(CL_DEVICE_TYPE_ALL, &ret); | |
return ret; | |
} | |
/// Returns one device, selected by platform index and device index.
///
/// @param platform_id Index of the platform, forwarded to get_all_devices().
/// @param device_id   Index into the device list of that platform.
/// @return A copy of the selected cl::Device handle.
/// @throws std::out_of_range if device_id is not a valid device index.
cl::Device get_device(std::uint32_t platform_id, std::uint32_t device_id) {
  const auto all_devs = get_all_devices(platform_id);
  // Bounds-checked access: an out-of-range index would otherwise be
  // undefined behaviour.
  return all_devs.at(device_id);
}
/// Returns how many blocks of `num_threads` elements are needed to cover
/// `size` elements, i.e. ceil(size / num_threads).
///
/// @param size        Number of elements to cover.
/// @param num_threads Elements per block; must be non-zero.
/// @return The rounded-up quotient.
std::uint32_t calc_num_blocks(std::uint32_t size, std::uint32_t num_threads) {
  // Quotient plus a remainder test instead of (size + num_threads - 1) /
  // num_threads: the sum wraps around for sizes close to UINT32_MAX.
  return size / num_threads + (size % num_threads != 0 ? 1u : 0u);
}
/// Extended Euclidean algorithm.
///
/// Computes r = gcd(a, b) together with coefficients p and q such that
/// p * a + q * b == r.
///
/// @param a First operand.
/// @param b Second operand. When b is 0 the result is r = a, p = 1, q = 0.
/// @param p Output: coefficient of a.
/// @param q Output: coefficient of b.
/// @param r Output: the greatest common divisor.
void euclid_gcd(std::int32_t a, std::int32_t b, std::int32_t &p, std::int32_t &q, std::int32_t &r) {
  // gcd(a, 0) == a; handled up front because the loop below would otherwise
  // evaluate a % 0.
  if (b == 0) {
    p = 1;
    q = 0;
    r = a;
    return;
  }

  // (p, q) are the coefficients of the current divisor b, (p_1, q_1) those
  // of the current dividend a, both expressed in terms of the original
  // inputs.
  p = 0;
  q = 1;
  std::int32_t p_1 = 1;
  std::int32_t q_1 = 0;
  for (;;) {
    const std::int32_t quotient = a / b;
    const std::int32_t remainder = a % b;
    if (remainder == 0) {
      break;
    }
    // remainder = a - quotient * b, so its coefficients follow from the
    // coefficients of a and b.
    const std::int32_t p_2 = p_1;
    const std::int32_t q_2 = q_1;
    p_1 = p;
    q_1 = q;
    p = p_2 - p_1 * quotient;
    q = q_2 - q_1 * quotient;
    a = b;
    b = remainder;
  }
  r = b;
}
/// Sets the 23 arguments of the GCD/LCM col2im kernel and enqueues it.
///
/// For each axis, euclid_gcd(stride, dilation) yields the gcd and a
/// coefficient pair, all of which are passed to the kernel. The launch uses
/// 16x16 work-groups. Dimension 0 covers ceil(width / gcd_w) positions and
/// dimension 1 covers ceil(height / gcd_h) * channels positions, both
/// rounded up to a multiple of 16. This function only enqueues the kernel
/// and does not wait for it.
///
/// @param kernel      Kernel object whose arguments are (re)assigned here.
/// @param queue       Command queue the kernel is enqueued on.
/// @param channels    Number of image channels.
/// @param height      Image height.
/// @param width       Image width.
/// @param kernel_h    Window height.
/// @param kernel_w    Window width.
/// @param pad_h       Padding along the height axis.
/// @param pad_w       Padding along the width axis.
/// @param stride_h    Window stride along the height axis; must be non-zero.
/// @param stride_w    Window stride along the width axis; must be non-zero.
/// @param dilation_h  Window dilation along the height axis.
/// @param dilation_w  Window dilation along the width axis.
/// @param col_buffer  Buffer holding the column data (kernel argument 19).
/// @param col_offset  Element offset into col_buffer (kernel argument 20).
/// @param im_buffer   Buffer receiving the image (kernel argument 21).
/// @param im_offset   Element offset into im_buffer (kernel argument 22).
void col2im(
    cl::Kernel &kernel, cl::CommandQueue &queue,
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const cl::Buffer &col_buffer, const std::uint32_t col_offset,
    cl::Buffer &im_buffer, const std::uint32_t im_offset) {
  // Edge length of the square work-group used for the launch.
  const std::uint32_t local_size = 16;

  // Per axis: stride_bez * stride + dilation_bez * dilation == gcd.
  std::int32_t stride_bez_h = 0;
  std::int32_t dilation_bez_h = 0;
  std::int32_t gcd_h = 0;
  std::int32_t stride_bez_w = 0;
  std::int32_t dilation_bez_w = 0;
  std::int32_t gcd_w = 0;
  euclid_gcd(stride_h, dilation_h, stride_bez_h, dilation_bez_h, gcd_h);
  euclid_gcd(stride_w, dilation_w, stride_bez_w, dilation_bez_w, gcd_w);

  // Number of window positions per axis. Falls back to 1 when the dilated
  // window is larger than the padded extent.
  const std::uint32_t size_h = height + 2 * pad_h;
  const std::uint32_t padding_h = dilation_h * (kernel_h - 1) + 1;
  const std::uint32_t output_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
  const std::uint32_t size_w = width + 2 * pad_w;
  const std::uint32_t padding_w = dilation_w * (kernel_w - 1) + 1;
  const std::uint32_t output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

  // Work-group counts per dimension; one work-item per gcd-sized step.
  const std::uint32_t g1 = calc_num_blocks((width - 1) / gcd_w + 1, local_size);
  const std::uint32_t g2 = calc_num_blocks(((height - 1) / gcd_h + 1) * channels, local_size);

  // Argument order must match the parameter list of col2im_kernel.
  kernel.setArg(0, height);
  kernel.setArg(1, width);
  kernel.setArg(2, channels);
  kernel.setArg(3, output_h);
  kernel.setArg(4, output_w);
  kernel.setArg(5, kernel_h);
  kernel.setArg(6, kernel_w);
  kernel.setArg(7, pad_h);
  kernel.setArg(8, pad_w);
  kernel.setArg(9, stride_h);
  kernel.setArg(10, stride_w);
  kernel.setArg(11, dilation_h);
  kernel.setArg(12, dilation_w);
  kernel.setArg(13, stride_bez_h);
  kernel.setArg(14, stride_bez_w);
  kernel.setArg(15, dilation_bez_h);
  kernel.setArg(16, dilation_bez_w);
  kernel.setArg(17, gcd_h);
  kernel.setArg(18, gcd_w);
  kernel.setArg(19, col_buffer);
  kernel.setArg(20, col_offset);
  kernel.setArg(21, im_buffer);
  kernel.setArg(22, im_offset);

  queue.enqueueNDRangeKernel(
      kernel, cl::NullRange,
      cl::NDRange(g1 * local_size, g2 * local_size, 1),
      cl::NDRange(local_size, local_size, 1));
}
/*
 * OpenCL C source of the GCD/LCM col2im kernel.
 *
 * Each work-item handles one padded image coordinate that is a multiple of
 * gcd(stride, dilation) on both axes: global id 0 selects the column step,
 * global id 1 encodes (channel, row step). Using the coefficients computed
 * on the host (stride_bez * stride + dilation_bez * dilation == gcd), the
 * loops visit only the (output position, window offset) pairs that map onto
 * the pixel, stepping by stride * dilation / gcd. The sum is stored into
 * im_buffer (plain assignment); pixels that no work-item covers are left
 * untouched. Work-items whose pixel or channel falls outside the image do
 * nothing.
 */
std::string kernel_string = R"EOS(
inline int grid_ceil(const int x, const int step) {
  return x > 0 ? ((x - 1) / step + 1) * step : x / step * step;
}

kernel void col2im_kernel(
    const int input_h, const int input_w, const int channels,
    const int output_h, const int output_w,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int stride_bez_h, const int stride_bez_w,
    const int dilation_bez_h, const int dilation_bez_w,
    const int gcd_h, const int gcd_w,
    const global float *col_buffer, const int col_offset,
    __global float *im_buffer, const int im_offset) {
  const int input_h_scale = (input_h - 1) / gcd_h + 1;
  // ceil(pad / gcd); written so that pad == 0 yields 0 for every gcd.
  const int gcd_scale_w = get_global_id(0) + (pad_w + gcd_w - 1) / gcd_w;
  const int gcd_scale_h = ((int) get_global_id(1)) % input_h_scale + (pad_h + gcd_h - 1) / gcd_h;
  const int x_x = gcd_scale_w * gcd_w - pad_w;
  const int x_y = gcd_scale_h * gcd_h - pad_h;
  const int channel = ((int) get_global_id(1)) / input_h_scale;
  const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
  const int x_channel_shift = channel * input_h * input_w + im_offset;
  const int t_y_step = stride_h * dilation_h / gcd_h;
  const int t_y_begin = grid_ceil(max(-stride_bez_h * gcd_scale_h * stride_h,
                                      (dilation_bez_h * gcd_scale_h - kernel_h + 1) * dilation_h),
                                  t_y_step);
  const int t_y_end = min((output_h - stride_bez_h * gcd_scale_h) * stride_h,
                          (dilation_bez_h * gcd_scale_h + 1) * dilation_h);
  const int t_x_step = stride_w * dilation_w / gcd_w;
  const int t_x_begin = grid_ceil(max(-stride_bez_w * gcd_scale_w * stride_w,
                                      (dilation_bez_w * gcd_scale_w - kernel_w + 1) * dilation_w),
                                  t_x_step);
  const int t_x_end = min((output_w - stride_bez_w * gcd_scale_w) * stride_w,
                          (dilation_bez_w * gcd_scale_w + 1) * dilation_w);
  // The row guard matters when gcd_h > 1: the last row step can land past
  // the image and would otherwise write out of bounds.
  if (x_x < input_w && x_y < input_h && channel < channels) {
    float val = 0;
    for (int t_y = t_y_begin; t_y < t_y_end; t_y += t_y_step) {
      for (int t_x = t_x_begin; t_x < t_x_end; t_x += t_x_step) {
        const int w_y = -t_y / dilation_h + dilation_bez_h * gcd_scale_h;
        const int y_y = t_y / stride_h + stride_bez_h * gcd_scale_h;
        const int w_x = -t_x / dilation_w + dilation_bez_w * gcd_scale_w;
        const int y_x = t_x / stride_w + stride_bez_w * gcd_scale_w;
        val += col_buffer[col_channel_shift
                          + (w_x + w_y * kernel_w) * output_h * output_w
                          + y_y * output_w
                          + y_x];
      }
    }
    im_buffer[x_channel_shift + x_y * input_w + x_x] = val;
  }
}
)EOS";
int main() { | |
cl::Device device(get_device(0, 0)); | |
cl::Context context({ device }); | |
cl::CommandQueue queue(context, device, 0); | |
cl::Program program(context, kernel_string); | |
try { | |
program.build({device}); | |
} catch (...) { | |
std::cerr << "OpenCL kernel compile error:" << std::endl << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl; | |
return 1; | |
} | |
cl::Kernel col2im_kernel(program, "col2im_kernel"); | |
const std::uint32_t im_height = 512; | |
const std::uint32_t im_width = 512; | |
const std::uint32_t w_height = 16; | |
const std::uint32_t w_width = 16; | |
const std::uint32_t channels = 1; | |
const std::uint32_t pad_h = 8; | |
const std::uint32_t pad_w = 8; | |
const std::uint32_t stride_h = 1; | |
const std::uint32_t stride_w = 1; | |
const std::uint32_t dilation_h = 1; | |
const std::uint32_t dilation_w = 1; | |
const std::uint32_t n_iters = 500; | |
std::cout << "col2im (GPU GCD/LCM)" <<std::endl; | |
std::cout << "=====================" << std::endl; | |
std::cout << "im_height: " << im_height << std::endl; | |
std::cout << "im_width: " << im_width << std::endl; | |
std::cout << "w_height: " << w_height << std::endl; | |
std::cout << "w_width: " << w_width << std::endl; | |
std::cout << "channels: " << channels << std::endl; | |
std::cout << "pad_h: " << pad_h << std::endl; | |
std::cout << "pad_w: " << pad_w << std::endl; | |
std::cout << "stride_h: " << stride_h << std::endl; | |
std::cout << "stride_w: " << stride_w << std::endl; | |
std::cout << "dilation_h: " << dilation_h << std::endl; | |
std::cout << "dilation_w: " << dilation_w << std::endl; | |
std::cout << "n_iters: " << n_iters << std::endl; | |
std::cout << "=====================" << std::endl; | |
const std::uint32_t x0 = im_height + 2 * pad_h; | |
const std::uint32_t x1 = im_width + 2 * pad_w; | |
const std::uint32_t w0 = (w_height - 1) * dilation_h + 1; | |
const std::uint32_t w1 = (w_width - 1) * dilation_w + 1; | |
const std::uint32_t im_size = im_height * im_width * channels; | |
const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels; | |
std::vector<float> im(im_size); | |
std::vector<float> col(col_size); | |
std::mt19937 rng(12345); | |
std::uniform_real_distribution<float> dist(-1, 1); | |
auto gen = [&dist, &rng]() { | |
return dist(rng); | |
}; | |
std::generate(col.begin(), col.end(), gen); | |
cl::Buffer col_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, col_size * sizeof(float), nullptr); | |
cl::Buffer im_buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, im_size * sizeof(float), nullptr); | |
float *mapped_col_buffer = static_cast<float *>( | |
queue.enqueueMapBuffer( | |
col_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * col_size, 0)); | |
std::memcpy(mapped_col_buffer, col.data(), sizeof(float) * col_size); | |
queue.enqueueUnmapMemObject(col_buffer, mapped_col_buffer); | |
auto start = std::chrono::system_clock::now(); | |
for (auto i = 0; i < n_iters; ++i) { | |
queue.enqueueFillBuffer<float>(im_buffer, 0, 0, sizeof(float) * im_size); | |
col2im(col2im_kernel, queue, | |
channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_buffer, 0, im_buffer, 0); | |
} | |
auto end = std::chrono::system_clock::now(); | |
float *mapped_im_buffer = static_cast<float *>(queue.enqueueMapBuffer(im_buffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * im_size, 0)); | |
std::memcpy(im.data(), mapped_im_buffer, sizeof(float) * im_size); | |
queue.enqueueUnmapMemObject(im_buffer, mapped_im_buffer); | |
std::size_t hash_val = 0; | |
std::hash<float> h; | |
for (auto v : im) { | |
hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2); | |
} | |
auto diff = end - start; | |
std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl; | |
std::cout << "Hash = " << hash_val <<std::endl; | |
// for (auto x : im) { | |
// std::cout << x << ", "; | |
// } | |
// std::cout << std::endl; | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <chrono> | |
#include <functional> | |
#include <iostream> | |
#include <random> | |
/// Naive CPU col2im: accumulates column data back into an image.
///
/// For every channel, output position (y_y, y_x) and window offset
/// (w_y, w_x), the column entry is added to the image pixel at
///   (y_y * stride_h + w_y * dilation_h - pad_h,
///    y_x * stride_w + w_x * dilation_w - pad_w)
/// whenever that pixel lies inside the image. Values are added to the
/// existing contents of `im`; the caller is responsible for zeroing it.
///
/// @param channels    Number of image channels.
/// @param height      Image height.
/// @param width       Image width.
/// @param kernel_h    Window height.
/// @param kernel_w    Window width.
/// @param pad_h       Padding along the height axis.
/// @param pad_w       Padding along the width axis.
/// @param stride_h    Window stride along the height axis; must be non-zero.
/// @param stride_w    Window stride along the width axis; must be non-zero.
/// @param dilation_h  Window dilation along the height axis.
/// @param dilation_w  Window dilation along the width axis.
/// @param col         Column data, indexed as
///                    ((channel * kernel_h + w_y) * kernel_w + w_x)
///                    * output_h * output_w + y_y * output_w + y_x.
/// @param col_offset  Element offset added to every index into `col`.
/// @param im          Image data, indexed as
///                    (channel * height + y) * width + x.
/// @param im_offset   Element offset added to every index into `im`.
void col2im(
    const std::uint32_t channels, const std::uint32_t height, const std::uint32_t width,
    const std::uint32_t kernel_h, const std::uint32_t kernel_w, const std::uint32_t pad_h,
    const std::uint32_t pad_w, const std::uint32_t stride_h, const std::uint32_t stride_w,
    const std::uint32_t dilation_h, const std::uint32_t dilation_w,
    const std::vector<float> &col, const std::uint32_t col_offset,
    std::vector<float> &im, const std::uint32_t im_offset) {
  // Number of window positions per axis. Falls back to 1 when the dilated
  // window is larger than the padded extent.
  const std::uint32_t size_h = height + 2 * pad_h;
  const std::uint32_t padding_h = dilation_h * (kernel_h - 1) + 1;
  const std::uint32_t output_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
  const std::uint32_t size_w = width + 2 * pad_w;
  const std::uint32_t padding_w = dilation_w * (kernel_w - 1) + 1;
  const std::uint32_t output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

  const std::size_t out_plane = static_cast<std::size_t>(output_h) * output_w;

  for (std::uint32_t channel = 0; channel < channels; ++channel) {
    const std::size_t col_channel_shift =
        static_cast<std::size_t>(channel) * kernel_w * kernel_h * out_plane + col_offset;
    const std::size_t x_channel_shift =
        static_cast<std::size_t>(channel) * height * width + im_offset;
    // The loop nesting fixes the floating-point accumulation order per
    // pixel; keep it as is so results stay bit-for-bit reproducible.
    for (std::uint32_t y_y = 0; y_y < output_h; ++y_y) {
      for (std::uint32_t y_x = 0; y_x < output_w; ++y_x) {
        for (std::uint32_t w_y = 0; w_y < kernel_h; ++w_y) {
          // Signed 64-bit arithmetic: the coordinate is negative inside the
          // top padding.
          const std::int64_t x_y = static_cast<std::int64_t>(y_y) * stride_h
                                   + static_cast<std::int64_t>(w_y) * dilation_h
                                   - pad_h;
          if (x_y < 0 || x_y >= height) {
            continue;
          }
          for (std::uint32_t w_x = 0; w_x < kernel_w; ++w_x) {
            const std::int64_t x_x = static_cast<std::int64_t>(y_x) * stride_w
                                     + static_cast<std::int64_t>(w_x) * dilation_w
                                     - pad_w;
            if (x_x < 0 || x_x >= width) {
              continue;
            }
            im[x_channel_shift + static_cast<std::size_t>(x_y) * width + static_cast<std::size_t>(x_x)]
                += col[col_channel_shift
                       + (w_x + static_cast<std::size_t>(w_y) * kernel_w) * out_plane
                       + static_cast<std::size_t>(y_y) * output_w
                       + y_x];
          }
        }
      }
    }
  }
}
int main() { | |
const std::uint32_t im_height = 512; | |
const std::uint32_t im_width = 512; | |
const std::uint32_t w_height = 16; | |
const std::uint32_t w_width = 16; | |
const std::uint32_t channels = 1; | |
const std::uint32_t pad_h = 8; | |
const std::uint32_t pad_w = 8; | |
const std::uint32_t stride_h = 1; | |
const std::uint32_t stride_w = 1; | |
const std::uint32_t dilation_h = 1; | |
const std::uint32_t dilation_w = 1; | |
const std::uint32_t n_iters = 1; | |
std::cout << "col2im (Naive)" <<std::endl; | |
std::cout << "=====================" << std::endl; | |
std::cout << "im_height: " << im_height << std::endl; | |
std::cout << "im_width: " << im_width << std::endl; | |
std::cout << "w_height: " << w_height << std::endl; | |
std::cout << "w_width: " << w_width << std::endl; | |
std::cout << "channels: " << channels << std::endl; | |
std::cout << "pad_h: " << pad_h << std::endl; | |
std::cout << "pad_w: " << pad_w << std::endl; | |
std::cout << "stride_h: " << stride_h << std::endl; | |
std::cout << "stride_w: " << stride_w << std::endl; | |
std::cout << "dilation_h: " << dilation_h << std::endl; | |
std::cout << "dilation_w: " << dilation_w << std::endl; | |
std::cout << "n_iters: " << n_iters << std::endl; | |
std::cout << "=====================" << std::endl; | |
const std::uint32_t x0 = im_height + 2 * pad_h; | |
const std::uint32_t x1 = im_width + 2 * pad_w; | |
const std::uint32_t w0 = (w_height - 1) * dilation_h + 1; | |
const std::uint32_t w1 = (w_width - 1) * dilation_w + 1; | |
const std::uint32_t im_size = im_height * im_width * channels; | |
const std::uint32_t col_size = ((x0 - w0) / stride_h + 1) * ((x1 - w1) / stride_w + 1) * w_height * w_width * channels; | |
std::vector<float> im(im_size); | |
std::vector<float> col(col_size); | |
std::mt19937 rng(12345); | |
std::uniform_real_distribution<float> dist(-1, 1); | |
auto gen = [&dist, &rng]() { | |
return dist(rng); | |
}; | |
std::generate(col.begin(), col.end(), gen); | |
auto start = std::chrono::system_clock::now(); | |
for (auto i = 0; i < n_iters; ++i) { | |
std::fill(im.begin(), im.end(), 0); | |
col2im( | |
channels, im_height, im_width, w_height, w_width, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col, 0, im, 0); | |
} | |
auto end = std::chrono::system_clock::now(); | |
std::size_t hash_val = 0; | |
std::hash<float> h; | |
for (auto v : im) { | |
hash_val ^= h(v) + 0x9e3779b9 + (hash_val << 6) + (hash_val >> 2); | |
} | |
auto diff = end - start; | |
std::cout << "Elapsed time = " << std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() << " [ms]" <<std::endl; | |
std::cout << "Hash = " << hash_val <<std::endl; | |
// for (auto x : im) { | |
// std::cout << x << ", "; | |
// } | |
// std::cout << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment