Last active
April 13, 2018 09:13
-
-
Save kinchungwong/94a090ace1bdd67c02cf932c4a4b8a2a to your computer and use it in GitHub Desktop.
OpenCV C++ Convolution Function (Kernel) concept
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// One-dimensional fixed-size array holding `nv` values of type `vec_type`.
///
/// This is a plain alias for `std::array<vec_type, nv>`.
///
/// Note: `nv` is an `int`, whereas the extent of `std::array` is a
/// `std::size_t`. A function template that takes a `v_arr<vec_type, nv>`
/// parameter therefore cannot have `nv` deduced from the argument on a
/// conforming compiler; callers must spell `nv` explicitly.
template <typename vec_type, int nv>
using v_arr = std::array<vec_type, nv>;
/// Two-dimensional fixed-size array: `nvr` rows, each row an array of `nvc`
/// values of type `vec_type`.
///
/// This is a plain alias for `std::array<std::array<vec_type, nvc>, nvr>`, so
/// `arr[r][c]` addresses row `r`, column `c`.
///
/// Note: `nvr` and `nvc` are `int`, whereas the extents of `std::array` are
/// `std::size_t`; they cannot be deduced from a function argument on a
/// conforming compiler and must be given explicitly.
template <typename vec_type, int nvr, int nvc>
using v_arr2d = std::array<std::array<vec_type, nvc>, nvr>;
template <int s, typename vec_type, int nv> | |
inline vec_type v_extract(const std::array<vec_type, nv>& input) | |
{ | |
const v_extract_array_helper<s, vec_type, nv> helper; | |
return helper(input); | |
} | |
template <int s, typename vec_type, int nv> | |
inline void v_insert(std::array<vec_type, nv>& output, const vec_type& input) | |
{ | |
const v_insert_array_helper<s, vec_type, nv> helper; | |
helper(output, input); | |
} | |
template <int row_start, int col_start, typename _Tpvec, int nvr, int nvc> | |
inline _Tpvec v_extract(const v_arr2d<_Tpvec, nvr, nvc>& input) | |
{ | |
CV_StaticAssert((row_start >= 0) && (row_start < nvr), "Invalid row_start for v_extract(v_arr2d)"); | |
return v_extract<col_start, _Tpvec, nvc>(input[row_start]); | |
} | |
/// Compile-time geometry of a `v_arr2d<vec_type, nvr, nvc>`.
///
/// All values are enumerators so they can be used directly as template
/// arguments.
///
/// @tparam vec_type  Vector type; must expose a compile-time `nlanes` member.
/// @tparam nvr       Number of rows in the array.
/// @tparam nvc       Number of vectors per row.
template <typename vec_type, int nvr, int nvc>
struct v_arr_center_info
{
    enum
    {
        /// Index of the middle row (the lower of the two for an even `nvr`).
        center_row = ((nvr - 1) / 2),
        /// Number of lanes in a single vector.
        lanes_per_vec = vec_type::nlanes,
        /// Total number of lanes across one row of `nvc` vectors.
        lanes_per_row = (lanes_per_vec * nvc),
        /// First lane of a window that is one vector wide and horizontally
        /// centred within the row.
        center_col = ((lanes_per_row - lanes_per_vec) / 2),
    };
};
template <int row_offset, int col_offset, typename vec_type, int nvr, int nvc> | |
inline _Tpvec v_extract_around_center(const v_arr2d<vec_type, nvr, nvc>& input) | |
{ | |
using center_info = v_arr_center_info<vec_type, nvr, nvc>; | |
return v_extract< | |
(row_offset + center_info::center_row), | |
(col_offset + center_info::center_col), | |
vec_type, nvr, nvc>(input); | |
} | |
//
// The following temporary helper classes are meant to be used strictly locally:
// created and destroyed within the same local scope (i.e. within one function),
// not passed around, not copied, not placed on the heap, and never having their
// address taken. When they are used that way, an optimizing C++ compiler should
// be able to eliminate all of the scaffolding, keeping only the state-changing
// part of the code. This means the only instructions that will be generated are
// the assignment (insert) and the extraction (disguised as an implicit
// overloaded conversion operator).
//
template <typename vec_type, int nvr, int nvc> | |
class NeighborClass | |
{ | |
using arraytype = v_arr2d<vec_type, nvr, nvc>; | |
public: | |
NeighborClass(arraytype& array_ref) | |
: m_array_ref(array_ref) | |
{ | |
} | |
private: | |
template <int row_offset, int col_offset> | |
class Ref | |
{ | |
public: | |
Ref(NeighborClass& host) | |
: m_host(host) | |
{} | |
public: | |
operator vec_type() const | |
{ | |
return v_extract_around_center<row_offset, col_offset>(m_host.m_array_ref); | |
} | |
void operator = (const vec_type& v) | |
{ | |
v_insert_around_center<row_offset, col_offset>(m_host.m_array_ref, v); | |
} | |
private: | |
NeighborClass& m_host; | |
}; | |
public: | |
Ref<-1, -1> top_left() | |
{ | |
return Ref<-1, -1>(*this); | |
} | |
Ref<-1, 0> top_center() | |
{ | |
return Ref<-1, 0>(*this); | |
} | |
// Etc for the other neighbor pixels. | |
public: | |
arraytype& m_array_ref; | |
}; | |
template <typename vec_type, int nvr, int nvc> | |
NeighborClass<vec_type, nvr, nvc> Neighbor(cv::hal_baseline::cv400array::v_arr2d<vec_type, nvr, nvc>& ref) | |
{ | |
return NeighborClass<vec_type, nvr, nvc>(ref); | |
} | |
template <class YourFunc> | |
void test() | |
{ | |
v_arr2d<cv::v_float32x4, 3, 2> input = | |
{ | |
{ | |
cv::v_float32x4{ 101, 102, 103, 104 }, cv::v_float32x4{ 105, 106, 107, 108 }, | |
cv::v_float32x4{ 201, 202, 203, 204 }, cv::v_float32x4{ 205, 206, 207, 208 }, | |
cv::v_float32x4{ 301, 302, 303, 304 }, cv::v_float32x4{ 305, 306, 307, 308 } | |
}, | |
}; | |
// Each following line is only 0 - 1 instructions on SSSE3 (PALIGNR) | |
auto n = Neighbor(input); | |
auto output = YourFunc( | |
n.top_left(), n.top_center(), n.top_right(), | |
n.mid_left(), n.mid_center(), n.mid_right(), | |
n.bot_left(), n.bot_center(), n.bot_right()); | |
// Despite verbosity, inlining of the function allows C++ compiler to do | |
// "scalar replacement of aggregates" (SRoA), even if the input values | |
// come from an array (such as std::array above), therefore the backend's | |
// instruction scheduler can fully permute the instructions, without being | |
// constrained by the sequence of code laid out in the C++. | |
std::cout << "Output: " << to_string(output) << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment