Skip to content

Instantly share code, notes, and snippets.

@kinchungwong
Last active April 13, 2018 09:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kinchungwong/94a090ace1bdd67c02cf932c4a4b8a2a to your computer and use it in GitHub Desktop.
Save kinchungwong/94a090ace1bdd67c02cf932c4a4b8a2a to your computer and use it in GitHub Desktop.
OpenCV C++ Convolution Function (Kernel) concept
/// One-dimensional, fixed-size array of `nv` elements of type `vec_type`.
/// This is only an alias: `v_arr<T, N>` and `std::array<T, N>` are the same type.
template <class vec_type, int nv> using v_arr = std::array<vec_type, nv>;
/// Two-dimensional, fixed-size array: `nvr` rows, each row being a
/// `std::array` of `nvc` elements of type `vec_type`.
/// Indexing is row first, then column: `a[row][col]`.
template <class vec_type, int nvr, int nvc>
using v_arr2d = std::array<
    std::array<vec_type, nvc>,  // element type: one row of nvc entries
    nvr>;                       // extent: number of rows
template <int s, typename vec_type, int nv>
inline vec_type v_extract(const std::array<vec_type, nv>& input)
{
const v_extract_array_helper<s, vec_type, nv> helper;
return helper(input);
}
template <int s, typename vec_type, int nv>
inline void v_insert(std::array<vec_type, nv>& output, const vec_type& input)
{
const v_insert_array_helper<s, vec_type, nv> helper;
helper(output, input);
}
/// Two-dimensional overload of v_extract: picks row `row_start` of `input`
/// and forwards that row to the one-dimensional v_extract with `col_start`
/// as its selector.
///
/// `row_start` is validated at compile time and must lie in `[0, nvr)`.
/// `col_start` is not checked here; it is passed through unchanged.
///
/// @tparam row_start  index of the row to read from
/// @tparam col_start  selector forwarded to the one-dimensional v_extract
/// @tparam _Tpvec     element (vector) type of the array
/// @tparam nvr        number of rows
/// @tparam nvc        number of vectors per row
/// @param  input      array to read from
/// @return result of the one-dimensional v_extract on the selected row
template <int row_start, int col_start, typename _Tpvec, int nvr, int nvc>
inline _Tpvec v_extract(const v_arr2d<_Tpvec, nvr, nvc>& input)
{
    CV_StaticAssert((row_start >= 0) && (row_start < nvr), "Invalid row_start for v_extract(v_arr2d)");

    // All template arguments are explicit, so no deduction is needed on the row.
    const auto& selected_row = input[row_start];
    return v_extract<col_start, _Tpvec, nvc>(selected_row);
}
/// Compile-time geometry of a `v_arr2d<vec_type, nvr, nvc>`.
///
/// All values are enumerators, so they can be used directly as template
/// arguments:
///  - `lanes_per_vec`: `vec_type::nlanes`
///  - `lanes_per_row`: lanes in one row, i.e. `lanes_per_vec * nvc`
///  - `center_row`:    `(nvr - 1) / 2` using integer division
///  - `center_col`:    lane offset at which one vector's worth of lanes sits
///                     in the middle of a row, i.e.
///                     `(lanes_per_row - lanes_per_vec) / 2`
template <typename vec_type, int nvr, int nvc>
struct v_arr_center_info
{
    enum
    {
        lanes_per_vec = vec_type::nlanes,
        lanes_per_row = (nvc * lanes_per_vec),
        center_row = ((nvr - 1) / 2),
        // Same value as (lanes_per_row - lanes_per_vec) / 2.
        center_col = ((lanes_per_vec * (nvc - 1)) / 2),
    };
};
template <int row_offset, int col_offset, typename vec_type, int nvr, int nvc>
inline _Tpvec v_extract_around_center(const v_arr2d<vec_type, nvr, nvc>& input)
{
using center_info = v_arr_center_info<vec_type, nvr, nvc>;
return v_extract<
(row_offset + center_info::center_row),
(col_offset + center_info::center_col),
vec_type, nvr, nvc>(input);
}
//
// The following temporary helper classes are meant to be used only within a single
// local scope (i.e. created and destroyed within one function, not passed around,
// not copied, not placed on the heap, and their address never taken). When they are
// used that way, an optimizing C++ compiler should be able to eliminate all of the
// scaffolding, keeping only the state-changing part of the code. This means the
// only instructions generated will be the assignment (insert) and the extraction
// (disguised as an implicit overloaded conversion operator).
//
template <typename vec_type, int nvr, int nvc>
class NeighborClass
{
using arraytype = v_arr2d<vec_type, nvr, nvc>;
public:
NeighborClass(arraytype& array_ref)
: m_array_ref(array_ref)
{
}
private:
template <int row_offset, int col_offset>
class Ref
{
public:
Ref(NeighborClass& host)
: m_host(host)
{}
public:
operator vec_type() const
{
return v_extract_around_center<row_offset, col_offset>(m_host.m_array_ref);
}
void operator = (const vec_type& v)
{
v_insert_around_center<row_offset, col_offset>(m_host.m_array_ref, v);
}
private:
NeighborClass& m_host;
};
public:
Ref<-1, -1> top_left()
{
return Ref<-1, -1>(*this);
}
Ref<-1, 0> top_center()
{
return Ref<-1, 0>(*this);
}
// Etc for the other neighbor pixels.
public:
arraytype& m_array_ref;
};
/// Factory for NeighborClass: wraps `ref` in an accessor bound to it.
///
/// The returned object stores a reference to `ref` and must not outlive it.
/// The extents are `int` while std::array's extent is std::size_t, so
/// conforming compilers cannot deduce them from `ref`; spell the template
/// arguments out at the call site if deduction is rejected.
///
/// @param ref  array the accessor will read from and write to
/// @return a NeighborClass referring to `ref`
template <typename vec_type, int nvr, int nvc>
NeighborClass<vec_type, nvr, nvc> Neighbor(cv::hal_baseline::cv400array::v_arr2d<vec_type, nvr, nvc>& ref)
{
    using accessor_type = NeighborClass<vec_type, nvr, nvc>;
    return accessor_type(ref);
}
/// Demonstration driver: builds a 3x2 array of `cv::v_float32x4`, hands the
/// nine neighbor accessors around its center to `YourFunc`, and prints the
/// result to std::cout.
///
/// NOTE(review): `YourFunc` is a type parameter, so `YourFunc(...)` below
/// direct-initializes an object of that type from the nine accessors; it
/// does not invoke a callable. Confirm whether a functor call such as
/// `YourFunc{}(...)` was intended.
///
/// @tparam YourFunc  type constructible from nine neighbor accessors, for
///                   which an overload of `to_string` is available
template <class YourFunc>
void test()
{
    v_arr2d<cv::v_float32x4, 3, 2> input =
    {
        {
            cv::v_float32x4{ 101, 102, 103, 104 }, cv::v_float32x4{ 105, 106, 107, 108 },
            cv::v_float32x4{ 201, 202, 203, 204 }, cv::v_float32x4{ 205, 206, 207, 208 },
            cv::v_float32x4{ 301, 302, 303, 304 }, cv::v_float32x4{ 305, 306, 307, 308 }
        },
    };

    // The template arguments are spelled out because the extents are `int`
    // while std::array's extent is std::size_t; conforming compilers refuse
    // to deduce them from `input`.
    auto n = Neighbor<cv::v_float32x4, 3, 2>(input);

    // Each following line is only 0 - 1 instructions on SSSE3 (PALIGNR)
    // This relies on NeighborClass providing all nine accessors used here.
    auto output = YourFunc(
        n.top_left(), n.top_center(), n.top_right(),
        n.mid_left(), n.mid_center(), n.mid_right(),
        n.bot_left(), n.bot_center(), n.bot_right());

    // Despite verbosity, inlining of the function allows C++ compiler to do
    // "scalar replacement of aggregates" (SRoA), even if the input values
    // come from an array (such as std::array above), therefore the backend's
    // instruction scheduler can fully permute the instructions, without being
    // constrained by the sequence of code laid out in the C++.
    std::cout << "Output: " << to_string(output) << std::endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment