kinchungwong/concept_20180411.cpp

## concept_20180411.cpp
/// One-dimensional pack: a std::array holding `nv` elements of `vec_type`.
template <typename vec_type, int nv>
using v_arr = std::array<vec_type, nv>;

/// Two-dimensional pack: `nvr` rows, each row being a v_arr of `nvc` elements.
/// The outer index selects the row, the inner index selects the element in it.
template <typename vec_type, int nvr, int nvc>
using v_arr2d = v_arr<v_arr<vec_type, nvc>, nvr>;

template <int s, typename vec_type, int nv>
inline vec_type v_extract(const std::array<vec_type, nv>& input)
{
    const v_extract_array_helper<s, vec_type, nv> helper;
    return helper(input);
}

template <int s, typename vec_type, int nv>
inline void v_insert(std::array<vec_type, nv>& output, const vec_type& input)
{
    const v_insert_array_helper<s, vec_type, nv> helper;
    helper(output, input);
}

/// Extracts a vector from a single row of a 2-D pack.
/// `row_start` selects the row and is range-checked against `nvr` at compile time;
/// `col_start` is forwarded unchanged to the single-row v_extract overload.
template <int row_start, int col_start, typename vec_type, int nvr, int nvc>
inline vec_type v_extract(const v_arr2d<vec_type, nvr, nvc>& input)
{
    CV_StaticAssert((row_start >= 0) && (row_start < nvr), "Invalid row_start for v_extract(v_arr2d)");
    const auto& selected_row = input[row_start];
    return v_extract<col_start, vec_type, nvc>(selected_row);
}

/// Compile-time coordinates of the center of an `nvr` x `nvc` pack of `vec_type`.
/// Requires `vec_type::nlanes` to be usable as an integral constant.
template <typename vec_type, int nvr, int nvc>
struct v_arr_center_info
{
    enum
    {
        // Horizontal extent, counted in lanes.
        lanes_per_vec = vec_type::nlanes,
        lanes_per_row = (nvc * lanes_per_vec),
        // Middle row (rounds down when nvr is even).
        center_row = ((nvr - 1) / 2),
        // Lane index at which a vector centered in the row starts.
        center_col = ((lanes_per_row - lanes_per_vec) / 2),
    };
};

template <int row_offset, int col_offset, typename vec_type, int nvr, int nvc>
inline _Tpvec v_extract_around_center(const v_arr2d<vec_type, nvr, nvc>& input)
{
    using center_info = v_arr_center_info<vec_type, nvr, nvc>;
    return v_extract<
        (row_offset + center_info::center_row),
        (col_offset + center_info::center_col),
        vec_type, nvr, nvc>(input);
}

//
// When the following temporary helper classes are used properly (created and
// destroyed within the same local scope, i.e. within a function, not passed
// around, not copied, not on the heap, address never taken), an optimizing C++
// compiler should be able to eliminate all of that scaffolding, keeping only the
// state-changing part of the code. This means the only instructions that will be
// generated are the assignment (insert) and the extraction (disguised as an
// implicit overloaded conversion operator).
//

template <typename vec_type, int nvr, int nvc>
class NeighborClass
{
    using arraytype = v_arr2d<vec_type, nvr, nvc>;

public:
    NeighborClass(arraytype& array_ref)
        : m_array_ref(array_ref)
    {
    }

private:
    template <int row_offset, int col_offset>
    class Ref
    {
    public:
        Ref(NeighborClass& host)
            : m_host(host)
        {}
    public:
        operator vec_type() const
        {
            return v_extract_around_center<row_offset, col_offset>(m_host.m_array_ref);
        }
        void operator = (const vec_type& v)
        {
            v_insert_around_center<row_offset, col_offset>(m_host.m_array_ref, v);
        }
    private:
        NeighborClass& m_host;
    };

public:

    Ref<-1, -1> top_left()
    {
        return Ref<-1, -1>(*this);
    }

    Ref<-1, 0> top_center()
    {
        return Ref<-1, 0>(*this);
    }

    // Etc for the other neighbor pixels.

public:
    arraytype& m_array_ref;
};

/// Factory that wraps `ref` in a NeighborClass with matching template arguments,
/// intended to let callers write `Neighbor(array)`.
/// NOTE(review): the parameter type is namespace-qualified
/// (cv::hal_baseline::cv400array) while the rest of this section uses the
/// unqualified v_arr2d - confirm both name the same alias.
template <typename vec_type, int nvr, int nvc>
NeighborClass<vec_type, nvr, nvc> Neighbor(cv::hal_baseline::cv400array::v_arr2d<vec_type, nvr, nvc>& ref)
{
    using view_type = NeighborClass<vec_type, nvr, nvc>;
    return view_type(ref);
}

template <class YourFunc>
void test()
{
    v_arr2d<cv::v_float32x4, 3, 2> input =
    {
        {
            cv::v_float32x4{ 101, 102, 103, 104 }, cv::v_float32x4{ 105, 106, 107, 108 },
            cv::v_float32x4{ 201, 202, 203, 204 }, cv::v_float32x4{ 205, 206, 207, 208 },
            cv::v_float32x4{ 301, 302, 303, 304 }, cv::v_float32x4{ 305, 306, 307, 308 }
        },
    };
    // Each following line is only 0 - 1 instructions on SSSE3 (PALIGNR)
    auto n = Neighbor(input);
    auto output = YourFunc(
        n.top_left(), n.top_center(), n.top_right(),
        n.mid_left(), n.mid_center(), n.mid_right(),
        n.bot_left(), n.bot_center(), n.bot_right());

    // Despite verbosity, inlining of the function allows C++ compiler to do
    // "scalar replacement of aggregates" (SRoA), even if the input values
    // come from an array (such as std::array above), therefore the backend's
    // instruction scheduler can fully permute the instructions, without being
    // constrained by the sequence of code laid out in the C++.

    std::cout << "Output: " << to_string(output) << std::endl;
}
	/// One-dimensional pack: a std::array holding `nv` elements of `vec_type`.
	template <typename vec_type, int nv>
	using v_arr = std::array<vec_type, nv>;

	/// Two-dimensional pack: `nvr` rows, each row being a v_arr of `nvc` elements.
	/// The outer index selects the row, the inner index selects the element in it.
	template <typename vec_type, int nvr, int nvc>
	using v_arr2d = v_arr<v_arr<vec_type, nvc>, nvr>;

	template <int s, typename vec_type, int nv>
	inline vec_type v_extract(const std::array<vec_type, nv>& input)
	{
	const v_extract_array_helper<s, vec_type, nv> helper;
	return helper(input);
	}

	template <int s, typename vec_type, int nv>
	inline void v_insert(std::array<vec_type, nv>& output, const vec_type& input)
	{
	const v_insert_array_helper<s, vec_type, nv> helper;
	helper(output, input);
	}

	/// Extracts a vector from a single row of a 2-D pack.
	/// `row_start` selects the row and is range-checked against `nvr` at compile time;
	/// `col_start` is forwarded unchanged to the single-row v_extract overload.
	template <int row_start, int col_start, typename vec_type, int nvr, int nvc>
	inline vec_type v_extract(const v_arr2d<vec_type, nvr, nvc>& input)
	{
	    CV_StaticAssert((row_start >= 0) && (row_start < nvr), "Invalid row_start for v_extract(v_arr2d)");
	    const auto& selected_row = input[row_start];
	    return v_extract<col_start, vec_type, nvc>(selected_row);
	}

	/// Compile-time coordinates of the center of an `nvr` x `nvc` pack of `vec_type`.
	/// Requires `vec_type::nlanes` to be usable as an integral constant.
	template <typename vec_type, int nvr, int nvc>
	struct v_arr_center_info
	{
	    enum
	    {
	        // Horizontal extent, counted in lanes.
	        lanes_per_vec = vec_type::nlanes,
	        lanes_per_row = (nvc * lanes_per_vec),
	        // Middle row (rounds down when nvr is even).
	        center_row = ((nvr - 1) / 2),
	        // Lane index at which a vector centered in the row starts.
	        center_col = ((lanes_per_row - lanes_per_vec) / 2),
	    };
	};

	template <int row_offset, int col_offset, typename vec_type, int nvr, int nvc>
	inline _Tpvec v_extract_around_center(const v_arr2d<vec_type, nvr, nvc>& input)
	{
	using center_info = v_arr_center_info<vec_type, nvr, nvc>;
	return v_extract<
	(row_offset + center_info::center_row),
	(col_offset + center_info::center_col),
	vec_type, nvr, nvc>(input);
	}

	//
	// When the following temporary helper classes are used properly (created and
	// destroyed within the same local scope, i.e. within a function, not passed
	// around, not copied, not on the heap, address never taken), an optimizing C++
	// compiler should be able to eliminate all of that scaffolding, keeping only the
	// state-changing part of the code. This means the only instructions that will be
	// generated are the assignment (insert) and the extraction (disguised as an
	// implicit overloaded conversion operator).
	//

	template <typename vec_type, int nvr, int nvc>
	class NeighborClass
	{
	using arraytype = v_arr2d<vec_type, nvr, nvc>;

	public:
	NeighborClass(arraytype& array_ref)
	: m_array_ref(array_ref)
	{
	}

	private:
	template <int row_offset, int col_offset>
	class Ref
	{
	public:
	Ref(NeighborClass& host)
	: m_host(host)
	{}
	public:
	operator vec_type() const
	{
	return v_extract_around_center<row_offset, col_offset>(m_host.m_array_ref);
	}
	void operator = (const vec_type& v)
	{
	v_insert_around_center<row_offset, col_offset>(m_host.m_array_ref, v);
	}
	private:
	NeighborClass& m_host;
	};

	public:

	Ref<-1, -1> top_left()
	{
	return Ref<-1, -1>(*this);
	}

	Ref<-1, 0> top_center()
	{
	return Ref<-1, 0>(*this);
	}

	// Etc for the other neighbor pixels.

	public:
	arraytype& m_array_ref;
	};

	/// Factory that wraps `ref` in a NeighborClass with matching template arguments,
	/// intended to let callers write `Neighbor(array)`.
	/// NOTE(review): the parameter type is namespace-qualified
	/// (cv::hal_baseline::cv400array) while the rest of this section uses the
	/// unqualified v_arr2d - confirm both name the same alias.
	template <typename vec_type, int nvr, int nvc>
	NeighborClass<vec_type, nvr, nvc> Neighbor(cv::hal_baseline::cv400array::v_arr2d<vec_type, nvr, nvc>& ref)
	{
	    using view_type = NeighborClass<vec_type, nvr, nvc>;
	    return view_type(ref);
	}

	template <class YourFunc>
	void test()
	{
	v_arr2d<cv::v_float32x4, 3, 2> input =
	{
	{
	cv::v_float32x4{ 101, 102, 103, 104 }, cv::v_float32x4{ 105, 106, 107, 108 },
	cv::v_float32x4{ 201, 202, 203, 204 }, cv::v_float32x4{ 205, 206, 207, 208 },
	cv::v_float32x4{ 301, 302, 303, 304 }, cv::v_float32x4{ 305, 306, 307, 308 }
	},
	};
	// Each following line is only 0 - 1 instructions on SSSE3 (PALIGNR)
	auto n = Neighbor(input);
	auto output = YourFunc(
	n.top_left(), n.top_center(), n.top_right(),
	n.mid_left(), n.mid_center(), n.mid_right(),
	n.bot_left(), n.bot_center(), n.bot_right());

	// Despite verbosity, inlining of the function allows C++ compiler to do
	// "scalar replacement of aggregates" (SRoA), even if the input values
	// come from an array (such as std::array above), therefore the backend's
	// instruction scheduler can fully permute the instructions, without being
	// constrained by the sequence of code laid out in the C++.

	std::cout << "Output: " << to_string(output) << std::endl;
	}