michelerenzullo/FBB_reflection.cpp

## FBB_reflection.cpp
// Horizontal box blur with mirrored borders (the edge sample is the pivot and is not repeated).
//
// The image is read as h rows of w pixels with C interleaved channels; every output sample is
// the mean of the 2 * r + 1 input samples of the same channel centred on it, where samples that
// fall outside the row are taken from the mirrored position inside the row.
//
// T      sample type; integral types accumulate in uint32_t and are rounded to nearest,
//        any other type accumulates in float
// C      number of interleaved channels
// in     source buffer, w * h * C samples
// out    destination buffer, w * h * C samples; must not alias `in`, because input samples are
//        read again after the outputs at the same positions have been written
// w, h   image width and height in pixels; nothing is done when either is <= 0
// ksize  kernel size; the radius is (ksize - 1) / 2, clamped to [0, w - 1] so that a single
//        reflection is always enough to land inside the row
template<typename T, int C>
void horizontal_blur_kernel_reflect(const T* in, T* out, const int w, const int h, const int ksize)
{
	// change the local variable types depending on the template type for faster calculations
	using calc_type = std::conditional_t<std::is_integral_v<T>, uint32_t, float>;

	// empty image: nothing to read or write
	if (w <= 0 || h <= 0)
		return;

	int r = static_cast<int>(0.5f * (ksize - 1));
	r = std::max(0, std::min(r, w - 1));

	const float iarr = 1.f / (r + r + 1);

	// integral outputs are rounded to nearest instead of truncated
	constexpr float rounding = std::is_integral_v<T> ? 0.5f : 0.f;

	// The row is split in three phases that never overlap, whatever the ratio between r and w:
	//   [1, left_end)           the outgoing sample lies in the mirrored left pad
	//   [left_end, right_begin) both samples lie inside the row (empty when w < 2 * r + 2)
	//   [right_begin, w)        the incoming sample lies in the mirrored right pad
	const int last = w - 1;
	const int left_end = r + 1; // <= w because r <= w - 1
	const int right_begin = std::max(left_end, w - r);

#pragma omp parallel for
	for (int i = 0; i < h; i++)
	{
		// row-relative pointers: the mirror arithmetic below is independent of the row index and
		// the row offset is computed in 64 bit
		const T* row_in = in + static_cast<long long>(i) * w * C;
		T* row_out = out + static_cast<long long>(i) * w * C;
		calc_type acc[C] = {};

		// for ksize = 7, and r = 3, and array length = 11
		// array is [ a b c d e f g h i j k ]
		// emulated array is [d c b _ a b c d e f g h i j k _ j i h]

		// emulating the left pad: the initial accumulation is (d + c + b + a + b + c + d) --> 2 * (a + b + c + d) - a
		for (int ch = 0; ch < C; ++ch)
		{
			for (int x = 0; x <= r; ++x)
				acc[ch] += 2 * row_in[x * C + ch];
			acc[ch] -= row_in[ch]; // remove extra pivot value

			// calculated first value
			row_out[ch] = acc[ch] * iarr + rounding;
		}

		// left phase: the sample leaving the window (x - r - 1 < 0) is read at its mirrored
		// position r + 1 - x; with a wide kernel the incoming sample may already be past the
		// end of the row, in which case it is mirrored around the last pixel
		for (int x = 1; x < left_end; ++x)
		{
			const int incoming = x + r <= last ? x + r : 2 * last - (x + r);
			const int outgoing = r + 1 - x;
			for (int ch = 0; ch < C; ++ch)
			{
				acc[ch] += row_in[incoming * C + ch] - row_in[outgoing * C + ch];
				row_out[x * C + ch] = acc[ch] * iarr + rounding;
			}
		}

		// middle phase: the whole window is inside the row, no mirroring needed
		for (int x = left_end; x < right_begin; ++x)
		{
			const int incoming = x + r;
			const int outgoing = x - r - 1;
			for (int ch = 0; ch < C; ++ch)
			{
				acc[ch] += row_in[incoming * C + ch] - row_in[outgoing * C + ch];
				row_out[x * C + ch] = acc[ch] * iarr + rounding;
			}
		}

		// right phase: x + r >= w, so the incoming sample is always read from the mirrored right pad
		for (int x = right_begin; x < w; ++x)
		{
			const int incoming = 2 * last - (x + r);
			const int outgoing = x - r - 1;
			for (int ch = 0; ch < C; ++ch)
			{
				acc[ch] += row_in[incoming * C + ch] - row_in[outgoing * C + ch];
				row_out[x * C + ch] = acc[ch] * iarr + rounding;
			}
		}
	}
}
	// NOTE(review): this definition repeats the horizontal_blur_kernel_reflect template that
	// appears earlier in this file; only one of the two can be kept in a translation unit.
	//
	// Horizontal box blur with mirrored borders (the edge sample is the pivot and is not repeated).
	//
	// The image is read as h rows of w pixels with C interleaved channels; every output sample is
	// the mean of the 2 * r + 1 input samples of the same channel centred on it, where samples that
	// fall outside the row are taken from the mirrored position inside the row.
	//
	// T      sample type; integral types accumulate in uint32_t and are rounded to nearest,
	//        any other type accumulates in float
	// C      number of interleaved channels
	// in     source buffer, w * h * C samples
	// out    destination buffer, w * h * C samples; must not alias `in`, because input samples are
	//        read again after the outputs at the same positions have been written
	// w, h   image width and height in pixels; nothing is done when either is <= 0
	// ksize  kernel size; the radius is (ksize - 1) / 2, clamped to [0, w - 1] so that a single
	//        reflection is always enough to land inside the row
	template<typename T, int C>
	void horizontal_blur_kernel_reflect(const T* in, T* out, const int w, const int h, const int ksize)
	{
		// change the local variable types depending on the template type for faster calculations
		using calc_type = std::conditional_t<std::is_integral_v<T>, uint32_t, float>;

		// empty image: nothing to read or write
		if (w <= 0 || h <= 0)
			return;

		int r = static_cast<int>(0.5f * (ksize - 1));
		r = std::max(0, std::min(r, w - 1));

		const float iarr = 1.f / (r + r + 1);

		// integral outputs are rounded to nearest instead of truncated
		constexpr float rounding = std::is_integral_v<T> ? 0.5f : 0.f;

		// The row is split in three phases that never overlap, whatever the ratio between r and w:
		//   [1, left_end)           the outgoing sample lies in the mirrored left pad
		//   [left_end, right_begin) both samples lie inside the row (empty when w < 2 * r + 2)
		//   [right_begin, w)        the incoming sample lies in the mirrored right pad
		const int last = w - 1;
		const int left_end = r + 1; // <= w because r <= w - 1
		const int right_begin = std::max(left_end, w - r);

		#pragma omp parallel for
		for (int i = 0; i < h; i++)
		{
			// row-relative pointers: the mirror arithmetic below is independent of the row index and
			// the row offset is computed in 64 bit
			const T* row_in = in + static_cast<long long>(i) * w * C;
			T* row_out = out + static_cast<long long>(i) * w * C;
			calc_type acc[C] = {};

			// for ksize = 7, and r = 3, and array length = 11
			// array is [ a b c d e f g h i j k ]
			// emulated array is [d c b _ a b c d e f g h i j k _ j i h]

			// emulating the left pad: the initial accumulation is (d + c + b + a + b + c + d) --> 2 * (a + b + c + d) - a
			for (int ch = 0; ch < C; ++ch)
			{
				for (int x = 0; x <= r; ++x)
					acc[ch] += 2 * row_in[x * C + ch];
				acc[ch] -= row_in[ch]; // remove extra pivot value

				// calculated first value
				row_out[ch] = acc[ch] * iarr + rounding;
			}

			// left phase: the sample leaving the window (x - r - 1 < 0) is read at its mirrored
			// position r + 1 - x; with a wide kernel the incoming sample may already be past the
			// end of the row, in which case it is mirrored around the last pixel
			for (int x = 1; x < left_end; ++x)
			{
				const int incoming = x + r <= last ? x + r : 2 * last - (x + r);
				const int outgoing = r + 1 - x;
				for (int ch = 0; ch < C; ++ch)
				{
					acc[ch] += row_in[incoming * C + ch] - row_in[outgoing * C + ch];
					row_out[x * C + ch] = acc[ch] * iarr + rounding;
				}
			}

			// middle phase: the whole window is inside the row, no mirroring needed
			for (int x = left_end; x < right_begin; ++x)
			{
				const int incoming = x + r;
				const int outgoing = x - r - 1;
				for (int ch = 0; ch < C; ++ch)
				{
					acc[ch] += row_in[incoming * C + ch] - row_in[outgoing * C + ch];
					row_out[x * C + ch] = acc[ch] * iarr + rounding;
				}
			}

			// right phase: x + r >= w, so the incoming sample is always read from the mirrored right pad
			for (int x = right_begin; x < w; ++x)
			{
				const int incoming = 2 * last - (x + r);
				const int outgoing = x - r - 1;
				for (int ch = 0; ch < C; ++ch)
				{
					acc[ch] += row_in[incoming * C + ch] - row_in[outgoing * C + ch];
					row_out[x * C + ch] = acc[ch] * iarr + rounding;
				}
			}
		}
	}