Skip to content

Instantly share code, notes, and snippets.

@michelerenzullo
Last active September 28, 2022 00:34
Show Gist options
  • Save michelerenzullo/e6c012b41fc0ab278a523815f14c842e to your computer and use it in GitHub Desktop.
Save michelerenzullo/e6c012b41fc0ab278a523815f14c842e to your computer and use it in GitHub Desktop.
Fast box blur with reflected padding, without wasting memory on a padded copy. Repo: https://github.com/michelerenzullo/FastBoxBlur
/// Horizontal box blur over an interleaved image, using mirrored borders that are
/// emulated on the fly instead of being materialised in a padded buffer.
///
/// Every output pixel is the mean of the `2 * r + 1` pixels centred on it in the same
/// row, with `r = (ksize - 1) / 2`. Samples that fall outside the row are read from the
/// row mirrored around its first / last pixel (the edge pixel itself is not repeated):
///
///     row            [ a b c d e f g h i j k ]          (w = 11, r = 3)
///     emulated   d c b a b c d e f g h i j k j i h
///
/// @tparam T     Pixel component type. Integral types are accumulated in `uint32_t`
///               and rounded to nearest; any other type is accumulated in `float`.
///               NOTE(review): the unsigned accumulator does not represent negative
///               sums, so signed integral input with negative samples is unsupported.
/// @tparam C     Number of interleaved channels per pixel.
/// @param in     Source: `h` contiguous rows of `w` pixels with `C` components each.
/// @param out    Destination with the same layout. Must not overlap `in`, because
///               each step re-reads source pixels located before the one just written.
/// @param w      Row width in pixels.
/// @param h      Number of rows.
/// @param ksize  Kernel width. Even widths behave like the next lower odd width and
///               widths below 1 behave like 1 (plain copy). The radius is clamped to
///               `w - 1`, so one reflection per side is always enough.
///
/// Returns without touching `out` when `w` or `h` is not positive.
template<typename T, int C>
void horizontal_blur_kernel_reflect(const T* in, T* out, const int w, const int h, const int ksize)
{
    if (w <= 0 || h <= 0)
        return;

    // change the local variable types depending on the template type for faster calculations
    using calc_type = std::conditional_t<std::is_integral_v<T>, uint32_t, float>;
    // integral outputs are rounded to nearest, floating-point outputs are left as computed
    constexpr float rounding = std::is_integral_v<T> ? 0.5f : 0.f;

    int r = (ksize - 1) / 2;
    if (r < 0)
        r = 0;
    r = std::min(r, w - 1);
    const float iarr = 1.f / (r + r + 1);

    const int last = w - 1;
    // For output j the window gains sample j + r and loses sample j - 1 - r.
    //   j <= r          -> the lost sample is left of the row  (mirrored to r + 1 - j)
    //   j >  w - r - 1  -> the gained sample is right of the row (mirrored to 2 * last - (j + r))
    // `lo` / `hi` are those two thresholds in ascending order; which one comes first
    // depends on whether the kernel is narrower than the row.
    const int right_limit = w - r - 1;
    const int lo = std::min(r, right_limit);
    const int hi = std::max(r, right_limit);

#pragma omp parallel for
    for (int i = 0; i < h; i++)
    {
        // Row-relative pointers: keeps every mirror computation independent of the row
        // number and does the row offset in 64 bits.
        const long long row_offset = static_cast<long long>(i) * w * C;
        const T* src = in + row_offset;
        T* dst = out + row_offset;

        calc_type acc[C] = {};

        // Moves the window one pixel to the right and stores the mean for pixel j.
        const auto slide = [&](const int j, const int incoming, const int outgoing)
        {
            for (int ch = 0; ch < C; ++ch)
            {
                acc[ch] += src[incoming * C + ch] - src[outgoing * C + ch];
                dst[j * C + ch] = static_cast<T>(acc[ch] * iarr + rounding);
            }
        };

        // First pixel: the mirrored window is (d + c + b + a + b + c + d),
        // i.e. 2 * (a + b + c + d) - a.
        for (int ch = 0; ch < C; ++ch)
        {
            for (int j = 0; j <= r; j++)
                acc[ch] += 2 * src[j * C + ch];
            acc[ch] -= src[ch]; // remove extra pivot value
            dst[ch] = static_cast<T>(acc[ch] * iarr + rounding);
        }

        int j = 1;

        // Left edge only: outgoing sample is mirrored, incoming one is inside the row.
        for (; j <= lo; ++j)
            slide(j, j + r, r + 1 - j);

        if (r < right_limit)
        {
            // Kernel narrower than the row: plain sliding window in the interior.
            for (; j <= hi; ++j)
                slide(j, j + r, j - r - 1);
        }
        else
        {
            // Kernel at least as wide as the row: both ends of the window are mirrored.
            for (; j <= hi; ++j)
                slide(j, 2 * last - (j + r), r + 1 - j);
        }

        // Right edge only: incoming sample is mirrored, outgoing one is inside the row.
        for (; j < w; ++j)
            slide(j, 2 * last - (j + r), j - r - 1);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment