@inspirit
Created October 2, 2013 08:26
Separable Convolution and Gaussian Blur
#pragma once
#ifndef CONVOLUTION_H_INCLUDED
#define CONVOLUTION_H_INCLUDED
/**
* Separable convolution routines with SSE and NEON intrinsics.
*
* This implementation is based on the OpenCV filter classes,
* with template optimizations and SIMD intrinsics.
*
* @author Eugene Zatepyakin
*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <limits>
#ifdef _MSC_VER
# include <malloc.h> // alloca
#else
# include <alloca.h>
#endif
// SIMD
#define SIMD_OPT 1
#if SIMD_OPT
# ifdef __arm__
#  include <arm_neon.h>
# else
#  include <emmintrin.h>
# endif
#endif
#ifdef __GNUC__
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
#elif defined _MSC_VER
# define CV_DECL_ALIGNED(x) __declspec(align(x))
#else
#error unknown platform
#endif
// SEPARABLE CONVOLUTION ONLY
namespace convolve {
//! type of the kernel
enum { KERNEL_GENERAL = 0, // generic kernel: no symmetry or other special properties
KERNEL_SYMMETRICAL = 1, // kernel[i] == kernel[ksize-i-1], and the anchor is at the center
KERNEL_ASYMMETRICAL = 2, // kernel[i] == -kernel[ksize-i-1], and the anchor is at the center
KERNEL_SMOOTH = 4, // all kernel elements are non-negative and sum to 1
KERNEL_INTEGER = 8 // all kernel coefficients are integers
};
template<typename KT>
static int getKernelType(const KT* kernel, const int sz)
{
int i;
float sum = 0;
int type = KERNEL_SMOOTH + KERNEL_INTEGER;
type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);
for( i = 0; i < sz; i++ )
{
KT a = kernel[i], b = kernel[sz - i - 1];
if( a != b )
type &= ~KERNEL_SYMMETRICAL;
if( a != -b )
type &= ~KERNEL_ASYMMETRICAL;
if( a < 0 )
type &= ~KERNEL_SMOOTH;
if( a != static_cast<int>(a) )
type &= ~KERNEL_INTEGER;
sum += a;
}
if( std::fabs(sum - 1) > std::numeric_limits<float>::epsilon()*(std::fabs(sum) + 1) )
type &= ~KERNEL_SMOOTH;
return type;
}
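/*
Example (following the checks above): the 3-tap smoothing kernel {0.25f, 0.5f, 0.25f}
is classified as KERNEL_SYMMETRICAL | KERNEL_SMOOTH, while the central-difference
kernel {-1.f, 0.f, 1.f} yields KERNEL_ASYMMETRICAL | KERNEL_INTEGER.
*/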
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDRow
{
SIMDRow() {}
int operator()(ST*, DT*, const KT*, int) const { return 0; }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDRowSymm : public SIMDRow<ST,DT,KT,cn,_ksize>
{
int operator()(ST*, DT*, const KT*, int) const { return 0; }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDRowASymm : public SIMDRow<ST,DT,KT,cn,_ksize>
{
int operator()(ST*, DT*, const KT*, int) const { return 0; }
};
//
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDCol
{
SIMDCol() {}
int operator()(ST**, DT*, const KT*, int) const { return 0; }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDColSymm : public SIMDCol<ST,DT,KT,cn,_ksize>
{
int operator()(ST**, DT*, const KT*, int) const { return 0; }
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct SIMDColASymm : public SIMDCol<ST,DT,KT,cn,_ksize>
{
int operator()(ST**, DT*, const KT*, int) const { return 0; }
};
// SPECIAL CASE (SIMD) FOR FLOAT DATA TYPE
// TODO: dedicated NEON version (for now the SSE intrinsics used below are mapped to NEON via macros)
#if SIMD_OPT
#ifdef __arm__
// _mm_load_ss would normally load a single scalar; on NEON we duplicate the value
// across all four lanes instead, so the broadcast _mm_shuffle_ps(f, f, 0) that
// follows it becomes a no-op and is defined as the identity.
#define _mm_load_ss(ptr) ( vld1q_dup_f32( (ptr) ) )
#define _mm_shuffle_ps(a, b, m) (a)
#define _mm_setzero_ps() ( vmovq_n_f32( 0.f ) )
#define _mm_set1_ps(a) ( vmovq_n_f32( (a) ) )
#define _mm_loadu_ps(ptr) ( vld1q_f32( (ptr) ) )
#define _mm_load_ps(ptr) ( vld1q_f32( (ptr) ) )
#define _mm_storeu_ps(ptr, a) ( vst1q_f32( (ptr), (a) ) )
#define _mm_store_ps(ptr, a) ( vst1q_f32( (ptr), (a) ) )
#define _mm_add_ps(a, b) ( vaddq_f32( (a), (b) ) )
#define _mm_sub_ps(a, b) ( vsubq_f32( (a), (b) ) )
#define _mm_mul_ps(a, b) ( vmulq_f32( (a), (b) ) )
#endif
template<int cn, int _ksize>
struct SIMDRow<float,float,float,cn,_ksize>
{
#ifdef __arm__
typedef float32x4_t __m128;
#endif
int operator()(float* buffer, float* dst, const float* _kernel, int rowSize) const
{
// rowSize already includes the channel count (the caller multiplies by cn)
int i = 0, k;
const float* kx = _kernel;
for( ; i <= rowSize - 8; i += 8 )
{
const float* src = buffer + i;
__m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
for( k = 0; k < _ksize; k++, src += cn )
{
f = _mm_load_ss(kx+k);
f = _mm_shuffle_ps(f, f, 0);
x0 = _mm_loadu_ps(src);
x1 = _mm_loadu_ps(src + 4);
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
}
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
}
return i;
}
};
template<int cn, int _ksize>
struct SIMDRowSymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
typedef float32x4_t __m128;
#endif
int operator()(float* buffer, float* dst, const float* _kernel, int rowSize) const
{
// rowSize already includes the channel count (the caller multiplies by cn)
int i = 0;
const int ksize2 = _ksize/2;
const float* src = buffer + ksize2*cn;
const float* kx = _kernel + ksize2;
if( _ksize == 3 )
{
if( kx[0] == 2 && kx[1] == 1 )
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_ps(src - cn);
x1 = _mm_loadu_ps(src);
x2 = _mm_loadu_ps(src + cn);
y0 = _mm_loadu_ps(src - cn + 4);
y1 = _mm_loadu_ps(src + 4);
y2 = _mm_loadu_ps(src + cn + 4);
x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
else if( kx[0] == -2 && kx[1] == 1 )
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_ps(src - cn);
x1 = _mm_loadu_ps(src);
x2 = _mm_loadu_ps(src + cn);
y0 = _mm_loadu_ps(src - cn + 4);
y1 = _mm_loadu_ps(src + 4);
y2 = _mm_loadu_ps(src + cn + 4);
x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
else
{
__m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_ps(src - cn);
x1 = _mm_loadu_ps(src);
x2 = _mm_loadu_ps(src + cn);
y0 = _mm_loadu_ps(src - cn + 4);
y1 = _mm_loadu_ps(src + 4);
y2 = _mm_loadu_ps(src + cn + 4);
x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
}
}
else if( _ksize == 5 )
{
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_ps(src - cn*2);
x1 = _mm_loadu_ps(src);
x2 = _mm_loadu_ps(src + cn*2);
y0 = _mm_loadu_ps(src - cn*2 + 4);
y1 = _mm_loadu_ps(src + 4);
y2 = _mm_loadu_ps(src + cn*2 + 4);
x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
else
{
__m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_ps(src - cn);
x1 = _mm_loadu_ps(src);
x2 = _mm_loadu_ps(src + cn);
y0 = _mm_loadu_ps(src - cn + 4);
y1 = _mm_loadu_ps(src + 4);
y2 = _mm_loadu_ps(src + cn + 4);
x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
}
}
else
{
SIMDRow<float,float,float,cn,_ksize> op;
i = op(buffer, dst, _kernel, rowSize);
}
return i;
}
};
template<int cn, int _ksize>
struct SIMDRowASymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
typedef float32x4_t __m128;
#endif
int operator()(float* buffer, float* dst, const float* _kernel, int rowSize) const
{
// rowSize already includes the channel count (the caller multiplies by cn)
int i = 0;
const int ksize2 = _ksize/2;
const float* src = buffer + ksize2*cn;
const float* kx = _kernel + ksize2;
if( _ksize == 3 )
{
if( kx[0] == 0 && kx[1] == 1 )
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x2, y0, y2;
x0 = _mm_loadu_ps(src + cn);
x2 = _mm_loadu_ps(src - cn);
y0 = _mm_loadu_ps(src + cn + 4);
y2 = _mm_loadu_ps(src - cn + 4);
x0 = _mm_sub_ps(x0, x2);
y0 = _mm_sub_ps(y0, y2);
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
else
{
__m128 k1 = _mm_set1_ps(kx[1]);
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x2, y0, y2;
x0 = _mm_loadu_ps(src + cn);
x2 = _mm_loadu_ps(src - cn);
y0 = _mm_loadu_ps(src + cn + 4);
y2 = _mm_loadu_ps(src - cn + 4);
x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
}
}
else if( _ksize == 5 )
{
__m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
for( ; i <= rowSize - 8; i += 8, src += 8 )
{
__m128 x0, x2, y0, y2;
x0 = _mm_loadu_ps(src + cn);
x2 = _mm_loadu_ps(src - cn);
y0 = _mm_loadu_ps(src + cn + 4);
y2 = _mm_loadu_ps(src - cn + 4);
x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
_mm_store_ps(dst + i, x0);
_mm_store_ps(dst + i + 4, y0);
}
}
else
{
SIMDRow<float,float,float,cn,_ksize> op;
i = op(buffer, dst, _kernel, rowSize);
}
return i;
}
};
template<int cn, int _ksize>
struct SIMDColSymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
typedef float32x4_t __m128;
#endif
int operator()(float** buffer, float* dst, const float* _kernel, int rowSize) const
{
// rowSize already includes the channel count (the caller multiplies by cn)
int i = 0, k;
const int ksize2 = _ksize/2;
const float** src = (const float**)buffer+ksize2;
const float* ky = _kernel + ksize2;
float _delta = 0.f;
__m128 d4 = _mm_set1_ps(_delta);
if(_ksize == 3)
{
const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
if( ky[0] == 2 && ky[1] == 1 )
{
for( ; i <= rowSize - 8; i += 8 )
{
__m128 s0, s1, s2, s3, s4, s5;
s0 = _mm_load_ps(S0 + i);
s1 = _mm_load_ps(S0 + i + 4);
s2 = _mm_load_ps(S1 + i);
s3 = _mm_load_ps(S1 + i + 4);
s4 = _mm_load_ps(S2 + i);
s5 = _mm_load_ps(S2 + i + 4);
s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
s0 = _mm_add_ps(s0, d4);
s1 = _mm_add_ps(s1, d4);
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
}
}
else if( ky[0] == -2 && ky[1] == 1 )
{
for( ; i <= rowSize - 8; i += 8 )
{
__m128 s0, s1, s2, s3, s4, s5;
s0 = _mm_load_ps(S0 + i);
s1 = _mm_load_ps(S0 + i + 4);
s2 = _mm_load_ps(S1 + i);
s3 = _mm_load_ps(S1 + i + 4);
s4 = _mm_load_ps(S2 + i);
s5 = _mm_load_ps(S2 + i + 4);
s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
s0 = _mm_add_ps(s0, d4);
s1 = _mm_add_ps(s1, d4);
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
}
}
else
{
__m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
for( ; i <= rowSize - 8; i += 8 )
{
__m128 s0, s1, x0, x1;
s0 = _mm_load_ps(S1 + i);
s1 = _mm_load_ps(S1 + i + 4);
s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
}
}
}
else // kernel greater than 3
{
const float *S, *S2;
for( ; i <= rowSize - 16; i += 16 )
{
__m128 f = _mm_load_ss(ky);
f = _mm_shuffle_ps(f, f, 0);
__m128 s0, s1, s2, s3;
__m128 x0, x1;
S = src[0] + i;
s0 = _mm_load_ps(S);
s1 = _mm_load_ps(S+4);
s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
s2 = _mm_load_ps(S+8);
s3 = _mm_load_ps(S+12);
s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
for( k = 1; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
f = _mm_load_ss(ky+k);
f = _mm_shuffle_ps(f, f, 0);
x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
}
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
_mm_store_ps(dst + i + 8, s2);
_mm_store_ps(dst + i + 12, s3);
}
for( ; i <= rowSize - 4; i += 4 )
{
__m128 f = _mm_load_ss(ky);
f = _mm_shuffle_ps(f, f, 0);
__m128 x0, s0 = _mm_load_ps(src[0] + i);
s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
for( k = 1; k <= ksize2; k++ )
{
f = _mm_load_ss(ky+k);
f = _mm_shuffle_ps(f, f, 0);
S = src[k] + i;
S2 = src[-k] + i;
x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
}
_mm_storeu_ps(dst + i, s0);
}
}
return i;
}
};
template<int cn, int _ksize>
struct SIMDColASymm<float,float,float,cn,_ksize>
{
#ifdef __arm__
typedef float32x4_t __m128;
#endif
int operator()(float** buffer, float* dst, const float* _kernel, int rowSize) const
{
// rowSize already includes the channel count (the caller multiplies by cn)
int i = 0, k;
const int ksize2 = _ksize/2;
const float** src = (const float**)buffer+ksize2;
const float* ky = _kernel + ksize2;
float _delta = 0.f;
__m128 d4 = _mm_set1_ps(_delta);
if(_ksize == 3)
{
const float *S0 = src[-1], *S2 = src[1];
if( std::fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
{
if( ky[1] < 0 )
std::swap(S0, S2);
for( ; i <= rowSize - 8; i += 8 )
{
__m128 s0, s1, s2, s3;
s0 = _mm_load_ps(S2 + i);
s1 = _mm_load_ps(S2 + i + 4);
s2 = _mm_load_ps(S0 + i);
s3 = _mm_load_ps(S0 + i + 4);
s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
}
}
else
{
__m128 k1 = _mm_set1_ps(ky[1]);
for( ; i <= rowSize - 8; i += 8 )
{
__m128 s0 = d4, s1 = d4, x0, x1;
x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
}
}
}
else // kernel greater than 3
{
const float *S, *S2;
for( ; i <= rowSize - 16; i += 16 )
{
__m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
__m128 x0, x1;
for( k = 1; k <= ksize2; k++ )
{
S = src[k] + i;
S2 = src[-k] + i;
f = _mm_load_ss(ky+k);
f = _mm_shuffle_ps(f, f, 0);
x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
}
_mm_store_ps(dst + i, s0);
_mm_store_ps(dst + i + 4, s1);
_mm_store_ps(dst + i + 8, s2);
_mm_store_ps(dst + i + 12, s3);
}
for( ; i <= rowSize - 4; i += 4 )
{
__m128 f, x0, s0 = d4;
for( k = 1; k <= ksize2; k++ )
{
f = _mm_load_ss(ky+k);
f = _mm_shuffle_ps(f, f, 0);
x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
}
_mm_store_ps(dst + i, s0);
}
}
return i;
}
};
#endif
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveRowBuffer
{
const KT* _kernel;
SIMDRow<ST, DT, KT, cn, _ksize> _vecOp;
typedef ST stype;
typedef DT dtype;
enum {
Channels = cn,
KSize = _ksize
};
ConvolveRowBuffer(const KT* kernel)
: _kernel(kernel)
{
}
// horizontal pass
virtual void operator()(ST* buffer, DT* dst, int rowSize) const
{
int i = 0, k;
const ST* S;
rowSize *= cn;
i = _vecOp(buffer, dst, _kernel, rowSize);
for( ; i < rowSize; i++ )
{
S = buffer + i;
DT s0 = _kernel[0]*S[0];
for( k = 1; k < _ksize; k++ )
{
S += cn;
s0 += _kernel[k]*S[0];
}
dst[i] = s0;
}
}
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveColBuffer
{
const KT* _kernel;
SIMDCol<ST, DT, KT, cn, _ksize> _vecOp;
typedef ST stype;
typedef DT dtype;
enum {
Channels = cn,
KSize = _ksize
};
ConvolveColBuffer(const KT* kernel)
: _kernel(kernel)
{
}
// vertical pass
virtual void operator()(ST** buffer, DT* dst, int rowSize) const
{
int i = 0, k;
rowSize *= cn;
i = _vecOp(buffer, dst, _kernel, rowSize);
DT _delta = 0;
for( ; i < rowSize; i++ )
{
ST s0 = _kernel[0]*(buffer[0])[i] + _delta;
for( k = 1; k < _ksize; k++ )
{
s0 += _kernel[k]*(buffer[k])[i];
}
dst[i] = (s0);
}
}
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveRowBufferSymm : public ConvolveRowBuffer<ST,DT,KT,cn,_ksize>
{
SIMDRowSymm<ST, DT, KT, cn, _ksize> _vecOp;
ConvolveRowBufferSymm(const KT* kernel)
:ConvolveRowBuffer<ST,DT,KT,cn,_ksize>(kernel)
{ }
// horizontal pass
void operator()(ST* buffer, DT* dst, int rowSize) const
{
int i = 0, j, k;
rowSize *= cn;
i = _vecOp(buffer, dst, ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize);
const int ksize2 = _ksize/2;
const ST* S = buffer + ksize2*cn;
const KT* kx = ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2;
if( _ksize == 1 && kx[0] == 1 )
{
for( ; i <= rowSize - 2; i += 2 )
{
DT s0 = S[i], s1 = S[i+1];
dst[i] = s0; dst[i+1] = s1;
}
S += i;
}
else if( _ksize == 3 )
{
if( kx[0] == 2 && kx[1] == 1 )
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
dst[i] = s0; dst[i+1] = s1;
}
else if( kx[0] == -2 && kx[1] == 1 )
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
dst[i] = s0; dst[i+1] = s1;
}
else
{
KT k0 = kx[0], k1 = kx[1];
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
dst[i] = s0; dst[i+1] = s1;
}
}
}
else if( _ksize == 5 )
{
KT k0 = kx[0], k1 = kx[1], k2 = kx[2];
if( k0 == -2 && k1 == 0 && k2 == 1 )
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
dst[i] = s0; dst[i+1] = s1;
}
else
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
dst[i] = s0; dst[i+1] = s1;
}
}
for( ; i < rowSize; i++, S++ )
{
DT s0 = kx[0]*S[0];
for( k = 1, j = cn; k <= ksize2; k++, j += cn )
s0 += kx[k]*(S[j] + S[-j]);
dst[i] = s0;
}
}
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveColBufferSymm : public ConvolveColBuffer<ST,DT,KT,cn,_ksize>
{
SIMDColSymm<ST, DT, KT, cn, _ksize> _vecOp;
ConvolveColBufferSymm(const KT* kernel)
:ConvolveColBuffer<ST,DT,KT,cn,_ksize>(kernel)
{ }
// vertical pass
void operator()(ST** buffer, DT* dst, int rowSize) const
{
int i = 0, k;
rowSize *= cn;
i = _vecOp(buffer, dst, ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize);
const int ksize2 = _ksize/2;
ST** src = buffer + ksize2;
const KT* ky = ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2;
ST _delta = 0;
for( ; i < rowSize; i++ )
{
ST s0 = ky[0]*(src[0])[i] + _delta;
for( k = 1; k <= ksize2; k++ )
{
s0 += ky[k]*((src[k])[i] + (src[-k])[i]);
}
dst[i] = (s0);
}
}
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveRowBufferASymm : public ConvolveRowBuffer<ST,DT,KT,cn,_ksize>
{
SIMDRowASymm<ST, DT, KT, cn, _ksize> _vecOp;
ConvolveRowBufferASymm(const KT* kernel)
:ConvolveRowBuffer<ST,DT,KT,cn,_ksize>(kernel)
{ }
// horizontal pass
void operator()(ST* buffer, DT* dst, int rowSize) const
{
int i = 0, j, k;
rowSize *= cn;
i = _vecOp(buffer, dst, ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize);
const int ksize2 = _ksize/2;
const ST* S = buffer + ksize2*cn;
const KT* kx = ConvolveRowBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2;
if( _ksize == 3 )
{
if( kx[0] == 0 && kx[1] == 1 )
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
dst[i] = s0; dst[i+1] = s1;
}
else
{
KT k1 = kx[1];
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
dst[i] = s0; dst[i+1] = s1;
}
}
}
else if( _ksize == 5 )
{
KT k1 = kx[1], k2 = kx[2];
for( ; i <= rowSize - 2; i += 2, S += 2 )
{
DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
dst[i] = s0; dst[i+1] = s1;
}
}
for( ; i < rowSize; i++, S++ )
{
DT s0 = kx[0]*S[0];
for( k = 1, j = cn; k <= ksize2; k++, j += cn )
s0 += kx[k]*(S[j] - S[-j]);
dst[i] = s0;
}
}
};
template<typename ST, typename DT, typename KT, int cn, int _ksize>
struct ConvolveColBufferASymm : public ConvolveColBuffer<ST,DT,KT,cn,_ksize>
{
SIMDColASymm<ST, DT, KT, cn, _ksize> _vecOp;
ConvolveColBufferASymm(const KT* kernel)
:ConvolveColBuffer<ST,DT,KT,cn,_ksize>(kernel)
{ }
// vertical pass
void operator()(ST** buffer, DT* dst, int rowSize) const
{
int i = 0, k;
rowSize *= cn;
i = _vecOp(buffer, dst, ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel, rowSize);
const int ksize2 = _ksize/2;
ST** src = buffer + ksize2;
const KT* ky = ConvolveColBuffer<ST,DT,KT,cn,_ksize>::_kernel + ksize2;
ST _delta = 0;
for( ; i < rowSize; i++ )
{
ST s0 = _delta;
for( k = 1; k <= ksize2; k++ )
{
s0 += ky[k]*((src[k])[i] - (src[-k])[i]);
}
dst[i] = (s0);
}
}
};
static void getGaussianKernel( float* kernel, int ksize, double sigma=0 )
{
const int SMALL_GAUSSIAN_SIZE = 7;
static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
{
{1.f},
{0.25f, 0.5f, 0.25f},
{0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
{0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
};
const float* fixed_kernel = ksize % 2 == 1 && ksize <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
small_gaussian_tab[ksize>>1] : 0;
float* cf = kernel;
double sigmaX = sigma > 0 ? sigma : ((ksize-1)*0.5 - 1)*0.3 + 0.8;
double scale2X = -0.5/(sigmaX*sigmaX);
double sum = 0;
int i;
for( i = 0; i < ksize; ++i )
{
double x = i - (ksize-1)*0.5;
double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
cf[i] = static_cast<float>(t);
sum += t;
}
sum = 1./sum;
for( i = 0; i < ksize; i++ )
{
cf[i] = static_cast<float>(static_cast<double>(cf[i])*sum);
}
}
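/*
Example: getGaussianKernel(k, 3) with sigma <= 0 picks the precomputed small kernel
{0.25f, 0.5f, 0.25f}; getGaussianKernel(k, 7, 1.5) instead evaluates exp(-x*x/(2*1.5*1.5))
at x = -3..3 and normalizes the seven taps so that they sum to 1.
*/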
template<typename _Tp>
static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
{
return (_Tp*)(((size_t)ptr + n-1) & -n);
}
static inline size_t alignSize(const size_t sz, const int n)
{
return (sz + n-1) & -n;
}
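// Example: alignSize(13, 16) == 16 and alignSize(32, 16) == 32; alignPtr() rounds a pointer
// up to the next multiple of n in the same way. n is assumed to be a power of two,
// otherwise the (x + n-1) & -n trick does not work.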
template<typename T, typename DT, int cn, int ksize>
static inline void fillRow(const T* src, DT* rowbuf, int w)
{
int halfsize = ksize/2;
DT* row = rowbuf;
for(int j = 0; j < halfsize; ++j)
{
for(int c = 0; c < cn; ++c, ++row)
{
*row = src[c];
}
}
for(int j = 0; j < w*cn; ++j, ++row)
{
*row = src[j];
}
const T* temp = &(src[w*cn-cn]);
for(int j = 0; j < halfsize; ++j)
{
for(int c = 0; c < cn; ++c, ++row)
{
*row = temp[c];
}
}
}
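// Example: with cn == 1, ksize == 3 and the 4-pixel row {a, b, c, d}, fillRow() produces
// {a, a, b, c, d, d}: the border pixels are replicated ksize/2 times on each side so the
// horizontal pass never reads outside the source row.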
template<typename ST, typename DT>
struct Cast
{
typedef ST stype;
typedef DT dtype;
DT operator()(ST val) const { return static_cast<DT>(val); }
};
template<typename ST, typename DT, int bits>
struct FixedPtCast
{
typedef ST stype;
typedef DT dtype;
enum { SHIFT = bits, DELTA = 1 << (bits-1) };
DT operator()(ST val) const { return static_cast<DT>((val + DELTA)>>SHIFT); }
};
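// Example: FixedPtCast<int, unsigned char, 8>()(300) == (300 + 128) >> 8 == 1, i.e. the
// fixed-point accumulator is rounded to the nearest integer before shifting back down.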
template<typename ST, typename DT, typename KT, int cn, int ksize>
struct ConvolveTraits {};
/*
template<int cn, int ksize>
struct ConvolveTraits<unsigned char, unsigned char, int, cn, ksize>
{
typedef ConvolveRowBuffer<unsigned char, int, int, cn, ksize> cbx;
typedef ConvolveColBuffer<int, int, int, cn, ksize> cby;
typedef ConvolveRowBufferSymm<unsigned char, int, int, cn, ksize> cbx_symm;
typedef ConvolveColBufferSymm<int, int, int, cn, ksize> cby_symm;
typedef FixedPtCast<int,unsigned char, 16> cast;
};
template<int cn, int ksize>
struct ConvolveTraits<unsigned char, unsigned char, float, cn, ksize>
{
typedef ConvolveRowBuffer<float, float, float, cn, ksize> cbx;
typedef ConvolveColBuffer<float, float, float, cn, ksize> cby;
typedef ConvolveRowBufferSymm<float, float, float, cn, ksize> cbx_symm;
typedef ConvolveColBufferSymm<float, float, float, cn, ksize> cby_symm;
typedef Cast<float,unsigned char> cast;
};
template<int cn, int ksize>
struct ConvolveTraits<float, float, float, cn, ksize>
{
typedef ConvolveRowBuffer<float, float, float, cn, ksize> cbx;
typedef ConvolveColBuffer<float, float, float, cn, ksize> cby;
typedef ConvolveRowBufferSymm<float, float, float, cn, ksize> cbx_symm;
typedef ConvolveColBufferSymm<float, float, float, cn, ksize> cby_symm;
typedef Cast<float,float> cast;
};
*/
template<typename T, class Cx, class Cy, class Cast>
static void convolve2(const T* src, int h, int w, int srcStride, T* dst, int dstStride, const Cx& convX, const Cy& convY, const Cast& castOp)
{
typedef typename Cx::dtype wtype;
typedef typename Cx::stype stype;
typedef typename Cy::dtype dtype;
enum {
cn = Cx::Channels,
ksize = Cx::KSize,
halfsize = ksize/2,
};
srcStride /= sizeof(T);
dstStride /= sizeof(T);
size_t row_step = alignSize((w+ksize)*cn * sizeof(wtype), 16);
unsigned char* _buffer = (unsigned char*)alloca(row_step*(ksize+2) + 16);
unsigned char* buffer = alignPtr(_buffer, 16); // 16-byte alignment for the aligned SSE/NEON loads and stores
wtype* rows[ksize+1];
for(int i=0; i<ksize+1; ++i)
{
rows[i] = reinterpret_cast<wtype*>(&buffer[row_step*i]);
}
stype* rowbuf = reinterpret_cast<stype*>(&buffer[row_step*(ksize+1)]);
dtype* outbuf = reinterpret_cast<dtype*>(rowbuf);
const T* input = src;
T* output = dst;
for(int i = halfsize; i < ksize; ++i, input+=srcStride)
{
fillRow<T, stype, cn, ksize>(input, rowbuf, w);
convX(rowbuf, rows[i], w);
}
for(int i = 0; i < halfsize; ++i)
{
memcpy(rows[i], rows[halfsize], row_step);
}
for(int i = halfsize+1; i < h; ++i, input+=srcStride, output+= dstStride)
{
wtype* next_row = rows[ksize];
fillRow<T, stype, cn, ksize>(input, rowbuf, w);
convX(rowbuf, next_row, w);
convY(rows, outbuf, w);
for(int j = 0; j < w*cn; ++j)
{
output[j] = castOp(outbuf[j]);
}
//
wtype* tmp = rows[0];
for(int r = 0; r < ksize; ++r)
{
rows[r] = rows[r+1];
}
rows[ksize] = tmp;
}
assert(input == src+h*srcStride);
// last rows
for(int i = 0; i < halfsize+1; ++i, output+= dstStride)
{
wtype* next_row = rows[ksize];
memcpy(next_row, rows[ksize-1], row_step);
convY(rows, outbuf, w);
for(int j = 0; j < w*cn; ++j)
{
output[j] = castOp(outbuf[j]);
}
//
wtype* tmp = rows[0];
for(int r = 0; r < ksize; ++r)
{
rows[r] = rows[r+1];
}
rows[ksize] = tmp;
}
assert(output == dst+h*dstStride);
}
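/*
Sketch of a custom separable filter built directly on convolve2(), e.g. a 3x3 Sobel-style
x-derivative (antisymmetric row kernel, symmetric column kernel). The buffers, sizes and
strides below are illustrative only:

    static const float kd[3] = { -1.f, 0.f, 1.f }; // derivative along x
    static const float ks[3] = {  1.f, 2.f, 1.f }; // smoothing along y
    typedef convolve::ConvolveRowBufferASymm<float, float, float, 1, 3> Cx;
    typedef convolve::ConvolveColBufferSymm<float, float, float, 1, 3> Cy;
    convolve::convolve2<float, Cx, Cy>(src, h, w, w * (int)sizeof(float),
                                       dst, w * (int)sizeof(float),
                                       Cx(kd), Cy(ks), convolve::Cast<float, float>());
*/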
template<typename T, int cn, int ksize>
static void gaussianSmooth(const T* src, int h, int w, int srcStride, T* dst, int dstStride)
{
float CV_DECL_ALIGNED(16) kernel[33];
assert(ksize<=33);
getGaussianKernel(&kernel[0], ksize, 0);
/*
// we can use Integer kernel for Byte images
int CV_DECL_ALIGNED(16) ikernel[33];
int bits = 8;
for(int i = 0; i < ksize; ++i)
{
ikernel[i] = static_cast<int>(kernel[i] * (1 << bits));
}
typedef ConvolveRowBufferSymm<unsigned char, int, int, cn, ksize> Cx;
typedef ConvolveColBufferSymm<int, int, int, cn, ksize> Cy;
typedef FixedPtCast<int,unsigned char, 16> cast;
convolve2<T, Cx, Cy>(src, h, w, srcStride, dst, dstStride, Cx(ikernel), Cy(ikernel), cast());
*/
//
typedef ConvolveRowBufferSymm<float, float, float, cn, ksize> Cx;
typedef ConvolveColBufferSymm<float, float, float, cn, ksize> Cy;
typedef Cast<float,T> cast;
convolve2<T, Cx, Cy>(src, h, w, srcStride, dst, dstStride, Cx(kernel), Cy(kernel), cast());
//
}
} // namespace
#endif
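/*
Example usage (a minimal sketch; the file name, image buffers, sizes and strides are
hypothetical and only illustrate the calling convention):

    #include "convolution.h"
    #include <vector>

    const int w = 640, h = 480;
    std::vector<float> img(w * h), blurred(w * h);
    // ... fill img ...

    // 5-tap Gaussian blur of a single-channel float image; the strides are given
    // in bytes because convolve2() divides them by sizeof(T).
    convolve::gaussianSmooth<float, 1, 5>(img.data(), h, w, w * (int)sizeof(float),
                                          blurred.data(), w * (int)sizeof(float));
*/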