From cb445d697c1f750e8b5e7f49082d3994d1bce49d Mon Sep 17 00:00:00 2001
From: Adrian Stratulat
Date: Sat, 14 Sep 2013 15:12:20 +0300
Subject: [PATCH] Refactor vectorized arithmetical operations

---
 modules/core/src/arithm.cpp | 579 ++++++++++++++++--------------------
 1 file changed, 264 insertions(+), 315 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 313d06d881..ec1fe5adab 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -65,11 +65,24 @@ IPPArithmInitializer ippArithmInitializer;
 
 struct NOP {};
 
-template<typename T, class Op, class Op8>
-void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
+#if CV_SSE2
+
+#define FUNCTOR_TEMPLATE(name)          \
+    template <typename T> struct name {}
+
+FUNCTOR_TEMPLATE(VLoadStore128);
+FUNCTOR_TEMPLATE(VLoadStore64);
+FUNCTOR_TEMPLATE(VLoadStore128Aligned);
+
+#undef FUNCTOR_TEMPLATE
+
+#endif
+
+template<typename T, class Op, class VOp>
+void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
 {
 #if CV_SSE2
-    Op8 op8;
+    VOp vop;
 #endif
     Op op;
 
@@ -79,26 +92,31 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
     {
         int x = 0;
 
-        #if CV_SSE2
+#if CV_SSE2
         if( USE_SSE2 )
         {
-            for( ; x <= sz.width - 32; x += 32 )
+            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
             {
-                __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 16));
-                r0 = op8(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
-                r1 = op8(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 16)));
-                _mm_storeu_si128((__m128i*)(dst + x), r0);
-                _mm_storeu_si128((__m128i*)(dst + x + 16), r1);
-            }
-            for( ; x <= sz.width - 8; x += 8 )
-            {
-                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
-                r0 = op8(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
-                _mm_storel_epi64((__m128i*)(dst + x), r0);
+                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
+                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
+                r0 = vop(r0, VLoadStore128<T>::load(src2 + x               ));
+                r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
+                VLoadStore128<T>::store(dst + x               , r0);
+                VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
             }
         }
-        #endif
+#endif
+#if CV_SSE2
+        if( USE_SSE2 )
+        {
+            for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
+            {
+                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
+                r = vop(r, VLoadStore64<T>::load(src2 + x));
+                VLoadStore64<T>::store(dst + x, r);
+            }
+        }
+#endif
 #if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
@@ -110,17 +128,18 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
             dst[x+2] = v0; dst[x+3] = v1;
         }
 #endif
+
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
 }
 
-template<typename T, class Op, class Op16>
-void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
+template<typename T, class Op, class Op32>
+void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
               T* dst, size_t step, Size sz)
 {
 #if CV_SSE2
-    Op16 op16;
+    Op32 op32;
 #endif
     Op op;
 
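The loop body of the new vBinOp never names an SSE intrinsic directly; it only talks to the VLoadStore128<T>/VLoadStore64<T> traits declared above and to the VOp functor. The per-type specializations are generated further down in this patch by FUNCTOR_LOADSTORE_CAST and FUNCTOR_CLOSURE_2arg; roughly, the uchar case expands to the sketch below (illustrative expansion only, not literal patch content):

    // What FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128)
    // boils down to: the register type plus unaligned load/store wrappers for uchar lanes.
    template <> struct VLoadStore128<uchar>
    {
        typedef __m128i reg_type;
        static reg_type load(const uchar* p)        { return _mm_loadu_si128((const reg_type*)p); }
        static void     store(uchar* p, reg_type v) { _mm_storeu_si128((reg_type*)p, v); }
    };
    // So vBinOp<uchar, OpAdd<uchar>, VAdd<uchar>> processes each 16-byte block effectively as:
    //     __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
    //     r0 = _mm_adds_epu8(r0, _mm_loadu_si128((const __m128i*)(src2 + x)));
    //     _mm_storeu_si128((__m128i*)(dst + x), r0);
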
@@ -130,28 +149,38 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
     {
         int x = 0;
 
-        #if CV_SSE2
+#if CV_SSE2
         if( USE_SSE2 )
         {
-            for( ; x <= sz.width - 16; x += 16 )
+            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
             {
-                __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
-                r0 = op16(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
-                r1 = op16(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 8)));
-                _mm_storeu_si128((__m128i*)(dst + x), r0);
-                _mm_storeu_si128((__m128i*)(dst + x + 8), r1);
-            }
-            for( ; x <= sz.width - 4; x += 4 )
-            {
-                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
-                r0 = op16(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
-                _mm_storel_epi64((__m128i*)(dst + x), r0);
+                for( ; x <= sz.width - 8; x += 8 )
+                {
+                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
+                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
+                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
+                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
+                    VLoadStore128Aligned<T>::store(dst + x    , r0);
+                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
+                }
             }
         }
-        else
-        #endif
-
+#endif
+#if CV_SSE2
+        if( USE_SSE2 )
+        {
+            for( ; x <= sz.width - 8; x += 8 )
+            {
+                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
+                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
+                r0 = op32(r0, VLoadStore128<T>::load(src2 + x    ));
+                r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
+                VLoadStore128<T>::store(dst + x    , r0);
+                VLoadStore128<T>::store(dst + x + 4, r1);
+            }
+        }
+#endif
+#if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
             T v0 = op(src1[x], src2[x]);
@@ -161,6 +190,7 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
+#endif
 
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
@@ -168,120 +198,7 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
 }
 
-template<class Op, class Op32>
-void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
-               int* dst, size_t step, Size sz)
-{
-#if CV_SSE2
-    Op32 op32;
-#endif
-    Op op;
-
-    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
-                        src2 += step2/sizeof(src2[0]),
-                        dst += step/sizeof(dst[0]) )
-    {
-        int x = 0;
-
-#if CV_SSE2
-        if( USE_SSE2 )
-        {
-            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
-                for( ; x <= sz.width - 8; x += 8 )
-                {
-                    __m128i r0 = _mm_load_si128((const __m128i*)(src1 + x));
-                    __m128i r1 = _mm_load_si128((const __m128i*)(src1 + x + 4));
-                    r0 = op32(r0,_mm_load_si128((const __m128i*)(src2 + x)));
-                    r1 = op32(r1,_mm_load_si128((const __m128i*)(src2 + x + 4)));
-                    _mm_store_si128((__m128i*)(dst + x), r0);
-                    _mm_store_si128((__m128i*)(dst + x + 4), r1);
-                }
-            else
-                for( ; x <= sz.width - 8; x += 8 )
-                {
-                    __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
-                    __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 4));
-                    r0 = op32(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
-                    r1 = op32(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 4)));
-                    _mm_storeu_si128((__m128i*)(dst + x), r0);
-                    _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
-                }
-        }
-#endif
-#if CV_ENABLE_UNROLLED
-        for( ; x <= sz.width - 4; x += 4 )
-        {
-            int v0 = op(src1[x], src2[x]);
-            int v1 = op(src1[x+1], src2[x+1]);
-            dst[x] = v0; dst[x+1] = v1;
-            v0 = op(src1[x+2], src2[x+2]);
-            v1 = op(src1[x+3], src2[x+3]);
-            dst[x+2] = v0; dst[x+3] = v1;
-        }
-#endif
-        for( ; x < sz.width; x++ )
-            dst[x] = op(src1[x], src2[x]);
-    }
-}
-
-
-template<class Op, class Op32>
-void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
-               float* dst, size_t step, Size sz)
-{
-#if CV_SSE2
-    Op32 op32;
-#endif
-    Op op;
-
-    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
-                        src2 += step2/sizeof(src2[0]),
-                        dst += step/sizeof(dst[0]) )
-    {
-        int x = 0;
-
-        #if CV_SSE2
-        if( USE_SSE2 )
-        {
-            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
-                for( ; x <= sz.width - 8; x += 8 )
-                {
-                    __m128 r0 = _mm_load_ps(src1 + x);
-                    __m128 r1 = _mm_load_ps(src1 + x + 4);
-                    r0 = op32(r0,_mm_load_ps(src2 + x));
-                    r1 = op32(r1,_mm_load_ps(src2 + x + 4));
-                    _mm_store_ps(dst + x, r0);
-                    _mm_store_ps(dst + x + 4, r1);
-                }
-            else
-                for( ; x <= sz.width - 8; x += 8 )
-                {
-                    __m128 r0 = _mm_loadu_ps(src1 + x);
-                    __m128 r1 = _mm_loadu_ps(src1 + x + 4);
-                    r0 = op32(r0,_mm_loadu_ps(src2 + x));
-                    r1 = op32(r1,_mm_loadu_ps(src2 + x + 4));
-                    _mm_storeu_ps(dst + x, r0);
-                    _mm_storeu_ps(dst + x + 4, r1);
-                }
-        }
-        #endif
-#if CV_ENABLE_UNROLLED
-        for( ; x <= sz.width - 4; x += 4 )
-        {
-            float v0 = op(src1[x], src2[x]);
-            float v1 = op(src1[x+1], src2[x+1]);
-            dst[x] = v0; dst[x+1] = v1;
-            v0 = op(src1[x+2], src2[x+2]);
-            v1 = op(src1[x+3], src2[x+3]);
-            dst[x+2] = v0; dst[x+3] = v1;
-        }
-#endif
-        for( ; x < sz.width; x++ )
-            dst[x] = op(src1[x], src2[x]);
-    }
-}
-
-template<class Op, class Op64>
+template<typename T, class Op, class Op64>
 void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step2,
                double* dst, size_t step, Size sz)
 {
@@ -296,19 +213,24 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step
     {
         int x = 0;
 
-    #if CV_SSE2
-    if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
-        for( ; x <= sz.width - 4; x += 4 )
+#if CV_SSE2
+    if( USE_SSE2 )
+    {
+        if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
         {
-            __m128d r0 = _mm_load_pd(src1 + x);
-            __m128d r1 = _mm_load_pd(src1 + x + 2);
-            r0 = op64(r0,_mm_load_pd(src2 + x));
-            r1 = op64(r1,_mm_load_pd(src2 + x + 2));
-            _mm_store_pd(dst + x, r0);
-            _mm_store_pd(dst + x + 2, r1);
+            for( ; x <= sz.width - 4; x += 4 )
+            {
+                typename VLoadStore128Aligned<double>::reg_type r0 = VLoadStore128Aligned<double>::load(src1 + x    );
+                typename VLoadStore128Aligned<double>::reg_type r1 = VLoadStore128Aligned<double>::load(src1 + x + 2);
+                r0 = op64(r0, VLoadStore128Aligned<double>::load(src2 + x    ));
+                r1 = op64(r1, VLoadStore128Aligned<double>::load(src2 + x + 2));
+                VLoadStore128Aligned<double>::store(dst + x    , r0);
+                VLoadStore128Aligned<double>::store(dst + x + 2, r1);
+            }
         }
-    else
-    #endif
+    }
+#endif
+
     for( ; x <= sz.width - 4; x += 4 )
     {
         double v0 = op(src1[x], src2[x]);
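Both vBinOp32 and vBinOp64f gate the VLoadStore128Aligned path on (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0: ORing the three addresses and masking the low four bits tests all of them for 16-byte alignment in one go, which is what _mm_load_si128/_mm_load_pd require. A minimal standalone illustration of that check (assumed helper, not part of the patch):

    #include <cstdint>
    // True only when every pointer is 16-byte aligned, i.e. safe for aligned SSE2 loads/stores.
    static inline bool allAligned16(const void* a, const void* b, const void* c)
    {
        return (((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & 15) == 0;
    }
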
@@ -326,134 +248,161 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step
 
 #if CV_SSE2
 
-struct _VAdd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu8(a,b); }};
-struct _VSub8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu8(a,b); }};
-struct _VMin8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }};
-struct _VMax8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); }};
-struct _VAbsDiff8u
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_add_epi8(_mm_subs_epu8(a,b),_mm_subs_epu8(b,a)); }
-};
+#define FUNCTOR_TEMPLATE(name)          \
+    template <typename T> struct name {}
 
-struct _VAdd8s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epi8(a,b); }};
-struct _VSub8s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epi8(a,b); }};
-struct _VMin8s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
+#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
+    template <>                                                                                  \
+    struct name<template_arg>{                                                                   \
+        typedef register_type reg_type;                                                          \
+        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p);}; \
+        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v);};       \
+    }
+
+#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
+    template <>                                                \
+    struct name<template_arg>{                                 \
+        typedef register_type reg_type;                        \
+        static reg_type load(const template_arg * p) { return load_body (p);}; \
+        static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
+    }
+
+#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
+    template<>                                        \
+    struct name<template_arg>                         \
+    {                                                 \
+        VLoadStore128<template_arg>::reg_type operator()(                  \
+                        const VLoadStore128<template_arg>::reg_type & a,   \
+                        const VLoadStore128<template_arg>::reg_type & b) const \
+        {                                             \
+            body;                                     \
+        }                                             \
+    }
+
+#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
+    template<>                                        \
+    struct name<template_arg>                         \
+    {                                                 \
+        VLoadStore128<template_arg>::reg_type operator()(                  \
+                        const VLoadStore128<template_arg>::reg_type & a,   \
+                        const VLoadStore128<template_arg>::reg_type &  ) const \
+        {                                             \
+            body;                                     \
+        }                                             \
+    }
+
+FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
+FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
+FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
+FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
+FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
+FUNCTOR_LOADSTORE(     VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps );
+FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd );
+
+FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
+FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
+FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
+FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
+
+FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128);
+FUNCTOR_LOADSTORE(     VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps );
+FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd );
+
+FUNCTOR_TEMPLATE(VAdd);
+FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b));
+
+FUNCTOR_TEMPLATE(VSub);
+FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b));
+
+FUNCTOR_TEMPLATE(VMin);
+FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
+FUNCTOR_CLOSURE_2arg(VMin, schar,
         __m128i m = _mm_cmpgt_epi8(a, b);
         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
-    }
-};
-struct _VMax8s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
+    );
+FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
+FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMin, int,
+        __m128i m = _mm_cmpgt_epi32(a, b);
+        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
+    );
+FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
+FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
+
+FUNCTOR_TEMPLATE(VMax);
+FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
+FUNCTOR_CLOSURE_2arg(VMax, schar,
         __m128i m = _mm_cmpgt_epi8(b, a);
         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
-    }
-};
-struct _VAbsDiff8s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
+    );
+FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
+FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax, int,
+        __m128i m = _mm_cmpgt_epi32(b, a);
+        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
+    );
+FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
+FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
+
+
+static int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
+static int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
+
+FUNCTOR_TEMPLATE(VAbsDiff);
+FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
+        return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
         __m128i d = _mm_subs_epi8(a, b);
         __m128i m = _mm_cmpgt_epi8(b, a);
         return _mm_subs_epi8(_mm_xor_si128(d, m), m);
-    }
-};
-
-struct _VAdd16u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu16(a,b); }};
-struct _VSub16u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu16(a,b); }};
-struct _VMin16u
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); }
-};
-struct _VMax16u
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_adds_epu16(_mm_subs_epu16(a,b),b); }
-};
-struct _VAbsDiff16u
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_add_epi16(_mm_subs_epu16(a,b),_mm_subs_epu16(b,a)); }
-};
-
-struct _VAdd16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epi16(a,b); }};
-struct _VSub16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epi16(a,b); }};
-struct _VMin16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epi16(a,b); }};
-struct _VMax16s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epi16(a,b); }};
-struct _VAbsDiff16s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
-        __m128i M = _mm_max_epi16(a,b), m = _mm_min_epi16(a,b);
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
+        return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
+        __m128i M = _mm_max_epi16(a, b);
+        __m128i m = _mm_min_epi16(a, b);
        return _mm_subs_epi16(M, m);
-    }
-};
-
-struct _VAdd32s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_add_epi32(a,b); }};
-struct _VSub32s { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_sub_epi32(a,b); }};
-struct _VMin32s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
-        __m128i m = _mm_cmpgt_epi32(a, b);
-        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
-    }
-};
-struct _VMax32s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
-        __m128i m = _mm_cmpgt_epi32(b, a);
-        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
-    }
-};
-struct _VAbsDiff32s
-{
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    {
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
        __m128i d = _mm_sub_epi32(a, b);
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_sub_epi32(_mm_xor_si128(d, m), m);
-    }
-};
-
-struct _VAdd32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_add_ps(a,b); }};
-struct _VSub32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_sub_ps(a,b); }};
-struct _VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }};
-struct _VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }};
-static int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
-struct _VAbsDiff32f
-{
-    __m128 operator()(const __m128& a, const __m128& b) const
-    {
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
        return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
-    }
-};
-
-struct _VAdd64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_add_pd(a,b); }};
-struct _VSub64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_sub_pd(a,b); }};
-struct _VMin64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_min_pd(a,b); }};
-struct _VMax64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_max_pd(a,b); }};
-
-static int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
-struct _VAbsDiff64f
-{
-    __m128d operator()(const __m128d& a, const __m128d& b) const
-    {
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
-    }
-};
+    );
 
-struct _VAnd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_and_si128(a,b); }};
-struct _VOr8u  { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_or_si128(a,b); }};
-struct _VXor8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_xor_si128(a,b); }};
-struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { return _mm_xor_si128(_mm_set1_epi32(-1),a); }};
+FUNCTOR_TEMPLATE(VAnd);
+FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
+FUNCTOR_TEMPLATE(VOr);
+FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
+FUNCTOR_TEMPLATE(VXor);
+FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
+FUNCTOR_TEMPLATE(VNot);
+FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
+
+#undef FUNCTOR_TEMPLATE
+#undef FUNCTOR_LOADSTORE_CAST
+#undef FUNCTOR_LOADSTORE
+#undef FUNCTOR_CLOSURE_2arg
+#undef FUNCTOR_CLOSURE_1arg
 
 #endif
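Each FUNCTOR_CLOSURE_2arg call above stamps out a small functor specialization whose operator() wraps one intrinsic, mirroring the _V* structs it replaces. As a rough illustration (hand-expanded sketch, not text from the patch), FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)) produces:

    template<>
    struct VAdd<uchar>
    {
        VLoadStore128<uchar>::reg_type operator()(const VLoadStore128<uchar>::reg_type & a,
                                                  const VLoadStore128<uchar>::reg_type & b) const
        {
            return _mm_adds_epu8(a, b);   // saturating unsigned 8-bit add, same as the old _VAdd8u
        }
    };
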
@@ -534,14 +483,14 @@ static void add8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
-           (vBinOp8<uchar, OpAdd<uchar>, IF_SIMD(_VAdd8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void add8s( const schar* src1, size_t step1,
                    const schar* src2, size_t step2,
                    schar* dst, size_t step, Size sz, void* )
 {
-    vBinOp8<schar, OpAdd<schar>, IF_SIMD(_VAdd8s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void add16u( const ushort* src1, size_t step1,
@@ -550,7 +499,7 @@ static void add16u( const ushort* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
-           (vBinOp16<ushort, OpAdd<ushort>, IF_SIMD(_VAdd16u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void add16s( const short* src1, size_t step1,
@@ -559,14 +508,14 @@ static void add16s( const short* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
-           (vBinOp16<short, OpAdd<short>, IF_SIMD(_VAdd16s)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void add32s( const int* src1, size_t step1,
                     const int* src2, size_t step2,
                     int* dst, size_t step, Size sz, void* )
 {
-    vBinOp32s<OpAdd<int>, IF_SIMD(_VAdd32s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void add32f( const float* src1, size_t step1,
@@ -575,14 +524,14 @@ static void add32f( const float* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp32f<OpAdd<float>, IF_SIMD(_VAdd32f)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void add64f( const double* src1, size_t step1,
                     const double* src2, size_t step2,
                     double* dst, size_t step, Size sz, void* )
 {
-    vBinOp64f<OpAdd<double>, IF_SIMD(_VAdd64f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp64f<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void sub8u( const uchar* src1, size_t step1,
@@ -591,14 +540,14 @@ static void sub8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
-           (vBinOp8<uchar, OpSub<uchar>, IF_SIMD(_VSub8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void sub8s( const schar* src1, size_t step1,
                    const schar* src2, size_t step2,
                    schar* dst, size_t step, Size sz, void* )
 {
-    vBinOp8<schar, OpSub<schar>, IF_SIMD(_VSub8s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void sub16u( const ushort* src1, size_t step1,
@@ -607,7 +556,7 @@ static void sub16u( const ushort* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
-           (vBinOp16<ushort, OpSub<ushort>, IF_SIMD(_VSub16u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void sub16s( const short* src1, size_t step1,
@@ -616,14 +565,14 @@ static void sub16s( const short* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
-           (vBinOp16<short, OpSub<short>, IF_SIMD(_VSub16s)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void sub32s( const int* src1, size_t step1,
                     const int* src2, size_t step2,
                     int* dst, size_t step, Size sz, void* )
 {
-    vBinOp32s<OpSub<int>, IF_SIMD(_VSub32s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void sub32f( const float* src1, size_t step1,
@@ -632,14 +581,14 @@ static void sub32f( const float* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz),
-           (vBinOp32f<OpSub<float>, IF_SIMD(_VSub32f)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void sub64f( const double* src1, size_t step1,
                     const double* src2, size_t step2,
                     double* dst, size_t step, Size sz, void* )
 {
-    vBinOp64f<OpSub<double>, IF_SIMD(_VSub64f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp64f<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
@@ -664,7 +613,7 @@ static void max8u( const uchar* src1, size_t step1,
         }
     }
 #else
-    vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz);
 #endif
 
 //    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
@@ -676,7 +625,7 @@ static void max8s( const schar* src1, size_t step1,
                    const schar* src2, size_t step2,
                    schar* dst, size_t step, Size sz, void* )
 {
-    vBinOp8<schar, OpMax<schar>, IF_SIMD(_VMax8s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void max16u( const ushort* src1, size_t step1,
@@ -698,7 +647,7 @@ static void max16u( const ushort* src1, size_t step1,
         }
     }
 #else
-    vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz);
 #endif
 
 //    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
@@ -710,14 +659,14 @@ static void max16s( const short* src1, size_t step1,
                     const short* src2, size_t step2,
                     short* dst, size_t step, Size sz, void* )
 {
-    vBinOp16<short, OpMax<short>, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void max32s( const int* src1, size_t step1,
                     const int* src2, size_t step2,
                     int* dst, size_t step, Size sz, void* )
 {
-    vBinOp32s<OpMax<int>, IF_SIMD(_VMax32s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void max32f( const float* src1, size_t step1,
@@ -739,7 +688,7 @@ static void max32f( const float* src1, size_t step1,
         }
     }
 #else
-    vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz);
 #endif
 //    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
 //           ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
@@ -750,7 +699,7 @@ static void max64f( const double* src1, size_t step1,
                     const double* src2, size_t step2,
                     double* dst, size_t step, Size sz, void* )
 {
-    vBinOp64f<OpMax<double>, IF_SIMD(_VMax64f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp64f<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void min8u( const uchar* src1, size_t step1,
@@ -772,7 +721,7 @@ static void min8u( const uchar* src1, size_t step1,
         }
     }
 #else
-    vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz);
 #endif
 
 //    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
@@ -784,7 +733,7 @@ static void min8s( const schar* src1, size_t step1,
                    const schar* src2, size_t step2,
                    schar* dst, size_t step, Size sz, void* )
 {
-    vBinOp8<schar, OpMin<schar>, IF_SIMD(_VMin8s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void min16u( const ushort* src1, size_t step1,
@@ -806,7 +755,7 @@ static void min16u( const ushort* src1, size_t step1,
         }
     }
 #else
-    vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz);
 #endif
 
 //    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
@@ -818,14 +767,14 @@ static void min16s( const short* src1, size_t step1,
                     const short* src2, size_t step2,
                     short* dst, size_t step, Size sz, void* )
 {
-    vBinOp16<short, OpMin<short>, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void min32s( const int* src1, size_t step1,
                     const int* src2, size_t step2,
                     int* dst, size_t step, Size sz, void* )
 {
-    vBinOp32s<OpMin<int>, IF_SIMD(_VMin32s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void min32f( const float* src1, size_t step1,
@@ -847,7 +796,7 @@ static void min32f( const float* src1, size_t step1,
         }
     }
 #else
-    vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, sz);
 #endif
 //    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
 //           ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
@@ -858,7 +807,7 @@ static void min64f( const double* src1, size_t step1,
                     const double* src2, size_t step2,
                     double* dst, size_t step, Size sz, void* )
 {
-    vBinOp64f<OpMin<double>, IF_SIMD(_VMin64f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp64f<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void absdiff8u( const uchar* src1, size_t step1,
@@ -867,14 +816,14 @@ static void absdiff8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp8<uchar, OpAbsDiff<uchar>, IF_SIMD(_VAbsDiff8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void absdiff8s( const schar* src1, size_t step1,
                        const schar* src2, size_t step2,
                        schar* dst, size_t step, Size sz, void* )
 {
-    vBinOp8<schar, OpAbsDiff<schar>, IF_SIMD(_VAbsDiff8s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void absdiff16u( const ushort* src1, size_t step1,
@@ -883,21 +832,21 @@ static void absdiff16u( const ushort* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp16<ushort, OpAbsDiff<ushort>, IF_SIMD(_VAbsDiff16u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void absdiff16s( const short* src1, size_t step1,
                         const short* src2, size_t step2,
                         short* dst, size_t step, Size sz, void* )
 {
-    vBinOp16<short, OpAbsDiff<short>, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void absdiff32s( const int* src1, size_t step1,
                         const int* src2, size_t step2,
                         int* dst, size_t step, Size sz, void* )
 {
-    vBinOp32s<OpAbsDiff<int>, IF_SIMD(_VAbsDiff32s)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
 static void absdiff32f( const float* src1, size_t step1,
@@ -906,14 +855,14 @@ static void absdiff32f( const float* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp32f<OpAbsDiff<float>, IF_SIMD(_VAbsDiff32f)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void absdiff64f( const double* src1, size_t step1,
                         const double* src2, size_t step2,
                         double* dst, size_t step, Size sz, void* )
 {
-    vBinOp64f<OpAbsDiff<double>, IF_SIMD(_VAbsDiff64f)>(src1, step1, src2, step2, dst, step, sz);
+    vBinOp64f<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz);
 }
 
@@ -923,7 +872,7 @@ static void and8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp8<uchar, OpAnd<uchar>, IF_SIMD(_VAnd8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void or8u( const uchar* src1, size_t step1,
@@ -932,7 +881,7 @@ static void or8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp8<uchar, OpOr<uchar>, IF_SIMD(_VOr8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void xor8u( const uchar* src1, size_t step1,
@@ -941,7 +890,7 @@ static void xor8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
-           (vBinOp8<uchar, OpXor<uchar>, IF_SIMD(_VXor8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 static void not8u( const uchar* src1, size_t step1,
@@ -950,7 +899,7 @@ static void not8u( const uchar* src1, size_t step1,
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
            ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, (IppiSize&)sz),
-           (vBinOp8<uchar, OpNot<uchar>, IF_SIMD(_VNot8u)>(src1, step1, src2, step2, dst, step, sz)));
+           (vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
 /****************************************************************************************\
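A note on the ushort closures in this patch: VMin<ushort> computes a - subs(a, b) and VMax<ushort> computes adds(subs(a, b), b) because SSE2 has no unsigned 16-bit min/max intrinsic (_mm_min_epu16/_mm_max_epu16 only arrive with SSE4.1). The identities are easy to sanity-check with scalar code (standalone sketch, not part of the patch):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Scalar models of the per-lane saturating ops _mm_subs_epu16 / _mm_adds_epu16.
    static uint16_t subs_u16(uint16_t a, uint16_t b) { return a > b ? uint16_t(a - b) : uint16_t(0); }
    static uint16_t adds_u16(uint16_t a, uint16_t b) { uint32_t s = uint32_t(a) + b; return uint16_t(s > 0xffff ? 0xffff : s); }

    int main()
    {
        for (uint32_t a = 0; a < 65536; a += 251)
            for (uint32_t b = 0; b < 65536; b += 257)
            {
                assert(uint16_t(a - subs_u16(uint16_t(a), uint16_t(b))) == std::min(a, b));      // VMin<ushort>
                assert(adds_u16(subs_u16(uint16_t(a), uint16_t(b)), uint16_t(b)) == std::max(a, b)); // VMax<ushort>
            }
        return 0;
    }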