diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 81d3a7ccb0..bf4491fff6 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -42,6 +42,7 @@ //M*/ #include "precomp.hpp" +#include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_imgproc.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" @@ -467,6 +468,8 @@ template<> struct ColumnSum : public BaseColumnFilter { + enum { SHIFT = 23 }; + ColumnSum( int _ksize, int _anchor, double _scale ) : BaseColumnFilter() { @@ -479,7 +482,7 @@ public BaseColumnFilter if( scale != 1 ) { int d = cvRound(1./scale); - double scalef = ((double)(1 << 16))/d; + double scalef = ((double)(1 << SHIFT))/d; divScale = cvFloor(scalef); scalef -= divScale; divDelta = d/2; @@ -554,35 +557,43 @@ public BaseColumnFilter if( haveScale ) { int i = 0; - #if CV_SSE2 - if(haveSSE2) + #if CV_SIMD128 + v_uint32x4 ds4 = v_setall_u32((unsigned)ds); + v_uint16x8 dd8 = v_setall_u16((ushort)dd); + + for( ; i <= width-16; i+=16 ) { - __m128i ds8 = _mm_set1_epi16((short)ds); - __m128i dd8 = _mm_set1_epi16((short)dd); + v_uint16x8 _sm0 = v_load(Sm + i); + v_uint16x8 _sm1 = v_load(Sm + i + 8); - for( ; i <= width-16; i+=16 ) - { - __m128i _sm0 = _mm_loadu_si128((const __m128i*)(Sm+i)); - __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+8)); + v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); + v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + 8), v_load(Sp + i + 8)); - __m128i _s0 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i)), - _mm_loadu_si128((const __m128i*)(Sp+i))); - __m128i _s1 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i+8)), - _mm_loadu_si128((const __m128i*)(Sp+i+8))); - __m128i _s2 = _mm_mulhi_epu16(_mm_adds_epu16(_s0, dd8), ds8); - __m128i _s3 = _mm_mulhi_epu16(_mm_adds_epu16(_s1, dd8), ds8); - _s0 = _mm_sub_epi16(_s0, _sm0); - _s1 = _mm_sub_epi16(_s1, _sm1); - _mm_storeu_si128((__m128i*)(D+i), _mm_packus_epi16(_s2, _s3)); - _mm_storeu_si128((__m128i*)(SUM+i), _s0); - _mm_storeu_si128((__m128i*)(SUM+i+8), _s1); - } + v_uint32x4 _s00, _s01, _s10, _s11; + + v_expand(_s0 + dd8, _s00, _s01); + v_expand(_s1 + dd8, _s10, _s11); + + _s00 = v_shr(_s00*ds4); + _s01 = v_shr(_s01*ds4); + _s10 = v_shr(_s10*ds4); + _s11 = v_shr(_s11*ds4); + + v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); + v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); + + _s0 = v_sub_wrap(_s0, _sm0); + _s1 = v_sub_wrap(_s1, _sm1); + + v_store(D + i, v_pack_u(r0, r1)); + v_store(SUM + i, _s0); + v_store(SUM + i + 8, _s1); } - #endif + #endif for( ; i < width; i++ ) { int s0 = SUM[i] + Sp[i]; - D[i] = (uchar)((s0 + dd)*ds >> 16); + D[i] = (uchar)((s0 + dd)*ds >> SHIFT); SUM[i] = (ushort)(s0 - Sm[i]); } }