diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 6ab4ccb36c..ef74176f33 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_FP16 #endif +#if CV_SSE2 || CV_NEON || CV_VSX +#define CV__SIMD_FORWARD 128 +#include "opencv2/core/hal/intrin_forward.hpp" +#endif + #if CV_SSE2 +#include "opencv2/core/hal/intrin_sse_em.hpp" #include "opencv2/core/hal/intrin_sse.hpp" #elif CV_NEON @@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; // (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load()) #if CV_AVX2 +#define CV__SIMD_FORWARD 256 +#include "opencv2/core/hal/intrin_forward.hpp" #include "opencv2/core/hal/intrin_avx.hpp" #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index a38c25e385..0cf36cf174 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v) inline __m128d _v256_extract_low(const __m256d& v) { return _mm256_castpd256_pd128(v); } +inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) +{ + const __m256i m = _mm256_set1_epi32(65535); + __m256i am = _mm256_min_epu32(a, m); + __m256i bm = _mm256_min_epu32(b, m); + return _mm256_packus_epi32(am, bm); +} + ///////// Types //////////// struct v_uint8x32 @@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16) OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16) OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) @@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) +// saturating multiply 8-bit, 16-bit +inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +{ + v_uint16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); +} +inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +{ + v_int16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i pl = _mm256_mullo_epi16(a.val, b.val); + __m256i ph = _mm256_mulhi_epu16(a.val, b.val); + __m256i p0 = _mm256_unpacklo_epi16(pl, ph); + __m256i p1 = _mm256_unpackhi_epi16(pl, ph); + return v_uint16x16(_v256_packs_epu32(p0, p1)); +} +inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +{ + __m256i pl = _mm256_mullo_epi16(a.val, b.val); + __m256i ph = _mm256_mulhi_epi16(a.val, b.val); + __m256i p0 = _mm256_unpacklo_epi16(pl, ph); + __m256i p1 = _mm256_unpackhi_epi16(pl, ph); + return v_int16x16(_mm256_packs_epi32(p0, p1)); +} +inline v_uint8x32& operator *= (v_uint8x32& 
a, const v_uint8x32& b) +{ a = a * b; return a; } +inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) +{ a = a * b; return a; } +inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) +{ a = a * b; return a; } +inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) +{ a = a * b; return a; } + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16) + +inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i ad = _mm256_srai_epi16(a.val, 8); + __m256i bd = _mm256_srai_epi16(b.val, 8); + __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even + __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd + + const __m256i b01 = _mm256_set1_epi32(0xFF00FF00); + return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01)); +} +inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b, + v_uint16x16& c, v_uint16x16& d) +{ + v_uint16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b, + v_int16x16& c, v_int16x16& d) +{ + v_int16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, v_int32x8& c, v_int32x8& d) { v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); v_int16x16 v0, v1; - v_zip(a * b, vhi, v0, v1); + v_zip(v_mul_wrap(a, b), vhi, v0, v1); c = v_reinterpret_as_s32(v0); d = v_reinterpret_as_s32(v1); @@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); v_uint16x16 v0, v1; - v_zip(a * b, vhi, v0, v1); + v_zip(v_mul_wrap(a, b), vhi, v0, v1); c = v_reinterpret_as_u32(v0); d = v_reinterpret_as_u32(v1); @@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); } inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); } -/** Non-saturating arithmetics **/ -#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ - inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } - -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) 
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) - /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ @@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca b0.val = intrin(_v256_extract_low(a.val)); \ b1.val = intrin(_v256_extract_high(a.val)); \ } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \ inline _Tpwvec v256_load_expand(const _Tp* ptr) \ { \ __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ @@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } inline void v_pack_store(uchar* ptr, const v_uint16x16& a) -{ v_store_low(ptr, v_pack(a, a)); } +{ + const __m256i m = _mm256_set1_epi16(255); + __m256i am = _mm256_min_epu16(a.val, m); + am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am)); + v_store_low(ptr, v_uint8x32(am)); +} inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack_u(a, a)); } @@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); } inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) -{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } +{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); } inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) -{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); } +{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } inline void v_pack_store(short* ptr, const v_int32x8& a) { v_store_low(ptr, v_pack(a, a)); } inline void v_pack_store(ushort* ptr, const v_uint32x8& a) -{ v_store_low(ptr, v_pack(a, a)); } +{ + const __m256i m = _mm256_set1_epi32(65535); + __m256i am = _mm256_min_epu32(a.val, m); + am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am)); + v_store_low(ptr, v_uint16x16(am)); +} inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) { v_store_low(ptr, v_pack_u(a, a)); } diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 64a457a530..38a39172d0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -108,7 +108,7 @@ block and to save contents of the register to memory block. These operations allow to reorder or recombine elements in one or multiple vectors. 
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave -- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand +- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high @@ -185,11 +185,14 @@ Regular integers: |load, store | x | x | x | x | x | x | |interleave | x | x | x | x | x | x | |expand | x | x | x | x | x | x | +|expand_low | x | x | x | x | x | x | +|expand_high | x | x | x | x | x | x | |expand_q | x | x | | | | | |add, sub | x | x | x | x | x | x | |add_wrap, sub_wrap | x | x | x | x | | | -|mul | | | x | x | x | x | -|mul_expand | | | x | x | x | | +|mul_wrap | x | x | x | x | | | +|mul | x | x | x | x | x | x | +|mul_expand | x | x | x | x | x | | |compare | x | x | x | x | x | x | |shift | | | x | x | x | x | |dotprod | | | | x | | | @@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=) //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ +#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \ template \ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ @@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ /** @brief Add values without saturation For 8- and 16-bit integer values. */ -OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) +OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp) /** @brief Subtract values without saturation For 8- and 16-bit integer values. */ -OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) +OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp) + +/** @brief Multiply values without saturation + +For 8- and 16-bit integer values. */ +OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp) //! @cond IGNORED template inline T _absdiff(T a, T b) @@ -1106,6 +1114,44 @@ template inline void v_expand(const v_reg<_Tp, n>& a, } } +/** @brief Expand lower values to the wider pack type + +Same as cv::v_expand, but return lower half of the vector. + +Scheme: +@code + int32x4 int64x2 +{A B C D} ==> {A B} +@endcode */ +template +inline v_reg::w_type, n/2> +v_expand_low(const v_reg<_Tp, n>& a) +{ + v_reg::w_type, n/2> b; + for( int i = 0; i < (n/2); i++ ) + b.s[i] = a.s[i]; + return b; +} + +/** @brief Expand higher values to the wider pack type + +Same as cv::v_expand_low, but expand higher half of the vector instead. + +Scheme: +@code + int32x4 int64x2 +{A B C D} ==> {C D} +@endcode */ +template +inline v_reg::w_type, n/2> +v_expand_high(const v_reg<_Tp, n>& a) +{ + v_reg::w_type, n/2> b; + for( int i = 0; i < (n/2); i++ ) + b.s[i] = a.s[i+(n/2)]; + return b; +} + //! @cond IGNORED template inline v_reg::int_type, n> v_reinterpret_as_int(const v_reg<_Tp, n>& a) diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp new file mode 100644 index 0000000000..4618552907 --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -0,0 +1,158 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef CV__SIMD_FORWARD +#error "Need to pre-define forward width" +#endif + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +/** Types **/ +#if CV__SIMD_FORWARD == 512 +// [todo] 512 +#error "AVX512 Not implemented yet" +#elif CV__SIMD_FORWARD == 256 +// 256 +#define __CV_VX(fun) v256_##fun +#define __CV_V_UINT8 v_uint8x32 +#define __CV_V_INT8 v_int8x32 +#define __CV_V_UINT16 v_uint16x16 +#define __CV_V_INT16 v_int16x16 +#define __CV_V_UINT32 v_uint32x8 +#define __CV_V_INT32 v_int32x8 +#define __CV_V_UINT64 v_uint64x4 +#define __CV_V_INT64 v_int64x4 +#define __CV_V_FLOAT32 v_float32x8 +#define __CV_V_FLOAT64 v_float64x4 +struct v_uint8x32; +struct v_int8x32; +struct v_uint16x16; +struct v_int16x16; +struct v_uint32x8; +struct v_int32x8; +struct v_uint64x4; +struct v_int64x4; +struct v_float32x8; +struct v_float64x4; +#else +// 128 +#define __CV_VX(fun) v_##fun +#define __CV_V_UINT8 v_uint8x16 +#define __CV_V_INT8 v_int8x16 +#define __CV_V_UINT16 v_uint16x8 +#define __CV_V_INT16 v_int16x8 +#define __CV_V_UINT32 v_uint32x4 +#define __CV_V_INT32 v_int32x4 +#define __CV_V_UINT64 v_uint64x2 +#define __CV_V_INT64 v_int64x2 +#define __CV_V_FLOAT32 v_float32x4 +#define __CV_V_FLOAT64 v_float64x2 +struct v_uint8x16; +struct v_int8x16; +struct v_uint16x8; +struct v_int16x8; +struct v_uint32x4; +struct v_int32x4; +struct v_uint64x2; +struct v_int64x2; +struct v_float32x4; +struct v_float64x2; +#endif + +/** Value reordering **/ + +// Expansion +void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&); +void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&); +void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&); +void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&); +void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&); +void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&); +// Low Expansion +__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&); +__CV_V_INT16 v_expand_low(const __CV_V_INT8&); +__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&); +__CV_V_INT32 v_expand_low(const __CV_V_INT16&); +__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&); +__CV_V_INT64 v_expand_low(const __CV_V_INT32&); +// High Expansion +__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&); +__CV_V_INT16 v_expand_high(const __CV_V_INT8&); +__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&); +__CV_V_INT32 v_expand_high(const __CV_V_INT16&); +__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&); +__CV_V_INT64 v_expand_high(const __CV_V_INT32&); +// Load & Low Expansion +__CV_V_UINT16 __CV_VX(load_expand)(const uchar*); +__CV_V_INT16 __CV_VX(load_expand)(const schar*); +__CV_V_UINT32 __CV_VX(load_expand)(const ushort*); +__CV_V_INT32 __CV_VX(load_expand)(const short*); +__CV_V_UINT64 __CV_VX(load_expand)(const uint*); +__CV_V_INT64 __CV_VX(load_expand)(const int*); +// Load lower 8-bit and expand into 32-bit +__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*); +__CV_V_INT32 __CV_VX(load_expand_q)(const schar*); + +// Saturating Pack +__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&); +__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&); +__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&); +__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&); +// Non-saturating Pack +__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&); 
+__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&); +// Pack signed integers with unsigned saturation +__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&); +__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&); + +/** Arithmetic, bitwise and comparison operations **/ + +// Non-saturating multiply +#if CV_VSX +template +Tvec v_mul_wrap(const Tvec& a, const Tvec& b); +#else +__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&); +__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&); +__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&); +__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&); +#endif + +// Multiply and expand +#if CV_VSX +template +void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d); +#else +void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&); +void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&); +void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&); +void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&); +void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&); +void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&); +#endif + +/** Cleanup **/ +#undef CV__SIMD_FORWARD +#undef __CV_VX +#undef __CV_V_UINT8 +#undef __CV_V_INT8 +#undef __CV_V_UINT16 +#undef __CV_V_INT16 +#undef __CV_V_UINT32 +#undef __CV_V_INT32 +#undef __CV_V_UINT64 +#undef __CV_V_INT64 +#undef __CV_V_FLOAT32 +#undef __CV_V_FLOAT64 + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: \ No newline at end of file diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index d87b4e2ba0..8c13ad52db 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -435,10 +435,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) @@ -476,6 +474,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) } #endif +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4) + +// Multiply and expand +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); + d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); +} + 
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val)); + d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); +} + inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d) { @@ -714,6 +743,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16) // TODO: absdiff for signed integers OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) @@ -1056,6 +1089,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \ b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \ } \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \ +} \ inline _Tpwvec v_load_expand(const _Tp* ptr) \ { \ return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 18bdf46f90..d4740b72fe 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -59,6 +59,8 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +///////// Types //////////// + struct v_uint8x16 { typedef uchar lane_type; @@ -436,13 +438,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) } inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); - __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); - __m128i r = _mm_packs_epi32(a1, b1); - return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); -} +{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); } inline void v_pack_store(ushort* ptr, const v_uint32x4& a) { @@ -678,14 +674,14 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, 
_mm_sub_ps) OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) @@ -699,35 +695,49 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) -inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4) +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4) + +inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) { - __m128i c0 = _mm_mul_epu32(a.val, b.val); - __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); - __m128i d0 = _mm_unpacklo_epi32(c0, c1); - __m128i d1 = _mm_unpackhi_epi32(c0, c1); - return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); + v_uint16x8 c, d; + v_mul_expand(a, b, c, d); + return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); } -inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) +inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) +{ a = a * b; return a; } + +// Multiply and expand +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) { -#if CV_SSE4_1 - return v_int32x4(_mm_mullo_epi32(a.val, b.val)); -#else - __m128i c0 = _mm_mul_epu32(a.val, b.val); - __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); - __m128i d0 = _mm_unpacklo_epi32(c0, c1); - __m128i d1 = _mm_unpackhi_epi32(c0, c1); - return v_int32x4(_mm_unpacklo_epi64(d0, d1)); -#endif + v_uint16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); } -inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) + +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) { - a = a * b; - return a; -} -inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) -{ - a = a * b; - return a; + v_int16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); } inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, @@ -1018,6 +1028,22 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16) + +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i ad = _mm_srai_epi16(a.val, 8); + __m128i bd = _mm_srai_epi16(b.val, 8); + __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even + __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd + const __m128i b01 = _mm_set1_epi32(0xFF00FF00); + return v_uint8x16(_v128_blendv_epi8(p0, p1, b01)); +} +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} #define 
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ @@ -1502,70 +1528,39 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) #endif -#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ -inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ -{ \ - __m128i z = _mm_setzero_si128(); \ - b0.val = _mm_unpacklo_##suffix(a.val, z); \ - b1.val = _mm_unpackhi_##suffix(a.val, z); \ -} \ -inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ -{ \ - __m128i z = _mm_setzero_si128(); \ - return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ -} \ -inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ -{ \ - b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ - b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ -} \ -inline _Tpwsvec v_load_expand(const _Tps* ptr) \ -{ \ - __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ - return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ -} +/* Expand */ +#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(a.val); \ + b1.val = __CV_CAT(intrin, _high)(a.val); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(a.val)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \ + inline _Tpwvec v_load_expand(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } -OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) -OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16) +OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32) +OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64) +OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64) -inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1) -{ - __m128i z = _mm_setzero_si128(); - b0.val = _mm_unpacklo_epi32(a.val, z); - b1.val = _mm_unpackhi_epi32(a.val, z); -} -inline v_uint64x2 v_load_expand(const unsigned* ptr) -{ - __m128i z = _mm_setzero_si128(); - return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z)); -} -inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1) -{ - __m128i s = _mm_srai_epi32(a.val, 31); - b0.val = _mm_unpacklo_epi32(a.val, s); - b1.val = _mm_unpackhi_epi32(a.val, s); -} -inline v_int64x2 v_load_expand(const int* ptr) -{ - __m128i a = _mm_loadl_epi64((const __m128i*)ptr); - __m128i s = _mm_srai_epi32(a, 31); - return v_int64x2(_mm_unpacklo_epi32(a, s)); -} +#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \ + return _Tpvec(intrin(a)); \ + } -inline v_uint32x4 v_load_expand_q(const uchar* ptr) -{ - __m128i z = _mm_setzero_si128(); - __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); - 
return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); -} - -inline v_int32x4 v_load_expand_q(const schar* ptr) -{ - __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); - a = _mm_unpacklo_epi8(a, a); - a = _mm_unpacklo_epi8(a, a); - return v_int32x4(_mm_srai_epi32(a, 24)); -} +OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32) +OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32) #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp b/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp new file mode 100644 index 0000000000..be2766847c --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp @@ -0,0 +1,167 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP +#define OPENCV_HAL_INTRIN_SSE_EM_HPP + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \ + inline tp _v128_##fun(const tp& a) \ + { return _mm_##fun(a); } + +#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \ + inline tp _v128_##fun(const tp& a, const tp& b) \ + { return _mm_##fun(a, b); } + +#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \ + inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \ + { return _mm_##fun(a, b, c); } + +///////////////////////////// XOP ///////////////////////////// + +// [todo] define CV_XOP +#if 1 // CV_XOP +inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b) +{ + const __m128i delta = _mm_set1_epi32((int)0x80000000); + return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta)); +} +// wrapping XOP +#else +OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i) +#endif // !CV_XOP + +///////////////////////////// SSE4.1 ///////////////////////////// + +#if !CV_SSE4_1 + +/** Swizzle **/ +inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask) +{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); } + +/** Convert **/ +// 8 >> 16 +inline __m128i _v128_cvtepu8_epi16(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi8(a, z); +} +inline __m128i _v128_cvtepi8_epi16(const __m128i& a) +{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); } +// 8 >> 32 +inline __m128i _v128_cvtepu8_epi32(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z); +} +inline __m128i _v128_cvtepi8_epi32(const __m128i& a) +{ + __m128i r = _mm_unpacklo_epi8(a, a); + r = _mm_unpacklo_epi8(r, r); + return _mm_srai_epi32(r, 24); +} +// 16 >> 32 +inline __m128i _v128_cvtepu16_epi32(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi16(a, z); +} +inline __m128i _v128_cvtepi16_epi32(const __m128i& a) +{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); } +// 32 >> 64 +inline __m128i _v128_cvtepu32_epi64(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi32(a, z); +} +inline __m128i _v128_cvtepi32_epi64(const __m128i& a) +{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); } + +/** Arithmetic **/ +inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b) +{ + __m128i c0 = _mm_mul_epu32(a, b); + __m128i c1 = 
_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)); + __m128i d0 = _mm_unpacklo_epi32(c0, c1); + __m128i d1 = _mm_unpackhi_epi32(c0, c1); + return _mm_unpacklo_epi64(d0, d1); +} + +/** Math **/ +inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b) +{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); } + +// wrapping SSE4.1 +#else +OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i) +OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i) +OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i) +#endif // !CV_SSE4_1 + +///////////////////////////// Revolutionary ///////////////////////////// + +/** Convert **/ +// 16 << 8 +inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpackhi_epi8(a, z); +} +inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a) +{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); } +// 32 << 16 +inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpackhi_epi16(a, z); +} +inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a) +{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); } +// 64 << 32 +inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpackhi_epi32(a, z); +} +inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a) +{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); } + +/** Miscellaneous **/ +inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b) +{ + const __m128i m = _mm_set1_epi32(65535); + __m128i am = _v128_min_epu32(a, m); + __m128i bm = _v128_min_epu32(b, m); +#if CV_SSE4_1 + return _mm_packus_epi32(am, bm); +#else + const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768); + am = _mm_sub_epi32(am, d); + bm = _mm_sub_epi32(bm, d); + am = _mm_packs_epi32(am, bm); + return _mm_sub_epi16(am, nd); +#endif +} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! 
@endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP \ No newline at end of file diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index fb81986f6c..fd554ac755 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -315,6 +315,10 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ b0.val = fh(a.val); \ b1.val = fl(a.val); \ } \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ return _Tpwvec(fh(a.val)); } \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ return _Tpwvec(fl(a.val)); } \ inline _Tpwvec v_load_expand(const _Tp* ptr) \ { return _Tpwvec(fh(vec_ld_l8(ptr))); } @@ -418,10 +422,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub) OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul) @@ -441,16 +443,30 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub) -inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d) +// saturating multiply +#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4) + +template +inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d) { - c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)); - d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)); -} -inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d) -{ - c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)); - d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)); + Twvec p0 = Twvec(vec_mule(a.val, b.val)); + Twvec p1 = Twvec(vec_mulo(a.val, b.val)); + v_zip(p0, p1, c, d); } + inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d) { c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)); @@ -459,17 +475,17 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { - return v_int16x8(vec_packs( - vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)), - vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16)) - )); + vec_int4 p0 = vec_mule(a.val, b.val); + vec_int4 p1 = vec_mulo(a.val, b.val); + static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}; + return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm)); } inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { - return v_uint16x8(vec_packs( - 
vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)), - vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16)) - )); + vec_uint4 p0 = vec_mule(a.val, b.val); + vec_uint4 p1 = vec_mulo(a.val, b.val); + static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}; + return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm)); } /** Non-saturating arithmetics **/ @@ -480,6 +496,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add) OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \ diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index d4dab9eed7..b4e3f30562 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -130,19 +130,21 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \ # undef vec_mul # endif /* - * there's no a direct instruction for supporting 16-bit multiplication in ISA 2.07, + * there's no a direct instruction for supporting 8-bit, 16-bit multiplication in ISA 2.07, * XLC Implement it by using instruction "multiply even", "multiply odd" and "permute" - * todo: Do I need to support 8-bit ? **/ -# define VSX_IMPL_MULH(Tvec, Tcast) \ - VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \ - { \ - static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \ - 8, 9, 24, 25, 12, 13, 28, 29}; \ - return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \ +# define VSX_IMPL_MULH(Tvec, cperm) \ + VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \ + { \ + static const vec_uchar16 ev_od = {cperm}; \ + return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \ } - VSX_IMPL_MULH(vec_short8, vec_short8_c) - VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c) + #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 + VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16) + VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16) + #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 + VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8) + VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8) // vmuluwm can be used for unsigned or signed integers, that's what they said VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul) VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 6666bc4253..40d282b1c2 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -407,10 +407,13 @@ template struct TheTest Data resB = vx_load_expand(dataA.d); - Rx2 c, d; + Rx2 c, d, e, f; v_expand(a, c, d); - Data resC = c, resD = d; + e = v_expand_low(a); + f = v_expand_high(a); + + Data resC = c, resD = d, resE = e, resF = f; const int n = Rx2::nlanes; for (int i = 0; i < n; ++i) { @@ -418,6 +421,8 @@ template struct TheTest EXPECT_EQ(dataA[i], resB[i]); EXPECT_EQ(dataA[i], resC[i]); EXPECT_EQ(dataA[i + n], resD[i]); + EXPECT_EQ(dataA[i], resE[i]); + EXPECT_EQ(dataA[i + n], resF[i]); } return *this; @@ -455,19 +460,21 @@ template struct TheTest return *this; } - TheTest & test_addsub_wrap() + TheTest & test_arithm_wrap() { Data dataA, dataB; dataB.reverse(); R a = dataA, b = dataB; Data resC = v_add_wrap(a, b), - resD = 
v_sub_wrap(a, b); + resD = v_sub_wrap(a, b), + resE = v_mul_wrap(a, b); for (int i = 0; i < R::nlanes; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]); EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]); + EXPECT_EQ((LaneType)(dataA[i] * dataB[i]), resE[i]); } return *this; } @@ -475,6 +482,7 @@ template struct TheTest TheTest & test_mul() { Data dataA, dataB; + dataA[1] = static_cast(std::numeric_limits::max()); dataB.reverse(); R a = dataA, b = dataB; @@ -482,7 +490,7 @@ template struct TheTest for (int i = 0; i < R::nlanes; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ(dataA[i] * dataB[i], resC[i]); + EXPECT_EQ(saturate_cast(dataA[i] * dataB[i]), resC[i]); } return *this; @@ -1209,7 +1217,9 @@ void test_hal_intrin_uint8() .test_expand() .test_expand_q() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() + .test_mul() + .test_mul_expand() .test_cmp() .test_logic() .test_min_max() @@ -1242,7 +1252,9 @@ void test_hal_intrin_int8() .test_expand() .test_expand_q() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() + .test_mul() + .test_mul_expand() .test_cmp() .test_logic() .test_min_max() @@ -1267,7 +1279,7 @@ void test_hal_intrin_uint16() .test_interleave() .test_expand() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() .test_mul() .test_mul_expand() .test_cmp() @@ -1295,7 +1307,7 @@ void test_hal_intrin_int16() .test_interleave() .test_expand() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() .test_mul() .test_mul_expand() .test_cmp() diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 5cfb616503..1caadbbbad 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,3 +1,3 @@ set(the_description "Image Processing") -ocv_add_dispatched_file(accum SSE2 AVX NEON) +ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/perf/perf_accumulate.cpp b/modules/imgproc/perf/perf_accumulate.cpp index f9cd80af71..c52b31e84d 100644 --- a/modules/imgproc/perf/perf_accumulate.cpp +++ b/modules/imgproc/perf/perf_accumulate.cpp @@ -5,94 +5,102 @@ namespace opencv_test { -#ifdef HAVE_OPENVX -PERF_TEST_P(Size_MatType, Accumulate, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_16SC1, CV_32FC1) - ) -) -#else -PERF_TEST_P( Size_MatType, Accumulate, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_32FC1) - ) - ) -#endif -{ - Size sz = get<0>(GetParam()); - int dstType = get<1>(GetParam()); +typedef Size_MatType Accumulate; - Mat src(sz, CV_8UC1); - Mat dst(sz, dstType); +#define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1 +#define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3 +#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1 +#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1 - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); +#define PERF_ACCUMULATE_INIT(_FLTC) \ + const Size srcSize = get<0>(GetParam()); \ + const int srcType = get<1>(GetParam()); \ + const int dstType = _FLTC(CV_MAT_CN(srcType)); \ + Mat src1(srcSize, srcType), dst(srcSize, dstType); \ + declare.in(src1, dst, WARMUP_RNG).out(dst); - TEST_CYCLE() accumulate(src, dst); +#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \ + PERF_ACCUMULATE_INIT(_FLTC) \ + Mat mask(srcSize, CV_8UC1); \ + declare.in(mask, 
WARMUP_RNG); - SANITY_CHECK_NOTHING(); -} +#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN) \ + PERF_TEST_P(Accumulate, _NAME, \ + testing::Combine( \ + testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD), \ + testing::Values(_TYPES) \ + ) \ + ) \ + { \ + _INIT \ + TEST_CYCLE() _FUN; \ + SANITY_CHECK_NOTHING(); \ + } -#ifdef HAVE_OPENVX -PERF_TEST_P(Size_MatType, AccumulateSquare, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_16SC1, CV_32FC1) - ) -) -#else -PERF_TEST_P( Size_MatType, AccumulateSquare, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_32FC1) - ) - ) -#endif -{ - Size sz = get<0>(GetParam()); - int dstType = get<1>(GetParam()); +/////////////////////////////////// Accumulate /////////////////////////////////// - Mat src(sz, CV_8UC1); - Mat dst(sz, dstType); +PERF_TEST_P_ACCUMULATE(Accumulate, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT(CV_32FC), accumulate(src1, dst)) - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); +PERF_TEST_P_ACCUMULATE(AccumulateMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulate(src1, dst, mask)) - TEST_CYCLE() accumulateSquare(src, dst); +PERF_TEST_P_ACCUMULATE(AccumulateDouble, MAT_TYPES_ACCUMLATE_D, + PERF_ACCUMULATE_INIT(CV_64FC), accumulate(src1, dst)) - SANITY_CHECK_NOTHING(); -} +PERF_TEST_P_ACCUMULATE(AccumulateDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulate(src1, dst, mask)) -#ifdef HAVE_OPENVX -PERF_TEST_P(Size_MatType, AccumulateWeighted, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_8UC1, CV_32FC1) - ) -) -#else -PERF_TEST_P( Size_MatType, AccumulateWeighted, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_32FC1) - ) - ) -#endif -{ - Size sz = get<0>(GetParam()); - int dstType = get<1>(GetParam()); +///////////////////////////// AccumulateSquare /////////////////////////////////// - Mat src(sz, CV_8UC1); - Mat dst(sz, dstType); +PERF_TEST_P_ACCUMULATE(Square, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT(CV_32FC), accumulateSquare(src1, dst)) - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); +PERF_TEST_P_ACCUMULATE(SquareMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateSquare(src1, dst, mask)) - TEST_CYCLE() accumulateWeighted(src, dst, 0.314); +PERF_TEST_P_ACCUMULATE(SquareDouble, MAT_TYPES_ACCUMLATE_D, + PERF_ACCUMULATE_INIT(CV_64FC), accumulateSquare(src1, dst)) - SANITY_CHECK_NOTHING(); -} +PERF_TEST_P_ACCUMULATE(SquareDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateSquare(src1, dst, mask)) + +///////////////////////////// AccumulateProduct /////////////////////////////////// + +#define PERF_ACCUMULATE_INIT_2(_FLTC) \ + PERF_ACCUMULATE_INIT(_FLTC) \ + Mat src2(srcSize, srcType); \ + declare.in(src2); + +#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \ + PERF_ACCUMULATE_MASK_INIT(_FLTC) \ + Mat src2(srcSize, srcType); \ + declare.in(src2); + +PERF_TEST_P_ACCUMULATE(Product, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst)) + +PERF_TEST_P_ACCUMULATE(ProductMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst, mask)) + +PERF_TEST_P_ACCUMULATE(ProductDouble, MAT_TYPES_ACCUMLATE_D, + 
PERF_ACCUMULATE_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst)) + +PERF_TEST_P_ACCUMULATE(ProductDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst, mask)) + +///////////////////////////// AccumulateWeighted /////////////////////////////////// + +PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123)) + +PERF_TEST_P_ACCUMULATE(WeightedMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123, mask)) + +PERF_TEST_P_ACCUMULATE(WeightedDouble, MAT_TYPES_ACCUMLATE_D, + PERF_ACCUMULATE_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456)) + +PERF_TEST_P_ACCUMULATE(WeightedDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456, mask)) } // namespace diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 7a29447497..7bca93de87 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -8,63 +8,43 @@ void acc_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accSqr_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accProd_##suffix(const type* src1, const type* src2, \ acctype* dst, const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accW_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn, double alpha) \ { \ - CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \ + CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \ } #define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \ void acc_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_AVX(acc_avx_##suffix, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accSqr_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_AVX(accSqr_avx_##suffix, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, 
len, cn)); \ + CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accProd_##suffix(const type* src1, const type* src2, \ acctype* dst, const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_AVX(accProd_avx_##suffix, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accW_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn, double alpha) \ { \ - CV_CPU_CALL_AVX(accW_avx_##suffix, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \ + CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \ } #define DECLARATE_ACC_FUNCS(suffix, type, acctype) \ void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \ @@ -114,22 +94,8 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha); void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha); -// accumulate series optimized by AVX -void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn); -void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn); -void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn); -void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn); -void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn); -void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn); -void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn); -void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn); -void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn); -void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha); -void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha); -void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha); - #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY - +// todo: remove AVX branch after support it by universal intrinsics template void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 ) { @@ -171,7 +137,11 @@ void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int } } } - +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); +#elif CV_SIMD + vx_cleanup(); +#endif } template void @@ -215,6 +185,11 @@ accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int } } } +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); +#elif CV_SIMD + vx_cleanup(); +#endif } template void @@ -259,6 +234,11 @@ accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int } } } +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); 
+#elif CV_SIMD + vx_cleanup(); +#endif } template void @@ -303,77 +283,81 @@ accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double } } } +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); +#elif CV_SIMD + vx_cleanup(); +#endif } - -#if CV_SIMD128 - void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_src0, v_src1; + v_uint8 v_src = vx_load(src + x); + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else { - v_uint8x16 v_0 = v_setall_u8(0); + v_uint8 v_0 = vx_setall_u8(0); if (cn == 1) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src = v_load(src + x); + v_uint8 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint16x8 v_src0, v_src1; + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, 
v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_uint32x4 v_src000, v_src001, v_src010, v_src011; - v_uint32x4 v_src100, v_src101, v_src110, v_src111; - v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_uint32 v_src000, v_src001, v_src010, v_src011; + v_uint32 v_src100, v_src101, v_src110, v_src111; + v_uint32 v_src200, v_src201, v_src210, v_src211; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src010, v_src011); v_expand(v_src10, v_src100, v_src101); @@ -381,135 +365,169 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src210, v_src211); - v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011; - v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111; - v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211; + v_float32 v_dst000, v_dst001, v_dst010, v_dst011; + v_float32 v_dst100, v_dst101, v_dst110, v_dst111; + v_float32 v_dst200, v_dst201, v_dst210, v_dst211; v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); - v_load_deinterleave(dst + ((x + 4) * cn), v_dst001, v_dst101, v_dst201); - v_load_deinterleave(dst + ((x + 8) * cn), v_dst010, v_dst110, v_dst210); - v_load_deinterleave(dst + ((x + 12) * cn), v_dst011, v_dst111, v_dst211); + v_load_deinterleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210); + v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211); - v_store_interleave(dst + (x * cn), v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200))); - v_store_interleave(dst + ((x + 4) * cn), v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201))); - v_store_interleave(dst + ((x + 8) * cn), v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210))); - v_store_interleave(dst + ((x + 12) * cn), v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211))); + v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); + v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); + v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); + v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001)); + v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); + v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); + v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); + v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); + v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); + v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); + v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); + v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + + v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); + v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210); + v_store_interleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211); } } } - +#endif // CV_SIMD acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int 
cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); } } else { if (cn == 1) { - v_uint16x8 v_0 = v_setall_u16(0); + v_uint16 v_0 = vx_setall_u16(0); for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load(src + x); + v_uint16 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint32x4 v_src0, v_src1; + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); } } else if (cn == 3) { - v_uint16x8 v_0 = v_setall_u16(0); + v_uint16 v_0 = vx_setall_u16(0); for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_dst10 + v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_dst20 + v_cvt_f32(v_reinterpret_as_s32(v_src20))); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_dst11 + v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_dst21 + v_cvt_f32(v_reinterpret_as_s32(v_src21))); + v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00)); + v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01)); + v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10)); + v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11)); + v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20)); + v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21)); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD acc_general_(src, dst, mask, len, cn, x); } - +// todo: remove AVX branch after support it by universal intrinsics void acc_simd_(const float* src, 
float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_store(dst + x, v_load(dst + x) + v_load(src + x)); - v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4)); + v_store(dst + x, vx_load(dst + x) + vx_load(src + x)); + v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step)); } + #endif // CV_AVX && !CV_AVX2 } else { - v_float32x4 v_0 = v_setzero_f32(); + v_float32 v_0 = vx_setzero_f32(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16 = v_load_expand(mask + x); - v_uint32x4 v_masku320, v_masku321; + v_uint16 v_masku16 = vx_load_expand(mask + x); + v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); - v_store(dst + x, v_load(dst + x) + (v_load(src + x) & v_mask0)); - v_store(dst + x + 4, v_load(dst + x + 4) + (v_load(src + x + 4) & v_mask1)); + v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1)); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16 = v_load_expand(mask + x); - v_uint32x4 v_masku320, v_masku321; + v_uint16 v_masku16 = vx_load_expand(mask + x); + v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); - v_float32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; @@ -517,55 +535,56 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + 
v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); } } } - +#endif // CV_SIMD acc_general_(src, dst, mask, len, cn, x); } -#if CV_SIMD128_64F void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD_64F + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int0, v_int1; + v_uint8 v_src = vx_load(src + x); + v_uint16 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_uint32x4 v_int00, v_int01, v_int10, v_int11; + v_uint32 v_int00, v_int01, v_int10, v_int11; v_expand(v_int0, v_int00, v_int01); v_expand(v_int1, v_int10, v_int11); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - v_float64x2 v_dst4 = v_load(dst + x + 8); - v_float64x2 v_dst5 = v_load(dst + x + 10); - v_float64x2 v_dst6 = v_load(dst + x + 12); - v_float64x2 v_dst7 = v_load(dst + x + 14); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + v_float64 v_dst4 = vx_load(dst + x + step * 4); + v_float64 v_dst5 = vx_load(dst + x + step * 5); + v_float64 v_dst6 = vx_load(dst + x + step * 6); + v_float64 v_dst7 = vx_load(dst + x + step * 7); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -577,50 +596,50 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_dst7 = v_dst7 + v_src7; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - v_store(dst + x + 8, v_dst4); - v_store(dst + x + 10, v_dst5); - v_store(dst + x + 12, v_dst6); - v_store(dst + x + 14, v_dst7); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + v_store(dst + x + step * 4, v_dst4); + v_store(dst + x + step * 5, v_dst5); + v_store(dst + x + step * 6, v_dst6); + v_store(dst + x + step * 7, v_dst7); } } else { - v_uint8x16 v_0 = v_setall_u8(0); + v_uint8 v_0 = vx_setall_u8(0); if (cn == 1) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 
v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_mask == v_0); - v_uint8x16 v_src = v_load(src + x); + v_uint8 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint16x8 v_int0, v_int1; + v_uint16 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_uint32x4 v_int00, v_int01, v_int10, v_int11; + v_uint32 v_int00, v_int01, v_int10, v_int11; v_expand(v_int0, v_int00, v_int01); v_expand(v_int1, v_int10, v_int11); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - v_float64x2 v_dst4 = v_load(dst + x + 8); - v_float64x2 v_dst5 = v_load(dst + x + 10); - v_float64x2 v_dst6 = v_load(dst + x + 12); - v_float64x2 v_dst7 = v_load(dst + x + 14); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + v_float64 v_dst4 = vx_load(dst + x + step * 4); + v_float64 v_dst5 = vx_load(dst + x + step * 5); + v_float64 v_dst6 = vx_load(dst + x + step * 6); + v_float64 v_dst7 = vx_load(dst + x + step * 7); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -632,34 +651,34 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_dst7 = v_dst7 + v_src7; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - v_store(dst + x + 8, v_dst4); - v_store(dst + x + 10, v_dst5); - v_store(dst + x + 12, v_dst6); - v_store(dst + x + 14, v_dst7); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + v_store(dst + x + step * 4, v_dst4); + v_store(dst + x + step * 5, v_dst5); + v_store(dst + x + step * 6, v_dst6); + v_store(dst + x + step * 7, v_dst7); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); 
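// (each masked uchar channel is widened twice in this block, u8 -> u16 -> u32,
// before being converted lane-by-lane to double below)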
v_expand(v_src2, v_src20, v_src21); - v_uint32x4 v_src000, v_src001, v_src010, v_src011; - v_uint32x4 v_src100, v_src101, v_src110, v_src111; - v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_uint32 v_src000, v_src001, v_src010, v_src011; + v_uint32 v_src100, v_src101, v_src110, v_src111; + v_uint32 v_src200, v_src201, v_src210, v_src211; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src010, v_src011); v_expand(v_src10, v_src100, v_src101); @@ -667,9 +686,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src210, v_src211); - v_float64x2 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111; - v_float64x2 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111; - v_float64x2 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111; + v_float64 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111; + v_float64 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111; + v_float64 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111; v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000))); v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000))); v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001))); @@ -695,56 +714,58 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211))); v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211))); - v_float64x2 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111; - v_float64x2 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111; - v_float64x2 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111; + v_float64 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111; + v_float64 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111; + v_float64 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111; v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000); - v_load_deinterleave(dst + ((x + 2) * cn), v_dst0001, v_dst1001, v_dst2001); - v_load_deinterleave(dst + ((x + 4) * cn), v_dst0010, v_dst1010, v_dst2010); - v_load_deinterleave(dst + ((x + 6) * cn), v_dst0011, v_dst1011, v_dst2011); - v_load_deinterleave(dst + ((x + 8) * cn), v_dst0100, v_dst1100, v_dst2100); - v_load_deinterleave(dst + ((x + 10) * cn), v_dst0101, v_dst1101, v_dst2101); - v_load_deinterleave(dst + ((x + 12) * cn), v_dst0110, v_dst1110, v_dst2110); - v_load_deinterleave(dst + ((x + 14) * cn), v_dst0111, v_dst1111, v_dst2111); + v_load_deinterleave(dst + ((x + step) * cn), v_dst0001, v_dst1001, v_dst2001); + v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst0010, v_dst1010, v_dst2010); + v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst0011, v_dst1011, v_dst2011); + v_load_deinterleave(dst + ((x + step * 4) * cn), v_dst0100, v_dst1100, v_dst2100); + v_load_deinterleave(dst + ((x + step * 5) * cn), v_dst0101, v_dst1101, v_dst2101); + v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110); + v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111); v_store_interleave(dst + (x * cn), v_dst0000 
+ v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000); - v_store_interleave(dst + ((x + 2) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); - v_store_interleave(dst + ((x + 4) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); - v_store_interleave(dst + ((x + 6) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); - v_store_interleave(dst + ((x + 8) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); - v_store_interleave(dst + ((x + 10) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); - v_store_interleave(dst + ((x + 12) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); - v_store_interleave(dst + ((x + 14) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); + v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); + v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); + v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); + v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); + v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); + v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); + v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int0, v_int1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -752,34 +773,34 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_dst3 = v_dst3 + v_src3; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else { - 
v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load(src + x); + v_uint16 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint32x4 v_int0, v_int1; + v_uint32 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -787,178 +808,207 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_dst3 = v_dst3 + v_src3; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } if (cn == 3) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); + v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 
v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float32::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0)); + __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src); - v_float64x2 v_src1 = v_cvt_f64_high(v_src); + v_float32 v_src = vx_load(src + x); + v_float64 v_src0 = v_cvt_f64(v_src); + v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, vx_load(dst + x) + v_src0); + v_store(dst + x + step, vx_load(dst + x + step) + v_src1); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint64x2 v_0 = v_setzero_u64(); + v_uint64 v_0 = vx_setzero_u64(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = 
vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src) & v_mask0; - v_float64x2 v_src1 = v_cvt_f64_high(v_src) & v_mask1; + v_float32 v_src = vx_load(src + x); + v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0; + v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1; - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, vx_load(dst + x) + v_src0); + v_store(dst + x + step, vx_load(dst + x + step) + v_src1); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float32x4 v_src0, v_src1, v_src2; + v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_float64x2 v_src00 = v_cvt_f64(v_src0) & v_mask0; - v_float64x2 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; - v_float64x2 v_src10 = v_cvt_f64(v_src1) & v_mask0; - v_float64x2 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; - v_float64x2 v_src20 = v_cvt_f64(v_src2) & v_mask0; - v_float64x2 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; + v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0; + v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; + v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0; + v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; + v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0; + v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 4 ; x += 4) + { + __m256d v_src = _mm256_loadu_pd(src + x); + __m256d v_dst = _mm256_loadu_pd(dst + x); + v_dst = _mm256_add_pd(v_dst, v_src); + _mm256_storeu_pd(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = 
v_load(src + x + 2); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, vx_load(dst + x) + v_src0); + v_store(dst + x + step, vx_load(dst + x + step) + v_src1); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint64x2 v_0 = v_setzero_u64(); + v_uint64 v_0 = vx_setzero_u64(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, v_load(dst + x) + (v_src0 & v_mask0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_src1 & v_mask1)); + v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1)); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; + v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; @@ -966,120 +1016,101 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_float64x2 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; + v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } -#else -void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} - -void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} - 
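// The scalar stubs above and below become redundant with this change: every rewritten
// acc*_simd_ body is now guarded by #if CV_SIMD / CV_SIMD_64F and always falls through
// to the matching *_general_ routine with the count of already-processed elements, so a
// build without the required SIMD support simply runs the scalar loop from x = 0.
// A minimal sketch of that pattern (illustrative only, assuming just the universal
// intrinsics used elsewhere in this patch: vx_load, v_store, v_float32::nlanes):
//
//     void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
//     {
//         int x = 0;
//     #if CV_SIMD
//         if (!mask)                                  // unmasked path only, for brevity
//         {
//             const int step = v_float32::nlanes;     // lane count of the widest enabled target
//             for (; x <= len * cn - step; x += step)
//                 v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
//         }
//     #endif
//         acc_general_(src, dst, mask, len, cn, x);   // scalar code finishes the tail (or everything)
//     }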
-void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} - -void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} -#endif // square accumulate optimized by universal intrinsic void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_src0, v_src1; + v_uint8 v_src = vx_load(src + x); + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_src0 = v_mul_wrap(v_src0, v_src0); + v_src1 = v_mul_wrap(v_src1, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else { - v_uint8x16 v_0 = v_setall_u8(0); + v_uint8 v_0 = vx_setall_u8(0); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src = v_load(src + x); + v_uint8 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint16x8 v_src0, v_src1; + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_src0 = v_mul_wrap(v_src0, v_src0); + v_src1 = v_mul_wrap(v_src1, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, 
v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; + v_src00 = v_mul_wrap(v_src00, v_src00); + v_src01 = v_mul_wrap(v_src01, v_src01); + v_src10 = v_mul_wrap(v_src10, v_src10); + v_src11 = v_mul_wrap(v_src11, v_src11); + v_src20 = v_mul_wrap(v_src20, v_src20); + v_src21 = v_mul_wrap(v_src21, v_src21); - v_uint32x4 v_src000, v_src001, v_src010, v_src011; - v_uint32x4 v_src100, v_src101, v_src110, v_src111; - v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_uint32 v_src000, v_src001, v_src010, v_src011; + v_uint32 v_src100, v_src101, v_src110, v_src111; + v_uint32 v_src200, v_src201, v_src210, v_src211; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src010, v_src011); v_expand(v_src10, v_src100, v_src101); @@ -1087,90 +1118,103 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src210, v_src211); - v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011; - v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111; - v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211; + v_float32 v_dst000, v_dst001, v_dst010, v_dst011; + v_float32 v_dst100, v_dst101, v_dst110, v_dst111; + v_float32 v_dst200, v_dst201, v_dst210, v_dst211; v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200); - v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); - v_load_deinterleave(dst + (x + 8) * cn, v_dst010, v_dst110, v_dst210); - v_load_deinterleave(dst + (x + 12) * cn, v_dst011, v_dst111, v_dst211); + v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211); - v_store_interleave(dst + x * cn, v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200))); - v_store_interleave(dst + (x + 4) * cn, v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201))); - v_store_interleave(dst + (x + 8) * cn, v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210))); - v_store_interleave(dst + (x + 12) * cn, v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211))); + v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); + v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001)); + v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); + v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); + + v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); + v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); + v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); + v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); + + 
v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); + v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); + v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); + v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + + v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); + v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210); + v_store_interleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211); } } } - +#endif // CV_SIMD accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_float32x4 v_float0, v_float1; + v_float32 v_float0, v_float1; v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); - v_float0 = v_float0 * v_float0; - v_float1 = v_float1 * v_float1; - v_store(dst + x, v_load(dst + x) + v_float0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); + v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step))); } } else { - v_uint32x4 v_0 = v_setzero_u32(); + v_uint32 v_0 = vx_setzero_u32(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask0, v_mask1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); v_mask0 = ~(v_mask0 == v_0); v_mask1 = ~(v_mask1 == v_0); - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); v_src0 = v_src0 & v_mask0; v_src1 = v_src1 & v_mask1; - v_float32x4 v_float0, v_float1; + v_float32 v_float0, v_float1; v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); - v_float0 = v_float0 * v_float0; - v_float1 = v_float1 * v_float1; - v_store(dst + x, v_load(dst + x) + v_float0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); + v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step))); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask0, v_mask1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); v_mask0 = ~(v_mask0 == v_0); v_mask1 = ~(v_mask1 == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); @@ -1181,653 +1225,650 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_int20 = v_int20 & v_mask0; v_int21 = v_int21 & v_mask1; - v_float32x4 
v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00)); v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01)); v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10)); v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11)); v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20)); v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21)); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + v_src = _mm256_mul_ps(v_src, v_src); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_float32 v_src0 = vx_load(src + x); + v_float32 v_src1 = vx_load(src + x + step); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint32x4 v_0 = v_setzero_u32(); + v_uint32 v_0 = vx_setzero_u32(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask_0, v_mask_1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_src0 = vx_load(src + x); + v_float32 v_src1 = vx_load(src + x + step); v_src0 = v_src0 & v_mask0; v_src1 = v_src1 & v_mask1; - 
v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask_0, v_mask_1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); - v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; + v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; v_src11 = v_src11 & v_mask1; v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_float32x4 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; + v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD accSqr_general_(src, dst, mask, len, cn, x); } -#if CV_SIMD128_64F + void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_int = v_load_expand(src + x); + v_uint16 v_int = vx_load_expand(src + x); - v_uint32x4 v_int0, v_int1; + v_uint32 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = 
v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else { - v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load_expand(src + x); - v_uint16x8 v_int = v_src & v_mask; + v_uint16 v_src = vx_load_expand(src + x); + v_uint16 v_int = v_src & v_mask; - v_uint32x4 v_int0, v_int1; + v_uint32 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else if (cn == 3) { - for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth) + for (; x <= len - cVectorWidth * 2; x += cVectorWidth) { - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_uint16x8 v_int0, v_int1, v_int2, dummy; - v_expand(v_src0, v_int0, dummy); - v_expand(v_src1, v_int1, dummy); - v_expand(v_src2, v_int2, dummy); - v_uint16x8 v_mask = v_load_expand(mask + x); + + v_uint16 v_int0 = v_expand_low(v_src0); + v_uint16 v_int1 = 
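// --- illustrative sketch (not part of the patch) ---------------------------
// The u8 -> f64 widening chain used in the accSqr double paths above:
// vx_load_expand gives u16, v_expand gives two u32 vectors, and each u32
// vector yields two f64 vectors via v_cvt_f64 / v_cvt_f64_high. Unmasked
// path only; the helper name is hypothetical.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD_64F
using namespace cv;
static void acc_sqr_u8_to_f64_sketch(const uchar* src, double* dst, int len)
{
    int x = 0;
    const int width = v_uint16::nlanes;     // pixels consumed per iteration
    const int step  = v_float64::nlanes;
    for (; x <= len - width; x += width)
    {
        v_uint16 p = vx_load_expand(src + x);
        v_uint32 lo, hi;
        v_expand(p, lo, hi);
        v_float64 d0 = v_cvt_f64(v_reinterpret_as_s32(lo));
        v_float64 d1 = v_cvt_f64_high(v_reinterpret_as_s32(lo));
        v_float64 d2 = v_cvt_f64(v_reinterpret_as_s32(hi));
        v_float64 d3 = v_cvt_f64_high(v_reinterpret_as_s32(hi));
        v_store(dst + x,            v_fma(d0, d0, vx_load(dst + x)));
        v_store(dst + x + step,     v_fma(d1, d1, vx_load(dst + x + step)));
        v_store(dst + x + step * 2, v_fma(d2, d2, vx_load(dst + x + step * 2)));
        v_store(dst + x + step * 3, v_fma(d3, d3, vx_load(dst + x + step * 3)));
    }
    for (; x < len; x++)
        dst[x] += (double)src[x] * src[x];
}
#endif // CV_SIMD_64F
// ---------------------------------------------------------------------------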
v_expand_low(v_src1); + v_uint16 v_int2 = v_expand_low(v_src2); + + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); v_int0 = v_int0 & v_mask; v_int1 = v_int1 & v_mask; v_int2 = v_int2 & v_mask; - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_int0, v_int00, v_int01); v_expand(v_int1, v_int10, v_int11); v_expand(v_int2, v_int20, v_int21); - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src02 = v_src02 * v_src02; - v_src03 = v_src03 * v_src03; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src12 = v_src12 * v_src12; - v_src13 = v_src13 * v_src13; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_src22 = v_src22 * v_src22; - v_src23 = v_src23 * v_src23; + v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + 
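// --- illustrative sketch (not part of the patch) ---------------------------
// v_expand_low, used in the 3-channel u8 path above, widens only the low
// half of a u8 vector to u16 and replaces the old v_expand(src, lo, dummy)
// idiom. Hypothetical helper for illustration.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD
using namespace cv;
static v_uint16 low_half_as_u16(const uchar* p)
{
    v_uint8 pix = vx_load(p);       // full register of u8 pixels
    return v_expand_low(pix);       // low half widened to u16; high half discarded
}
#endif // CV_SIMD
// ---------------------------------------------------------------------------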
v_src13, v_dst23 + v_src23); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst02 = v_fma(v_src02, v_src02, v_dst02); + v_dst03 = v_fma(v_src03, v_src03, v_dst03); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst12 = v_fma(v_src12, v_src12, v_dst12); + v_dst13 = v_fma(v_src13, v_src13, v_dst13); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + v_dst22 = v_fma(v_src22, v_src22, v_dst22); + v_dst23 = v_fma(v_src23, v_src23, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int_0, v_int_1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int_0, v_int_1; v_expand(v_src, v_int_0, v_int_1); - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + v_int32 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_int0); + v_float64 v_src1 = v_cvt_f64_high(v_int0); + v_float64 v_src2 = v_cvt_f64(v_int1); + v_float64 v_src3 = v_cvt_f64_high(v_int1); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else { - v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load(src + x); + v_uint16 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint32x4 v_int_0, v_int_1; + v_uint32 v_int_0, v_int_1; v_expand(v_src, v_int_0, v_int_1); - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + v_int32 v_int0 = 
v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_int0); + v_float64 v_src1 = v_cvt_f64_high(v_int0); + v_float64 v_src2 = v_cvt_f64(v_int1); + v_float64 v_src3 = v_cvt_f64_high(v_int1); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src02 = v_src02 * v_src02; - v_src03 = v_src03 * v_src03; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src12 = v_src12 * v_src12; - v_src13 = v_src13 * v_src13; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_src22 = v_src22 * v_src22; - v_src23 = v_src23 * v_src23; + v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + 
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03; - v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13; - v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23; + v_float64 v_dst00, v_dst01, v_dst02, v_dst03; + v_float64 v_dst10, v_dst11, v_dst12, v_dst13; + v_float64 v_dst20, v_dst21, v_dst22, v_dst23; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2)* cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4)* cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6)* cn, v_dst03, v_dst13, v_dst23); + v_load_deinterleave(dst + (x + step)* cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2)* cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3)* cn, v_dst03, v_dst13, v_dst23); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst02 = v_fma(v_src02, v_src02, v_dst02); + v_dst03 = v_fma(v_src03, v_src03, v_dst03); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst12 = v_fma(v_src12, v_src12, v_dst12); + v_dst13 = v_fma(v_src13, v_src13, v_dst13); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + v_dst22 = v_fma(v_src22, v_src22, v_dst22); + v_dst23 = v_fma(v_src23, v_src23, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float32::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); + __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + v_src0 = _mm256_mul_pd(v_src0, v_src0); + v_src1 = _mm256_mul_pd(v_src1, v_src1); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src); - 
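// --- illustrative sketch (not part of the patch) ---------------------------
// The CV_AVX && !CV_AVX2 fallback above, in raw intrinsics: with AVX but no
// AVX2 the wide universal intrinsics stay at 128 bits, so the float/double
// paths drop to 256-bit _mm256 code directly; _mm256_cvtps_pd widens each
// 128-bit half of the float register to four doubles. Hypothetical helper.
#include "opencv2/core/hal/intrin.hpp"
#if CV_AVX && !CV_AVX2
#include <immintrin.h>
static void acc_sqr_f32_to_f64_avx_sketch(const float* src, double* dst, int size)
{
    int x = 0;
    for (; x <= size - 8; x += 8)
    {
        __m256  s  = _mm256_loadu_ps(src + x);
        __m256d s0 = _mm256_cvtps_pd(_mm256_extractf128_ps(s, 0));   // low 4 floats
        __m256d s1 = _mm256_cvtps_pd(_mm256_extractf128_ps(s, 1));   // high 4 floats
        _mm256_storeu_pd(dst + x,     _mm256_add_pd(_mm256_mul_pd(s0, s0), _mm256_loadu_pd(dst + x)));
        _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(s1, s1), _mm256_loadu_pd(dst + x + 4)));
    }
    for (; x < size; x++)
        dst[x] += (double)src[x] * src[x];
}
#endif // CV_AVX && !CV_AVX2
// ---------------------------------------------------------------------------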
v_float64x2 v_src1 = v_cvt_f64_high(v_src); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_float32 v_src = vx_load(src + x); + v_float64 v_src0 = v_cvt_f64(v_src); + v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint32x4 v_0 = v_setzero_u32(); + v_uint32 v_0 = vx_setzero_u32(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask = v_load_expand_q(mask + x);; + v_uint32 v_mask = vx_load_expand_q(mask + x);; v_mask = ~(v_mask == v_0); - v_float32x4 v_src = v_load(src + x); + v_float32 v_src = vx_load(src + x); v_src = v_src & v_reinterpret_as_f32(v_mask); - v_float64x2 v_src0 = v_cvt_f64(v_src); - v_float64x2 v_src1 = v_cvt_f64_high(v_src); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_float64 v_src0 = v_cvt_f64(v_src); + v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask = v_load_expand_q(mask + x); + v_uint32 v_mask = vx_load_expand_q(mask + x); v_mask = ~(v_mask == v_0); - v_float32x4 v_src0, v_src1, v_src2; + v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_reinterpret_as_f32(v_mask); v_src1 = v_src1 & v_reinterpret_as_f32(v_mask); v_src2 = v_src2 & v_reinterpret_as_f32(v_mask); - v_float64x2 v_src00 = v_cvt_f64(v_src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_src0); - v_float64x2 v_src10 = v_cvt_f64(v_src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_src1); - v_float64x2 v_src20 = v_cvt_f64(v_src2); - v_float64x2 v_src21 = v_cvt_f64_high(v_src2); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; + v_float64 v_src00 = v_cvt_f64(v_src0); + v_float64 v_src01 = v_cvt_f64_high(v_src0); + v_float64 v_src10 = v_cvt_f64(v_src1); + v_float64 v_src11 = v_cvt_f64_high(v_src1); + v_float64 v_src20 = v_cvt_f64(v_src2); + v_float64 v_src21 = v_cvt_f64_high(v_src2); - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, 
len, cn, x); } void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 4 ; x += 4) + { + __m256d v_src = _mm256_loadu_pd(src + x); + __m256d v_dst = _mm256_loadu_pd(dst + x); + v_src = _mm256_mul_pd(v_src, v_src); + v_dst = _mm256_add_pd(v_dst, v_src); + _mm256_storeu_pd(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint64x2 v_0 = v_setzero_u64(); + v_uint64 v_0 = vx_setzero_u64(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); v_src0 = v_src0 & v_mask0; v_src1 = v_src1 & v_mask1; - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; v_src11 = v_src11 & v_mask1; v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_float64x2 v_dst00, v_dst01, 
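// --- illustrative sketch (not part of the patch) ---------------------------
// The 64-bit mask construction in the double accSqr path above: the uchar
// mask is widened to u32, then expanded to two u64 vectors, compared with
// zero and reinterpreted as per-lane f64 masks. cn == 1 only; the helper
// name is hypothetical.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD_64F
using namespace cv;
static void acc_sqr_f64_masked_sketch(const double* src, double* dst, const uchar* mask, int len)
{
    int x = 0;
    const int step = v_float64::nlanes;
    const v_uint64 z = vx_setzero_u64();
    for (; x <= len - step * 2; x += step * 2)
    {
        v_uint32 m32 = vx_load_expand_q(mask + x);         // u8 -> u32 lanes
        v_uint64 m0, m1;
        v_expand(m32, m0, m1);                             // u32 -> u64 lanes
        v_float64 k0 = v_reinterpret_as_f64(~(m0 == z));
        v_float64 k1 = v_reinterpret_as_f64(~(m1 == z));
        v_float64 s0 = vx_load(src + x) & k0;
        v_float64 s1 = vx_load(src + x + step) & k1;
        v_store(dst + x,        v_fma(s0, s0, vx_load(dst + x)));
        v_store(dst + x + step, v_fma(s1, s1, vx_load(dst + x + step)));
    }
    for (; x < len; x++)
        if (mask[x])
            dst[x] += src[x] * src[x];
}
#endif // CV_SIMD_64F
// ---------------------------------------------------------------------------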
v_dst10, v_dst11, v_dst20, v_dst21; + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, len, cn, x); } -#else -void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} - -void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} - -void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} - -void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} -#endif // product accumulate optimized by universal intrinsic void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD + const int cVectorWidth = v_uint8::nlanes; + const int step = v_uint32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); + v_uint8 v_1src = vx_load(src1 + x); + v_uint8 v_2src = vx_load(src2 + x); - v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); + v_uint16 v_src0, v_src1; + v_mul_expand(v_1src, v_2src, v_src0, v_src1); - v_uint16x8 v_src0, v_src1; - v_src0 = v_1src0 * v_2src0; - v_src1 = v_1src1 * v_2src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else { - v_uint8x16 v_0 = v_setzero_u8(); + v_uint8 v_0 = vx_setzero_u8(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_mask == v_0); - 
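// --- illustrative sketch (not part of the patch) ---------------------------
// The v_mul_expand shortcut used in the u8 product-accumulate above: it
// multiplies two u8 vectors and widens the result to u16 in one call,
// replacing the old expand-then-multiply sequence. Unmasked path only;
// the helper name is hypothetical.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD
using namespace cv;
static void acc_prod_u8_sketch(const uchar* a, const uchar* b, float* dst, int len)
{
    int x = 0;
    const int width = v_uint8::nlanes;
    const int step  = v_uint32::nlanes;
    for (; x <= len - width; x += width)
    {
        v_uint16 p0, p1;
        v_mul_expand(vx_load(a + x), vx_load(b + x), p0, p1);   // u8 * u8 -> u16
        v_uint32 q0, q1, q2, q3;
        v_expand(p0, q0, q1);
        v_expand(p1, q2, q3);
        v_store(dst + x,            vx_load(dst + x)            + v_cvt_f32(v_reinterpret_as_s32(q0)));
        v_store(dst + x + step,     vx_load(dst + x + step)     + v_cvt_f32(v_reinterpret_as_s32(q1)));
        v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(q2)));
        v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(q3)));
    }
    for (; x < len; x++)
        dst[x] += (float)a[x] * b[x];
}
#endif // CV_SIMD
// ---------------------------------------------------------------------------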
v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); + v_uint8 v_1src = vx_load(src1 + x); + v_uint8 v_2src = vx_load(src2 + x); v_1src = v_1src & v_mask; v_2src = v_2src & v_mask; - v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); + v_uint16 v_src0, v_src1; + v_mul_expand(v_1src, v_2src, v_src0, v_src1); - v_uint16x8 v_src0, v_src1; - v_src0 = v_1src0 * v_2src0; - v_src1 = v_1src1 * v_2src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_mask == v_0); - v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); v_1src0 = v_1src0 & v_mask; @@ -1837,23 +1878,12 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_2src1 = v_2src1 & v_mask; v_2src2 = v_2src2 & v_mask; - v_uint16x8 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; - v_expand(v_1src0, v_1src00, v_1src01); - v_expand(v_1src1, v_1src10, v_1src11); - v_expand(v_1src2, v_1src20, v_1src21); - v_expand(v_2src0, v_2src00, v_2src01); - v_expand(v_2src1, v_2src10, v_2src11); - v_expand(v_2src2, v_2src20, v_2src21); + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_mul_expand(v_1src0, v_2src0, v_src00, v_src01); + v_mul_expand(v_1src1, v_2src1, v_src10, v_src11); + v_mul_expand(v_1src2, v_2src2, v_src20, v_src21); - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; - v_src00 = v_1src00 * v_2src00; - v_src01 = v_1src01 * v_2src01; - v_src10 = v_1src10 * v_2src10; - v_src11 = v_1src11 * v_2src11; - v_src20 = v_1src20 * v_2src20; - v_src21 = v_1src21 * v_2src21; - - v_uint32x4 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203; + v_uint32 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src002, v_src003); v_expand(v_src10, v_src100, v_src101); @@ -1861,11 +1891,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src202, v_src203); - v_float32x4 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, 
v_dst200, v_dst201, v_dst202, v_dst203; + v_float32 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203; v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200); - v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); - v_load_deinterleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202); - v_load_deinterleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203); + v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203); v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)); v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)); v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002)); @@ -1880,82 +1910,78 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203)); v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); - v_store_interleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); - v_store_interleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202); - v_store_interleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203); + v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202); + v_store_interleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203); } } } - +#endif // CV_SIMD accProd_general_(src1, src2, dst, mask, len, cn, x); } void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); + v_uint16 v_1src = vx_load(src1 + x); + v_uint16 v_2src = vx_load(src2 + x); - v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; + v_uint32 v_1src0, v_1src1, v_2src0, v_2src1; v_expand(v_1src, v_1src0, v_1src1); v_expand(v_2src, v_2src0, v_2src1); - v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); - v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); - v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); - v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); + v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); + v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); + v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); + v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); - v_float32x4 v_src0 = v_1float0 * v_2float0; - v_float32x4 v_src1 = v_1float1 * v_2float1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step))); } } else { - v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_0 == v_mask); - v_uint16x8 v_1src = v_load(src1 + x) & v_mask; - v_uint16x8 v_2src = v_load(src2 + x) & 
v_mask; + v_uint16 v_1src = vx_load(src1 + x) & v_mask; + v_uint16 v_2src = vx_load(src2 + x) & v_mask; - v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; + v_uint32 v_1src0, v_1src1, v_2src0, v_2src1; v_expand(v_1src, v_1src0, v_1src1); v_expand(v_2src, v_2src0, v_2src1); - v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); - v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); - v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); - v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); + v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); + v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); + v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); + v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); - v_float32x4 v_src0 = v_1float0 * v_2float0; - v_float32x4 v_src1 = v_1float1 * v_2float1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_0 == v_mask); - v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); v_1src0 = v_1src0 & v_mask; @@ -1965,7 +1991,7 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch v_2src1 = v_2src1 & v_mask; v_2src2 = v_2src2 & v_mask; - v_uint32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; v_expand(v_1src0, v_1src00, v_1src01); v_expand(v_1src1, v_1src10, v_1src11); v_expand(v_1src2, v_1src20, v_1src21); @@ -1973,1003 +1999,51 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch v_expand(v_2src1, v_2src10, v_2src11); v_expand(v_2src2, v_2src20, v_2src21); - v_float32x4 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00)); - v_float32x4 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01)); - v_float32x4 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10)); - v_float32x4 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11)); - v_float32x4 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20)); - v_float32x4 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21)); - v_float32x4 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00)); - v_float32x4 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01)); - v_float32x4 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10)); - v_float32x4 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11)); - v_float32x4 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20)); - v_float32x4 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21)); + v_float32 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00)); + v_float32 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01)); + v_float32 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10)); + v_float32 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11)); + v_float32 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20)); + v_float32 
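// --- illustrative sketch (not part of the patch) ---------------------------
// The ushort product-accumulate pattern above: both inputs are widened to
// u32, converted to f32, and folded into dst with v_fma instead of a
// separate multiply and add. Unmasked path only; hypothetical helper.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD
using namespace cv;
static void acc_prod_u16_sketch(const ushort* a, const ushort* b, float* dst, int len)
{
    int x = 0;
    const int width = v_uint16::nlanes;
    const int step  = v_float32::nlanes;
    for (; x <= len - width; x += width)
    {
        v_uint32 a0, a1, b0, b1;
        v_expand(vx_load(a + x), a0, a1);
        v_expand(vx_load(b + x), b0, b1);
        v_float32 fa0 = v_cvt_f32(v_reinterpret_as_s32(a0));
        v_float32 fa1 = v_cvt_f32(v_reinterpret_as_s32(a1));
        v_float32 fb0 = v_cvt_f32(v_reinterpret_as_s32(b0));
        v_float32 fb1 = v_cvt_f32(v_reinterpret_as_s32(b1));
        v_store(dst + x,        v_fma(fa0, fb0, vx_load(dst + x)));
        v_store(dst + x + step, v_fma(fa1, fb1, vx_load(dst + x + step)));
    }
    for (; x < len; x++)
        dst[x] += (float)a[x] * b[x];
}
#endif // CV_SIMD
// ---------------------------------------------------------------------------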
v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21)); + v_float32 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00)); + v_float32 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01)); + v_float32 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10)); + v_float32 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11)); + v_float32 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20)); + v_float32 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21)); - v_float32x4 v_src00 = v_1float00 * v_2float00; - v_float32x4 v_src01 = v_1float01 * v_2float01; - v_float32x4 v_src10 = v_1float10 * v_2float10; - v_float32x4 v_src11 = v_1float11 * v_2float11; - v_float32x4 v_src20 = v_1float20 * v_2float20; - v_float32x4 v_src21 = v_1float21 * v_2float21; - - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_1float00, v_2float00, v_dst00); + v_dst01 = v_fma(v_1float01, v_2float01, v_dst01); + v_dst10 = v_fma(v_1float10, v_2float10, v_dst10); + v_dst11 = v_fma(v_1float11, v_2float11, v_dst11); + v_dst20 = v_fma(v_1float20, v_2float20, v_dst20); + v_dst21 = v_fma(v_1float21, v_2float21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD accProd_general_(src1, src2, dst, mask, len, cn, x); } void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x)); - v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4)); - } - } - else - { - v_uint32x4 v_0 = v_setzero_u32(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x); - v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); - - v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0)); - v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1)); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x); - v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); - - v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; - v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; - v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); - v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); - v_load_deinterleave(src1 + (x + 4) 
* cn, v_1src01, v_1src11, v_1src21); - v_load_deinterleave(src2 + (x + 4) * cn, v_2src01, v_2src11, v_2src21); - - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); - - v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} -#if CV_SIMD128_64F -void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_1int = v_load_expand(src1 + x); - v_uint16x8 v_2int = v_load_expand(src2 + x); - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1int, v_1int_0, v_1int_1); - v_expand(v_2int, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else - { - v_uint16x8 v_0 = v_setzero_u16(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask; - v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask; - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1int, v_1int_0, v_1int_1); - v_expand(v_2int, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else if (cn == 3) - { - for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth) - { - 
v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; - v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); - v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - - v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy; - v_expand(v_1src0, v_1int0, dummy); - v_expand(v_1src1, v_1int1, dummy); - v_expand(v_1src2, v_1int2, dummy); - v_expand(v_2src0, v_2int0, dummy); - v_expand(v_2src1, v_2int1, dummy); - v_expand(v_2src2, v_2int2, dummy); - - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_1int0 = v_1int0 & v_mask; - v_1int1 = v_1int1 & v_mask; - v_1int2 = v_1int2 & v_mask; - v_2int0 = v_2int0 & v_mask; - v_2int1 = v_2int1 & v_mask; - v_2int2 = v_2int2 & v_mask; - - v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; - v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; - v_expand(v_1int0, v_1int00, v_1int01); - v_expand(v_1int1, v_1int10, v_1int11); - v_expand(v_1int2, v_1int20, v_1int21); - v_expand(v_2int0, v_2int00, v_2int01); - v_expand(v_2int1, v_2int10, v_2int11); - v_expand(v_2int2, v_2int20, v_2int21); - - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64(v_reinterpret_as_s32(v_2int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64(v_reinterpret_as_s32(v_2int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64(v_reinterpret_as_s32(v_2int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64(v_reinterpret_as_s32(v_2int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64(v_reinterpret_as_s32(v_2int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64(v_reinterpret_as_s32(v_2int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)); - - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} - -void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 
8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1src, v_1int_0, v_1int_1); - v_expand(v_2src, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else - { - v_uint16x8 v_0 = v_setzero_u16(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); - v_1src = v_1src & v_mask; - v_2src = v_2src & v_mask; - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1src, v_1int_0, v_1int_1); - v_expand(v_2src, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; - v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); - v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; - - v_uint32x4 v_1int_00, v_1int_01, v_2int_00, v_2int_01; - v_uint32x4 v_1int_10, v_1int_11, v_2int_10, v_2int_11; - v_uint32x4 v_1int_20, v_1int_21, v_2int_20, v_2int_21; - v_expand(v_1src0, v_1int_00, v_1int_01); - v_expand(v_1src1, v_1int_10, v_1int_11); - v_expand(v_1src2, v_1int_20, v_1int_21); - v_expand(v_2src0, v_2int_00, v_2int_01); - v_expand(v_2src1, v_2int_10, v_2int_11); - 
v_expand(v_2src2, v_2int_20, v_2int_21); - - v_int32x4 v_1int00 = v_reinterpret_as_s32(v_1int_00); - v_int32x4 v_1int01 = v_reinterpret_as_s32(v_1int_01); - v_int32x4 v_1int10 = v_reinterpret_as_s32(v_1int_10); - v_int32x4 v_1int11 = v_reinterpret_as_s32(v_1int_11); - v_int32x4 v_1int20 = v_reinterpret_as_s32(v_1int_20); - v_int32x4 v_1int21 = v_reinterpret_as_s32(v_1int_21); - v_int32x4 v_2int00 = v_reinterpret_as_s32(v_2int_00); - v_int32x4 v_2int01 = v_reinterpret_as_s32(v_2int_01); - v_int32x4 v_2int10 = v_reinterpret_as_s32(v_2int_10); - v_int32x4 v_2int11 = v_reinterpret_as_s32(v_2int_11); - v_int32x4 v_2int20 = v_reinterpret_as_s32(v_2int_20); - v_int32x4 v_2int21 = v_reinterpret_as_s32(v_2int_21); - - v_float64x2 v_src00 = v_cvt_f64(v_1int00) * v_cvt_f64(v_2int00); - v_float64x2 v_src01 = v_cvt_f64_high(v_1int00) * v_cvt_f64_high(v_2int00); - v_float64x2 v_src02 = v_cvt_f64(v_1int01) * v_cvt_f64(v_2int01); - v_float64x2 v_src03 = v_cvt_f64_high(v_1int01) * v_cvt_f64_high(v_2int01); - v_float64x2 v_src10 = v_cvt_f64(v_1int10) * v_cvt_f64(v_2int10); - v_float64x2 v_src11 = v_cvt_f64_high(v_1int10) * v_cvt_f64_high(v_2int10); - v_float64x2 v_src12 = v_cvt_f64(v_1int11) * v_cvt_f64(v_2int11); - v_float64x2 v_src13 = v_cvt_f64_high(v_1int11) * v_cvt_f64_high(v_2int11); - v_float64x2 v_src20 = v_cvt_f64(v_1int20) * v_cvt_f64(v_2int20); - v_float64x2 v_src21 = v_cvt_f64_high(v_1int20) * v_cvt_f64_high(v_2int20); - v_float64x2 v_src22 = v_cvt_f64(v_1int21) * v_cvt_f64(v_2int21); - v_float64x2 v_src23 = v_cvt_f64_high(v_1int21) * v_cvt_f64_high(v_2int21); - - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03; - v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13; - v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} - -void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float32x4 v_1src = v_load(src1 + x); - v_float32x4 v_2src = v_load(src2 + x); - - v_float64x2 v_1src0 = v_cvt_f64(v_1src); - v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); - v_float64x2 v_2src0 = v_cvt_f64(v_2src); - v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); - - v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); - } - } - else - { - v_uint32x4 v_0 = v_setzero_u32(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask = v_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); - v_float32x4 v_1src = v_load(src1 + x); - v_float32x4 v_2src = v_load(src2 + x); - v_1src = v_1src & v_reinterpret_as_f32(v_mask); - v_2src = v_2src & v_reinterpret_as_f32(v_mask); - - v_float64x2 v_1src0 = 
v_cvt_f64(v_1src); - v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); - v_float64x2 v_2src0 = v_cvt_f64(v_2src); - v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); - - v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask = v_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); - v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; - v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); - v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); - v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask); - v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); - v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); - v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); - v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); - - v_float64x2 v_src00 = v_cvt_f64(v_1src0) * v_cvt_f64(v_2src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_1src0) * v_cvt_f64_high(v_2src0); - v_float64x2 v_src10 = v_cvt_f64(v_1src1) * v_cvt_f64(v_2src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_1src1) * v_cvt_f64_high(v_2src1); - v_float64x2 v_src20 = v_cvt_f64(v_1src2) * v_cvt_f64(v_2src2); - v_float64x2 v_src21 = v_cvt_f64_high(v_1src2) * v_cvt_f64_high(v_2src2); - - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} - -void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float64x2 v_src00 = v_load(src1 + x); - v_float64x2 v_src01 = v_load(src1 + x + 2); - v_float64x2 v_src10 = v_load(src2 + x); - v_float64x2 v_src11 = v_load(src2 + x + 2); - - v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11)); - } - } - else - { - v_uint64x2 v_0 = v_setzero_u64(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; - v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - - v_float64x2 v_src00 = v_load(src1 + x); - v_float64x2 v_src01 = v_load(src1 + x + 2); - v_float64x2 v_src10 = v_load(src2 + x); - v_float64x2 v_src11 = v_load(src2 + x + 2); - - v_store(dst + x, v_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); - v_store(dst + x + 2, v_load(dst + x + 2) + ((v_src01 * v_src11) & v_mask1)); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; - v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - - v_float64x2 v_1src00, v_1src01, v_1src10, 
v_1src11, v_1src20, v_1src21; - v_float64x2 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; - v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); - v_load_deinterleave(src1 + (x + 2) * cn, v_1src01, v_1src11, v_1src21); - v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); - v_load_deinterleave(src2 + (x + 2) * cn, v_2src01, v_2src11, v_2src21); - v_float64x2 v_src00 = (v_1src00 & v_mask0) * v_2src00; - v_float64x2 v_src01 = (v_1src01 & v_mask1) * v_2src01; - v_float64x2 v_src10 = (v_1src10 & v_mask0) * v_2src10; - v_float64x2 v_src11 = (v_1src11 & v_mask1) * v_2src11; - v_float64x2 v_src20 = (v_1src20 & v_mask0) * v_2src20; - v_float64x2 v_src21 = (v_1src21 & v_mask1) * v_2src21; - - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} -#else -void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} - -void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} - -void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} - -void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} -#endif - -// running weight accumulate optimized by universal intrinsic -void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float32x4 v_alpha = v_setall_f32((float)alpha); - const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = 16; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint8x16 v_src = v_load(src + x); - - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_float32x4 v_dst00 = v_load(dst + x); - v_float32x4 v_dst01 = v_load(dst + x + 4); - v_float32x4 v_dst10 = v_load(dst + x + 8); - v_float32x4 v_dst11 = v_load(dst + x + 12); - - v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); - v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); - v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); - v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); - - v_store(dst + x, v_dst00); - v_store(dst + x + 4, v_dst01); - v_store(dst + x + 8, v_dst10); - v_store(dst + x + 12, v_dst11); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float32x4 v_alpha = v_setall_f32((float)alpha); - const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = 
8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int0, v_int1; - v_expand(v_src, v_int0, v_int1); - - v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0)); - v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_alpha; - v_src1 = v_src1 * v_alpha; - - v_float32x4 v_dst0 = v_load(dst + x) * v_beta; - v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta; - - v_store(dst + x, v_dst0 + v_src0); - v_store(dst + x + 4, v_dst1 + v_src1); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float32x4 v_alpha = v_setall_f32((float)alpha); - const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha))); - v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha))); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} -#if CV_SIMD128_64F -void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_src16 = v_load_expand(src + x); - - v_uint32x4 v_int_0, v_int_1; - v_expand(v_src16, v_int_0, v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha); - v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha); - v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha); - v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha); - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int_0, v_int_1; - v_expand(v_src, v_int_0, v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src00 = v_cvt_f64(v_int0); - v_float64x2 v_src01 = v_cvt_f64_high(v_int0); - v_float64x2 v_src10 = v_cvt_f64(v_int1); - v_float64x2 v_src11 = v_cvt_f64_high(v_int1); - - v_float64x2 v_dst00 = v_load(dst + x); - v_float64x2 v_dst01 = v_load(dst + x + 2); - v_float64x2 v_dst10 = v_load(dst + x + 4); - v_float64x2 v_dst11 = v_load(dst + x + 6); - - v_dst00 = (v_dst00 * 
v_beta) + (v_src00 * v_alpha); - v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha); - v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha); - v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha); - - v_store(dst + x, v_dst00); - v_store(dst + x + 2, v_dst01); - v_store(dst + x + 4, v_dst10); - v_store(dst + x + 6, v_dst11); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); - v_float64x2 v_src00 = v_cvt_f64(v_src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_src0); - v_float64x2 v_src10 = v_cvt_f64(v_src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_src1); - - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha))); - v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha))); - v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha))); - v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha))); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha))); - v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha))); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} -#else -void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} - -void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} - -void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} - -void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} -#endif // CV_SIMD128_64F -#endif // CV_SIMD128 -#if CV_AVX -// accumulate optimized by AVX -void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - acc_general_(src, dst, mask, len, cn, x); - } - else - { - acc_simd_(src, dst, mask, len, cn); - } -} - -void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + 
x); - __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0)); - __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - acc_general_(src, dst, mask, len, cn, x); - } - else - { - acc_simd_(src, dst, mask, len, cn); - } -} - -void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256d v_src = _mm256_loadu_pd(src + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - v_dst = _mm256_add_pd(v_dst, v_src); - _mm256_storeu_pd(dst + x, v_dst); - } - acc_general_(src, dst, mask, len, cn, x); - } - else - { - acc_simd_(src, dst, mask, len, cn); - } -} - -// square accumulate optimized by avx -void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - v_src = _mm256_mul_ps(v_src, v_src); - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - accSqr_general_(src, dst, mask, len, cn, x); - } - else - { - accSqr_simd_(src, dst, mask, len, cn); - } -} - -void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); - __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - v_src0 = _mm256_mul_pd(v_src0, v_src0); - v_src1 = _mm256_mul_pd(v_src1, v_src1); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - accSqr_general_(src, dst, mask, len, cn, x); - } - else - { - accSqr_simd_(src, dst, mask, len, cn); - } -} - -void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256d v_src = _mm256_loadu_pd(src + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - v_src = _mm256_mul_pd(v_src, v_src); - v_dst = _mm256_add_pd(v_dst, v_src); - _mm256_storeu_pd(dst + x, v_dst); - } - accSqr_general_(src, dst, mask, len, cn, x); - } - else - { - accSqr_simd_(src, dst, mask, len, cn); - } -} - -// product accumulate optimized by avx -void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) { __m256 v_src0 = _mm256_loadu_ps(src1 + x); __m256 v_src1 = _mm256_loadu_ps(src2 + x); @@ -2978,23 +2052,361 @@ void accProd_avx_32f(const float* src1, const float* 
src2, float* dst, const uch v_dst = _mm256_add_ps(v_src, v_dst); _mm256_storeu_ps(dst + x, v_dst); } - accProd_general_(src1, src2, dst, mask, len, cn, x); + #else + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_store(dst + x, v_fma(vx_load(src1 + x), vx_load(src2 + x), vx_load(dst + x))); + v_store(dst + x + step, v_fma(vx_load(src1 + x + step), vx_load(src2 + x + step), vx_load(dst + x + step))); + } + #endif // CV_AVX && !CV_AVX2 } else { - accProd_simd_(src1, src2, dst, mask, len, cn); + v_uint32 v_0 = vx_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); + v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + + v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); + v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + + v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; + v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); + v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); + v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21); + v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21); + + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); + v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); + } + } } +#endif // CV_SIMD + accProd_general_(src1, src2, dst, mask, len, cn, x); } -void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) +void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_1int = vx_load_expand(src1 + x); + v_uint16 v_2int = vx_load_expand(src2 + x); + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1int, v_1int_0, v_1int_1); + v_expand(v_2int, v_2int_0, v_2int_1); + + v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst 
+ x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else + { + v_uint16 v_0 = vx_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask; + v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask; + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1int, v_1int_0, v_1int_1); + v_expand(v_2int, v_2int_0, v_2int_1); + + v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth * 2; x += cVectorWidth) + { + v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + + v_uint16 v_1int0 = v_expand_low(v_1src0); + v_uint16 v_1int1 = v_expand_low(v_1src1); + v_uint16 v_1int2 = v_expand_low(v_1src2); + v_uint16 v_2int0 = v_expand_low(v_2src0); + v_uint16 v_2int1 = v_expand_low(v_2src1); + v_uint16 v_2int2 = v_expand_low(v_2src2); + + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_1int0 = v_1int0 & v_mask; + v_1int1 = v_1int1 & v_mask; + v_1int2 = v_1int2 & v_mask; + v_2int0 = v_2int0 & v_mask; + v_2int1 = v_2int1 & v_mask; + v_2int2 = v_2int2 & v_mask; + + v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; + v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; + v_expand(v_1int0, v_1int00, v_1int01); + v_expand(v_1int1, v_1int10, v_1int11); + v_expand(v_1int2, v_1int20, v_1int21); + v_expand(v_2int0, v_2int00, v_2int01); + v_expand(v_2int1, v_2int10, v_2int11); + v_expand(v_2int2, v_2int20, v_2int21); + + v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + + v_dst00 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int00)), v_cvt_f64(v_reinterpret_as_s32(v_2int00)), v_dst00); + v_dst01 = 
v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)), v_dst01); + v_dst02 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int01)), v_cvt_f64(v_reinterpret_as_s32(v_2int01)), v_dst02); + v_dst03 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)), v_dst03); + v_dst10 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int10)), v_cvt_f64(v_reinterpret_as_s32(v_2int10)), v_dst10); + v_dst11 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)), v_dst11); + v_dst12 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int11)), v_cvt_f64(v_reinterpret_as_s32(v_2int11)), v_dst12); + v_dst13 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)), v_dst13); + v_dst20 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int20)), v_cvt_f64(v_reinterpret_as_s32(v_2int20)), v_dst20); + v_dst21 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)), v_dst21); + v_dst22 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int21)), v_cvt_f64(v_reinterpret_as_s32(v_2int21)), v_dst22); + v_dst23 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)), v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + } + } + } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_1src = vx_load(src1 + x); + v_uint16 v_2src = vx_load(src2 + x); + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1src, v_1int_0, v_1int_1); + v_expand(v_2src, v_2int_0, v_2int_1); + + v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else + { + v_uint16 v_0 = vx_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16 v_1src = vx_load(src1 + x); + v_uint16 v_2src = vx_load(src2 + x); + v_1src = v_1src & v_mask; + v_2src = v_2src & v_mask; + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1src, v_1int_0, v_1int_1); + v_expand(v_2src, v_2int_0, v_2int_1); + + v_int32 v_1int0 = 
v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_mask; + v_1src1 = v_1src1 & v_mask; + v_1src2 = v_1src2 & v_mask; + v_2src0 = v_2src0 & v_mask; + v_2src1 = v_2src1 & v_mask; + v_2src2 = v_2src2 & v_mask; + + v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01; + v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11; + v_uint32 v_1int_20, v_1int_21, v_2int_20, v_2int_21; + v_expand(v_1src0, v_1int_00, v_1int_01); + v_expand(v_1src1, v_1int_10, v_1int_11); + v_expand(v_1src2, v_1int_20, v_1int_21); + v_expand(v_2src0, v_2int_00, v_2int_01); + v_expand(v_2src1, v_2int_10, v_2int_11); + v_expand(v_2src2, v_2int_20, v_2int_21); + + v_int32 v_1int00 = v_reinterpret_as_s32(v_1int_00); + v_int32 v_1int01 = v_reinterpret_as_s32(v_1int_01); + v_int32 v_1int10 = v_reinterpret_as_s32(v_1int_10); + v_int32 v_1int11 = v_reinterpret_as_s32(v_1int_11); + v_int32 v_1int20 = v_reinterpret_as_s32(v_1int_20); + v_int32 v_1int21 = v_reinterpret_as_s32(v_1int_21); + v_int32 v_2int00 = v_reinterpret_as_s32(v_2int_00); + v_int32 v_2int01 = v_reinterpret_as_s32(v_2int_01); + v_int32 v_2int10 = v_reinterpret_as_s32(v_2int_10); + v_int32 v_2int11 = v_reinterpret_as_s32(v_2int_11); + v_int32 v_2int20 = v_reinterpret_as_s32(v_2int_20); + v_int32 v_2int21 = v_reinterpret_as_s32(v_2int_21); + + v_float64 v_dst00, v_dst01, v_dst02, v_dst03; + v_float64 v_dst10, v_dst11, v_dst12, v_dst13; + v_float64 v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + + v_dst00 = v_fma(v_cvt_f64(v_1int00), v_cvt_f64(v_2int00), v_dst00); + v_dst01 = v_fma(v_cvt_f64_high(v_1int00), v_cvt_f64_high(v_2int00), v_dst01); + v_dst02 = v_fma(v_cvt_f64(v_1int01), v_cvt_f64(v_2int01), v_dst02); + v_dst03 = v_fma(v_cvt_f64_high(v_1int01), v_cvt_f64_high(v_2int01), v_dst03); + v_dst10 = v_fma(v_cvt_f64(v_1int10), v_cvt_f64(v_2int10), v_dst10); + v_dst11 = v_fma(v_cvt_f64_high(v_1int10), v_cvt_f64_high(v_2int10), v_dst11); + v_dst12 = v_fma(v_cvt_f64(v_1int11), v_cvt_f64(v_2int11), v_dst12); + v_dst13 = v_fma(v_cvt_f64_high(v_1int11), v_cvt_f64_high(v_2int11), v_dst13); + v_dst20 = v_fma(v_cvt_f64(v_1int20), v_cvt_f64(v_2int20), v_dst20); + v_dst21 = 
v_fma(v_cvt_f64_high(v_1int20), v_cvt_f64_high(v_2int20), v_dst21); + v_dst22 = v_fma(v_cvt_f64(v_1int21), v_cvt_f64(v_2int21), v_dst22); + v_dst23 = v_fma(v_cvt_f64_high(v_1int21), v_cvt_f64_high(v_2int21), v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + } + } + } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; +#if CV_SIMD_64F + const int cVectorWidth = v_float32::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 8 ; x += 8) { __m256 v_1src = _mm256_loadu_ps(src1 + x); __m256 v_2src = _mm256_loadu_ps(src2 + x); @@ -3011,23 +2423,93 @@ void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const _mm256_storeu_pd(dst + x, v_dst0); _mm256_storeu_pd(dst + x + 4, v_dst1); } - accProd_general_(src1, src2, dst, mask, len, cn, x); + #else + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32 v_1src = vx_load(src1 + x); + v_float32 v_2src = vx_load(src2 + x); + + v_float64 v_1src0 = v_cvt_f64(v_1src); + v_float64 v_1src1 = v_cvt_f64_high(v_1src); + v_float64 v_2src0 = v_cvt_f64(v_2src); + v_float64 v_2src1 = v_cvt_f64_high(v_2src); + + v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step))); + } + #endif // CV_AVX && !CV_AVX2 } else { - accProd_simd_(src1, src2, dst, mask, len, cn); + v_uint32 v_0 = vx_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask = vx_load_expand_q(mask + x); + v_mask = ~(v_mask == v_0); + v_float32 v_1src = vx_load(src1 + x); + v_float32 v_2src = vx_load(src2 + x); + v_1src = v_1src & v_reinterpret_as_f32(v_mask); + v_2src = v_2src & v_reinterpret_as_f32(v_mask); + + v_float64 v_1src0 = v_cvt_f64(v_1src); + v_float64 v_1src1 = v_cvt_f64_high(v_1src); + v_float64 v_2src0 = v_cvt_f64(v_2src); + v_float64 v_2src1 = v_cvt_f64_high(v_2src); + + v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step))); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask = vx_load_expand_q(mask + x); + v_mask = ~(v_mask == v_0); + v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); + v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask); + v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); + v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); + v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); + v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); + + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_dst00 = v_fma(v_cvt_f64(v_1src0), v_cvt_f64(v_2src0), v_dst00); + v_dst01 = v_fma(v_cvt_f64_high(v_1src0), v_cvt_f64_high(v_2src0), v_dst01); + v_dst10 = 
v_fma(v_cvt_f64(v_1src1), v_cvt_f64(v_2src1), v_dst10); + v_dst11 = v_fma(v_cvt_f64_high(v_1src1), v_cvt_f64_high(v_2src1), v_dst11); + v_dst20 = v_fma(v_cvt_f64(v_1src2), v_cvt_f64(v_2src2), v_dst20); + v_dst21 = v_fma(v_cvt_f64_high(v_1src2), v_cvt_f64_high(v_2src2), v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + } + } } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); } -void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) +void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 4 ; x += 4) { __m256d v_src0 = _mm256_loadu_pd(src1 + x); __m256d v_src1 = _mm256_loadu_pd(src2 + x); @@ -3036,18 +2518,157 @@ void accProd_avx_64f(const double* src1, const double* src2, double* dst, const v_dst = _mm256_add_pd(v_dst, v_src0); _mm256_storeu_pd(dst + x, v_dst); } - accProd_general_(src1, src2, dst, mask, len, cn, x); + #else + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64 v_src00 = vx_load(src1 + x); + v_float64 v_src01 = vx_load(src1 + x + step); + v_float64 v_src10 = vx_load(src2 + x); + v_float64 v_src11 = vx_load(src2 + x + step); + + v_store(dst + x, v_fma(v_src00, v_src10, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src01, v_src11, vx_load(dst + x + step))); + } + #endif } else { - accProd_simd_(src1, src2, dst, mask, len, cn); + // todo: try fma + v_uint64 v_0 = vx_setzero_u64(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; + v_expand(v_mask32, v_masku640, v_masku641); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64 v_src00 = vx_load(src1 + x); + v_float64 v_src01 = vx_load(src1 + x + step); + v_float64 v_src10 = vx_load(src2 + x); + v_float64 v_src11 = vx_load(src2 + x + step); + + v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; + v_expand(v_mask32, v_masku640, v_masku641); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; + v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); + v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21); + v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); + v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21); + v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00; + v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01; + v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10; + v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11; + 
v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20; + v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21; + + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); } -// running weight accumulate optimized by avx -void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) +// running weight accumulate optimized by universal intrinsic +void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; +#if CV_SIMD + const v_float32 v_alpha = vx_setall_f32((float)alpha); + const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float32::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8 v_src = vx_load(src + x); + + v_uint16 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_uint32 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_float32 v_dst00 = vx_load(dst + x); + v_float32 v_dst01 = vx_load(dst + x + step); + v_float32 v_dst10 = vx_load(dst + x + step * 2); + v_float32 v_dst11 = vx_load(dst + x + step * 3); + + v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); + v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); + v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); + v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + step, v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } + } +#endif // CV_SIMD + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_SIMD + const v_float32 v_alpha = vx_setall_f32((float)alpha); + const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_float32 v_dst0 = vx_load(dst + x); + v_float32 v_dst1 = vx_load(dst + x + step); + v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + } + } +#endif // CV_SIMD + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_AVX && !CV_AVX2 const __m256 v_alpha = _mm256_set1_ps((float)alpha); const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha)); const int cVectorWidth = 16; @@ -3060,18 +2681,129 @@ void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, 
int _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha))); _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha))); } - accW_general_(src, dst, mask, len, cn, alpha, x); - } - else - { - accW_simd_(src, dst, mask, len, cn, alpha); } +#elif CV_SIMD + const v_float32 v_alpha = vx_setall_f32((float)alpha); + const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32 v_dst0 = vx_load(dst + x); + v_float32 v_dst1 = vx_load(dst + x + step); + + v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + } + } +#endif // CV_SIMD + accW_general_(src, dst, mask, len, cn, alpha, x); } -void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) +void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; +#if CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src16 = vx_load_expand(src + x); + + v_uint32 v_int_0, v_int_1; + v_expand(v_src16, v_int_0, v_int_1); + + v_int32 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64 v_src0 = v_cvt_f64(v_int0); + v_float64 v_src1 = v_cvt_f64_high(v_int0); + v_float64 v_src2 = v_cvt_f64(v_int1); + v_float64 v_src3 = v_cvt_f64_high(v_int1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); + v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha); + v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int_0, v_int_1; + v_expand(v_src, v_int_0, v_int_1); + + v_int32 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64 v_src00 = v_cvt_f64(v_int0); + v_float64 v_src01 = v_cvt_f64_high(v_int0); + v_float64 v_src10 = v_cvt_f64(v_int1); + v_float64 v_src11 = v_cvt_f64_high(v_int1); + + v_float64 v_dst00 = vx_load(dst + x); + v_float64 v_dst01 = vx_load(dst + x + step); + v_float64 v_dst10 = 
vx_load(dst + x + step * 2); + v_float64 v_dst11 = vx_load(dst + x + step * 3); + + v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); + v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); + v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); + v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + step, v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } + } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_AVX && !CV_AVX2 const __m256d v_alpha = _mm256_set1_pd(alpha); const __m256d v_beta = _mm256_set1_pd(1.0f - alpha); const int cVectorWidth = 16; @@ -3093,17 +2825,49 @@ void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha))); _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha))); } - accW_general_(src, dst, mask, len, cn, alpha, x); } - else +#elif CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_float32::nlanes * 2; + const int step = v_float64::nlanes; + + if (!mask) { - accW_simd_(src, dst, mask, len, cn, alpha); + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32 v_src0 = vx_load(src + x); + v_float32 v_src1 = vx_load(src + x + v_float32::nlanes); + v_float64 v_src00 = v_cvt_f64(v_src0); + v_float64 v_src01 = v_cvt_f64_high(v_src0); + v_float64 v_src10 = v_cvt_f64(v_src1); + v_float64 v_src11 = v_cvt_f64_high(v_src1); + + v_float64 v_dst00 = vx_load(dst + x); + v_float64 v_dst01 = vx_load(dst + x + step); + v_float64 v_dst10 = vx_load(dst + x + step * 2); + v_float64 v_dst11 = vx_load(dst + x + step * 3); + + v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); + v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); + v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); + v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + step, v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); } -void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) +void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; +#if CV_AVX && !CV_AVX2 const __m256d v_alpha = _mm256_set1_pd(alpha); const __m256d v_beta = _mm256_set1_pd(1.0f - alpha); const int cVectorWidth = 8; @@ -3119,14 +2883,35 @@ void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, in _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha))); _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha))); } - accW_general_(src, dst, mask, len, cn, alpha, x); } - else +#elif CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; + + if (!mask) { - accW_simd_(src, dst, mask, 
len, cn, alpha); + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + + v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + } } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); } -#endif + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 81cc548b40..f327d9f067 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1825,7 +1825,7 @@ void hlineSmooth1N(const uint8_t* src, int cn, const ufi const int VECSZ = v_uint16::nlanes; v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)); for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_mul*vx_load_expand(src + i)); + v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i))); #endif for (; i < lencn; i++) dst[i] = m[0] * src[i]; @@ -1915,7 +1915,9 @@ void hlineSmooth3N(const uint8_t* src, int cn, const ufi v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, vx_load_expand(src - cn) * v_mul0 + vx_load_expand(src) * v_mul1 + vx_load_expand(src + cn) * v_mul2); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) + + v_mul_wrap(vx_load_expand(src), v_mul1) + + v_mul_wrap(vx_load_expand(src + cn), v_mul2)); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; @@ -2089,7 +2091,8 @@ void hlineSmooth3Naba(const uint8_t* src, int cn, const v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul0 + vx_load_expand(src) * v_mul1); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) + + v_mul_wrap(vx_load_expand(src), v_mul1)); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])); @@ -2285,7 +2288,11 @@ void hlineSmooth5N(const uint8_t* src, int cn, const ufi v_uint16 v_mul3 = vx_setall_u16(_m[3]); v_uint16 v_mul4 = vx_setall_u16(_m[4]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, vx_load_expand(src - 2 * cn) * v_mul0 + vx_load_expand(src - cn) * v_mul1 + vx_load_expand(src) * v_mul2 + vx_load_expand(src + cn) * v_mul3 + vx_load_expand(src + 2 * cn) * v_mul4); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) + + v_mul_wrap(vx_load_expand(src - cn), v_mul1) + + v_mul_wrap(vx_load_expand(src), v_mul2) + + v_mul_wrap(vx_load_expand(src + cn), v_mul3) + + v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4)); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; @@ -2488,7 +2495,7 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons const int VECSZ = v_uint16::nlanes; v_uint16 v_6 = vx_setall_u16(6); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - 
v_store((uint16_t*)dst, (vx_load_expand(src) * v_6 + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); + v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4; @@ -2689,7 +2696,9 @@ void hlineSmooth5Nabcba(const uint8_t* src, int cn, cons v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) * v_mul0 + (vx_load_expand(src - cn) + vx_load_expand(src + cn))* v_mul1 + vx_load_expand(src) * v_mul2); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) + + v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) + + v_mul_wrap(vx_load_expand(src), v_mul2)); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0]; @@ -2804,9 +2813,9 @@ void hlineSmooth(const uint8_t* src, int cn, const ufixe const int VECSZ = v_uint16::nlanes; for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ) { - v_uint16 v_res0 = vx_load_expand(src) * vx_setall_u16(*((uint16_t*)m)); + v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); for (int j = 1; j < n; j++) - v_res0 += vx_load_expand(src + j * cn) * vx_setall_u16(*((uint16_t*)(m + j))); + v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j)))); v_store((uint16_t*)dst, v_res0); } #endif @@ -2923,9 +2932,9 @@ void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, co const int VECSZ = v_uint16::nlanes; for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { - v_uint16 v_res0 = vx_load_expand(src + pre_shift * cn) * vx_setall_u16(*((uint16_t*)(m + pre_shift))); + v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); for (int j = 0; j < pre_shift; j ++) - v_res0 += (vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn)) * vx_setall_u16(*((uint16_t*)(m + j))); + v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j)))); v_store((uint16_t*)dst, v_res0); } #endif diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 40026cd3c1..647f5e304b 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -93,7 +93,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -131,7 +131,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s4 = v_load(trow1 + x + cn); v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10); + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); 
            v_store_interleave((drow + x*2), t0, t1);
        }
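
For reference, a minimal self-contained sketch of the width-agnostic pattern the new accW_simd_ bodies follow: dst = dst*(1 - alpha) + src*alpha, processed v_float32::nlanes elements per iteration with v_fma(a, b, c) = a*b + c and a scalar tail for the remainder. The function name accW_sketch and its unmasked single-channel signature are illustrative assumptions, not code from this patch:

    #include "opencv2/core/hal/intrin.hpp"

    // Illustrative only: running-weight accumulate for one float channel,
    // written against the wide universal intrinsics used throughout the patch.
    static void accW_sketch(const float* src, float* dst, int len, float alpha)
    {
        using namespace cv;
        int x = 0;
    #if CV_SIMD
        const v_float32 v_alpha = vx_setall_f32(alpha);
        const v_float32 v_beta  = vx_setall_f32(1.0f - alpha);
        const int step = v_float32::nlanes;   // 4 for SSE2/NEON, 8 for AVX2
        for (; x <= len - step; x += step)
        {
            // v_fma(a, b, c) == a*b + c, fused where the target supports it
            v_float32 v_dst = v_fma(vx_load(dst + x), v_beta, vx_load(src + x) * v_alpha);
            v_store(dst + x, v_dst);
        }
    #endif
        for (; x < len; x++)                  // scalar tail, as in the accW_general_ fallbacks
            dst[x] = dst[x] * (1.0f - alpha) + src[x] * alpha;
    }

Because the lane count comes from v_float32::nlanes rather than a hard-coded 4 or 8, the same body compiles to SSE2, AVX2 or NEON depending on how intrin.hpp resolves CV_SIMD at build time, which is the point of replacing the fixed-width v_float32x4 / _mm256 variants above.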