diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 6ab4ccb36c..ef74176f33 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_FP16 #endif +#if CV_SSE2 || CV_NEON || CV_VSX +#define CV__SIMD_FORWARD 128 +#include "opencv2/core/hal/intrin_forward.hpp" +#endif + #if CV_SSE2 +#include "opencv2/core/hal/intrin_sse_em.hpp" #include "opencv2/core/hal/intrin_sse.hpp" #elif CV_NEON @@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; // (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load()) #if CV_AVX2 +#define CV__SIMD_FORWARD 256 +#include "opencv2/core/hal/intrin_forward.hpp" #include "opencv2/core/hal/intrin_avx.hpp" #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index a38c25e385..0cf36cf174 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v) inline __m128d _v256_extract_low(const __m256d& v) { return _mm256_castpd256_pd128(v); } +inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) +{ + const __m256i m = _mm256_set1_epi32(65535); + __m256i am = _mm256_min_epu32(a, m); + __m256i bm = _mm256_min_epu32(b, m); + return _mm256_packus_epi32(am, bm); +} + ///////// Types //////////// struct v_uint8x32 @@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16) OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16) OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) @@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) +// saturating multiply 8-bit, 16-bit +inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +{ + v_uint16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); +} +inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +{ + v_int16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i pl = _mm256_mullo_epi16(a.val, b.val); + __m256i ph = _mm256_mulhi_epu16(a.val, b.val); + __m256i p0 = _mm256_unpacklo_epi16(pl, ph); + __m256i p1 = _mm256_unpackhi_epi16(pl, ph); + return v_uint16x16(_v256_packs_epu32(p0, p1)); +} +inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +{ + __m256i pl = _mm256_mullo_epi16(a.val, b.val); + __m256i ph = _mm256_mulhi_epi16(a.val, b.val); + __m256i p0 = _mm256_unpacklo_epi16(pl, ph); + __m256i p1 = _mm256_unpackhi_epi16(pl, ph); + return v_int16x16(_mm256_packs_epi32(p0, p1)); +} +inline v_uint8x32& operator *= (v_uint8x32& 
a, const v_uint8x32& b) +{ a = a * b; return a; } +inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) +{ a = a * b; return a; } +inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) +{ a = a * b; return a; } +inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) +{ a = a * b; return a; } + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16) + +inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i ad = _mm256_srai_epi16(a.val, 8); + __m256i bd = _mm256_srai_epi16(b.val, 8); + __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even + __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd + + const __m256i b01 = _mm256_set1_epi32(0xFF00FF00); + return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01)); +} +inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b, + v_uint16x16& c, v_uint16x16& d) +{ + v_uint16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b, + v_int16x16& c, v_int16x16& d) +{ + v_int16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, v_int32x8& c, v_int32x8& d) { v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); v_int16x16 v0, v1; - v_zip(a * b, vhi, v0, v1); + v_zip(v_mul_wrap(a, b), vhi, v0, v1); c = v_reinterpret_as_s32(v0); d = v_reinterpret_as_s32(v1); @@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); v_uint16x16 v0, v1; - v_zip(a * b, vhi, v0, v1); + v_zip(v_mul_wrap(a, b), vhi, v0, v1); c = v_reinterpret_as_u32(v0); d = v_reinterpret_as_u32(v1); @@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); } inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); } -/** Non-saturating arithmetics **/ -#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ - inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } - -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) 
-OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) -OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) - /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ @@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca b0.val = intrin(_v256_extract_low(a.val)); \ b1.val = intrin(_v256_extract_high(a.val)); \ } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \ inline _Tpwvec v256_load_expand(const _Tp* ptr) \ { \ __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ @@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } inline void v_pack_store(uchar* ptr, const v_uint16x16& a) -{ v_store_low(ptr, v_pack(a, a)); } +{ + const __m256i m = _mm256_set1_epi16(255); + __m256i am = _mm256_min_epu16(a.val, m); + am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am)); + v_store_low(ptr, v_uint8x32(am)); +} inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack_u(a, a)); } @@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); } inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) -{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } +{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); } inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) -{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); } +{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } inline void v_pack_store(short* ptr, const v_int32x8& a) { v_store_low(ptr, v_pack(a, a)); } inline void v_pack_store(ushort* ptr, const v_uint32x8& a) -{ v_store_low(ptr, v_pack(a, a)); } +{ + const __m256i m = _mm256_set1_epi32(65535); + __m256i am = _mm256_min_epu32(a.val, m); + am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am)); + v_store_low(ptr, v_uint16x16(am)); +} inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) { v_store_low(ptr, v_pack_u(a, a)); } diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 64a457a530..38a39172d0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -108,7 +108,7 @@ block and to save contents of the register to memory block. These operations allow to reorder or recombine elements in one or multiple vectors. 
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave -- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand +- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high @@ -185,11 +185,14 @@ Regular integers: |load, store | x | x | x | x | x | x | |interleave | x | x | x | x | x | x | |expand | x | x | x | x | x | x | +|expand_low | x | x | x | x | x | x | +|expand_high | x | x | x | x | x | x | |expand_q | x | x | | | | | |add, sub | x | x | x | x | x | x | |add_wrap, sub_wrap | x | x | x | x | | | -|mul | | | x | x | x | x | -|mul_expand | | | x | x | x | | +|mul_wrap | x | x | x | x | | | +|mul | x | x | x | x | x | x | +|mul_expand | x | x | x | x | x | | |compare | x | x | x | x | x | x | |shift | | | x | x | x | x | |dotprod | | | | x | | | @@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=) //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ +#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \ template \ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ @@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ /** @brief Add values without saturation For 8- and 16-bit integer values. */ -OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) +OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp) /** @brief Subtract values without saturation For 8- and 16-bit integer values. */ -OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) +OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp) + +/** @brief Multiply values without saturation + +For 8- and 16-bit integer values. */ +OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp) //! @cond IGNORED template inline T _absdiff(T a, T b) @@ -1106,6 +1114,44 @@ template inline void v_expand(const v_reg<_Tp, n>& a, } } +/** @brief Expand lower values to the wider pack type + +Same as cv::v_expand, but return lower half of the vector. + +Scheme: +@code + int32x4 int64x2 +{A B C D} ==> {A B} +@endcode */ +template +inline v_reg::w_type, n/2> +v_expand_low(const v_reg<_Tp, n>& a) +{ + v_reg::w_type, n/2> b; + for( int i = 0; i < (n/2); i++ ) + b.s[i] = a.s[i]; + return b; +} + +/** @brief Expand higher values to the wider pack type + +Same as cv::v_expand_low, but expand higher half of the vector instead. + +Scheme: +@code + int32x4 int64x2 +{A B C D} ==> {C D} +@endcode */ +template +inline v_reg::w_type, n/2> +v_expand_high(const v_reg<_Tp, n>& a) +{ + v_reg::w_type, n/2> b; + for( int i = 0; i < (n/2); i++ ) + b.s[i] = a.s[i+(n/2)]; + return b; +} + //! @cond IGNORED template inline v_reg::int_type, n> v_reinterpret_as_int(const v_reg<_Tp, n>& a) diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp new file mode 100644 index 0000000000..4618552907 --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -0,0 +1,158 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef CV__SIMD_FORWARD +#error "Need to pre-define forward width" +#endif + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +/** Types **/ +#if CV__SIMD_FORWARD == 512 +// [todo] 512 +#error "AVX512 Not implemented yet" +#elif CV__SIMD_FORWARD == 256 +// 256 +#define __CV_VX(fun) v256_##fun +#define __CV_V_UINT8 v_uint8x32 +#define __CV_V_INT8 v_int8x32 +#define __CV_V_UINT16 v_uint16x16 +#define __CV_V_INT16 v_int16x16 +#define __CV_V_UINT32 v_uint32x8 +#define __CV_V_INT32 v_int32x8 +#define __CV_V_UINT64 v_uint64x4 +#define __CV_V_INT64 v_int64x4 +#define __CV_V_FLOAT32 v_float32x8 +#define __CV_V_FLOAT64 v_float64x4 +struct v_uint8x32; +struct v_int8x32; +struct v_uint16x16; +struct v_int16x16; +struct v_uint32x8; +struct v_int32x8; +struct v_uint64x4; +struct v_int64x4; +struct v_float32x8; +struct v_float64x4; +#else +// 128 +#define __CV_VX(fun) v_##fun +#define __CV_V_UINT8 v_uint8x16 +#define __CV_V_INT8 v_int8x16 +#define __CV_V_UINT16 v_uint16x8 +#define __CV_V_INT16 v_int16x8 +#define __CV_V_UINT32 v_uint32x4 +#define __CV_V_INT32 v_int32x4 +#define __CV_V_UINT64 v_uint64x2 +#define __CV_V_INT64 v_int64x2 +#define __CV_V_FLOAT32 v_float32x4 +#define __CV_V_FLOAT64 v_float64x2 +struct v_uint8x16; +struct v_int8x16; +struct v_uint16x8; +struct v_int16x8; +struct v_uint32x4; +struct v_int32x4; +struct v_uint64x2; +struct v_int64x2; +struct v_float32x4; +struct v_float64x2; +#endif + +/** Value reordering **/ + +// Expansion +void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&); +void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&); +void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&); +void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&); +void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&); +void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&); +// Low Expansion +__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&); +__CV_V_INT16 v_expand_low(const __CV_V_INT8&); +__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&); +__CV_V_INT32 v_expand_low(const __CV_V_INT16&); +__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&); +__CV_V_INT64 v_expand_low(const __CV_V_INT32&); +// High Expansion +__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&); +__CV_V_INT16 v_expand_high(const __CV_V_INT8&); +__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&); +__CV_V_INT32 v_expand_high(const __CV_V_INT16&); +__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&); +__CV_V_INT64 v_expand_high(const __CV_V_INT32&); +// Load & Low Expansion +__CV_V_UINT16 __CV_VX(load_expand)(const uchar*); +__CV_V_INT16 __CV_VX(load_expand)(const schar*); +__CV_V_UINT32 __CV_VX(load_expand)(const ushort*); +__CV_V_INT32 __CV_VX(load_expand)(const short*); +__CV_V_UINT64 __CV_VX(load_expand)(const uint*); +__CV_V_INT64 __CV_VX(load_expand)(const int*); +// Load lower 8-bit and expand into 32-bit +__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*); +__CV_V_INT32 __CV_VX(load_expand_q)(const schar*); + +// Saturating Pack +__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&); +__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&); +__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&); +__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&); +// Non-saturating Pack +__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&); 
+__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&); +// Pack signed integers with unsigned saturation +__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&); +__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&); + +/** Arithmetic, bitwise and comparison operations **/ + +// Non-saturating multiply +#if CV_VSX +template +Tvec v_mul_wrap(const Tvec& a, const Tvec& b); +#else +__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&); +__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&); +__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&); +__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&); +#endif + +// Multiply and expand +#if CV_VSX +template +void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d); +#else +void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&); +void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&); +void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&); +void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&); +void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&); +void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&); +#endif + +/** Cleanup **/ +#undef CV__SIMD_FORWARD +#undef __CV_VX +#undef __CV_V_UINT8 +#undef __CV_V_INT8 +#undef __CV_V_UINT16 +#undef __CV_V_INT16 +#undef __CV_V_UINT32 +#undef __CV_V_INT32 +#undef __CV_V_UINT64 +#undef __CV_V_INT64 +#undef __CV_V_FLOAT32 +#undef __CV_V_FLOAT64 + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: \ No newline at end of file diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index d87b4e2ba0..8c13ad52db 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -435,10 +435,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) @@ -476,6 +474,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) } #endif +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4) + +// Multiply and expand +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); + d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); +} + 
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val)); + d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); +} + inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d) { @@ -714,6 +743,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16) // TODO: absdiff for signed integers OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) @@ -1056,6 +1089,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \ b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \ } \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \ +} \ inline _Tpwvec v_load_expand(const _Tp* ptr) \ { \ return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 18bdf46f90..d4740b72fe 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -59,6 +59,8 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +///////// Types //////////// + struct v_uint8x16 { typedef uchar lane_type; @@ -436,13 +438,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) } inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); - __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); - __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32); - __m128i r = _mm_packs_epi32(a1, b1); - return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); -} +{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); } inline void v_pack_store(ushort* ptr, const v_uint32x4& a) { @@ -678,14 +674,14 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, 
_mm_sub_ps) OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) @@ -699,35 +695,49 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) -inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4) +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4) + +inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) { - __m128i c0 = _mm_mul_epu32(a.val, b.val); - __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); - __m128i d0 = _mm_unpacklo_epi32(c0, c1); - __m128i d1 = _mm_unpackhi_epi32(c0, c1); - return v_uint32x4(_mm_unpacklo_epi64(d0, d1)); + v_uint16x8 c, d; + v_mul_expand(a, b, c, d); + return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); } -inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) +inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) +{ a = a * b; return a; } + +// Multiply and expand +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) { -#if CV_SSE4_1 - return v_int32x4(_mm_mullo_epi32(a.val, b.val)); -#else - __m128i c0 = _mm_mul_epu32(a.val, b.val); - __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); - __m128i d0 = _mm_unpacklo_epi32(c0, c1); - __m128i d1 = _mm_unpackhi_epi32(c0, c1); - return v_int32x4(_mm_unpacklo_epi64(d0, d1)); -#endif + v_uint16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); } -inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) + +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) { - a = a * b; - return a; -} -inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b) -{ - a = a * b; - return a; + v_int16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); } inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, @@ -1018,6 +1028,22 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16) +OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16) + +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i ad = _mm_srai_epi16(a.val, 8); + __m128i bd = _mm_srai_epi16(b.val, 8); + __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even + __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd + const __m128i b01 = _mm_set1_epi32(0xFF00FF00); + return v_uint8x16(_v128_blendv_epi8(p0, p1, b01)); +} +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} #define 
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ @@ -1502,70 +1528,39 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) #endif -#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \ -inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \ -{ \ - __m128i z = _mm_setzero_si128(); \ - b0.val = _mm_unpacklo_##suffix(a.val, z); \ - b1.val = _mm_unpackhi_##suffix(a.val, z); \ -} \ -inline _Tpwuvec v_load_expand(const _Tpu* ptr) \ -{ \ - __m128i z = _mm_setzero_si128(); \ - return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \ -} \ -inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \ -{ \ - b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \ - b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \ -} \ -inline _Tpwsvec v_load_expand(const _Tps* ptr) \ -{ \ - __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ - return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \ -} +/* Expand */ +#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(a.val); \ + b1.val = __CV_CAT(intrin, _high)(a.val); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(a.val)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \ + inline _Tpwvec v_load_expand(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } -OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) -OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16) +OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32) +OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32) +OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64) +OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64) -inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1) -{ - __m128i z = _mm_setzero_si128(); - b0.val = _mm_unpacklo_epi32(a.val, z); - b1.val = _mm_unpackhi_epi32(a.val, z); -} -inline v_uint64x2 v_load_expand(const unsigned* ptr) -{ - __m128i z = _mm_setzero_si128(); - return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z)); -} -inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1) -{ - __m128i s = _mm_srai_epi32(a.val, 31); - b0.val = _mm_unpacklo_epi32(a.val, s); - b1.val = _mm_unpackhi_epi32(a.val, s); -} -inline v_int64x2 v_load_expand(const int* ptr) -{ - __m128i a = _mm_loadl_epi64((const __m128i*)ptr); - __m128i s = _mm_srai_epi32(a, 31); - return v_int64x2(_mm_unpacklo_epi32(a, s)); -} +#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \ + return _Tpvec(intrin(a)); \ + } -inline v_uint32x4 v_load_expand_q(const uchar* ptr) -{ - __m128i z = _mm_setzero_si128(); - __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); - 
return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z)); -} - -inline v_int32x4 v_load_expand_q(const schar* ptr) -{ - __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); - a = _mm_unpacklo_epi8(a, a); - a = _mm_unpacklo_epi8(a, a); - return v_int32x4(_mm_srai_epi32(a, 24)); -} +OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32) +OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32) #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \ inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp b/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp new file mode 100644 index 0000000000..be2766847c --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_sse_em.hpp @@ -0,0 +1,167 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP +#define OPENCV_HAL_INTRIN_SSE_EM_HPP + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \ + inline tp _v128_##fun(const tp& a) \ + { return _mm_##fun(a); } + +#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \ + inline tp _v128_##fun(const tp& a, const tp& b) \ + { return _mm_##fun(a, b); } + +#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \ + inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \ + { return _mm_##fun(a, b, c); } + +///////////////////////////// XOP ///////////////////////////// + +// [todo] define CV_XOP +#if 1 // CV_XOP +inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b) +{ + const __m128i delta = _mm_set1_epi32((int)0x80000000); + return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta)); +} +// wrapping XOP +#else +OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i) +#endif // !CV_XOP + +///////////////////////////// SSE4.1 ///////////////////////////// + +#if !CV_SSE4_1 + +/** Swizzle **/ +inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask) +{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); } + +/** Convert **/ +// 8 >> 16 +inline __m128i _v128_cvtepu8_epi16(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi8(a, z); +} +inline __m128i _v128_cvtepi8_epi16(const __m128i& a) +{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); } +// 8 >> 32 +inline __m128i _v128_cvtepu8_epi32(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z); +} +inline __m128i _v128_cvtepi8_epi32(const __m128i& a) +{ + __m128i r = _mm_unpacklo_epi8(a, a); + r = _mm_unpacklo_epi8(r, r); + return _mm_srai_epi32(r, 24); +} +// 16 >> 32 +inline __m128i _v128_cvtepu16_epi32(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi16(a, z); +} +inline __m128i _v128_cvtepi16_epi32(const __m128i& a) +{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); } +// 32 >> 64 +inline __m128i _v128_cvtepu32_epi64(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpacklo_epi32(a, z); +} +inline __m128i _v128_cvtepi32_epi64(const __m128i& a) +{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); } + +/** Arithmetic **/ +inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b) +{ + __m128i c0 = _mm_mul_epu32(a, b); + __m128i c1 = 
_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)); + __m128i d0 = _mm_unpacklo_epi32(c0, c1); + __m128i d1 = _mm_unpackhi_epi32(c0, c1); + return _mm_unpacklo_epi64(d0, d1); +} + +/** Math **/ +inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b) +{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); } + +// wrapping SSE4.1 +#else +OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i) +OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i) +OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i) +OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i) +OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i) +#endif // !CV_SSE4_1 + +///////////////////////////// Revolutionary ///////////////////////////// + +/** Convert **/ +// 16 << 8 +inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpackhi_epi8(a, z); +} +inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a) +{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); } +// 32 << 16 +inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpackhi_epi16(a, z); +} +inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a) +{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); } +// 64 << 32 +inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a) +{ + const __m128i z = _mm_setzero_si128(); + return _mm_unpackhi_epi32(a, z); +} +inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a) +{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); } + +/** Miscellaneous **/ +inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b) +{ + const __m128i m = _mm_set1_epi32(65535); + __m128i am = _v128_min_epu32(a, m); + __m128i bm = _v128_min_epu32(b, m); +#if CV_SSE4_1 + return _mm_packus_epi32(am, bm); +#else + const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768); + am = _mm_sub_epi32(am, d); + bm = _mm_sub_epi32(bm, d); + am = _mm_packs_epi32(am, bm); + return _mm_sub_epi16(am, nd); +#endif +} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! 
@endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP \ No newline at end of file diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index fb81986f6c..fd554ac755 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -315,6 +315,10 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ b0.val = fh(a.val); \ b1.val = fl(a.val); \ } \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ return _Tpwvec(fh(a.val)); } \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ return _Tpwvec(fl(a.val)); } \ inline _Tpwvec v_load_expand(const _Tp* ptr) \ { return _Tpwvec(fh(vec_ld_l8(ptr))); } @@ -418,10 +422,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub) OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul) @@ -441,16 +443,30 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub) OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add) OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub) -inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d) +// saturating multiply +#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4) + +template +inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d) { - c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)); - d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)); -} -inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d) -{ - c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)); - d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)); + Twvec p0 = Twvec(vec_mule(a.val, b.val)); + Twvec p1 = Twvec(vec_mulo(a.val, b.val)); + v_zip(p0, p1, c, d); } + inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d) { c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)); @@ -459,17 +475,17 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { - return v_int16x8(vec_packs( - vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)), - vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16)) - )); + vec_int4 p0 = vec_mule(a.val, b.val); + vec_int4 p1 = vec_mulo(a.val, b.val); + static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}; + return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm)); } inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { - return v_uint16x8(vec_packs( - 
vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)), - vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16)) - )); + vec_uint4 p0 = vec_mule(a.val, b.val); + vec_uint4 p1 = vec_mulo(a.val, b.val); + static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}; + return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm)); } /** Non-saturating arithmetics **/ @@ -480,6 +496,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add) OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \ diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index d4dab9eed7..b4e3f30562 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -130,19 +130,21 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \ # undef vec_mul # endif /* - * there's no a direct instruction for supporting 16-bit multiplication in ISA 2.07, + * there's no a direct instruction for supporting 8-bit, 16-bit multiplication in ISA 2.07, * XLC Implement it by using instruction "multiply even", "multiply odd" and "permute" - * todo: Do I need to support 8-bit ? **/ -# define VSX_IMPL_MULH(Tvec, Tcast) \ - VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \ - { \ - static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \ - 8, 9, 24, 25, 12, 13, 28, 29}; \ - return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \ +# define VSX_IMPL_MULH(Tvec, cperm) \ + VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \ + { \ + static const vec_uchar16 ev_od = {cperm}; \ + return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \ } - VSX_IMPL_MULH(vec_short8, vec_short8_c) - VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c) + #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 + VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16) + VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16) + #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 + VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8) + VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8) // vmuluwm can be used for unsigned or signed integers, that's what they said VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul) VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 6666bc4253..40d282b1c2 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -407,10 +407,13 @@ template struct TheTest Data resB = vx_load_expand(dataA.d); - Rx2 c, d; + Rx2 c, d, e, f; v_expand(a, c, d); - Data resC = c, resD = d; + e = v_expand_low(a); + f = v_expand_high(a); + + Data resC = c, resD = d, resE = e, resF = f; const int n = Rx2::nlanes; for (int i = 0; i < n; ++i) { @@ -418,6 +421,8 @@ template struct TheTest EXPECT_EQ(dataA[i], resB[i]); EXPECT_EQ(dataA[i], resC[i]); EXPECT_EQ(dataA[i + n], resD[i]); + EXPECT_EQ(dataA[i], resE[i]); + EXPECT_EQ(dataA[i + n], resF[i]); } return *this; @@ -455,19 +460,21 @@ template struct TheTest return *this; } - TheTest & test_addsub_wrap() + TheTest & test_arithm_wrap() { Data dataA, dataB; dataB.reverse(); R a = dataA, b = dataB; Data resC = v_add_wrap(a, b), - resD = 
v_sub_wrap(a, b); + resD = v_sub_wrap(a, b), + resE = v_mul_wrap(a, b); for (int i = 0; i < R::nlanes; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]); EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]); + EXPECT_EQ((LaneType)(dataA[i] * dataB[i]), resE[i]); } return *this; } @@ -475,6 +482,7 @@ template struct TheTest TheTest & test_mul() { Data dataA, dataB; + dataA[1] = static_cast(std::numeric_limits::max()); dataB.reverse(); R a = dataA, b = dataB; @@ -482,7 +490,7 @@ template struct TheTest for (int i = 0; i < R::nlanes; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ(dataA[i] * dataB[i], resC[i]); + EXPECT_EQ(saturate_cast(dataA[i] * dataB[i]), resC[i]); } return *this; @@ -1209,7 +1217,9 @@ void test_hal_intrin_uint8() .test_expand() .test_expand_q() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() + .test_mul() + .test_mul_expand() .test_cmp() .test_logic() .test_min_max() @@ -1242,7 +1252,9 @@ void test_hal_intrin_int8() .test_expand() .test_expand_q() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() + .test_mul() + .test_mul_expand() .test_cmp() .test_logic() .test_min_max() @@ -1267,7 +1279,7 @@ void test_hal_intrin_uint16() .test_interleave() .test_expand() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() .test_mul() .test_mul_expand() .test_cmp() @@ -1295,7 +1307,7 @@ void test_hal_intrin_int16() .test_interleave() .test_expand() .test_addsub() - .test_addsub_wrap() + .test_arithm_wrap() .test_mul() .test_mul_expand() .test_cmp() diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 5cfb616503..1caadbbbad 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,3 +1,3 @@ set(the_description "Image Processing") -ocv_add_dispatched_file(accum SSE2 AVX NEON) +ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/perf/perf_accumulate.cpp b/modules/imgproc/perf/perf_accumulate.cpp index f9cd80af71..c52b31e84d 100644 --- a/modules/imgproc/perf/perf_accumulate.cpp +++ b/modules/imgproc/perf/perf_accumulate.cpp @@ -5,94 +5,102 @@ namespace opencv_test { -#ifdef HAVE_OPENVX -PERF_TEST_P(Size_MatType, Accumulate, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_16SC1, CV_32FC1) - ) -) -#else -PERF_TEST_P( Size_MatType, Accumulate, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_32FC1) - ) - ) -#endif -{ - Size sz = get<0>(GetParam()); - int dstType = get<1>(GetParam()); +typedef Size_MatType Accumulate; - Mat src(sz, CV_8UC1); - Mat dst(sz, dstType); +#define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1 +#define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3 +#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1 +#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1 - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); +#define PERF_ACCUMULATE_INIT(_FLTC) \ + const Size srcSize = get<0>(GetParam()); \ + const int srcType = get<1>(GetParam()); \ + const int dstType = _FLTC(CV_MAT_CN(srcType)); \ + Mat src1(srcSize, srcType), dst(srcSize, dstType); \ + declare.in(src1, dst, WARMUP_RNG).out(dst); - TEST_CYCLE() accumulate(src, dst); +#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \ + PERF_ACCUMULATE_INIT(_FLTC) \ + Mat mask(srcSize, CV_8UC1); \ + declare.in(mask, 
WARMUP_RNG); - SANITY_CHECK_NOTHING(); -} +#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN) \ + PERF_TEST_P(Accumulate, _NAME, \ + testing::Combine( \ + testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD), \ + testing::Values(_TYPES) \ + ) \ + ) \ + { \ + _INIT \ + TEST_CYCLE() _FUN; \ + SANITY_CHECK_NOTHING(); \ + } -#ifdef HAVE_OPENVX -PERF_TEST_P(Size_MatType, AccumulateSquare, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_16SC1, CV_32FC1) - ) -) -#else -PERF_TEST_P( Size_MatType, AccumulateSquare, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_32FC1) - ) - ) -#endif -{ - Size sz = get<0>(GetParam()); - int dstType = get<1>(GetParam()); +/////////////////////////////////// Accumulate /////////////////////////////////// - Mat src(sz, CV_8UC1); - Mat dst(sz, dstType); +PERF_TEST_P_ACCUMULATE(Accumulate, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT(CV_32FC), accumulate(src1, dst)) - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); +PERF_TEST_P_ACCUMULATE(AccumulateMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulate(src1, dst, mask)) - TEST_CYCLE() accumulateSquare(src, dst); +PERF_TEST_P_ACCUMULATE(AccumulateDouble, MAT_TYPES_ACCUMLATE_D, + PERF_ACCUMULATE_INIT(CV_64FC), accumulate(src1, dst)) - SANITY_CHECK_NOTHING(); -} +PERF_TEST_P_ACCUMULATE(AccumulateDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulate(src1, dst, mask)) -#ifdef HAVE_OPENVX -PERF_TEST_P(Size_MatType, AccumulateWeighted, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_8UC1, CV_32FC1) - ) -) -#else -PERF_TEST_P( Size_MatType, AccumulateWeighted, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_32FC1) - ) - ) -#endif -{ - Size sz = get<0>(GetParam()); - int dstType = get<1>(GetParam()); +///////////////////////////// AccumulateSquare /////////////////////////////////// - Mat src(sz, CV_8UC1); - Mat dst(sz, dstType); +PERF_TEST_P_ACCUMULATE(Square, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT(CV_32FC), accumulateSquare(src1, dst)) - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); +PERF_TEST_P_ACCUMULATE(SquareMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateSquare(src1, dst, mask)) - TEST_CYCLE() accumulateWeighted(src, dst, 0.314); +PERF_TEST_P_ACCUMULATE(SquareDouble, MAT_TYPES_ACCUMLATE_D, + PERF_ACCUMULATE_INIT(CV_64FC), accumulateSquare(src1, dst)) - SANITY_CHECK_NOTHING(); -} +PERF_TEST_P_ACCUMULATE(SquareDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateSquare(src1, dst, mask)) + +///////////////////////////// AccumulateProduct /////////////////////////////////// + +#define PERF_ACCUMULATE_INIT_2(_FLTC) \ + PERF_ACCUMULATE_INIT(_FLTC) \ + Mat src2(srcSize, srcType); \ + declare.in(src2); + +#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \ + PERF_ACCUMULATE_MASK_INIT(_FLTC) \ + Mat src2(srcSize, srcType); \ + declare.in(src2); + +PERF_TEST_P_ACCUMULATE(Product, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst)) + +PERF_TEST_P_ACCUMULATE(ProductMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst, mask)) + +PERF_TEST_P_ACCUMULATE(ProductDouble, MAT_TYPES_ACCUMLATE_D, + 
PERF_ACCUMULATE_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst)) + +PERF_TEST_P_ACCUMULATE(ProductDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst, mask)) + +///////////////////////////// AccumulateWeighted /////////////////////////////////// + +PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE, + PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123)) + +PERF_TEST_P_ACCUMULATE(WeightedMask, MAT_TYPES_ACCUMLATE_C, + PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123, mask)) + +PERF_TEST_P_ACCUMULATE(WeightedDouble, MAT_TYPES_ACCUMLATE_D, + PERF_ACCUMULATE_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456)) + +PERF_TEST_P_ACCUMULATE(WeightedDoubleMask, MAT_TYPES_ACCUMLATE_D_C, + PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456, mask)) } // namespace diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 7a29447497..7bca93de87 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -8,63 +8,43 @@ void acc_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accSqr_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accProd_##suffix(const type* src1, const type* src2, \ acctype* dst, const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accW_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn, double alpha) \ { \ - CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \ + CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \ } #define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \ void acc_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_AVX(acc_avx_##suffix, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accSqr_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_AVX(accSqr_avx_##suffix, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, 
len, cn)); \ + CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accProd_##suffix(const type* src1, const type* src2, \ acctype* dst, const uchar* mask, int len, int cn) \ { \ - CV_CPU_CALL_AVX(accProd_avx_##suffix, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \ - CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \ + CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \ } \ void accW_##suffix(const type* src, acctype* dst, \ const uchar* mask, int len, int cn, double alpha) \ { \ - CV_CPU_CALL_AVX(accW_avx_##suffix, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \ - CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \ + CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \ } #define DECLARATE_ACC_FUNCS(suffix, type, acctype) \ void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \ @@ -114,22 +94,8 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha); void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha); -// accumulate series optimized by AVX -void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn); -void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn); -void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn); -void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn); -void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn); -void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn); -void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn); -void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn); -void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn); -void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha); -void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha); -void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha); - #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY - +// todo: remove AVX branch after support it by universal intrinsics template void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 ) { @@ -171,7 +137,11 @@ void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int } } } - +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); +#elif CV_SIMD + vx_cleanup(); +#endif } template void @@ -215,6 +185,11 @@ accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int } } } +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); +#elif CV_SIMD + vx_cleanup(); +#endif } template void @@ -259,6 +234,11 @@ accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int } } } +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); 
+#elif CV_SIMD + vx_cleanup(); +#endif } template void @@ -303,77 +283,81 @@ accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double } } } +#if CV_AVX && !CV_AVX2 + _mm256_zeroupper(); +#elif CV_SIMD + vx_cleanup(); +#endif } - -#if CV_SIMD128 - void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_src0, v_src1; + v_uint8 v_src = vx_load(src + x); + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else { - v_uint8x16 v_0 = v_setall_u8(0); + v_uint8 v_0 = vx_setall_u8(0); if (cn == 1) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src = v_load(src + x); + v_uint8 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint16x8 v_src0, v_src1; + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, 
v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_uint32x4 v_src000, v_src001, v_src010, v_src011; - v_uint32x4 v_src100, v_src101, v_src110, v_src111; - v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_uint32 v_src000, v_src001, v_src010, v_src011; + v_uint32 v_src100, v_src101, v_src110, v_src111; + v_uint32 v_src200, v_src201, v_src210, v_src211; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src010, v_src011); v_expand(v_src10, v_src100, v_src101); @@ -381,135 +365,169 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src210, v_src211); - v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011; - v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111; - v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211; + v_float32 v_dst000, v_dst001, v_dst010, v_dst011; + v_float32 v_dst100, v_dst101, v_dst110, v_dst111; + v_float32 v_dst200, v_dst201, v_dst210, v_dst211; v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); - v_load_deinterleave(dst + ((x + 4) * cn), v_dst001, v_dst101, v_dst201); - v_load_deinterleave(dst + ((x + 8) * cn), v_dst010, v_dst110, v_dst210); - v_load_deinterleave(dst + ((x + 12) * cn), v_dst011, v_dst111, v_dst211); + v_load_deinterleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210); + v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211); - v_store_interleave(dst + (x * cn), v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200))); - v_store_interleave(dst + ((x + 4) * cn), v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201))); - v_store_interleave(dst + ((x + 8) * cn), v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210))); - v_store_interleave(dst + ((x + 12) * cn), v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211))); + v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); + v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); + v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); + v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001)); + v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); + v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); + v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); + v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); + v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); + v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); + v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); + v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + + v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); + v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210); + v_store_interleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211); } } } - +#endif // CV_SIMD acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int 
cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); } } else { if (cn == 1) { - v_uint16x8 v_0 = v_setall_u16(0); + v_uint16 v_0 = vx_setall_u16(0); for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load(src + x); + v_uint16 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint32x4 v_src0, v_src1; + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); } } else if (cn == 3) { - v_uint16x8 v_0 = v_setall_u16(0); + v_uint16 v_0 = vx_setall_u16(0); for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_dst10 + v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_dst20 + v_cvt_f32(v_reinterpret_as_s32(v_src20))); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_dst11 + v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_dst21 + v_cvt_f32(v_reinterpret_as_s32(v_src21))); + v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00)); + v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01)); + v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10)); + v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11)); + v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20)); + v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21)); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD acc_general_(src, dst, mask, len, cn, x); } - +// todo: remove AVX branch after support it by universal intrinsics void acc_simd_(const float* src, 
float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_store(dst + x, v_load(dst + x) + v_load(src + x)); - v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4)); + v_store(dst + x, vx_load(dst + x) + vx_load(src + x)); + v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step)); } + #endif // CV_AVX && !CV_AVX2 } else { - v_float32x4 v_0 = v_setzero_f32(); + v_float32 v_0 = vx_setzero_f32(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16 = v_load_expand(mask + x); - v_uint32x4 v_masku320, v_masku321; + v_uint16 v_masku16 = vx_load_expand(mask + x); + v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); - v_store(dst + x, v_load(dst + x) + (v_load(src + x) & v_mask0)); - v_store(dst + x + 4, v_load(dst + x + 4) + (v_load(src + x + 4) & v_mask1)); + v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1)); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16 = v_load_expand(mask + x); - v_uint32x4 v_masku320, v_masku321; + v_uint16 v_masku16 = vx_load_expand(mask + x); + v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); - v_float32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; @@ -517,55 +535,56 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + 
v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); } } } - +#endif // CV_SIMD acc_general_(src, dst, mask, len, cn, x); } -#if CV_SIMD128_64F void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD_64F + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int0, v_int1; + v_uint8 v_src = vx_load(src + x); + v_uint16 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_uint32x4 v_int00, v_int01, v_int10, v_int11; + v_uint32 v_int00, v_int01, v_int10, v_int11; v_expand(v_int0, v_int00, v_int01); v_expand(v_int1, v_int10, v_int11); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - v_float64x2 v_dst4 = v_load(dst + x + 8); - v_float64x2 v_dst5 = v_load(dst + x + 10); - v_float64x2 v_dst6 = v_load(dst + x + 12); - v_float64x2 v_dst7 = v_load(dst + x + 14); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + v_float64 v_dst4 = vx_load(dst + x + step * 4); + v_float64 v_dst5 = vx_load(dst + x + step * 5); + v_float64 v_dst6 = vx_load(dst + x + step * 6); + v_float64 v_dst7 = vx_load(dst + x + step * 7); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -577,50 +596,50 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_dst7 = v_dst7 + v_src7; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - v_store(dst + x + 8, v_dst4); - v_store(dst + x + 10, v_dst5); - v_store(dst + x + 12, v_dst6); - v_store(dst + x + 14, v_dst7); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + v_store(dst + x + step * 4, v_dst4); + v_store(dst + x + step * 5, v_dst5); + v_store(dst + x + step * 6, v_dst6); + v_store(dst + x + step * 7, v_dst7); } } else { - v_uint8x16 v_0 = v_setall_u8(0); + v_uint8 v_0 = vx_setall_u8(0); if (cn == 1) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 
v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_mask == v_0); - v_uint8x16 v_src = v_load(src + x); + v_uint8 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint16x8 v_int0, v_int1; + v_uint16 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_uint32x4 v_int00, v_int01, v_int10, v_int11; + v_uint32 v_int00, v_int01, v_int10, v_int11; v_expand(v_int0, v_int00, v_int01); v_expand(v_int1, v_int10, v_int11); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - v_float64x2 v_dst4 = v_load(dst + x + 8); - v_float64x2 v_dst5 = v_load(dst + x + 10); - v_float64x2 v_dst6 = v_load(dst + x + 12); - v_float64x2 v_dst7 = v_load(dst + x + 14); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + v_float64 v_dst4 = vx_load(dst + x + step * 4); + v_float64 v_dst5 = vx_load(dst + x + step * 5); + v_float64 v_dst6 = vx_load(dst + x + step * 6); + v_float64 v_dst7 = vx_load(dst + x + step * 7); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -632,34 +651,34 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_dst7 = v_dst7 + v_src7; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - v_store(dst + x + 8, v_dst4); - v_store(dst + x + 10, v_dst5); - v_store(dst + x + 12, v_dst6); - v_store(dst + x + 14, v_dst7); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + v_store(dst + x + step * 4, v_dst4); + v_store(dst + x + step * 5, v_dst5); + v_store(dst + x + step * 6, v_dst6); + v_store(dst + x + step * 7, v_dst7); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); 
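// (each masked uchar channel is widened twice in this block, u8 -> u16 -> u32,
// before being converted lane-by-lane to double below)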
v_expand(v_src2, v_src20, v_src21); - v_uint32x4 v_src000, v_src001, v_src010, v_src011; - v_uint32x4 v_src100, v_src101, v_src110, v_src111; - v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_uint32 v_src000, v_src001, v_src010, v_src011; + v_uint32 v_src100, v_src101, v_src110, v_src111; + v_uint32 v_src200, v_src201, v_src210, v_src211; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src010, v_src011); v_expand(v_src10, v_src100, v_src101); @@ -667,9 +686,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src210, v_src211); - v_float64x2 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111; - v_float64x2 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111; - v_float64x2 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111; + v_float64 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111; + v_float64 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111; + v_float64 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111; v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000))); v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000))); v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001))); @@ -695,56 +714,58 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211))); v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211))); - v_float64x2 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111; - v_float64x2 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111; - v_float64x2 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111; + v_float64 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111; + v_float64 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111; + v_float64 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111; v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000); - v_load_deinterleave(dst + ((x + 2) * cn), v_dst0001, v_dst1001, v_dst2001); - v_load_deinterleave(dst + ((x + 4) * cn), v_dst0010, v_dst1010, v_dst2010); - v_load_deinterleave(dst + ((x + 6) * cn), v_dst0011, v_dst1011, v_dst2011); - v_load_deinterleave(dst + ((x + 8) * cn), v_dst0100, v_dst1100, v_dst2100); - v_load_deinterleave(dst + ((x + 10) * cn), v_dst0101, v_dst1101, v_dst2101); - v_load_deinterleave(dst + ((x + 12) * cn), v_dst0110, v_dst1110, v_dst2110); - v_load_deinterleave(dst + ((x + 14) * cn), v_dst0111, v_dst1111, v_dst2111); + v_load_deinterleave(dst + ((x + step) * cn), v_dst0001, v_dst1001, v_dst2001); + v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst0010, v_dst1010, v_dst2010); + v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst0011, v_dst1011, v_dst2011); + v_load_deinterleave(dst + ((x + step * 4) * cn), v_dst0100, v_dst1100, v_dst2100); + v_load_deinterleave(dst + ((x + step * 5) * cn), v_dst0101, v_dst1101, v_dst2101); + v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110); + v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111); v_store_interleave(dst + (x * cn), v_dst0000 
+ v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000); - v_store_interleave(dst + ((x + 2) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); - v_store_interleave(dst + ((x + 4) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); - v_store_interleave(dst + ((x + 6) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); - v_store_interleave(dst + ((x + 8) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); - v_store_interleave(dst + ((x + 10) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); - v_store_interleave(dst + ((x + 12) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); - v_store_interleave(dst + ((x + 14) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); + v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); + v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); + v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); + v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); + v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); + v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); + v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int0, v_int1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -752,34 +773,34 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_dst3 = v_dst3 + v_src3; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else { - 
v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load(src + x); + v_uint16 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint32x4 v_int0, v_int1; + v_uint32 v_int0, v_int1; v_expand(v_src, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); v_dst0 = v_dst0 + v_src0; v_dst1 = v_dst1 + v_src1; @@ -787,178 +808,207 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c v_dst3 = v_dst3 + v_src3; v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } if (cn == 3) { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); + v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 
v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float32::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0)); + __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src); - v_float64x2 v_src1 = v_cvt_f64_high(v_src); + v_float32 v_src = vx_load(src + x); + v_float64 v_src0 = v_cvt_f64(v_src); + v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, vx_load(dst + x) + v_src0); + v_store(dst + x + step, vx_load(dst + x + step) + v_src1); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint64x2 v_0 = v_setzero_u64(); + v_uint64 v_0 = vx_setzero_u64(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = 
vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src) & v_mask0; - v_float64x2 v_src1 = v_cvt_f64_high(v_src) & v_mask1; + v_float32 v_src = vx_load(src + x); + v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0; + v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1; - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, vx_load(dst + x) + v_src0); + v_store(dst + x + step, vx_load(dst + x + step) + v_src1); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float32x4 v_src0, v_src1, v_src2; + v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_float64x2 v_src00 = v_cvt_f64(v_src0) & v_mask0; - v_float64x2 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; - v_float64x2 v_src10 = v_cvt_f64(v_src1) & v_mask0; - v_float64x2 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; - v_float64x2 v_src20 = v_cvt_f64(v_src2) & v_mask0; - v_float64x2 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; + v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0; + v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; + v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0; + v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; + v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0; + v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 4 ; x += 4) + { + __m256d v_src = _mm256_loadu_pd(src + x); + __m256d v_dst = _mm256_loadu_pd(dst + x); + v_dst = _mm256_add_pd(v_dst, v_src); + _mm256_storeu_pd(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = 
v_load(src + x + 2); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, vx_load(dst + x) + v_src0); + v_store(dst + x + step, vx_load(dst + x + step) + v_src1); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint64x2 v_0 = v_setzero_u64(); + v_uint64 v_0 = vx_setzero_u64(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, v_load(dst + x) + (v_src0 & v_mask0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_src1 & v_mask1)); + v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1)); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint32x4 v_masku32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_masku32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; + v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; @@ -966,120 +1016,101 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_float64x2 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; + v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); } } } - +#endif // CV_SIMD_64F acc_general_(src, dst, mask, len, cn, x); } -#else -void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} - -void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} - 
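// The scalar stubs above and below become redundant with this change: every rewritten
// acc*_simd_ body is now guarded by #if CV_SIMD / CV_SIMD_64F and always falls through
// to the matching *_general_ routine with the count of already-processed elements, so a
// build without the required SIMD support simply runs the scalar loop from x = 0.
// A minimal sketch of that pattern (illustrative only, assuming just the universal
// intrinsics used elsewhere in this patch: vx_load, v_store, v_float32::nlanes):
//
//     void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
//     {
//         int x = 0;
//     #if CV_SIMD
//         if (!mask)                                  // unmasked path only, for brevity
//         {
//             const int step = v_float32::nlanes;     // lane count of the widest enabled target
//             for (; x <= len * cn - step; x += step)
//                 v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
//         }
//     #endif
//         acc_general_(src, dst, mask, len, cn, x);   // scalar code finishes the tail (or everything)
//     }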
-void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} - -void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - acc_general_(src, dst, mask, len, cn, 0); -} -#endif // square accumulate optimized by universal intrinsic void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_src0, v_src1; + v_uint8 v_src = vx_load(src + x); + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_src0 = v_mul_wrap(v_src0, v_src0); + v_src1 = v_mul_wrap(v_src1, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else { - v_uint8x16 v_0 = v_setall_u8(0); + v_uint8 v_0 = vx_setall_u8(0); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src = v_load(src + x); + v_uint8 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint16x8 v_src0, v_src1; + v_uint16 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_src0 = v_mul_wrap(v_src0, v_src0); + v_src1 = v_mul_wrap(v_src1, v_src1); - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, 
v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; + v_src00 = v_mul_wrap(v_src00, v_src00); + v_src01 = v_mul_wrap(v_src01, v_src01); + v_src10 = v_mul_wrap(v_src10, v_src10); + v_src11 = v_mul_wrap(v_src11, v_src11); + v_src20 = v_mul_wrap(v_src20, v_src20); + v_src21 = v_mul_wrap(v_src21, v_src21); - v_uint32x4 v_src000, v_src001, v_src010, v_src011; - v_uint32x4 v_src100, v_src101, v_src110, v_src111; - v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_uint32 v_src000, v_src001, v_src010, v_src011; + v_uint32 v_src100, v_src101, v_src110, v_src111; + v_uint32 v_src200, v_src201, v_src210, v_src211; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src010, v_src011); v_expand(v_src10, v_src100, v_src101); @@ -1087,90 +1118,103 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src210, v_src211); - v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011; - v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111; - v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211; + v_float32 v_dst000, v_dst001, v_dst010, v_dst011; + v_float32 v_dst100, v_dst101, v_dst110, v_dst111; + v_float32 v_dst200, v_dst201, v_dst210, v_dst211; v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200); - v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); - v_load_deinterleave(dst + (x + 8) * cn, v_dst010, v_dst110, v_dst210); - v_load_deinterleave(dst + (x + 12) * cn, v_dst011, v_dst111, v_dst211); + v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211); - v_store_interleave(dst + x * cn, v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200))); - v_store_interleave(dst + (x + 4) * cn, v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201))); - v_store_interleave(dst + (x + 8) * cn, v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210))); - v_store_interleave(dst + (x + 12) * cn, v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211))); + v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000)); + v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001)); + v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010)); + v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011)); + + v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100)); + v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101)); + v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110)); + v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111)); + + 
v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200)); + v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201)); + v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210)); + v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211)); + + v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); + v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210); + v_store_interleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211); } } } - +#endif // CV_SIMD accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); - v_float32x4 v_float0, v_float1; + v_float32 v_float0, v_float1; v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); - v_float0 = v_float0 * v_float0; - v_float1 = v_float1 * v_float1; - v_store(dst + x, v_load(dst + x) + v_float0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); + v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step))); } } else { - v_uint32x4 v_0 = v_setzero_u32(); + v_uint32 v_0 = vx_setzero_u32(); if (cn == 1) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask0, v_mask1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); v_mask0 = ~(v_mask0 == v_0); v_mask1 = ~(v_mask1 == v_0); - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_src0, v_src1; v_expand(v_src, v_src0, v_src1); v_src0 = v_src0 & v_mask0; v_src1 = v_src1 & v_mask1; - v_float32x4 v_float0, v_float1; + v_float32 v_float0, v_float1; v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); - v_float0 = v_float0 * v_float0; - v_float1 = v_float1 * v_float1; - v_store(dst + x, v_load(dst + x) + v_float0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); + v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step))); } } else if (cn == 3) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask0, v_mask1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask0, v_mask1; v_expand(v_mask16, v_mask0, v_mask1); v_mask0 = ~(v_mask0 == v_0); v_mask1 = ~(v_mask1 == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); @@ -1181,653 +1225,650 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int v_int20 = v_int20 & v_mask0; v_int21 = v_int21 & v_mask1; - v_float32x4 
v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00)); v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01)); v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10)); v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11)); v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20)); v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21)); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + v_src = _mm256_mul_ps(v_src, v_src); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_float32 v_src0 = vx_load(src + x); + v_float32 v_src1 = vx_load(src + x + step); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint32x4 v_0 = v_setzero_u32(); + v_uint32 v_0 = vx_setzero_u32(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask_0, v_mask_1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_src0 = vx_load(src + x); + v_float32 v_src1 = vx_load(src + x + step); v_src0 = v_src0 & v_mask0; v_src1 = v_src1 & v_mask1; - 
v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask16 = v_load_expand(mask + x); - v_uint32x4 v_mask_0, v_mask_1; + v_uint16 v_mask16 = vx_load_expand(mask + x); + v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); - v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; + v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; v_src11 = v_src11 & v_mask1; v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_float32x4 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; + v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD accSqr_general_(src, dst, mask, len, cn, x); } -#if CV_SIMD128_64F + void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_int = v_load_expand(src + x); + v_uint16 v_int = vx_load_expand(src + x); - v_uint32x4 v_int0, v_int1; + v_uint32 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = 
v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else { - v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load_expand(src + x); - v_uint16x8 v_int = v_src & v_mask; + v_uint16 v_src = vx_load_expand(src + x); + v_uint16 v_int = v_src & v_mask; - v_uint32x4 v_int0, v_int1; + v_uint32 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else if (cn == 3) { - for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth) + for (; x <= len - cVectorWidth * 2; x += cVectorWidth) { - v_uint8x16 v_src0, v_src1, v_src2; + v_uint8 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_uint16x8 v_int0, v_int1, v_int2, dummy; - v_expand(v_src0, v_int0, dummy); - v_expand(v_src1, v_int1, dummy); - v_expand(v_src2, v_int2, dummy); - v_uint16x8 v_mask = v_load_expand(mask + x); + + v_uint16 v_int0 = v_expand_low(v_src0); + v_uint16 v_int1 = 
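// --- illustrative sketch (not part of the patch) ---------------------------
// The u8 -> f64 widening chain used in the accSqr double paths above:
// vx_load_expand gives u16, v_expand gives two u32 vectors, and each u32
// vector yields two f64 vectors via v_cvt_f64 / v_cvt_f64_high. Unmasked
// path only; the helper name is hypothetical.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD_64F
using namespace cv;
static void acc_sqr_u8_to_f64_sketch(const uchar* src, double* dst, int len)
{
    int x = 0;
    const int width = v_uint16::nlanes;     // pixels consumed per iteration
    const int step  = v_float64::nlanes;
    for (; x <= len - width; x += width)
    {
        v_uint16 p = vx_load_expand(src + x);
        v_uint32 lo, hi;
        v_expand(p, lo, hi);
        v_float64 d0 = v_cvt_f64(v_reinterpret_as_s32(lo));
        v_float64 d1 = v_cvt_f64_high(v_reinterpret_as_s32(lo));
        v_float64 d2 = v_cvt_f64(v_reinterpret_as_s32(hi));
        v_float64 d3 = v_cvt_f64_high(v_reinterpret_as_s32(hi));
        v_store(dst + x,            v_fma(d0, d0, vx_load(dst + x)));
        v_store(dst + x + step,     v_fma(d1, d1, vx_load(dst + x + step)));
        v_store(dst + x + step * 2, v_fma(d2, d2, vx_load(dst + x + step * 2)));
        v_store(dst + x + step * 3, v_fma(d3, d3, vx_load(dst + x + step * 3)));
    }
    for (; x < len; x++)
        dst[x] += (double)src[x] * src[x];
}
#endif // CV_SIMD_64F
// ---------------------------------------------------------------------------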
v_expand_low(v_src1); + v_uint16 v_int2 = v_expand_low(v_src2); + + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); v_int0 = v_int0 & v_mask; v_int1 = v_int1 & v_mask; v_int2 = v_int2 & v_mask; - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_int0, v_int00, v_int01); v_expand(v_int1, v_int10, v_int11); v_expand(v_int2, v_int20, v_int21); - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src02 = v_src02 * v_src02; - v_src03 = v_src03 * v_src03; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src12 = v_src12 * v_src12; - v_src13 = v_src13 * v_src13; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_src22 = v_src22 * v_src22; - v_src23 = v_src23 * v_src23; + v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + 
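// --- illustrative sketch (not part of the patch) ---------------------------
// v_expand_low, used in the 3-channel u8 path above, widens only the low
// half of a u8 vector to u16 and replaces the old v_expand(src, lo, dummy)
// idiom. Hypothetical helper for illustration.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD
using namespace cv;
static v_uint16 low_half_as_u16(const uchar* p)
{
    v_uint8 pix = vx_load(p);       // full register of u8 pixels
    return v_expand_low(pix);       // low half widened to u16; high half discarded
}
#endif // CV_SIMD
// ---------------------------------------------------------------------------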
v_src13, v_dst23 + v_src23); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst02 = v_fma(v_src02, v_src02, v_dst02); + v_dst03 = v_fma(v_src03, v_src03, v_dst03); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst12 = v_fma(v_src12, v_src12, v_dst12); + v_dst13 = v_fma(v_src13, v_src13, v_dst13); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + v_dst22 = v_fma(v_src22, v_src22, v_dst22); + v_dst23 = v_fma(v_src23, v_src23, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int_0, v_int_1; + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int_0, v_int_1; v_expand(v_src, v_int_0, v_int_1); - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + v_int32 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_int0); + v_float64 v_src1 = v_cvt_f64_high(v_int0); + v_float64 v_src2 = v_cvt_f64(v_int1); + v_float64 v_src3 = v_cvt_f64_high(v_int1); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else { - v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src = v_load(src + x); + v_uint16 v_src = vx_load(src + x); v_src = v_src & v_mask; - v_uint32x4 v_int_0, v_int_1; + v_uint32 v_int_0, v_int_1; v_expand(v_src, v_int_0, v_int_1); - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + v_int32 v_int0 = 
v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; + v_float64 v_src0 = v_cvt_f64(v_int0); + v_float64 v_src1 = v_cvt_f64_high(v_int0); + v_float64 v_src2 = v_cvt_f64(v_int1); + v_float64 v_src3 = v_cvt_f64_high(v_int1); - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; + v_dst0 = v_fma(v_src0, v_src0, v_dst0); + v_dst1 = v_fma(v_src1, v_src1, v_dst1); + v_dst2 = v_fma(v_src2, v_src2, v_dst2); + v_dst3 = v_fma(v_src3, v_src3, v_dst3); v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint16x8 v_src0, v_src1, v_src2; + v_uint16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_mask; v_src1 = v_src1 & v_mask; v_src2 = v_src2 & v_mask; - v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_src0, v_int00, v_int01); v_expand(v_src1, v_int10, v_int11); v_expand(v_src2, v_int20, v_int21); - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src02 = v_src02 * v_src02; - v_src03 = v_src03 * v_src03; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src12 = v_src12 * v_src12; - v_src13 = v_src13 * v_src13; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_src22 = v_src22 * v_src22; - v_src23 = v_src23 * v_src23; + v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + 
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03; - v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13; - v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23; + v_float64 v_dst00, v_dst01, v_dst02, v_dst03; + v_float64 v_dst10, v_dst11, v_dst12, v_dst13; + v_float64 v_dst20, v_dst21, v_dst22, v_dst23; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2)* cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4)* cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6)* cn, v_dst03, v_dst13, v_dst23); + v_load_deinterleave(dst + (x + step)* cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2)* cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3)* cn, v_dst03, v_dst13, v_dst23); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst02 = v_fma(v_src02, v_src02, v_dst02); + v_dst03 = v_fma(v_src03, v_src03, v_dst03); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst12 = v_fma(v_src12, v_src12, v_dst12); + v_dst13 = v_fma(v_src13, v_src13, v_dst13); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + v_dst22 = v_fma(v_src22, v_src22, v_dst22); + v_dst23 = v_fma(v_src23, v_src23, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, len, cn, x); } void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float32::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); + __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + v_src0 = _mm256_mul_pd(v_src0, v_src0); + v_src1 = _mm256_mul_pd(v_src1, v_src1); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src); - 
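// --- illustrative sketch (not part of the patch) ---------------------------
// The CV_AVX && !CV_AVX2 fallback above, in raw intrinsics: with AVX but no
// AVX2 the wide universal intrinsics stay at 128 bits, so the float/double
// paths drop to 256-bit _mm256 code directly; _mm256_cvtps_pd widens each
// 128-bit half of the float register to four doubles. Hypothetical helper.
#include "opencv2/core/hal/intrin.hpp"
#if CV_AVX && !CV_AVX2
#include <immintrin.h>
static void acc_sqr_f32_to_f64_avx_sketch(const float* src, double* dst, int size)
{
    int x = 0;
    for (; x <= size - 8; x += 8)
    {
        __m256  s  = _mm256_loadu_ps(src + x);
        __m256d s0 = _mm256_cvtps_pd(_mm256_extractf128_ps(s, 0));   // low 4 floats
        __m256d s1 = _mm256_cvtps_pd(_mm256_extractf128_ps(s, 1));   // high 4 floats
        _mm256_storeu_pd(dst + x,     _mm256_add_pd(_mm256_mul_pd(s0, s0), _mm256_loadu_pd(dst + x)));
        _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(s1, s1), _mm256_loadu_pd(dst + x + 4)));
    }
    for (; x < size; x++)
        dst[x] += (double)src[x] * src[x];
}
#endif // CV_AVX && !CV_AVX2
// ---------------------------------------------------------------------------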
v_float64x2 v_src1 = v_cvt_f64_high(v_src); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_float32 v_src = vx_load(src + x); + v_float64 v_src0 = v_cvt_f64(v_src); + v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint32x4 v_0 = v_setzero_u32(); + v_uint32 v_0 = vx_setzero_u32(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask = v_load_expand_q(mask + x);; + v_uint32 v_mask = vx_load_expand_q(mask + x);; v_mask = ~(v_mask == v_0); - v_float32x4 v_src = v_load(src + x); + v_float32 v_src = vx_load(src + x); v_src = v_src & v_reinterpret_as_f32(v_mask); - v_float64x2 v_src0 = v_cvt_f64(v_src); - v_float64x2 v_src1 = v_cvt_f64_high(v_src); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; + v_float64 v_src0 = v_cvt_f64(v_src); + v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask = v_load_expand_q(mask + x); + v_uint32 v_mask = vx_load_expand_q(mask + x); v_mask = ~(v_mask == v_0); - v_float32x4 v_src0, v_src1, v_src2; + v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); v_src0 = v_src0 & v_reinterpret_as_f32(v_mask); v_src1 = v_src1 & v_reinterpret_as_f32(v_mask); v_src2 = v_src2 & v_reinterpret_as_f32(v_mask); - v_float64x2 v_src00 = v_cvt_f64(v_src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_src0); - v_float64x2 v_src10 = v_cvt_f64(v_src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_src1); - v_float64x2 v_src20 = v_cvt_f64(v_src2); - v_float64x2 v_src21 = v_cvt_f64_high(v_src2); - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; + v_float64 v_src00 = v_cvt_f64(v_src0); + v_float64 v_src01 = v_cvt_f64_high(v_src0); + v_float64 v_src10 = v_cvt_f64(v_src1); + v_float64 v_src11 = v_cvt_f64_high(v_src1); + v_float64 v_src20 = v_cvt_f64(v_src2); + v_float64 v_src21 = v_cvt_f64_high(v_src2); - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, 
len, cn, x); } void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; + #if CV_AVX && !CV_AVX2 + for (; x <= size - 4 ; x += 4) + { + __m256d v_src = _mm256_loadu_pd(src + x); + __m256d v_dst = _mm256_loadu_pd(dst + x); + v_src = _mm256_mul_pd(v_src, v_src); + v_dst = _mm256_add_pd(v_dst, v_src); + _mm256_storeu_pd(dst + x, v_dst); + } + #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } + #endif // CV_AVX && !CV_AVX2 } else { - v_uint64x2 v_0 = v_setzero_u64(); + v_uint64 v_0 = vx_setzero_u64(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); v_src0 = v_src0 & v_mask0; v_src1 = v_src1 & v_mask1; - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - v_float64x2 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); - v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21); + v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); v_src00 = v_src00 & v_mask0; v_src01 = v_src01 & v_mask1; v_src10 = v_src10 & v_mask0; v_src11 = v_src11 & v_mask1; v_src20 = v_src20 & v_mask0; v_src21 = v_src21 & v_mask1; - v_src00 = v_src00 * v_src00; - v_src01 = v_src01 * v_src01; - v_src10 = v_src10 * v_src10; - v_src11 = v_src11 * v_src11; - v_src20 = v_src20 * v_src20; - v_src21 = v_src21 * v_src21; - v_float64x2 v_dst00, v_dst01, 
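// --- illustrative sketch (not part of the patch) ---------------------------
// The 64-bit mask construction in the double accSqr path above: the uchar
// mask is widened to u32, then expanded to two u64 vectors, compared with
// zero and reinterpreted as per-lane f64 masks. cn == 1 only; the helper
// name is hypothetical.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD_64F
using namespace cv;
static void acc_sqr_f64_masked_sketch(const double* src, double* dst, const uchar* mask, int len)
{
    int x = 0;
    const int step = v_float64::nlanes;
    const v_uint64 z = vx_setzero_u64();
    for (; x <= len - step * 2; x += step * 2)
    {
        v_uint32 m32 = vx_load_expand_q(mask + x);         // u8 -> u32 lanes
        v_uint64 m0, m1;
        v_expand(m32, m0, m1);                             // u32 -> u64 lanes
        v_float64 k0 = v_reinterpret_as_f64(~(m0 == z));
        v_float64 k1 = v_reinterpret_as_f64(~(m1 == z));
        v_float64 s0 = vx_load(src + x) & k0;
        v_float64 s1 = vx_load(src + x + step) & k1;
        v_store(dst + x,        v_fma(s0, s0, vx_load(dst + x)));
        v_store(dst + x + step, v_fma(s1, s1, vx_load(dst + x + step)));
    }
    for (; x < len; x++)
        if (mask[x])
            dst[x] += src[x] * src[x];
}
#endif // CV_SIMD_64F
// ---------------------------------------------------------------------------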
v_dst10, v_dst11, v_dst20, v_dst21; + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_src00, v_src00, v_dst00); + v_dst01 = v_fma(v_src01, v_src01, v_dst01); + v_dst10 = v_fma(v_src10, v_src10, v_dst10); + v_dst11 = v_fma(v_src11, v_src11, v_dst11); + v_dst20 = v_fma(v_src20, v_src20, v_dst20); + v_dst21 = v_fma(v_src21, v_src21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD_64F accSqr_general_(src, dst, mask, len, cn, x); } -#else -void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} - -void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} - -void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} - -void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - accSqr_general_(src, dst, mask, len, cn, 0); -} -#endif // product accumulate optimized by universal intrinsic void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 16; +#if CV_SIMD + const int cVectorWidth = v_uint8::nlanes; + const int step = v_uint32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); + v_uint8 v_1src = vx_load(src1 + x); + v_uint8 v_2src = vx_load(src2 + x); - v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); + v_uint16 v_src0, v_src1; + v_mul_expand(v_1src, v_2src, v_src0, v_src1); - v_uint16x8 v_src0, v_src1; - v_src0 = v_1src0 * v_2src0; - v_src1 = v_1src1 * v_2src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else { - v_uint8x16 v_0 = v_setzero_u8(); + v_uint8 v_0 = vx_setzero_u8(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_mask == v_0); - 
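// --- illustrative sketch (not part of the patch) ---------------------------
// The v_mul_expand shortcut used in the u8 product-accumulate above: it
// multiplies two u8 vectors and widens the result to u16 in one call,
// replacing the old expand-then-multiply sequence. Unmasked path only;
// the helper name is hypothetical.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD
using namespace cv;
static void acc_prod_u8_sketch(const uchar* a, const uchar* b, float* dst, int len)
{
    int x = 0;
    const int width = v_uint8::nlanes;
    const int step  = v_uint32::nlanes;
    for (; x <= len - width; x += width)
    {
        v_uint16 p0, p1;
        v_mul_expand(vx_load(a + x), vx_load(b + x), p0, p1);   // u8 * u8 -> u16
        v_uint32 q0, q1, q2, q3;
        v_expand(p0, q0, q1);
        v_expand(p1, q2, q3);
        v_store(dst + x,            vx_load(dst + x)            + v_cvt_f32(v_reinterpret_as_s32(q0)));
        v_store(dst + x + step,     vx_load(dst + x + step)     + v_cvt_f32(v_reinterpret_as_s32(q1)));
        v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(q2)));
        v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(q3)));
    }
    for (; x < len; x++)
        dst[x] += (float)a[x] * b[x];
}
#endif // CV_SIMD
// ---------------------------------------------------------------------------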
v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); + v_uint8 v_1src = vx_load(src1 + x); + v_uint8 v_2src = vx_load(src2 + x); v_1src = v_1src & v_mask; v_2src = v_2src & v_mask; - v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); + v_uint16 v_src0, v_src1; + v_mul_expand(v_1src, v_2src, v_src0, v_src1); - v_uint16x8 v_src0, v_src1; - v_src0 = v_1src0 * v_2src0; - v_src1 = v_1src1 * v_2src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_uint32 v_src00, v_src01, v_src10, v_src11; v_expand(v_src0, v_src00, v_src01); v_expand(v_src1, v_src10, v_src11); - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint8 v_mask = vx_load(mask + x); v_mask = ~(v_mask == v_0); - v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); v_1src0 = v_1src0 & v_mask; @@ -1837,23 +1878,12 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_2src1 = v_2src1 & v_mask; v_2src2 = v_2src2 & v_mask; - v_uint16x8 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; - v_expand(v_1src0, v_1src00, v_1src01); - v_expand(v_1src1, v_1src10, v_1src11); - v_expand(v_1src2, v_1src20, v_1src21); - v_expand(v_2src0, v_2src00, v_2src01); - v_expand(v_2src1, v_2src10, v_2src11); - v_expand(v_2src2, v_2src20, v_2src21); + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_mul_expand(v_1src0, v_2src0, v_src00, v_src01); + v_mul_expand(v_1src1, v_2src1, v_src10, v_src11); + v_mul_expand(v_1src2, v_2src2, v_src20, v_src21); - v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; - v_src00 = v_1src00 * v_2src00; - v_src01 = v_1src01 * v_2src01; - v_src10 = v_1src10 * v_2src10; - v_src11 = v_1src11 * v_2src11; - v_src20 = v_1src20 * v_2src20; - v_src21 = v_1src21 * v_2src21; - - v_uint32x4 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203; + v_uint32 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203; v_expand(v_src00, v_src000, v_src001); v_expand(v_src01, v_src002, v_src003); v_expand(v_src10, v_src100, v_src101); @@ -1861,11 +1891,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_expand(v_src20, v_src200, v_src201); v_expand(v_src21, v_src202, v_src203); - v_float32x4 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, 
v_dst200, v_dst201, v_dst202, v_dst203; + v_float32 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203; v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200); - v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); - v_load_deinterleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202); - v_load_deinterleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203); + v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203); v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)); v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)); v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002)); @@ -1880,82 +1910,78 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203)); v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); - v_store_interleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); - v_store_interleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202); - v_store_interleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203); + v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202); + v_store_interleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203); } } } - +#endif // CV_SIMD accProd_general_(src1, src2, dst, mask, len, cn, x); } void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); + v_uint16 v_1src = vx_load(src1 + x); + v_uint16 v_2src = vx_load(src2 + x); - v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; + v_uint32 v_1src0, v_1src1, v_2src0, v_2src1; v_expand(v_1src, v_1src0, v_1src1); v_expand(v_2src, v_2src0, v_2src1); - v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); - v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); - v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); - v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); + v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); + v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); + v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); + v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); - v_float32x4 v_src0 = v_1float0 * v_2float0; - v_float32x4 v_src1 = v_1float1 * v_2float1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step))); } } else { - v_uint16x8 v_0 = v_setzero_u16(); + v_uint16 v_0 = vx_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_0 == v_mask); - v_uint16x8 v_1src = v_load(src1 + x) & v_mask; - v_uint16x8 v_2src = v_load(src2 + x) & 
v_mask; + v_uint16 v_1src = vx_load(src1 + x) & v_mask; + v_uint16 v_2src = vx_load(src2 + x) & v_mask; - v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; + v_uint32 v_1src0, v_1src1, v_2src0, v_2src1; v_expand(v_1src, v_1src0, v_1src1); v_expand(v_2src, v_2src0, v_2src1); - v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); - v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); - v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); - v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); + v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); + v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); + v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); + v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); - v_float32x4 v_src0 = v_1float0 * v_2float0; - v_float32x4 v_src1 = v_1float1 * v_2float1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step))); } } else if (cn == 3) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask = v_load_expand(mask + x); + v_uint16 v_mask = vx_load_expand(mask + x); v_mask = ~(v_0 == v_mask); - v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); v_1src0 = v_1src0 & v_mask; @@ -1965,7 +1991,7 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch v_2src1 = v_2src1 & v_mask; v_2src2 = v_2src2 & v_mask; - v_uint32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; v_expand(v_1src0, v_1src00, v_1src01); v_expand(v_1src1, v_1src10, v_1src11); v_expand(v_1src2, v_1src20, v_1src21); @@ -1973,1003 +1999,51 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch v_expand(v_2src1, v_2src10, v_2src11); v_expand(v_2src2, v_2src20, v_2src21); - v_float32x4 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00)); - v_float32x4 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01)); - v_float32x4 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10)); - v_float32x4 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11)); - v_float32x4 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20)); - v_float32x4 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21)); - v_float32x4 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00)); - v_float32x4 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01)); - v_float32x4 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10)); - v_float32x4 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11)); - v_float32x4 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20)); - v_float32x4 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21)); + v_float32 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00)); + v_float32 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01)); + v_float32 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10)); + v_float32 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11)); + v_float32 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20)); + v_float32 
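// --- illustrative sketch (not part of the patch) ---------------------------
// The ushort product-accumulate pattern above: both inputs are widened to
// u32, converted to f32, and folded into dst with v_fma instead of a
// separate multiply and add. Unmasked path only; hypothetical helper.
#include "opencv2/core/hal/intrin.hpp"
#if CV_SIMD
using namespace cv;
static void acc_prod_u16_sketch(const ushort* a, const ushort* b, float* dst, int len)
{
    int x = 0;
    const int width = v_uint16::nlanes;
    const int step  = v_float32::nlanes;
    for (; x <= len - width; x += width)
    {
        v_uint32 a0, a1, b0, b1;
        v_expand(vx_load(a + x), a0, a1);
        v_expand(vx_load(b + x), b0, b1);
        v_float32 fa0 = v_cvt_f32(v_reinterpret_as_s32(a0));
        v_float32 fa1 = v_cvt_f32(v_reinterpret_as_s32(a1));
        v_float32 fb0 = v_cvt_f32(v_reinterpret_as_s32(b0));
        v_float32 fb1 = v_cvt_f32(v_reinterpret_as_s32(b1));
        v_store(dst + x,        v_fma(fa0, fb0, vx_load(dst + x)));
        v_store(dst + x + step, v_fma(fa1, fb1, vx_load(dst + x + step)));
    }
    for (; x < len; x++)
        dst[x] += (float)a[x] * b[x];
}
#endif // CV_SIMD
// ---------------------------------------------------------------------------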
v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21)); + v_float32 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00)); + v_float32 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01)); + v_float32 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10)); + v_float32 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11)); + v_float32 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20)); + v_float32 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21)); - v_float32x4 v_src00 = v_1float00 * v_2float00; - v_float32x4 v_src01 = v_1float01 * v_2float01; - v_float32x4 v_src10 = v_1float10 * v_2float10; - v_float32x4 v_src11 = v_1float11 * v_2float11; - v_float32x4 v_src20 = v_1float20 * v_2float20; - v_float32x4 v_src21 = v_1float21 * v_2float21; - - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_dst00 = v_fma(v_1float00, v_2float00, v_dst00); + v_dst01 = v_fma(v_1float01, v_2float01, v_dst01); + v_dst10 = v_fma(v_1float10, v_2float10, v_dst10); + v_dst11 = v_fma(v_1float11, v_2float11, v_dst11); + v_dst20 = v_fma(v_1float20, v_2float20, v_dst20); + v_dst21 = v_fma(v_1float21, v_2float21, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); } } } - +#endif // CV_SIMD accProd_general_(src1, src2, dst, mask, len, cn, x); } void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; if (!mask) { int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x)); - v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4)); - } - } - else - { - v_uint32x4 v_0 = v_setzero_u32(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x); - v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); - - v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0)); - v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1)); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x); - v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); - - v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; - v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; - v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); - v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); - v_load_deinterleave(src1 + (x + 4) 
* cn, v_1src01, v_1src11, v_1src21); - v_load_deinterleave(src2 + (x + 4) * cn, v_2src01, v_2src11, v_2src21); - - v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); - - v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); - v_store_interleave(dst + (x + 4) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} -#if CV_SIMD128_64F -void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_1int = v_load_expand(src1 + x); - v_uint16x8 v_2int = v_load_expand(src2 + x); - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1int, v_1int_0, v_1int_1); - v_expand(v_2int, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else - { - v_uint16x8 v_0 = v_setzero_u16(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask; - v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask; - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1int, v_1int_0, v_1int_1); - v_expand(v_2int, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else if (cn == 3) - { - for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth) - { - 
v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; - v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); - v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - - v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy; - v_expand(v_1src0, v_1int0, dummy); - v_expand(v_1src1, v_1int1, dummy); - v_expand(v_1src2, v_1int2, dummy); - v_expand(v_2src0, v_2int0, dummy); - v_expand(v_2src1, v_2int1, dummy); - v_expand(v_2src2, v_2int2, dummy); - - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_1int0 = v_1int0 & v_mask; - v_1int1 = v_1int1 & v_mask; - v_1int2 = v_1int2 & v_mask; - v_2int0 = v_2int0 & v_mask; - v_2int1 = v_2int1 & v_mask; - v_2int2 = v_2int2 & v_mask; - - v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; - v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; - v_expand(v_1int0, v_1int00, v_1int01); - v_expand(v_1int1, v_1int10, v_1int11); - v_expand(v_1int2, v_1int20, v_1int21); - v_expand(v_2int0, v_2int00, v_2int01); - v_expand(v_2int1, v_2int10, v_2int11); - v_expand(v_2int2, v_2int20, v_2int21); - - v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64(v_reinterpret_as_s32(v_2int00)); - v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)); - v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64(v_reinterpret_as_s32(v_2int01)); - v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)); - v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64(v_reinterpret_as_s32(v_2int10)); - v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)); - v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64(v_reinterpret_as_s32(v_2int11)); - v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)); - v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64(v_reinterpret_as_s32(v_2int20)); - v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)); - v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64(v_reinterpret_as_s32(v_2int21)); - v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)); - - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} - -void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 
8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1src, v_1int_0, v_1int_1); - v_expand(v_2src, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else - { - v_uint16x8 v_0 = v_setzero_u16(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); - v_1src = v_1src & v_mask; - v_2src = v_2src & v_mask; - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1src, v_1int_0, v_1int_1); - v_expand(v_2src, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_mask = v_load_expand(mask + x); - v_mask = ~(v_mask == v_0); - v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; - v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); - v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; - - v_uint32x4 v_1int_00, v_1int_01, v_2int_00, v_2int_01; - v_uint32x4 v_1int_10, v_1int_11, v_2int_10, v_2int_11; - v_uint32x4 v_1int_20, v_1int_21, v_2int_20, v_2int_21; - v_expand(v_1src0, v_1int_00, v_1int_01); - v_expand(v_1src1, v_1int_10, v_1int_11); - v_expand(v_1src2, v_1int_20, v_1int_21); - v_expand(v_2src0, v_2int_00, v_2int_01); - v_expand(v_2src1, v_2int_10, v_2int_11); - 
v_expand(v_2src2, v_2int_20, v_2int_21); - - v_int32x4 v_1int00 = v_reinterpret_as_s32(v_1int_00); - v_int32x4 v_1int01 = v_reinterpret_as_s32(v_1int_01); - v_int32x4 v_1int10 = v_reinterpret_as_s32(v_1int_10); - v_int32x4 v_1int11 = v_reinterpret_as_s32(v_1int_11); - v_int32x4 v_1int20 = v_reinterpret_as_s32(v_1int_20); - v_int32x4 v_1int21 = v_reinterpret_as_s32(v_1int_21); - v_int32x4 v_2int00 = v_reinterpret_as_s32(v_2int_00); - v_int32x4 v_2int01 = v_reinterpret_as_s32(v_2int_01); - v_int32x4 v_2int10 = v_reinterpret_as_s32(v_2int_10); - v_int32x4 v_2int11 = v_reinterpret_as_s32(v_2int_11); - v_int32x4 v_2int20 = v_reinterpret_as_s32(v_2int_20); - v_int32x4 v_2int21 = v_reinterpret_as_s32(v_2int_21); - - v_float64x2 v_src00 = v_cvt_f64(v_1int00) * v_cvt_f64(v_2int00); - v_float64x2 v_src01 = v_cvt_f64_high(v_1int00) * v_cvt_f64_high(v_2int00); - v_float64x2 v_src02 = v_cvt_f64(v_1int01) * v_cvt_f64(v_2int01); - v_float64x2 v_src03 = v_cvt_f64_high(v_1int01) * v_cvt_f64_high(v_2int01); - v_float64x2 v_src10 = v_cvt_f64(v_1int10) * v_cvt_f64(v_2int10); - v_float64x2 v_src11 = v_cvt_f64_high(v_1int10) * v_cvt_f64_high(v_2int10); - v_float64x2 v_src12 = v_cvt_f64(v_1int11) * v_cvt_f64(v_2int11); - v_float64x2 v_src13 = v_cvt_f64_high(v_1int11) * v_cvt_f64_high(v_2int11); - v_float64x2 v_src20 = v_cvt_f64(v_1int20) * v_cvt_f64(v_2int20); - v_float64x2 v_src21 = v_cvt_f64_high(v_1int20) * v_cvt_f64_high(v_2int20); - v_float64x2 v_src22 = v_cvt_f64(v_1int21) * v_cvt_f64(v_2int21); - v_float64x2 v_src23 = v_cvt_f64_high(v_1int21) * v_cvt_f64_high(v_2int21); - - v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03; - v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13; - v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); - v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); - v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} - -void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float32x4 v_1src = v_load(src1 + x); - v_float32x4 v_2src = v_load(src2 + x); - - v_float64x2 v_1src0 = v_cvt_f64(v_1src); - v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); - v_float64x2 v_2src0 = v_cvt_f64(v_2src); - v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); - - v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); - } - } - else - { - v_uint32x4 v_0 = v_setzero_u32(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask = v_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); - v_float32x4 v_1src = v_load(src1 + x); - v_float32x4 v_2src = v_load(src2 + x); - v_1src = v_1src & v_reinterpret_as_f32(v_mask); - v_2src = v_2src & v_reinterpret_as_f32(v_mask); - - v_float64x2 v_1src0 = 
v_cvt_f64(v_1src); - v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); - v_float64x2 v_2src0 = v_cvt_f64(v_2src); - v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); - - v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask = v_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); - v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; - v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); - v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); - v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask); - v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); - v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); - v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); - v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); - - v_float64x2 v_src00 = v_cvt_f64(v_1src0) * v_cvt_f64(v_2src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_1src0) * v_cvt_f64_high(v_2src0); - v_float64x2 v_src10 = v_cvt_f64(v_1src1) * v_cvt_f64(v_2src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_1src1) * v_cvt_f64_high(v_2src1); - v_float64x2 v_src20 = v_cvt_f64(v_1src2) * v_cvt_f64(v_2src2); - v_float64x2 v_src21 = v_cvt_f64_high(v_1src2) * v_cvt_f64_high(v_2src2); - - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} - -void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float64x2 v_src00 = v_load(src1 + x); - v_float64x2 v_src01 = v_load(src1 + x + 2); - v_float64x2 v_src10 = v_load(src2 + x); - v_float64x2 v_src11 = v_load(src2 + x + 2); - - v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11)); - } - } - else - { - v_uint64x2 v_0 = v_setzero_u64(); - if (cn == 1) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; - v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - - v_float64x2 v_src00 = v_load(src1 + x); - v_float64x2 v_src01 = v_load(src1 + x + 2); - v_float64x2 v_src10 = v_load(src2 + x); - v_float64x2 v_src11 = v_load(src2 + x + 2); - - v_store(dst + x, v_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); - v_store(dst + x + 2, v_load(dst + x + 2) + ((v_src01 * v_src11) & v_mask1)); - } - } - else if (cn == 3) - { - for (; x <= len - cVectorWidth; x += cVectorWidth) - { - v_uint32x4 v_mask32 = v_load_expand_q(mask + x); - v_uint64x2 v_masku640, v_masku641; - v_expand(v_mask32, v_masku640, v_masku641); - v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); - - v_float64x2 v_1src00, v_1src01, v_1src10, 
v_1src11, v_1src20, v_1src21; - v_float64x2 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; - v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); - v_load_deinterleave(src1 + (x + 2) * cn, v_1src01, v_1src11, v_1src21); - v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); - v_load_deinterleave(src2 + (x + 2) * cn, v_2src01, v_2src11, v_2src21); - v_float64x2 v_src00 = (v_1src00 & v_mask0) * v_2src00; - v_float64x2 v_src01 = (v_1src01 & v_mask1) * v_2src01; - v_float64x2 v_src10 = (v_1src10 & v_mask0) * v_2src10; - v_float64x2 v_src11 = (v_1src11 & v_mask1) * v_2src11; - v_float64x2 v_src20 = (v_1src20 & v_mask0) * v_2src20; - v_float64x2 v_src21 = (v_1src21 & v_mask1) * v_2src21; - - v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; - v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); - v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); - - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); - } - } - } - - accProd_general_(src1, src2, dst, mask, len, cn, x); -} -#else -void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} - -void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} - -void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} - -void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) -{ - accProd_general_(src1, src2, dst, mask, len, cn, 0); -} -#endif - -// running weight accumulate optimized by universal intrinsic -void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float32x4 v_alpha = v_setall_f32((float)alpha); - const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = 16; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint8x16 v_src = v_load(src + x); - - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_float32x4 v_dst00 = v_load(dst + x); - v_float32x4 v_dst01 = v_load(dst + x + 4); - v_float32x4 v_dst10 = v_load(dst + x + 8); - v_float32x4 v_dst11 = v_load(dst + x + 12); - - v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); - v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); - v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); - v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); - - v_store(dst + x, v_dst00); - v_store(dst + x + 4, v_dst01); - v_store(dst + x + 8, v_dst10); - v_store(dst + x + 12, v_dst11); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float32x4 v_alpha = v_setall_f32((float)alpha); - const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = 
8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int0, v_int1; - v_expand(v_src, v_int0, v_int1); - - v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0)); - v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_alpha; - v_src1 = v_src1 * v_alpha; - - v_float32x4 v_dst0 = v_load(dst + x) * v_beta; - v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta; - - v_store(dst + x, v_dst0 + v_src0); - v_store(dst + x + 4, v_dst1 + v_src1); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float32x4 v_alpha = v_setall_f32((float)alpha); - const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha))); - v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha))); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} -#if CV_SIMD128_64F -void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_src16 = v_load_expand(src + x); - - v_uint32x4 v_int_0, v_int_1; - v_expand(v_src16, v_int_0, v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha); - v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha); - v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha); - v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha); - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int_0, v_int_1; - v_expand(v_src, v_int_0, v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src00 = v_cvt_f64(v_int0); - v_float64x2 v_src01 = v_cvt_f64_high(v_int0); - v_float64x2 v_src10 = v_cvt_f64(v_int1); - v_float64x2 v_src11 = v_cvt_f64_high(v_int1); - - v_float64x2 v_dst00 = v_load(dst + x); - v_float64x2 v_dst01 = v_load(dst + x + 2); - v_float64x2 v_dst10 = v_load(dst + x + 4); - v_float64x2 v_dst11 = v_load(dst + x + 6); - - v_dst00 = (v_dst00 * 
v_beta) + (v_src00 * v_alpha); - v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha); - v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha); - v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha); - - v_store(dst + x, v_dst00); - v_store(dst + x + 2, v_dst01); - v_store(dst + x + 4, v_dst10); - v_store(dst + x + 6, v_dst11); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); - v_float64x2 v_src00 = v_cvt_f64(v_src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_src0); - v_float64x2 v_src10 = v_cvt_f64(v_src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_src1); - - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha))); - v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha))); - v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha))); - v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha))); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} - -void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - int x = 0; - const v_float64x2 v_alpha = v_setall_f64(alpha); - const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for (; x <= size - cVectorWidth; x += cVectorWidth) - { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha))); - v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha))); - } - } - - accW_general_(src, dst, mask, len, cn, alpha, x); -} -#else -void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} - -void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} - -void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} - -void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) -{ - accW_general_(src, dst, mask, len, cn, alpha, 0); -} -#endif // CV_SIMD128_64F -#endif // CV_SIMD128 -#if CV_AVX -// accumulate optimized by AVX -void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - acc_general_(src, dst, mask, len, cn, x); - } - else - { - acc_simd_(src, dst, mask, len, cn); - } -} - -void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + 
x); - __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0)); - __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - acc_general_(src, dst, mask, len, cn, x); - } - else - { - acc_simd_(src, dst, mask, len, cn); - } -} - -void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256d v_src = _mm256_loadu_pd(src + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - v_dst = _mm256_add_pd(v_dst, v_src); - _mm256_storeu_pd(dst + x, v_dst); - } - acc_general_(src, dst, mask, len, cn, x); - } - else - { - acc_simd_(src, dst, mask, len, cn); - } -} - -// square accumulate optimized by avx -void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - v_src = _mm256_mul_ps(v_src, v_src); - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - accSqr_general_(src, dst, mask, len, cn, x); - } - else - { - accSqr_simd_(src, dst, mask, len, cn); - } -} - -void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); - __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - v_src0 = _mm256_mul_pd(v_src0, v_src0); - v_src1 = _mm256_mul_pd(v_src1, v_src1); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - accSqr_general_(src, dst, mask, len, cn, x); - } - else - { - accSqr_simd_(src, dst, mask, len, cn); - } -} - -void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 4; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) - { - __m256d v_src = _mm256_loadu_pd(src + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - v_src = _mm256_mul_pd(v_src, v_src); - v_dst = _mm256_add_pd(v_dst, v_src); - _mm256_storeu_pd(dst + x, v_dst); - } - accSqr_general_(src, dst, mask, len, cn, x); - } - else - { - accSqr_simd_(src, dst, mask, len, cn); - } -} - -// product accumulate optimized by avx -void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) -{ - int x = 0; - const int cVectorWidth = 8; - - if (!mask) - { - int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + #if CV_AVX && !CV_AVX2 + for (; x <= size - 8 ; x += 8) { __m256 v_src0 = _mm256_loadu_ps(src1 + x); __m256 v_src1 = _mm256_loadu_ps(src2 + x); @@ -2978,23 +2052,361 @@ void accProd_avx_32f(const float* src1, const float* 
src2, float* dst, const uch v_dst = _mm256_add_ps(v_src, v_dst); _mm256_storeu_ps(dst + x, v_dst); } - accProd_general_(src1, src2, dst, mask, len, cn, x); + #else + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_store(dst + x, v_fma(vx_load(src1 + x), vx_load(src2 + x), vx_load(dst + x))); + v_store(dst + x + step, v_fma(vx_load(src1 + x + step), vx_load(src2 + x + step), vx_load(dst + x + step))); + } + #endif // CV_AVX && !CV_AVX2 } else { - accProd_simd_(src1, src2, dst, mask, len, cn); + v_uint32 v_0 = vx_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); + v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + + v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); + v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); + v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); + v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + + v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; + v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); + v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); + v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21); + v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21); + + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); + v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); + } + } } +#endif // CV_SIMD + accProd_general_(src1, src2, dst, mask, len, cn, x); } -void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) +void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 8; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_1int = vx_load_expand(src1 + x); + v_uint16 v_2int = vx_load_expand(src2 + x); + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1int, v_1int_0, v_1int_1); + v_expand(v_2int, v_2int_0, v_2int_1); + + v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst 
+ x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else + { + v_uint16 v_0 = vx_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask; + v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask; + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1int, v_1int_0, v_1int_1); + v_expand(v_2int, v_2int_0, v_2int_1); + + v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth * 2; x += cVectorWidth) + { + v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + + v_uint16 v_1int0 = v_expand_low(v_1src0); + v_uint16 v_1int1 = v_expand_low(v_1src1); + v_uint16 v_1int2 = v_expand_low(v_1src2); + v_uint16 v_2int0 = v_expand_low(v_2src0); + v_uint16 v_2int1 = v_expand_low(v_2src1); + v_uint16 v_2int2 = v_expand_low(v_2src2); + + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_1int0 = v_1int0 & v_mask; + v_1int1 = v_1int1 & v_mask; + v_1int2 = v_1int2 & v_mask; + v_2int0 = v_2int0 & v_mask; + v_2int1 = v_2int1 & v_mask; + v_2int2 = v_2int2 & v_mask; + + v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; + v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; + v_expand(v_1int0, v_1int00, v_1int01); + v_expand(v_1int1, v_1int10, v_1int11); + v_expand(v_1int2, v_1int20, v_1int21); + v_expand(v_2int0, v_2int00, v_2int01); + v_expand(v_2int1, v_2int10, v_2int11); + v_expand(v_2int2, v_2int20, v_2int21); + + v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + + v_dst00 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int00)), v_cvt_f64(v_reinterpret_as_s32(v_2int00)), v_dst00); + v_dst01 = 
v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)), v_dst01); + v_dst02 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int01)), v_cvt_f64(v_reinterpret_as_s32(v_2int01)), v_dst02); + v_dst03 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)), v_dst03); + v_dst10 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int10)), v_cvt_f64(v_reinterpret_as_s32(v_2int10)), v_dst10); + v_dst11 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)), v_dst11); + v_dst12 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int11)), v_cvt_f64(v_reinterpret_as_s32(v_2int11)), v_dst12); + v_dst13 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)), v_dst13); + v_dst20 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int20)), v_cvt_f64(v_reinterpret_as_s32(v_2int20)), v_dst20); + v_dst21 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)), v_dst21); + v_dst22 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int21)), v_cvt_f64(v_reinterpret_as_s32(v_2int21)), v_dst22); + v_dst23 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)), v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + } + } + } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; +#if CV_SIMD_64F + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_1src = vx_load(src1 + x); + v_uint16 v_2src = vx_load(src2 + x); + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1src, v_1int_0, v_1int_1); + v_expand(v_2src, v_2int_0, v_2int_1); + + v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else + { + v_uint16 v_0 = vx_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16 v_1src = vx_load(src1 + x); + v_uint16 v_2src = vx_load(src2 + x); + v_1src = v_1src & v_mask; + v_2src = v_2src & v_mask; + + v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1src, v_1int_0, v_1int_1); + v_expand(v_2src, v_2int_0, v_2int_1); + + v_int32 v_1int0 = 
v_reinterpret_as_s32(v_1int_0); + v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0); + v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1); + v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2); + v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_mask = vx_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_mask; + v_1src1 = v_1src1 & v_mask; + v_1src2 = v_1src2 & v_mask; + v_2src0 = v_2src0 & v_mask; + v_2src1 = v_2src1 & v_mask; + v_2src2 = v_2src2 & v_mask; + + v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01; + v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11; + v_uint32 v_1int_20, v_1int_21, v_2int_20, v_2int_21; + v_expand(v_1src0, v_1int_00, v_1int_01); + v_expand(v_1src1, v_1int_10, v_1int_11); + v_expand(v_1src2, v_1int_20, v_1int_21); + v_expand(v_2src0, v_2int_00, v_2int_01); + v_expand(v_2src1, v_2int_10, v_2int_11); + v_expand(v_2src2, v_2int_20, v_2int_21); + + v_int32 v_1int00 = v_reinterpret_as_s32(v_1int_00); + v_int32 v_1int01 = v_reinterpret_as_s32(v_1int_01); + v_int32 v_1int10 = v_reinterpret_as_s32(v_1int_10); + v_int32 v_1int11 = v_reinterpret_as_s32(v_1int_11); + v_int32 v_1int20 = v_reinterpret_as_s32(v_1int_20); + v_int32 v_1int21 = v_reinterpret_as_s32(v_1int_21); + v_int32 v_2int00 = v_reinterpret_as_s32(v_2int_00); + v_int32 v_2int01 = v_reinterpret_as_s32(v_2int_01); + v_int32 v_2int10 = v_reinterpret_as_s32(v_2int_10); + v_int32 v_2int11 = v_reinterpret_as_s32(v_2int_11); + v_int32 v_2int20 = v_reinterpret_as_s32(v_2int_20); + v_int32 v_2int21 = v_reinterpret_as_s32(v_2int_21); + + v_float64 v_dst00, v_dst01, v_dst02, v_dst03; + v_float64 v_dst10, v_dst11, v_dst12, v_dst13; + v_float64 v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + + v_dst00 = v_fma(v_cvt_f64(v_1int00), v_cvt_f64(v_2int00), v_dst00); + v_dst01 = v_fma(v_cvt_f64_high(v_1int00), v_cvt_f64_high(v_2int00), v_dst01); + v_dst02 = v_fma(v_cvt_f64(v_1int01), v_cvt_f64(v_2int01), v_dst02); + v_dst03 = v_fma(v_cvt_f64_high(v_1int01), v_cvt_f64_high(v_2int01), v_dst03); + v_dst10 = v_fma(v_cvt_f64(v_1int10), v_cvt_f64(v_2int10), v_dst10); + v_dst11 = v_fma(v_cvt_f64_high(v_1int10), v_cvt_f64_high(v_2int10), v_dst11); + v_dst12 = v_fma(v_cvt_f64(v_1int11), v_cvt_f64(v_2int11), v_dst12); + v_dst13 = v_fma(v_cvt_f64_high(v_1int11), v_cvt_f64_high(v_2int11), v_dst13); + v_dst20 = v_fma(v_cvt_f64(v_1int20), v_cvt_f64(v_2int20), v_dst20); + v_dst21 = 
v_fma(v_cvt_f64_high(v_1int20), v_cvt_f64_high(v_2int20), v_dst21); + v_dst22 = v_fma(v_cvt_f64(v_1int21), v_cvt_f64(v_2int21), v_dst22); + v_dst23 = v_fma(v_cvt_f64_high(v_1int21), v_cvt_f64_high(v_2int21), v_dst23); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23); + } + } + } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; +#if CV_SIMD_64F + const int cVectorWidth = v_float32::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 8 ; x += 8) { __m256 v_1src = _mm256_loadu_ps(src1 + x); __m256 v_2src = _mm256_loadu_ps(src2 + x); @@ -3011,23 +2423,93 @@ void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const _mm256_storeu_pd(dst + x, v_dst0); _mm256_storeu_pd(dst + x + 4, v_dst1); } - accProd_general_(src1, src2, dst, mask, len, cn, x); + #else + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32 v_1src = vx_load(src1 + x); + v_float32 v_2src = vx_load(src2 + x); + + v_float64 v_1src0 = v_cvt_f64(v_1src); + v_float64 v_1src1 = v_cvt_f64_high(v_1src); + v_float64 v_2src0 = v_cvt_f64(v_2src); + v_float64 v_2src1 = v_cvt_f64_high(v_2src); + + v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step))); + } + #endif // CV_AVX && !CV_AVX2 } else { - accProd_simd_(src1, src2, dst, mask, len, cn); + v_uint32 v_0 = vx_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask = vx_load_expand_q(mask + x); + v_mask = ~(v_mask == v_0); + v_float32 v_1src = vx_load(src1 + x); + v_float32 v_2src = vx_load(src2 + x); + v_1src = v_1src & v_reinterpret_as_f32(v_mask); + v_2src = v_2src & v_reinterpret_as_f32(v_mask); + + v_float64 v_1src0 = v_cvt_f64(v_1src); + v_float64 v_1src1 = v_cvt_f64_high(v_1src); + v_float64 v_2src0 = v_cvt_f64(v_2src); + v_float64 v_2src1 = v_cvt_f64_high(v_2src); + + v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step))); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask = vx_load_expand_q(mask + x); + v_mask = ~(v_mask == v_0); + v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); + v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask); + v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); + v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); + v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); + v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); + + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_dst00 = v_fma(v_cvt_f64(v_1src0), v_cvt_f64(v_2src0), v_dst00); + v_dst01 = v_fma(v_cvt_f64_high(v_1src0), v_cvt_f64_high(v_2src0), v_dst01); + v_dst10 = 
v_fma(v_cvt_f64(v_1src1), v_cvt_f64(v_2src1), v_dst10); + v_dst11 = v_fma(v_cvt_f64_high(v_1src1), v_cvt_f64_high(v_2src1), v_dst11); + v_dst20 = v_fma(v_cvt_f64(v_1src2), v_cvt_f64(v_2src2), v_dst20); + v_dst21 = v_fma(v_cvt_f64_high(v_1src2), v_cvt_f64_high(v_2src2), v_dst21); + + v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + } + } } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); } -void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) +void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; - const int cVectorWidth = 4; +#if CV_SIMD_64F + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; if (!mask) { int size = len * cn; - for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + #if CV_AVX && !CV_AVX2 + for ( ; x <= size - 4 ; x += 4) { __m256d v_src0 = _mm256_loadu_pd(src1 + x); __m256d v_src1 = _mm256_loadu_pd(src2 + x); @@ -3036,18 +2518,157 @@ void accProd_avx_64f(const double* src1, const double* src2, double* dst, const v_dst = _mm256_add_pd(v_dst, v_src0); _mm256_storeu_pd(dst + x, v_dst); } - accProd_general_(src1, src2, dst, mask, len, cn, x); + #else + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64 v_src00 = vx_load(src1 + x); + v_float64 v_src01 = vx_load(src1 + x + step); + v_float64 v_src10 = vx_load(src2 + x); + v_float64 v_src11 = vx_load(src2 + x + step); + + v_store(dst + x, v_fma(v_src00, v_src10, vx_load(dst + x))); + v_store(dst + x + step, v_fma(v_src01, v_src11, vx_load(dst + x + step))); + } + #endif } else { - accProd_simd_(src1, src2, dst, mask, len, cn); + // todo: try fma + v_uint64 v_0 = vx_setzero_u64(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; + v_expand(v_mask32, v_masku640, v_masku641); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64 v_src00 = vx_load(src1 + x); + v_float64 v_src01 = vx_load(src1 + x + step); + v_float64 v_src10 = vx_load(src2 + x); + v_float64 v_src11 = vx_load(src2 + x + step); + + v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); + v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint32 v_mask32 = vx_load_expand_q(mask + x); + v_uint64 v_masku640, v_masku641; + v_expand(v_mask32, v_masku640, v_masku641); + v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; + v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); + v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21); + v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); + v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21); + v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00; + v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01; + v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10; + v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11; + 
v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20; + v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21; + + v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } } +#endif // CV_SIMD_64F + accProd_general_(src1, src2, dst, mask, len, cn, x); } -// running weight accumulate optimized by avx -void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) +// running weight accumulate optimized by universal intrinsic +void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; +#if CV_SIMD + const v_float32 v_alpha = vx_setall_f32((float)alpha); + const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = v_uint8::nlanes; + const int step = v_float32::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8 v_src = vx_load(src + x); + + v_uint16 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_uint32 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_float32 v_dst00 = vx_load(dst + x); + v_float32 v_dst01 = vx_load(dst + x + step); + v_float32 v_dst10 = vx_load(dst + x + step * 2); + v_float32 v_dst11 = vx_load(dst + x + step * 3); + + v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); + v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); + v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); + v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + step, v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } + } +#endif // CV_SIMD + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_SIMD + const v_float32 v_alpha = vx_setall_f32((float)alpha); + const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_float32 v_dst0 = vx_load(dst + x); + v_float32 v_dst1 = vx_load(dst + x + step); + v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + } + } +#endif // CV_SIMD + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_AVX && !CV_AVX2 const __m256 v_alpha = _mm256_set1_ps((float)alpha); const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha)); const int cVectorWidth = 16; @@ -3060,18 +2681,129 @@ void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, 
int _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha))); _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha))); } - accW_general_(src, dst, mask, len, cn, alpha, x); - } - else - { - accW_simd_(src, dst, mask, len, cn, alpha); } +#elif CV_SIMD + const v_float32 v_alpha = vx_setall_f32((float)alpha); + const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float32::nlanes; + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32 v_dst0 = vx_load(dst + x); + v_float32 v_dst1 = vx_load(dst + x + step); + + v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + } + } +#endif // CV_SIMD + accW_general_(src, dst, mask, len, cn, alpha, x); } -void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) +void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; +#if CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src16 = vx_load_expand(src + x); + + v_uint32 v_int_0, v_int_1; + v_expand(v_src16, v_int_0, v_int_1); + + v_int32 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64 v_src0 = v_cvt_f64(v_int0); + v_float64 v_src1 = v_cvt_f64_high(v_int0); + v_float64 v_src2 = v_cvt_f64(v_int1); + v_float64 v_src3 = v_cvt_f64_high(v_int1); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + v_float64 v_dst2 = vx_load(dst + x + step * 2); + v_float64 v_dst3 = vx_load(dst + x + step * 3); + + v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); + v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha); + v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + v_store(dst + x + step * 2, v_dst2); + v_store(dst + x + step * 3, v_dst3); + } + } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_uint16::nlanes; + const int step = v_float64::nlanes; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src = vx_load(src + x); + v_uint32 v_int_0, v_int_1; + v_expand(v_src, v_int_0, v_int_1); + + v_int32 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64 v_src00 = v_cvt_f64(v_int0); + v_float64 v_src01 = v_cvt_f64_high(v_int0); + v_float64 v_src10 = v_cvt_f64(v_int1); + v_float64 v_src11 = v_cvt_f64_high(v_int1); + + v_float64 v_dst00 = vx_load(dst + x); + v_float64 v_dst01 = vx_load(dst + x + step); + v_float64 v_dst10 = 
vx_load(dst + x + step * 2); + v_float64 v_dst11 = vx_load(dst + x + step * 3); + + v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); + v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); + v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); + v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + step, v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } + } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; +#if CV_AVX && !CV_AVX2 const __m256d v_alpha = _mm256_set1_pd(alpha); const __m256d v_beta = _mm256_set1_pd(1.0f - alpha); const int cVectorWidth = 16; @@ -3093,17 +2825,49 @@ void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha))); _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha))); } - accW_general_(src, dst, mask, len, cn, alpha, x); } - else +#elif CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_float32::nlanes * 2; + const int step = v_float64::nlanes; + + if (!mask) { - accW_simd_(src, dst, mask, len, cn, alpha); + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32 v_src0 = vx_load(src + x); + v_float32 v_src1 = vx_load(src + x + v_float32::nlanes); + v_float64 v_src00 = v_cvt_f64(v_src0); + v_float64 v_src01 = v_cvt_f64_high(v_src0); + v_float64 v_src10 = v_cvt_f64(v_src1); + v_float64 v_src11 = v_cvt_f64_high(v_src1); + + v_float64 v_dst00 = vx_load(dst + x); + v_float64 v_dst01 = vx_load(dst + x + step); + v_float64 v_dst10 = vx_load(dst + x + step * 2); + v_float64 v_dst11 = vx_load(dst + x + step * 3); + + v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha); + v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha); + v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha); + v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + step, v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); } -void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) +void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) { int x = 0; +#if CV_AVX && !CV_AVX2 const __m256d v_alpha = _mm256_set1_pd(alpha); const __m256d v_beta = _mm256_set1_pd(1.0f - alpha); const int cVectorWidth = 8; @@ -3119,14 +2883,35 @@ void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, in _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha))); _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha))); } - accW_general_(src, dst, mask, len, cn, alpha, x); } - else +#elif CV_SIMD_64F + const v_float64 v_alpha = vx_setall_f64(alpha); + const v_float64 v_beta = vx_setall_f64(1.0f - alpha); + const int cVectorWidth = v_float64::nlanes * 2; + const int step = v_float64::nlanes; + + if (!mask) { - accW_simd_(src, dst, mask, 
len, cn, alpha); + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64 v_src0 = vx_load(src + x); + v_float64 v_src1 = vx_load(src + x + step); + + v_float64 v_dst0 = vx_load(dst + x); + v_float64 v_dst1 = vx_load(dst + x + step); + + v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha); + v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + step, v_dst1); + } } +#endif // CV_SIMD_64F + accW_general_(src, dst, mask, len, cn, alpha, x); } -#endif + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 81cc548b40..f327d9f067 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1825,7 +1825,7 @@ void hlineSmooth1N(const uint8_t* src, int cn, const ufi const int VECSZ = v_uint16::nlanes; v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)); for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_mul*vx_load_expand(src + i)); + v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i))); #endif for (; i < lencn; i++) dst[i] = m[0] * src[i]; @@ -1915,7 +1915,9 @@ void hlineSmooth3N(const uint8_t* src, int cn, const ufi v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, vx_load_expand(src - cn) * v_mul0 + vx_load_expand(src) * v_mul1 + vx_load_expand(src + cn) * v_mul2); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) + + v_mul_wrap(vx_load_expand(src), v_mul1) + + v_mul_wrap(vx_load_expand(src + cn), v_mul2)); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; @@ -2089,7 +2091,8 @@ void hlineSmooth3Naba(const uint8_t* src, int cn, const v_uint16 v_mul0 = vx_setall_u16(_m[0]); v_uint16 v_mul1 = vx_setall_u16(_m[1]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul0 + vx_load_expand(src) * v_mul1); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) + + v_mul_wrap(vx_load_expand(src), v_mul1)); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])); @@ -2285,7 +2288,11 @@ void hlineSmooth5N(const uint8_t* src, int cn, const ufi v_uint16 v_mul3 = vx_setall_u16(_m[3]); v_uint16 v_mul4 = vx_setall_u16(_m[4]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, vx_load_expand(src - 2 * cn) * v_mul0 + vx_load_expand(src - cn) * v_mul1 + vx_load_expand(src) * v_mul2 + vx_load_expand(src + cn) * v_mul3 + vx_load_expand(src + 2 * cn) * v_mul4); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) + + v_mul_wrap(vx_load_expand(src - cn), v_mul1) + + v_mul_wrap(vx_load_expand(src), v_mul2) + + v_mul_wrap(vx_load_expand(src + cn), v_mul3) + + v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4)); #endif for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; @@ -2488,7 +2495,7 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons const int VECSZ = v_uint16::nlanes; v_uint16 v_6 = vx_setall_u16(6); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - 
v_store((uint16_t*)dst, (vx_load_expand(src) * v_6 + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); + v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4; @@ -2689,7 +2696,9 @@ void hlineSmooth5Nabcba(const uint8_t* src, int cn, cons v_uint16 v_mul1 = vx_setall_u16(_m[1]); v_uint16 v_mul2 = vx_setall_u16(_m[2]); for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) * v_mul0 + (vx_load_expand(src - cn) + vx_load_expand(src + cn))* v_mul1 + vx_load_expand(src) * v_mul2); + v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) + + v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) + + v_mul_wrap(vx_load_expand(src), v_mul2)); #endif for (; i < lencn; i++, src++, dst++) *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0]; @@ -2804,9 +2813,9 @@ void hlineSmooth(const uint8_t* src, int cn, const ufixe const int VECSZ = v_uint16::nlanes; for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ) { - v_uint16 v_res0 = vx_load_expand(src) * vx_setall_u16(*((uint16_t*)m)); + v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); for (int j = 1; j < n; j++) - v_res0 += vx_load_expand(src + j * cn) * vx_setall_u16(*((uint16_t*)(m + j))); + v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j)))); v_store((uint16_t*)dst, v_res0); } #endif @@ -2923,9 +2932,9 @@ void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, co const int VECSZ = v_uint16::nlanes; for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) { - v_uint16 v_res0 = vx_load_expand(src + pre_shift * cn) * vx_setall_u16(*((uint16_t*)(m + pre_shift))); + v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); for (int j = 0; j < pre_shift; j ++) - v_res0 += (vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn)) * vx_setall_u16(*((uint16_t*)(m + j))); + v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j)))); v_store((uint16_t*)dst, v_res0); } #endif diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 40026cd3c1..647f5e304b 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -93,7 +93,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10; + v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -131,7 +131,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s4 = v_load(trow1 + x + cn); v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10); + v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); 
            v_store_interleave((drow + x*2), t0, t1);
        }
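
For reference, a minimal self-contained sketch of the width-agnostic pattern the new accW_simd_ bodies follow: dst = dst*(1 - alpha) + src*alpha, processed v_float32::nlanes elements per iteration with v_fma(a, b, c) = a*b + c and a scalar tail for the remainder. The function name accW_sketch and its unmasked single-channel signature are illustrative assumptions, not code from this patch:

    #include "opencv2/core/hal/intrin.hpp"

    // Illustrative only: running-weight accumulate for one float channel,
    // written against the wide universal intrinsics used throughout the patch.
    static void accW_sketch(const float* src, float* dst, int len, float alpha)
    {
        using namespace cv;
        int x = 0;
    #if CV_SIMD
        const v_float32 v_alpha = vx_setall_f32(alpha);
        const v_float32 v_beta  = vx_setall_f32(1.0f - alpha);
        const int step = v_float32::nlanes;   // 4 for SSE2/NEON, 8 for AVX2
        for (; x <= len - step; x += step)
        {
            // v_fma(a, b, c) == a*b + c, fused where the target supports it
            v_float32 v_dst = v_fma(vx_load(dst + x), v_beta, vx_load(src + x) * v_alpha);
            v_store(dst + x, v_dst);
        }
    #endif
        for (; x < len; x++)                  // scalar tail, as in the accW_general_ fallbacks
            dst[x] = dst[x] * (1.0f - alpha) + src[x] * alpha;
    }

Because the lane count comes from v_float32::nlanes rather than a hard-coded 4 or 8, the same body compiles to SSE2, AVX2 or NEON depending on how intrin.hpp resolves CV_SIMD at build time, which is the point of replacing the fixed-width v_float32x4 / _mm256 variants above.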