diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index ee8310b5c5..88a002145a 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -745,7 +745,22 @@ namespace CV__SIMD_NAMESPACE { inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ return v_add(f1 + f2, vf...); \ } + #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \ + inline _Tpvec v_shr(const _Tpvec& a, int n) \ + { \ + return a >> n; \ + } \ + inline _Tpvec v_shl(const _Tpvec& a, int n) \ + { \ + return a << n; \ + } + OPENCV_HAL_WRAP_SHIFT_OP(v_uint16) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint32) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint64) + OPENCV_HAL_WRAP_SHIFT_OP(v_int16) + OPENCV_HAL_WRAP_SHIFT_OP(v_int32) + OPENCV_HAL_WRAP_SHIFT_OP(v_int64) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) @@ -769,6 +784,12 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2) + OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) #endif @@ -784,6 +805,12 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4) + OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16) + OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8) + OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4) #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) #endif @@ -801,7 +828,9 @@ namespace CV__SIMD_NAMESPACE { inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ { \ return a ^ b; \ - } \ + } + + #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \ inline _Tpvec v_not(const _Tpvec& a) \ { \ return ~a; \ @@ -815,6 +844,18 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32) + OPENCV_HAL_WRAP_NOT_OP(v_uint8) + OPENCV_HAL_WRAP_NOT_OP(v_uint16) + OPENCV_HAL_WRAP_NOT_OP(v_uint32) + OPENCV_HAL_WRAP_NOT_OP(v_uint64) + OPENCV_HAL_WRAP_NOT_OP(v_int8) + OPENCV_HAL_WRAP_NOT_OP(v_int16) + OPENCV_HAL_WRAP_NOT_OP(v_int32) + OPENCV_HAL_WRAP_NOT_OP(v_int64) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64) + #endif #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8) @@ -824,6 +865,18 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4) + OPENCV_HAL_WRAP_NOT_OP(v_uint8x16) + OPENCV_HAL_WRAP_NOT_OP(v_uint16x8) + OPENCV_HAL_WRAP_NOT_OP(v_uint32x4) + OPENCV_HAL_WRAP_NOT_OP(v_uint64x2) + OPENCV_HAL_WRAP_NOT_OP(v_int8x16) + OPENCV_HAL_WRAP_NOT_OP(v_int16x8) + OPENCV_HAL_WRAP_NOT_OP(v_int32x4) + OPENCV_HAL_WRAP_NOT_OP(v_int64x2) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2) + #endif #endif #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32) @@ -834,6 +887,18 @@ namespace CV__SIMD_NAMESPACE { 
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8) + OPENCV_HAL_WRAP_NOT_OP(v_uint8x32) + OPENCV_HAL_WRAP_NOT_OP(v_uint16x16) + OPENCV_HAL_WRAP_NOT_OP(v_uint32x8) + OPENCV_HAL_WRAP_NOT_OP(v_uint64x4) + OPENCV_HAL_WRAP_NOT_OP(v_int8x32) + OPENCV_HAL_WRAP_NOT_OP(v_int16x16) + OPENCV_HAL_WRAP_NOT_OP(v_int32x8) + OPENCV_HAL_WRAP_NOT_OP(v_int64x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4) + #endif #endif #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp index 7dd735f99a..914ad28978 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp @@ -45,6 +45,7 @@ OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8) OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8) OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8) OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32) +OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32) #if CV_SIMD_SCALABLE_64F OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32) #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 6c28b44f5b..a45c90cf90 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -475,6 +475,25 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2) #endif +#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ +inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \ +{ \ + v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits::vlanes()); \ + return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float) +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int) +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned) + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_lut(const double* tab, const v_int32& vidx) \ +{ \ + vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits::vlanes()); \ + return vloxei32(tab, vidx_, VTraits::vlanes()); \ +} +#endif + + inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } @@ -690,23 +709,27 @@ inline v_float64 v_not (const v_float64& a) \ ////////////// Bitwise shifts ////////////// +/* Usage +1. v_shl(vec); +2. v_shl(vec, N); // instead of vec << N, when N is non-constant. 
+*/
 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsrl(a, uint8_t(n), vl)); \
 }
 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsra(a, uint8_t(n), vl)); \
 }
diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp
index 0d2c394368..332b36646c 100644
--- a/modules/imgproc/src/bilateral_filter.simd.hpp
+++ b/modules/imgproc/src/bilateral_filter.simd.hpp
@@ -99,33 +99,33 @@ public:
             const uchar* ksptr2 = sptr + space_ofs[k+2];
             const uchar* ksptr3 = sptr + space_ofs[k+3];
             j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 kweight0 = vx_setall_f32(space_weight[k]);
             v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
             v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
             v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_uint32 rval = vx_load_expand_q(sptr + j);
                 v_uint32 val = vx_load_expand_q(ksptr0 + j);
-                v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                v_float32 w = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
                 v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
                 val = vx_load_expand_q(ksptr1 + j);
-                w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_wsum += w;
+                w = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_wsum = v_add(v_wsum, w);
                 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
                 val = vx_load_expand_q(ksptr2 + j);
-                w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_wsum += w;
+                w = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_wsum = v_add(v_wsum, w);
                 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
                 val = vx_load_expand_q(ksptr3 + j);
-                w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                v_wsum += w;
+                w = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                v_wsum = v_add(v_wsum, w);
                 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
                 v_store_aligned(wsum + j, v_wsum);
@@ -172,13 +172,13 @@ public:
         {
             const uchar* ksptr = sptr + space_ofs[k];
             j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
            v_float32 kweight = vx_setall_f32(space_weight[k]);
-            for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_uint32 val = vx_load_expand_q(ksptr + j);
-                v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-                v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                v_float32 w
= v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))))); + v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w)); v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); } #endif @@ -191,10 +191,10 @@ public: } } j = 0; -#if CV_SIMD - for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes) - v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), - v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= size.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) + v_pack_u_store(dptr + j, v_pack(v_round(v_div(vx_load_aligned(sum + j), vx_load_aligned(wsum + j))), + v_round(v_div(vx_load_aligned(sum + j + VTraits::vlanes()), vx_load_aligned(wsum + j + VTraits::vlanes()))))); #endif for (; j < size.width; j++) { @@ -221,13 +221,13 @@ public: const uchar* ksptr3 = sptr + space_ofs[k+3]; const uchar* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight0 = vx_setall_f32(space_weight[k]); v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes, - ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), rsptr += 3*VTraits::vlanes(), + ksptr0 += 3*VTraits::vlanes(), ksptr1 += 3*VTraits::vlanes(), ksptr2 += 3*VTraits::vlanes(), ksptr3 += 3*VTraits::vlanes()) { v_uint8 kb, kg, kr, rb, rg, rr; v_load_deinterleave(rsptr, rb, rg, rr); @@ -236,163 +236,163 @@ public: v_uint16 val0, val1, val2, val3, val4; v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_uint32 vall, valh; v_expand(val0, vall, valh); - v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_float32 w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall))); + v_float32 w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g 
+ j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); + v_store_aligned(sum_r + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*VTraits::vlanes()))); v_load_deinterleave(ksptr1, kb, kg, kr); 
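Every hunk in this file applies the same mechanical rewrite: the compile-time lane count v_float32::nlanes becomes a runtime query VTraits<v_float32>::vlanes(), and overloaded operators become the function-style wrappers v_add/v_mul/v_div, since the sizeless RVV types behind CV_SIMD_SCALABLE provide neither nlanes nor operator overloads. A minimal sketch of that style (the helper name scale_accumulate is hypothetical and not part of this patch, and it assumes the universal-intrinsics header can be included directly):

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// dst[j] += k * src[j], written in the scalable-friendly style this patch adopts
static void scale_accumulate(const float* src, float* dst, float k, int n)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vl = VTraits<v_float32>::vlanes();   // lane count is a runtime value on RVV
    v_float32 vk = vx_setall_f32(k);
    for (; j <= n - vl; j += vl)
    {
        v_float32 s = vx_load(src + j);
        v_store(dst + j, v_add(vx_load(dst + j), v_mul(s, vk)));  // no operator+ / operator*
    }
#endif
    for (; j < n; j++)   // scalar tail
        dst[j] += k * src[j];
}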
v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(val0, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, 
vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes()))); v_load_deinterleave(ksptr2, kb, kg, kr); v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(val0, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j, 
v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes()))); v_load_deinterleave(ksptr3, kb, kg, kr); v_expand(v_absdiff(kb, rb), val0, val1); v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; + val0 = v_add(val0, val2); val1 = v_add(val1, val3); v_expand(val0, vall, valh); - w0 = kweight3 * v_lut(color_weight, 
v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); v_expand(kb, val0, val2); v_expand(val0, vall, valh); v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); v_expand(kg, val0, val3); v_expand(val0, vall, valh); v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); v_expand(kr, val0, val4); v_expand(val0, vall, valh); v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); v_expand(val1, vall, valh); - w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall))); + w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh))); + v_store_aligned(wsum + j + 2 * VTraits::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3 * VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))); v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 
2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))); v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 2 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3 * VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes()))); } #endif #if CV_SIMD128 @@ -442,9 +442,9 @@ public: const uchar* ksptr = sptr + space_ofs[k]; const uchar* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), ksptr += 3*VTraits::vlanes(), rsptr += 3*VTraits::vlanes()) { v_uint8 kb, kg, kr, rb, rg, rr; v_load_deinterleave(ksptr, kb, kg, kr); @@ -456,39 +456,39 @@ public: v_expand(v_absdiff(kr, rr), r_l, r_h); v_uint32 val0, val1, val2, val3; - v_expand(b_l + g_l + r_l, val0, val1); - v_expand(b_h + g_h + r_h, val2, val3); + v_expand(v_add(v_add(b_l, g_l), r_l), val0, val1); + v_expand(v_add(v_add(b_h, g_h), r_h), val2, val3); v_expand(kb, b_l, b_h); v_expand(kg, g_l, g_h); v_expand(kr, r_l, r_h); - v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); - v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); - v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); - v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); - v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); - v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); + v_float32 w0 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val0))); + v_float32 w1 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val1))); + v_float32 w2 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val2))); + v_float32 w3 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val3))); + v_store_aligned(wsum + j , v_add(w0, vx_load_aligned(wsum + j))); + v_store_aligned(wsum + j + VTraits::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits::vlanes()))); + v_store_aligned(wsum + j + 2*VTraits::vlanes(), v_add(w2, vx_load_aligned(wsum + j + 2 * VTraits::vlanes()))); + v_store_aligned(wsum + j + 3*VTraits::vlanes(), v_add(w3, vx_load_aligned(wsum + j + 3 * VTraits::vlanes()))); v_expand(b_l, val0, val1); v_expand(b_h, val2, val3); 
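The weight computation recurring in these hunks is a table gather: per-channel absolute differences are widened to 32-bit lane indices and looked up in the precomputed color_weight table through the vector-index v_lut overload, the same overload this patch adds to the RVV backend via OPENCV_HAL_IMPL_RVV_LUT_VEC. A self-contained sketch of that step with hypothetical names (weighted_accumulate, table), not taken from the patch itself:

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// acc[j] += k * table[|a[j] - b[j]|] for 8-bit inputs, via the vector-index v_lut
static void weighted_accumulate(const uchar* a, const uchar* b,
                                const float* table, float k, float* acc, int n)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_float32 vk = vx_setall_f32(k);
    for (; j <= n - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
    {
        // quarter-width uchar loads, zero-extended straight to 32-bit lanes
        v_uint32 d = v_absdiff(vx_load_expand_q(a + j), vx_load_expand_q(b + j));
        v_float32 w = v_mul(vk, v_lut(table, v_reinterpret_as_s32(d)));
        v_store(acc + j, v_add(vx_load(acc + j), w));
    }
#endif
    for (; j < n; j++)
        acc[j] += k * table[a[j] > b[j] ? a[j] - b[j] : b[j] - a[j]];
}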
v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); - v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); + v_store_aligned(sum_b + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + VTraits::vlanes()))); + v_store_aligned(sum_b + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_b + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*VTraits::vlanes()))); v_expand(g_l, val0, val1); v_expand(g_h, val2, val3); v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); - v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); + v_store_aligned(sum_g + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + VTraits::vlanes()))); + v_store_aligned(sum_g + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_g + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*VTraits::vlanes()))); v_expand(r_l, val0, val1); v_expand(r_h, val2, val3); v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); + v_store_aligned(sum_r + j + VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + VTraits::vlanes()))); + v_store_aligned(sum_r + j + 2*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*VTraits::vlanes()))); + v_store_aligned(sum_r + j + 3*VTraits::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*VTraits::vlanes()))); } #endif for(; j < size.width; j++, ksptr += 3, rsptr += 3) @@ -500,27 +500,27 @@ public: } } j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_one = vx_setall_f32(1.f); - for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) + for(; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), 
dptr += 3*VTraits::vlanes()) { - v_float32 w0 = v_one / vx_load_aligned(wsum + j); - v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); - v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); - v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); + v_float32 w0 = v_div(v_one, vx_load_aligned(wsum + j)); + v_float32 w1 = v_div(v_one, vx_load_aligned(wsum + j + VTraits::vlanes())); + v_float32 w2 = v_div(v_one, vx_load_aligned(wsum + j + 2 * VTraits::vlanes())); + v_float32 w3 = v_div(v_one, vx_load_aligned(wsum + j + 3 * VTraits::vlanes())); - v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), - v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), - v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), - v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); + v_store_interleave(dptr, v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_b + j))), + v_round(v_mul(w1, vx_load_aligned(sum_b + j + VTraits::vlanes())))), + v_pack(v_round(v_mul(w2, vx_load_aligned(sum_b + j + 2 * VTraits::vlanes()))), + v_round(v_mul(w3, vx_load_aligned(sum_b + j + 3 * VTraits::vlanes()))))), + v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_g + j))), + v_round(v_mul(w1, vx_load_aligned(sum_g + j + VTraits::vlanes())))), + v_pack(v_round(v_mul(w2, vx_load_aligned(sum_g + j + 2 * VTraits::vlanes()))), + v_round(v_mul(w3, vx_load_aligned(sum_g + j + 3 * VTraits::vlanes()))))), + v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_r + j))), + v_round(v_mul(w1, vx_load_aligned(sum_r + j + VTraits::vlanes())))), + v_pack(v_round(v_mul(w2, vx_load_aligned(sum_r + j + 2 * VTraits::vlanes()))), + v_round(v_mul(w3, vx_load_aligned(sum_r + j + 3 * VTraits::vlanes())))))); } #endif for(; j < size.width; j++) @@ -533,7 +533,7 @@ public: } } } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } @@ -589,7 +589,7 @@ public: memset(buf.data(), 0, buf.size() * sizeof(float)); float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_one = vx_setall_f32(1.f); v_float32 sindex = vx_setall_f32(scale_index); #endif @@ -601,50 +601,50 @@ public: const float* ksptr2 = sptr + space_ofs[k + 2]; const float* ksptr3 = sptr + space_ofs[k + 3]; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight0 = vx_setall_f32(space_weight[k]); v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 rval = vx_load(sptr + j); v_float32 val = vx_load(ksptr0 + j); v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + v_float32 alpha = 
v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j)); + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w); + v_float32 v_sum = v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j)); val = vx_load(ksptr1 + j); knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum = v_muladd(v_and(val, knan), w, v_sum); val = vx_load(ksptr2 + j); knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum = v_muladd(v_and(val, knan), w, v_sum); val = vx_load(ksptr3 + j); knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum = v_muladd(v_and(val, knan), w, v_sum); v_store_aligned(wsum + j, v_wsum); v_store_aligned(sum + j, v_sum); @@ -720,20 +720,20 @@ public: { const float* ksptr = sptr + space_ofs[k]; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 val = vx_load(ksptr + j); v_float32 rval = vx_load(sptr + j); v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); + alpha = v_sub(alpha, v_cvt_f32(idx)); - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * 
(v_one-alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j))); + v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w)); + v_store_aligned(sum + j, v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j))); } #endif for (; j < size.width; j++) @@ -752,11 +752,11 @@ public: } } j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_val = vx_load(sptr + j); - v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val)))); + v_store(dptr + j, v_div(v_add(vx_load_aligned(sum + j), v_and(v_val, v_not_nan(v_val))), v_add(vx_load_aligned(wsum + j), v_and(v_one, v_not_nan(v_val))))); } #endif for (; j < size.width; j++) @@ -774,7 +774,7 @@ public: float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_one = vx_setall_f32(1.f); v_float32 sindex = vx_setall_f32(scale_index); #endif @@ -787,60 +787,60 @@ public: const float* ksptr3 = sptr + space_ofs[k+3]; const float* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight0 = vx_setall_f32(space_weight[k]); v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes, - ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), rsptr += 3 * VTraits::vlanes(), + ksptr0 += 3 * VTraits::vlanes(), ksptr1 += 3 * VTraits::vlanes(), ksptr2 += 3 * VTraits::vlanes(), ksptr3 += 3 * VTraits::vlanes()) { v_float32 kb, kg, kr, rb, rg, rr; v_load_deinterleave(rsptr, rb, rg, rr); v_load_deinterleave(ksptr0, kb, kg, kr); - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)); - v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)); - v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)); + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_float32 v_wsum = 
v_add(vx_load_aligned(wsum + j), w); + v_float32 v_sum_b = v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j)); + v_float32 v_sum_g = v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j)); + v_float32 v_sum_r = v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j)); v_load_deinterleave(ksptr1, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b); + v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g); + v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r); v_load_deinterleave(ksptr2, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b); + v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g); + v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r); v_load_deinterleave(ksptr3, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); + alpha = v_sub(alpha, v_cvt_f32(idx)); + w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_wsum = v_add(v_wsum, w); + v_sum_b 
= v_muladd(v_and(kb, knan), w, v_sum_b); + v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g); + v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r); v_store_aligned(wsum + j, v_wsum); v_store_aligned(sum_b + j, v_sum_b); @@ -938,24 +938,24 @@ public: const float* ksptr = sptr + space_ofs[k]; const float* rsptr = sptr; j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), ksptr += 3*VTraits::vlanes(), rsptr += 3*VTraits::vlanes()) { v_float32 kb, kg, kr, rb, rg, rr; v_load_deinterleave(ksptr, kb, kg, kr); v_load_deinterleave(rsptr, rb, rg, rr); - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); + alpha = v_sub(alpha, v_cvt_f32(idx)); - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j))); + v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan); + v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w)); + v_store_aligned(sum_b + j, v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_g + j, v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_r + j, v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j))); } #endif for (; j < size.width; j++, ksptr += 3, rsptr += 3) @@ -978,14 +978,14 @@ public: } } j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= size.width - VTraits::vlanes(); j += VTraits::vlanes(), sptr += 3*VTraits::vlanes(), dptr += 3*VTraits::vlanes()) { v_float32 b, g, r; v_load_deinterleave(sptr, b, g, r); - v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r); - v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask)); - v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w); + v_float32 mask = v_and(v_and(v_not_nan(b), v_not_nan(g)), v_not_nan(r)); + v_float32 w = v_div(v_one, v_add(vx_load_aligned(wsum + j), v_and(v_one, mask))); + v_store_interleave(dptr, v_mul(v_add(vx_load_aligned(sum_b + j), v_and(b, mask)), w), v_mul(v_add(vx_load_aligned(sum_g + j), v_and(g, mask)), w), v_mul(v_add(vx_load_aligned(sum_r + j), v_and(r, mask)), w)); } #endif for (; j < size.width; j++) @@ -1011,7 +1011,7 @@ public: } } } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) vx_cleanup(); #endif } diff 
--git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index 3b18944a0c..d111efdc47 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -56,40 +56,38 @@ template static inline _Tp splineInterpolate(_Tp x, const _Tp* tab return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0]; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template static inline cv::v_float32 splineInterpolate(const cv::v_float32& x, const _Tp* tab, int n) { using namespace cv; v_int32 ix = v_min(v_max(v_trunc(x), vx_setzero_s32()), vx_setall_s32(n-1)); - cv::v_float32 xx = x - v_cvt_f32(ix); - ix = ix << 2; + cv::v_float32 xx = v_sub(x, v_cvt_f32(ix)); + ix = v_shl<2>(ix); - v_float32 t[4]; + v_float32 t0, t1, t2, t3; // assume that v_float32::nlanes == v_int32::nlanes - if(v_float32::nlanes == 4) + if(VTraits::vlanes() == 4) { -#if CV_SIMD_WIDTH == 16 int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4]; v_store_aligned(idx, ix); - v_float32x4 tt[4]; - tt[0] = v_load(tab + idx[0]); - tt[1] = v_load(tab + idx[1]); - tt[2] = v_load(tab + idx[2]); - tt[3] = v_load(tab + idx[3]); - v_transpose4x4(tt[0], tt[1], tt[2], tt[3], - t[0], t[1], t[2], t[3]); -#endif + v_float32 tt0, tt1, tt2, tt3; + tt0 = vx_load(tab + idx[0]); + tt1 = vx_load(tab + idx[1]); + tt2 = vx_load(tab + idx[2]); + tt3 = vx_load(tab + idx[3]); + v_transpose4x4(tt0, tt1, tt2, tt3, + t0, t1, t2, t3); } else { - t[0] = v_lut(tab + 0, ix); - t[1] = v_lut(tab + 1, ix); - t[2] = v_lut(tab + 2, ix); - t[3] = v_lut(tab + 3, ix); + t0 = v_lut(tab + 0, ix); + t1 = v_lut(tab + 1, ix); + t2 = v_lut(tab + 2, ix); + t3 = v_lut(tab + 3, ix); } - return v_fma(v_fma(v_fma(t[3], xx, t[2]), xx, t[1]), xx, t[0]); + return v_fma(v_fma(v_fma(t3, xx, t2), xx, t1), xx, t0); } #endif @@ -207,8 +205,8 @@ struct RGB2XYZ_f C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; int i = 0; -#if CV_SIMD - const int vsize = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5); v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); @@ -226,9 +224,9 @@ struct RGB2XYZ_f } v_float32 x, y, z; - x = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); - y = v_fma(b, vc3, v_fma(g, vc4, r*vc5)); - z = v_fma(b, vc6, v_fma(g, vc7, r*vc8)); + x = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2))); + y = v_fma(b, vc3, v_fma(g, vc4, v_mul(r, vc5))); + z = v_fma(b, vc6, v_fma(g, vc7, v_mul(r, vc8))); v_store_interleave(dst, x, y, z); } @@ -313,8 +311,8 @@ struct RGB2XYZ_i C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); int descaleShift = 1 << (shift-1); v_int16 vdescale = vx_setall_s16((short)descaleShift); v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1; @@ -349,27 +347,36 @@ struct RGB2XYZ_i sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1); sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1); - v_int16 bg[4], rd[4]; - v_zip(sb0, sg0, bg[0], bg[1]); - v_zip(sb1, sg1, bg[2], bg[3]); - v_zip(sr0, vdescale, rd[0], rd[1]); - v_zip(sr1, vdescale, rd[2], rd[3]); + v_int16 bg0, bg1, bg2, bg3, rd0, rd1, rd2, rd3; + v_zip(sb0, sg0, bg0, bg1); + v_zip(sb1, sg1, bg2, bg3); + v_zip(sr0, vdescale, rd0, rd1); + v_zip(sr1, 
vdescale, rd2, rd3); - v_uint32 vx[4], vy[4], vz[4]; - for(int j = 0; j < 4; j++) - { - vx[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cxbg) + v_dotprod(rd[j], cxr1)) >> shift; - vy[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cybg) + v_dotprod(rd[j], cyr1)) >> shift; - vz[j] = v_reinterpret_as_u32(v_dotprod(bg[j], czbg) + v_dotprod(rd[j], czr1)) >> shift; - } + v_uint32 vx0, vx1, vx2, vx3; + v_uint32 vy0, vy1, vy2, vy3; + v_uint32 vz0, vz1, vz2, vz3; + + vx0 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)))); + vy0 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)))); + vz0 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)))); + vx1 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)))); + vy1 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)))); + vz1 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)))); + vx2 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cxbg), v_dotprod(rd2, cxr1)))); + vy2 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cybg), v_dotprod(rd2, cyr1)))); + vz2 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg2, czbg), v_dotprod(rd2, czr1)))); + vx3 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cxbg), v_dotprod(rd3, cxr1)))); + vy3 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cybg), v_dotprod(rd3, cyr1)))); + vz3 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg3, czbg), v_dotprod(rd3, czr1)))); v_uint16 x0, x1, y0, y1, z0, z1; - x0 = v_pack(vx[0], vx[1]); - x1 = v_pack(vx[2], vx[3]); - y0 = v_pack(vy[0], vy[1]); - y1 = v_pack(vy[2], vy[3]); - z0 = v_pack(vz[0], vz[1]); - z1 = v_pack(vz[2], vz[3]); + x0 = v_pack(vx0, vx1); + x1 = v_pack(vx2, vx3); + y0 = v_pack(vy0, vy1); + y1 = v_pack(vy2, vy3); + z0 = v_pack(vz0, vz1); + z1 = v_pack(vz2, vz3); v_uint8 x, y, z; x = v_pack(x0, x1); @@ -424,8 +431,8 @@ struct RGB2XYZ_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_int16 vdescale = vx_setall_s16(descaleShift); v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); @@ -464,29 +471,29 @@ struct RGB2XYZ_i v_int16 ymr, ymg, ymb; v_int16 zmr, zmg, zmb; - v_int16 mr = sr < zero, mg = sg < zero, mb = sb < zero; + v_int16 mr = v_lt(sr, zero), mg = v_lt(sg, zero), mb = v_lt(sb, zero); - xmb = mb & vc0; - xmg = mg & vc1; - xmr = mr & vc2; - ymb = mb & vc3; - ymg = mg & vc4; - ymr = mr & vc5; - zmb = mb & vc6; - zmg = mg & vc7; - zmr = mr & vc8; + xmb = v_and(mb, vc0); + xmg = v_and(mg, vc1); + xmr = v_and(mr, vc2); + ymb = v_and(mb, vc3); + ymg = v_and(mg, vc4); + ymr = v_and(mr, vc5); + zmb = v_and(mb, vc6); + zmg = v_and(mg, vc7); + zmr = v_and(mr, vc8); v_int32 xfix0, xfix1, yfix0, yfix1, zfix0, zfix1; - v_expand(xmr + xmg + xmb, xfix0, xfix1); - v_expand(ymr + ymg + ymb, yfix0, yfix1); - v_expand(zmr + zmg + zmb, zfix0, zfix1); + v_expand(v_add(v_add(xmr, xmg), xmb), xfix0, xfix1); + v_expand(v_add(v_add(ymr, ymg), ymb), yfix0, yfix1); + v_expand(v_add(v_add(zmr, zmg), zmb), zfix0, zfix1); - xfix0 = xfix0 << 16; - xfix1 = xfix1 << 16; - yfix0 = yfix0 << 16; - yfix1 = yfix1 << 16; - zfix0 = zfix0 << 16; - zfix1 = zfix1 << 16; + xfix0 = v_shl<16>(xfix0); + xfix1 = v_shl<16>(xfix1); + yfix0 = 
v_shl<16>(yfix0); + yfix1 = v_shl<16>(yfix1); + zfix0 = v_shl<16>(zfix0); + zfix1 = v_shl<16>(zfix1); v_int16 bg0, bg1, rd0, rd1; v_zip(sb, sg, bg0, bg1); @@ -494,12 +501,12 @@ struct RGB2XYZ_i v_uint32 x0, x1, y0, y1, z0, z1; - x0 = v_reinterpret_as_u32(v_dotprod(bg0, cxbg) + v_dotprod(rd0, cxr1) + xfix0) >> shift; - x1 = v_reinterpret_as_u32(v_dotprod(bg1, cxbg) + v_dotprod(rd1, cxr1) + xfix1) >> shift; - y0 = v_reinterpret_as_u32(v_dotprod(bg0, cybg) + v_dotprod(rd0, cyr1) + yfix0) >> shift; - y1 = v_reinterpret_as_u32(v_dotprod(bg1, cybg) + v_dotprod(rd1, cyr1) + yfix1) >> shift; - z0 = v_reinterpret_as_u32(v_dotprod(bg0, czbg) + v_dotprod(rd0, czr1) + zfix0) >> shift; - z1 = v_reinterpret_as_u32(v_dotprod(bg1, czbg) + v_dotprod(rd1, czr1) + zfix1) >> shift; + x0 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)), xfix0))); + x1 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)), xfix1))); + y0 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)), yfix0))); + y1 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)), yfix1))); + z0 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)), zfix0))); + z1 = v_shr(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)), zfix1))); v_uint16 x, y, z; x = v_pack(x0, x1); @@ -593,8 +600,8 @@ struct XYZ2RGB_f C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; int i = 0; -#if CV_SIMD - const int vsize = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_float32 valpha = vx_setall_f32(alpha); v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5); @@ -606,9 +613,9 @@ struct XYZ2RGB_f v_load_deinterleave(src, x, y, z); v_float32 b, g, r; - b = v_fma(x, vc0, v_fma(y, vc1, z*vc2)); - g = v_fma(x, vc3, v_fma(y, vc4, z*vc5)); - r = v_fma(x, vc6, v_fma(y, vc7, z*vc8)); + b = v_fma(x, vc0, v_fma(y, vc1, v_mul(z, vc2))); + g = v_fma(x, vc3, v_fma(y, vc4, v_mul(z, vc5))); + r = v_fma(x, vc6, v_fma(y, vc7, v_mul(z, vc8))); if(dcn == 4) { @@ -707,8 +714,8 @@ struct XYZ2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift - 1); v_uint8 valpha = vx_setall_u8(alpha); v_int16 vdescale = vx_setall_s16(descaleShift); @@ -739,25 +746,35 @@ struct XYZ2RGB_i z0 = v_reinterpret_as_s16(uz0); z1 = v_reinterpret_as_s16(uz1); - v_int32 b[4], g[4], r[4]; + v_int32 bb0, bb1, bb2, bb3, + gg0, gg1, gg2, gg3, + rr0, rr1, rr2, rr3; - v_int16 xy[4], zd[4]; - v_zip(x0, y0, xy[0], xy[1]); - v_zip(x1, y1, xy[2], xy[3]); - v_zip(z0, vdescale, zd[0], zd[1]); - v_zip(z1, vdescale, zd[2], zd[3]); + v_int16 xy0, xy1, xy2, xy3; + v_int16 zd0, zd1, zd2, zd3; - for(int j = 0; j < 4; j++) - { - b[j] = (v_dotprod(xy[j], cbxy) + v_dotprod(zd[j], cbz1)) >> shift; - g[j] = (v_dotprod(xy[j], cgxy) + v_dotprod(zd[j], cgz1)) >> shift; - r[j] = (v_dotprod(xy[j], crxy) + v_dotprod(zd[j], crz1)) >> shift; - } + v_zip(x0, y0, xy0, xy1); + v_zip(x1, y1, xy2, xy3); + v_zip(z0, vdescale, zd0, zd1); + v_zip(z1, vdescale, zd2, zd3); + + bb0 = v_shr(v_add(v_dotprod(xy0, 
cbxy), v_dotprod(zd0, cbz1))); + gg0 = v_shr(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1))); + rr0 = v_shr(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1))); + bb1 = v_shr(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1))); + gg1 = v_shr(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1))); + rr1 = v_shr(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1))); + bb2 = v_shr(v_add(v_dotprod(xy2, cbxy), v_dotprod(zd2, cbz1))); + gg2 = v_shr(v_add(v_dotprod(xy2, cgxy), v_dotprod(zd2, cgz1))); + rr2 = v_shr(v_add(v_dotprod(xy2, crxy), v_dotprod(zd2, crz1))); + bb3 = v_shr(v_add(v_dotprod(xy3, cbxy), v_dotprod(zd3, cbz1))); + gg3 = v_shr(v_add(v_dotprod(xy3, cgxy), v_dotprod(zd3, cgz1))); + rr3 = v_shr(v_add(v_dotprod(xy3, crxy), v_dotprod(zd3, crz1))); v_uint16 b0, b1, g0, g1, r0, r1; - b0 = v_pack_u(b[0], b[1]); b1 = v_pack_u(b[2], b[3]); - g0 = v_pack_u(g[0], g[1]); g1 = v_pack_u(g[2], g[3]); - r0 = v_pack_u(r[0], r[1]); r1 = v_pack_u(r[2], r[3]); + b0 = v_pack_u(bb0, bb1); b1 = v_pack_u(bb2, bb3); + g0 = v_pack_u(gg0, gg1); g1 = v_pack_u(gg2, gg3); + r0 = v_pack_u(rr0, rr1); r1 = v_pack_u(rr2, rr3); v_uint8 bb, gg, rr; bb = v_pack(b0, b1); @@ -820,8 +837,8 @@ struct XYZ2RGB_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_uint16 valpha = vx_setall_u16(alpha); v_int16 vdescale = vx_setall_s16(descaleShift); @@ -850,30 +867,30 @@ struct XYZ2RGB_i sz = v_reinterpret_as_s16(z); // fixing 16bit signed multiplication - v_int16 mx = sx < zero, my = sy < zero, mz = sz < zero; + v_int16 mx = v_lt(sx, zero), my = v_lt(sy, zero), mz = v_lt(sz, zero); v_int16 bmx, bmy, bmz; v_int16 gmx, gmy, gmz; v_int16 rmx, rmy, rmz; - bmx = mx & vc0; - bmy = my & vc1; - bmz = mz & vc2; - gmx = mx & vc3; - gmy = my & vc4; - gmz = mz & vc5; - rmx = mx & vc6; - rmy = my & vc7; - rmz = mz & vc8; + bmx = v_and(mx, vc0); + bmy = v_and(my, vc1); + bmz = v_and(mz, vc2); + gmx = v_and(mx, vc3); + gmy = v_and(my, vc4); + gmz = v_and(mz, vc5); + rmx = v_and(mx, vc6); + rmy = v_and(my, vc7); + rmz = v_and(mz, vc8); v_int32 bfix0, bfix1, gfix0, gfix1, rfix0, rfix1; - v_expand(bmx + bmy + bmz, bfix0, bfix1); - v_expand(gmx + gmy + gmz, gfix0, gfix1); - v_expand(rmx + rmy + rmz, rfix0, rfix1); + v_expand(v_add(v_add(bmx, bmy), bmz), bfix0, bfix1); + v_expand(v_add(v_add(gmx, gmy), gmz), gfix0, gfix1); + v_expand(v_add(v_add(rmx, rmy), rmz), rfix0, rfix1); - bfix0 = bfix0 << 16; bfix1 = bfix1 << 16; - gfix0 = gfix0 << 16; gfix1 = gfix1 << 16; - rfix0 = rfix0 << 16; rfix1 = rfix1 << 16; + bfix0 = v_shl<16>(bfix0); bfix1 = v_shl<16>(bfix1); + gfix0 = v_shl<16>(gfix0); gfix1 = v_shl<16>(gfix1); + rfix0 = v_shl<16>(rfix0); rfix1 = v_shl<16>(rfix1); v_int16 xy0, xy1, zd0, zd1; v_zip(sx, sy, xy0, xy1); @@ -881,12 +898,12 @@ struct XYZ2RGB_i v_int32 b0, b1, g0, g1, r0, r1; - b0 = (v_dotprod(xy0, cbxy) + v_dotprod(zd0, cbz1) + bfix0) >> shift; - b1 = (v_dotprod(xy1, cbxy) + v_dotprod(zd1, cbz1) + bfix1) >> shift; - g0 = (v_dotprod(xy0, cgxy) + v_dotprod(zd0, cgz1) + gfix0) >> shift; - g1 = (v_dotprod(xy1, cgxy) + v_dotprod(zd1, cgz1) + gfix1) >> shift; - r0 = (v_dotprod(xy0, crxy) + v_dotprod(zd0, crz1) + rfix0) >> shift; - r1 = (v_dotprod(xy1, crxy) + v_dotprod(zd1, crz1) + rfix1) >> shift; + b0 = v_shr(v_add(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)), bfix0)); + b1 = 
v_shr(v_add(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)), bfix1)); + g0 = v_shr(v_add(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)), gfix0)); + g1 = v_shr(v_add(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)), gfix1)); + r0 = v_shr(v_add(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)), rfix0)); + r1 = v_shr(v_add(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)), rfix1)); v_uint16 b, g, r; b = v_pack_u(b0, b1); g = v_pack_u(g0, g1); r = v_pack_u(r0, r1); @@ -1452,19 +1469,19 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin #undef DOT_SHIFT_PACK } -#elif CV_SIMD +#elif CV_SIMD // Fixed size v_int16x8 used below, CV_SIMD_SCALABLE is disabled. // inValues are in [0; LAB_BASE] static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint16& inY, const v_uint16& inZ, const int16_t* LUT, v_uint16& outA, v_uint16& outB, v_uint16& outC) { - const int vsize = v_uint16::nlanes; + const int vsize = VTraits::max_nlanes; // LUT idx of origin pt of cube - v_uint16 tx = inX >> (lab_base_shift - lab_lut_shift); - v_uint16 ty = inY >> (lab_base_shift - lab_lut_shift); - v_uint16 tz = inZ >> (lab_base_shift - lab_lut_shift); + v_uint16 tx = v_shr(inX); + v_uint16 ty = v_shr(inY); + v_uint16 tz = v_shr(inZ); v_uint32 btmp00, btmp01, btmp10, btmp11, btmp20, btmp21; v_uint32 baseIdx0, baseIdx1; @@ -1472,8 +1489,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 v_mul_expand(tx, vx_setall_u16(3*8), btmp00, btmp01); v_mul_expand(ty, vx_setall_u16(3*8*LAB_LUT_DIM), btmp10, btmp11); v_mul_expand(tz, vx_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), btmp20, btmp21); - baseIdx0 = btmp00 + btmp10 + btmp20; - baseIdx1 = btmp01 + btmp11 + btmp21; + baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20); + baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21); uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize]; v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0); @@ -1482,9 +1499,9 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 // fracX, fracY, fracZ are [0; TRILINEAR_BASE) const uint16_t bitMask = (1 << trilinear_shift) - 1; v_uint16 bitMaskReg = vx_setall_u16(bitMask); - v_uint16 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg; + v_uint16 fracX = v_and(v_shr(inX), bitMaskReg); + v_uint16 fracY = v_and(v_shr(inY), bitMaskReg); + v_uint16 fracZ = v_and(v_shr(inZ), bitMaskReg); // trilinearIdx = 8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z v_uint32 trilinearIdx0, trilinearIdx1; @@ -1493,8 +1510,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 v_expand(fracY, fracY0, fracY1); v_expand(fracZ, fracZ0, fracZ1); - trilinearIdx0 = (fracX0 << 3) + (fracY0 << (3+trilinear_shift)) + (fracZ0 << (3+trilinear_shift*2)); - trilinearIdx1 = (fracX1 << 3) + (fracY1 << (3+trilinear_shift)) + (fracZ1 << (3+trilinear_shift*2)); + trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0)); + trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1)); uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize]; v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0); @@ -1528,12 +1545,12 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 // CV_DESCALE const v_uint32 
descaleShift = vx_setall_u32(1 << (trilinear_shift*3 - 1)); - a0 = (a0 + descaleShift) >> (trilinear_shift*3); - a1 = (a1 + descaleShift) >> (trilinear_shift*3); - b0 = (b0 + descaleShift) >> (trilinear_shift*3); - b1 = (b1 + descaleShift) >> (trilinear_shift*3); - c0 = (c0 + descaleShift) >> (trilinear_shift*3); - c1 = (c1 + descaleShift) >> (trilinear_shift*3); + a0 = v_shr(v_add(a0, descaleShift)); + a1 = v_shr(v_add(a1, descaleShift)); + b0 = v_shr(v_add(b0, descaleShift)); + b1 = v_shr(v_add(b1, descaleShift)); + c0 = v_shr(v_add(c0, descaleShift)); + c1 = v_shr(v_add(c1, descaleShift)); outA = v_pack(a0, a1); outB = v_pack(b0, b1); outC = v_pack(c0, c1); } diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp index b5f73d873a..580329f660 100644 --- a/modules/imgproc/src/color_yuv.simd.hpp +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -49,6 +49,15 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, namespace { //constants for conversion from/to RGB and YUV, YCrCb according to BT.601 +#if CV_SIMD_SCALABLE +template +static void swap(T&a, T&b) { + T t = a; + a = b; + b = t; +} +#endif + //to YCbCr static const float YCBF = 0.564f; // == 1/2/(1-B2YF) static const float YCRF = 0.713f; // == 1/2/(1-R2YF) @@ -143,11 +152,11 @@ struct RGB2YCrCb_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4); v_float32 vdelta = vx_setall_f32(delta); - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); for( ; i <= n-vsize; i += vsize, src += vsize*scn, dst += vsize*3) { @@ -162,13 +171,13 @@ struct RGB2YCrCb_f } v_float32 y, cr, cb; - y = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); + y = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2))); if(bidx) - std::swap(r, b); + swap(r, b); - cr = v_fma(r - y, vc3, vdelta); - cb = v_fma(b - y, vc4, vdelta); + cr = v_fma(v_sub(r, y), vc3, vdelta); + cb = v_fma(v_sub(b, y), vc4, vdelta); if(yuvOrder) { @@ -266,8 +275,8 @@ struct RGB2YCrCb_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; int sdelta = ColorChannel::half()*(1 << shift); int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descale = 1 << (shift-1); v_int16 b2y = vx_setall_s16((short)C0); @@ -312,13 +321,13 @@ struct RGB2YCrCb_i // fixing 16bit signed multiplication v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; + mr = v_and(v_lt(sr, z), r2y); + mg = v_and(v_lt(sg, z), g2y); + mb = v_and(v_lt(sb, z), b2y); + v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb)), fix_shift); - v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; + v_int32 ssy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)), shift); + v_int32 ssy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)), shift); y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul)); @@ -340,15 +349,15 @@ struct RGB2YCrCb_i v_int32 sy0 = v_reinterpret_as_s32(uy0); v_int32 sy1 = v_reinterpret_as_s32(uy1); - sr0 = sr0 - sy0; sr1 = sr1 - sy1; - sb0 = sb0 - sy0; sb1 = sb1 - sy1; + sr0 = v_sub(sr0, sy0); sr1 = 
v_sub(sr1, sy1); + sb0 = v_sub(sb0, sy0); sb1 = v_sub(sb1, sy1); v_int32 v_scr0, v_scr1, v_scb0, v_scb1; - v_scr0 = (sr0*vc3 + vdd) >> shift; - v_scr1 = (sr1*vc3 + vdd) >> shift; - v_scb0 = (sb0*vc4 + vdd) >> shift; - v_scb1 = (sb1*vc4 + vdd) >> shift; + v_scr0 = v_shr(v_add(v_mul(sr0, vc3), vdd), shift); + v_scr1 = v_shr(v_add(v_mul(sr1, vc3), vdd), shift); + v_scb0 = v_shr(v_add(v_mul(sb0, vc4), vdd), shift); + v_scb1 = v_shr(v_add(v_mul(sb1, vc4), vdd), shift); // saturate and pack cr = v_pack_u(v_scr0, v_scr1); @@ -407,8 +416,8 @@ struct RGB2YCrCb_i int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; int delta = ColorChannel::half()*(1 << shift); -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_int16 bg2y; v_int16 r12y; @@ -458,10 +467,10 @@ struct RGB2YCrCb_i v_zip(sr0, vdescale, rd00, rd01); v_zip(sr1, vdescale, rd10, rd11); - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))), shift); + y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))), shift); + y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))), shift); + y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))), shift); } v_uint16 y0, y1; @@ -512,15 +521,15 @@ struct RGB2YCrCb_i v_uint8 cr, cb; - cr00 = cr00 >> shift; - cr01 = cr01 >> shift; - cr10 = cr10 >> shift; - cr11 = cr11 >> shift; + cr00 = v_shr(cr00, shift); + cr01 = v_shr(cr01, shift); + cr10 = v_shr(cr10, shift); + cr11 = v_shr(cr11, shift); - cb00 = cb00 >> shift; - cb01 = cb01 >> shift; - cb10 = cb10 >> shift; - cb11 = cb11 >> shift; + cb00 = v_shr(cb00, shift); + cb01 = v_shr(cb01, shift); + cb10 = v_shr(cb10, shift); + cb11 = v_shr(cb11, shift); v_int16 cr0, cr1, cb0, cb1; cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11); @@ -623,12 +632,12 @@ struct YCrCb2RGB_f float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1); v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3); v_float32 vdelta = vx_setall_f32(delta); v_float32 valpha = vx_setall_f32(alpha); - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); for( ; i <= n-vsize; i += vsize, src += vsize*3, dst += vsize*dcn) { @@ -640,7 +649,7 @@ struct YCrCb2RGB_f v_float32 b, g, r; - cb -= vdelta; cr -= vdelta; + cb = v_sub(cb, vdelta); cr = v_sub(cr, vdelta); b = v_fma(cb, vc3, y); g = v_fma(cr, vc1, v_fma(cb, vc2, y)); r = v_fma(cr, vc0, y); @@ -746,8 +755,8 @@ struct YCrCb2RGB_i const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 valpha = vx_setall_u8(alpha); v_uint8 vdelta = vx_setall_u8(delta); const int descaleShift = 1 << (shift - 1); @@ -794,8 +803,8 @@ struct YCrCb2RGB_i v_int32 cb00, cb01, cb10, cb11; 
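For readers following these hunks: the rewrite is mechanical, each overloaded operator on a vector value becomes the corresponding named wrapper (v_add, v_mul, v_shr, ...), and lane counts come from VTraits<T>::vlanes() instead of T::nlanes so the same source also builds when CV_SIMD_SCALABLE is set. A minimal standalone sketch of the fixed-point pattern `(a*c + d) >> shift` that recurs around this point; the function and buffer names here are illustrative, not part of the patch.

#include <opencv2/core/hal/intrin.hpp>   // universal intrinsics (v_add, v_shr, VTraits, ...)
using namespace cv;

// Fixed-point descale of one row: dst[i] = (a[i]*c + d) >> shift.
static void descaleRow(const int* a, int c, int d, int shift, int* dst, int n)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_int32>::vlanes();
    v_int32 vc = vx_setall_s32(c), vd = vx_setall_s32(d);
    for (; i <= n - vlanes; i += vlanes)
    {
        v_int32 va = vx_load(a + i);
        // was: (va * vc + vd) >> shift
        v_store(dst + i, v_shr(v_add(v_mul(va, vc), vd), shift));
    }
#endif
    for (; i < n; i++)                    // scalar tail
        dst[i] = (a[i] * c + d) >> shift;
}

When the shift amount is a compile-time constant, the template form v_shr<shift>(x) is used instead, as in the color_lab hunks above.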
v_expand(v_scb0, cb00, cb01); v_expand(v_scb1, cb10, cb11); - b00 += cb00 << 15; b01 += cb01 << 15; - b10 += cb10 << 15; b11 += cb11 << 15; + b00 = v_add(b00, v_shl<15>(cb00)); b01 = v_add(b01, v_shl<15>(cb01)); + b10 = v_add(b10, v_shl<15>(cb10)); b11 = v_add(b11, v_shl<15>(cb11)); } v_int32 t00, t01, t10, t11; @@ -803,17 +812,17 @@ struct YCrCb2RGB_i v_mul_expand(v_scb1, vc2, t10, t11); v_mul_expand(v_scr0, vc1, g00, g01); v_mul_expand(v_scr1, vc1, g10, g11); - g00 += t00; g01 += t01; - g10 += t10; g11 += t11; + g00 = v_add(g00, t00); g01 = v_add(g01, t01); + g10 = v_add(g10, t10); g11 = v_add(g11, t11); v_mul_expand(v_scr0, vc0, r00, r01); v_mul_expand(v_scr1, vc0, r10, r11); - b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift; - b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift; - g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift; - g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift; - r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift; - r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift; + b00 = v_shr(v_add(b00, vdescale), shift); b01 = v_shr(v_add(b01, vdescale), shift); + b10 = v_shr(v_add(b10, vdescale), shift); b11 = v_shr(v_add(b11, vdescale), shift); + g00 = v_shr(v_add(g00, vdescale), shift); g01 = v_shr(v_add(g01, vdescale), shift); + g10 = v_shr(v_add(g10, vdescale), shift); g11 = v_shr(v_add(g11, vdescale), shift); + r00 = v_shr(v_add(r00, vdescale), shift); r01 = v_shr(v_add(r01, vdescale), shift); + r10 = v_shr(v_add(r10, vdescale), shift); r11 = v_shr(v_add(r11, vdescale), shift); v_int16 b0, b1, g0, g1, r0, r1; b0 = v_pack(b00, b01); b1 = v_pack(b10, b11); @@ -897,8 +906,8 @@ struct YCrCb2RGB_i const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; -#if CV_SIMD - const int vsize = v_uint16::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_uint16 valpha = vx_setall_u16(alpha); v_uint16 vdelta = vx_setall_u16(delta); @@ -939,22 +948,22 @@ struct YCrCb2RGB_i // so we fix the multiplication v_int32 cb0, cb1; v_expand(scb, cb0, cb1); - b0 += cb0 << 15; - b1 += cb1 << 15; + b0 = v_add(b0, v_shl<15>(cb0)); + b1 = v_add(b1, v_shl<15>(cb1)); } v_int32 t0, t1; v_mul_expand(scb, vc2, t0, t1); v_mul_expand(scr, vc1, g0, g1); - g0 += t0; g1 += t1; + g0 = v_add(g0, t0); g1 = v_add(g1, t1); v_mul_expand(scr, vc0, r0, r1); // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits - b0 = ((b0 + vdescale) >> shift) + y0; - b1 = ((b1 + vdescale) >> shift) + y1; - g0 = ((g0 + vdescale) >> shift) + y0; - g1 = ((g1 + vdescale) >> shift) + y1; - r0 = ((r0 + vdescale) >> shift) + y0; - r1 = ((r1 + vdescale) >> shift) + y1; + b0 = v_add(v_shr(v_add(b0, vdescale), shift), y0); + b1 = v_add(v_shr(v_add(b1, vdescale), shift), y1); + g0 = v_add(v_shr(v_add(g0, vdescale), shift), y0); + g1 = v_add(v_shr(v_add(g1, vdescale), shift), y1); + r0 = v_add(v_shr(v_add(r0, vdescale), shift), y0); + r1 = v_add(v_shr(v_add(r1, vdescale), shift), y1); // saturate and pack v_uint16 b, g, r; @@ -1038,11 +1047,11 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, - v_int32 (&ruv)[4], - v_int32 (&guv)[4], - v_int32 (&buv)[4]) + v_int32 &ruv0, 
v_int32 &ruv1, v_int32 &ruv2, v_int32 &ruv3, + v_int32 &guv0, v_int32 &guv1, v_int32 &guv2, v_int32 &guv3, + v_int32 &buv0, v_int32 &buv1, v_int32 &buv2, v_int32 &buv3) { v_uint8 v128 = vx_setall_u8(128); v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); @@ -1051,9 +1060,10 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, v_int16 uu0, uu1, vv0, vv1; v_expand(su, uu0, uu1); v_expand(sv, vv0, vv1); - v_int32 uu[4], vv[4]; - v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]); - v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]); + v_int32 uuu0, uuu1, uuu2, uuu3; + v_int32 vvv0, vvv1, vvv2, vvv3; + v_expand(uu0, uuu0, uuu1); v_expand(uu1, uuu2, uuu3); + v_expand(vv0, vvv0, vvv1); v_expand(vv1, vvv2, vvv3); v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1)); v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR); @@ -1061,12 +1071,15 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG); v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB); - for (int k = 0; k < 4; k++) - { - ruv[k] = vshift + vr * vv[k]; - guv[k] = vshift + vg * vv[k] + ug * uu[k]; - buv[k] = vshift + ub * uu[k]; - } + auto process_uv = [&](v_int32& ruv, v_int32& guv, v_int32& buv, const v_int32& vv, const v_int32& uu) { + ruv = v_add(vshift, v_mul(vr, vv)); + guv = v_add(v_add(vshift, v_mul(vg, vv)), v_mul(ug, uu)); + buv = v_add(vshift, v_mul(ub, uu)); + }; + process_uv(ruv0, guv0, buv0, vvv0, uuu0); + process_uv(ruv1, guv1, buv1, vvv1, uuu1); + process_uv(ruv2, guv2, buv2, vvv2, uuu2); + process_uv(ruv3, guv3, buv3, vvv3, uuu3); } #endif @@ -1081,44 +1094,48 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co a = uchar(0xff); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void yRGBuvToRGBA(const v_uint8& vy, - const v_int32 (&ruv)[4], - const v_int32 (&guv)[4], - const v_int32 (&buv)[4], + const v_int32 &ruv0, const v_int32 &ruv1, const v_int32 &ruv2, const v_int32 &ruv3, + const v_int32 &guv0, const v_int32 &guv1, const v_int32 &guv2, const v_int32 &guv3, + const v_int32 &buv0, const v_int32 &buv1, const v_int32 &buv2, const v_int32 &buv3, v_uint8& rr, v_uint8& gg, v_uint8& bb) { v_uint8 v16 = vx_setall_u8(16); - v_uint8 posY = vy - v16; + v_uint8 posY = v_sub(vy, v16); v_uint16 yy0, yy1; v_expand(posY, yy0, yy1); - v_int32 yy[4]; - v_int32 yy00, yy01, yy10, yy11; - v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]); - v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]); + v_int32 yyy0, yyy1, yyy2, yyy3; + v_expand(v_reinterpret_as_s16(yy0), yyy0, yyy1); + v_expand(v_reinterpret_as_s16(yy1), yyy2, yyy3); v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY); - v_int32 y[4], r[4], g[4], b[4]; - for(int k = 0; k < 4; k++) - { - y[k] = yy[k]*vcy; - r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT; - g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT; - b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT; - } + v_int32 y0, y1, y2, y3, r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3; - v_int16 r0, r1, g0, g1, b0, b1; - r0 = v_pack(r[0], r[1]); - r1 = v_pack(r[2], r[3]); - g0 = v_pack(g[0], g[1]); - g1 = v_pack(g[2], g[3]); - b0 = v_pack(b[0], b[1]); - b1 = v_pack(b[2], b[3]); + auto process_yrgb = [&](const v_int32& yy, v_int32& y, v_int32& r, v_int32& g, v_int32& b, + const v_int32& ruv, const v_int32& guv, const v_int32& buv) { + y = v_mul(yy, vcy); + r = v_shr(v_add(y, ruv), ITUR_BT_601_SHIFT); + g = v_shr(v_add(y, guv), ITUR_BT_601_SHIFT); + b = v_shr(v_add(y, buv), ITUR_BT_601_SHIFT); + }; + process_yrgb(yyy0, y0, r0, g0, b0, 
ruv0, guv0, buv0); + process_yrgb(yyy1, y1, r1, g1, b1, ruv1, guv1, buv1); + process_yrgb(yyy2, y2, r2, g2, b2, ruv2, guv2, buv2); + process_yrgb(yyy3, y3, r3, g3, b3, ruv3, guv3, buv3); - rr = v_pack_u(r0, r1); - gg = v_pack_u(g0, g1); - bb = v_pack_u(b0, b1); + v_int16 _r0, _r1, _g0, _g1, _b0, _b1; + _r0 = v_pack(r0, r1); + _r1 = v_pack(r2, r3); + _g0 = v_pack(g0, g1); + _g1 = v_pack(g2, g3); + _b0 = v_pack(b0, b1); + _b1 = v_pack(b2, b3); + + rr = v_pack_u(_r0, _r1); + gg = v_pack_u(_g0, _g1); + bb = v_pack_u(_b0, _b1); } #endif @@ -1201,8 +1218,8 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody const uchar* y2 = y1 + my1_step; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 a = vx_setall_u8(uchar(0xff)); for( ; i <= width - 2*vsize; i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) @@ -1215,36 +1232,50 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody swap(u, v); } - v_uint8 vy[4]; - v_load_deinterleave(y1 + i, vy[0], vy[1]); - v_load_deinterleave(y2 + i, vy[2], vy[3]); + v_uint8 vy0, vy1, vy2, vy3; + v_load_deinterleave(y1 + i, vy0, vy1); + v_load_deinterleave(y2 + i, vy2, vy3); - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); + v_int32 ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3; + uvToRGBuv(u, v, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3); - v_uint8 r[4], g[4], b[4]; + v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3; - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } + auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) { + yRGBuvToRGBA(vy, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r, g, b); + }; + call_yRGBuvToRGBA(vy0, r0, g0, b0); + call_yRGBuvToRGBA(vy1, r1, g1, b1); + call_yRGBuvToRGBA(vy2, r2, g2, b2); + call_yRGBuvToRGBA(vy3, r3, g3, b3); if(bIdx) { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); + swap(r0, b0); + swap(r1, b1); + swap(r2, b2); + swap(r3, b3); } // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
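The comment above describes the step sketched here: after the per-register colour values are computed, v_zip puts the even/odd pixels back into pixel order before v_store_interleave writes BGR/BGRA out. A small self-contained illustration of that interleave, assuming only the universal intrinsics header; buffer names are illustrative.

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Interleave two deinterleaved byte rows back into pixel order.
static void interleaveEvenOdd(const uchar* even, const uchar* odd, uchar* out, int n)
{
    int i = 0;                            // index into even/odd (each holds n/2 bytes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_uint8>::vlanes();
    for (; i <= n/2 - vlanes; i += vlanes)
    {
        v_uint8 e = vx_load(even + i), o = vx_load(odd + i);
        v_uint8 lo, hi;
        v_zip(e, o, lo, hi);              // lo = e0 o0 e1 o1 ..., hi continues with the second half
        v_store(out + 2*i, lo);
        v_store(out + 2*i + vlanes, hi);
    }
#endif
    for (; i < n/2; i++)                  // scalar tail
    {
        out[2*i]     = even[i];
        out[2*i + 1] = odd[i];
    }
}

The lo/hi pair here corresponds to the r0_0/r0_1 style registers produced just below.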
v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); + v_zip(r0, r1, r0_0, r0_1); + v_zip(r2, r3, r1_0, r1_1); v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); + v_zip(g0, g1, g0_0, g0_1); + v_zip(g2, g3, g1_0, g1_1); v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); + v_zip(b0, b1, b0_0, b0_1); + v_zip(b2, b3, b1_0, b1_1); if(dcn == 4) { @@ -1319,8 +1350,8 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody const uchar* y2 = y1 + stride; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 a = vx_setall_u8(uchar(0xff)); for( ; i <= width/2 - vsize; i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) @@ -1329,36 +1360,50 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody u = vx_load(u1 + i); v = vx_load(v1 + i); - v_uint8 vy[4]; - v_load_deinterleave(y1 + 2*i, vy[0], vy[1]); - v_load_deinterleave(y2 + 2*i, vy[2], vy[3]); + v_uint8 vy0, vy1, vy2, vy3; + v_load_deinterleave(y1 + 2*i, vy0, vy1); + v_load_deinterleave(y2 + 2*i, vy2, vy3); - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); + v_int32 ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3; + uvToRGBuv(u, v, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3); - v_uint8 r[4], g[4], b[4]; + v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3; - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } + auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) { + yRGBuvToRGBA(vy, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r, g, b); + }; + call_yRGBuvToRGBA(vy0, r0, g0, b0); + call_yRGBuvToRGBA(vy1, r1, g1, b1); + call_yRGBuvToRGBA(vy2, r2, g2, b2); + call_yRGBuvToRGBA(vy3, r3, g3, b3); if(bIdx) { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); + swap(r0, b0); + swap(r1, b1); + swap(r2, b2); + swap(r3, b3); } // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
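Why the v_uint8 r[4] style arrays keep turning into r0..r3 plus a lambda: with CV_SIMD_SCALABLE the vector types are sizeless, so they cannot be stored in C arrays or indexed in a loop, and the four-way loops have to be unrolled by hand. A hedged sketch of that pattern outside the patch (function and variable names are illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Scales 4*VTraits<v_float32>::vlanes() consecutive floats by s.
static void scaleFourRegisters(const float* src, float s, float* dst)
{
    const int vl = VTraits<v_float32>::vlanes();
    v_float32 vs = vx_setall_f32(s);
    v_float32 a0, a1, a2, a3;             // instead of v_float32 a[4]
    auto scale = [&](const float* p, v_float32& a) { a = v_mul(vx_load(p), vs); };
    scale(src,        a0);
    scale(src +   vl, a1);
    scale(src + 2*vl, a2);
    scale(src + 3*vl, a3);
    v_store(dst,        a0);
    v_store(dst +   vl, a1);
    v_store(dst + 2*vl, a2);
    v_store(dst + 3*vl, a3);
}
#endif

The process_uv and call_yRGBuvToRGBA lambdas in the hunks above follow the same shape, just with more registers.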
v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); + v_zip(r0, r1, r0_0, r0_1); + v_zip(r2, r3, r1_0, r1_1); v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); + v_zip(g0, g1, g0_0, g0_1); + v_zip(g2, g3, g1_0, g1_1); v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); + v_zip(b0, b1, b0_0, b0_1); + v_zip(b2, b3, b1_0, b1_1); if(dcn == 4) { @@ -1430,7 +1475,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b) return saturate_cast(yy >> ITUR_BT_601_SHIFT); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) { const int shifted16 = (16 << ITUR_BT_601_SHIFT); @@ -1440,25 +1485,25 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint v_expand(g, g0, g1); v_expand(b, b0, b1); - v_uint32 rq[4], gq[4], bq[4]; - v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]); - v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]); - v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]); + v_uint32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3; + v_expand(r0, rq0, rq1); v_expand(r1, rq2, rq3); + v_expand(g0, gq0, gq1); v_expand(g1, gq2, gq3); + v_expand(b0, bq0, bq1); v_expand(b1, bq2, bq3); v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY); v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16); - v_uint32 y[4]; - for(int k = 0; k < 4; k++) - { - y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT; - } + v_uint32 y0, y1, y2, y3; + y0 = v_shr(v_add(v_add(v_add(v_mul(rq0, ry), v_mul(gq0, gy)), v_mul(bq0, by)), shift)); + y1 = v_shr(v_add(v_add(v_add(v_mul(rq1, ry), v_mul(gq1, gy)), v_mul(bq1, by)), shift)); + y2 = v_shr(v_add(v_add(v_add(v_mul(rq2, ry), v_mul(gq2, gy)), v_mul(bq2, by)), shift)); + y3 = v_shr(v_add(v_add(v_add(v_mul(rq3, ry), v_mul(gq3, gy)), v_mul(bq3, by)), shift)); - v_uint16 y0, y1; - y0 = v_pack(y[0], y[1]); - y1 = v_pack(y[2], y[3]); + v_uint16 _y0, _y1; + _y0 = v_pack(y0, y1); + _y1 = v_pack(y2, y3); - return v_pack(y0, y1); + return v_pack(_y0, _y1); } #endif @@ -1473,27 +1518,27 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) v = saturate_cast(vv >> ITUR_BT_601_SHIFT); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) { // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..] 
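The comment just above is the key to the next few lines: reinterpreting the u8 register as s16 and masking with 0x00ff keeps only the first byte of each horizontal pixel pair, which is the 4:2:x chroma subsampling, now spelled v_and instead of operator&. A minimal sketch of just that step (the helper name is illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Keep every other byte of a packed u8 register, widened to 16 bits:
// [r0, r1, r2, r3, ...] -> [r0, 0, r2, 0, ...] viewed as s16 lanes.
static v_int16 evenBytesAsS16(const v_uint8& px)
{
    v_int16 lowByte = vx_setall_s16(0x00ff);
    // was: v_reinterpret_as_s16(px) & lowByte
    return v_and(v_reinterpret_as_s16(px), lowByte);
}
#endif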
v_int16 vlowByte = vx_setall_s16(0x00ff); v_int16 rd0, rd1, gd0, gd1, bd0, bd1; - rd0 = v_reinterpret_as_s16(r0) & vlowByte; - rd1 = v_reinterpret_as_s16(r1) & vlowByte; - gd0 = v_reinterpret_as_s16(g0) & vlowByte; - gd1 = v_reinterpret_as_s16(g1) & vlowByte; - bd0 = v_reinterpret_as_s16(b0) & vlowByte; - bd1 = v_reinterpret_as_s16(b1) & vlowByte; + rd0 = v_and(v_reinterpret_as_s16(r0), vlowByte); + rd1 = v_and(v_reinterpret_as_s16(r1), vlowByte); + gd0 = v_and(v_reinterpret_as_s16(g0), vlowByte); + gd1 = v_and(v_reinterpret_as_s16(g1), vlowByte); + bd0 = v_and(v_reinterpret_as_s16(b0), vlowByte); + bd1 = v_and(v_reinterpret_as_s16(b1), vlowByte); - v_int32 rq[4], gq[4], bq[4]; - v_expand(rd0, rq[0], rq[1]); - v_expand(rd1, rq[2], rq[3]); - v_expand(gd0, gq[0], gq[1]); - v_expand(gd1, gq[2], gq[3]); - v_expand(bd0, bq[0], bq[1]); - v_expand(bd1, bq[2], bq[3]); + v_int32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3; + v_expand(rd0, rq0, rq1); + v_expand(rd1, rq2, rq3); + v_expand(gd0, gq0, gq1); + v_expand(gd1, gq2, gq3); + v_expand(bd0, bq0, bq1); + v_expand(bd1, bq2, bq3); const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); const int shifted128 = (128 << ITUR_BT_601_SHIFT); @@ -1505,18 +1550,21 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint bu = vx_setall_s32(ITUR_BT_601_CBU); bv = vx_setall_s32(ITUR_BT_601_CBV); - v_int32 uq[4], vq[4]; - for(int k = 0; k < 4; k++) - { - uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT; - vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT; - } + v_int32 uq0, uq1, uq2, uq3, vq0, vq1, vq2, vq3; + uq0 = v_shr(v_add(v_add(v_add(v_mul(ru, rq0), v_mul(gu, gq0)), v_mul(bu, bq0)), shift)); + vq0 = v_shr(v_add(v_add(v_add(v_mul(bu, rq0), v_mul(gv, gq0)), v_mul(bv, bq0)), shift)); + uq1 = v_shr(v_add(v_add(v_add(v_mul(ru, rq1), v_mul(gu, gq1)), v_mul(bu, bq1)), shift)); + vq1 = v_shr(v_add(v_add(v_add(v_mul(bu, rq1), v_mul(gv, gq1)), v_mul(bv, bq1)), shift)); + uq2 = v_shr(v_add(v_add(v_add(v_mul(ru, rq2), v_mul(gu, gq2)), v_mul(bu, bq2)), shift)); + vq2 = v_shr(v_add(v_add(v_add(v_mul(bu, rq2), v_mul(gv, gq2)), v_mul(bv, bq2)), shift)); + uq3 = v_shr(v_add(v_add(v_add(v_mul(ru, rq3), v_mul(gu, gq3)), v_mul(bu, bq3)), shift)); + vq3 = v_shr(v_add(v_add(v_add(v_mul(bu, rq3), v_mul(gv, gq3)), v_mul(bv, bq3)), shift)); v_int16 u0, u1, v0, v1; - u0 = v_pack(uq[0], uq[1]); - u1 = v_pack(uq[2], uq[3]); - v0 = v_pack(vq[0], vq[1]); - v1 = v_pack(vq[2], vq[3]); + u0 = v_pack(uq0, uq1); + u1 = v_pack(uq2, uq3); + v0 = v_pack(vq0, vq1); + v1 = v_pack(vq2, vq3); u = v_pack_u(u0, u1); v = v_pack_u(v0, v1); @@ -1559,8 +1607,8 @@ struct RGB8toYUV420pInvoker: public ParallelLoopBody } } int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); for( ; i <= w/2 - vsize; i += vsize) @@ -1708,47 +1756,61 @@ struct YUV422toRGB8Invoker : ParallelLoopBody { uchar* row = dst_data + dst_step * j; int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int vsize = VTraits::vlanes(); v_uint8 a = vx_setall_u8(uchar(0xff)); for(; i <= 2*width - 4*vsize; i += 4*vsize, row += vsize*dcn*2) { - v_uint8 u, v, vy[2]; + v_uint8 u, v, vy0, vy1; if(yIdx == 1) // UYVY { - v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]); + v_load_deinterleave(yuv_src + i, u, vy0, v, vy1); } else // YUYV or YVYU { - v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v); + v_load_deinterleave(yuv_src + 
i, vy0, u, vy1, v); if(uIdx == 1) // YVYU { swap(u, v); } } - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); + v_int32 ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3; + uvToRGBuv(u, v, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3); - v_uint8 r[2], g[2], b[2]; + v_uint8 r0, r1, g0, g1, b0, b1; - yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]); - yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]); + + yRGBuvToRGBA(vy0, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r0, g0, b0); + yRGBuvToRGBA(vy1, + ruv0, ruv1, ruv2, ruv3, + guv0, guv1, guv2, guv3, + buv0, buv1, buv2, buv3, + r1, g1, b1); if(bIdx) { - swap(r[0], b[0]); - swap(r[1], b[1]); + swap(r0, b0); + swap(r1, b1); } // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] v_uint8 r0_0, r0_1; - v_zip(r[0], r[1], r0_0, r0_1); + v_zip(r0, r1, r0_0, r0_1); v_uint8 g0_0, g0_1; - v_zip(g[0], g[1], g0_0, g0_1); + v_zip(g0, g1, g0_0, g0_1); v_uint8 b0_0, b0_1; - v_zip(b[0], b[1], b0_0, b0_1); + v_zip(b0, b1, b0_0, b0_1); if(dcn == 4) { diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 8dcf5235af..06053e63fe 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -349,7 +349,7 @@ struct FilterNoVec }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) ///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// @@ -383,7 +383,7 @@ struct RowVec_8u32s if( smallValues ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { const uchar* src = _src + i; v_int32 s0 = vx_setzero_s32(); @@ -396,27 +396,27 @@ struct RowVec_8u32s v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); v_uint8 x0, x1; v_zip(vx_load(src), vx_load(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f))); + s2 = v_add(s2, v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f))); + s3 = v_add(s3, v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f))); } if (k < _ksize) { v_int32 f = vx_setall_s32(_kx[k]); v_uint16 x0, x1; v_expand(vx_load(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f))); + s2 = v_add(s2, v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f))); + s3 = v_add(s3, v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f))); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 
2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const uchar* src = _src + i; v_int32 s0 = vx_setzero_s32(); @@ -427,22 +427,22 @@ struct RowVec_8u32s v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); v_uint16 x0, x1; v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f))); } if( k < _ksize ) { v_int32 f = vx_setall_s32(_kx[k]); v_uint32 x0, x1; v_expand(vx_load_expand(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); + s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f))); + s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f))); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 d = vx_setzero_s32(); k = 0; @@ -452,12 +452,12 @@ struct RowVec_8u32s v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); v_uint32 x0, x1; v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1); - d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f)); + d = v_add(d, v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f))); } if (k < _ksize) - d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k]))); + d = v_add(d, v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])))); v_store(dst + i, d); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } return i; @@ -480,7 +480,7 @@ struct RowVec_8u32f float* dst = (float*)_dst; const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 s0 = vx_setzero_f32(); v_float32 s1 = vx_setzero_f32(); @@ -492,18 +492,18 @@ struct RowVec_8u32f v_float32 f = vx_setall_f32(_kx[k]); const uchar* src = (const uchar*)_src + i + k * cn; v_float32 vs_ll = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src))); - v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + v_float32::nlanes))); - v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*v_float32::nlanes))); - v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*v_float32::nlanes))); + v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + VTraits::vlanes()))); + v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*VTraits::vlanes()))); + v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*VTraits::vlanes()))); s0 = v_muladd(vs_ll, f, s0); s1 = v_muladd(vs_lh, f, s1); s2 = v_muladd(vs_hl, f, s2); s3 = v_muladd(vs_hh, f, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - 
v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } return i; } @@ -553,7 +553,7 @@ struct SymmRowSmallVec_8u32s { if( kx[0] == 2 && kx[1] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -562,29 +562,29 @@ struct SymmRowSmallVec_8u32s x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h))); + v_store(dst + i + VTraits::vlanes(), v_reinterpret_as_s32(v_expand_high(x1l))); + v_store(dst + i + 2*VTraits::vlanes(), v_reinterpret_as_s32(v_expand_low(x1h))); + v_store(dst + i + 3*VTraits::vlanes(), v_reinterpret_as_s32(v_expand_high(x1h))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src); x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn))); v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_reinterpret_as_s32(v_expand_high(x))); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint32 x = vx_load_expand_q(src); - x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn); + x = v_add(v_add(v_add(x, x), vx_load_expand_q(src - cn)), vx_load_expand_q(src + cn)); v_store(dst + i, v_reinterpret_as_s32(x)); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } else if( kx[0] == -2 && kx[1] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -593,31 +593,31 @@ struct SymmRowSmallVec_8u32s x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1l))); + v_store(dst + i + 2*VTraits::vlanes(), v_expand_low(v_reinterpret_as_s16(x1h))); + v_store(dst + i + 3*VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1h))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src); x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, 
v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x))); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x); + x = v_sub(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), v_add(x, x)); v_store(dst + i, x); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } else { v_int16 k0 = vx_setall_s16((short)kx[0]); v_int16 k1 = vx_setall_s16((short)kx[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -628,34 +628,34 @@ struct SymmRowSmallVec_8u32s v_int16 x0, x1; v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh); v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); + dl = v_add(dl, v_dotprod(x0, k1)); + dh = v_add(dh, v_dotprod(x1, k1)); v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); + v_store(dst + i + VTraits::vlanes(), dh); v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh); v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i + 2*v_int32::nlanes, dl); - v_store(dst + i + 3*v_int32::nlanes, dh); + dl = v_add(dl, v_dotprod(x0, k1)); + dh = v_add(dh, v_dotprod(x1, k1)); + v_store(dst + i + 2*VTraits::vlanes(), dl); + v_store(dst + i + 3*VTraits::vlanes(), dh); } - if ( i <= width - v_uint16::nlanes ) + if ( i <= width - VTraits::vlanes() ) { v_int32 dl, dh; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh); v_int16 x0, x1; v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); + dl = v_add(dl, v_dotprod(x0, k1)); + dh = v_add(dh, v_dotprod(x1, k1)); v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), dh); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if ( i <= width - v_uint32::nlanes ) + if ( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1]))); - i += v_uint32::nlanes; + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_mul(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), vx_setall_s32(kx[1])))); + i += VTraits::vlanes(); } } } @@ -663,7 +663,7 @@ struct SymmRowSmallVec_8u32s { if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; v_expand(vx_load(src - 2*cn), x0l, x0h); @@ -672,31 +672,31 @@ struct SymmRowSmallVec_8u32s x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); x1h = v_sub_wrap(v_add_wrap(x0h, x2h), 
v_add_wrap(x1h, x1h)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1l))); + v_store(dst + i + 2*VTraits::vlanes(), v_expand_low(v_reinterpret_as_s16(x1h))); + v_store(dst + i + 3*VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x1h))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src); x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x)); v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_expand_high(v_reinterpret_as_s16(x))); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x); + x = v_sub(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - 2 * cn), vx_load_expand_q(src + 2 * cn))), v_add(x, x)); v_store(dst + i, x); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } else { v_int16 k0 = vx_setall_s16((short)(kx[0])); v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_int32 x0, x1, x2, x3; v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; @@ -710,45 +710,45 @@ struct SymmRowSmallVec_8u32s v_expand(vx_load(src + cn), x1l, x1h); v_expand(vx_load(src - 2*cn), x2l, x2h); v_expand(vx_load(src + 2*cn), x3l, x3h); - v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh); - x0 += v_dotprod(xl, k12); - x1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh); - x2 += v_dotprod(xl, k12); - x3 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_add(x0l, x1l)), v_reinterpret_as_s16(v_add(x2l, x3l)), xl, xh); + x0 = v_add(x0, v_dotprod(xl, k12)); + x1 = v_add(x1, v_dotprod(xh, k12)); + v_zip(v_reinterpret_as_s16(v_add(x0h, x1h)), v_reinterpret_as_s16(v_add(x2h, x3h)), xl, xh); + x2 = v_add(x2, v_dotprod(xl, k12)); + x3 = v_add(x3, v_dotprod(xh, k12)); v_store(dst + i, x0); - v_store(dst + i + v_int32::nlanes, x1); - v_store(dst + i + 2*v_int32::nlanes, x2); - v_store(dst + i + 3*v_int32::nlanes, x3); + v_store(dst + i + VTraits::vlanes(), x1); + v_store(dst + i + 2*VTraits::vlanes(), x2); + v_store(dst + i + 3*VTraits::vlanes(), x3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 x1, x2; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2); v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh); - x1 += v_dotprod(xl, k12); - x2 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn))), v_reinterpret_as_s16(v_add(vx_load_expand(src - 
2 * cn), vx_load_expand(src + 2 * cn))), xl, xh); + x1 = v_add(x1, v_dotprod(xl, k12)); + x2 = v_add(x2, v_dotprod(xh, k12)); v_store(dst + i, x1); - v_store(dst + i + v_int32::nlanes, x2); - i += v_uint16::nlanes, src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), x2); + i += VTraits::vlanes(), src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), - v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2])))); - i += v_uint32::nlanes; + v_muladd(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), vx_setall_s32(kx[1]), + v_mul(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - 2 * cn), vx_load_expand_q(src + 2 * cn))), vx_setall_s32(kx[2]))))); + i += VTraits::vlanes(); } } } else { v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint8 v_src = vx_load(src); v_int32 s0, s1, s2, s3; @@ -764,12 +764,12 @@ struct SymmRowSmallVec_8u32s v_uint8 v_src3 = vx_load(src + j + cn); v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); + v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh); + s2 = v_add(s2, v_dotprod(xl, k12)); + s3 = v_add(s3, v_dotprod(xh, k12)); } if( k < _ksize / 2 + 1 ) { @@ -780,48 +780,48 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); + s0 = v_add(s0, v_dotprod(xl, k1)); + s1 = v_add(s1, v_dotprod(xh, k1)); v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); - s2 += v_dotprod(xl, k1); - s3 += v_dotprod(xh, k1); + s2 = v_add(s2, v_dotprod(xl, k1)); + s3 = v_add(s3, v_dotprod(xh, k1)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s0, s1; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn) { v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + 
vx_load_expand(src + j + cn)), xl, xh); + v_zip(v_reinterpret_as_s16(v_add(vx_load_expand(src - j), vx_load_expand(src + j))), v_reinterpret_as_s16(v_add(vx_load_expand(src - j - cn), vx_load_expand(src + j + cn))), xl, xh); v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); } if ( k < _ksize / 2 + 1 ) { v_int16 xl, xh; v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh); v_int16 k1 = vx_setall_s16((short)(kx[k])); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); + s0 = v_add(s0, v_dotprod(xl, k1)); + s1 = v_add(s1, v_dotprod(xh, k1)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); + v_int32 s0 = v_mul(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0])); for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn ) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0); + s0 = v_muladd(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - j), vx_load_expand_q(src + j))), vx_setall_s32(kx[k]), s0); v_store(dst + i, s0); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } } @@ -831,7 +831,7 @@ struct SymmRowSmallVec_8u32s { if( kx[0] == 0 && kx[1] == 1 ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -839,27 +839,27 @@ struct SymmRowSmallVec_8u32s v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)); v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)); v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh)); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh)); + v_store(dst + i + VTraits::vlanes(), v_expand_high(dl)); + v_store(dst + i + 2*VTraits::vlanes(), v_expand_low(dh)); + v_store(dst + i + 3*VTraits::vlanes(), v_expand_high(dh)); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))); v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_expand_high(dl)); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if (i <= width - v_uint32::nlanes) + if (i <= width - VTraits::vlanes()) { - v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn))); - i += v_uint32::nlanes; + v_store(dst + i, v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), v_reinterpret_as_s32(vx_load_expand_q(src - cn)))); + i += VTraits::vlanes(); } } else { v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += 
VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x2l, x2h; v_expand(vx_load(src - cn), x0l, x0h); @@ -867,30 +867,30 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh); v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); + v_store(dst + i + VTraits::vlanes(), v_dotprod(xh, k0)); v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0)); + v_store(dst + i + 2*VTraits::vlanes(), v_dotprod(xl, k0)); + v_store(dst + i + 3*VTraits::vlanes(), v_dotprod(xh, k0)); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int16 xl, xh; v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh); v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_dotprod(xh, k0)); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if (i <= width - v_uint32::nlanes) + if (i <= width - VTraits::vlanes()) { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1]))); - i += v_uint32::nlanes; + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_mul(v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(-kx[1])))); + i += VTraits::vlanes(); } } } else if( _ksize == 5 ) { v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; v_expand(vx_load(src - cn), x0l, x0h); @@ -900,31 +900,31 @@ struct SymmRowSmallVec_8u32s v_int16 x0, x1; v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1); v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); + v_store(dst + i + VTraits::vlanes(), v_dotprod(x1, k0)); v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0)); + v_store(dst + i + 2*VTraits::vlanes(), v_dotprod(x0, k0)); + v_store(dst + i + 3*VTraits::vlanes(), v_dotprod(x1, k0)); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int16 x0, x1; v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1); v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), v_dotprod(x1, k0)); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]), - (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - 
v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2]))); - i += v_uint32::nlanes; + v_store(dst + i, v_muladd(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), v_reinterpret_as_s32(vx_load_expand_q(src - cn))), vx_setall_s32(kx[1]), + v_mul(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + 2 * cn)), v_reinterpret_as_s32(vx_load_expand_q(src - 2 * cn))), vx_setall_s32(kx[2])))); + i += VTraits::vlanes(); } } else { v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_uint8 v_src = vx_load(src); v_int32 s0, s1, s2, s3; @@ -941,11 +941,11 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); + s2 = v_add(s2, v_dotprod(xl, k12)); + s3 = v_add(s3, v_dotprod(xh, k12)); } if( k < _ksize / 2 + 1 ) { @@ -955,18 +955,18 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); + s2 = v_add(s2, v_dotprod(xl, k12)); + s3 = v_add(s3, v_dotprod(xh, k12)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s0, s1; v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); @@ -975,28 +975,28 @@ struct SymmRowSmallVec_8u32s v_int16 xl, xh; v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh); v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); + s0 = v_add(s0, v_dotprod(xl, k12)); + s1 = v_add(s1, v_dotprod(xh, k12)); } if( k < _ksize / 2 + 1 ) { v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); v_int16 xl, xh; v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); + s0 = v_add(s0, v_dotprod(xl, k1)); + s1 = v_add(s1, v_dotprod(xh, k1)); } v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); src += VTraits::vlanes(); } - if( i <= width - v_uint32::nlanes ) + if( i <= width - 
VTraits::vlanes() ) { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); + v_int32 s0 = v_mul(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0])); for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0); + s0 = v_muladd(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + j)), v_reinterpret_as_s32(vx_load_expand_q(src - j))), vx_setall_s32(kx[k]), s0); v_store(dst + i, s0); - i += v_uint32::nlanes; + i += VTraits::vlanes(); } } } @@ -1038,120 +1038,120 @@ struct SymmColumnVec_32s8u { v_float32 f0 = vx_setall_f32(ky[0]); v_float32 f1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { const int* S = src[0] + i; v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + VTraits::vlanes())), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*VTraits::vlanes())), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*VTraits::vlanes())), f0, d4); const int* S0 = src[1] + i; const int* S1 = src[-1] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); + s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f1, s0); + s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f1, s1); + s2 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes()))), f1, s2); + s3 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes()))), f1, s3); for( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); S0 = src[k] + i; S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3); + s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f, s0); + s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f, s1); + s2 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes()))), f, s2); + s3 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes()))), f, s3); } v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const int* S = src[0] + i; v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + 
VTraits<v_int32>::vlanes())), f0, d4);
const int* S0 = src[1] + i;
const int* S1 = src[-1] + i;
- s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
- s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f1, s0);
+ s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, s1);
for( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
S0 = src[k] + i;
S1 = src[-k] + i;
- s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
- s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f, s0);
+ s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f, s1);
}
v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
- i += v_uint16::nlanes;
+ i += VTraits<v_uint16>::vlanes();
}
#if CV_SIMD_WIDTH > 16
- while( i <= width - v_int32x4::nlanes )
+ while( i <= width - 4 /*v_int32x4::nlanes*/ )
#else
- if( i <= width - v_int32x4::nlanes )
+ if( i <= width - v_int32::nlanes )
#endif
{
- v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
- s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0);
+ v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta));
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), s0);
for( k = 2; k <= ksize2; k++ )
- s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
- v_int32x4 s32 = v_round(s0);
- v_int16x8 s16 = v_pack(s32, s32);
- *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
- i += v_int32x4::nlanes;
+ s0 = v_muladd(v_cvt_f32(v_add(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0);
+ v_int32 s32 = v_round(s0);
+ v_int16 s16 = v_pack(s32, s32);
+ *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16)));
+ i += 4 /*v_int32x4::nlanes*/ ;
}
}
else
{
v_float32 f1 = vx_setall_f32(ky[1]);
- for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+ for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
{
const int* S0 = src[1] + i;
const int* S1 = src[-1] + i;
- v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
- v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
- v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4);
- v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4);
+ v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f1, d4);
+ v_float32 s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, d4);
+ v_float32 s2 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load(S1 + 2 * VTraits<v_int32>::vlanes()))), f1, d4);
+ v_float32 s3 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load(S1 + 3 * VTraits<v_int32>::vlanes()))), f1, d4);
for ( k = 2; k <= ksize2; k++ )
{
v_float32 f = vx_setall_f32(ky[k]);
S0 = src[k] + i;
S1 = src[-k] + i;
- s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
- s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
- s2 =
v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3); + s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f, s0); + s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f, s1); + s2 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes()))), f, s2); + s3 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes()))), f, s3); } v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const int* S0 = src[1] + i; const int* S1 = src[-1] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); + v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f1, d4); + v_float32 s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f1, d4); for ( k = 2; k <= ksize2; k++ ) { v_float32 f = vx_setall_f32(ky[k]); S0 = src[k] + i; S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); + s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f, s0); + s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes()))), f, s1); } v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; + i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) + while( i <= width - 4 /*v_int32x4::nlanes*/ ) #else - if( i <= width - v_int32x4::nlanes ) + if( i <= width - v_int32::nlanes ) #endif { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); + v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta)); for (k = 2; k <= ksize2; k++) - s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; + s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i))), vx_setall_f32(ky[k]), s0); + v_int32 s32 = v_round(s0); + v_int16 s16 = v_pack(s32, s32); + *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16))); + i += 4 /*v_int32x4::nlanes*/ ; } } return i; @@ -1187,31 +1187,31 @@ struct SymmColumnVec_32f8u if( symmetrical ) { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 v_ky0 = vx_setall_f32(ky[0]); v_float32 v32_delta = vx_setall_f32(delta); const float* S = src[0] + i; v_float32 s0 = v_muladd(v_ky0, vx_load(S), v32_delta); - v_float32 s1 = v_muladd(v_ky0, vx_load(S + v_float32::nlanes), v32_delta); - v_float32 s2 = v_muladd(v_ky0, vx_load(S + 2*v_float32::nlanes), v32_delta); - v_float32 s3 = v_muladd(v_ky0, vx_load(S + 3*v_float32::nlanes), v32_delta); + v_float32 s1 = v_muladd(v_ky0, vx_load(S + VTraits::vlanes()), v32_delta); + v_float32 s2 = v_muladd(v_ky0, vx_load(S + 
2*VTraits::vlanes()), v32_delta); + v_float32 s3 = v_muladd(v_ky0, vx_load(S + 3*VTraits::vlanes()), v32_delta); for( k = 1; k <= ksize2; k++ ) { v_float32 v_kyk = vx_setall_f32(ky[k]); const float* S0 = src[k] + i; const float* S1 = src[-k] + i; - s0 = v_muladd(v_kyk, vx_load(S0) + vx_load(S1), s0); - s1 = v_muladd(v_kyk, vx_load(S0 + v_float32::nlanes) + vx_load(S1 + v_float32::nlanes), s1); - s2 = v_muladd(v_kyk, vx_load(S0 + 2*v_float32::nlanes) + vx_load(S1 + 2*v_float32::nlanes), s2); - s3 = v_muladd(v_kyk, vx_load(S0 + 3*v_float32::nlanes) + vx_load(S1 + 3*v_float32::nlanes), s3); + s0 = v_muladd(v_kyk, v_add(vx_load(S0), vx_load(S1)), s0); + s1 = v_muladd(v_kyk, v_add(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes())), s1); + s2 = v_muladd(v_kyk, v_add(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes())), s2); + s3 = v_muladd(v_kyk, v_add(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes())), s3); } v_store(_dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } } else { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 s0 = vx_setall_f32(delta); v_float32 s1 = vx_setall_f32(delta); @@ -1222,10 +1222,10 @@ struct SymmColumnVec_32f8u v_float32 v_kyk = vx_setall_f32(ky[k]); const float* S0 = src[k] + i; const float* S1 = src[-k] + i; - s0 = v_muladd(v_kyk, vx_load(S0) - vx_load(S1), s0); - s1 = v_muladd(v_kyk, vx_load(S0 + v_float32::nlanes) - vx_load(S1 + v_float32::nlanes), s1); - s2 = v_muladd(v_kyk, vx_load(S0 + 2*v_float32::nlanes) - vx_load(S1 + 2*v_float32::nlanes), s2); - s3 = v_muladd(v_kyk, vx_load(S0 + 3*v_float32::nlanes) - vx_load(S1 + 3*v_float32::nlanes), s3); + s0 = v_muladd(v_kyk, v_sub(vx_load(S0), vx_load(S1)), s0); + s1 = v_muladd(v_kyk, v_sub(vx_load(S0 + VTraits::vlanes()), vx_load(S1 + VTraits::vlanes())), s1); + s2 = v_muladd(v_kyk, v_sub(vx_load(S0 + 2 * VTraits::vlanes()), vx_load(S1 + 2 * VTraits::vlanes())), s2); + s3 = v_muladd(v_kyk, v_sub(vx_load(S0 + 3 * VTraits::vlanes()), vx_load(S1 + 3 * VTraits::vlanes())), s3); } v_store(_dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } @@ -1268,55 +1268,52 @@ struct SymmColumnSmallVec_32s16s { if( ky[0] == 2 && ky[1] == 1 ) { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); + v_int32 s1 = vx_load(S1 + i + VTraits::vlanes()); + v_int32 s2 = vx_load(S1 + i + 2*VTraits::vlanes()); + v_int32 s3 = vx_load(S1 + i + 3*VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(s0, s0)), v_add(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(s1, s1))), d8)); + v_store(dst + i + VTraits::vlanes(), v_add(v_pack(v_add(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 
+ i + 2 * VTraits::vlanes())), v_add(s2, s2)), v_add(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes())), v_add(s3, s3))), d8)); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); - i += v_int16::nlanes; + v_int32 sh = vx_load(S1 + i + VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(sl, sl)), v_add(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(sh, sh))), d8)); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); - i += v_int32::nlanes; + v_pack_store(dst + i, v_add(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), vx_setall_s32(d)), v_add(s, s))); + i += VTraits::vlanes(); } } else if( ky[0] == -2 && ky[1] == 1 ) { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), - vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); + v_int32 s1 = vx_load(S1 + i + VTraits::vlanes()); + v_int32 s2 = vx_load(S1 + i + 2*VTraits::vlanes()); + v_int32 s3 = vx_load(S1 + i + 3*VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(s0, s0)), v_sub(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(s1, s1))), d8)); + v_store(dst + i + VTraits::vlanes(), v_add(v_pack(v_sub(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 + i + 2 * VTraits::vlanes())), v_add(s2, s2)), v_sub(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes())), v_add(s3, s3))), d8)); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); - i += v_int16::nlanes; + v_int32 sh = vx_load(S1 + i + VTraits::vlanes()); + v_store(dst + i, v_add(v_pack(v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(sl, sl)), v_sub(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), v_add(sh, sh))), d8)); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); - i += v_int32::nlanes; + v_pack_store(dst + i, v_sub(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), vx_setall_s32(d)), v_add(s, s))); + i += VTraits::vlanes(); } } #if 
CV_NEON @@ -1347,23 +1344,23 @@ struct SymmColumnSmallVec_32s16s else { v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + VTraits::vlanes())), k0, df4))))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 + i + 2 * VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*VTraits::vlanes())), k0, df4))), + v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*VTraits::vlanes())), k0, df4))))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - i += v_int16::nlanes; + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + VTraits::vlanes())), k0, df4))))); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); - i += v_int32::nlanes; + v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); + i += VTraits::vlanes(); } } } @@ -1373,42 +1370,42 @@ struct SymmColumnSmallVec_32s16s { if( ky[1] < 0 ) std::swap(S0, S2); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); + v_store(dst + i, 
v_add(v_pack(v_sub(vx_load(S2 + i), vx_load(S0 + i)), v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), d8)); + v_store(dst + i + VTraits::vlanes(), v_add(v_pack(v_sub(vx_load(S2 + i + 2 * VTraits::vlanes()), vx_load(S0 + i + 2 * VTraits::vlanes())), v_sub(vx_load(S2 + i + 3 * VTraits::vlanes()), vx_load(S0 + i + 3 * VTraits::vlanes()))), d8)); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - i += v_int16::nlanes; + v_store(dst + i, v_add(v_pack(v_sub(vx_load(S2 + i), vx_load(S0 + i)), v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), d8)); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); - i += v_int32::nlanes; + v_pack_store(dst + i, v_add(v_sub(vx_load(S2 + i), vx_load(S0 + i)), vx_setall_s32(d))); + i += VTraits::vlanes(); } } else { v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)), + v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), k1, df4)))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + 2 * VTraits::vlanes()), vx_load(S0 + i + 2 * VTraits::vlanes()))), k1, df4)), + v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + 3 * VTraits::vlanes()), vx_load(S0 + i + 3 * VTraits::vlanes()))), k1, df4)))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - i += v_int16::nlanes; + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)), + v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + VTraits::vlanes()), vx_load(S0 + i + VTraits::vlanes()))), k1, df4)))); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); - i += v_int32::nlanes; + v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4))); + i += VTraits::vlanes(); } } } @@ -1440,7 +1437,7 @@ struct RowVec_16s32f const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); @@ -1451,18 
+1448,18 @@ struct RowVec_16s32f { v_float32 f = vx_setall_f32(_kx[k]); v_int16 xl = vx_load(src); - v_int16 xh = vx_load(src + v_int16::nlanes); + v_int16 xh = vx_load(src + VTraits::vlanes()); s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); @@ -1475,17 +1472,17 @@ struct RowVec_16s32f s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += v_int16::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { const short* src = (const short*)_src + i; v_float32 s0 = vx_setzero_f32(); for( k = 0; k < _ksize; k++, src += cn ) s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } return i; } @@ -1524,92 +1521,92 @@ struct SymmColumnVec_32f16s { v_float32 k0 = vx_setall_f32(ky[0]); v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits::vlanes()), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits::vlanes()), k0, d4); + s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, s1); + s2 = v_muladd(v_add(vx_load(src[1] + i + 2 * VTraits::vlanes()), vx_load(src[-1] + i + 2 * VTraits::vlanes())), k1, s2); + s3 = v_muladd(v_add(vx_load(src[1] + i + 3 * VTraits::vlanes()), vx_load(src[-1] + i + 3 * VTraits::vlanes())), k1, s3); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 
3*v_float32::nlanes), k2, s3); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); + s2 = v_muladd(v_add(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k2, s2); + s3 = v_muladd(v_add(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(s2), v_round(s3))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); + s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, s1); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; + i += VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); + s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0); for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } else { v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); + v_float32 s2 = v_muladd(v_sub(vx_load(src[1] + i + 2 * VTraits::vlanes()), vx_load(src[-1] + i + 2 * VTraits::vlanes())), k1, d4); + v_float32 s3 = v_muladd(v_sub(vx_load(src[1] + i + 3 * VTraits::vlanes()), vx_load(src[-1] + i + 3 * VTraits::vlanes())), k1, d4); for( k 
= 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); + s2 = v_muladd(v_sub(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k2, s2); + s3 = v_muladd(v_sub(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k2, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(s2), v_round(s3))); } - if( i <= width - v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; + i += VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } @@ -1682,52 +1679,52 @@ struct RowVec_32f } #endif v_float32 k0 = vx_setall_f32(_kx[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; - v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; - v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0; + v_float32 s0 = v_mul(vx_load(src), k0); + v_float32 s1 = v_mul(vx_load(src + VTraits::vlanes()), k0); + v_float32 s2 = v_mul(vx_load(src + 2 * VTraits::vlanes()), k0); + v_float32 s3 = v_mul(vx_load(src + 3 * VTraits::vlanes()), k0); src += cn; for( k = 1; k < _ksize; k++, src += cn ) { v_float32 k1 = vx_setall_f32(_kx[k]); s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + 
v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); + s1 = v_muladd(vx_load(src + VTraits::vlanes()), k1, s1); + s2 = v_muladd(vx_load(src + 2*VTraits::vlanes()), k1, s2); + s3 = v_muladd(vx_load(src + 3*VTraits::vlanes()), k1, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; + v_float32 s0 = v_mul(vx_load(src), k0); + v_float32 s1 = v_mul(vx_load(src + VTraits::vlanes()), k0); src += cn; for( k = 1; k < _ksize; k++, src += cn ) { v_float32 k1 = vx_setall_f32(_kx[k]); s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); + s1 = v_muladd(vx_load(src + VTraits::vlanes()), k1, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; + v_float32 s0 = v_mul(vx_load(src), k0); src += cn; for( k = 1; k < _ksize; k++, src += cn ) s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } return i; } @@ -1806,28 +1803,28 @@ struct SymmRowSmallVec_32f { #if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(kx[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(src), k0, v_add(vx_load(src - cn), vx_load(src + cn)))); #else if( kx[0] > 0 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); + v_store(dst + i, v_add(vx_load(src - cn), vx_load(src + cn), x , x)); } else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); + v_store(dst + i, v_sub(v_add(vx_load(src - cn), vx_load(src + cn)), v_add(x, x))); } #endif } else { v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(src), k0, v_mul(v_add(vx_load(src - cn), vx_load(src + cn)), k1))); } } else if( _ksize == 5 ) @@ -1836,21 +1833,21 @@ struct SymmRowSmallVec_32f { #if CV_FMA3 || CV_AVX2 v_float32 k0 = 
vx_setall_f32(-2); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(src), k0, v_add(vx_load(src - 2 * cn), vx_load(src + 2 * cn)))); #else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) { v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); + v_store(dst + i, v_sub(v_add(vx_load(src - 2*cn), vx_load(src + 2*cn)), v_add(x, x))); } #endif } else { v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1))); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_add(vx_load(src + 2 * cn), vx_load(src - 2 * cn)), k2, v_muladd(vx_load(src), k0, v_mul(v_add(vx_load(src - cn), vx_load(src + cn)), k1)))); } } } @@ -1859,20 +1856,20 @@ struct SymmRowSmallVec_32f if( _ksize == 3 ) { if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, vx_load(src + cn) - vx_load(src - cn)); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_sub(vx_load(src + cn), vx_load(src - cn))); else { v_float32 k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_mul(v_sub(vx_load(src + cn), vx_load(src - cn)), k1)); } } else if( _ksize == 5 ) { v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes(), src += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_sub(vx_load(src + 2 * cn), vx_load(src - 2 * cn)), k2, v_mul(v_sub(vx_load(src + cn), vx_load(src - cn)), k1))); } } return i; @@ -1961,46 +1958,46 @@ struct SymmColumnVec_32f #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits::vlanes()), k0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits::vlanes()), k0, d4); for( k = 1; k 
<= ksize2; k++ ) { v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k1, s1); + s2 = v_muladd(v_add(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k1, s2); + s3 = v_muladd(v_add(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k1, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), k0, d4); for( k = 1; k <= ksize2; k++ ) { v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k1, s0); + s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k1, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); for( k = 1; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } else @@ -2042,46 +2039,46 @@ struct SymmColumnVec_32f #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); + v_float32 s2 = v_muladd(v_sub(vx_load(src[1] + i + 2 * VTraits::vlanes()), vx_load(src[-1] + i 
+ 2 * VTraits::vlanes())), k1, d4); + v_float32 s3 = v_muladd(v_sub(vx_load(src[1] + i + 3 * VTraits::vlanes()), vx_load(src[-1] + i + 3 * VTraits::vlanes())), k1, d4); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); + s2 = v_muladd(v_sub(vx_load(src[k] + i + 2 * VTraits::vlanes()), vx_load(src[-k] + i + 2 * VTraits::vlanes())), k2, s2); + s3 = v_muladd(v_sub(vx_load(src[k] + i + 3 * VTraits::vlanes()), vx_load(src[-k] + i + 3 * VTraits::vlanes())), k2, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); + v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits::vlanes()), vx_load(src[-1] + i + VTraits::vlanes())), k1, d4); for( k = 2; k <= ksize2; k++ ) { v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0); + s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits::vlanes()), vx_load(src[-k] + i + VTraits::vlanes())), k2, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); + v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4); for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } } return i; @@ -2123,28 +2120,28 @@ struct SymmColumnSmallVec_32f { #if CV_FMA3 || CV_AVX2 v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_muladd(vx_load(S1 + i), k0, v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), d4))); #else if(ky[0] > 0) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + for( 
; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); + v_store(dst + i, v_add(vx_load(S0 + i), vx_load(S2 + i), d4, x, x)); } else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); + v_store(dst + i, v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i), d4), v_add(x, x))); } #endif } else { v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4))); } } else @@ -2153,14 +2150,14 @@ struct SymmColumnSmallVec_32f { if( ky[1] < 0 ) std::swap(S0, S2); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_add(v_sub(vx_load(S2 + i), vx_load(S0 + i)), d4)); } else { v_float32 k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); + for ( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) + v_store(dst + i, v_muladd(v_sub(vx_load(S2 + i), vx_load(S0 + i)), k1, d4)); } } return i; @@ -2199,7 +2196,7 @@ struct FilterVec_8u v_float32 d4 = vx_setall_f32(delta); v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_uint16 xl, xh; v_expand(vx_load(src[0] + i), xl, xh); @@ -2223,7 +2220,7 @@ struct FilterVec_8u } v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint32 x0, x1; v_expand(vx_load_expand(src[0] + i), x0, x1); @@ -2237,21 +2234,21 @@ struct FilterVec_8u s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); } v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; + i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) + while( i <= width - 4 /*v_int32x4::nlanes*/ ) #else - if( i <= width - v_int32x4::nlanes ) + if( i <= width - v_int32::nlanes ) #endif { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); + v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta)); for( k = 1; k < nz; k++ ) - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); + v_int32 s32 = v_round(s0); + v_int16 s16 = v_pack(s32, s32); + *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16))); + i += 4 
/*v_int32x4::nlanes*/ ; } return i; } @@ -2286,7 +2283,7 @@ struct FilterVec_8u16s v_float32 d4 = vx_setall_f32(delta); v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_uint16 xl, xh; v_expand(vx_load(src[0] + i), xl, xh); @@ -2304,9 +2301,9 @@ struct FilterVec_8u16s s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_round(s2), v_round(s3))); } - if( i <= width - v_uint16::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_uint16 x = vx_load_expand(src[0] + i); v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); @@ -2319,15 +2316,15 @@ struct FilterVec_8u16s s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); } v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); for( k = 1; k < nz; k++ ) s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); v_pack_store(dst + i, v_round(s0)); - i += v_int32::nlanes; + i += VTraits::vlanes(); } return i; } @@ -2360,46 +2357,46 @@ struct FilterVec_32f v_float32 d4 = vx_setall_f32(delta); v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) + for( ; i <= width - 4*VTraits::vlanes(); i += 4*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), f0, d4); + v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits::vlanes()), f0, d4); + v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits::vlanes()), f0, d4); for( k = 1; k < nz; k++ ) { v_float32 f1 = vx_setall_f32(kf[k]); s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); + s1 = v_muladd(vx_load(src[k] + i + VTraits::vlanes()), f1, s1); + s2 = v_muladd(vx_load(src[k] + i + 2*VTraits::vlanes()), f1, s2); + s3 = v_muladd(vx_load(src[k] + i + 3*VTraits::vlanes()), f1, s3); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); + v_store(dst + i + VTraits::vlanes(), s1); + v_store(dst + i + 2*VTraits::vlanes(), s2); + v_store(dst + i + 3*VTraits::vlanes(), s3); } - if( i <= width - 2*v_float32::nlanes ) + if( i <= width - 2*VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits::vlanes()), f0, d4); for( k = 1; k < nz; k++ ) { v_float32 f1 = vx_setall_f32(kf[k]); s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); + 
s1 = v_muladd(vx_load(src[k] + i + VTraits::vlanes()), f1, s1); } v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; + v_store(dst + i + VTraits::vlanes(), s1); + i += 2*VTraits::vlanes(); } - if( i <= width - v_float32::nlanes ) + if( i <= width - VTraits::vlanes() ) { v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); for( k = 1; k < nz; k++ ) s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); v_store(dst + i, s0); - i += v_float32::nlanes; + i += VTraits::vlanes(); } return i; } diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 96ba338f30..9961e9aace 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -1156,13 +1156,13 @@ public: for(; x < numCols; ++x ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { v_uint8 v_zero = vx_setzero_u8(); - for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) { - v_uint8 v_edge1 = (vx_load(edgeData + x ) != v_zero); - v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero); + for(; x <= numCols - 2*VTraits::vlanes(); x += 2*VTraits::vlanes()) { + v_uint8 v_edge1 = (v_ne(vx_load(edgeData + x), v_zero)); + v_uint8 v_edge2 = (v_ne(vx_load(edgeData + x + VTraits::vlanes()), v_zero)); if(v_check_any(v_edge1)) { @@ -1172,7 +1172,7 @@ public: if(v_check_any(v_edge2)) { - x += v_uint8::nlanes + v_scan_forward(v_edge2); + x += VTraits::vlanes() + v_scan_forward(v_edge2); goto _next_step; } } @@ -1183,7 +1183,7 @@ public: if(x == numCols) continue; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) _next_step: #endif float vx, vy; @@ -1514,7 +1514,7 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po int nzCount = 0; const Point* nz_ = &nz[0]; int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_float32 v_minRadius2 = vx_setall_f32(minRadius2); const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2); @@ -1522,9 +1522,9 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po v_float32 v_curCenterX = vx_setall_f32(curCenter.x); v_float32 v_curCenterY = vx_setall_f32(curCenter.y); - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; - int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; - for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes) + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits::max_nlanes]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits::max_nlanes]; + for(; j <= nzSz - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_nzX, v_nzY; v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype @@ -1532,16 +1532,16 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX)); v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY)); - v_float32 v_dx = v_x - v_curCenterX; - v_float32 v_dy = v_y - v_curCenterY; + v_float32 v_dx = v_sub(v_x, v_curCenterX); + v_float32 v_dy = v_sub(v_y, v_curCenterY); - v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy); - v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2); + v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_mul(v_dy, v_dy)); + v_float32 vmask = v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)); if (v_check_any(vmask)) { v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); v_store_aligned(rbuf, v_r2); - for (int i = 0; i < v_int32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) if (rmask[i]) ddata[nzCount++] = rbuf[i]; } } @@ -1573,13 +1573,13 @@ inline int 
HoughCircleEstimateRadiusInvoker::filterCircles(const Poi const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols)); const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows)); -#if CV_SIMD - float v_seq[v_float32::nlanes]; - for (int i = 0; i < v_float32::nlanes; ++i) +#if (CV_SIMD || CV_SIMD_SCALABLE) + float v_seq[VTraits::max_nlanes]; + for (int i = 0; i < VTraits::vlanes(); ++i) v_seq[i] = (float)i; const v_float32 v_minRadius2 = vx_setall_f32(minRadius2); const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2); - const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq); + const v_float32 v_curCenterX_0123 = v_sub(vx_setall_f32(curCenter.x), vx_load(v_seq)); #endif for (int y = yOuter.start; y < yOuter.end; y++) @@ -1589,27 +1589,27 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Poi float dy2 = dy * dy; int x = xOuter.start; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) { const v_float32 v_dy2 = vx_setall_f32(dy2); const v_uint32 v_zero_u32 = vx_setall_u32(0); - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; - int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; - for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes) + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits::max_nlanes]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits::max_nlanes]; + for (; x <= xOuter.end - VTraits::vlanes(); x += VTraits::vlanes()) { v_uint32 v_mask = vx_load_expand_q(ptr + x); - v_mask = v_mask != v_zero_u32; + v_mask = v_ne(v_mask, v_zero_u32); v_float32 v_x = v_cvt_f32(vx_setall_s32(x)); - v_float32 v_dx = v_x - v_curCenterX_0123; + v_float32 v_dx = v_sub(v_x, v_curCenterX_0123); - v_float32 v_r2 = (v_dx * v_dx) + v_dy2; - v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask); + v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_dy2); + v_float32 vmask = v_and(v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)), v_reinterpret_as_f32(v_mask)); if (v_check_any(vmask)) { v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); v_store_aligned(rbuf, v_r2); - for (int i = 0; i < v_int32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) if (rmask[i]) ddata[nzCount++] = rbuf[i]; } } diff --git a/modules/imgproc/src/stackblur.cpp b/modules/imgproc/src/stackblur.cpp index 5d60a1d365..6becbe5c41 100644 --- a/modules/imgproc/src/stackblur.cpp +++ b/modules/imgproc/src/stackblur.cpp @@ -88,7 +88,7 @@ static unsigned char const stackblurShr[255] = namespace cv{ -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template inline int opRow(const T* , T* , const std::vector& , const float , const int radius, const int CN, const int ) { @@ -107,7 +107,7 @@ inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector::vlanes(); if (kernelSize == 3) { @@ -126,10 +126,10 @@ inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector>shrValTab; - y01 = (y01 * v_mulVal)>>shrValTab; - y10 = (y10 * v_mulVal)>>shrValTab; - y11 = (y11 * v_mulVal)>>shrValTab; + y00 = v_shr(v_mul(y00, v_mulVal), shrValTab); + y01 = v_shr(v_mul(y01, v_mulVal), shrValTab); + y10 = v_shr(v_mul(y10, v_mulVal), shrValTab); + y11 = v_shr(v_mul(y11, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11))); } @@ -159,12 +159,12 @@ inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector(const uchar* srcPtr, uchar* dstPtr, const 
std::vector>shrValTab; - s1 = (s1 * v_mulVal)>>shrValTab; - s2 = (s2 * v_mulVal)>>shrValTab; - s3 = (s3 * v_mulVal)>>shrValTab; + s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); + s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); + s2 = v_shr(v_mul(s2, v_mulVal), shrValTab); + s3 = v_shr(v_mul(s3, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3)))); } @@ -205,7 +205,7 @@ inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector const int mulValTab= stackblurMul[radius]; const int shrValTab= stackblurShr[radius]; - const int VEC_LINE = v_uint16::nlanes; + const int VEC_LINE = VTraits::vlanes(); v_uint32 v_mulVal = vx_setall_u32(mulValTab); if (kernelSize == 3) @@ -220,7 +220,7 @@ inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); - v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab)); + v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab))); } } else @@ -243,25 +243,25 @@ inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector v_uint16 k2 = vx_setall_u16(kx[k + 1]); v_uint32 y0, y1; - v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); - s0 += y0; - s1 += y1; - v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1); - s0 += y0; - s1 += y1; + v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); + s0 = v_add(s0, y0); + s1 = v_add(s1, y1); + v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1); + s0 = v_add(s0, y0); + s1 = v_add(s1, y1); } if( k < kernelSize / 2 + 1 ) { v_uint16 k1 = vx_setall_u16(kx[k]); v_uint32 y0, y1; - v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); - s0 += y0; - s1 += y1; + v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); + s0 = v_add(s0, y0); + s1 = v_add(s1, y1); } - s0 = (s0 * v_mulVal)>>shrValTab; - s1 = (s1 * v_mulVal)>>shrValTab; + s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); + s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(s0, s1)); } @@ -282,7 +282,7 @@ inline int opRow(const short* srcPtr, short* dstPtr, const std::vector::vlanes(); v_int32 v_mulVal = vx_setall_s32(mulValTab); if (kernelSize == 3) @@ -297,7 +297,7 @@ inline int opRow(const short* srcPtr, short* dstPtr, const std::vector>shrValTab, (x1h * v_mulVal)>>shrValTab)); + v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab))); } } else @@ -320,24 +320,24 @@ inline int opRow(const short* srcPtr, short* dstPtr, const std::vector>shrValTab; - s1 = (s1 * v_mulVal)>>shrValTab; + s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); + s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); v_store(dstPtr + i, v_pack(s0, s1)); } @@ -352,7 +352,7 @@ inline int opRow(const float* srcPtr, float* dstPtr, const std::vector::vlanes(); const int VEC_LINE4 = VEC_LINE * 4; if (kernelSize == 3) @@ -364,22 +364,22 @@ inline int opRow(const float* srcPtr, float* dstPtr, const std::vector(const float* srcPtr, float* dstPtr, const std::vector(const float* srcPtr, float* dstPtr, const std::vector inline int opComputeDiff(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1) { int index = 0; - const int VEC_LINE_8 = v_uint8::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_8 = VTraits::vlanes(); + const int 
VEC_LINE_32 = VTraits::vlanes(); for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8) { v_uint16 x0l, x0h, x1l, x1h; @@ -435,8 +435,8 @@ inline int opComputeDiff(const uchar*& srcPtr, int*& diff0, const in v_expand(vx_load(srcPtr), x1l, x1h); v_int32 y0, y1, y2, y3; - v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1); - v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3); + v_expand(v_sub(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x1l)), y0, y1); + v_expand(v_sub(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x1h)), y2, y3); v_store(diff0, y0); v_store(diff0 + VEC_LINE_32, y1); @@ -517,7 +517,7 @@ public: // middle int wc = radius * CN; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) wc = opRow(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN); #endif for (; wc < widthCN; wc++) @@ -586,7 +586,7 @@ public: // middle auto diff0 = diff + radius * CN; int index = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) index = opComputeDiff(srcPtr, diff0, widthCN, CNR1); #endif @@ -688,7 +688,7 @@ private: float mulVal; }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float , const int , const int , const int , const int , const int ) @@ -703,7 +703,7 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta { int k = 0; v_float32 v_mulVal = vx_setall_f32(mulVal); - const int VEC_LINE = v_float32::nlanes; + const int VEC_LINE = VTraits::vlanes(); const int VEC_LINE4 = 4 * VEC_LINE; auto stackStartPtr = stack + ss * widthLen; @@ -726,20 +726,20 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k); v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k); - v_store(dstPtr + k, v_sum0 * v_mulVal); - v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal); - v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal); - v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal); + v_store(dstPtr + k, v_mul(v_sum0, v_mulVal)); + v_store(dstPtr + VEC_LINE + k, v_mul(v_sum1, v_mulVal)); + v_store(dstPtr + VEC_LINE * 2 + k, v_mul(v_sum2, v_mulVal)); + v_store(dstPtr + VEC_LINE * 3 + k, v_mul(v_sum3, v_mulVal)); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; - v_sum2 -= v_sumOut2; - v_sum3 -= v_sumOut3; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); + v_sum2 = v_sub(v_sum2, v_sumOut2); + v_sum3 = v_sub(v_sum3, v_sumOut3); - v_sumOut0 -= vx_load(stackStartPtr + k); - v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k); - v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k); - v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k); + v_sumOut0 = v_sub(v_sumOut0, vx_load(stackStartPtr + k)); + v_sumOut1 = v_sub(v_sumOut1, vx_load(stackStartPtr + VEC_LINE + k)); + v_sumOut2 = v_sub(v_sumOut2, vx_load(stackStartPtr + VEC_LINE * 2 + k)); + v_sumOut3 = v_sub(v_sumOut3, vx_load(stackStartPtr + VEC_LINE * 3 + k)); v_float32 v_srcPtr0 = vx_load(srcPtr + k); v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k); @@ -751,35 +751,35 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2); v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3); - v_sumIn0 += v_srcPtr0; - v_sumIn1 += v_srcPtr1; - v_sumIn2 += v_srcPtr2; - v_sumIn3 += v_srcPtr3; + v_sumIn0 = v_add(v_sumIn0, v_srcPtr0); + v_sumIn1 = v_add(v_sumIn1, v_srcPtr1); + v_sumIn2 = v_add(v_sumIn2, v_srcPtr2); + v_sumIn3 = v_add(v_sumIn3, v_srcPtr3); 
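/* Illustrative sketch (not part of the patch): the conversion pattern used throughout
   these stackblur/filter hunks, i.e. operator-based universal intrinsics rewritten as
   named calls (v_add/v_sub/v_mul) and v_float32::nlanes replaced by the runtime
   VTraits<v_float32>::vlanes(), so the same kernel compiles for fixed-size SIMD and
   scalable RVV. Assumes <opencv2/core/hal/intrin.hpp>; axpy_f32 is a hypothetical
   helper used only for illustration.

   #include <opencv2/core/hal/intrin.hpp>

   static void axpy_f32(const float* a, const float* b, float* dst, int n, float scale)
   {
   #if (CV_SIMD || CV_SIMD_SCALABLE)
       using namespace cv;
       const int VEC_LINE = VTraits<v_float32>::vlanes(); // lane count, known only at run time on RVV
       const v_float32 v_scale = vx_setall_f32(scale);
       int i = 0;
       for (; i <= n - VEC_LINE; i += VEC_LINE)            // was: (vx_load(a+i) + vx_load(b+i)) * v_scale
           v_store(dst + i, v_mul(v_add(vx_load(a + i), vx_load(b + i)), v_scale));
       for (; i < n; i++)                                  // scalar tail
           dst[i] = (a[i] + b[i]) * scale;
   #else
       for (int i = 0; i < n; i++)
           dst[i] = (a[i] + b[i]) * scale;
   #endif
   }
*/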
- v_store(sum + k, v_sum0 + v_sumIn0); - v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1); - v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2); - v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3); + v_store(sum + k, v_add(v_sum0, v_sumIn0)); + v_store(sum + VEC_LINE + k, v_add(v_sum1, v_sumIn1)); + v_store(sum + VEC_LINE * 2 + k, v_add(v_sum2, v_sumIn2)); + v_store(sum + VEC_LINE * 3 + k, v_add(v_sum3, v_sumIn3)); v_srcPtr0 = vx_load(stackSp1Ptr + k); v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k); v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k); v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k); - v_sumOut0 += v_srcPtr0; - v_sumOut1 += v_srcPtr1; - v_sumOut2 += v_srcPtr2; - v_sumOut3 += v_srcPtr3; + v_sumOut0 = v_add(v_sumOut0, v_srcPtr0); + v_sumOut1 = v_add(v_sumOut1, v_srcPtr1); + v_sumOut2 = v_add(v_sumOut2, v_srcPtr2); + v_sumOut3 = v_add(v_sumOut3, v_srcPtr3); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE + k, v_sumOut1); v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2); v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3); - v_sumIn0 -= v_srcPtr0; - v_sumIn1 -= v_srcPtr1; - v_sumIn2 -= v_srcPtr2; - v_sumIn3 -= v_srcPtr3; + v_sumIn0 = v_sub(v_sumIn0, v_srcPtr0); + v_sumIn1 = v_sub(v_sumIn1, v_srcPtr1); + v_sumIn2 = v_sub(v_sumIn2, v_srcPtr2); + v_sumIn3 = v_sub(v_sumIn3, v_srcPtr3); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE + k, v_sumIn1); @@ -793,20 +793,20 @@ inline int opColumn(const float* srcPtr, float* dstPtr, float* sta v_float32 v_sumOut = vx_load(sumOut + k); v_float32 v_sumIn = vx_load(sumIn + k); - v_store(dstPtr + k, v_sum * v_mulVal); - v_sum -= v_sumOut; - v_sumOut -= vx_load(stackStartPtr + k); + v_store(dstPtr + k, v_mul(v_sum, v_mulVal)); + v_sum = v_sub(v_sum, v_sumOut); + v_sumOut = v_sub(v_sumOut, vx_load(stackStartPtr + k)); v_float32 v_srcPtr = vx_load(srcPtr + k); v_store(stackStartPtr + k, v_srcPtr); - v_sumIn += v_srcPtr; - v_store(sum + k, v_sum + v_sumIn); + v_sumIn = v_add(v_sumIn, v_srcPtr); + v_store(sum + k, v_add(v_sum, v_sumIn)); v_srcPtr = vx_load(stackSp1Ptr + k); - v_sumOut += v_srcPtr; + v_sumOut = v_add(v_sumOut, v_srcPtr); v_store(sumOut + k, v_sumOut); - v_sumIn -= v_srcPtr; + v_sumIn = v_sub(v_sumIn, v_srcPtr); v_store(sumIn + k, v_sumIn); } return k; @@ -820,8 +820,8 @@ inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack int k = 0; if (mulValTab != 0 && shrValTab != 0) { - const int VEC_LINE_8 = v_uint8::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_8 = VTraits::vlanes(); + const int VEC_LINE_32 = VTraits::vlanes(); v_int32 v_mulVal = vx_setall_s32(mulValTab); auto stackStartPtr = stack + ss * widthLen; @@ -850,13 +850,13 @@ inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack v_store(dstPtr + k, v_pack( - v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)), - v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab)))); + v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))), + v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum2, v_mulVal), shrValTab), v_shr(v_mul(v_sum3, v_mulVal), shrValTab))))); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; - v_sum2 -= v_sumOut2; - v_sum3 -= v_sumOut3; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); + v_sum2 = v_sub(v_sum2, v_sumOut2); + v_sum3 = v_sub(v_sum3, v_sumOut3); v_uint16 x0l, x0h; v_int32 v_ss0, v_ss1, v_ss2, v_ss3; @@ -865,10 +865,10 @@ inline int 
opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); - v_sumOut0 -= v_ss0; - v_sumOut1 -= v_ss1; - v_sumOut2 -= v_ss2; - v_sumOut3 -= v_ss3; + v_sumOut0 = v_sub(v_sumOut0, v_ss0); + v_sumOut1 = v_sub(v_sumOut1, v_ss1); + v_sumOut2 = v_sub(v_sumOut2, v_ss2); + v_sumOut3 = v_sub(v_sumOut3, v_ss3); v_expand(vx_load(srcPtr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); @@ -876,34 +876,34 @@ inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar)); - v_sumIn0 += v_ss0; - v_sumIn1 += v_ss1; - v_sumIn2 += v_ss2; - v_sumIn3 += v_ss3; + v_sumIn0 = v_add(v_sumIn0, v_ss0); + v_sumIn1 = v_add(v_sumIn1, v_ss1); + v_sumIn2 = v_add(v_sumIn2, v_ss2); + v_sumIn3 = v_add(v_sumIn3, v_ss3); - v_store(sum + k, v_sum0 + v_sumIn0); - v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1); - v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2); - v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3); + v_store(sum + k, v_add(v_sum0, v_sumIn0)); + v_store(sum + VEC_LINE_32 + k, v_add(v_sum1, v_sumIn1)); + v_store(sum + VEC_LINE_32 * 2 + k, v_add(v_sum2, v_sumIn2)); + v_store(sum + VEC_LINE_32 * 3 + k, v_add(v_sum3, v_sumIn3)); v_expand(vx_load(stackSp1Ptr + k), x0l, x0h); v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); - v_sumOut0 += v_ss0; - v_sumOut1 += v_ss1; - v_sumOut2 += v_ss2; - v_sumOut3 += v_ss3; + v_sumOut0 = v_add(v_sumOut0, v_ss0); + v_sumOut1 = v_add(v_sumOut1, v_ss1); + v_sumOut2 = v_add(v_sumOut2, v_ss2); + v_sumOut3 = v_add(v_sumOut3, v_ss3); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2); v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3); - v_sumIn0 -= v_ss0; - v_sumIn1 -= v_ss1; - v_sumIn2 -= v_ss2; - v_sumIn3 -= v_ss3; + v_sumIn0 = v_sub(v_sumIn0, v_ss0); + v_sumIn1 = v_sub(v_sumIn1, v_ss1); + v_sumIn2 = v_sub(v_sumIn2, v_ss2); + v_sumIn3 = v_sub(v_sumIn3, v_ss3); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); @@ -922,8 +922,8 @@ inline int opColumn(const short* srcPtr, short* dstPtr, short* stack int k = 0; if (mulValTab != 0 && shrValTab != 0) { - const int VEC_LINE_16 = v_int16::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_16 = VTraits::vlanes(); + const int VEC_LINE_32 = VTraits::vlanes(); v_int32 v_mulVal = vx_setall_s32(mulValTab); auto stackStartPtr = stack + ss * widthLen; @@ -943,39 +943,39 @@ inline int opColumn(const short* srcPtr, short* dstPtr, short* stack v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); - v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)); + v_store(dstPtr + k,v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); v_int32 v_ss0, v_ss1; v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); - v_sumOut0 -= v_ss0; - v_sumOut1 -= v_ss1; + v_sumOut0 = v_sub(v_sumOut0, v_ss0); + v_sumOut1 = v_sub(v_sumOut1, v_ss1); v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short)); - v_sumIn0 += v_ss0; - v_sumIn1 += v_ss1; + v_sumIn0 = v_add(v_sumIn0, v_ss0); + v_sumIn1 = v_add(v_sumIn1, v_ss1); - v_sum0 += v_sumIn0; - 
v_sum1 += v_sumIn1; + v_sum0 = v_add(v_sum0, v_sumIn0); + v_sum1 = v_add(v_sum1, v_sumIn1); v_store(sum + k, v_sum0); v_store(sum + VEC_LINE_32 + k, v_sum1); v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); - v_sumOut0 += v_ss0; - v_sumOut1 += v_ss1; + v_sumOut0 = v_add(v_sumOut0, v_ss0); + v_sumOut1 = v_add(v_sumOut1, v_ss1); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); - v_sumIn0 -= v_ss0; - v_sumIn1 -= v_ss1; + v_sumIn0 = v_sub(v_sumIn0, v_ss0); + v_sumIn1 = v_sub(v_sumIn1, v_ss1); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); @@ -992,8 +992,8 @@ inline int opColumn(const ushort* srcPtr, ushort* dstPtr, ushort* s int k = 0; if (mulValTab != 0 && shrValTab != 0) { - const int VEC_LINE_16 = v_uint16::nlanes; - const int VEC_LINE_32 = v_int32::nlanes; + const int VEC_LINE_16 = VTraits::vlanes(); + const int VEC_LINE_32 = VTraits::vlanes(); v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab); auto stackStartPtr = stack + ss * widthLen; @@ -1013,40 +1013,40 @@ inline int opColumn(const ushort* srcPtr, ushort* dstPtr, ushort* s v_sumOut0 = vx_load(sumOut + k); v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); - v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab)); + v_store(dstPtr + k, v_pack(v_shr(v_mul(v_reinterpret_as_u32(v_sum0), v_mulVal), shrValTab), v_shr(v_mul(v_reinterpret_as_u32(v_sum1), v_mulVal), shrValTab))); - v_sum0 -= v_sumOut0; - v_sum1 -= v_sumOut1; + v_sum0 = v_sub(v_sum0, v_sumOut0); + v_sum1 = v_sub(v_sum1, v_sumOut1); v_uint32 v_ss0, v_ss1; v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); - v_sumOut0 -= v_reinterpret_as_s32(v_ss0); - v_sumOut1 -= v_reinterpret_as_s32(v_ss1); + v_sumOut0 = v_sub(v_sumOut0, v_reinterpret_as_s32(v_ss0)); + v_sumOut1 = v_sub(v_sumOut1, v_reinterpret_as_s32(v_ss1)); v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort)); - v_sumIn0 += v_reinterpret_as_s32(v_ss0); - v_sumIn1 += v_reinterpret_as_s32(v_ss1); + v_sumIn0 = v_add(v_sumIn0, v_reinterpret_as_s32(v_ss0)); + v_sumIn1 = v_add(v_sumIn1, v_reinterpret_as_s32(v_ss1)); - v_sum0 += v_sumIn0; - v_sum1 += v_sumIn1; + v_sum0 = v_add(v_sum0, v_sumIn0); + v_sum1 = v_add(v_sum1, v_sumIn1); v_store(sum + k, v_sum0); v_store(sum + VEC_LINE_32 + k, v_sum1); v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); - v_sumOut0 += v_reinterpret_as_s32(v_ss0); - v_sumOut1 += v_reinterpret_as_s32(v_ss1); + v_sumOut0 = v_add(v_sumOut0, v_reinterpret_as_s32(v_ss0)); + v_sumOut1 = v_add(v_sumOut1, v_reinterpret_as_s32(v_ss1)); v_store(sumOut + k, v_sumOut0); v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); - v_sumIn0 -= v_reinterpret_as_s32(v_ss0); - v_sumIn1 -= v_reinterpret_as_s32(v_ss1); + v_sumIn0 = v_sub(v_sumIn0, v_reinterpret_as_s32(v_ss0)); + v_sumIn1 = v_sub(v_sumIn1, v_reinterpret_as_s32(v_ss1)); v_store(sumIn + k, v_sumIn0); v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); @@ -1152,7 +1152,7 @@ public: } int k = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) k = opColumn(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab, widthLen, stackStart, sp1); #endif diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index bed0d37f26..4622691e68 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -190,7 +190,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) int j = 0; const uchar* src = _src.ptr(); uchar* 
dst = _dst.ptr(); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint8 thresh_u = vx_setall_u8( thresh ); v_uint8 maxval16 = vx_setall_u8( maxval ); @@ -199,12 +199,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_BINARY: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = thresh_u < v0; - v0 = v0 & maxval16; + v0 = v_lt(thresh_u, v0); + v0 = v_and(v0, maxval16); v_store( dst + j, v0 ); } } @@ -213,12 +213,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_BINARY_INV: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = v0 <= thresh_u; - v0 = v0 & maxval16; + v0 = v_le(v0, thresh_u); + v0 = v_and(v0, maxval16); v_store( dst + j, v0 ); } } @@ -227,11 +227,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_TRUNC: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = v0 - ( v0 - thresh_u ); + v0 = v_sub(v0, v_sub(v0, thresh_u)); v_store( dst + j, v0 ); } } @@ -240,11 +240,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_TOZERO: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = ( thresh_u < v0 ) & v0; + v0 = v_and(v_lt(thresh_u, v0), v0); v_store( dst + j, v0 ); } } @@ -253,11 +253,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) case THRESH_TOZERO_INV: for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { - for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes) + for( j = 0; j <= roi.width - VTraits::vlanes(); j += VTraits::vlanes()) { v_uint8 v0; v0 = vx_load( src + j ); - v0 = ( v0 <= thresh_u ) & v0; + v0 = v_and(v_le(v0, thresh_u), v0); v_store( dst + j, v0 ); } } @@ -351,7 +351,7 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) const ushort* src = _src.ptr(); ushort* dst = _dst.ptr(); -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int i, j; v_uint16 thresh_u = vx_setall_u16(thresh); v_uint16 maxval16 = vx_setall_u16(maxval); @@ -361,25 +361,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) case THRESH_BINARY: for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { - for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (j = 0; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = thresh_u < v0; - v1 = thresh_u < v1; - v0 = v0 & maxval16; - v1 = v1 & maxval16; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_lt(thresh_u, v0); + v1 = v_lt(thresh_u, v1); + v0 = v_and(v0, maxval16); + v1 = v_and(v1, 
maxval16); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = thresh_u < v0; - v0 = v0 & maxval16; + v0 = v_lt(thresh_u, v0); + v0 = v_and(v0, maxval16); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -391,25 +391,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = v0 <= thresh_u; - v1 = v1 <= thresh_u; - v0 = v0 & maxval16; - v1 = v1 & maxval16; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_le(v0, thresh_u); + v1 = v_le(v1, thresh_u); + v0 = v_and(v0, maxval16); + v1 = v_and(v1, maxval16); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = v0 <= thresh_u; - v0 = v0 & maxval16; + v0 = v_le(v0, thresh_u); + v0 = v_and(v0, maxval16); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -421,22 +421,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); + v1 = vx_load(src + j + VTraits::vlanes()); v0 = v_min(v0, thresh_u); v1 = v_min(v1, thresh_u); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); v0 = v_min(v0, thresh_u); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -448,22 +448,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = (thresh_u < v0) & v0; - v1 = (thresh_u < v1) & v1; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_and(v_lt(thresh_u, v0), v0); + v1 = v_and(v_lt(thresh_u, v1), v1); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = (thresh_u < v0) & v0; + v0 = v_and(v_lt(thresh_u, v0), v0); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -475,22 +475,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type) for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step) { j = 0; - for (; j 
<= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes) + for (; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes()) { v_uint16 v0, v1; v0 = vx_load(src + j); - v1 = vx_load(src + j + v_uint16::nlanes); - v0 = (v0 <= thresh_u) & v0; - v1 = (v1 <= thresh_u) & v1; + v1 = vx_load(src + j + VTraits::vlanes()); + v0 = v_and(v_le(v0, thresh_u), v0); + v1 = v_and(v_le(v1, thresh_u), v1); v_store(dst + j, v0); - v_store(dst + j + v_uint16::nlanes, v1); + v_store(dst + j + VTraits::vlanes(), v1); } - if (j <= roi.width - v_uint16::nlanes) + if (j <= roi.width - VTraits::vlanes()) { v_uint16 v0 = vx_load(src + j); - v0 = (v0 <= thresh_u) & v0; + v0 = v_and(v_le(v0, thresh_u), v0); v_store(dst + j, v0); - j += v_uint16::nlanes; + j += VTraits::vlanes(); } for (; j < roi.width; j++) @@ -571,7 +571,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) } #endif -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int i, j; v_int16 thresh8 = vx_setall_s16( thresh ); v_int16 maxval8 = vx_setall_s16( maxval ); @@ -582,25 +582,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = thresh8 < v0; - v1 = thresh8 < v1; - v0 = v0 & maxval8; - v1 = v1 & maxval8; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_lt(thresh8, v0); + v1 = v_lt(thresh8, v1); + v0 = v_and(v0, maxval8); + v1 = v_and(v1, maxval8); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = thresh8 < v0; - v0 = v0 & maxval8; + v0 = v_lt(thresh8, v0); + v0 = v_and(v0, maxval8); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -612,25 +612,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = v0 <= thresh8; - v1 = v1 <= thresh8; - v0 = v0 & maxval8; - v1 = v1 & maxval8; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_le(v0, thresh8); + v1 = v_le(v1, thresh8); + v0 = v_and(v0, maxval8); + v1 = v_and(v1, maxval8); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = v0 <= thresh8; - v0 = v0 & maxval8; + v0 = v_le(v0, thresh8); + v0 = v_and(v0, maxval8); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -642,22 +642,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; 
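/* Illustrative sketch (not part of the patch): the comparison/bitwise rewrite used in
   these thresh.cpp hunks - `thresh < v0` becomes v_lt(thresh, v0) and `mask & val`
   becomes v_and(mask, val). Assumes <opencv2/core/hal/intrin.hpp> with CV_SIMD or
   CV_SIMD_SCALABLE enabled; binary_row_16s is a hypothetical helper, not OpenCV API.

   static void binary_row_16s(const short* src, short* dst, int width, short thresh, short maxval)
   {
       using namespace cv;
       const v_int16 vthresh = vx_setall_s16(thresh);
       const v_int16 vmaxval = vx_setall_s16(maxval);
       int j = 0;
       for (; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
       {
           v_int16 v0 = vx_load(src + j);
           v0 = v_and(v_lt(vthresh, v0), vmaxval); // all-ones lane where src > thresh, then keep maxval there
           v_store(dst + j, v0);
       }
       for (; j < width; j++)                      // scalar tail for the remainder
           dst[j] = src[j] > thresh ? maxval : 0;
   }
*/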
v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); + v1 = vx_load( src + j + VTraits::vlanes() ); v0 = v_min( v0, thresh8 ); v1 = v_min( v1, thresh8 ); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); v0 = v_min( v0, thresh8 ); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -669,22 +669,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = ( thresh8 < v0 ) & v0; - v1 = ( thresh8 < v1 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_lt(thresh8, v0), v0); + v1 = v_and(v_lt(thresh8, v1), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = ( thresh8 < v0 ) & v0; + v0 = v_and(v_lt(thresh8, v0), v0); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -696,22 +696,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_int16 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_int16::nlanes ); - v0 = ( v0 <= thresh8 ) & v0; - v1 = ( v1 <= thresh8 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_le(v0, thresh8), v0); + v1 = v_and(v_le(v1, thresh8), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_int16::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_int16::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_int16 v0 = vx_load( src + j ); - v0 = ( v0 <= thresh8 ) & v0; + v0 = v_and(v_le(v0, thresh8), v0); v_store( dst + j, v0 ); - j += v_int16::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -777,7 +777,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) } #endif -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int i, j; v_float32 thresh4 = vx_setall_f32( thresh ); v_float32 maxval4 = vx_setall_f32( maxval ); @@ -788,25 +788,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = thresh4 < v0; - v1 = thresh4 < v1; - v0 = v0 & maxval4; - v1 = v1 & maxval4; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_lt(thresh4, v0); + v1 = v_lt(thresh4, v1); + v0 = v_and(v0, maxval4); + v1 = v_and(v1, maxval4); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), 
v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = thresh4 < v0; - v0 = v0 & maxval4; + v0 = v_lt(thresh4, v0); + v0 = v_and(v0, maxval4); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -818,25 +818,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = v0 <= thresh4; - v1 = v1 <= thresh4; - v0 = v0 & maxval4; - v1 = v1 & maxval4; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_le(v0, thresh4); + v1 = v_le(v1, thresh4); + v0 = v_and(v0, maxval4); + v1 = v_and(v1, maxval4); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = v0 <= thresh4; - v0 = v0 & maxval4; + v0 = v_le(v0, thresh4); + v0 = v_and(v0, maxval4); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -848,22 +848,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); + v1 = vx_load( src + j + VTraits::vlanes() ); v0 = v_min( v0, thresh4 ); v1 = v_min( v1, thresh4 ); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); v0 = v_min( v0, thresh4 ); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -875,22 +875,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = ( thresh4 < v0 ) & v0; - v1 = ( thresh4 < v1 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_lt(thresh4, v0), v0); + v1 = v_and(v_lt(thresh4, v1), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = ( thresh4 < v0 ) & v0; + v0 = v_and(v_lt(thresh4, v0), v0); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -902,22 +902,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 
2*v_float32::nlanes; j += 2*v_float32::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float32 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float32::nlanes ); - v0 = ( v0 <= thresh4 ) & v0; - v1 = ( v1 <= thresh4 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_le(v0, thresh4), v0); + v1 = v_and(v_le(v1, thresh4), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float32::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float32::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float32 v0 = vx_load( src + j ); - v0 = ( v0 <= thresh4 ) & v0; + v0 = v_and(v_le(v0, thresh4), v0); v_store( dst + j, v0 ); - j += v_float32::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -948,7 +948,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) roi.height = 1; } -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) int i, j; v_float64 thresh2 = vx_setall_f64( thresh ); v_float64 maxval2 = vx_setall_f64( maxval ); @@ -959,25 +959,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = thresh2 < v0; - v1 = thresh2 < v1; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_lt(thresh2, v0); + v1 = v_lt(thresh2, v1); + v0 = v_and(v0, maxval2); + v1 = v_and(v1, maxval2); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = thresh2 < v0; - v0 = v0 & maxval2; + v0 = v_lt(thresh2, v0); + v0 = v_and(v0, maxval2); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -989,25 +989,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = v0 <= thresh2; - v1 = v1 <= thresh2; - v0 = v0 & maxval2; - v1 = v1 & maxval2; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_le(v0, thresh2); + v1 = v_le(v1, thresh2); + v0 = v_and(v0, maxval2); + v1 = v_and(v1, maxval2); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = v0 <= thresh2; - v0 = v0 & maxval2; + v0 = v_le(v0, thresh2); + v0 = v_and(v0, maxval2); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -1019,22 +1019,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= 
roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); + v1 = vx_load( src + j + VTraits::vlanes() ); v0 = v_min( v0, thresh2 ); v1 = v_min( v1, thresh2 ); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); v0 = v_min( v0, thresh2 ); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -1046,22 +1046,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = ( thresh2 < v0 ) & v0; - v1 = ( thresh2 < v1 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_lt(thresh2, v0), v0); + v1 = v_and(v_lt(thresh2, v1), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = ( thresh2 < v0 ) & v0; + v0 = v_and(v_lt(thresh2, v0), v0); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ ) @@ -1073,22 +1073,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type) for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; - for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes ) + for( ; j <= roi.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes() ) { v_float64 v0, v1; v0 = vx_load( src + j ); - v1 = vx_load( src + j + v_float64::nlanes ); - v0 = ( v0 <= thresh2 ) & v0; - v1 = ( v1 <= thresh2 ) & v1; + v1 = vx_load( src + j + VTraits::vlanes() ); + v0 = v_and(v_le(v0, thresh2), v0); + v1 = v_and(v_le(v1, thresh2), v1); v_store( dst + j, v0 ); - v_store( dst + j + v_float64::nlanes, v1 ); + v_store( dst + j + VTraits::vlanes(), v1 ); } - if( j <= roi.width - v_float64::nlanes ) + if( j <= roi.width - VTraits::vlanes() ) { v_float64 v0 = vx_load( src + j ); - v0 = ( v0 <= thresh2 ) & v0; + v0 = v_and(v_le(v0, thresh2), v0); v_store( dst + j, v0 ); - j += v_float64::nlanes; + j += VTraits::vlanes(); } for( ; j < roi.width; j++ )
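/* Illustrative sketch (not part of the patch): why the hough.cpp hunks above size their
   scratch arrays with VTraits<...>::max_nlanes but loop with vlanes(). Under scalable
   SIMD the lane count is a run-time quantity, so fixed-size stack buffers use the
   compile-time upper bound while per-iteration code queries the actual width. Assumes
   <opencv2/core/hal/intrin.hpp> with CV_SIMD or CV_SIMD_SCALABLE; collect_in_range is a
   hypothetical helper used only for illustration.

   static int collect_in_range(const float* r2, int n, float rmin2, float rmax2, float* out)
   {
       using namespace cv;
       const v_float32 v_min2 = vx_setall_f32(rmin2), v_max2 = vx_setall_f32(rmax2);
       float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];  // upper bound, compile time
       int   CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
       int cnt = 0, j = 0;
       for (; j <= n - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes()) // actual width, run time
       {
           v_float32 v_r2 = vx_load(r2 + j);
           v_float32 vmask = v_and(v_le(v_min2, v_r2), v_le(v_r2, v_max2));
           if (v_check_any(vmask))
           {
               v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
               v_store_aligned(rbuf, v_r2);
               for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                   if (rmask[i]) out[cnt++] = rbuf[i];
           }
       }
       for (; j < n; j++)                                   // scalar tail
           if (r2[j] >= rmin2 && r2[j] <= rmax2) out[cnt++] = r2[j];
       return cnt;
   }
*/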