Mirror of https://github.com/opencv/opencv.git
Merge pull request #24132 from hanliutong:rewrite-imgproc2
Rewrite Universal Intrinsic code by using new API: ImgProc module Part 2 #24132

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro in the opencv/modules/imgproc folder: rewrite them using the new Universal Intrinsic API. This is the second part of the modification to the imgproc module (Part 1: #24058). I tested this patch on RVV (QEMU) and AVX devices; `opencv_test_imgproc` passes. The patch is partially auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
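To make the rewrite pattern concrete, here is a minimal sketch (the helper and its loop are illustrative, not taken from the patch): the operator overloads and the compile-time `v_float32::nlanes` constant of the old `CV_SIMD`-only code are replaced by named functions (`v_add`, `v_mul`, `v_muladd`, ...) and `VTraits<v_float32>::vlanes()`, which also compile when `CV_SIMD_SCALABLE` is set and the lane count is only known at run time.

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Hypothetical helper: dst[i] += a[i] * b[i] over n floats.
static void fma_rows(const float* a, const float* b, float* dst, int n)
{
    int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)   // new guard: fixed-size OR scalable SIMD
    for (; j <= n - cv::VTraits<cv::v_float32>::vlanes();
           j += cv::VTraits<cv::v_float32>::vlanes())
    {
        cv::v_float32 va = cv::vx_load(a + j);
        cv::v_float32 vb = cv::vx_load(b + j);
        // Old style was: v_store(dst + j, vx_load(dst + j) + va * vb);
        cv::v_store(dst + j, cv::v_muladd(va, vb, cv::vx_load(dst + j)));
    }
#endif
    for (; j < n; j++)              // scalar tail
        dst[j] += a[j] * b[j];
}
```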
This commit is contained in:
parent 8f2e6640e3
commit f617fbe166
@@ -745,7 +745,22 @@ namespace CV__SIMD_NAMESPACE {
 inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
 return v_add(f1 + f2, vf...); \
 }
+#define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
+inline _Tpvec v_shr(const _Tpvec& a, int n) \
+{ \
+return a >> n; \
+} \
+inline _Tpvec v_shl(const _Tpvec& a, int n) \
+{ \
+return a << n; \
+}

+OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
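The wrapper above turns the shift amount into an ordinary function argument, so the same spelling works whether or not `n` is a compile-time constant. A usage sketch (function name and values are illustrative):

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Assuming a CV_SIMD target where the wrappers above are in effect:
static cv::v_uint16 shift_demo(int n)       // n: runtime shift amount
{
    cv::v_uint16 v    = cv::vx_setall_u16(0x0F0F);
    cv::v_uint16 up   = cv::v_shl(v, n);    // previously spelled: v << n
    cv::v_uint16 down = cv::v_shr(v, n);    // previously spelled: v >> n
    return cv::v_add(up, down);
}
```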
@@ -769,6 +784,12 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
 #if CV_SIMD_64F
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
 #endif
@@ -784,6 +805,12 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
+OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
 #if CV_SIMD_64F
 OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
 #endif
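Both the width-agnostic types (`v_uint16`, ...) and the explicitly sized ones (`v_uint16x8`, `v_uint16x16`, ...) get instantiations because they can be distinct types on wider builds; an illustrative sketch (assuming an AVX2 build, where `CV_SIMD_WIDTH == 32` and `v_uint16` aliases `v_uint16x16`, while the 128-bit `v_uint16x8` stays usable):

```cpp
#include "opencv2/core/hal/intrin.hpp"

static void width_demo()
{
    cv::v_uint16 wide = cv::vx_setall_u16(1);     // default-width vector type
    wide = cv::v_shl(wide, 3);       // uses OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
#if CV_SIMD128
    cv::v_uint16x8 narrow = cv::v_setall_u16(1);  // always 8 lanes
    narrow = cv::v_shl(narrow, 3);   // uses OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
#endif
}
```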
@@ -801,7 +828,9 @@ namespace CV__SIMD_NAMESPACE {
 inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
 { \
 return a ^ b; \
-} \
+}
+
+#define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
 inline _Tpvec v_not(const _Tpvec& a) \
 { \
 return ~a; \
@@ -815,6 +844,18 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
+OPENCV_HAL_WRAP_NOT_OP(v_uint8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint16)
+OPENCV_HAL_WRAP_NOT_OP(v_uint32)
+OPENCV_HAL_WRAP_NOT_OP(v_uint64)
+OPENCV_HAL_WRAP_NOT_OP(v_int8)
+OPENCV_HAL_WRAP_NOT_OP(v_int16)
+OPENCV_HAL_WRAP_NOT_OP(v_int32)
+OPENCV_HAL_WRAP_NOT_OP(v_int64)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
+#endif
 #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
@@ -824,6 +865,18 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
+OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
+OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
+OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
+OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
+OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
+OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
+OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
+#endif
 #endif
 #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
@@ -834,6 +887,18 @@ namespace CV__SIMD_NAMESPACE {
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
 OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
+OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
+OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
+OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
+OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
+OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
+OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
+OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
+#endif
 #endif

 #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
@@ -45,6 +45,7 @@ OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
 #if CV_SIMD_SCALABLE_64F
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
 #endif
@@ -475,6 +475,25 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
 OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
 #endif

+#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
+inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
+{ \
+v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
+return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned)
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_lut(const double* tab, const v_int32& vidx) \
+{ \
+vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); \
+return vloxei32(tab, vidx_, VTraits<v_float64>::vlanes()); \
+}
+#endif
+
+
 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
 inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
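A note on the new vector-index `v_lut`: `vloxei32` is RVV's ordered indexed (gather) load, and its indices are byte offsets, which is why the lane indices are first multiplied by `sizeof(_Tp)`. A scalar model of what the wrapper computes (a sketch, not the real implementation):

```cpp
#include <cstddef>

// Scalar model of v_lut(tab, vidx): gather tab[vidx[i]] per lane, with each
// index scaled to a byte offset as vloxei32 expects.
static void v_lut_model(const float* tab, const int* vidx, float* dst, int vlanes)
{
    for (int i = 0; i < vlanes; i++)
        dst[i] = *(const float*)((const char*)tab + (size_t)vidx[i] * sizeof(float));
}
```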
@@ -690,23 +709,27 @@ inline v_float64 v_not (const v_float64& a) \


 ////////////// Bitwise shifts //////////////
+/* Usage
+1. v_shl<N>(vec);
+2. v_shl(vec, N); // instead of vec << N, when N is non-constant.
+*/

 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsrl(a, uint8_t(n), vl)); \
 }

 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
 return _Tpvec(vsra(a, uint8_t(n), vl)); \
 }
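The default template argument keeps both call forms from the Usage comment alive with a single definition: in `v_shl<N>(vec)` the runtime parameter `n` defaults to the template parameter, and in `v_shl(vec, n)` the template parameter defaults to 0 and the runtime value wins. A usage sketch:

```cpp
#include "opencv2/core/hal/intrin.hpp"

// Both spellings resolve to the same generated wrapper (RVV build assumed):
static cv::v_uint32 shl_demo(const cv::v_uint32& v, int n)
{
    cv::v_uint32 a = cv::v_shl<2>(v);   // compile-time amount: n defaults to s == 2
    cv::v_uint32 b = cv::v_shl(v, n);   // runtime amount: s defaults to 0
    return cv::v_add(a, b);
}
```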
@@ -99,33 +99,33 @@ public:
 const uchar* ksptr2 = sptr + space_ofs[k+2];
 const uchar* ksptr3 = sptr + space_ofs[k+3];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_uint32 rval = vx_load_expand_q(sptr + j);

 v_uint32 val = vx_load_expand_q(ksptr0 + j);
-v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+v_float32 w = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
 v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));

 val = vx_load_expand_q(ksptr1 + j);
-w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_wsum += w;
+w = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_wsum = v_add(v_wsum, w);
 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);

 val = vx_load_expand_q(ksptr2 + j);
-w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_wsum += w;
+w = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_wsum = v_add(v_wsum, w);
 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);

 val = vx_load_expand_q(ksptr3 + j);
-w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-v_wsum += w;
+w = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+v_wsum = v_add(v_wsum, w);
 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);

 v_store_aligned(wsum + j, v_wsum);
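For readers new to this kernel, a scalar restatement of one `k` iteration of the vector block above (the wrapper function is hypothetical; it mirrors the scalar tail of the same loop): the per-pixel weight combines the precomputed spatial weight with a color weight looked up by absolute intensity difference, and both the weight sum and the weighted pixel sum are accumulated.

```cpp
#include <cstdlib>

// Scalar model of one k-iteration of the u8 single-channel path (sketch):
static void accumulate_row(const unsigned char* ksptr0, const unsigned char* sptr,
                           float kweight, const float* color_weight,
                           float* wsum, float* sum, int width)
{
    for (int j = 0; j < width; j++)
    {
        float w = kweight * color_weight[std::abs(ksptr0[j] - sptr[j])]; // v_mul + v_lut
        wsum[j] += w;               // v_add(vx_load_aligned(wsum + j), w)
        sum[j]  += ksptr0[j] * w;   // v_muladd(v_cvt_f32(val), w, vx_load_aligned(sum + j))
    }
}
```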
@@ -172,13 +172,13 @@ public:
 {
 const uchar* ksptr = sptr + space_ofs[k];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight = vx_setall_f32(space_weight[k]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_uint32 val = vx_load_expand_q(ksptr + j);
-v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+v_float32 w = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))));
+v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
 v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
 }
 #endif
@@ -191,10 +191,10 @@ public:
 }
 }
 j = 0;
-#if CV_SIMD
-for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
-v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )),
-v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+for (; j <= size.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes())
+v_pack_u_store(dptr + j, v_pack(v_round(v_div(vx_load_aligned(sum + j), vx_load_aligned(wsum + j))),
+v_round(v_div(vx_load_aligned(sum + j + VTraits<v_float32>::vlanes()), vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())))));
 #endif
 for (; j < size.width; j++)
 {
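The rewritten packing loop divides the two accumulators lane-wise (`v_div`), rounds, and saturates down to u8; a scalar model (sketch, matching the scalar tail that follows it):

```cpp
#include "opencv2/core.hpp"  // saturate_cast, cvRound

// Scalar model of the normalization / pack store above (sketch):
static void normalize_row(const float* sum, const float* wsum,
                          unsigned char* dptr, int width)
{
    for (int j = 0; j < width; j++)
        dptr[j] = cv::saturate_cast<unsigned char>(cvRound(sum[j] / wsum[j]));
}
```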
@@ -221,13 +221,13 @@ public:
 const uchar* ksptr3 = sptr + space_ofs[k+3];
 const uchar* rsptr = sptr;
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes,
-ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes)
+for (; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), rsptr += 3*VTraits<v_uint8>::vlanes(),
+ksptr0 += 3*VTraits<v_uint8>::vlanes(), ksptr1 += 3*VTraits<v_uint8>::vlanes(), ksptr2 += 3*VTraits<v_uint8>::vlanes(), ksptr3 += 3*VTraits<v_uint8>::vlanes())
 {
 v_uint8 kb, kg, kr, rb, rg, rr;
 v_load_deinterleave(rsptr, rb, rg, rr);
@@ -236,163 +236,163 @@ public:
 v_uint16 val0, val1, val2, val3, val4;
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_uint32 vall, valh;
 v_expand(val0, vall, valh);
-v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+v_float32 w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+v_float32 w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes())));

 v_load_deinterleave(ksptr1, kb, kg, kr);
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_expand(val0, vall, valh);
-w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));

 v_load_deinterleave(ksptr2, kb, kg, kr);
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_expand(val0, vall, valh);
-w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));

 v_load_deinterleave(ksptr3, kb, kg, kr);
 v_expand(v_absdiff(kb, rb), val0, val1);
 v_expand(v_absdiff(kg, rg), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 v_expand(v_absdiff(kr, rr), val2, val3);
-val0 += val2; val1 += val3;
+val0 = v_add(val0, val2); val1 = v_add(val1, val3);

 v_expand(val0, vall, valh);
-w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
 v_expand(kb, val0, val2);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
 v_expand(kg, val0, val3);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
 v_expand(kr, val0, val4);
 v_expand(val0, vall, valh);
 v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));

 v_expand(val1, vall, valh);
-w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val2, vall, valh);
-v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val3, vall, valh);
-v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(val4, vall, valh);
-v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));
 }
 #endif
 #if CV_SIMD128
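In the three-channel block above, the color-weight LUT index is the sum of per-channel absolute differences; the u8 absdiffs are widened step by step (`v_expand` to u16, then to u32) so the three-channel sum cannot overflow. A scalar model of that distance (sketch):

```cpp
#include <cstdlib>

// Scalar model of the color distance used as the LUT index above (sketch):
static int color_dist(const unsigned char* k, const unsigned char* r)
{
    return std::abs(k[0] - r[0])    // b
         + std::abs(k[1] - r[1])    // g
         + std::abs(k[2] - r[2]);   // r; range 0..765, too wide for u8
}
// per-pixel weight: space_weight[k] * color_weight[color_dist(ksptr, rsptr)]
```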
@@ -442,9 +442,9 @@ public:
 const uchar* ksptr = sptr + space_ofs[k];
 const uchar* rsptr = sptr;
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight = vx_setall_f32(space_weight[k]);
-for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
+for (; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), ksptr += 3*VTraits<v_uint8>::vlanes(), rsptr += 3*VTraits<v_uint8>::vlanes())
 {
 v_uint8 kb, kg, kr, rb, rg, rr;
 v_load_deinterleave(ksptr, kb, kg, kr);
@@ -456,39 +456,39 @@ public:
 v_expand(v_absdiff(kr, rr), r_l, r_h);

 v_uint32 val0, val1, val2, val3;
-v_expand(b_l + g_l + r_l, val0, val1);
-v_expand(b_h + g_h + r_h, val2, val3);
+v_expand(v_add(v_add(b_l, g_l), r_l), val0, val1);
+v_expand(v_add(v_add(b_h, g_h), r_h), val2, val3);

 v_expand(kb, b_l, b_h);
 v_expand(kg, g_l, g_h);
 v_expand(kr, r_l, r_h);

-v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
-v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
-v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
-v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
-v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j));
-v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
-v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
+v_float32 w0 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val0)));
+v_float32 w1 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val1)));
+v_float32 w2 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val2)));
+v_float32 w3 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val3)));
+v_store_aligned(wsum + j , v_add(w0, vx_load_aligned(wsum + j)));
+v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 2*VTraits<v_float32>::vlanes(), v_add(w2, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+v_store_aligned(wsum + j + 3*VTraits<v_float32>::vlanes(), v_add(w3, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
 v_expand(b_l, val0, val1);
 v_expand(b_h, val2, val3);
 v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
-v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_b + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*VTraits<v_float32>::vlanes())));
 v_expand(g_l, val0, val1);
 v_expand(g_h, val2, val3);
 v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
-v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_g + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*VTraits<v_float32>::vlanes())));
 v_expand(r_l, val0, val1);
 v_expand(r_h, val2, val3);
 v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
-v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes())));
+v_store_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes())));
 }
 #endif
 for(; j < size.width; j++, ksptr += 3, rsptr += 3)
@@ -500,27 +500,27 @@ public:
 }
 }
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 v_one = vx_setall_f32(1.f);
-for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
+for(; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), dptr += 3*VTraits<v_uint8>::vlanes())
 {
-v_float32 w0 = v_one / vx_load_aligned(wsum + j);
-v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
-v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
-v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
+v_float32 w0 = v_div(v_one, vx_load_aligned(wsum + j));
+v_float32 w1 = v_div(v_one, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes()));
+v_float32 w2 = v_div(v_one, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes()));
+v_float32 w3 = v_div(v_one, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes()));

-v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
-v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
-v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
-v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
-v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
-v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
-v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
-v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
-v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
-v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
-v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
-v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
+v_store_interleave(dptr, v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_b + j))),
+v_round(v_mul(w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())))),
+v_pack(v_round(v_mul(w2, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes()))),
+v_round(v_mul(w3, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes()))))),
+v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_g + j))),
+v_round(v_mul(w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())))),
+v_pack(v_round(v_mul(w2, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes()))),
+v_round(v_mul(w3, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes()))))),
+v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_r + j))),
+v_round(v_mul(w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())))),
+v_pack(v_round(v_mul(w2, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes()))),
+v_round(v_mul(w3, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes()))))));
 }
 #endif
 for(; j < size.width; j++)
@@ -533,7 +533,7 @@ public:
 }
 }
 }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 vx_cleanup();
 #endif
 }
@@ -589,7 +589,7 @@ public:
 memset(buf.data(), 0, buf.size() * sizeof(float));
 float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
 float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 v_one = vx_setall_f32(1.f);
 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
@@ -601,50 +601,50 @@ public:
 const float* ksptr2 = sptr + space_ofs[k + 2];
 const float* ksptr3 = sptr + space_ofs[k + 3];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_float32 rval = vx_load(sptr + j);

 v_float32 val = vx_load(ksptr0 + j);
 v_float32 knan = v_not_nan(val);
-v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 v_int32 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
+alpha = v_sub(alpha, v_cvt_f32(idx));
+v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
+v_float32 v_sum = v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j));

 val = vx_load(ksptr1 + j);
 knan = v_not_nan(val);
-alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum = v_muladd(val & knan, w, v_sum);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum = v_muladd(v_and(val, knan), w, v_sum);

 val = vx_load(ksptr2 + j);
 knan = v_not_nan(val);
-alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum = v_muladd(val & knan, w, v_sum);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum = v_muladd(v_and(val, knan), w, v_sum);

 val = vx_load(ksptr3 + j);
 knan = v_not_nan(val);
-alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum = v_muladd(val & knan, w, v_sum);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum = v_muladd(v_and(val, knan), w, v_sum);

 v_store_aligned(wsum + j, v_wsum);
 v_store_aligned(sum + j, v_sum);
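The float path differs from the u8 path in two ways, both visible above: the range weight is linearly interpolated from `expLUT`, and NaN pixels are masked out (`v_not_nan` yields an all-ones mask for non-NaN lanes, so the `v_and` calls zero their contribution). A scalar model of the weight for one lane (sketch):

```cpp
#include <cmath>

// Scalar model of the float-path weight above (sketch):
static float range_weight(float val, float rval, float kweight,
                          const float* expLUT, float scale_index)
{
    if (std::isnan(val))
        return 0.f;                                    // the "& knan" mask
    float alpha = std::isnan(rval) ? 0.f               // alpha is masked too
                                   : std::abs(val - rval) * scale_index;
    int idx = (int)alpha;                              // v_trunc
    alpha -= (float)idx;
    return kweight * (expLUT[idx] * (1.f - alpha) + expLUT[idx + 1] * alpha);
}
```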
@@ -720,20 +720,20 @@ public:
 {
 const float* ksptr = sptr + space_ofs[k];
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight = vx_setall_f32(space_weight[k]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_float32 val = vx_load(ksptr + j);
 v_float32 rval = vx_load(sptr + j);
 v_float32 knan = v_not_nan(val);
-v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
 v_int32 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
+alpha = v_sub(alpha, v_cvt_f32(idx));

-v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
+v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
+v_store_aligned(sum + j, v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j)));
 }
 #endif
 for (; j < size.width; j++)
@@ -752,11 +752,11 @@ public:
 }
 }
 j = 0;
-#if CV_SIMD
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
 {
 v_float32 v_val = vx_load(sptr + j);
-v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
+v_store(dptr + j, v_div(v_add(vx_load_aligned(sum + j), v_and(v_val, v_not_nan(v_val))), v_add(vx_load_aligned(wsum + j), v_and(v_one, v_not_nan(v_val)))));
 }
 #endif
 for (; j < size.width; j++)
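The final division also has a NaN guard: the center pixel joins the average with weight 1 unless it is NaN, in which case only the accumulated neighborhood is used. A scalar model of the store above (sketch):

```cpp
#include <cmath>

// Scalar model of the final division above (sketch):
static float normalize_px(float val, float sum_j, float wsum_j)
{
    return std::isnan(val) ? sum_j / wsum_j
                           : (sum_j + val) / (wsum_j + 1.f);
}
```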
@@ -774,7 +774,7 @@ public:
 float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
 float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
 float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 v_one = vx_setall_f32(1.f);
 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
@@ -787,60 +787,60 @@ public:
 const float* ksptr3 = sptr + space_ofs[k+3];
 const float* rsptr = sptr;
 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 v_float32 kweight0 = vx_setall_f32(space_weight[k]);
 v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
 v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
 v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
-ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
+for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), rsptr += 3 * VTraits<v_float32>::vlanes(),
+ksptr0 += 3 * VTraits<v_float32>::vlanes(), ksptr1 += 3 * VTraits<v_float32>::vlanes(), ksptr2 += 3 * VTraits<v_float32>::vlanes(), ksptr3 += 3 * VTraits<v_float32>::vlanes())
 {
 v_float32 kb, kg, kr, rb, rg, rr;
 v_load_deinterleave(rsptr, rb, rg, rr);

 v_load_deinterleave(ksptr0, kb, kg, kr);
-v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 v_int32 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
-v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
-v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
+alpha = v_sub(alpha, v_cvt_f32(idx));
+v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
+v_float32 v_sum_b = v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j));
+v_float32 v_sum_g = v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j));
+v_float32 v_sum_r = v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j));

 v_load_deinterleave(ksptr1, kb, kg, kr);
-knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);

 v_load_deinterleave(ksptr2, kb, kg, kr);
-knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);

 v_load_deinterleave(ksptr3, kb, kg, kr);
-knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
 idx = v_trunc(alpha);
-alpha -= v_cvt_f32(idx);
-w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-v_wsum += w;
-v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+alpha = v_sub(alpha, v_cvt_f32(idx));
+w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+v_wsum = v_add(v_wsum, w);
+v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);

 v_store_aligned(wsum + j, v_wsum);
 v_store_aligned(sum_b + j, v_sum_b);
@ -938,24 +938,24 @@ public:
|
||||
const float* ksptr = sptr + space_ofs[k];
|
||||
const float* rsptr = sptr;
|
||||
j = 0;
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
v_float32 kweight = vx_setall_f32(space_weight[k]);
|
||||
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
|
||||
for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), ksptr += 3*VTraits<v_float32>::vlanes(), rsptr += 3*VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_float32 kb, kg, kr, rb, rg, rr;
|
||||
v_load_deinterleave(ksptr, kb, kg, kr);
|
||||
v_load_deinterleave(rsptr, rb, rg, rr);
|
||||
|
||||
v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
|
||||
v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
|
||||
v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
|
||||
v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
|
||||
v_int32 idx = v_trunc(alpha);
|
||||
alpha -= v_cvt_f32(idx);
|
||||
alpha = v_sub(alpha, v_cvt_f32(idx));
|
||||
|
||||
v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
|
||||
v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
|
||||
v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
|
||||
v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
|
||||
v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
|
||||
v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
|
||||
v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
|
||||
v_store_aligned(sum_b + j, v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j)));
|
||||
v_store_aligned(sum_g + j, v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j)));
|
||||
v_store_aligned(sum_r + j, v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j)));
|
||||
}
|
||||
#endif
|
||||
for (; j < size.width; j++, ksptr += 3, rsptr += 3)
|
||||
@ -978,14 +978,14 @@ public:
|
||||
}
|
||||
}
|
||||
j = 0;
|
||||
#if CV_SIMD
|
||||
for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), sptr += 3*VTraits<v_float32>::vlanes(), dptr += 3*VTraits<v_float32>::vlanes())
|
||||
{
|
||||
v_float32 b, g, r;
|
||||
v_load_deinterleave(sptr, b, g, r);
|
||||
v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
|
||||
v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
|
||||
v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
|
||||
v_float32 mask = v_and(v_and(v_not_nan(b), v_not_nan(g)), v_not_nan(r));
|
||||
v_float32 w = v_div(v_one, v_add(vx_load_aligned(wsum + j), v_and(v_one, mask)));
|
||||
v_store_interleave(dptr, v_mul(v_add(vx_load_aligned(sum_b + j), v_and(b, mask)), w), v_mul(v_add(vx_load_aligned(sum_g + j), v_and(g, mask)), w), v_mul(v_add(vx_load_aligned(sum_r + j), v_and(r, mask)), w));
|
||||
}
|
||||
#endif
|
||||
for (; j < size.width; j++)
|
||||
@ -1011,7 +1011,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
vx_cleanup();
|
||||
#endif
|
||||
}
|
||||
|
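The hunks above are the template for the whole patch: arithmetic and bitwise operator overloads (+, -, *, /, &) become named wrappers (v_add, v_mul, v_div, v_and, ...), and the compile-time lane count v_float32::nlanes becomes the run-time query VTraits<v_float32>::vlanes(), because scalable (RVV) vector types are sizeless and support neither overloaded operators nor a constexpr lane count. A minimal, hedged sketch of the same rewrite on a toy kernel (the helper name scaleAdd32f and its buffers are illustrative, not part of the patch):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    // dst[i] += scale * src[i], written against the new Universal Intrinsic API.
    static void scaleAdd32f(const float* src, float* dst, int n, float scale)
    {
        int i = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vlanes = VTraits<v_float32>::vlanes(); // run-time lane count
        v_float32 vscale = vx_setall_f32(scale);
        for (; i <= n - vlanes; i += vlanes)             // was: v_float32::nlanes
            v_store(dst + i, v_add(v_mul(vx_load(src + i), vscale), vx_load(dst + i)));
    #endif
        for (; i < n; i++)                               // scalar tail
            dst[i] += scale * src[i];
    }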
@ -56,40 +56,38 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab
return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)

template<typename _Tp> static inline cv::v_float32 splineInterpolate(const cv::v_float32& x, const _Tp* tab, int n)
{
using namespace cv;
v_int32 ix = v_min(v_max(v_trunc(x), vx_setzero_s32()), vx_setall_s32(n-1));
cv::v_float32 xx = x - v_cvt_f32(ix);
ix = ix << 2;
cv::v_float32 xx = v_sub(x, v_cvt_f32(ix));
ix = v_shl<2>(ix);

v_float32 t[4];
v_float32 t0, t1, t2, t3;
// assume that v_float32::nlanes == v_int32::nlanes
if(v_float32::nlanes == 4)
if(VTraits<v_float32>::vlanes() == 4)
{
#if CV_SIMD_WIDTH == 16
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4];
v_store_aligned(idx, ix);
v_float32x4 tt[4];
tt[0] = v_load(tab + idx[0]);
tt[1] = v_load(tab + idx[1]);
tt[2] = v_load(tab + idx[2]);
tt[3] = v_load(tab + idx[3]);
v_transpose4x4(tt[0], tt[1], tt[2], tt[3],
t[0], t[1], t[2], t[3]);
#endif
v_float32 tt0, tt1, tt2, tt3;
tt0 = vx_load(tab + idx[0]);
tt1 = vx_load(tab + idx[1]);
tt2 = vx_load(tab + idx[2]);
tt3 = vx_load(tab + idx[3]);
v_transpose4x4(tt0, tt1, tt2, tt3,
t0, t1, t2, t3);
}
else
{
t[0] = v_lut(tab + 0, ix);
t[1] = v_lut(tab + 1, ix);
t[2] = v_lut(tab + 2, ix);
t[3] = v_lut(tab + 3, ix);
t0 = v_lut(tab + 0, ix);
t1 = v_lut(tab + 1, ix);
t2 = v_lut(tab + 2, ix);
t3 = v_lut(tab + 3, ix);
}

return v_fma(v_fma(v_fma(t[3], xx, t[2]), xx, t[1]), xx, t[0]);
return v_fma(v_fma(v_fma(t3, xx, t2), xx, t1), xx, t0);
}

#endif
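The converted helper gathers the four polynomial coefficients with v_lut on the general path and keeps the transpose-based load only when vlanes() is exactly 4. A hedged usage sketch (xbuf/ybuf/evalSpline are assumptions; the table layout of 4 floats per segment mirrors the scalar overload above):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void evalSpline(const float* xbuf, float* ybuf, const float* tab, int n)
    {
        v_float32 xs = vx_load(xbuf);                 // one query point per lane
        v_float32 ys = splineInterpolate(xs, tab, n); // ((t3*x+t2)*x+t1)*x+t0 per lane
        v_store(ybuf, ys);
    }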
@ -207,8 +205,8 @@ struct RGB2XYZ_f<float>
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
int i = 0;
#if CV_SIMD
const int vsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
@ -226,9 +224,9 @@ struct RGB2XYZ_f<float>
}

v_float32 x, y, z;
x = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
y = v_fma(b, vc3, v_fma(g, vc4, r*vc5));
z = v_fma(b, vc6, v_fma(g, vc7, r*vc8));
x = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2)));
y = v_fma(b, vc3, v_fma(g, vc4, v_mul(r, vc5)));
z = v_fma(b, vc6, v_fma(g, vc7, v_mul(r, vc8)));

v_store_interleave(dst, x, y, z);
}
@ -313,8 +311,8 @@ struct RGB2XYZ_i<uchar>
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
int descaleShift = 1 << (shift-1);
v_int16 vdescale = vx_setall_s16((short)descaleShift);
v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1;
@ -349,27 +347,36 @@ struct RGB2XYZ_i<uchar>
sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1);
sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1);

v_int16 bg[4], rd[4];
v_zip(sb0, sg0, bg[0], bg[1]);
v_zip(sb1, sg1, bg[2], bg[3]);
v_zip(sr0, vdescale, rd[0], rd[1]);
v_zip(sr1, vdescale, rd[2], rd[3]);
v_int16 bg0, bg1, bg2, bg3, rd0, rd1, rd2, rd3;
v_zip(sb0, sg0, bg0, bg1);
v_zip(sb1, sg1, bg2, bg3);
v_zip(sr0, vdescale, rd0, rd1);
v_zip(sr1, vdescale, rd2, rd3);

v_uint32 vx[4], vy[4], vz[4];
for(int j = 0; j < 4; j++)
{
vx[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cxbg) + v_dotprod(rd[j], cxr1)) >> shift;
vy[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cybg) + v_dotprod(rd[j], cyr1)) >> shift;
vz[j] = v_reinterpret_as_u32(v_dotprod(bg[j], czbg) + v_dotprod(rd[j], czr1)) >> shift;
}
v_uint32 vx0, vx1, vx2, vx3;
v_uint32 vy0, vy1, vy2, vy3;
v_uint32 vz0, vz1, vz2, vz3;

vx0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1))));
vy0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1))));
vz0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1))));
vx1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1))));
vy1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1))));
vz1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1))));
vx2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cxbg), v_dotprod(rd2, cxr1))));
vy2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cybg), v_dotprod(rd2, cyr1))));
vz2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, czbg), v_dotprod(rd2, czr1))));
vx3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cxbg), v_dotprod(rd3, cxr1))));
vy3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cybg), v_dotprod(rd3, cyr1))));
vz3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, czbg), v_dotprod(rd3, czr1))));

v_uint16 x0, x1, y0, y1, z0, z1;
x0 = v_pack(vx[0], vx[1]);
x1 = v_pack(vx[2], vx[3]);
y0 = v_pack(vy[0], vy[1]);
y1 = v_pack(vy[2], vy[3]);
z0 = v_pack(vz[0], vz[1]);
z1 = v_pack(vz[2], vz[3]);
x0 = v_pack(vx0, vx1);
x1 = v_pack(vx2, vx3);
y0 = v_pack(vy0, vy1);
y1 = v_pack(vy2, vy3);
z0 = v_pack(vz0, vz1);
z1 = v_pack(vz2, vz3);

v_uint8 x, y, z;
x = v_pack(x0, x1);
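Two constraints force the unrolled form above: arrays of scalable vector types (v_uint32 vx[4]) are ill-formed because the types are sizeless, and the shift count of v_shr must be a compile-time constant when spelled as a template argument. A hedged sketch of the idiom in isolation (descale_pair is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_uint16 descale_pair(const v_uint32& s0, const v_uint32& s1)
    {
        v_uint32 acc0 = v_shr<4>(s0); // shift amount is a template argument
        v_uint32 acc1 = v_shr<4>(s1); // named scalars replace "v_uint32 acc[2];"
        return v_pack(acc0, acc1);    // narrow 2x u32 -> u16
    }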
@ -424,8 +431,8 @@ struct RGB2XYZ_i<ushort>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descaleShift = 1 << (shift-1);
v_int16 vdescale = vx_setall_s16(descaleShift);
v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
@ -464,29 +471,29 @@ struct RGB2XYZ_i<ushort>
v_int16 ymr, ymg, ymb;
v_int16 zmr, zmg, zmb;

v_int16 mr = sr < zero, mg = sg < zero, mb = sb < zero;
v_int16 mr = v_lt(sr, zero), mg = v_lt(sg, zero), mb = v_lt(sb, zero);

xmb = mb & vc0;
xmg = mg & vc1;
xmr = mr & vc2;
ymb = mb & vc3;
ymg = mg & vc4;
ymr = mr & vc5;
zmb = mb & vc6;
zmg = mg & vc7;
zmr = mr & vc8;
xmb = v_and(mb, vc0);
xmg = v_and(mg, vc1);
xmr = v_and(mr, vc2);
ymb = v_and(mb, vc3);
ymg = v_and(mg, vc4);
ymr = v_and(mr, vc5);
zmb = v_and(mb, vc6);
zmg = v_and(mg, vc7);
zmr = v_and(mr, vc8);

v_int32 xfix0, xfix1, yfix0, yfix1, zfix0, zfix1;
v_expand(xmr + xmg + xmb, xfix0, xfix1);
v_expand(ymr + ymg + ymb, yfix0, yfix1);
v_expand(zmr + zmg + zmb, zfix0, zfix1);
v_expand(v_add(v_add(xmr, xmg), xmb), xfix0, xfix1);
v_expand(v_add(v_add(ymr, ymg), ymb), yfix0, yfix1);
v_expand(v_add(v_add(zmr, zmg), zmb), zfix0, zfix1);

xfix0 = xfix0 << 16;
xfix1 = xfix1 << 16;
yfix0 = yfix0 << 16;
yfix1 = yfix1 << 16;
zfix0 = zfix0 << 16;
zfix1 = zfix1 << 16;
xfix0 = v_shl<16>(xfix0);
xfix1 = v_shl<16>(xfix1);
yfix0 = v_shl<16>(yfix0);
yfix1 = v_shl<16>(yfix1);
zfix0 = v_shl<16>(zfix0);
zfix1 = v_shl<16>(zfix1);

v_int16 bg0, bg1, rd0, rd1;
v_zip(sb, sg, bg0, bg1);
@ -494,12 +501,12 @@ struct RGB2XYZ_i<ushort>

v_uint32 x0, x1, y0, y1, z0, z1;

x0 = v_reinterpret_as_u32(v_dotprod(bg0, cxbg) + v_dotprod(rd0, cxr1) + xfix0) >> shift;
x1 = v_reinterpret_as_u32(v_dotprod(bg1, cxbg) + v_dotprod(rd1, cxr1) + xfix1) >> shift;
y0 = v_reinterpret_as_u32(v_dotprod(bg0, cybg) + v_dotprod(rd0, cyr1) + yfix0) >> shift;
y1 = v_reinterpret_as_u32(v_dotprod(bg1, cybg) + v_dotprod(rd1, cyr1) + yfix1) >> shift;
z0 = v_reinterpret_as_u32(v_dotprod(bg0, czbg) + v_dotprod(rd0, czr1) + zfix0) >> shift;
z1 = v_reinterpret_as_u32(v_dotprod(bg1, czbg) + v_dotprod(rd1, czr1) + zfix1) >> shift;
x0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)), xfix0)));
x1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)), xfix1)));
y0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)), yfix0)));
y1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)), yfix1)));
z0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)), zfix0)));
z1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)), zfix1)));

v_uint16 x, y, z;
x = v_pack(x0, x1);
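Comparisons lose their operator overloads as well: sr < zero becomes v_lt(sr, zero), which yields an all-ones mask in lanes where the predicate holds. A hedged sketch of the sign-fix idiom above (negative_correction is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_int16 negative_correction(const v_int16& vals, const v_int16& coeff)
    {
        v_int16 zero = vx_setall_s16(0);
        v_int16 mask = v_lt(vals, zero); // all-ones where vals < 0
        return v_and(mask, coeff);       // keep the coefficient only in those lanes
    }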
@ -593,8 +600,8 @@ struct XYZ2RGB_f<float>
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
int i = 0;
#if CV_SIMD
const int vsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_float32>::vlanes();
v_float32 valpha = vx_setall_f32(alpha);
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
@ -606,9 +613,9 @@ struct XYZ2RGB_f<float>
v_load_deinterleave(src, x, y, z);

v_float32 b, g, r;
b = v_fma(x, vc0, v_fma(y, vc1, z*vc2));
g = v_fma(x, vc3, v_fma(y, vc4, z*vc5));
r = v_fma(x, vc6, v_fma(y, vc7, z*vc8));
b = v_fma(x, vc0, v_fma(y, vc1, v_mul(z, vc2)));
g = v_fma(x, vc3, v_fma(y, vc4, v_mul(z, vc5)));
r = v_fma(x, vc6, v_fma(y, vc7, v_mul(z, vc8)));

if(dcn == 4)
{
@ -707,8 +714,8 @@ struct XYZ2RGB_i<uchar>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
const int descaleShift = 1 << (shift - 1);
v_uint8 valpha = vx_setall_u8(alpha);
v_int16 vdescale = vx_setall_s16(descaleShift);
@ -739,25 +746,35 @@ struct XYZ2RGB_i<uchar>
z0 = v_reinterpret_as_s16(uz0);
z1 = v_reinterpret_as_s16(uz1);

v_int32 b[4], g[4], r[4];
v_int32 bb0, bb1, bb2, bb3,
gg0, gg1, gg2, gg3,
rr0, rr1, rr2, rr3;

v_int16 xy[4], zd[4];
v_zip(x0, y0, xy[0], xy[1]);
v_zip(x1, y1, xy[2], xy[3]);
v_zip(z0, vdescale, zd[0], zd[1]);
v_zip(z1, vdescale, zd[2], zd[3]);
v_int16 xy0, xy1, xy2, xy3;
v_int16 zd0, zd1, zd2, zd3;

for(int j = 0; j < 4; j++)
{
b[j] = (v_dotprod(xy[j], cbxy) + v_dotprod(zd[j], cbz1)) >> shift;
g[j] = (v_dotprod(xy[j], cgxy) + v_dotprod(zd[j], cgz1)) >> shift;
r[j] = (v_dotprod(xy[j], crxy) + v_dotprod(zd[j], crz1)) >> shift;
}
v_zip(x0, y0, xy0, xy1);
v_zip(x1, y1, xy2, xy3);
v_zip(z0, vdescale, zd0, zd1);
v_zip(z1, vdescale, zd2, zd3);

bb0 = v_shr<shift>(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)));
gg0 = v_shr<shift>(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)));
rr0 = v_shr<shift>(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)));
bb1 = v_shr<shift>(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)));
gg1 = v_shr<shift>(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)));
rr1 = v_shr<shift>(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)));
bb2 = v_shr<shift>(v_add(v_dotprod(xy2, cbxy), v_dotprod(zd2, cbz1)));
gg2 = v_shr<shift>(v_add(v_dotprod(xy2, cgxy), v_dotprod(zd2, cgz1)));
rr2 = v_shr<shift>(v_add(v_dotprod(xy2, crxy), v_dotprod(zd2, crz1)));
bb3 = v_shr<shift>(v_add(v_dotprod(xy3, cbxy), v_dotprod(zd3, cbz1)));
gg3 = v_shr<shift>(v_add(v_dotprod(xy3, cgxy), v_dotprod(zd3, cgz1)));
rr3 = v_shr<shift>(v_add(v_dotprod(xy3, crxy), v_dotprod(zd3, crz1)));

v_uint16 b0, b1, g0, g1, r0, r1;
b0 = v_pack_u(b[0], b[1]); b1 = v_pack_u(b[2], b[3]);
g0 = v_pack_u(g[0], g[1]); g1 = v_pack_u(g[2], g[3]);
r0 = v_pack_u(r[0], r[1]); r1 = v_pack_u(r[2], r[3]);
b0 = v_pack_u(bb0, bb1); b1 = v_pack_u(bb2, bb3);
g0 = v_pack_u(gg0, gg1); g1 = v_pack_u(gg2, gg3);
r0 = v_pack_u(rr0, rr1); r1 = v_pack_u(rr2, rr3);

v_uint8 bb, gg, rr;
bb = v_pack(b0, b1);
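The recurring dot-product-and-descale step reads more verbosely under the new API: v_dotprod multiplies adjacent s16 pairs into s32 lanes, then a rounding constant is added and the result is shifted back down. A hedged standalone sketch (dot_descale and the precision value are assumptions, not values from the patch):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_int32 dot_descale(const v_int16& ab, const v_int16& coeffs)
    {
        const int shift = 12;                    // assumed fixed-point precision
        v_int32 prod = v_dotprod(ab, coeffs);    // a*c0 + b*c1 per adjacent pair
        v_int32 round = vx_setall_s32(1 << (shift - 1));
        return v_shr<shift>(v_add(prod, round)); // rounded descale
    }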
@ -820,8 +837,8 @@ struct XYZ2RGB_i<ushort>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descaleShift = 1 << (shift-1);
v_uint16 valpha = vx_setall_u16(alpha);
v_int16 vdescale = vx_setall_s16(descaleShift);
@ -850,30 +867,30 @@ struct XYZ2RGB_i<ushort>
sz = v_reinterpret_as_s16(z);

// fixing 16bit signed multiplication
v_int16 mx = sx < zero, my = sy < zero, mz = sz < zero;
v_int16 mx = v_lt(sx, zero), my = v_lt(sy, zero), mz = v_lt(sz, zero);

v_int16 bmx, bmy, bmz;
v_int16 gmx, gmy, gmz;
v_int16 rmx, rmy, rmz;

bmx = mx & vc0;
bmy = my & vc1;
bmz = mz & vc2;
gmx = mx & vc3;
gmy = my & vc4;
gmz = mz & vc5;
rmx = mx & vc6;
rmy = my & vc7;
rmz = mz & vc8;
bmx = v_and(mx, vc0);
bmy = v_and(my, vc1);
bmz = v_and(mz, vc2);
gmx = v_and(mx, vc3);
gmy = v_and(my, vc4);
gmz = v_and(mz, vc5);
rmx = v_and(mx, vc6);
rmy = v_and(my, vc7);
rmz = v_and(mz, vc8);

v_int32 bfix0, bfix1, gfix0, gfix1, rfix0, rfix1;
v_expand(bmx + bmy + bmz, bfix0, bfix1);
v_expand(gmx + gmy + gmz, gfix0, gfix1);
v_expand(rmx + rmy + rmz, rfix0, rfix1);
v_expand(v_add(v_add(bmx, bmy), bmz), bfix0, bfix1);
v_expand(v_add(v_add(gmx, gmy), gmz), gfix0, gfix1);
v_expand(v_add(v_add(rmx, rmy), rmz), rfix0, rfix1);

bfix0 = bfix0 << 16; bfix1 = bfix1 << 16;
gfix0 = gfix0 << 16; gfix1 = gfix1 << 16;
rfix0 = rfix0 << 16; rfix1 = rfix1 << 16;
bfix0 = v_shl<16>(bfix0); bfix1 = v_shl<16>(bfix1);
gfix0 = v_shl<16>(gfix0); gfix1 = v_shl<16>(gfix1);
rfix0 = v_shl<16>(rfix0); rfix1 = v_shl<16>(rfix1);

v_int16 xy0, xy1, zd0, zd1;
v_zip(sx, sy, xy0, xy1);
@ -881,12 +898,12 @@ struct XYZ2RGB_i<ushort>

v_int32 b0, b1, g0, g1, r0, r1;

b0 = (v_dotprod(xy0, cbxy) + v_dotprod(zd0, cbz1) + bfix0) >> shift;
b1 = (v_dotprod(xy1, cbxy) + v_dotprod(zd1, cbz1) + bfix1) >> shift;
g0 = (v_dotprod(xy0, cgxy) + v_dotprod(zd0, cgz1) + gfix0) >> shift;
g1 = (v_dotprod(xy1, cgxy) + v_dotprod(zd1, cgz1) + gfix1) >> shift;
r0 = (v_dotprod(xy0, crxy) + v_dotprod(zd0, crz1) + rfix0) >> shift;
r1 = (v_dotprod(xy1, crxy) + v_dotprod(zd1, crz1) + rfix1) >> shift;
b0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)), bfix0));
b1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)), bfix1));
g0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)), gfix0));
g1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)), gfix1));
r0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)), rfix0));
r1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)), rfix1));

v_uint16 b, g, r;
b = v_pack_u(b0, b1); g = v_pack_u(g0, g1); r = v_pack_u(r0, r1);
@ -1452,19 +1469,19 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
#undef DOT_SHIFT_PACK
}

#elif CV_SIMD
#elif CV_SIMD // Fixed size v_int16x8 used below, CV_SIMD_SCALABLE is disabled.

// inValues are in [0; LAB_BASE]
static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint16& inY, const v_uint16& inZ,
const int16_t* LUT,
v_uint16& outA, v_uint16& outB, v_uint16& outC)
{
const int vsize = v_uint16::nlanes;
const int vsize = VTraits<v_uint16>::max_nlanes;

// LUT idx of origin pt of cube
v_uint16 tx = inX >> (lab_base_shift - lab_lut_shift);
v_uint16 ty = inY >> (lab_base_shift - lab_lut_shift);
v_uint16 tz = inZ >> (lab_base_shift - lab_lut_shift);
v_uint16 tx = v_shr<lab_base_shift - lab_lut_shift>(inX);
v_uint16 ty = v_shr<lab_base_shift - lab_lut_shift>(inY);
v_uint16 tz = v_shr<lab_base_shift - lab_lut_shift>(inZ);

v_uint32 btmp00, btmp01, btmp10, btmp11, btmp20, btmp21;
v_uint32 baseIdx0, baseIdx1;
@ -1472,8 +1489,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
v_mul_expand(tx, vx_setall_u16(3*8), btmp00, btmp01);
v_mul_expand(ty, vx_setall_u16(3*8*LAB_LUT_DIM), btmp10, btmp11);
v_mul_expand(tz, vx_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), btmp20, btmp21);
baseIdx0 = btmp00 + btmp10 + btmp20;
baseIdx1 = btmp01 + btmp11 + btmp21;
baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20);
baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21);

uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize];
v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0);
@ -1482,9 +1499,9 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
// fracX, fracY, fracZ are [0; TRILINEAR_BASE)
const uint16_t bitMask = (1 << trilinear_shift) - 1;
v_uint16 bitMaskReg = vx_setall_u16(bitMask);
v_uint16 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
v_uint16 fracX = v_and(v_shr<lab_base_shift - 8 - 1>(inX), bitMaskReg);
v_uint16 fracY = v_and(v_shr<lab_base_shift - 8 - 1>(inY), bitMaskReg);
v_uint16 fracZ = v_and(v_shr<lab_base_shift - 8 - 1>(inZ), bitMaskReg);

// trilinearIdx = 8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z
v_uint32 trilinearIdx0, trilinearIdx1;
@ -1493,8 +1510,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
v_expand(fracY, fracY0, fracY1);
v_expand(fracZ, fracZ0, fracZ1);

trilinearIdx0 = (fracX0 << 3) + (fracY0 << (3+trilinear_shift)) + (fracZ0 << (3+trilinear_shift*2));
trilinearIdx1 = (fracX1 << 3) + (fracY1 << (3+trilinear_shift)) + (fracZ1 << (3+trilinear_shift*2));
trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0));
trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1));

uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize];
v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0);
@ -1528,12 +1545,12 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1

// CV_DESCALE
const v_uint32 descaleShift = vx_setall_u32(1 << (trilinear_shift*3 - 1));
a0 = (a0 + descaleShift) >> (trilinear_shift*3);
a1 = (a1 + descaleShift) >> (trilinear_shift*3);
b0 = (b0 + descaleShift) >> (trilinear_shift*3);
b1 = (b1 + descaleShift) >> (trilinear_shift*3);
c0 = (c0 + descaleShift) >> (trilinear_shift*3);
c1 = (c1 + descaleShift) >> (trilinear_shift*3);
a0 = v_shr<trilinear_shift * 3>(v_add(a0, descaleShift));
a1 = v_shr<trilinear_shift * 3>(v_add(a1, descaleShift));
b0 = v_shr<trilinear_shift * 3>(v_add(b0, descaleShift));
b1 = v_shr<trilinear_shift * 3>(v_add(b1, descaleShift));
c0 = v_shr<trilinear_shift * 3>(v_add(c0, descaleShift));
c1 = v_shr<trilinear_shift * 3>(v_add(c1, descaleShift));

outA = v_pack(a0, a1); outB = v_pack(b0, b1); outC = v_pack(c0, c1);
}
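Note the vsize change in this overload: VTraits<v_uint16>::max_nlanes is a compile-time upper bound on the lane count, which is what sizing a stack scratch array requires, while vlanes() is the count actually stored at run time. Hedged sketch (first_lane is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static uint32_t first_lane(const v_uint32& v)
    {
        uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_uint32>::max_nlanes];
        v_store_aligned(buf, v); // writes vlanes() elements, never more than max_nlanes
        return buf[0];
    }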
@ -49,6 +49,15 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
namespace {
//constants for conversion from/to RGB and YUV, YCrCb according to BT.601

#if CV_SIMD_SCALABLE
template <class T>
static void swap(T&a, T&b) {
T t = a;
a = b;
b = t;
}
#endif
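The file-local swap() exists apparently because scalable vector types do not meet the requirements of std::swap; later call sites therefore use the unqualified name so that this helper is chosen under CV_SIMD_SCALABLE and std::swap applies otherwise. Hedged usage sketch (demo_swap is a hypothetical name):

    #if CV_SIMD_SCALABLE
    static void demo_swap()
    {
        v_uint8 a = vx_setall_u8(1), b = vx_setall_u8(2);
        swap(a, b); // unqualified call resolves to the local template above
    }
    #endif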

//to YCbCr
static const float YCBF = 0.564f; // == 1/2/(1-B2YF)
static const float YCRF = 0.713f; // == 1/2/(1-R2YF)
@ -143,11 +152,11 @@ struct RGB2YCrCb_f<float>
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];

int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4);
v_float32 vdelta = vx_setall_f32(delta);
const int vsize = v_float32::nlanes;
const int vsize = VTraits<v_float32>::vlanes();
for( ; i <= n-vsize;
i += vsize, src += vsize*scn, dst += vsize*3)
{
@ -162,13 +171,13 @@ struct RGB2YCrCb_f<float>
}

v_float32 y, cr, cb;
y = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
y = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2)));

if(bidx)
std::swap(r, b);
swap(r, b);

cr = v_fma(r - y, vc3, vdelta);
cb = v_fma(b - y, vc4, vdelta);
cr = v_fma(v_sub(r, y), vc3, vdelta);
cb = v_fma(v_sub(b, y), vc4, vdelta);

if(yuvOrder)
{
@ -266,8 +275,8 @@ struct RGB2YCrCb_i<ushort>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int sdelta = ColorChannel<ushort>::half()*(1 << shift);
int i = 0;
#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descale = 1 << (shift-1);

v_int16 b2y = vx_setall_s16((short)C0);
@ -312,13 +321,13 @@ struct RGB2YCrCb_i<ushort>

// fixing 16bit signed multiplication
v_int16 mr, mg, mb;
mr = (sr < z) & r2y;
mg = (sg < z) & g2y;
mb = (sb < z) & b2y;
v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift;
mr = v_and(v_lt(sr, z), r2y);
mg = v_and(v_lt(sg, z), g2y);
mb = v_and(v_lt(sb, z), b2y);
v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb)), fix_shift);

v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift;
v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift;
v_int32 ssy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)), shift);
v_int32 ssy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)), shift);

y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul));

@ -340,15 +349,15 @@ struct RGB2YCrCb_i<ushort>
v_int32 sy0 = v_reinterpret_as_s32(uy0);
v_int32 sy1 = v_reinterpret_as_s32(uy1);

sr0 = sr0 - sy0; sr1 = sr1 - sy1;
sb0 = sb0 - sy0; sb1 = sb1 - sy1;
sr0 = v_sub(sr0, sy0); sr1 = v_sub(sr1, sy1);
sb0 = v_sub(sb0, sy0); sb1 = v_sub(sb1, sy1);

v_int32 v_scr0, v_scr1, v_scb0, v_scb1;

v_scr0 = (sr0*vc3 + vdd) >> shift;
v_scr1 = (sr1*vc3 + vdd) >> shift;
v_scb0 = (sb0*vc4 + vdd) >> shift;
v_scb1 = (sb1*vc4 + vdd) >> shift;
v_scr0 = v_shr(v_add(v_mul(sr0, vc3), vdd), shift);
v_scr1 = v_shr(v_add(v_mul(sr1, vc3), vdd), shift);
v_scb0 = v_shr(v_add(v_mul(sb0, vc4), vdd), shift);
v_scb1 = v_shr(v_add(v_mul(sb1, vc4), vdd), shift);

// saturate and pack
cr = v_pack_u(v_scr0, v_scr1);
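Both shift spellings appear in this hunk: v_shl(x, fix_shift) and v_shr(x, shift) take a run-time count, while the template forms used elsewhere in the patch (v_shl<15>, v_shr<16>) take the count as a compile-time argument. Hedged sketch (shift_forms is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void shift_forms(const v_int32& a, int n, v_int32& imm, v_int32& run)
    {
        imm = v_shr<4>(a); // immediate form: count is a template argument
        run = v_shr(a, n); // run-time form: count is an ordinary parameter
    }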
@ -407,8 +416,8 @@ struct RGB2YCrCb_i<uchar>
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int delta = ColorChannel<uchar>::half()*(1 << shift);

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
const int descaleShift = 1 << (shift-1);
v_int16 bg2y;
v_int16 r12y;
@ -458,10 +467,10 @@ struct RGB2YCrCb_i<uchar>
v_zip(sr0, vdescale, rd00, rd01);
v_zip(sr1, vdescale, rd10, rd11);

y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))), shift);
y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))), shift);
y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))), shift);
y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))), shift);
}

v_uint16 y0, y1;
@ -512,15 +521,15 @@ struct RGB2YCrCb_i<uchar>

v_uint8 cr, cb;

cr00 = cr00 >> shift;
cr01 = cr01 >> shift;
cr10 = cr10 >> shift;
cr11 = cr11 >> shift;
cr00 = v_shr(cr00, shift);
cr01 = v_shr(cr01, shift);
cr10 = v_shr(cr10, shift);
cr11 = v_shr(cr11, shift);

cb00 = cb00 >> shift;
cb01 = cb01 >> shift;
cb10 = cb10 >> shift;
cb11 = cb11 >> shift;
cb00 = v_shr(cb00, shift);
cb01 = v_shr(cb01, shift);
cb10 = v_shr(cb10, shift);
cb11 = v_shr(cb11, shift);

v_int16 cr0, cr1, cb0, cb1;
cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11);
@ -623,12 +632,12 @@ struct YCrCb2RGB_f<float>
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];

int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1);
v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3);
v_float32 vdelta = vx_setall_f32(delta);
v_float32 valpha = vx_setall_f32(alpha);
const int vsize = v_float32::nlanes;
const int vsize = VTraits<v_float32>::vlanes();
for( ; i <= n-vsize;
i += vsize, src += vsize*3, dst += vsize*dcn)
{
@ -640,7 +649,7 @@ struct YCrCb2RGB_f<float>

v_float32 b, g, r;

cb -= vdelta; cr -= vdelta;
cb = v_sub(cb, vdelta); cr = v_sub(cr, vdelta);
b = v_fma(cb, vc3, y);
g = v_fma(cr, vc1, v_fma(cb, vc2, y));
r = v_fma(cr, vc0, y);
@ -746,8 +755,8 @@ struct YCrCb2RGB_i<uchar>
const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 valpha = vx_setall_u8(alpha);
v_uint8 vdelta = vx_setall_u8(delta);
const int descaleShift = 1 << (shift - 1);
@ -794,8 +803,8 @@ struct YCrCb2RGB_i<uchar>
v_int32 cb00, cb01, cb10, cb11;
v_expand(v_scb0, cb00, cb01);
v_expand(v_scb1, cb10, cb11);
b00 += cb00 << 15; b01 += cb01 << 15;
b10 += cb10 << 15; b11 += cb11 << 15;
b00 = v_add(b00, v_shl<15>(cb00)); b01 = v_add(b01, v_shl<15>(cb01));
b10 = v_add(b10, v_shl<15>(cb10)); b11 = v_add(b11, v_shl<15>(cb11));
}

v_int32 t00, t01, t10, t11;
@ -803,17 +812,17 @@ struct YCrCb2RGB_i<uchar>
v_mul_expand(v_scb1, vc2, t10, t11);
v_mul_expand(v_scr0, vc1, g00, g01);
v_mul_expand(v_scr1, vc1, g10, g11);
g00 += t00; g01 += t01;
g10 += t10; g11 += t11;
g00 = v_add(g00, t00); g01 = v_add(g01, t01);
g10 = v_add(g10, t10); g11 = v_add(g11, t11);
v_mul_expand(v_scr0, vc0, r00, r01);
v_mul_expand(v_scr1, vc0, r10, r11);

b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift;
b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift;
g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift;
g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift;
r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift;
r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift;
b00 = v_shr(v_add(b00, vdescale), shift); b01 = v_shr(v_add(b01, vdescale), shift);
b10 = v_shr(v_add(b10, vdescale), shift); b11 = v_shr(v_add(b11, vdescale), shift);
g00 = v_shr(v_add(g00, vdescale), shift); g01 = v_shr(v_add(g01, vdescale), shift);
g10 = v_shr(v_add(g10, vdescale), shift); g11 = v_shr(v_add(g11, vdescale), shift);
r00 = v_shr(v_add(r00, vdescale), shift); r01 = v_shr(v_add(r01, vdescale), shift);
r10 = v_shr(v_add(r10, vdescale), shift); r11 = v_shr(v_add(r11, vdescale), shift);

v_int16 b0, b1, g0, g1, r0, r1;
b0 = v_pack(b00, b01); b1 = v_pack(b10, b11);
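v_mul_expand keeps the full 32-bit product of 16-bit lanes in two halves; the rounding constant plus right shift then descales back to pixel range, which is the chain rewritten above. Hedged sketch (mul_descale is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void mul_descale(const v_int16& a, const v_int16& coeff, int shift,
                            v_int32& lo, v_int32& hi)
    {
        v_int32 vdescale = vx_setall_s32(1 << (shift - 1));
        v_mul_expand(a, coeff, lo, hi);         // exact 16x16 -> 32-bit products
        lo = v_shr(v_add(lo, vdescale), shift); // rounded descale, low half
        hi = v_shr(v_add(hi, vdescale), shift); // rounded descale, high half
    }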
@ -897,8 +906,8 @@ struct YCrCb2RGB_i<ushort>
const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];

#if CV_SIMD
const int vsize = v_uint16::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint16>::vlanes();
const int descaleShift = 1 << (shift-1);
v_uint16 valpha = vx_setall_u16(alpha);
v_uint16 vdelta = vx_setall_u16(delta);
@ -939,22 +948,22 @@ struct YCrCb2RGB_i<ushort>
// so we fix the multiplication
v_int32 cb0, cb1;
v_expand(scb, cb0, cb1);
b0 += cb0 << 15;
b1 += cb1 << 15;
b0 = v_add(b0, v_shl<15>(cb0));
b1 = v_add(b1, v_shl<15>(cb1));
}
v_int32 t0, t1;
v_mul_expand(scb, vc2, t0, t1);
v_mul_expand(scr, vc1, g0, g1);
g0 += t0; g1 += t1;
g0 = v_add(g0, t0); g1 = v_add(g1, t1);
v_mul_expand(scr, vc0, r0, r1);

// shifted term doesn't fit into 16 bits, addition is to be done in 32 bits
b0 = ((b0 + vdescale) >> shift) + y0;
b1 = ((b1 + vdescale) >> shift) + y1;
g0 = ((g0 + vdescale) >> shift) + y0;
g1 = ((g1 + vdescale) >> shift) + y1;
r0 = ((r0 + vdescale) >> shift) + y0;
r1 = ((r1 + vdescale) >> shift) + y1;
b0 = v_add(v_shr(v_add(b0, vdescale), shift), y0);
b1 = v_add(v_shr(v_add(b1, vdescale), shift), y1);
g0 = v_add(v_shr(v_add(g0, vdescale), shift), y0);
g1 = v_add(v_shr(v_add(g1, vdescale), shift), y1);
r0 = v_add(v_shr(v_add(r0, vdescale), shift), y0);
r1 = v_add(v_shr(v_add(r1, vdescale), shift), y1);

// saturate and pack
v_uint16 b, g, r;
@ -1038,11 +1047,11 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4],
v_int32 (&guv)[4],
v_int32 (&buv)[4])
v_int32 &ruv0, v_int32 &ruv1, v_int32 &ruv2, v_int32 &ruv3,
v_int32 &guv0, v_int32 &guv1, v_int32 &guv2, v_int32 &guv3,
v_int32 &buv0, v_int32 &buv1, v_int32 &buv2, v_int32 &buv3)
{
v_uint8 v128 = vx_setall_u8(128);
v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
@ -1051,9 +1060,10 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int16 uu0, uu1, vv0, vv1;
v_expand(su, uu0, uu1);
v_expand(sv, vv0, vv1);
v_int32 uu[4], vv[4];
v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]);
v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]);
v_int32 uuu0, uuu1, uuu2, uuu3;
v_int32 vvv0, vvv1, vvv2, vvv3;
v_expand(uu0, uuu0, uuu1); v_expand(uu1, uuu2, uuu3);
v_expand(vv0, vvv0, vvv1); v_expand(vv1, vvv2, vvv3);

v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1));
v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR);
@ -1061,12 +1071,15 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG);
v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB);

for (int k = 0; k < 4; k++)
{
ruv[k] = vshift + vr * vv[k];
guv[k] = vshift + vg * vv[k] + ug * uu[k];
buv[k] = vshift + ub * uu[k];
}
auto process_uv = [&](v_int32& ruv, v_int32& guv, v_int32& buv, const v_int32& vv, const v_int32& uu) {
ruv = v_add(vshift, v_mul(vr, vv));
guv = v_add(v_add(vshift, v_mul(vg, vv)), v_mul(ug, uu));
buv = v_add(vshift, v_mul(ub, uu));
};
process_uv(ruv0, guv0, buv0, vvv0, uuu0);
process_uv(ruv1, guv1, buv1, vvv1, uuu1);
process_uv(ruv2, guv2, buv2, vvv2, uuu2);
process_uv(ruv3, guv3, buv3, vvv3, uuu3);
}
#endif

@ -1081,44 +1094,48 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co
a = uchar(0xff);
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void yRGBuvToRGBA(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
const v_int32 (&buv)[4],
const v_int32 &ruv0, const v_int32 &ruv1, const v_int32 &ruv2, const v_int32 &ruv3,
const v_int32 &guv0, const v_int32 &guv1, const v_int32 &guv2, const v_int32 &guv3,
const v_int32 &buv0, const v_int32 &buv1, const v_int32 &buv2, const v_int32 &buv3,
v_uint8& rr, v_uint8& gg, v_uint8& bb)
{
v_uint8 v16 = vx_setall_u8(16);
v_uint8 posY = vy - v16;
v_uint8 posY = v_sub(vy, v16);
v_uint16 yy0, yy1;
v_expand(posY, yy0, yy1);
v_int32 yy[4];
v_int32 yy00, yy01, yy10, yy11;
v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]);
v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]);
v_int32 yyy0, yyy1, yyy2, yyy3;
v_expand(v_reinterpret_as_s16(yy0), yyy0, yyy1);
v_expand(v_reinterpret_as_s16(yy1), yyy2, yyy3);

v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY);

v_int32 y[4], r[4], g[4], b[4];
for(int k = 0; k < 4; k++)
{
y[k] = yy[k]*vcy;
r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT;
g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT;
b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT;
}
v_int32 y0, y1, y2, y3, r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;

v_int16 r0, r1, g0, g1, b0, b1;
r0 = v_pack(r[0], r[1]);
r1 = v_pack(r[2], r[3]);
g0 = v_pack(g[0], g[1]);
g1 = v_pack(g[2], g[3]);
b0 = v_pack(b[0], b[1]);
b1 = v_pack(b[2], b[3]);
auto process_yrgb = [&](const v_int32& yy, v_int32& y, v_int32& r, v_int32& g, v_int32& b,
const v_int32& ruv, const v_int32& guv, const v_int32& buv) {
y = v_mul(yy, vcy);
r = v_shr(v_add(y, ruv), ITUR_BT_601_SHIFT);
g = v_shr(v_add(y, guv), ITUR_BT_601_SHIFT);
b = v_shr(v_add(y, buv), ITUR_BT_601_SHIFT);
};
process_yrgb(yyy0, y0, r0, g0, b0, ruv0, guv0, buv0);
process_yrgb(yyy1, y1, r1, g1, b1, ruv1, guv1, buv1);
process_yrgb(yyy2, y2, r2, g2, b2, ruv2, guv2, buv2);
process_yrgb(yyy3, y3, r3, g3, b3, ruv3, guv3, buv3);

rr = v_pack_u(r0, r1);
gg = v_pack_u(g0, g1);
bb = v_pack_u(b0, b1);
v_int16 _r0, _r1, _g0, _g1, _b0, _b1;
_r0 = v_pack(r0, r1);
_r1 = v_pack(r2, r3);
_g0 = v_pack(g0, g1);
_g1 = v_pack(g2, g3);
_b0 = v_pack(b0, b1);
_b1 = v_pack(b2, b3);

rr = v_pack_u(_r0, _r1);
gg = v_pack_u(_g0, _g1);
bb = v_pack_u(_b0, _b1);
}
#endif

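Since sizeless types cannot form arrays, the (&ruv)[4] reference-to-array parameters become twelve scalar references, and the former for-loops become lambdas applied once per element. Hedged sketch of the unrolling pattern (unrolled/step are hypothetical names):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void unrolled(const v_int32& a0, const v_int32& a1,
                         v_int32& o0, v_int32& o1)
    {
        auto step = [&](const v_int32& a, v_int32& o) { o = v_shl<1>(a); };
        step(a0, o0); // manual unroll replaces "for (int k = 0; k < 2; k++)"
        step(a1, o1);
    }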
@ -1201,8 +1218,8 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
const uchar* y2 = y1 + my1_step;

int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 a = vx_setall_u8(uchar(0xff));
for( ; i <= width - 2*vsize;
i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
@ -1215,36 +1232,50 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
swap(u, v);
}

v_uint8 vy[4];
v_load_deinterleave(y1 + i, vy[0], vy[1]);
v_load_deinterleave(y2 + i, vy[2], vy[3]);
v_uint8 vy0, vy1, vy2, vy3;
v_load_deinterleave(y1 + i, vy0, vy1);
v_load_deinterleave(y2 + i, vy2, vy3);

v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_int32 ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3;
uvToRGBuv(u, v,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3);

v_uint8 r[4], g[4], b[4];
v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;

for(int k = 0; k < 4; k++)
{
yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
}
auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) {
yRGBuvToRGBA(vy,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r, g, b);
};
call_yRGBuvToRGBA(vy0, r0, g0, b0);
call_yRGBuvToRGBA(vy1, r1, g1, b1);
call_yRGBuvToRGBA(vy2, r2, g2, b2);
call_yRGBuvToRGBA(vy3, r3, g3, b3);

if(bIdx)
{
for(int k = 0; k < 4; k++)
swap(r[k], b[k]);
swap(r0, b0);
swap(r1, b1);
swap(r2, b2);
swap(r3, b3);
}

// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1, r1_0, r1_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r[2], r[3], r1_0, r1_1);
v_zip(r0, r1, r0_0, r0_1);
v_zip(r2, r3, r1_0, r1_1);
v_uint8 g0_0, g0_1, g1_0, g1_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g[2], g[3], g1_0, g1_1);
v_zip(g0, g1, g0_0, g0_1);
v_zip(g2, g3, g1_0, g1_1);
v_uint8 b0_0, b0_1, b1_0, b1_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b[2], b[3], b1_0, b1_1);
v_zip(b0, b1, b0_0, b0_1);
v_zip(b2, b3, b1_0, b1_1);

if(dcn == 4)
{
@ -1319,8 +1350,8 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
const uchar* y2 = y1 + stride;
int i = 0;

#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 a = vx_setall_u8(uchar(0xff));
for( ; i <= width/2 - vsize;
i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
@ -1329,36 +1360,50 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
u = vx_load(u1 + i);
v = vx_load(v1 + i);

v_uint8 vy[4];
v_load_deinterleave(y1 + 2*i, vy[0], vy[1]);
v_load_deinterleave(y2 + 2*i, vy[2], vy[3]);
v_uint8 vy0, vy1, vy2, vy3;
v_load_deinterleave(y1 + 2*i, vy0, vy1);
v_load_deinterleave(y2 + 2*i, vy2, vy3);

v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_int32 ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3;
uvToRGBuv(u, v,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3);

v_uint8 r[4], g[4], b[4];
v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;

for(int k = 0; k < 4; k++)
{
yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
}
auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) {
yRGBuvToRGBA(vy,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r, g, b);
};
call_yRGBuvToRGBA(vy0, r0, g0, b0);
call_yRGBuvToRGBA(vy1, r1, g1, b1);
call_yRGBuvToRGBA(vy2, r2, g2, b2);
call_yRGBuvToRGBA(vy3, r3, g3, b3);

if(bIdx)
{
for(int k = 0; k < 4; k++)
swap(r[k], b[k]);
swap(r0, b0);
swap(r1, b1);
swap(r2, b2);
swap(r3, b3);
}

// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1, r1_0, r1_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r[2], r[3], r1_0, r1_1);
v_zip(r0, r1, r0_0, r0_1);
v_zip(r2, r3, r1_0, r1_1);
v_uint8 g0_0, g0_1, g1_0, g1_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g[2], g[3], g1_0, g1_1);
v_zip(g0, g1, g0_0, g0_1);
v_zip(g2, g3, g1_0, g1_1);
v_uint8 b0_0, b0_1, b1_0, b1_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b[2], b[3], b1_0, b1_1);
v_zip(b0, b1, b0_0, b0_1);
v_zip(b2, b3, b1_0, b1_1);

if(dcn == 4)
{
@ -1430,7 +1475,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
{
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
@ -1440,25 +1485,25 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint
v_expand(g, g0, g1);
v_expand(b, b0, b1);

v_uint32 rq[4], gq[4], bq[4];
v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]);
v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]);
v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]);
v_uint32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3;
v_expand(r0, rq0, rq1); v_expand(r1, rq2, rq3);
v_expand(g0, gq0, gq1); v_expand(g1, gq2, gq3);
v_expand(b0, bq0, bq1); v_expand(b1, bq2, bq3);

v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY);
v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16);

v_uint32 y[4];
for(int k = 0; k < 4; k++)
{
y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT;
}
v_uint32 y0, y1, y2, y3;
y0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq0, ry), v_mul(gq0, gy)), v_mul(bq0, by)), shift));
y1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq1, ry), v_mul(gq1, gy)), v_mul(bq1, by)), shift));
y2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq2, ry), v_mul(gq2, gy)), v_mul(bq2, by)), shift));
y3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq3, ry), v_mul(gq3, gy)), v_mul(bq3, by)), shift));

v_uint16 y0, y1;
y0 = v_pack(y[0], y[1]);
y1 = v_pack(y[2], y[3]);
v_uint16 _y0, _y1;
_y0 = v_pack(y0, y1);
_y1 = v_pack(y2, y3);

return v_pack(y0, y1);
return v_pack(_y0, _y1);
}
#endif

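The tail of rgbToY42x shows the narrowing chain that many of these kernels end with: four u32 accumulators pack to two u16 vectors, which pack to one u8 vector. Hedged sketch (pack_u32x4_to_u8 is a hypothetical name):

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static v_uint8 pack_u32x4_to_u8(const v_uint32& a, const v_uint32& b,
                                    const v_uint32& c, const v_uint32& d)
    {
        v_uint16 lo = v_pack(a, b); // u32 -> u16, saturating
        v_uint16 hi = v_pack(c, d);
        return v_pack(lo, hi);      // u16 -> u8
    }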
@ -1473,27 +1518,27 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
}

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
{
// [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..]
v_int16 vlowByte = vx_setall_s16(0x00ff);
v_int16 rd0, rd1, gd0, gd1, bd0, bd1;
rd0 = v_reinterpret_as_s16(r0) & vlowByte;
rd1 = v_reinterpret_as_s16(r1) & vlowByte;
gd0 = v_reinterpret_as_s16(g0) & vlowByte;
gd1 = v_reinterpret_as_s16(g1) & vlowByte;
bd0 = v_reinterpret_as_s16(b0) & vlowByte;
bd1 = v_reinterpret_as_s16(b1) & vlowByte;
rd0 = v_and(v_reinterpret_as_s16(r0), vlowByte);
rd1 = v_and(v_reinterpret_as_s16(r1), vlowByte);
gd0 = v_and(v_reinterpret_as_s16(g0), vlowByte);
gd1 = v_and(v_reinterpret_as_s16(g1), vlowByte);
bd0 = v_and(v_reinterpret_as_s16(b0), vlowByte);
bd1 = v_and(v_reinterpret_as_s16(b1), vlowByte);

v_int32 rq[4], gq[4], bq[4];
v_expand(rd0, rq[0], rq[1]);
v_expand(rd1, rq[2], rq[3]);
v_expand(gd0, gq[0], gq[1]);
v_expand(gd1, gq[2], gq[3]);
v_expand(bd0, bq[0], bq[1]);
v_expand(bd1, bq[2], bq[3]);
v_int32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3;
v_expand(rd0, rq0, rq1);
v_expand(rd1, rq2, rq3);
v_expand(gd0, gq0, gq1);
v_expand(gd1, gq2, gq3);
v_expand(bd0, bq0, bq1);
v_expand(bd1, bq2, bq3);

const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
const int shifted128 = (128 << ITUR_BT_601_SHIFT);
@ -1505,18 +1550,21 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint
bu = vx_setall_s32(ITUR_BT_601_CBU);
bv = vx_setall_s32(ITUR_BT_601_CBV);

v_int32 uq[4], vq[4];
for(int k = 0; k < 4; k++)
{
uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT;
vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT;
}
v_int32 uq0, uq1, uq2, uq3, vq0, vq1, vq2, vq3;
uq0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq0), v_mul(gu, gq0)), v_mul(bu, bq0)), shift));
vq0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq0), v_mul(gv, gq0)), v_mul(bv, bq0)), shift));
uq1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq1), v_mul(gu, gq1)), v_mul(bu, bq1)), shift));
vq1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq1), v_mul(gv, gq1)), v_mul(bv, bq1)), shift));
uq2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq2), v_mul(gu, gq2)), v_mul(bu, bq2)), shift));
vq2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq2), v_mul(gv, gq2)), v_mul(bv, bq2)), shift));
uq3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq3), v_mul(gu, gq3)), v_mul(bu, bq3)), shift));
vq3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq3), v_mul(gv, gq3)), v_mul(bv, bq3)), shift));

v_int16 u0, u1, v0, v1;
u0 = v_pack(uq[0], uq[1]);
u1 = v_pack(uq[2], uq[3]);
v0 = v_pack(vq[0], vq[1]);
v1 = v_pack(vq[2], vq[3]);
u0 = v_pack(uq0, uq1);
u1 = v_pack(uq2, uq3);
v0 = v_pack(vq0, vq1);
v1 = v_pack(vq2, vq3);

u = v_pack_u(u0, u1);
v = v_pack_u(v0, v1);
@ -1559,8 +1607,8 @@ struct RGB8toYUV420pInvoker: public ParallelLoopBody
}
}
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();

for( ; i <= w/2 - vsize;
i += vsize)
@ -1708,47 +1756,61 @@ struct YUV422toRGB8Invoker : ParallelLoopBody
{
uchar* row = dst_data + dst_step * j;
int i = 0;
#if CV_SIMD
const int vsize = v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vsize = VTraits<v_uint8>::vlanes();
v_uint8 a = vx_setall_u8(uchar(0xff));
for(; i <= 2*width - 4*vsize;
i += 4*vsize, row += vsize*dcn*2)
{
v_uint8 u, v, vy[2];
v_uint8 u, v, vy0, vy1;
if(yIdx == 1) // UYVY
{
v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]);
v_load_deinterleave(yuv_src + i, u, vy0, v, vy1);
}
else // YUYV or YVYU
{
v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v);
v_load_deinterleave(yuv_src + i, vy0, u, vy1, v);
if(uIdx == 1) // YVYU
{
swap(u, v);
}
}

v_int32 ruv[4], guv[4], buv[4];
uvToRGBuv(u, v, ruv, guv, buv);
v_int32 ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3;
uvToRGBuv(u, v,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3);

v_uint8 r[2], g[2], b[2];
v_uint8 r0, r1, g0, g1, b0, b1;

yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]);
yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]);

yRGBuvToRGBA(vy0,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r0, g0, b0);
yRGBuvToRGBA(vy1,
ruv0, ruv1, ruv2, ruv3,
guv0, guv1, guv2, guv3,
buv0, buv1, buv2, buv3,
r1, g1, b1);

if(bIdx)
{
swap(r[0], b[0]);
swap(r[1], b[1]);
swap(r0, b0);
swap(r1, b1);
}

// [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
v_uint8 r0_0, r0_1;
v_zip(r[0], r[1], r0_0, r0_1);
v_zip(r0, r1, r0_0, r0_1);
v_uint8 g0_0, g0_1;
v_zip(g[0], g[1], g0_0, g0_1);
v_zip(g0, g1, g0_0, g0_1);
v_uint8 b0_0, b0_1;
v_zip(b[0], b[1], b0_0, b0_1);
v_zip(b0, b1, b0_0, b0_1);

if(dcn == 4)
{
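The interleaving logic above is the inverse of the load: v_load_deinterleave splits the packed YUYV stream into per-component vectors, and v_zip merges the even-pixel and odd-pixel results back into pixel order. A minimal sketch under the new API (interleave_pair is a hypothetical helper name):

#if (CV_SIMD || CV_SIMD_SCALABLE)
// r0 holds pixels 0,2,4,... and r1 pixels 1,3,5,... of one row segment;
// v_zip restores pixel order: lo = r0[0],r1[0],r0[1],r1[1],...
static inline void interleave_pair(const cv::v_uint8& r0, const cv::v_uint8& r1,
                                   cv::v_uint8& lo, cv::v_uint8& hi)
{
    cv::v_zip(r0, r1, lo, hi);
}
#endif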
File diff suppressed because it is too large
@ -1156,13 +1156,13 @@ public:

for(; x < numCols; ++x )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();

for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) {
v_uint8 v_edge1 = (vx_load(edgeData + x ) != v_zero);
v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero);
for(; x <= numCols - 2*VTraits<v_uint8>::vlanes(); x += 2*VTraits<v_uint8>::vlanes()) {
v_uint8 v_edge1 = (v_ne(vx_load(edgeData + x), v_zero));
v_uint8 v_edge2 = (v_ne(vx_load(edgeData + x + VTraits<v_uint8>::vlanes()), v_zero));

if(v_check_any(v_edge1))
{
@ -1172,7 +1172,7 @@ public:

if(v_check_any(v_edge2))
{
x += v_uint8::nlanes + v_scan_forward(v_edge2);
x += VTraits<v_uint8>::vlanes() + v_scan_forward(v_edge2);
goto _next_step;
}
}
@ -1183,7 +1183,7 @@ public:

if(x == numCols)
continue;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
_next_step:
#endif
float vx, vy;
@ -1514,7 +1514,7 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
int nzCount = 0;
const Point* nz_ = &nz[0];
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
@ -1522,9 +1522,9 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
v_float32 v_curCenterX = vx_setall_f32(curCenter.x);
v_float32 v_curCenterY = vx_setall_f32(curCenter.y);

float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes)
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
for(; j <= nzSz - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
{
v_float32 v_nzX, v_nzY;
v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype
@ -1532,16 +1532,16 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));

v_float32 v_dx = v_x - v_curCenterX;
v_float32 v_dy = v_y - v_curCenterY;
v_float32 v_dx = v_sub(v_x, v_curCenterX);
v_float32 v_dy = v_sub(v_y, v_curCenterY);

v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_mul(v_dy, v_dy));
v_float32 vmask = v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2));
if (v_check_any(vmask))
{
v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
v_store_aligned(rbuf, v_r2);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
if (rmask[i]) ddata[nzCount++] = rbuf[i];
}
}
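The pattern above — compare, test the whole mask with v_check_any, then compact matching lanes through small aligned buffers — generalises. Below is a sketch under the new API; buffers are sized with max_nlanes so they fit any runtime vector length on scalable targets, and compact_in_range is a hypothetical name for illustration only (out is assumed large enough for all matches).

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static int compact_in_range(const float* src, int n, float lo2, float hi2, float* out)
{
    using namespace cv;
    int cnt = 0, i = 0;
    const v_float32 vlo = vx_setall_f32(lo2), vhi = vx_setall_f32(hi2);
    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_float32>::max_nlanes];
    int   CV_DECL_ALIGNED(CV_SIMD_WIDTH) msk[VTraits<v_int32>::max_nlanes];
    for (; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes())
    {
        v_float32 x = vx_load(src + i);
        v_float32 m = v_and(v_le(vlo, x), v_le(x, vhi)); // in-range mask
        if (v_check_any(m)) // skip the per-lane loop when nothing matched
        {
            v_store_aligned(msk, v_reinterpret_as_s32(m));
            v_store_aligned(buf, x);
            for (int k = 0; k < VTraits<v_int32>::vlanes(); ++k)
                if (msk[k]) out[cnt++] = buf[k];
        }
    }
    for (; i < n; ++i) // scalar tail
        if (lo2 <= src[i] && src[i] <= hi2) out[cnt++] = src[i];
    return cnt;
}
#endif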
@ -1573,13 +1573,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols));
const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows));

#if CV_SIMD
float v_seq[v_float32::nlanes];
for (int i = 0; i < v_float32::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
float v_seq[VTraits<v_float32>::max_nlanes];
for (int i = 0; i < VTraits<v_float32>::vlanes(); ++i)
v_seq[i] = (float)i;
const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq);
const v_float32 v_curCenterX_0123 = v_sub(vx_setall_f32(curCenter.x), vx_load(v_seq));
#endif

for (int y = yOuter.start; y < yOuter.end; y++)
@ -1589,27 +1589,27 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
float dy2 = dy * dy;

int x = xOuter.start;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const v_float32 v_dy2 = vx_setall_f32(dy2);
const v_uint32 v_zero_u32 = vx_setall_u32(0);
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes)
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
for (; x <= xOuter.end - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
{
v_uint32 v_mask = vx_load_expand_q(ptr + x);
v_mask = v_mask != v_zero_u32;
v_mask = v_ne(v_mask, v_zero_u32);

v_float32 v_x = v_cvt_f32(vx_setall_s32(x));
v_float32 v_dx = v_x - v_curCenterX_0123;
v_float32 v_dx = v_sub(v_x, v_curCenterX_0123);

v_float32 v_r2 = (v_dx * v_dx) + v_dy2;
v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_dy2);
v_float32 vmask = v_and(v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)), v_reinterpret_as_f32(v_mask));
if (v_check_any(vmask))
{
v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
v_store_aligned(rbuf, v_r2);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
if (rmask[i]) ddata[nzCount++] = rbuf[i];
}
}
@ -88,7 +88,7 @@ static unsigned char const stackblurShr[255] =

namespace cv{

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template<typename T>
inline int opRow(const T* , T* , const std::vector<ushort>& , const float , const int radius, const int CN, const int )
{
@ -107,7 +107,7 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
const int mulValTab= stackblurMul[radius];
const int shrValTab= stackblurShr[radius];

const int VEC_LINE = v_uint8::nlanes;
const int VEC_LINE = VTraits<v_uint8>::vlanes();

if (kernelSize == 3)
{
@ -126,10 +126,10 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
v_expand(x1l, y00, y01);
v_expand(x1h, y10, y11);

y00 = (y00 * v_mulVal)>>shrValTab;
y01 = (y01 * v_mulVal)>>shrValTab;
y10 = (y10 * v_mulVal)>>shrValTab;
y11 = (y11 * v_mulVal)>>shrValTab;
y00 = v_shr(v_mul(y00, v_mulVal), shrValTab);
y01 = v_shr(v_mul(y01, v_mulVal), shrValTab);
y10 = v_shr(v_mul(y10, v_mulVal), shrValTab);
y11 = v_shr(v_mul(y11, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11)));
}
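The mulValTab/shrValTab pair turns the division by the stack weight into a multiply-and-shift, so the hot loop stays in integers. A scalar sketch of the same operation, using the stackblurMul/stackblurShr tables defined at the top of this file (stackblurDiv is a hypothetical name):

// value / weight is approximated lane-wise above as
// (value * stackblurMul[radius]) >> stackblurShr[radius]
static inline int stackblurDiv(int value, int radius)
{
    return (value * stackblurMul[radius]) >> stackblurShr[radius];
}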
@ -159,12 +159,12 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
v_uint8 v_src3 = vx_load(srcPtr + j + CN);

v_int16 xl, xh;
v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
s0 += v_dotprod(xl, k12);
s1 += v_dotprod(xh, k12);
v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
s2 += v_dotprod(xl, k12);
s3 += v_dotprod(xh, k12);
v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh);
s0 = v_add(s0, v_dotprod(xl, k12));
s1 = v_add(s1, v_dotprod(xh, k12));
v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh);
s2 = v_add(s2, v_dotprod(xl, k12));
s3 = v_add(s3, v_dotprod(xh, k12));
}
if( k < kernelSize / 2 + 1 )
{
@ -175,17 +175,17 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us

v_int16 xl, xh;
v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
s0 += v_dotprod(xl, k1);
s1 += v_dotprod(xh, k1);
s0 = v_add(s0, v_dotprod(xl, k1));
s1 = v_add(s1, v_dotprod(xh, k1));
v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
s2 += v_dotprod(xl, k1);
s3 += v_dotprod(xh, k1);
s2 = v_add(s2, v_dotprod(xl, k1));
s3 = v_add(s3, v_dotprod(xh, k1));
}

s0 = (s0 * v_mulVal)>>shrValTab;
s1 = (s1 * v_mulVal)>>shrValTab;
s2 = (s2 * v_mulVal)>>shrValTab;
s3 = (s3 * v_mulVal)>>shrValTab;
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);
s2 = v_shr(v_mul(s2, v_mulVal), shrValTab);
s3 = v_shr(v_mul(s3, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3))));
}
@ -205,7 +205,7 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
const int mulValTab= stackblurMul[radius];
const int shrValTab= stackblurShr[radius];

const int VEC_LINE = v_uint16::nlanes;
const int VEC_LINE = VTraits<v_uint16>::vlanes();

v_uint32 v_mulVal = vx_setall_u32(mulValTab);
if (kernelSize == 3)
@ -220,7 +220,7 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l));
x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h));

v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab));
v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab)));
}
}
else
@ -243,25 +243,25 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
v_uint16 k2 = vx_setall_u16(kx[k + 1]);

v_uint32 y0, y1;
v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}
if( k < kernelSize / 2 + 1 )
{
v_uint16 k1 = vx_setall_u16(kx[k]);

v_uint32 y0, y1;
v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}

s0 = (s0 * v_mulVal)>>shrValTab;
s1 = (s1 * v_mulVal)>>shrValTab;
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(s0, s1));
}
@ -282,7 +282,7 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
const int mulValTab= stackblurMul[radius];
const int shrValTab= stackblurShr[radius];

const int VEC_LINE = v_int16::nlanes;
const int VEC_LINE = VTraits<v_int16>::vlanes();
v_int32 v_mulVal = vx_setall_s32(mulValTab);

if (kernelSize == 3)
@ -297,7 +297,7 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l));
x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h));

v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab));
v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab)));
}
}
else
@ -320,24 +320,24 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us

v_int32 y0, y1;

v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}
if( k < kernelSize / 2 + 1 )
{
v_int16 k1 = vx_setall_s16((short)kx[k]);
v_int32 y0, y1;
v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
s0 += y0;
s1 += y1;
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
s0 = v_add(s0, y0);
s1 = v_add(s1, y1);
}

s0 = (s0 * v_mulVal)>>shrValTab;
s1 = (s1 * v_mulVal)>>shrValTab;
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);

v_store(dstPtr + i, v_pack(s0, s1));
}
@ -352,7 +352,7 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
int i = radius * CN;

v_float32 v_mulVal = vx_setall_f32(mulVal);
const int VEC_LINE = v_float32::nlanes;
const int VEC_LINE = VTraits<v_float32>::vlanes();
const int VEC_LINE4 = VEC_LINE * 4;

if (kernelSize == 3)
@ -364,22 +364,22 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + i);
v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + i);

v_float32 v_sumVal0 = v_srcPtr0 + v_srcPtr0 + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN);
v_float32 v_sumVal1 = v_srcPtr1 + v_srcPtr1 + vx_load(srcPtr + VEC_LINE + i - CN) + vx_load(srcPtr + VEC_LINE + i + CN);
v_float32 v_sumVal2 = v_srcPtr2 + v_srcPtr2 + vx_load(srcPtr + VEC_LINE * 2 + i - CN) + vx_load(srcPtr + VEC_LINE * 2 + i + CN);
v_float32 v_sumVal3 = v_srcPtr3 + v_srcPtr3 + vx_load(srcPtr + VEC_LINE * 3 + i - CN) + vx_load(srcPtr + VEC_LINE * 3 + i + CN);
v_float32 v_sumVal0 = v_add(v_add(v_add(v_srcPtr0, v_srcPtr0), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN));
v_float32 v_sumVal1 = v_add(v_add(v_add(v_srcPtr1, v_srcPtr1), vx_load(srcPtr + VEC_LINE + i - CN)), vx_load(srcPtr + VEC_LINE + i + CN));
v_float32 v_sumVal2 = v_add(v_add(v_add(v_srcPtr2, v_srcPtr2), vx_load(srcPtr + VEC_LINE * 2 + i - CN)), vx_load(srcPtr + VEC_LINE * 2 + i + CN));
v_float32 v_sumVal3 = v_add(v_add(v_add(v_srcPtr3, v_srcPtr3), vx_load(srcPtr + VEC_LINE * 3 + i - CN)), vx_load(srcPtr + VEC_LINE * 3 + i + CN));

v_store(dstPtr + i, v_sumVal0 * v_mulVal);
v_store(dstPtr + i + VEC_LINE, v_sumVal1 * v_mulVal);
v_store(dstPtr + i + VEC_LINE * 2, v_sumVal2 * v_mulVal);
v_store(dstPtr + i + VEC_LINE * 3, v_sumVal3 * v_mulVal);
v_store(dstPtr + i, v_mul(v_sumVal0, v_mulVal));
v_store(dstPtr + i + VEC_LINE, v_mul(v_sumVal1, v_mulVal));
v_store(dstPtr + i + VEC_LINE * 2, v_mul(v_sumVal2, v_mulVal));
v_store(dstPtr + i + VEC_LINE * 3, v_mul(v_sumVal3, v_mulVal));
}

for (; i <= widthCN - VEC_LINE; i += VEC_LINE)
{
v_float32 v_srcPtr = vx_load(srcPtr + i);
v_float32 v_sumVal = v_srcPtr + v_srcPtr + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN);
v_store(dstPtr + i, v_sumVal * v_mulVal);
v_float32 v_sumVal = v_add(v_add(v_add(v_srcPtr, v_srcPtr), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN));
v_store(dstPtr + i, v_mul(v_sumVal, v_mulVal));
}
}
else
@ -392,7 +392,7 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
{
v_float32 v_src = vx_load(srcPtr);
v_float32 s0;
s0 = v_src * k0;
s0 = v_mul(v_src, k0);

int k = 1, j = CN;
for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN)
@ -400,17 +400,17 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
v_float32 k1 = vx_setall_f32((float)kx[k]);
v_float32 k2 = vx_setall_f32((float)kx[k + 1]);

s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1;
s0 += (vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN)) * k2;
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1));
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2));
}
if( k < kernelSize / 2 + 1 )
{
v_float32 k1 = vx_setall_f32((float)kx[k]);

s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1;
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1));
}

v_store(dstPtr + i, s0 * v_mulVal);
v_store(dstPtr + i, v_mul(s0, v_mulVal));
}
}
return i;
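Each opRow specialisation above evaluates the same symmetric kernel. In scalar form, one output sample of the float path looks like the sketch below (symRowTap is a hypothetical name; kx holds one half of the kernel with kx[0] as the centre tap, as in the code above):

#include <vector>

static inline float symRowTap(const float* p, const std::vector<unsigned short>& kx,
                              int CN, int kernelSize, float mulVal)
{
    float s = p[0] * (float)kx[0];
    for (int k = 1, j = CN; k < kernelSize / 2 + 1; ++k, j += CN)
        s += (p[-j] + p[j]) * (float)kx[k]; // symmetric taps share one weight
    return s * mulVal;
}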
@ -426,8 +426,8 @@ template<>
inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1)
{
int index = 0;
const int VEC_LINE_8 = v_uint8::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_8 = VTraits<v_uint8>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8)
{
v_uint16 x0l, x0h, x1l, x1h;
@ -435,8 +435,8 @@ inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const in
v_expand(vx_load(srcPtr), x1l, x1h);

v_int32 y0, y1, y2, y3;
v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1);
v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3);
v_expand(v_sub(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x1l)), y0, y1);
v_expand(v_sub(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x1h)), y2, y3);

v_store(diff0, y0);
v_store(diff0 + VEC_LINE_32, y1);
@ -517,7 +517,7 @@ public:

// middle
int wc = radius * CN;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
wc = opRow<T>(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN);
#endif
for (; wc < widthCN; wc++)
@ -586,7 +586,7 @@ public:
// middle
auto diff0 = diff + radius * CN;
int index = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
index = opComputeDiff(srcPtr, diff0, widthCN, CNR1);
#endif

@ -688,7 +688,7 @@ private:
float mulVal;
};

#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template<typename T, typename TBuf>
inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float ,
const int , const int , const int , const int , const int )
@ -703,7 +703,7 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
{
int k = 0;
v_float32 v_mulVal = vx_setall_f32(mulVal);
const int VEC_LINE = v_float32::nlanes;
const int VEC_LINE = VTraits<v_float32>::vlanes();
const int VEC_LINE4 = 4 * VEC_LINE;

auto stackStartPtr = stack + ss * widthLen;
@ -726,20 +726,20 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k);
v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k);

v_store(dstPtr + k, v_sum0 * v_mulVal);
v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal);
v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal);
v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal);
v_store(dstPtr + k, v_mul(v_sum0, v_mulVal));
v_store(dstPtr + VEC_LINE + k, v_mul(v_sum1, v_mulVal));
v_store(dstPtr + VEC_LINE * 2 + k, v_mul(v_sum2, v_mulVal));
v_store(dstPtr + VEC_LINE * 3 + k, v_mul(v_sum3, v_mulVal));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum2 -= v_sumOut2;
v_sum3 -= v_sumOut3;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);
v_sum2 = v_sub(v_sum2, v_sumOut2);
v_sum3 = v_sub(v_sum3, v_sumOut3);

v_sumOut0 -= vx_load(stackStartPtr + k);
v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k);
v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k);
v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k);
v_sumOut0 = v_sub(v_sumOut0, vx_load(stackStartPtr + k));
v_sumOut1 = v_sub(v_sumOut1, vx_load(stackStartPtr + VEC_LINE + k));
v_sumOut2 = v_sub(v_sumOut2, vx_load(stackStartPtr + VEC_LINE * 2 + k));
v_sumOut3 = v_sub(v_sumOut3, vx_load(stackStartPtr + VEC_LINE * 3 + k));

v_float32 v_srcPtr0 = vx_load(srcPtr + k);
v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k);
@ -751,35 +751,35 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2);
v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3);

v_sumIn0 += v_srcPtr0;
v_sumIn1 += v_srcPtr1;
v_sumIn2 += v_srcPtr2;
v_sumIn3 += v_srcPtr3;
v_sumIn0 = v_add(v_sumIn0, v_srcPtr0);
v_sumIn1 = v_add(v_sumIn1, v_srcPtr1);
v_sumIn2 = v_add(v_sumIn2, v_srcPtr2);
v_sumIn3 = v_add(v_sumIn3, v_srcPtr3);

v_store(sum + k, v_sum0 + v_sumIn0);
v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1);
v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2);
v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3);
v_store(sum + k, v_add(v_sum0, v_sumIn0));
v_store(sum + VEC_LINE + k, v_add(v_sum1, v_sumIn1));
v_store(sum + VEC_LINE * 2 + k, v_add(v_sum2, v_sumIn2));
v_store(sum + VEC_LINE * 3 + k, v_add(v_sum3, v_sumIn3));

v_srcPtr0 = vx_load(stackSp1Ptr + k);
v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k);
v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k);
v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k);

v_sumOut0 += v_srcPtr0;
v_sumOut1 += v_srcPtr1;
v_sumOut2 += v_srcPtr2;
v_sumOut3 += v_srcPtr3;
v_sumOut0 = v_add(v_sumOut0, v_srcPtr0);
v_sumOut1 = v_add(v_sumOut1, v_srcPtr1);
v_sumOut2 = v_add(v_sumOut2, v_srcPtr2);
v_sumOut3 = v_add(v_sumOut3, v_srcPtr3);

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE + k, v_sumOut1);
v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2);
v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3);

v_sumIn0 -= v_srcPtr0;
v_sumIn1 -= v_srcPtr1;
v_sumIn2 -= v_srcPtr2;
v_sumIn3 -= v_srcPtr3;
v_sumIn0 = v_sub(v_sumIn0, v_srcPtr0);
v_sumIn1 = v_sub(v_sumIn1, v_srcPtr1);
v_sumIn2 = v_sub(v_sumIn2, v_srcPtr2);
v_sumIn3 = v_sub(v_sumIn3, v_srcPtr3);

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE + k, v_sumIn1);
@ -793,20 +793,20 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
v_float32 v_sumOut = vx_load(sumOut + k);
v_float32 v_sumIn = vx_load(sumIn + k);

v_store(dstPtr + k, v_sum * v_mulVal);
v_sum -= v_sumOut;
v_sumOut -= vx_load(stackStartPtr + k);
v_store(dstPtr + k, v_mul(v_sum, v_mulVal));
v_sum = v_sub(v_sum, v_sumOut);
v_sumOut = v_sub(v_sumOut, vx_load(stackStartPtr + k));

v_float32 v_srcPtr = vx_load(srcPtr + k);
v_store(stackStartPtr + k, v_srcPtr);

v_sumIn += v_srcPtr;
v_store(sum + k, v_sum + v_sumIn);
v_sumIn = v_add(v_sumIn, v_srcPtr);
v_store(sum + k, v_add(v_sum, v_sumIn));

v_srcPtr = vx_load(stackSp1Ptr + k);
v_sumOut += v_srcPtr;
v_sumOut = v_add(v_sumOut, v_srcPtr);
v_store(sumOut + k, v_sumOut);
v_sumIn -= v_srcPtr;
v_sumIn = v_sub(v_sumIn, v_srcPtr);
v_store(sumIn + k, v_sumIn);
}
return k;
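The update vectorised above maintains three running sums per column. A scalar sketch of one step, in the same order as the code (stackblurStep is a hypothetical name; stackStart and stackSp1 point at the oldest and middle stack entries):

static inline float stackblurStep(float src, float* stackStart, const float* stackSp1,
                                  float& sum, float& sumIn, float& sumOut, float mulVal)
{
    float dst = sum * mulVal;   // emit the current pixel
    sum    -= sumOut;           // retire the outgoing half of the stack
    sumOut -= *stackStart;      // drop the oldest sample
    *stackStart = src;          // replace it with the incoming sample
    sumIn  += src;
    sum    += sumIn;            // the incoming half advances the total
    sumOut += *stackSp1;        // the middle sample migrates outwards...
    sumIn  -= *stackSp1;        // ...and leaves the incoming half
    return dst;
}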
@ -820,8 +820,8 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
int k = 0;
if (mulValTab != 0 && shrValTab != 0)
{
const int VEC_LINE_8 = v_uint8::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_8 = VTraits<v_uint8>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
v_int32 v_mulVal = vx_setall_s32(mulValTab);

auto stackStartPtr = stack + ss * widthLen;
@ -850,13 +850,13 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack

v_store(dstPtr + k,
v_pack(
v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)),
v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab))));
v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))),
v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum2, v_mulVal), shrValTab), v_shr(v_mul(v_sum3, v_mulVal), shrValTab)))));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum2 -= v_sumOut2;
v_sum3 -= v_sumOut3;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);
v_sum2 = v_sub(v_sum2, v_sumOut2);
v_sum3 = v_sub(v_sum3, v_sumOut3);

v_uint16 x0l, x0h;
v_int32 v_ss0, v_ss1, v_ss2, v_ss3;
@ -865,10 +865,10 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3);

v_sumOut0 -= v_ss0;
v_sumOut1 -= v_ss1;
v_sumOut2 -= v_ss2;
v_sumOut3 -= v_ss3;
v_sumOut0 = v_sub(v_sumOut0, v_ss0);
v_sumOut1 = v_sub(v_sumOut1, v_ss1);
v_sumOut2 = v_sub(v_sumOut2, v_ss2);
v_sumOut3 = v_sub(v_sumOut3, v_ss3);

v_expand(vx_load(srcPtr + k), x0l, x0h);
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
@ -876,34 +876,34 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack

memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar));

v_sumIn0 += v_ss0;
v_sumIn1 += v_ss1;
v_sumIn2 += v_ss2;
v_sumIn3 += v_ss3;
v_sumIn0 = v_add(v_sumIn0, v_ss0);
v_sumIn1 = v_add(v_sumIn1, v_ss1);
v_sumIn2 = v_add(v_sumIn2, v_ss2);
v_sumIn3 = v_add(v_sumIn3, v_ss3);

v_store(sum + k, v_sum0 + v_sumIn0);
v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1);
v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2);
v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3);
v_store(sum + k, v_add(v_sum0, v_sumIn0));
v_store(sum + VEC_LINE_32 + k, v_add(v_sum1, v_sumIn1));
v_store(sum + VEC_LINE_32 * 2 + k, v_add(v_sum2, v_sumIn2));
v_store(sum + VEC_LINE_32 * 3 + k, v_add(v_sum3, v_sumIn3));

v_expand(vx_load(stackSp1Ptr + k), x0l, x0h);
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3);

v_sumOut0 += v_ss0;
v_sumOut1 += v_ss1;
v_sumOut2 += v_ss2;
v_sumOut3 += v_ss3;
v_sumOut0 = v_add(v_sumOut0, v_ss0);
v_sumOut1 = v_add(v_sumOut1, v_ss1);
v_sumOut2 = v_add(v_sumOut2, v_ss2);
v_sumOut3 = v_add(v_sumOut3, v_ss3);

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);
v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2);
v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3);

v_sumIn0 -= v_ss0;
v_sumIn1 -= v_ss1;
v_sumIn2 -= v_ss2;
v_sumIn3 -= v_ss3;
v_sumIn0 = v_sub(v_sumIn0, v_ss0);
v_sumIn1 = v_sub(v_sumIn1, v_ss1);
v_sumIn2 = v_sub(v_sumIn2, v_ss2);
v_sumIn3 = v_sub(v_sumIn3, v_ss3);

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@ -922,8 +922,8 @@ inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack
int k = 0;
if (mulValTab != 0 && shrValTab != 0)
{
const int VEC_LINE_16 = v_int16::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_16 = VTraits<v_int16>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
v_int32 v_mulVal = vx_setall_s32(mulValTab);

auto stackStartPtr = stack + ss * widthLen;
@ -943,39 +943,39 @@ inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack
v_sumOut0 = vx_load(sumOut + k);
v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);

v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab));
v_store(dstPtr + k,v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab)));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);

v_int32 v_ss0, v_ss1;
v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);

v_sumOut0 -= v_ss0;
v_sumOut1 -= v_ss1;
v_sumOut0 = v_sub(v_sumOut0, v_ss0);
v_sumOut1 = v_sub(v_sumOut1, v_ss1);

v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);
memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short));

v_sumIn0 += v_ss0;
v_sumIn1 += v_ss1;
v_sumIn0 = v_add(v_sumIn0, v_ss0);
v_sumIn1 = v_add(v_sumIn1, v_ss1);

v_sum0 += v_sumIn0;
v_sum1 += v_sumIn1;
v_sum0 = v_add(v_sum0, v_sumIn0);
v_sum1 = v_add(v_sum1, v_sumIn1);

v_store(sum + k, v_sum0);
v_store(sum + VEC_LINE_32 + k, v_sum1);

v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);

v_sumOut0 += v_ss0;
v_sumOut1 += v_ss1;
v_sumOut0 = v_add(v_sumOut0, v_ss0);
v_sumOut1 = v_add(v_sumOut1, v_ss1);

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);

v_sumIn0 -= v_ss0;
v_sumIn1 -= v_ss1;
v_sumIn0 = v_sub(v_sumIn0, v_ss0);
v_sumIn1 = v_sub(v_sumIn1, v_ss1);

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@ -992,8 +992,8 @@ inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* s
int k = 0;
if (mulValTab != 0 && shrValTab != 0)
{
const int VEC_LINE_16 = v_uint16::nlanes;
const int VEC_LINE_32 = v_int32::nlanes;
const int VEC_LINE_16 = VTraits<v_uint16>::vlanes();
const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab);

auto stackStartPtr = stack + ss * widthLen;
@ -1013,40 +1013,40 @@ inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* s
v_sumOut0 = vx_load(sumOut + k);
v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);

v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab));
v_store(dstPtr + k, v_pack(v_shr(v_mul(v_reinterpret_as_u32(v_sum0), v_mulVal), shrValTab), v_shr(v_mul(v_reinterpret_as_u32(v_sum1), v_mulVal), shrValTab)));

v_sum0 -= v_sumOut0;
v_sum1 -= v_sumOut1;
v_sum0 = v_sub(v_sum0, v_sumOut0);
v_sum1 = v_sub(v_sum1, v_sumOut1);

v_uint32 v_ss0, v_ss1;
v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);

v_sumOut0 -= v_reinterpret_as_s32(v_ss0);
v_sumOut1 -= v_reinterpret_as_s32(v_ss1);
v_sumOut0 = v_sub(v_sumOut0, v_reinterpret_as_s32(v_ss0));
v_sumOut1 = v_sub(v_sumOut1, v_reinterpret_as_s32(v_ss1));

v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);

memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort));

v_sumIn0 += v_reinterpret_as_s32(v_ss0);
v_sumIn1 += v_reinterpret_as_s32(v_ss1);
v_sumIn0 = v_add(v_sumIn0, v_reinterpret_as_s32(v_ss0));
v_sumIn1 = v_add(v_sumIn1, v_reinterpret_as_s32(v_ss1));

v_sum0 += v_sumIn0;
v_sum1 += v_sumIn1;
v_sum0 = v_add(v_sum0, v_sumIn0);
v_sum1 = v_add(v_sum1, v_sumIn1);

v_store(sum + k, v_sum0);
v_store(sum + VEC_LINE_32 + k, v_sum1);

v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);

v_sumOut0 += v_reinterpret_as_s32(v_ss0);
v_sumOut1 += v_reinterpret_as_s32(v_ss1);
v_sumOut0 = v_add(v_sumOut0, v_reinterpret_as_s32(v_ss0));
v_sumOut1 = v_add(v_sumOut1, v_reinterpret_as_s32(v_ss1));

v_store(sumOut + k, v_sumOut0);
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);

v_sumIn0 -= v_reinterpret_as_s32(v_ss0);
v_sumIn1 -= v_reinterpret_as_s32(v_ss1);
v_sumIn0 = v_sub(v_sumIn0, v_reinterpret_as_s32(v_ss0));
v_sumIn1 = v_sub(v_sumIn1, v_reinterpret_as_s32(v_ss1));

v_store(sumIn + k, v_sumIn0);
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@ -1152,7 +1152,7 @@ public:
}

int k = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
k = opColumn<T, TBuf>(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab,
widthLen, stackStart, sp1);
#endif
@ -190,7 +190,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
int j = 0;
const uchar* src = _src.ptr();
uchar* dst = _dst.ptr();
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint8 thresh_u = vx_setall_u8( thresh );
v_uint8 maxval16 = vx_setall_u8( maxval );

@ -199,12 +199,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_BINARY:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = thresh_u < v0;
v0 = v0 & maxval16;
v0 = v_lt(thresh_u, v0);
v0 = v_and(v0, maxval16);
v_store( dst + j, v0 );
}
}
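Every THRESH_* case below follows the same branchless recipe: a lane-wise comparison produces an all-ones/all-zeros mask, and a bitwise AND selects maxval or 0. A minimal sketch of the binary case under the new API (threshBinaryU8 is a hypothetical helper name):

#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline cv::v_uint8 threshBinaryU8(const cv::v_uint8& src,
                                         const cv::v_uint8& thresh,
                                         const cv::v_uint8& maxv)
{
    // src > thresh ? maxval : 0, without branches
    return cv::v_and(cv::v_lt(thresh, src), maxv);
}
#endif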
@ -213,12 +213,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_BINARY_INV:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = v0 <= thresh_u;
v0 = v0 & maxval16;
v0 = v_le(v0, thresh_u);
v0 = v_and(v0, maxval16);
v_store( dst + j, v0 );
}
}
@ -227,11 +227,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_TRUNC:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = v0 - ( v0 - thresh_u );
v0 = v_sub(v0, v_sub(v0, thresh_u));
v_store( dst + j, v0 );
}
}
@ -240,11 +240,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_TOZERO:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = ( thresh_u < v0 ) & v0;
v0 = v_and(v_lt(thresh_u, v0), v0);
v_store( dst + j, v0 );
}
}
@ -253,11 +253,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
case THRESH_TOZERO_INV:
for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
{
v_uint8 v0;
v0 = vx_load( src + j );
v0 = ( v0 <= thresh_u ) & v0;
v0 = v_and(v_le(v0, thresh_u), v0);
v_store( dst + j, v0 );
}
}
@ -351,7 +351,7 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)

const ushort* src = _src.ptr<ushort>();
ushort* dst = _dst.ptr<ushort>();
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
int i, j;
v_uint16 thresh_u = vx_setall_u16(thresh);
v_uint16 maxval16 = vx_setall_u16(maxval);
@ -361,25 +361,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
case THRESH_BINARY:
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
{
for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
for (j = 0; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
{
v_uint16 v0, v1;
v0 = vx_load(src + j);
v1 = vx_load(src + j + v_uint16::nlanes);
v0 = thresh_u < v0;
v1 = thresh_u < v1;
v0 = v0 & maxval16;
v1 = v1 & maxval16;
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
v0 = v_lt(thresh_u, v0);
v1 = v_lt(thresh_u, v1);
v0 = v_and(v0, maxval16);
v1 = v_and(v1, maxval16);
v_store(dst + j, v0);
v_store(dst + j + v_uint16::nlanes, v1);
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
}
if (j <= roi.width - v_uint16::nlanes)
if (j <= roi.width - VTraits<v_uint16>::vlanes())
{
v_uint16 v0 = vx_load(src + j);
v0 = thresh_u < v0;
v0 = v0 & maxval16;
v0 = v_lt(thresh_u, v0);
v0 = v_and(v0, maxval16);
v_store(dst + j, v0);
j += v_uint16::nlanes;
j += VTraits<v_uint16>::vlanes();
}

for (; j < roi.width; j++)
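The loop shape above recurs through the rest of this file: two vectors per iteration, then at most one leftover vector, then a scalar tail. A self-contained sketch of the same structure (minClampU16 is a hypothetical example operation, not part of the patch):

#include <algorithm>
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static void minClampU16(const unsigned short* src, unsigned short* dst, int w, unsigned short t)
{
    using namespace cv;
    const int vl = VTraits<v_uint16>::vlanes();
    v_uint16 vt = vx_setall_u16(t);
    int j = 0;
    for (; j <= w - 2*vl; j += 2*vl)        // main body: two vectors at a time
    {
        v_store(dst + j,      v_min(vx_load(src + j),      vt));
        v_store(dst + j + vl, v_min(vx_load(src + j + vl), vt));
    }
    if (j <= w - vl)                        // at most one leftover vector
    {
        v_store(dst + j, v_min(vx_load(src + j), vt));
        j += vl;
    }
    for (; j < w; j++)                      // scalar tail
        dst[j] = std::min(src[j], t);
}
#endif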
@ -391,25 +391,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v0 = v0 <= thresh_u;
|
||||
v1 = v1 <= thresh_u;
|
||||
v0 = v0 & maxval16;
|
||||
v1 = v1 & maxval16;
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_le(v0, thresh_u);
|
||||
v1 = v_le(v1, thresh_u);
|
||||
v0 = v_and(v0, maxval16);
|
||||
v1 = v_and(v1, maxval16);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = v0 <= thresh_u;
|
||||
v0 = v0 & maxval16;
|
||||
v0 = v_le(v0, thresh_u);
|
||||
v0 = v_and(v0, maxval16);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -421,22 +421,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_min(v0, thresh_u);
|
||||
v1 = v_min(v1, thresh_u);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = v_min(v0, thresh_u);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -448,22 +448,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v0 = (thresh_u < v0) & v0;
|
||||
v1 = (thresh_u < v1) & v1;
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_and(v_lt(thresh_u, v0), v0);
|
||||
v1 = v_and(v_lt(thresh_u, v1), v1);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = (thresh_u < v0) & v0;
|
||||
v0 = v_and(v_lt(thresh_u, v0), v0);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -475,22 +475,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
|
||||
for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
|
||||
{
|
||||
j = 0;
|
||||
for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
|
||||
for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0, v1;
|
||||
v0 = vx_load(src + j);
|
||||
v1 = vx_load(src + j + v_uint16::nlanes);
|
||||
v0 = (v0 <= thresh_u) & v0;
|
||||
v1 = (v1 <= thresh_u) & v1;
|
||||
v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
|
||||
v0 = v_and(v_le(v0, thresh_u), v0);
|
||||
v1 = v_and(v_le(v1, thresh_u), v1);
|
||||
v_store(dst + j, v0);
|
||||
v_store(dst + j + v_uint16::nlanes, v1);
|
||||
v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
|
||||
}
|
||||
if (j <= roi.width - v_uint16::nlanes)
|
||||
if (j <= roi.width - VTraits<v_uint16>::vlanes())
|
||||
{
|
||||
v_uint16 v0 = vx_load(src + j);
|
||||
v0 = (v0 <= thresh_u) & v0;
|
||||
v0 = v_and(v_le(v0, thresh_u), v0);
|
||||
v_store(dst + j, v0);
|
||||
j += v_uint16::nlanes;
|
||||
j += VTraits<v_uint16>::vlanes();
|
||||
}
|
||||
|
||||
for (; j < roi.width; j++)
|
||||
@ -571,7 +571,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int i, j;
|
||||
v_int16 thresh8 = vx_setall_s16( thresh );
|
||||
v_int16 maxval8 = vx_setall_s16( maxval );
|
||||
@ -582,25 +582,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = thresh8 < v0;
|
||||
v1 = thresh8 < v1;
|
||||
v0 = v0 & maxval8;
|
||||
v1 = v1 & maxval8;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_lt(thresh8, v0);
|
||||
v1 = v_lt(thresh8, v1);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v1 = v_and(v1, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = thresh8 < v0;
|
||||
v0 = v0 & maxval8;
|
||||
v0 = v_lt(thresh8, v0);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -612,25 +612,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = v0 <= thresh8;
|
||||
v1 = v1 <= thresh8;
|
||||
v0 = v0 & maxval8;
|
||||
v1 = v1 & maxval8;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_le(v0, thresh8);
|
||||
v1 = v_le(v1, thresh8);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v1 = v_and(v1, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = v0 <= thresh8;
|
||||
v0 = v0 & maxval8;
|
||||
v0 = v_le(v0, thresh8);
|
||||
v0 = v_and(v0, maxval8);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -642,22 +642,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_min( v0, thresh8 );
|
||||
v1 = v_min( v1, thresh8 );
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = v_min( v0, thresh8 );
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -669,22 +669,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = ( thresh8 < v0 ) & v0;
|
||||
v1 = ( thresh8 < v1 ) & v1;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_and(v_lt(thresh8, v0), v0);
|
||||
v1 = v_and(v_lt(thresh8, v1), v1);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = ( thresh8 < v0 ) & v0;
|
||||
v0 = v_and(v_lt(thresh8, v0), v0);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -696,22 +696,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_int16::nlanes );
|
||||
v0 = ( v0 <= thresh8 ) & v0;
|
||||
v1 = ( v1 <= thresh8 ) & v1;
|
||||
v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
|
||||
v0 = v_and(v_le(v0, thresh8), v0);
|
||||
v1 = v_and(v_le(v1, thresh8), v1);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_int16::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_int16::nlanes )
|
||||
if( j <= roi.width - VTraits<v_int16>::vlanes() )
|
||||
{
|
||||
v_int16 v0 = vx_load( src + j );
|
||||
v0 = ( v0 <= thresh8 ) & v0;
|
||||
v0 = v_and(v_le(v0, thresh8), v0);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_int16::nlanes;
|
||||
j += VTraits<v_int16>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -777,7 +777,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CV_SIMD
|
||||
#if (CV_SIMD || CV_SIMD_SCALABLE)
|
||||
int i, j;
|
||||
v_float32 thresh4 = vx_setall_f32( thresh );
|
||||
v_float32 maxval4 = vx_setall_f32( maxval );
|
||||
@ -788,25 +788,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_float32::nlanes );
|
||||
v0 = thresh4 < v0;
|
||||
v1 = thresh4 < v1;
|
||||
v0 = v0 & maxval4;
|
||||
v1 = v1 & maxval4;
|
||||
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
|
||||
v0 = v_lt(thresh4, v0);
|
||||
v1 = v_lt(thresh4, v1);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v1 = v_and(v1, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_float32::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_float32::nlanes )
|
||||
if( j <= roi.width - VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0 = vx_load( src + j );
|
||||
v0 = thresh4 < v0;
|
||||
v0 = v0 & maxval4;
|
||||
v0 = v_lt(thresh4, v0);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_float32::nlanes;
|
||||
j += VTraits<v_float32>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -818,25 +818,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
|
||||
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
|
||||
{
|
||||
j = 0;
|
||||
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
|
||||
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0, v1;
|
||||
v0 = vx_load( src + j );
|
||||
v1 = vx_load( src + j + v_float32::nlanes );
|
||||
v0 = v0 <= thresh4;
|
||||
v1 = v1 <= thresh4;
|
||||
v0 = v0 & maxval4;
|
||||
v1 = v1 & maxval4;
|
||||
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
|
||||
v0 = v_le(v0, thresh4);
|
||||
v1 = v_le(v1, thresh4);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v1 = v_and(v1, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
v_store( dst + j + v_float32::nlanes, v1 );
|
||||
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
|
||||
}
|
||||
if( j <= roi.width - v_float32::nlanes )
|
||||
if( j <= roi.width - VTraits<v_float32>::vlanes() )
|
||||
{
|
||||
v_float32 v0 = vx_load( src + j );
|
||||
v0 = v0 <= thresh4;
|
||||
v0 = v0 & maxval4;
|
||||
v0 = v_le(v0, thresh4);
|
||||
v0 = v_and(v0, maxval4);
|
||||
v_store( dst + j, v0 );
|
||||
j += v_float32::nlanes;
|
||||
j += VTraits<v_float32>::vlanes();
|
||||
}
|
||||
|
||||
for( ; j < roi.width; j++ )
|
||||
@ -848,22 +848,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
{
v_float32 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float32::nlanes );
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
v0 = v_min( v0, thresh4 );
v1 = v_min( v1, thresh4 );
v_store( dst + j, v0 );
v_store( dst + j + v_float32::nlanes, v1 );
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
}
if( j <= roi.width - v_float32::nlanes )
if( j <= roi.width - VTraits<v_float32>::vlanes() )
{
v_float32 v0 = vx_load( src + j );
v0 = v_min( v0, thresh4 );
v_store( dst + j, v0 );
j += v_float32::nlanes;
j += VTraits<v_float32>::vlanes();
}

for( ; j < roi.width; j++ )
@ -875,22 +875,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
{
v_float32 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float32::nlanes );
v0 = ( thresh4 < v0 ) & v0;
v1 = ( thresh4 < v1 ) & v1;
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
v0 = v_and(v_lt(thresh4, v0), v0);
v1 = v_and(v_lt(thresh4, v1), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float32::nlanes, v1 );
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
}
if( j <= roi.width - v_float32::nlanes )
if( j <= roi.width - VTraits<v_float32>::vlanes() )
{
v_float32 v0 = vx_load( src + j );
v0 = ( thresh4 < v0 ) & v0;
v0 = v_and(v_lt(thresh4, v0), v0);
v_store( dst + j, v0 );
j += v_float32::nlanes;
j += VTraits<v_float32>::vlanes();
}

for( ; j < roi.width; j++ )
@ -902,22 +902,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
{
v_float32 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float32::nlanes );
v0 = ( v0 <= thresh4 ) & v0;
v1 = ( v1 <= thresh4 ) & v1;
v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
v0 = v_and(v_le(v0, thresh4), v0);
v1 = v_and(v_le(v1, thresh4), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float32::nlanes, v1 );
v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
}
if( j <= roi.width - v_float32::nlanes )
if( j <= roi.width - VTraits<v_float32>::vlanes() )
{
v_float32 v0 = vx_load( src + j );
v0 = ( v0 <= thresh4 ) & v0;
v0 = v_and(v_le(v0, thresh4), v0);
v_store( dst + j, v0 );
j += v_float32::nlanes;
j += VTraits<v_float32>::vlanes();
}

for( ; j < roi.width; j++ )
@ -948,7 +948,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
roi.height = 1;
}

#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
int i, j;
v_float64 thresh2 = vx_setall_f64( thresh );
v_float64 maxval2 = vx_setall_f64( maxval );
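For the double-precision path the guard grows the same way: `CV_SIMD_64F` becomes `(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)`, so `thresh_64f` keeps its SIMD path on scalable backends that provide `v_float64`. Note that the THRESH_TRUNC case above needed no comparison rewrite at all, since `v_min` was already a named function; a standalone sketch of that case for doubles, with the hypothetical helper `trunc_thresh_row`:

```cpp
#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
using namespace cv;

// Hypothetical row kernel: dst[j] = min(src[j], thresh)
static void trunc_thresh_row(const double* src, double* dst, int width, double thresh)
{
    v_float64 thresh2 = vx_setall_f64(thresh);
    int j = 0;
    for (; j <= width - VTraits<v_float64>::vlanes();
         j += VTraits<v_float64>::vlanes())
        v_store(dst + j, v_min(vx_load(src + j), thresh2));
    for (; j < width; j++)  // scalar tail
        dst[j] = std::min(src[j], thresh);
}
#endif
```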
@ -959,25 +959,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = thresh2 < v0;
v1 = thresh2 < v1;
v0 = v0 & maxval2;
v1 = v1 & maxval2;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_lt(thresh2, v0);
v1 = v_lt(thresh2, v1);
v0 = v_and(v0, maxval2);
v1 = v_and(v1, maxval2);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = thresh2 < v0;
v0 = v0 & maxval2;
v0 = v_lt(thresh2, v0);
v0 = v_and(v0, maxval2);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -989,25 +989,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = v0 <= thresh2;
v1 = v1 <= thresh2;
v0 = v0 & maxval2;
v1 = v1 & maxval2;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_le(v0, thresh2);
v1 = v_le(v1, thresh2);
v0 = v_and(v0, maxval2);
v1 = v_and(v1, maxval2);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = v0 <= thresh2;
v0 = v0 & maxval2;
v0 = v_le(v0, thresh2);
v0 = v_and(v0, maxval2);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -1019,22 +1019,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_min( v0, thresh2 );
v1 = v_min( v1, thresh2 );
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = v_min( v0, thresh2 );
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -1046,22 +1046,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = ( thresh2 < v0 ) & v0;
v1 = ( thresh2 < v1 ) & v1;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_and(v_lt(thresh2, v0), v0);
v1 = v_and(v_lt(thresh2, v1), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = ( thresh2 < v0 ) & v0;
v0 = v_and(v_lt(thresh2, v0), v0);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
@ -1073,22 +1073,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
j = 0;
for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
{
v_float64 v0, v1;
v0 = vx_load( src + j );
v1 = vx_load( src + j + v_float64::nlanes );
v0 = ( v0 <= thresh2 ) & v0;
v1 = ( v1 <= thresh2 ) & v1;
v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
v0 = v_and(v_le(v0, thresh2), v0);
v1 = v_and(v_le(v1, thresh2), v1);
v_store( dst + j, v0 );
v_store( dst + j + v_float64::nlanes, v1 );
v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
}
if( j <= roi.width - v_float64::nlanes )
if( j <= roi.width - VTraits<v_float64>::vlanes() )
{
v_float64 v0 = vx_load( src + j );
v0 = ( v0 <= thresh2 ) & v0;
v0 = v_and(v_le(v0, thresh2), v0);
v_store( dst + j, v0 );
j += v_float64::nlanes;
j += VTraits<v_float64>::vlanes();
}

for( ; j < roi.width; j++ )
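The THRESH_TOZERO and THRESH_TOZERO_INV branches combine both idioms: a comparison produces an all-ones/all-zeros bit pattern per lane, so ANDing it with the source keeps the original value exactly where the predicate holds, even for floating-point lanes. A self-contained sketch of the TOZERO kernel under the new API, with the hypothetical helper `tozero_thresh_row`:

```cpp
#include "opencv2/core/hal/intrin.hpp"

#if (CV_SIMD || CV_SIMD_SCALABLE)
using namespace cv;

// Hypothetical row kernel: dst[j] = src[j] > thresh ? src[j] : 0
static void tozero_thresh_row(const float* src, float* dst, int width, float thresh)
{
    v_float32 thresh4 = vx_setall_f32(thresh);
    int j = 0;
    for (; j <= width - VTraits<v_float32>::vlanes();
         j += VTraits<v_float32>::vlanes())
    {
        v_float32 v = vx_load(src + j);
        // comparison lanes are full bit masks, so AND keeps v where thresh < v
        v_store(dst + j, v_and(v_lt(thresh4, v), v));
    }
    for (; j < width; j++)  // scalar tail
        dst[j] = src[j] > thresh ? src[j] : 0.f;
}
#endif
```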