Merge pull request #24325 from hanliutong:rewrite

Rewrite Universal Intrinsic code: float-related part #24325 The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro: rewrite them using the new Universal Intrinsic API. The series of PRs is listed below: #23885 First patch, an example #23980 Core module #24058 ImgProc module, part 1 #24132 ImgProc module, part 2 #24166 ImgProc module, part 3 #24301 Features2d and calib3d module #24324 Gapi module This patch (hopefully) is the last one in the series. This patch mainly involves 3 parts: 1. Add some modifications related to float (CV_SIMD_64F) 2. Use `#if (CV_SIMD || CV_SIMD_SCALABLE)` instead of `#if CV_SIMD || CV_SIMD_SCALABLE`, so that the `CV_SIMD` blocks that are not yet enabled for `CV_SIMD_SCALABLE` can be found by searching for `if CV_SIMD` 3. Summary of `CV_SIMD` blocks that remain unmodified: Updated comments - Some blocks cause test failures when enabled for RVV; they are marked as `TODO: enable for CV_SIMD_SCALABLE, ....` - Some blocks cannot be rewritten directly. (Not commented in the source code, just listed here) - ./modules/core/src/mathfuncs_core.simd.hpp (Vector type wrapped in class/struct) - ./modules/imgproc/src/color_lab.cpp (Array of vector type) - ./modules/imgproc/src/color_rgb.simd.hpp (Array of vector type) - ./modules/imgproc/src/sumpixels.simd.hpp (fixed-length algorithm, strongly related to `CV_SIMD_WIDTH`) These algorithms will need to be redesigned to accommodate scalable backends. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-08-05 22:19:14 +08:00 · 2023-10-05 22:57:25 +08:00 · 2023-10-05 22:57:25 +08:00 · 07bf9cb013
commit 07bf9cb013
parent 3dcaf1f287
12 changed files with 341 additions and 324 deletions
--- a/modules/calib3d/src/undistort.simd.hpp
+++ b/modules/calib3d/src/undistort.simd.hpp
@ -89,8 +89,8 @@ public:
        s2(_s2),
        s3(_s3),
        s4(_s4) {
-#if CV_SIMD_64F
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
        {
            s_x[i] = ir[0] * i;
            s_y[i] = ir[3] * i;
@ -123,26 +123,26 @@ public:
            else
                CV_Assert(m1 != NULL);

-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
            const v_float64 v_one = vx_setall_f64(1.0);
-            for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
+            for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
            {
                v_float64 m_0, m_1, m_2, m_3;
-                m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
-                m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
+                m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
+                m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
                m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
-                v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
-                v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
-                v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
-                v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
+                v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
+                v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
+                v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
+                v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);

-                v_float64 xd_0 = x_0 * x_0;
-                v_float64 yd_0 = y_0 * y_0;
-                v_float64 xd_1 = x_1 * x_1;
-                v_float64 yd_1 = y_1 * y_1;
+                v_float64 xd_0 = v_mul(x_0, x_0);
+                v_float64 yd_0 = v_mul(y_0, y_0);
+                v_float64 xd_1 = v_mul(x_1, x_1);
+                v_float64 yd_1 = v_mul(y_1, y_1);

-                v_float64 r2_0 = xd_0 + yd_0;
-                v_float64 r2_1 = xd_1 + yd_1;
+                v_float64 r2_0 = v_add(xd_0, yd_0);
+                v_float64 r2_1 = v_add(xd_1, yd_1);

                m_1 = vx_setall_f64(k3);
                m_2 = vx_setall_f64(k2);
@ -151,18 +151,18 @@ public:
                m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
                m_3 = vx_setall_f64(k6);
                m_2 = vx_setall_f64(k5);
-                m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
-                m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
+                m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
+                m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));

                m_3 = vx_setall_f64(2.0);
                xd_0 = v_muladd(m_3, xd_0, r2_0);
                yd_0 = v_muladd(m_3, yd_0, r2_0);
                xd_1 = v_muladd(m_3, xd_1, r2_1);
                yd_1 = v_muladd(m_3, yd_1, r2_1);
-                m_2 = x_0 * y_0 * m_3;
-                m_3 = x_1 * y_1 * m_3;
+                m_2 = v_mul(v_mul(x_0, y_0), m_3);
+                m_3 = v_mul(v_mul(x_1, y_1), m_3);

-                x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
+                x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);

                m_0 = vx_setall_f64(p1);
                m_1 = vx_setall_f64(p2);
@ -176,8 +176,8 @@ public:
                xd_1 = v_muladd(m_0, m_3, xd_1);
                yd_1 = v_muladd(m_1, m_3, yd_1);

-                m_0 = r2_0 * r2_0;
-                m_1 = r2_1 * r2_1;
+                m_0 = v_mul(r2_0, r2_0);
+                m_1 = v_mul(r2_1, r2_1);
                m_2 = vx_setall_f64(s2);
                m_3 = vx_setall_f64(s1);
                xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
@ -203,17 +203,17 @@ public:
                r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
                r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
                m_0 = vx_setzero_f64();
-                r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
-                r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
+                r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
+                r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));

                m_0 = vx_setall_f64(fx);
                m_1 = vx_setall_f64(u0);
                m_2 = vx_setall_f64(fy);
                m_3 = vx_setall_f64(v0);
-                x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
-                y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
-                x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
-                y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
+                x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
+                y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
+                x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
+                y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);

                if (m1type == CV_32FC1)
                {
@ -225,20 +225,20 @@ public:
                    v_float32 mf0, mf1;
                    v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
                    v_store(&m1f[j * 2], mf0);
-                    v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
+                    v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
                }
                else // m1type == CV_16SC2
                {
                    m_0 = vx_setall_f64(INTER_TAB_SIZE);
-                    x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
+                    x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);

                    v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
                    v_int32 iu = v_round(x_0, x_1);
                    v_int32 iv = v_round(y_0, y_1);

-                    v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
+                    v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
                    v_int32 out0, out1;
-                    v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
+                    v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
                    v_store(&m1[j * 2], v_pack(out0, out1));
                }
            }
@ -302,10 +302,10 @@ private:
    double s2;
    double s3;
    double s4;
-#if CV_SIMD_64F
-    double s_x[2*v_float64::nlanes];
-    double s_y[2*v_float64::nlanes];
-    double s_w[2*v_float64::nlanes];
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    double s_x[2*VTraits<v_float64>::max_nlanes];
+    double s_y[2*VTraits<v_float64>::max_nlanes];
+    double s_w[2*VTraits<v_float64>::max_nlanes];
 #endif
 };
 }
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -972,6 +972,15 @@ namespace CV__SIMD_NAMESPACE {
    { \
        return a op b; \
    }
+    #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a == b; \
+    } \
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a != b; \
+    }

    #define OPENCV_HAL_WRAP_CMP(_Tpvec) \
    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
@ -984,11 +993,11 @@ namespace CV__SIMD_NAMESPACE {
    OPENCV_HAL_WRAP_CMP(v_uint8)
    OPENCV_HAL_WRAP_CMP(v_uint16)
    OPENCV_HAL_WRAP_CMP(v_uint32)
-    // OPENCV_HAL_WRAP_CMP(v_uint64)
+    OPENCV_HAL_WRAP_EQ_OP(v_uint64)
    OPENCV_HAL_WRAP_CMP(v_int8)
    OPENCV_HAL_WRAP_CMP(v_int16)
    OPENCV_HAL_WRAP_CMP(v_int32)
-    // OPENCV_HAL_WRAP_CMP(v_int64)
+    OPENCV_HAL_WRAP_EQ_OP(v_int64)
    OPENCV_HAL_WRAP_CMP(v_float32)
    #if CV_SIMD_64F
    OPENCV_HAL_WRAP_CMP(v_float64)
@ -997,9 +1006,11 @@ namespace CV__SIMD_NAMESPACE {
        OPENCV_HAL_WRAP_CMP(v_uint8x16)
        OPENCV_HAL_WRAP_CMP(v_uint16x8)
        OPENCV_HAL_WRAP_CMP(v_uint32x4)
+        OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
        OPENCV_HAL_WRAP_CMP(v_int8x16)
        OPENCV_HAL_WRAP_CMP(v_int16x8)
        OPENCV_HAL_WRAP_CMP(v_int32x4)
+        OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
        OPENCV_HAL_WRAP_CMP(v_float32x4)
        #if CV_SIMD_64F
        OPENCV_HAL_WRAP_CMP(v_float64x2)
@ -1009,9 +1020,11 @@ namespace CV__SIMD_NAMESPACE {
        OPENCV_HAL_WRAP_CMP(v_uint8x32)
        OPENCV_HAL_WRAP_CMP(v_uint16x16)
        OPENCV_HAL_WRAP_CMP(v_uint32x8)
+        OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
        OPENCV_HAL_WRAP_CMP(v_int8x32)
        OPENCV_HAL_WRAP_CMP(v_int16x16)
        OPENCV_HAL_WRAP_CMP(v_int32x8)
+        OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
        OPENCV_HAL_WRAP_CMP(v_float32x8)
        #if CV_SIMD_64F
        OPENCV_HAL_WRAP_CMP(v_float64x4)
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@ -69,7 +69,7 @@
 #define DEFINE_SIMD_F32(fun, ...) \
    DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)

-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    #define DEFINE_SIMD_F64(fun, ...) \
        DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
 #else
@ -104,7 +104,7 @@ namespace cv { namespace hal {

 #ifdef ARITHM_DEFINITIONS_ONLY

-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 typedef int v_float64; // dummy
 #endif

@ -266,7 +266,7 @@ struct op_absdiff
 template<>
 struct op_absdiff<schar, v_int8>
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_int8 r(const v_int8& a, const v_int8& b)
    { return v_absdiffs(a, b); }
 #endif
@ -276,7 +276,7 @@ struct op_absdiff<schar, v_int8>
 template<>
 struct op_absdiff<short, v_int16>
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_int16 r(const v_int16& a, const v_int16& b)
    { return v_absdiffs(a, b); }
 #endif
@ -286,7 +286,7 @@ struct op_absdiff<short, v_int16>
 template<>
 struct op_absdiff<int, v_int32>
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_int32 r(const v_int32& a, const v_int32& b)
    { return v_reinterpret_as_s32(v_absdiff(a, b)); }
 #endif
@ -331,7 +331,7 @@ struct op_not

 //////////////////////////// Loaders /////////////////////////////////

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)

 template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 struct bin_loader
@ -396,7 +396,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
 {
    typedef OP<T1, Tvec> op;
-#if CV_SIMD  || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef bin_loader<OP, T1, Tvec> ldr;
    const int wide_step = VTraits<Tvec>::vlanes();
    #if !CV_NEON && CV_SIMD_WIDTH == 16
@ -414,7 +414,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
    {
        int x = 0;

-    #if CV_SIMD || CV_SIMD_SCALABLE
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
        #if !CV_NEON && !CV_MSA
        if (is_aligned(src1, src2, dst))
        {
@ -464,7 +464,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
    vx_cleanup();
 }

-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
 {
@ -496,7 +496,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t
 #define BIN_LOOP64F bin_loop_nosimd
 #else
 #define BIN_LOOP64F bin_loop
-#endif //!CV_SIMD_64F
+#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)

 #endif // ARITHM_DEFINITIONS_ONLY

@ -621,7 +621,7 @@ struct op_cmpne

 //////////////////////////// Loaders /////////////////////////////////

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // todo: add support for RW alignment & stream
 template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 struct cmp_loader_n
@ -701,7 +701,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
 {
    typedef OP<T1, Tvec> op;
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
    const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
 #endif // CV_SIMD
@ -713,7 +713,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
    {
        int x = 0;

-    #if CV_SIMD || CV_SIMD_SCALABLE
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
        for (; x <= width - wide_step; x += wide_step)
        {
            ldr::l(src1 + x, src2 + x, dst + x);
@ -768,7 +768,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
    }
 }

-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template< template<typename T1, typename Tvec> class OP, typename T1>
 static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
 {
@ -822,7 +822,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2
        break;
    }
 }
-#endif // !CV_SIMD_64F
+#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)

 #endif // ARITHM_DEFINITIONS_ONLY

@ -880,7 +880,7 @@ DEFINE_SIMD_ALL(cmp)

 //////////////////////////// Loaders ///////////////////////////////

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // todo: add support for RW alignment & stream
 template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
 struct scalar_loader_n
@ -1099,16 +1099,16 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
 };
 #endif // CV_SIMD

-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<template<typename T1, typename T2, typename Tvec> class OP>
 struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
 {
    typedef OP<int, float, v_int32> op;
    typedef OP<double, double, v_float64> op64;
-    enum {step = v_int32::nlanes};

    static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
    {
+        const int step = VTraits<v_int32>::vlanes();
        v_int32 v_src1 = vx_load(src1);
        v_int32 v_src2 = vx_load(src2);
        v_int32 v_src1s = vx_load(src1 + step);
@ -1125,6 +1125,7 @@ struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
    }
    static inline void l(const int* src1, const double* scalar, int* dst)
    {
+        const int step = VTraits<v_int32>::vlanes();
        v_int32 v_src1 = vx_load(src1);
        v_int32 v_src1s = vx_load(src1 + step);

@ -1169,10 +1170,10 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
 {
    typedef OP<float, float, v_float32> op;
    typedef OP<double, double, v_float64> op64;
-    enum {step = v_float32::nlanes};

    static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
    {
+        const int step = VTraits<v_float32>::vlanes();
        v_float32 v_src1 = vx_load(src1);
        v_float32 v_src2 = vx_load(src2);
        v_float32 v_src1s = vx_load(src1 + step);
@ -1186,6 +1187,7 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
    }
    static inline void l(const float* src1, const double* scalar, float* dst)
    {
+        const int step = VTraits<v_float32>::vlanes();
        v_float32 v_src1 = vx_load(src1);
        v_float32 v_src1s = vx_load(src1 + step);

@ -1226,10 +1228,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP>
 struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
 {
    typedef OP<double, double, v_float64> op;
-    enum {step = v_float64::nlanes};

    static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
    {
+        const int step = VTraits<v_float64>::vlanes();
        v_float64 v_src1 = vx_load(src1);
        v_float64 v_src2 = vx_load(src2);
        v_float64 v_src1s = vx_load(src1 + step);
@ -1243,6 +1245,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
    }
    static inline void l(const double* src1, const double* scalar, double* dst)
    {
+        const int step = VTraits<v_float64>::vlanes();
        v_float64 v_src1 = vx_load(src1);
        v_float64 v_src1s = vx_load(src1 + step);

@ -1253,7 +1256,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
        v_store(dst + step, r1);
    }
 };
-#endif // CV_SIMD_64F
+#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)

 //////////////////////////// Loops /////////////////////////////////

@ -1263,7 +1266,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
                 T1* dst, size_t step, int width, int height, const T2* scalar)
 {
    typedef OP<T1, T2, Tvec> op;
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
    const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
                          sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1277,7 +1280,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
    {
        int x = 0;

-    #if CV_SIMD || CV_SIMD_SCALABLE
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
        for (; x <= width - wide_step; x += wide_step)
        {
            ldr::l(src1 + x, src2 + x, scalar, dst + x);
@ -1309,7 +1312,7 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
 static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
 {
    typedef OP<T1, T2, Tvec> op;
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
    const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
                          sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1322,7 +1325,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
    {
        int x = 0;

-    #if CV_SIMD || CV_SIMD_SCALABLE
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
        for (; x <= width - wide_step; x += wide_step)
        {
            ldr::l(src1 + x, scalar, dst + x);
@ -1349,7 +1352,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
    vx_cleanup();
 }

-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 // dual source
 template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
 static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
@ -1413,7 +1416,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste
 #define SCALAR_LOOP64F scalar_loop_nosimd
 #else
 #define SCALAR_LOOP64F scalar_loop
-#endif // !CV_SIMD_64F
+#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)

 #endif // ARITHM_DEFINITIONS_ONLY

@ -1437,7 +1440,7 @@ struct op_mul
 template<typename T1, typename T2, typename Tvec>
 struct op_mul_scale
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
    {
        const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1453,7 +1456,7 @@ struct op_mul_scale
 template<>
 struct op_mul_scale<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
    {
        const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1578,7 +1581,7 @@ struct op_div_f
 template<typename T1, typename T2, typename Tvec>
 struct op_div_scale
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
    {
        const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1600,7 +1603,7 @@ struct op_div_scale
 template<>
 struct op_div_scale<float, float, v_float32>
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
    {
        const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1614,7 +1617,7 @@ struct op_div_scale<float, float, v_float32>
 template<>
 struct op_div_scale<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
    {
        const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1686,7 +1689,7 @@ DEFINE_SIMD_ALL(div, div_loop)
 template<typename T1, typename T2, typename Tvec>
 struct op_add_scale
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
    {
        const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1702,7 +1705,7 @@ struct op_add_scale
 template<>
 struct op_add_scale<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
    {
        const v_float64 v_alpha = vx_setall_f64(*scalar);
@ -1719,7 +1722,7 @@ struct op_add_scale<double, double, v_float64>
 template<typename T1, typename T2, typename Tvec>
 struct op_add_weighted
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
    {
        const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1737,7 +1740,7 @@ struct op_add_weighted
 template<>
 struct op_add_weighted<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
    {
        const v_float64 v_alpha = vx_setall_f64(scalars[0]);
@ -1836,7 +1839,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
 template<typename T1, typename T2, typename Tvec>
 struct op_recip
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const T2* scalar)
    {
        const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1858,7 +1861,7 @@ struct op_recip
 template<>
 struct op_recip<float, float, v_float32>
 {
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    static inline v_float32 r(const v_float32& a, const float* scalar)
    {
        const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1872,7 +1875,7 @@ struct op_recip<float, float, v_float32>
 template<>
 struct op_recip<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    static inline v_float64 r(const v_float64& a, const double* scalar)
    {
        const v_float64 v_scalar = vx_setall_f64(*scalar);
--- a/modules/core/src/has_non_zero.simd.hpp
+++ b/modules/core/src/has_non_zero.simd.hpp
@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len )
 {
    bool res = false;
    const uchar* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef v_uint8 v_type;
    const v_type v_zero = vx_setzero_u8();
    constexpr const int unrollCount = 2;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
    int len0 = len & -step;
    const uchar* srcSimdEnd = src+len0;

@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len )
    while(!res && countSIMD--)
    {
        v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v1 = vx_load(src);
-        src += v_type::nlanes;
-        res = v_check_any(((v0 | v1) != v_zero));
+        src += VTraits<v_type>::vlanes();
+        res = v_check_any((v_ne(v_or(v0, v1), v_zero)));
    }

    v_cleanup();
@ -114,11 +114,11 @@ static bool hasNonZero16u( const ushort* src, size_t len )
 {
    bool res = false;
    const ushort* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef v_uint16 v_type;
    const v_type v_zero = vx_setzero_u16();
    constexpr const int unrollCount = 4;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
    int len0 = len & -step;
    const ushort* srcSimdEnd = src+len0;

@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len )
    while(!res && countSIMD--)
    {
        v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v3 = vx_load(src);
-        src += v_type::nlanes;
-        v0 |= v1;
-        v2 |= v3;
-        res = v_check_any(((v0 | v2) != v_zero));
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        res = v_check_any((v_ne(v_or(v0, v2), v_zero)));
    }

    v_cleanup();
@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len )
 {
    bool res = false;
    const int* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef v_int32 v_type;
    const v_type v_zero = vx_setzero_s32();
    constexpr const int unrollCount = 8;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
    int len0 = len & -step;
    const int* srcSimdEnd = src+len0;

@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len )
    while(!res && countSIMD--)
    {
        v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v3 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v4 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v5 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v6 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v7 = vx_load(src);
-        src += v_type::nlanes;
-        v0 |= v1;
-        v2 |= v3;
-        v4 |= v5;
-        v6 |= v7;
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        v4 = v_or(v4, v5);
+        v6 = v_or(v6, v7);

-        v0 |= v2;
-        v4 |= v6;
-        res = v_check_any(((v0 | v4) != v_zero));
+        v0 = v_or(v0, v2);
+        v4 = v_or(v4, v6);
+        res = v_check_any((v_ne(v_or(v0, v4), v_zero)));
    }

    v_cleanup();
@ -193,11 +193,11 @@ static bool hasNonZero32f( const float* src, size_t len )
 {
    bool res = false;
    const float* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef v_float32 v_type;
    const v_type v_zero = vx_setzero_f32();
    constexpr const int unrollCount = 8;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
    int len0 = len & -step;
    const float* srcSimdEnd = src+len0;

@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len )
    while(!res && countSIMD--)
    {
        v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v3 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v4 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v5 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v6 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v7 = vx_load(src);
-        src += v_type::nlanes;
-        v0 |= v1;
-        v2 |= v3;
-        v4 |= v5;
-        v6 |= v7;
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        v4 = v_or(v4, v5);
+        v6 = v_or(v6, v7);

-        v0 |= v2;
-        v4 |= v6;
+        v0 = v_or(v0, v2);
+        v4 = v_or(v4, v6);
        //res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
-        res = !v_check_all(((v0 | v4) == v_zero));
+        res = !v_check_all((v_eq(v_or(v0, v4), v_zero)));
    }

    v_cleanup();
@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len )
 {
    bool res = false;
    const double* srcEnd = src+len;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    typedef v_float64 v_type;
    const v_type v_zero = vx_setzero_f64();
    constexpr const int unrollCount = 16;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
    int len0 = len & -step;
    const double* srcSimdEnd = src+len0;

@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len )
    while(!res && countSIMD--)
    {
        v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v3 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v4 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v5 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v6 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v7 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v8 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v9 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v10 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v11 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v12 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v13 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v14 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
        v_type v15 = vx_load(src);
-        src += v_type::nlanes;
-        v0  |= v1;
-        v2  |= v3;
-        v4  |= v5;
-        v6  |= v7;
-        v8  |= v9;
-        v10 |= v11;
-        v12 |= v13;
-        v14 |= v15;
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        v4 = v_or(v4, v5);
+        v6 = v_or(v6, v7);
+        v8 = v_or(v8, v9);
+        v10 = v_or(v10, v11);
+        v12 = v_or(v12, v13);
+        v14 = v_or(v14, v15);

-        v0  |= v2;
-        v4  |= v6;
-        v8  |= v10;
-        v12 |= v14;
+        v0 = v_or(v0, v2);
+        v4 = v_or(v4, v6);
+        v8 = v_or(v8, v10);
+        v12 = v_or(v12, v14);

-        v0  |= v4;
-        v8  |= v12;
+        v0 = v_or(v0, v4);
+        v8 = v_or(v8, v12);
        //res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
-        res = !v_check_all(((v0 | v8) == v_zero));
+        res = !v_check_all((v_eq(v_or(v0, v8), v_zero)));
    }

    v_cleanup();
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@ -276,7 +276,7 @@ template<typename T> struct VBLAS
    int givens(T*, T*, int, T, T) const { return 0; }
 };

-#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE (blocked by a GCC 13 related issue)
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
    if( n < 2*VTraits<v_float32>::vlanes() )
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@ -2549,6 +2549,7 @@ double dotProd_16s(const short* src1, const short* src2, int len)
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
 #if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
+// Test fails on RVV (QEMU) when enabled: difference too big (1.20209e-08 > 1.11022e-12)
    double r = .0;
    int i = 0;
    const int step  = VTraits<v_int32>::vlanes();
--- a/modules/core/src/mean.simd.hpp
+++ b/modules/core/src/mean.simd.hpp
@ -24,7 +24,7 @@ struct SumSqr_SIMD
    }
 };

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)

 template <>
 struct SumSqr_SIMD<uchar, int, int>
--- a/modules/dnn/src/int8layers/convolution_layer.cpp
+++ b/modules/dnn/src/int8layers/convolution_layer.cpp
@ -19,7 +19,7 @@ namespace cv
 namespace dnn
 {

-#if CV_SIMD
+#if CV_SIMD128
 static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
                                    v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
 {
@ -1015,7 +1015,7 @@ public:
                                        outptr[0] = std::min(std::max(out1, -128), 127);
                                        out_j = 1;
                                    }
-                                #if CV_SIMD
+                                #if CV_SIMD128
                                    if( stride_w == 1 )
                                    {
                                        const int out_delta = 16;
--- a/modules/dnn/src/int8layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp
@ -305,7 +305,7 @@ public:
            #endif
                {
                    int i = 0;
-            #if CV_SIMD
+            #if CV_SIMD128
                    for( ; i  <= nw - 4; i += 4, wptr += 4*wstep )
                    {
                        v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp
@ -475,9 +475,9 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
 void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();

    if (!mask)
    {
@ -493,8 +493,8 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
        #else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
-            v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
-            v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));
+            v_store(dst + x, v_add(vx_load(dst + x), vx_load(src + x)));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), vx_load(src + x + step)));
        }
        #endif // CV_AVX && !CV_AVX2
    }
@ -508,11 +508,11 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
                v_uint16 v_masku16 = vx_load_expand(mask + x);
                v_uint32 v_masku320, v_masku321;
                v_expand(v_masku16, v_masku320, v_masku321);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0))));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0))));

-                v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(vx_load(src + x), v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(vx_load(src + x + step), v_mask1)));
            }
        }
        else if (cn == 3)
@ -522,25 +522,25 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
                v_uint16 v_masku16 = vx_load_expand(mask + x);
                v_uint32 v_masku320, v_masku321;
                v_expand(v_masku16, v_masku320, v_masku321);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0))));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0))));

                v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);

                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
            }
        }
    }
@ -862,9 +862,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
 void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float32::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float32>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();

    if (!mask)
    {
@ -889,8 +889,8 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
            v_float64 v_src0 = v_cvt_f64(v_src);
            v_float64 v_src1 = v_cvt_f64_high(v_src);

-            v_store(dst + x, vx_load(dst + x) + v_src0);
-            v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
+            v_store(dst + x, v_add(vx_load(dst + x), v_src0));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
        }
        #endif // CV_AVX && !CV_AVX2
    }
@ -904,15 +904,15 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float32 v_src = vx_load(src + x);
-                v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;
-                v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
+                v_float64 v_src0 = v_and(v_cvt_f64(v_src), v_mask0);
+                v_float64 v_src1 = v_and(v_cvt_f64_high(v_src), v_mask1);

-                v_store(dst + x, vx_load(dst + x) + v_src0);
-                v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
+                v_store(dst + x, v_add(vx_load(dst + x), v_src0));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
            }
        }
        else if (cn == 3)
@ -922,24 +922,24 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float32 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;
-                v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
-                v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;
-                v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
-                v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0;
-                v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
+                v_float64 v_src00 = v_and(v_cvt_f64(v_src0), v_mask0);
+                v_float64 v_src01 = v_and(v_cvt_f64_high(v_src0), v_mask1);
+                v_float64 v_src10 = v_and(v_cvt_f64(v_src1), v_mask0);
+                v_float64 v_src11 = v_and(v_cvt_f64_high(v_src1), v_mask1);
+                v_float64 v_src20 = v_and(v_cvt_f64(v_src2), v_mask0);
+                v_float64 v_src21 = v_and(v_cvt_f64_high(v_src2), v_mask1);

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
            }
        }
    }
@ -950,9 +950,9 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
 void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();

    if (!mask)
    {
@ -971,8 +971,8 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
            v_float64 v_src0 = vx_load(src + x);
            v_float64 v_src1 = vx_load(src + x + step);

-            v_store(dst + x, vx_load(dst + x) + v_src0);
-            v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
+            v_store(dst + x, v_add(vx_load(dst + x), v_src0));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
        }
        #endif // CV_AVX && !CV_AVX2
    }
@ -986,14 +986,14 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float64 v_src0 = vx_load(src + x);
                v_float64 v_src1 = vx_load(src + x + step);

-                v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(v_src0, v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_src1, v_mask1)));
            }
        }
        else if (cn == 3)
@ -1003,25 +1003,25 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);

                v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
            }
        }
    }
@ -1256,9 +1256,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
 void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();

    if (!mask)
    {
@ -1293,12 +1293,12 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
                v_uint16 v_mask16 = vx_load_expand(mask + x);
                v_uint32 v_mask_0, v_mask_1;
                v_expand(v_mask16, v_mask_0, v_mask_1);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0)));
                v_float32 v_src0 = vx_load(src + x);
                v_float32 v_src1 = vx_load(src + x + step);
-                v_src0 = v_src0 & v_mask0;
-                v_src1 = v_src1 & v_mask1;
+                v_src0 = v_and(v_src0, v_mask0);
+                v_src1 = v_and(v_src1, v_mask1);

                v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
                v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
@ -1311,18 +1311,18 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
                v_uint16 v_mask16 = vx_load_expand(mask + x);
                v_uint32 v_mask_0, v_mask_1;
                v_expand(v_mask16, v_mask_0, v_mask_1);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0)));

                v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);

                v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@ -1625,9 +1625,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in
 void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float32::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float32>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();

    if (!mask)
    {
@ -1667,9 +1667,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask = vx_load_expand_q(mask + x);;
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                v_float32 v_src = vx_load(src + x);
-                v_src = v_src & v_reinterpret_as_f32(v_mask);
+                v_src = v_and(v_src, v_reinterpret_as_f32(v_mask));
                v_float64 v_src0 = v_cvt_f64(v_src);
                v_float64 v_src1 = v_cvt_f64_high(v_src);

@ -1682,13 +1682,13 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask = vx_load_expand_q(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));

                v_float32 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);
-                v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);
-                v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);
+                v_src0 = v_and(v_src0, v_reinterpret_as_f32(v_mask));
+                v_src1 = v_and(v_src1, v_reinterpret_as_f32(v_mask));
+                v_src2 = v_and(v_src2, v_reinterpret_as_f32(v_mask));

                v_float64 v_src00 = v_cvt_f64(v_src0);
                v_float64 v_src01 = v_cvt_f64_high(v_src0);
@ -1720,9 +1720,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
 void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();

    if (!mask)
    {
@ -1756,12 +1756,12 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
                v_float64 v_src0 = vx_load(src + x);
                v_float64 v_src1 = vx_load(src + x + step);
-                v_src0 = v_src0 & v_mask0;
-                v_src1 = v_src1 & v_mask1;
+                v_src0 = v_and(v_src0, v_mask0);
+                v_src1 = v_and(v_src1, v_mask1);
                v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
                v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
            }
@ -1773,18 +1773,18 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@ -2035,9 +2035,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
 void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();

    if (!mask)
    {
@ -2069,11 +2069,11 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
            {
                v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
                v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0)));

-                v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(vx_load(src1 + x), vx_load(src2 + x)), v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(vx_load(src1 + x + step), vx_load(src2 + x + step)), v_mask1)));
            }
        }
        else if (cn == 3)
@ -2082,8 +2082,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
            {
                v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
                v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0)));

                v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@ -2096,8 +2096,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

-                v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_and(v_mul(v_1src00, v_2src00), v_mask0)), v_add(v_dst10, v_and(v_mul(v_1src10, v_2src10), v_mask0)), v_add(v_dst20, v_and(v_mul(v_1src20, v_2src20), v_mask0)));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_and(v_mul(v_1src01, v_2src01), v_mask1)), v_add(v_dst11, v_and(v_mul(v_1src11, v_2src11), v_mask1)), v_add(v_dst21, v_and(v_mul(v_1src21, v_2src21), v_mask1)));
            }
        }
    }
@ -2398,9 +2398,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
 void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float32::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float32>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();

    if (!mask)
    {
@ -2447,11 +2447,11 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask = vx_load_expand_q(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                v_float32 v_1src = vx_load(src1 + x);
                v_float32 v_2src = vx_load(src2 + x);
-                v_1src = v_1src & v_reinterpret_as_f32(v_mask);
-                v_2src = v_2src & v_reinterpret_as_f32(v_mask);
+                v_1src = v_and(v_1src, v_reinterpret_as_f32(v_mask));
+                v_2src = v_and(v_2src, v_reinterpret_as_f32(v_mask));

                v_float64 v_1src0 = v_cvt_f64(v_1src);
                v_float64 v_1src1 = v_cvt_f64_high(v_1src);
@ -2467,16 +2467,16 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask = vx_load_expand_q(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask);
-                v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask);
-                v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask);
-                v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask);
-                v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask);
-                v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask);
+                v_1src0 = v_and(v_1src0, v_reinterpret_as_f32(v_mask));
+                v_1src1 = v_and(v_1src1, v_reinterpret_as_f32(v_mask));
+                v_1src2 = v_and(v_1src2, v_reinterpret_as_f32(v_mask));
+                v_2src0 = v_and(v_2src0, v_reinterpret_as_f32(v_mask));
+                v_2src1 = v_and(v_2src1, v_reinterpret_as_f32(v_mask));
+                v_2src2 = v_and(v_2src2, v_reinterpret_as_f32(v_mask));

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@ -2501,9 +2501,9 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
 void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
 {
    int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();

    if (!mask)
    {
@ -2542,16 +2542,16 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float64 v_src00 = vx_load(src1 + x);
                v_float64 v_src01 = vx_load(src1 + x + step);
                v_float64 v_src10 = vx_load(src2 + x);
                v_float64 v_src11 = vx_load(src2 + x + step);

-                v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(v_src00, v_src10), v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(v_src01, v_src11), v_mask1)));
            }
        }
        else if (cn == 3)
@ -2561,8 +2561,8 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));

                v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@ -2570,19 +2570,19 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
-                v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
-                v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
-                v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
-                v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
-                v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
-                v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;
+                v_float64 v_src00 = v_mul(v_and(v_1src00, v_mask0), v_2src00);
+                v_float64 v_src01 = v_mul(v_and(v_1src01, v_mask1), v_2src01);
+                v_float64 v_src10 = v_mul(v_and(v_1src10, v_mask0), v_2src10);
+                v_float64 v_src11 = v_mul(v_and(v_1src11, v_mask1), v_2src11);
+                v_float64 v_src20 = v_mul(v_and(v_1src20, v_mask0), v_2src20);
+                v_float64 v_src21 = v_mul(v_and(v_1src21, v_mask1), v_2src21);

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
            }
        }
    }
--- a/modules/imgproc/src/color_hsv.simd.hpp
+++ b/modules/imgproc/src/color_hsv.simd.hpp
@ -98,7 +98,7 @@ struct RGB2HSV_b

        int i = 0;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_uint8>::vlanes();
        for ( ; i <= n - vsize;
              i += vsize, src += scn*vsize, dst += 3*vsize)
@ -274,7 +274,7 @@ struct RGB2HSV_f
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
    { }

-    #if CV_SIMD || CV_SIMD_SCALABLE
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
    inline void process(const v_float32& v_r, const v_float32& v_g, const v_float32& v_b,
                        v_float32& v_h, v_float32& v_s, v_float32& v_v,
                        float hscale) const
@ -308,7 +308,7 @@ struct RGB2HSV_f
        float hscale = hrange*(1.f/360.f);
        n *= 3;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_float32>::vlanes();
        for ( ; i <= n - 3*vsize; i += 3*vsize, src += scn * vsize)
        {
@ -368,7 +368,7 @@ struct RGB2HSV_f
 };


-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 inline void HSV2RGB_simd(const v_float32& h, const v_float32& s, const v_float32& v,
                         v_float32& b, v_float32& g, v_float32& r, float hscale)
 {
@ -473,7 +473,7 @@ struct HSV2RGB_f
        float hs = hscale;
        n *= 3;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_float32>::vlanes();
        v_float32 valpha = vx_setall_f32(alpha);
        for (; i <= n - vsize*3; i += vsize*3, dst += dcn * vsize)
@ -530,7 +530,7 @@ struct HSV2RGB_b
        int j = 0, dcn = dstcn;
        uchar alpha = ColorChannel<uchar>::max();

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_float32>::vlanes();

        for (j = 0; j <= (n - vsize*4) * 3; j += 3 * 4 * vsize, dst += dcn * 4 * vsize)
@ -679,7 +679,7 @@ struct RGB2HLS_f
    {
    }

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    inline void process(const v_float32& r, const v_float32& g, const v_float32& b,
                        const v_float32& vhscale,
                        v_float32& h, v_float32& l, v_float32& s) const
@ -718,7 +718,7 @@ struct RGB2HLS_f

        int i = 0, bidx = blueIdx, scn = srccn;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vsize = VTraits<v_float32>::vlanes();
        v_float32 vhscale = vx_setall_f32(hscale);

@ -802,13 +802,13 @@ struct RGB2HLS_b

        int scn = srccn;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
 #else
        float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
 #endif

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        static const int fsize = VTraits<v_float32>::vlanes();
        //TODO: fix that when v_interleave is available
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3];
@ -823,7 +823,7 @@ struct RGB2HLS_b
        {
            int dn = std::min(n - i, (int)BLOCK_SIZE);

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
            v_float32 v255inv = vx_setall_f32(1.f/255.f);
            if (scn == 3)
            {
@ -902,7 +902,7 @@ struct RGB2HLS_b
            cvt(buf, buf, dn);

            int j = 0;
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
            for( ; j <= dn*3 - fsize*3*4; j += fsize*3*4)
            {
                v_float32 f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11;
@ -973,7 +973,7 @@ struct HLS2RGB_f
    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange)
    { }

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
    inline void process(const v_float32& h, const v_float32& l, const v_float32& s,
                        v_float32& b, v_float32& g, v_float32& r) const
    {
@ -1016,7 +1016,7 @@ struct HLS2RGB_f
        int i = 0, bidx = blueIdx, dcn = dstcn;
        float alpha = ColorChannel<float>::max();

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        static const int vsize = VTraits<v_float32>::vlanes();
        for (; i <= n - vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
        {
@ -1099,13 +1099,13 @@ struct HLS2RGB_b
        int i, j, dcn = dstcn;
        uchar alpha = ColorChannel<uchar>::max();

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
 #else
        float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
 #endif

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
        static const int fsize = VTraits<v_float32>::vlanes();
        //TODO: fix that when v_interleave is available
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3];
@ -1122,7 +1122,7 @@ struct HLS2RGB_b
            int dn = std::min(n - i, (int)BLOCK_SIZE);
            j = 0;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
            for( ; j <= dn*3 - 3*4*fsize; j += 3*4*fsize)
            {
                // 3x uchar -> 3*4 float
@ -1179,7 +1179,7 @@ struct HLS2RGB_b
            }
            cvt(buf, buf, dn);

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
            v_float32 v255 = vx_setall_f32(255.f);
            if(dcn == 3)
            {
--- a/modules/imgproc/src/median_blur.simd.hpp
+++ b/modules/imgproc/src/median_blur.simd.hpp
@ -548,7 +548,7 @@ struct MinMax32f
    }
 };

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)

 struct MinMaxVec8u
 {
@ -688,7 +688,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
                if( limit == size.width )
                    break;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
 #else
                int nlanes = 1;
@ -793,7 +793,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
                if( limit == size.width )
                    break;

-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
 #else
                int nlanes = 1;