From 419060da846cb68139490f498a101994da033f92 Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Fri, 22 Sep 2023 12:14:43 +0800 Subject: [PATCH] Rewrite fluid related part. --- .../gapi/src/backends/fluid/gfluidcore.cpp | 60 ++- .../fluid/gfluidcore_func.dispatch.cpp | 2 +- .../src/backends/fluid/gfluidcore_func.hpp | 2 +- .../backends/fluid/gfluidcore_func.simd.hpp | 202 ++++---- .../fluid/gfluidimgproc_func.simd.hpp | 459 +++++++++--------- 5 files changed, 351 insertions(+), 374 deletions(-) diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index c2686c7bd3..50615b2652 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -13,7 +13,7 @@ #include #include -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) #include "gfluidcore_func.hpp" #endif @@ -113,7 +113,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1) // Fluid kernels: addWeighted // //--------------------------- -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in) { return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in))); @@ -150,8 +150,8 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[], ((std::is_same::value) && (std::is_same::value)), "This templated overload is only for short and ushort type combinations."); - constexpr int nlanes = (std::is_same::value) ? static_cast(v_uint16::nlanes) : - static_cast(v_int16::nlanes); + const int nlanes = (std::is_same::value) ? static_cast(VTraits::vlanes()) : + static_cast(VTraits::vlanes()); if (length < nlanes) return 0; @@ -189,7 +189,7 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[], const float _alpha, const float _beta, const float _gamma, int length) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -298,7 +298,7 @@ GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false) enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) CV_ALWAYS_INLINE void absdiff_store(short out[], const v_int16& a, const v_int16& b, int x) { vx_store(&out[x], v_absdiffs(a, b)); @@ -322,7 +322,7 @@ CV_ALWAYS_INLINE void absdiff_store(float out[], const v_float32& a, const v_flo template CV_ALWAYS_INLINE int absdiff_impl(const T in1[], const T in2[], T out[], int length) { - constexpr int nlanes = static_cast(VT::nlanes); + const int nlanes = static_cast(VTraits::vlanes()); if (length < nlanes) return 0; @@ -403,7 +403,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2 { case ARITHM_ADD: { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = add_simd(in1, in2, out, length); #endif for (; x < length; ++x) @@ -412,7 +412,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2 } case ARITHM_SUBTRACT: { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = sub_simd(in1, in2, out, length); #endif for (; x < length; ++x) @@ -421,7 +421,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2 } case ARITHM_MULTIPLY: { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = mul_simd(in1, in2, out, length, scale); #endif for (; x < length; ++x) @@ -430,7 +430,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2 } case ARITHM_DIVIDE: { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = div_simd(in1, in2, out, 
length, scale); #endif for (; x < length; ++x) @@ -569,7 +569,7 @@ static void run_absdiff(Buffer &dst, const View &src1, const View &src2) int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = absdiff_simd(in1, in2, out, length); #endif for (; x < length; ++x) @@ -660,7 +660,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca case ARITHM_ADD: { int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = addc_simd(in, scalar, out, length, chan); #endif for (; w < length; ++w) @@ -671,7 +671,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca case ARITHM_SUBTRACT: { int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = subc_simd(in, scalar, out, length, chan); #endif for (; w < length; ++w) @@ -681,7 +681,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca case ARITHM_MULTIPLY: { int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = mulc_simd(in, scalar, out, length, chan, scale); #endif for (; w < width; ++w) @@ -709,7 +709,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc case ARITHM_SUBTRACT: { int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = subrc_simd(scalar, in, out, length, chan); #endif for (; w < length; ++w) @@ -721,7 +721,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc case ARITHM_DIVIDE: { int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = divrc_simd(scalar, in, out, length, chan, scale); #endif for (; w < length; ++w) @@ -744,7 +744,7 @@ CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen) CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector. constexpr int maxNlanes = 16; @@ -783,7 +783,7 @@ CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float sca const int length = width * chan; int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = absdiffc_simd(in, scalar, out, length, chan); #endif @@ -1076,7 +1076,7 @@ CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch, const int length = width * chan; int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int scratch_length = scratch.length(); int indicator_offset = scratch_length - 1; const int set_mask_indicator = static_cast(*(scratch.OutLine() + (indicator_offset))); @@ -1143,7 +1143,7 @@ GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true) static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // 512 bits / 32 bits = 16 elements of float32 a AVX512 SIMD vector can contain. 
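// NOTE (added for exposition; not in the upstream patch): unlike the
// ::nlanes -> VTraits<...>::vlanes() rewrites elsewhere in this PR, maxNlanes
// below stays a compile-time constant; it only sizes the scratch buffer for
// the broadcast scalar (16 floats, i.e. one 512-bit vector).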
constexpr int maxNlanes = 16; @@ -1565,7 +1565,7 @@ template CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = convertto_simd(in, out, length); #endif // tail of SIMD cycle @@ -1580,7 +1580,7 @@ CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha, const int length) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = convertto_scaled_simd(in, out, alpha, beta, length); #endif @@ -2096,9 +2096,7 @@ static void run_inrange3(uchar out[], const uchar in[], int width, v_load_deinterleave(&in[3*w], i0, i1, i2); v_uint8x16 o; - o = (i0 >= v_setall_u8(lower[0])) & (i0 <= v_setall_u8(upper[0])) & - (i1 >= v_setall_u8(lower[1])) & (i1 <= v_setall_u8(upper[1])) & - (i2 >= v_setall_u8(lower[2])) & (i2 <= v_setall_u8(upper[2])); + o = v_and(v_and(v_and(v_and(v_and(v_ge(i0, v_setall_u8(lower[0])), v_le(i0, v_setall_u8(upper[0]))), v_ge(i1, v_setall_u8(lower[1]))), v_le(i1, v_setall_u8(upper[1]))), v_ge(i2, v_setall_u8(lower[2]))), v_le(i2, v_setall_u8(upper[2]))); v_store(&out[w], o); } @@ -2226,7 +2224,7 @@ static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uc v_load_deinterleave(&in2[3*w], a2, b2, c2); mask = v_load(&in3[w]); - mask = mask != v_setzero_u8(); + mask = v_ne(mask, v_setzero_u8()); a = v_select(mask, a1, a2); b = v_select(mask, b1, b2); @@ -2332,7 +2330,7 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false) int width = src.length(); int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = split3_simd(in, out1, out2, out3, width); #endif @@ -2364,7 +2362,7 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false) int width = src.length(); int w = 0; - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) w = split4_simd(in, out1, out2, out3, out4, width); #endif @@ -2389,7 +2387,7 @@ CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2 int width = dst.length(); int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = merge3_simd(in1, in2, in3, out, width); #endif @@ -2442,7 +2440,7 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false) int w = 0; // cycle counter - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) w = merge4_simd(in1, in2, in3, in4, out, width); #endif diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index 05d3417024..a0ef4b1479 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -7,7 +7,7 @@ #if !defined(GAPI_STANDALONE) #include -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) #include "gfluidcore_func.hpp" #include "gfluidcore_func.simd.hpp" diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 0511f4e095..0186ea020e 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -6,7 +6,7 @@ #pragma once -#if !defined(GAPI_STANDALONE) && CV_SIMD +#if !defined(GAPI_STANDALONE) && (CV_SIMD || CV_SIMD_SCALABLE) #include diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index aed0ee97d8..6191e9ab05 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -402,22 +402,22 @@ 
CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in) CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale) { - return (scale*a * b); + return (v_mul(v_mul(scale, a), b)); } CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&) { - return a * b; + return v_mul(a, b); } CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale) { - return (a*scale/div); + return (v_div(v_mul(a, scale), div)); } CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&) { - return a / div; + return v_div(a, div); } CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2) @@ -433,13 +433,13 @@ CV_ALWAYS_INLINE void v_store_i16(ushort* dst, const v_int32& res1, const v_int3 CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero, const v_int32& res1, const v_int32& res2) { - vx_store(dst, v_select(div == v_zero, v_zero, v_pack(res1, res2))); + vx_store(dst, v_select(v_eq(div, v_zero), v_zero, v_pack(res1, res2))); } CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero, const v_int32& res1, const v_int32& res2) { - vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero), + vx_store(dst, v_select(v_reinterpret_as_u16(v_eq(div, v_zero)), v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2))); } @@ -451,7 +451,7 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, const v_float32& a3, const v_float32& a4, const uchar* in2x, uchar* outx, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x)); v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2])); @@ -466,8 +466,8 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)), sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale)); - v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); - v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); + v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2)); + v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4)); vx_store(outx, v_pack_u(res1, res2)); } @@ -480,7 +480,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, const v_float32& a3, const v_float32& a4, const SRC* in2x, uchar* outx, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x)); v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2])); @@ -495,8 +495,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)), sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale)); - v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); - v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); + v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2)); + v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4)); vx_store(outx, v_pack_u(res1, res2)); } @@ -507,7 +507,7 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a4, 
const float* in2x, uchar* outx, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 div1 = vg_load_f32(in2x); v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]); @@ -519,10 +519,10 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, v_float32 r3 = div_op(s_tag, a3, div3, v_scale); v_float32 r4 = div_op(s_tag, a4, div4, v_scale); - v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1); - v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2); - v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3); - v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4); + v_float32 sel1 = v_select((v_eq(div1, v_zero)), v_zero, r1); + v_float32 sel2 = v_select((v_eq(div2, v_zero)), v_zero, r2); + v_float32 sel3 = v_select((v_eq(div3, v_zero)), v_zero, r3); + v_float32 sel4 = v_select((v_eq(div4, v_zero)), v_zero, r4); v_int32 res1 = v_round(sel1); v_int32 res2 = v_round(sel2); @@ -536,7 +536,7 @@ template CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx, const v_float32& v_scale, const Vtype& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 a1 = vg_load_f32(in1x); v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]); @@ -595,7 +595,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, const float* in2x, DST* outx, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 fdiv1 = vg_load_f32(in2x); v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]); @@ -603,8 +603,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale); v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale); - v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1)); - v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2)); + v_int32 res1 = v_round(v_select((v_eq(fdiv1, v_zero)), v_zero, r1)); + v_int32 res2 = v_round(v_select((v_eq(fdiv2, v_zero)), v_zero, r2)); v_store_i16(outx, res1, res2); } @@ -616,7 +616,7 @@ typename std::enable_if::value || div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx, const v_float32& v_scale, const Vtype& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 a1 = vg_load_f32(in1x); v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]); @@ -648,12 +648,12 @@ template CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[], DST out[], const int length, float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; - const zero_vec_type_of_t v_zero = vx_setall::lane_type>(0); + const zero_vec_type_of_t v_zero = vx_setall >::lane_type>(0); v_float32 v_scale = vx_setall_f32(scale); int x = 0; @@ -724,7 +724,7 @@ typename std::enable_if<(std::is_same::value && std::is_same::value && std::is_same::value), int>::type mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -769,7 +769,7 @@ typename std::enable_if::value || std::is_same::value, int>::type mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale) { - 
constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -824,7 +824,7 @@ template CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[], const int length, double _scale) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -869,7 +869,7 @@ typename std::enable_if::value || std::is_same::value, int>::type mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -914,7 +914,7 @@ typename std::enable_if::value || std::is_same::value, int>::type mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -954,7 +954,7 @@ template CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[], const int length, double _scale) { - constexpr int nlanes = v_float32::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -1049,7 +1049,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_in const v_int32& c4, const v_int32& c5, const v_int32& c6) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); vx_store(outx, v_pack(c1, c2)); vx_store(&outx[nlanes], v_pack(c3, c4)); vx_store(&outx[2*nlanes], v_pack(c5, c6)); @@ -1060,7 +1060,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_in const v_int32& c4, const v_int32& c5, const v_int32& c6) { - constexpr int nlanes = v_uint16::nlanes; + const int nlanes = VTraits::vlanes(); vx_store(outx, v_pack_u(c1, c2)); vx_store(&outx[nlanes], v_pack_u(c3, c4)); vx_store(&outx[2*nlanes], v_pack_u(c5, c6)); @@ -1068,37 +1068,37 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_in CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc) { - return a + sc; + return v_add(a, sc); } CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc) { - return a - sc; + return v_sub(a, sc); } CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc) { - return sc - a; + return v_sub(sc, a); } CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc) { - return a * sc; + return v_mul(a, sc); } CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) { - return v_scale * a * v_scalar; + return v_mul(v_mul(v_scale, a), v_scalar); } CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc) { - return a / sc; + return v_div(a, sc); } CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) { - return a*v_scale / v_scalar; + return v_div(v_mul(a, v_scale), v_scalar); } CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc) @@ -1223,8 +1223,8 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[], const int length) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 
0; @@ -1263,7 +1263,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[], const float scalar[], DST out[], const int length) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -1489,8 +1489,8 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_c3(oper_tag op, const SRC in[], const int length, const float scale) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 0; @@ -1576,7 +1576,7 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_common(oper_tag op, const SRC in[ const float scalar[], DST out[], const int length, const float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -1675,10 +1675,10 @@ divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32& v_scalar, const v_float32& v_scale, const int length) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask = (v_scalar == v_zero); + v_float32 v_mask = (v_eq(v_scalar, v_zero)); int x = 0; for (;;) @@ -1709,10 +1709,10 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], uchar out[], const v_float32& v_scalar, const v_float32& v_scale, const int length) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask = (v_scalar == v_zero); + v_float32 v_mask = (v_eq(v_scalar, v_zero)); int x = 0; for (;;) @@ -1747,7 +1747,7 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], float out[], const v_float32& v_scalar, const v_float32& v_scale, const int length) { - constexpr int nlanes = v_float32::nlanes; + const int nlanes = VTraits::vlanes(); int x = 0; for (;;) { @@ -1774,7 +1774,7 @@ CV_ALWAYS_INLINE int divc_mask_simd_common(scale_tag_t tag, const SRC in[], const float scalar[], DST out[], const int length, const float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -1796,9 +1796,9 @@ divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1, const int nlanes, const int lanes) { v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask1 = (s1 == v_zero); - v_float32 v_mask2 = (s2 == v_zero); - v_float32 v_mask3 = (s3 == v_zero); + v_float32 v_mask1 = (v_eq(s1, v_zero)); + v_float32 v_mask2 = (v_eq(s2, v_zero)); + v_float32 v_mask3 = (v_eq(s3, v_zero)); int x = 0; for (;;) @@ -1839,9 +1839,9 @@ CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, uchar* const int length, const int nlanes, const int lanes) { v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask1 = (s1 == v_zero); - v_float32 v_mask2 = (s2 == v_zero); - v_float32 v_mask3 = (s3 == v_zero); + v_float32 v_mask1 = (v_eq(s1, v_zero)); + v_float32 v_mask2 = (v_eq(s2, v_zero)); + v_float32 v_mask3 = (v_eq(s3, v_zero)); int x = 0; for (;;) @@ -1917,8 +1917,8 @@ CV_ALWAYS_INLINE int divc_mask_simd_c3(scale_tag_t s_tag, const SRC in[], const int length, const float scale) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; 
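// NOTE (added for exposition; not in the upstream patch): nlanes and lanes
// lose constexpr throughout this PR because VTraits<...>::vlanes() is a
// run-time query -- under CV_SIMD_SCALABLE (e.g. RISC-V RVV) the hardware
// vector length, and hence the lane count, is not known at compile time.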
if (length < lanes) return 0; @@ -2084,7 +2084,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[], const float scalar[], DST out[], const int length, const float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -2092,7 +2092,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[], v_float32 v_scalar = vx_load(scalar); v_float32 v_scale = vx_setall_f32(scale); zero_vec_type_of_t v_zero = - vx_setall::lane_type>(0); + vx_setall>::lane_type>(0); int x = 0; for (;;) @@ -2121,7 +2121,7 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uc const v_uint8& v_zero) { v_uint8 div = vx_load(inx); - v_uint8 v_mask = (div == v_zero); + v_uint8 v_mask = (v_eq(div, v_zero)); v_uint16 div1 = v_expand_low(div); v_uint16 div2 = v_expand_high(div); @@ -2147,13 +2147,13 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx, const v_float32& s3, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load(inx)); v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2])); - v_int16 v_mask1 = (div1 == v_zero); - v_int16 v_mask2 = (div2 == v_zero); + v_int16 v_mask1 = (v_eq(div1, v_zero)); + v_int16 v_mask2 = (v_eq(div2, v_zero)); v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1)); v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1)); @@ -2175,17 +2175,17 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uc const v_float32& s3, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 fdiv1 = vg_load_f32(inx); v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]); v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]); v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]); - v_float32 v_mask1 = (fdiv1 == v_zero); - v_float32 v_mask2 = (fdiv2 == v_zero); - v_float32 v_mask3 = (fdiv3 == v_zero); - v_float32 v_mask4 = (fdiv4 == v_zero); + v_float32 v_mask1 = (v_eq(fdiv1, v_zero)); + v_float32 v_mask2 = (v_eq(fdiv2, v_zero)); + v_float32 v_mask3 = (v_eq(fdiv3, v_zero)); + v_float32 v_mask4 = (v_eq(fdiv4, v_zero)); vx_store(outx, v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))), @@ -2202,7 +2202,7 @@ CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar const int length, const int nlanes, const int lanes) { univ_zero_vec_type_of_t v_zero = - vx_setall::lane_type>(0); + vx_setall>::lane_type>(0); int x = 0; for (;;) @@ -2235,7 +2235,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx, const v_float32& s3, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_uint8 div = vx_load(inx); v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div)); @@ -2268,7 +2268,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx, const v_float32& s3, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load(inx)); v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes])); @@ -2298,7 +2298,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx, const v_float32& s3, const v_float32& v_scale, const v_float32& v_zero) { - 
constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 fdiv1 = vg_load_f32(inx); v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]); @@ -2307,12 +2307,12 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx, v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]); v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]); - v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))), - v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale)))); - v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))), - v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale)))); - v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))), - v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale)))); + v_store_i16(outx, v_round(v_select(v_eq(fdiv1, v_zero), v_zero, div_op(s_tag, s1, fdiv1, v_scale))), + v_round(v_select(v_eq(fdiv2, v_zero), v_zero, div_op(s_tag, s2, fdiv2, v_scale)))); + v_store_i16(&outx[nlanes], v_round(v_select(v_eq(fdiv3, v_zero), v_zero, div_op(s_tag, s3, fdiv3, v_scale))), + v_round(v_select(v_eq(fdiv4, v_zero), v_zero, div_op(s_tag, s1, fdiv4, v_scale)))); + v_store_i16(&outx[2*nlanes], v_round(v_select(v_eq(fdiv5, v_zero), v_zero, div_op(s_tag, s2, fdiv5, v_scale))), + v_round(v_select(v_eq(fdiv6, v_zero), v_zero, div_op(s_tag, s3, fdiv6, v_scale)))); } template @@ -2325,7 +2325,7 @@ divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32 const int, const int lanes) { zero_vec_type_of_t v_zero = - vx_setall::lane_type>(0); + vx_setall>::lane_type>(0); int x = 0; for (;;) @@ -2385,8 +2385,8 @@ CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[], const int length, const float scale) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 0; @@ -2473,7 +2473,7 @@ DIVRC_SIMD(float, float) int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], const int width) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (width < nlanes) return 0; @@ -2507,7 +2507,7 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], int split4_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], uchar out4[], const int width) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (width < nlanes) return 0; @@ -2543,7 +2543,7 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[], int merge3_simd(const T in1[], const T in2[], const T in3[], \ T out[], const int width) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ + const int nlanes = VTraits>::vlanes(); \ if (width < nlanes) \ return 0; \ \ @@ -2584,7 +2584,7 @@ MERGE3_SIMD(float) int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], const uchar in4[], uchar out[], const int width) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (width < nlanes) return 0; @@ -2618,13 +2618,13 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], template CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b) { - return a + b; + return v_add(a, b); } template CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b) { - 
return a - b; + return v_sub(a, b); } CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2) @@ -2653,7 +2653,7 @@ typename std::enable_if::value || std::is_same::value, void>::type arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); vector_type_of_t a1 = vx_load(in1x); vector_type_of_t a2 = vx_load(&in1x[nlanes / 2]); @@ -2667,7 +2667,7 @@ template CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 a1 = vx_load(in1x); v_float32 a2 = vx_load(&in1x[nlanes / 4]); @@ -2709,7 +2709,7 @@ typename std::enable_if::value || std::is_same::value, void>::type arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 a1 = vx_load(in1x); v_float32 a2 = vx_load(&in1x[nlanes/2]); v_float32 b1 = vx_load(in2x); @@ -2761,7 +2761,7 @@ template CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[], DST out[], const int length) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -2869,7 +2869,7 @@ CV_ALWAYS_INLINE void store_i16(short* outx, const v_int16& res) CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int32 a1 = v_round(vx_load(inx)); v_int32 a2 = v_round(vx_load(&inx[nlanes/4])); @@ -2887,7 +2887,7 @@ CV_ALWAYS_INLINE typename std::enable_if::type convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); vector_type_of_t a1 = vx_load(inx); vector_type_of_t a2 = vx_load(&inx[nlanes/2]); @@ -2902,7 +2902,7 @@ CV_ALWAYS_INLINE typename std::enable_if::type convertto_simd_nocoeff_impl(const float* inx, DST* outx) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_int32 a1 = v_round(vx_load(inx)); v_int32 a2 = v_round(vx_load(&inx[nlanes/2])); @@ -2942,7 +2942,7 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx) #define CONVERTTO_NOCOEF_SIMD(SRC, DST) \ int convertto_simd(const SRC in[], DST out[], const int length) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ + const int nlanes = VTraits>::vlanes(); \ if (length < nlanes) \ return 0; \ \ @@ -2982,7 +2982,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const float* inx, uchar* outx, const v_float32& v_alpha, const v_float32& v_beta) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 a1 = vx_load(inx); v_float32 a2 = vx_load(&inx[nlanes / 4]); @@ -3003,7 +3003,7 @@ typename std::enable_if::type convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha, const v_float32& v_beta) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 a = v_reinterpret_as_s16(vx_load(inx)); v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2])); @@ -3050,7 +3050,7 @@ convertto_scaled_simd_impl(const float* inx, DST* outx, const v_float32& v_alpha, const v_float32& v_beta) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); 
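// NOTE (added for exposition; not in the upstream patch): nlanes here is the
// lane count of the destination int16-typed vector; the two float32 loads
// below each supply nlanes/2 lanes, which the v_pack-based store later fuses
// into one full output vector.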
v_float32 a1 = vx_load(inx); v_float32 a2 = vx_load(&inx[nlanes / 2]); @@ -3111,7 +3111,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const SRC* inx, float* outx, int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \ const float beta, const int length) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ + const int nlanes = VTraits>::vlanes(); \ if (length < nlanes) \ return 0; \ \ diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index 9766cf7cc6..6c517b1f57 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -175,7 +175,7 @@ RUN_MEDBLUR3X3_IMPL( float) #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template static inline v_float32 vx_load_f32(const SRC* ptr) { @@ -228,8 +228,8 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width, GAPI_Assert(rc + gc + bc <= unity); GAPI_Assert(rc + gc + bc >= USHRT_MAX); -#if CV_SIMD - constexpr int nlanes = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int nlanes = VTraits::vlanes(); if (width >= nlanes) { for (int w=0; w < width; ) @@ -247,14 +247,8 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width, v_uint16 y0, y1; static const ushort half = 1 << 7; // Q0.8.8 - y0 = (v_mul_hi(r0 << 8, vx_setall_u16(rc)) + - v_mul_hi(g0 << 8, vx_setall_u16(gc)) + - v_mul_hi(b0 << 8, vx_setall_u16(bc)) + - vx_setall_u16(half)) >> 8; - y1 = (v_mul_hi(r1 << 8, vx_setall_u16(rc)) + - v_mul_hi(g1 << 8, vx_setall_u16(gc)) + - v_mul_hi(b1 << 8, vx_setall_u16(bc)) + - vx_setall_u16(half)) >> 8; + y0 = v_shr<8>(v_add(v_add(v_add(v_mul_hi(v_shl<8>(r0), vx_setall_u16(rc)), v_mul_hi(v_shl<8>(g0), vx_setall_u16(gc))), v_mul_hi(v_shl<8>(b0), vx_setall_u16(bc))), vx_setall_u16(half))); + y1 = v_shr<8>(v_add(v_add(v_add(v_mul_hi(v_shl<8>(r1), vx_setall_u16(rc)), v_mul_hi(v_shl<8>(g1), vx_setall_u16(gc))), v_mul_hi(v_shl<8>(b1), vx_setall_u16(bc))), vx_setall_u16(half))); v_uint8 y; y = v_pack(y0, y1); @@ -316,10 +310,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], v_uint8x16 v_min_rgb = v_min(v_min(r, g), b); v_uint8x16 v_max_rgb = v_max(v_max(r, g), b); - v_uint8x16 v_diff = v_max_rgb - v_min_rgb; + v_uint8x16 v_diff = v_sub(v_max_rgb, v_min_rgb); - v_uint8x16 v_r_eq_max = (r == v_max_rgb); - v_uint8x16 v_g_eq_max = (g == v_max_rgb); + v_uint8x16 v_r_eq_max = (v_eq(r, v_max_rgb)); + v_uint8x16 v_g_eq_max = (v_eq(g, v_max_rgb)); v_uint8x16 v; // get V-ch @@ -327,10 +321,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // divide v into 4x4 vectors because later int32 required v_uint32x4 v_idx[4]; - v_idx[0] = v_reinterpret_as_u32(v & mask1); - v_idx[1] = v_reinterpret_as_u32(v & mask2) >> 8; - v_idx[2] = v_reinterpret_as_u32(v & mask3) >> 16; - v_idx[3] = v_reinterpret_as_u32(v & mask4) >> 24; + v_idx[0] = v_reinterpret_as_u32(v_and(v, mask1)); + v_idx[1] = v_shr<8>(v_reinterpret_as_u32(v_and(v, mask2))); + v_idx[2] = v_shr<16>(v_reinterpret_as_u32(v_and(v, mask3))); + v_idx[3] = v_shr<24>(v_reinterpret_as_u32(v_and(v, mask4))); v_uint32x4 sv_elems_32[4]; sv_elems_32[0] = v_reinterpret_as_u32(v_lut(sdiv_table, v_reinterpret_as_s32(v_idx[0]))); @@ -341,19 +335,19 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // divide and calculate s according to above feature v_uint32x4 ss[4]; - 
v_uint32x4 v_add = v_setall_u32(1) << (hsv_shift - 1); + v_uint32x4 vadd = v_setall_u32(1) << (hsv_shift - 1); v_uint32x4 v_diff_exp[4]; - v_diff_exp[0] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask1); - v_diff_exp[1] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask2) >> 8; - v_diff_exp[2] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask3) >> 16; - v_diff_exp[3] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask4) >> 24; + v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1)); + v_diff_exp[1] = v_shr<8>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask2))); + v_diff_exp[2] = v_shr<16>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask3))); + v_diff_exp[3] = v_shr<24>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask4))); // s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; - ss[0] = (v_diff_exp[0] * sv_elems_32[0] + v_add) >> hsv_shift; - ss[1] = (v_diff_exp[1] * sv_elems_32[1] + v_add) >> hsv_shift; - ss[2] = (v_diff_exp[2] * sv_elems_32[2] + v_add) >> hsv_shift; - ss[3] = (v_diff_exp[3] * sv_elems_32[3] + v_add) >> hsv_shift; + ss[0] = v_shr(v_add(v_mul(v_diff_exp[0], sv_elems_32[0]), vadd)); + ss[1] = v_shr(v_add(v_mul(v_diff_exp[1], sv_elems_32[1]), vadd)); + ss[2] = v_shr(v_add(v_mul(v_diff_exp[2], sv_elems_32[2]), vadd)); + ss[3] = v_shr(v_add(v_mul(v_diff_exp[3], sv_elems_32[3]), vadd)); // reconstruct order of S-ch v_uint32x4 zip[8]; @@ -413,17 +407,17 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], //h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff)))); v_int32x4 hh[4]; hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]), - v_select(p[0], v_reinterpret_as_s32(bb[0] - rr[0] + v_setall_u32(2) * vdd[0]), - v_reinterpret_as_s32(rr[0] - gg[0] + v_setall_u32(4) * vdd[0])))); + v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))), + v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0])))))); hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]), - v_select(p[1], v_reinterpret_as_s32(bb[1] - rr[1] + v_setall_u32(2) * vdd[1]), - v_reinterpret_as_s32(rr[1] - gg[1] + v_setall_u32(4) * vdd[1])))); + v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))), + v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1])))))); hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]), - v_select(p[2], v_reinterpret_as_s32(bb[2] - rr[2] + v_setall_u32(2) * vdd[2]), - v_reinterpret_as_s32(rr[2] - gg[2] + v_setall_u32(4) * vdd[2])))); + v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))), + v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2])))))); hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]), - v_select(p[3], v_reinterpret_as_s32(bb[3] - rr[3] + v_setall_u32(2) * vdd[3]), - v_reinterpret_as_s32(rr[3] - gg[3] + v_setall_u32(4) * vdd[3])))); + v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))), + v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3])))))); //h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; v_uint32x4 h_elems_32[4]; @@ -432,10 +426,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], 
h_elems_32[2] = v_reinterpret_as_u32(v_lut(hdiv_table, v_reinterpret_as_s32(vdd[2]))); h_elems_32[3] = v_reinterpret_as_u32(v_lut(hdiv_table, v_reinterpret_as_s32(vdd[3]))); - hh[0] = (hh[0] * v_reinterpret_as_s32(h_elems_32[0]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; - hh[1] = (hh[1] * v_reinterpret_as_s32(h_elems_32[1]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; - hh[2] = (hh[2] * v_reinterpret_as_s32(h_elems_32[2]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; - hh[3] = (hh[3] * v_reinterpret_as_s32(h_elems_32[3]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; + hh[0] = v_shr(v_add(v_mul(hh[0], v_reinterpret_as_s32(h_elems_32[0])), v_reinterpret_as_s32(vadd)), hsv_shift); + hh[1] = v_shr(v_add(v_mul(hh[1], v_reinterpret_as_s32(h_elems_32[1])), v_reinterpret_as_s32(vadd)), hsv_shift); + hh[2] = v_shr(v_add(v_mul(hh[2], v_reinterpret_as_s32(h_elems_32[2])), v_reinterpret_as_s32(vadd)), hsv_shift); + hh[3] = v_shr(v_add(v_mul(hh[3], v_reinterpret_as_s32(h_elems_32[3])), v_reinterpret_as_s32(vadd)), hsv_shift); // check for negative H v_int32x4 v_h_less_0[4]; @@ -534,7 +528,7 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width) // calculate b-channel v_expand(b2, l_1, r_1); v_expand(b2_offset, l_2, r_2); - v_uint8x16 b2_sum = v_rshr_pack<1>(l_1 + l_2, r_1 + r_2); + v_uint8x16 b2_sum = v_rshr_pack<1>(v_add(l_1, l_2), v_add(r_1, r_2)); v_uint8x16 b_low, b_high; v_zip(b2_sum, b2_offset, b_low, b_high); @@ -547,9 +541,9 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width) v_expand(r3_offset, l_4, r_4); v_uint8x16 r13offset_sum, r13_sum; - r13offset_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); - r13_sum = v_rshr_pack<1>(l_1 + l_3, r_1 + r_3); + r13offset_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); + r13_sum = v_rshr_pack<1>(v_add(l_1, l_3), v_add(r_1, r_3)); v_uint8x16 r_low, r_high; v_zip(r13_sum, r13offset_sum, r_low, r_high); @@ -561,8 +555,8 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width) v_expand(g2, l_3, r_3); v_expand(g2_offset, l_4, r_4); - v_uint8x16 g_out_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); + v_uint8x16 g_out_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); v_uint8x16 g_low, g_high; v_zip(g2, g_out_sum, g_low, g_high); @@ -646,7 +640,7 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width) // calculate r-channel v_expand(r2, l_1, r_1); v_expand(r2_offset, l_2, r_2); - v_uint8x16 r2_sum = v_rshr_pack<1>(l_1 + l_2, r_1 + r_2); + v_uint8x16 r2_sum = v_rshr_pack<1>(v_add(l_1, l_2), v_add(r_1, r_2)); v_uint8x16 r_low, r_high; v_zip(r2, r2_sum, r_low, r_high); @@ -659,9 +653,9 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width) v_expand(b3_offset, l_4, r_4); v_uint8x16 b13offset_sum, b13_sum; - b13offset_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); - b13_sum = v_rshr_pack<1>(l_2 + l_4, r_2 + r_4); + b13offset_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); + b13_sum = v_rshr_pack<1>(v_add(l_2, l_4), v_add(r_2, r_4)); v_uint8x16 b_low, b_high; v_zip(b13offset_sum, b13_sum, b_low, b_high); @@ -673,8 +667,8 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width) v_expand(g2, l_3, r_3); v_expand(g2_offset, l_4, r_4); - v_uint8x16 g_out_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); + 
v_uint8x16 g_out_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); v_uint8x16 g_low, g_high; v_zip(g_out_sum, g2_offset, g_low, g_high); @@ -749,8 +743,8 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef int w = 0; -#if CV_SIMD - static const int nlanes = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + static const int nlanes = VTraits::vlanes(); for ( ; w <= width - nlanes; w += nlanes) { v_uint8 r, g, b; @@ -761,20 +755,16 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef v_expand(g, _g0, _g1); v_expand(b, _b0, _b1); - _r0 = _r0 << 7; // Q0.9.7 un-signed - _r1 = _r1 << 7; - _g0 = _g0 << 7; - _g1 = _g1 << 7; - _b0 = _b0 << 7; - _b1 = _b1 << 7; + _r0 = v_shl<7>(_r0); // Q0.9.7 un-signed + _r1 = v_shl<7>(_r1); + _g0 = v_shl<7>(_g0); + _g1 = v_shl<7>(_g1); + _b0 = v_shl<7>(_b0); + _b1 = v_shl<7>(_b1); v_uint16 _y0, _y1; - _y0 = v_mul_hi(vx_setall_u16(c0), _r0) // Q0.9.7 - + v_mul_hi(vx_setall_u16(c1), _g0) - + v_mul_hi(vx_setall_u16(c2), _b0); - _y1 = v_mul_hi(vx_setall_u16(c0), _r1) - + v_mul_hi(vx_setall_u16(c1), _g1) - + v_mul_hi(vx_setall_u16(c2), _b1); + _y0 = v_add(v_add(v_mul_hi(vx_setall_u16(c0), _r0), v_mul_hi(vx_setall_u16(c1), _g0)), v_mul_hi(vx_setall_u16(c2), _b0)); + _y1 = v_add(v_add(v_mul_hi(vx_setall_u16(c0), _r1), v_mul_hi(vx_setall_u16(c1), _g1)), v_mul_hi(vx_setall_u16(c2), _b1)); v_int16 r0, r1, b0, b1, y0, y1; r0 = v_reinterpret_as_s16(_r0); // Q1.8.7 signed @@ -785,18 +775,18 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef y1 = v_reinterpret_as_s16(_y1); v_int16 u0, u1, v0, v1; - u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0); // Q1.12.3 - u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1); - v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0); - v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1); + u0 = v_mul_hi(vx_setall_s16(c3), v_sub(b0, y0)); // Q1.12.3 + u1 = v_mul_hi(vx_setall_s16(c3), v_sub(b1, y1)); + v0 = v_mul_hi(vx_setall_s16(c4), v_sub(r0, y0)); + v1 = v_mul_hi(vx_setall_s16(c4), v_sub(r1, y1)); v_uint8 y, u, v; - y = v_pack((_y0 + vx_setall_u16(1 << 6)) >> 7, - (_y1 + vx_setall_u16(1 << 6)) >> 7); - u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3, // 257 << 2 = 128.5 * (1 << 3) - (u1 + vx_setall_s16(257 << 2)) >> 3); - v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3, - (v1 + vx_setall_s16(257 << 2)) >> 3); + y = v_pack(v_shr<7>(v_add(_y0, vx_setall_u16(1 << 6))), + v_shr<7>(v_add(_y1, vx_setall_u16(1 << 6)))); + u = v_pack_u(v_shr<3>(v_add(u0, vx_setall_s16(257 << 2))), // 257 << 2 = 128.5 * (1 << 3) + v_shr<3>(v_add(u1, vx_setall_s16(257 << 2)))); + v = v_pack_u(v_shr<3>(v_add(v0, vx_setall_s16(257 << 2))), + v_shr<3>(v_add(v1, vx_setall_s16(257 << 2)))); v_store_interleave(&out[3*w], y, u, v); } @@ -825,8 +815,8 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef int w = 0; -#if CV_SIMD - static const int nlanes = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + static const int nlanes = VTraits::vlanes(); for ( ; w <= width - nlanes; w += nlanes) { v_uint8 y, u, v; @@ -845,30 +835,28 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef v0 = v_reinterpret_as_s16(_v0); v1 = v_reinterpret_as_s16(_v1); - y0 = y0 << 3; // Q1.12.3 - y1 = y1 << 3; - u0 = (u0 - vx_setall_s16(128)) << 7; // Q1.8.7 - u1 = (u1 - vx_setall_s16(128)) << 7; - v0 = (v0 - vx_setall_s16(128)) << 7; - v1 = (v1 - vx_setall_s16(128)) << 7; + y0 = v_shl<3>(y0); // Q1.12.3 + y1 = 
v_shl<3>(y1); + u0 = v_shl<7>(v_sub(u0, vx_setall_s16(128))); // Q1.8.7 + u1 = v_shl<7>(v_sub(u1, vx_setall_s16(128))); + v0 = v_shl<7>(v_sub(v0, vx_setall_s16(128))); + v1 = v_shl<7>(v_sub(v1, vx_setall_s16(128))); v_int16 r0, r1, g0, g1, b0, b1; - r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0); // Q1.12.3 - r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1); - g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0) - + v_mul_hi(vx_setall_s16(c2), v0); - g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1) - + v_mul_hi(vx_setall_s16(c2), v1); - b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0); - b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1); + r0 = v_add(y0, v_mul_hi(vx_setall_s16(c0), v0)); // Q1.12.3 + r1 = v_add(y1, v_mul_hi(vx_setall_s16(c0), v1)); + g0 = v_add(v_add(y0, v_mul_hi(vx_setall_s16(c1), u0)), v_mul_hi(vx_setall_s16(c2), v0)); + g1 = v_add(v_add(y1, v_mul_hi(vx_setall_s16(c1), u1)), v_mul_hi(vx_setall_s16(c2), v1)); + b0 = v_add(y0, v_mul_hi(vx_setall_s16(c3), u0)); + b1 = v_add(y1, v_mul_hi(vx_setall_s16(c3), u1)); v_uint8 r, g, b; - r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3, - (r1 + vx_setall_s16(1 << 2)) >> 3); - g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3, - (g1 + vx_setall_s16(1 << 2)) >> 3); - b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3, - (b1 + vx_setall_s16(1 << 2)) >> 3); + r = v_pack_u(v_shr<3>(v_add(r0, vx_setall_s16(1 << 2))), + v_shr<3>(v_add(r1, vx_setall_s16(1 << 2)))); + g = v_pack_u(v_shr<3>(v_add(g0, vx_setall_s16(1 << 2))), + v_shr<3>(v_add(g1, vx_setall_s16(1 << 2)))); + b = v_pack_u(v_shr<3>(v_add(b0, vx_setall_s16(1 << 2))), + v_shr<3>(v_add(b1, vx_setall_s16(1 << 2)))); v_store_interleave(&out[3*w], r, g, b); } @@ -920,41 +908,37 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width) v_expand(g, gg1, gg2); v_expand(b, bb1, bb2); - rr1 = rr1 << 7; - rr2 = rr2 << 7; - gg1 = gg1 << 7; - gg2 = gg2 << 7; - bb1 = bb1 << 7; - bb2 = bb2 << 7; + rr1 = v_shl<7>(rr1); + rr2 = v_shl<7>(rr2); + gg1 = v_shl<7>(gg1); + gg2 = v_shl<7>(gg2); + bb1 = v_shl<7>(bb1); + bb2 = v_shl<7>(bb2); v_uint16x8 yy1, yy2; - yy1 = v_mul_hi(v_setall_u16(c0), rr1) + - v_mul_hi(v_setall_u16(c1), gg1) + - v_mul_hi(v_setall_u16(c2), bb1); + yy1 = v_add(v_add(v_mul_hi(v_setall_u16(c0), rr1), v_mul_hi(v_setall_u16(c1), gg1)), v_mul_hi(v_setall_u16(c2), bb1)); - yy2 = v_mul_hi(v_setall_u16(c0), rr2) + - v_mul_hi(v_setall_u16(c1), gg2) + - v_mul_hi(v_setall_u16(c2), bb2); + yy2 = v_add(v_add(v_mul_hi(v_setall_u16(c0), rr2), v_mul_hi(v_setall_u16(c1), gg2)), v_mul_hi(v_setall_u16(c2), bb2)); v_int16x8 u1, u2, v1, v2; - u1 = v_mul_hi(v_setall_s16(c3), v_reinterpret_as_s16(bb1) - v_reinterpret_as_s16(yy1)); - u2 = v_mul_hi(v_setall_s16(c3), v_reinterpret_as_s16(bb2) - v_reinterpret_as_s16(yy2)); - v1 = v_mul_hi(v_setall_s16(c4), v_reinterpret_as_s16(rr1) - v_reinterpret_as_s16(yy1)); - v2 = v_mul_hi(v_setall_s16(c4), v_reinterpret_as_s16(rr2) - v_reinterpret_as_s16(yy2)); + u1 = v_mul_hi(v_setall_s16(c3), v_sub(v_reinterpret_as_s16(bb1), v_reinterpret_as_s16(yy1))); + u2 = v_mul_hi(v_setall_s16(c3), v_sub(v_reinterpret_as_s16(bb2), v_reinterpret_as_s16(yy2))); + v1 = v_mul_hi(v_setall_s16(c4), v_sub(v_reinterpret_as_s16(rr1), v_reinterpret_as_s16(yy1))); + v2 = v_mul_hi(v_setall_s16(c4), v_sub(v_reinterpret_as_s16(rr2), v_reinterpret_as_s16(yy2))); - y = v_pack((yy1 + v_setall_u16(1 << 6)) >> 7, - (yy2 + v_setall_u16(1 << 6)) >> 7); - u = v_pack_u((u1 + v_setall_s16(257 << 2)) >> 3, - (u2 + v_setall_s16(257 << 2)) >> 3); - v = v_pack_u((v1 + v_setall_s16(257 << 2)) >> 3, - (v2 + v_setall_s16(257 << 2)) >> 3); + y = 
v_pack(v_shr<7>(v_add(yy1, v_setall_u16(1 << 6))), + v_shr<7>(v_add(yy2, v_setall_u16(1 << 6)))); + u = v_pack_u(v_shr<3>(v_add(u1, v_setall_s16(257 << 2))), + v_shr<3>(v_add(u2, v_setall_s16(257 << 2)))); + v = v_pack_u(v_shr<3>(v_add(v1, v_setall_s16(257 << 2))), + v_shr<3>(v_add(v2, v_setall_s16(257 << 2)))); uint8_t ff = 0xff; v_uint8x16 mask(ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0); - v_uint8x16 uu = u & mask; - v_uint8x16 vv = v & mask; + v_uint8x16 uu = v_and(u, mask); + v_uint8x16 vv = v_and(v, mask); // extract even u and v v_uint8x16 u_low = v_pack(v_reinterpret_as_u16(uu), v_reinterpret_as_u16(uu)); v_uint8x16 v_low = v_pack(v_reinterpret_as_u16(vv), v_reinterpret_as_u16(vv)); @@ -1001,7 +985,7 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width) // //----------------------------- -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // this variant not using buf[] appears 15% faster than reference any-2-float code below template static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan, @@ -1016,7 +1000,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, for (int l=0; l < length; ) { - static const int nlanes = v_float32::nlanes; + static const int nlanes = VTraits::vlanes(); // main part for ( ; l <= length - nlanes; l += nlanes) @@ -1026,7 +1010,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, v_float32 t0 = vx_load_f32(&i[l - shift]); v_float32 t1 = vx_load_f32(&i[l ]); v_float32 t2 = vx_load_f32(&i[l + shift]); - v_float32 t = t0 * vx_setall_f32(kx0); + v_float32 t = v_mul(t0, vx_setall_f32(kx0)); t = v_fma(t1, vx_setall_f32(kx1), t); t = v_fma(t2, vx_setall_f32(kx2), t); return t; @@ -1035,7 +1019,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, v_float32 s0 = xsum(in[0]); v_float32 s1 = xsum(in[1]); v_float32 s2 = xsum(in[2]); - v_float32 s = s0 * vx_setall_f32(ky0); + v_float32 s = v_mul(s0, vx_setall_f32(ky0)); s = v_fma(s1, vx_setall_f32(ky1), s); s = v_fma(s2, vx_setall_f32(ky2), s); @@ -1097,16 +1081,16 @@ static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, in for (int l=0; l < length;) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); // main part of row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + v_float32 sum0 = v_mul(vx_load(&buf[r0][l]), vx_setall_f32(ky0)); sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0); + v_float32 sum1 = v_mul(vx_load(&buf[r0][l + nlanes / 2]), vx_setall_f32(ky0)); sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1); sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1); @@ -1181,24 +1165,24 @@ static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, i for (int l=0; l < length;) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); // main part of row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + v_float32 sum0 = v_mul(vx_load(&buf[r0][l]), vx_setall_f32(ky0)); sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0); + v_float32 
sum1 = v_mul(vx_load(&buf[r0][l + nlanes / 4]), vx_setall_f32(ky0)); sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1); sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1); - v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0); + v_float32 sum2 = v_mul(vx_load(&buf[r0][l + 2 * nlanes / 4]), vx_setall_f32(ky0)); sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2); sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2); - v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0); + v_float32 sum3 = v_mul(vx_load(&buf[r0][l + 3 * nlanes / 4]), vx_setall_f32(ky0)); sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3); sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3); @@ -1284,7 +1268,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt { for (int l=0; l < length;) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -1292,9 +1276,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt v_uint16 t0 = vx_load_expand(&in[k][l - shift]); // previous v_uint16 t1 = vx_load_expand(&in[k][l ]); // current v_uint16 t2 = vx_load_expand(&in[k][l + shift]); // next pixel - v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) + - v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) + - v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2); + v_int16 t = v_add(v_add(v_mul(v_reinterpret_as_s16(t0), vx_setall_s16(ikx0)), v_mul(v_reinterpret_as_s16(t1), vx_setall_s16(ikx1))), v_mul(v_reinterpret_as_s16(t2), vx_setall_s16(ikx2))); v_store(&ibuf[r[k]][l], t); } @@ -1311,7 +1293,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt for (int l=0; l < length;) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -1319,13 +1301,11 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt v_int16 s0 = vx_load(&ibuf[r[0]][l]); // previous v_int16 s1 = vx_load(&ibuf[r[1]][l]); // current v_int16 s2 = vx_load(&ibuf[r[2]][l]); // next row - v_int16 s = s0 * vx_setall_s16(iky0) + - s1 * vx_setall_s16(iky1) + - s2 * vx_setall_s16(iky2); + v_int16 s = v_add(v_add(v_mul(s0, vx_setall_s16(iky0)), v_mul(s1, vx_setall_s16(iky1))), v_mul(s2, vx_setall_s16(iky2))); if (!noscale) { - s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta); + s = v_add(v_mul_hi(v_shl<1>(s), vx_setall_s16(iscale)), vx_setall_s16(idelta)); } v_store(&out[l], s); @@ -1399,7 +1379,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha float scale, float delta, float *buf[], int y, int y0) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int length = width * chan; // length variable may be unused if types do not match at 'if' statements below @@ -1407,7 +1387,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha #if USE_SEPFILTER3X3_CHAR2SHORT if (std::is_same::value && std::is_same::value && - length >= v_int16::nlanes) + length >= VTraits::vlanes()) { // only slightly faster than more generic any-to-short (see below) run_sepfilter3x3_char2short(reinterpret_cast(out), @@ -1419,7 +1399,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha #endif if (std::is_same::value && 
     if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
-        length >= v_float32::nlanes)
+        length >= VTraits<v_float32>::vlanes())
     {
         // appears 15% faster than reference any-to-float code (called below)
         run_sepfilter3x3_any2float(reinterpret_cast<float*>(out), in,
@@ -1427,7 +1407,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<DST, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         // appears 10-40x faster than reference due to much faster rounding
         run_sepfilter3x3_any2short(reinterpret_cast<short*>(out), in,
@@ -1436,7 +1416,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<DST, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         // appears 10-40x faster than reference due to much faster rounding
         run_sepfilter3x3_any2short(reinterpret_cast<ushort*>(out), in,
@@ -1445,7 +1425,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<DST, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         // appears 10-40x faster than reference due to much faster rounding
         run_sepfilter3x3_any2char(reinterpret_cast<uchar*>(out), in,
@@ -1499,7 +1479,7 @@ RUN_SEPFILTER3X3_IMPL(float, float)
 //
 //-----------------------------
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 // this code with manually vectored rounding to uchar
 template<bool noscale, typename SRC>
@@ -1549,17 +1529,17 @@ static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, i
 
     // vertical pass
 
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     for (int l = 0; l < length;)
     {
         // main part of row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]);
-            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 4]) * vx_setall_f32(ky[0]);
-            v_float32 sum2 = vx_load(&buf[r[0]][l + 2 * nlanes / 4]) * vx_setall_f32(ky[0]);
-            v_float32 sum3 = vx_load(&buf[r[0]][l + 3 * nlanes / 4]) * vx_setall_f32(ky[0]);
+            v_float32 sum0 = v_mul(vx_load(&buf[r[0]][l]), vx_setall_f32(ky[0]));
+            v_float32 sum1 = v_mul(vx_load(&buf[r[0]][l + nlanes / 4]), vx_setall_f32(ky[0]));
+            v_float32 sum2 = v_mul(vx_load(&buf[r[0]][l + 2 * nlanes / 4]), vx_setall_f32(ky[0]));
+            v_float32 sum3 = v_mul(vx_load(&buf[r[0]][l + 3 * nlanes / 4]), vx_setall_f32(ky[0]));
 
             for (int n = 1; n < kyLen; ++n)
             {
@@ -1647,15 +1627,15 @@ static void run_sepfilter5x5_any2short(DST out[], const SRC *in[], int width, in
 
     // vertical pass
 
-    constexpr int nlanes = v_int16::nlanes;
+    const int nlanes = VTraits<v_int16>::vlanes();
 
     for (int l = 0; l < length;)
     {
         //GAPI_Assert(length >= nlanes);
         // main part of row
         for (; l <= length - nlanes; l += nlanes)
        {
-            v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]);
-            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 2]) * vx_setall_f32(ky[0]);
+            v_float32 sum0 = v_mul(vx_load(&buf[r[0]][l]), vx_setall_f32(ky[0]));
+            v_float32 sum1 = v_mul(vx_load(&buf[r[0]][l + nlanes / 2]), vx_setall_f32(ky[0]));
 
             for (int j = 1; j < kyLen; ++j)
             {
@@ -1702,14 +1682,10 @@ static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width,
                                        const float kx[], const float ky[], int border,
                                        float scale, float delta)
 {
-    constexpr int kxLen = 5;
-    constexpr int kyLen = kxLen;
-    constexpr int buffSize = 5;
-
     const int length = width * chan;
     const int shift = chan;
 
-    static const int nlanes = v_float32::nlanes;
+    static const int nlanes = VTraits<v_float32>::vlanes();
 
     for (int l = 0; l < length; )
     {
         //GAPI_Assert(length >= nlanes);
@@ -1717,33 +1693,33 @@ static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width,
         for (; l <= length - nlanes; l += nlanes)
         {
             auto xsum = [l, border, shift, kx](const SRC inp[])
-            {
-                v_float32 t[5];
-                for (int i = 0; i < 5; ++i)
-                {
-                    t[i] = vx_load_f32(&inp[l + (i - border)*shift]);
-                }
+            { //buffSize = 5
+                v_float32 t0 = vx_load_f32(&inp[l + (0 - border)*shift]);
+                v_float32 t1 = vx_load_f32(&inp[l + (1 - border)*shift]);
+                v_float32 t2 = vx_load_f32(&inp[l + (2 - border)*shift]);
+                v_float32 t3 = vx_load_f32(&inp[l + (3 - border)*shift]);
+                v_float32 t4 = vx_load_f32(&inp[l + (4 - border)*shift]);
 
-                v_float32 sum = t[0] * vx_setall_f32(kx[0]);
-                for (int j = 1; j < 5; ++j)
-                {
-                    sum = v_fma(t[j], vx_setall_f32(kx[j]), sum);
-                }
+                v_float32 sum = v_mul(t0, vx_setall_f32(kx[0]));
+                sum = v_fma(t1, vx_setall_f32(kx[1]), sum);
+                sum = v_fma(t2, vx_setall_f32(kx[2]), sum);
+                sum = v_fma(t3, vx_setall_f32(kx[3]), sum);
+                sum = v_fma(t4, vx_setall_f32(kx[4]), sum);
 
                 return sum;
             };
 
-            v_float32 s[buffSize];
-            for (int m = 0; m < buffSize; ++m)
-            {
-                s[m] = xsum(in[m]);
-            }
+            v_float32 s0 = xsum(in[0]);
+            v_float32 s1 = xsum(in[1]);
+            v_float32 s2 = xsum(in[2]);
+            v_float32 s3 = xsum(in[3]);
+            v_float32 s4 = xsum(in[4]);
 
-            v_float32 sum = s[0] * vx_setall_f32(ky[0]);
-            for (int n = 1; n < kyLen; ++n)
-            {
-                sum = v_fma(s[n], vx_setall_f32(ky[n]), sum);
-            }
+            v_float32 sum = v_mul(s0, vx_setall_f32(ky[0]));
+            sum = v_fma(s1, vx_setall_f32(ky[1]), sum);
+            sum = v_fma(s2, vx_setall_f32(ky[2]), sum);
+            sum = v_fma(s3, vx_setall_f32(ky[3]), sum);
+            sum = v_fma(s4, vx_setall_f32(ky[4]), sum);
 
             if (!noscale)
             {
@@ -1819,7 +1795,7 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt
     // this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
 
     int k0 = (y == y0) ? 0 : 4;
 
-    constexpr int nlanes = v_int16::nlanes;
+    const int nlanes = VTraits<v_int16>::vlanes();
 
     for (int k = k0; k < kyLen; ++k)
     {
@@ -1830,16 +1806,18 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_uint16 t[kxLen];
             v_int16 sum = vx_setzero_s16();
 
-            for (int i = 0; i < kxLen; ++i)
-            {
-                // previous, current, next pixels
-                t[i] = vx_load_expand(&in[k][l + (i - border)*shift]);
+            auto process = [&](int i) {
+                v_uint16 t = vx_load_expand(&in[k][l + (i - border)*shift]);
+                return v_add(sum, v_mul(v_reinterpret_as_s16(t), vx_setall_s16(ikx[i])));
+            };
 
-                sum += v_reinterpret_as_s16(t[i]) * vx_setall_s16(ikx[i]);
-            }
+            sum = process(0);
+            sum = process(1);
+            sum = process(2);
+            sum = process(3);
+            sum = process(4);
 
             v_store(&ibuf[r[k]][l], sum);
         }
@@ -1861,20 +1839,21 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_int16 s[buffSize];
             v_int16 sum = vx_setzero_s16();
 
-            for (int i = 0; i < kyLen; ++i)
-            {
-                // previous, current, next rows
-                s[i] = vx_load(&ibuf[r[i]][l]);
-
-                sum += s[i] * vx_setall_s16(iky[i]);
-            }
+            auto process = [&](int i) {
+                v_int16 s = vx_load(&ibuf[r[i]][l]);
+                return v_add(sum, v_mul(s, vx_setall_s16(iky[i])));
+            };
+
+            sum = process(0);
+            sum = process(1);
+            sum = process(2);
+            sum = process(3);
+            sum = process(4);
 
             if (!noscale)
             {
-                sum = v_mul_hi(sum << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+                sum = v_add(v_mul_hi(v_shl<1>(sum), vx_setall_s16(iscale)), vx_setall_s16(idelta));
             }
 
             v_store(&out[l], sum);
@@ -1965,14 +1944,14 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
                                   const float kx[], const float ky[], int border,
                                   float scale, float delta,
                                   float *buf[], int y, int y0)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void)length;
 
     if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
-        length >= v_int16::nlanes)
+        length >= VTraits<v_int16>::vlanes())
     {
         run_sepfilter5x5_char2short(reinterpret_cast<short*>(out),
                                     reinterpret_cast<const uchar**>(in),
@@ -1982,14 +1961,14 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
     if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
-        length >= v_float32::nlanes)
+        length >= VTraits<v_float32>::vlanes())
     {
         run_sepfilter5x5_any2float(reinterpret_cast<float*>(out), in,
                                    width, chan, kx, ky, border, scale, delta);
         return;
     }
 
-    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<DST, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         run_sepfilter5x5_any2short(reinterpret_cast<short*>(out), in,
                                    width, chan, kx, ky, border, scale, delta,
@@ -1997,7 +1976,7 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<DST, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_sepfilter5x5_any2short(reinterpret_cast<ushort*>(out), in,
                                    width, chan, kx, ky, border, scale, delta,
@@ -2005,7 +1984,7 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<DST, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_sepfilter5x5_any2char(reinterpret_cast<uchar*>(out), in,
                                   width, chan, kx, ky, border, scale, delta,
@@ -2086,7 +2065,7 @@ static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, in
     }
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 // assume DST is short or ushort
 template<bool noscale, typename DST, typename SRC>
 static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan,
@@ -2106,14 +2085,14 @@ static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, in
 
     for (int l=0; l < length;)
     {
-        static constexpr int nlanes = v_int16::nlanes;
+        static const int nlanes = VTraits<v_int16>::vlanes();
 
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
             auto sumx = [in, shift, &k](int i, int j)
             {
-                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
+                v_float32 s = v_mul(vx_load_f32(&in[i][j - shift]), vx_setall_f32(k[i][0]));
                 s = v_fma(vx_load_f32(&in[i][j        ]), vx_setall_f32(k[i][1]), s);
                 s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s);
                 return s;
@@ -2121,8 +2100,8 @@ static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, in
 
             int l0 = l;
             int l1 = l + nlanes/2;
-            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
-            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
+            v_float32 sum0 = v_add(sumx(0, l0), sumx(1, l0), sumx(2, l0));
+            v_float32 sum1 = v_add(sumx(0, l1), sumx(1, l1), sumx(2, l1));
 
             if (!noscale)
             {
@@ -2172,14 +2151,14 @@ static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, i
 
     for (int l=0; l < length;)
    {
-        static constexpr int nlanes = v_uint8::nlanes;
+        static const int nlanes = VTraits<v_uint8>::vlanes();
 
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
             auto sumx = [in, shift, &k](int i, int j)
             {
-                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
+                v_float32 s = v_mul(vx_load_f32(&in[i][j - shift]), vx_setall_f32(k[i][0]));
                 s = v_fma(vx_load_f32(&in[i][j        ]), vx_setall_f32(k[i][1]), s);
                 s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s);
                 return s;
@@ -2189,10 +2168,10 @@ static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, i
             int l0 = l;
             int l1 = l + nlanes/4;
             int l2 = l + 2*nlanes/4;
             int l3 = l + 3*nlanes/4;
-            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
-            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
-            v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2);
-            v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3);
+            v_float32 sum0 = v_add(sumx(0, l0), sumx(1, l0), sumx(2, l0));
+            v_float32 sum1 = v_add(sumx(0, l1), sumx(1, l1), sumx(2, l1));
+            v_float32 sum2 = v_add(sumx(0, l2), sumx(1, l2), sumx(2, l2));
+            v_float32 sum3 = v_add(sumx(0, l3), sumx(1, l3), sumx(2, l3));
 
             if (!noscale)
             {
@@ -2228,20 +2207,20 @@ template<typename DST, typename SRC>
 static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan,
                                   const float kernel[], float scale, float delta)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void) length;
 
-    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<DST, short>::value && length >= VTraits<v_int16>::vlanes())
    {
         run_filter2d_3x3_any2short(reinterpret_cast<short*>(out), in,
                                    width, chan, kernel, scale, delta);
         return;
     }
 
-    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<DST, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_filter2d_3x3_any2short(reinterpret_cast<ushort*>(out), in,
                                    width, chan, kernel, scale, delta);
@@ -2249,7 +2228,7 @@ static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int cha
     }
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<DST, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_filter2d_3x3_any2char(reinterpret_cast<uchar*>(out), in,
                                   width, chan, kernel, scale, delta);
@@ -2446,7 +2425,7 @@ static void run_morphology3x3_reference(T out[], const T *in[], int width, int c
         CV_Error(cv::Error::StsBadArg, "unsupported morphology");
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename VT, typename T>
 static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
                                    const uchar k[], MorphShape k_type,
@@ -2467,7 +2446,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
     {
         for (int l=0; l < length;)
         {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2503,7 +2482,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
     {
         for (int l=0; l < length;)
        {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2537,7 +2516,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
 
        for (int l=0; l < length;)
        {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
            // main part of output row
            for (; l <= length - nlanes; l += nlanes)
@@ -2575,7 +2554,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
     {
         for (int l=0; l < length;)
         {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2611,7 +2590,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
     {
         for (int l=0; l < length;)
        {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2645,7 +2624,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
 
         for (int l=0; l < length;)
         {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2686,13 +2665,13 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
                                    const uchar k[], MorphShape k_type,
                                    Morphology morphology)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void) length;
 
-    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
+    if (std::is_same<T, float>::value && length >= VTraits<v_float32>::vlanes())
     {
         run_morphology3x3_simd<v_float32>(reinterpret_cast<float*>(out),
                                           reinterpret_cast<const float**>(in),
@@ -2701,7 +2680,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
         return;
     }
 
-    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<T, short>::value && length >= VTraits<v_int16>::vlanes())
    {
         run_morphology3x3_simd<v_int16>(reinterpret_cast<short*>(out),
                                         reinterpret_cast<const short**>(in),
@@ -2710,7 +2689,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
         return;
     }
 
-    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<T, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_morphology3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
                                          reinterpret_cast<const ushort**>(in),
@@ -2719,7 +2698,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
         return;
     }
 
-    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<T, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_morphology3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
                                         reinterpret_cast<const uchar**>(in),
@@ -2796,7 +2775,7 @@ static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan
     }
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename VT, typename T>
 static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
 {
@@ -2808,7 +2787,7 @@ static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
 
     for (int l=0; l < length;)
     {
-        constexpr int nlanes = VT::nlanes;
+        const int nlanes = VTraits<VT>::vlanes();
 
        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
@@ -2866,13 +2845,13 @@ static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
 
 template<typename T>
 static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void) length;
 
-    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
+    if (std::is_same<T, float>::value && length >= VTraits<v_float32>::vlanes())
    {
         run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
                                        reinterpret_cast<const float**>(in),
@@ -2880,7 +2859,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
         return;
     }
 
-    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<T, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
                                      reinterpret_cast<const short**>(in),
@@ -2888,7 +2867,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
         return;
     }
 
-    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<T, ushort>::value && length >= VTraits<v_uint16>::vlanes())
    {
         run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
                                       reinterpret_cast<const ushort**>(in),
@@ -2896,7 +2875,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
         return;
     }
 
-    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<T, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
                                      reinterpret_cast<const uchar**>(in),