Merge pull request #21474 from anna-khakimova:ak/simd_for_divc

GAPI Fluid: SIMD for DivC kernel. * GAPI Fluid:SIMD for DivC * Applied comment
2025-08-06 14:36:36 +08:00 · 2022-02-02 21:47:01 +03:00 · 2022-02-02 21:47:01 +03:00 · 1605d1d24d
commit 1605d1d24d
parent 415a42f327
5 changed files with 660 additions and 133 deletions
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@ -93,8 +93,8 @@ INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
 INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest,
    Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
            Values(szSmall128, szVGA, sz720p, sz1080p),
-            Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1),
-            Values(-1, CV_8U, CV_32F),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
            Values(1.0),
            Values(cv::compile_args(CORE_FLUID))));

--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@ -886,25 +886,6 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
        CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
 }

-template<typename DST, typename SRC>
-static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
-{
-    const auto *in = src.InLine<SRC>(0);
-    auto *out = dst.OutLine<DST>();
-
-    int width = dst.length();
-    int chan = dst.meta().chan;
-    const int length = width * chan;
-
-    int w = 0;
-#if CV_SIMD
-    w = absdiffc_simd(in, scalar, out, length, chan);
-#endif
-
-    for (; w < length; ++w)
-        out[w] = absdiff<DST>(in[w], scalar[w%chan]);
-}
-
 template<typename DST, typename SRC>
 CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float scalar[],
                                   Arithm arithm, float scale=1)
@ -950,11 +931,6 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
                out[chan * w + c] = mul<DST>(in[chan * w + c], scalar[c], scale);
        break;
    }
-    case ARITHM_DIVIDE:
-        for (int w=0; w < width; w++)
-            for (int c=0; c < chan; c++)
-                out[chan*w + c] = div<DST>(in[chan*w + c], scalar[c], scale);
-        break;
    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
    }
 }
@ -992,6 +968,14 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
    }
 }

+CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen)
+{
+    cv::Size bufsize(buflen, 1);
+    GMatDesc bufdesc = { CV_32F, 1, bufsize };
+    Buffer buffer(bufdesc);
+    scratch = std::move(buffer);
+}
+
 CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
 {
 #if CV_SIMD
@ -1012,10 +996,33 @@ CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
 #else
    constexpr int buflen = 4;
 #endif
-    cv::Size bufsize(buflen, 1);
-    GMatDesc bufdesc = { CV_32F, 1, bufsize };
-    Buffer buffer(bufdesc);
-    scratch = std::move(buffer);
+    setScratchSize(scratch, buflen);
+}
+
+CV_ALWAYS_INLINE void scalar_to_scratch(const cv::Scalar& scalar,
+                                        float scratch[], const int length, const int chan)
+{
+    for (int i = 0; i < length; ++i)
+        scratch[i] = static_cast<float>(scalar[i % chan]);
+}
+
+template<typename DST, typename SRC>
+CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float scalar[])
+{
+    const auto* in = src.InLine<SRC>(0);
+    auto* out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan = dst.meta().chan;
+    const int length = width * chan;
+
+    int w = 0;
+#if CV_SIMD
+    w = absdiffc_simd(in, scalar, out, length, chan);
+#endif
+
+    for (; w < length; ++w)
+        out[w] = absdiff<DST>(in[w], scalar[w % chan]);
 }

 GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
@ -1027,10 +1034,9 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
        if (dst.y() == 0)
        {
            const int chan = src.meta().chan;
-            float* sc = scratch.OutLine<float>();
+            float* _scratch = scratch.OutLine<float>();

-            for (int i = 0; i < scratch.length(); ++i)
-                sc[i] = static_cast<float>(_scalar[i % chan]);
+            scalar_to_scratch(_scalar, _scratch, scratch.length(), chan);
        }

        const float* scalar = scratch.OutLine<float>();
@ -1065,10 +1071,9 @@ GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
        if (dst.y() == 0)
        {
            const int chan = src.meta().chan;
-            float* sc = scratch.OutLine<float>();
+            float* _scratch = scratch.OutLine<float>();

-            for (int i = 0; i < scratch.length(); ++i)
-                sc[i] = static_cast<float>(_scalar[i % chan]);
+            scalar_to_scratch(_scalar, _scratch, scratch.length(), chan);
        }

        const float* scalar = scratch.OutLine<float>();
@ -1115,10 +1120,9 @@ GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true)
        if (dst.y() == 0)
        {
            const int chan = src.meta().chan;
-            float* sc = scratch.OutLine<float>();
+            float* _scratch = scratch.OutLine<float>();

-            for (int i = 0; i < scratch.length(); ++i)
-                sc[i] = static_cast<float>(_scalar[i % chan]);
+            scalar_to_scratch(_scalar, _scratch, scratch.length(), chan);
        }

        const float* scalar = scratch.OutLine<float>();
@ -1165,10 +1169,9 @@ GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, true)
        if (dst.y() == 0)
        {
            const int chan = src.meta().chan;
-            float* sc = scratch.OutLine<float>();
+            float* _scratch = scratch.OutLine<float>();

-            for (int i = 0; i < scratch.length(); ++i)
-                sc[i] = static_cast<float>(_scalar[i % chan]);
+            scalar_to_scratch(_scalar, _scratch, scratch.length(), chan);
        }

        const float* scalar = scratch.OutLine<float>();
@ -1216,10 +1219,9 @@ GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true)
        if (dst.y() == 0)
        {
            const int chan = src.meta().chan;
-            float* sc = scratch.OutLine<float>();
+            float* _scratch = scratch.OutLine<float>();

-            for (int i = 0; i < scratch.length(); ++i)
-                sc[i] = static_cast<float>(_scalar[i % chan]);
+            scalar_to_scratch(_scalar, _scratch, scratch.length(), chan);
        }
        const float* scalar = scratch.OutLine<float>();
        const float scale = 1.0;
@ -1295,32 +1297,109 @@ GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, true)
    }
 };

-GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, false)
+template<typename DST, typename SRC>
+CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch,
+                               float scale)
+{
+    const auto* in = src.InLine<SRC>(0);
+    auto* out = dst.OutLine<DST>();
+    const float* scalar = scratch.OutLine<float>();
+
+    int width = dst.length();
+    int chan = dst.meta().chan;
+    const int length = width * chan;
+
+    int w = 0;
+#if CV_SIMD
+    int scratch_length = scratch.length();
+    int indicator_offset = scratch_length - 1;
+    const int set_mask_indicator = static_cast<int>(*(scratch.OutLine<float>() + (indicator_offset)));
+
+    w = divc_simd(in, scalar, out, length, chan, scale, set_mask_indicator);
+#endif
+
+    for (; w < length; ++w)
+        out[w] = div<DST>(in[w], scalar[w % chan], scale);
+}
+
+GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)
 {
    static const int Window = 1;

    static void run(const View& src, const cv::Scalar& _scalar, double _scale, int /*dtype*/,
-                    Buffer &dst)
+                    Buffer& dst, Buffer& scratch)
    {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
-        const float scale = static_cast<float>(_scale);
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* _scratch = scratch.OutLine<float>();
+            int scratch_length = scratch.length();
+
+            scalar_to_scratch(_scalar, _scratch, scratch_length - 1, chan);
+
+            _scratch[scratch_length - 1] = 0.0;
+            for (int j = 0; j < chan; ++j)
+            {
+                if (std::fabs(static_cast<float>(_scalar[j])) <= FLT_EPSILON)
+                {
+                    _scratch[scratch_length - 1] = 1.0;
+                    break;
+                }
+            }
+        }
+
+        float scale = static_cast<float>(_scale);

        //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar,  uchar,  run_divc, dst, src, scratch, scale);
+        UNARY_(uchar,  ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(uchar,  short,  run_divc, dst, src, scratch, scale);
+        UNARY_(uchar,  float,  run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, uchar,  run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, short,  run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, float,  run_divc, dst, src, scratch, scale);
+        UNARY_(short,  short,  run_divc, dst, src, scratch, scale);
+        UNARY_(short,  ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(short,  uchar,  run_divc, dst, src, scratch, scale);
+        UNARY_(short,  float,  run_divc, dst, src, scratch, scale);
+        UNARY_(float,  uchar,  run_divc, dst, src, scratch, scale);
+        UNARY_(float,  short,  run_divc, dst, src, scratch, scale);
+        UNARY_(float,  ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(float,  float,  run_divc, dst, src, scratch, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch)
+    {
+#if CV_SIMD
+            // 512 bits / 32 bits = 16 elements of float32 a AVX512 SIMD vector can contain.
+            constexpr int maxNlanes = 16;
+
+            // +2 is offset for 3-channel case.
+            // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
+            // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
+            // The first scalar SIMD vector should looks like:
+            // C1 C2 C3 C1
+            // The second:
+            // C2 C3 C1 C2
+            // The third:
+            // C3 C1 C2 C3
+            constexpr int offset = 2;
+            constexpr int zero_scalar_elem_indicator = 1;
+            constexpr int buflen = maxNlanes + offset + zero_scalar_elem_indicator;
+#else
+            constexpr int buflen = 4;
+#endif
+            setScratchSize(scratch, buflen);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };

 GAPI_FLUID_KERNEL(GFluidDivRC, cv::gapi::core::GDivRC, false)
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@ -192,6 +192,34 @@ MULC_SIMD(float, float)

 #undef MULC_SIMD

+#define DIVC_SIMD(SRC, DST)                                                              \
+int divc_simd(const SRC in[], const float scalar[], DST out[],                           \
+              const int length, const int chan, const float scale,                       \
+              const int set_mask_flag)                                                   \
+{                                                                                        \
+    CV_CPU_DISPATCH(divc_simd, (in, scalar, out, length, chan, scale, set_mask_flag),    \
+                    CV_CPU_DISPATCH_MODES_ALL);                                          \
+}
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short)
+DIVC_SIMD(uchar, short)
+DIVC_SIMD(float, short)
+DIVC_SIMD(ushort, ushort)
+DIVC_SIMD(uchar, ushort)
+DIVC_SIMD(short, ushort)
+DIVC_SIMD(float, ushort)
+DIVC_SIMD(uchar, float)
+DIVC_SIMD(ushort, float)
+DIVC_SIMD(short, float)
+DIVC_SIMD(float, float)
+
+#undef DIVC_SIMD
+
 #define ABSDIFFC_SIMD(SRC)                                               \
 int absdiffc_simd(const SRC in[], const float scalar[], SRC out[],       \
                  const int length, const int chan)                      \
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@ -152,6 +152,30 @@ MULC_SIMD(float, float)

 #undef MULC_SIMD

+#define DIVC_SIMD(SRC, DST)                                                              \
+int divc_simd(const SRC in[], const float scalar[], DST out[],                           \
+              const int length, const int chan, const float scale,                       \
+              const int set_mask_flag);
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short)
+DIVC_SIMD(uchar, short)
+DIVC_SIMD(float, short)
+DIVC_SIMD(ushort, ushort)
+DIVC_SIMD(uchar, ushort)
+DIVC_SIMD(short, ushort)
+DIVC_SIMD(float, ushort)
+DIVC_SIMD(uchar, float)
+DIVC_SIMD(ushort, float)
+DIVC_SIMD(short, float)
+DIVC_SIMD(float, float)
+
+#undef DIVC_SIMD
+
 #define ABSDIFFC_SIMD(T)                                            \
 int absdiffc_simd(const T in[], const float scalar[], T out[],      \
                  const int length, const int chan);
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@ -173,6 +173,30 @@ MULC_SIMD(float, float)

 #undef MULC_SIMD

+#define DIVC_SIMD(SRC, DST)                                                              \
+int divc_simd(const SRC in[], const float scalar[], DST out[],                           \
+              const int length, const int chan, const float scale,                       \
+              const int set_mask_flag);
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short)
+DIVC_SIMD(uchar, short)
+DIVC_SIMD(float, short)
+DIVC_SIMD(ushort, ushort)
+DIVC_SIMD(uchar, ushort)
+DIVC_SIMD(short, ushort)
+DIVC_SIMD(float, ushort)
+DIVC_SIMD(uchar, float)
+DIVC_SIMD(ushort, float)
+DIVC_SIMD(short, float)
+DIVC_SIMD(float, float)
+
+#undef DIVC_SIMD
+
 #define ABSDIFFC_SIMD(T)                                            \
 int absdiffc_simd(const T in[], const float scalar[], T out[],      \
                  const int length, const int chan);
@ -941,6 +965,7 @@ struct add_tag {};
 struct sub_tag {};
 struct subr_tag {};
 struct mul_tag {};
+struct div_tag {};
 struct absdiff_tag {};

 CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx,       const v_int32& c1,
@ -985,6 +1010,21 @@ CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc
    return a * sc;
 }

+CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale)
+{
+    return v_scale * a * v_scalar;
+}
+
+CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc)
+{
+    return a / sc;
+}
+
+CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale)
+{
+    return a*v_scale / v_scalar;
+}
+
 CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc)
 {
    return v_absdiff(a, sc);
@ -1294,16 +1334,17 @@ SUBRC_SIMD(float, float)

 //-------------------------
 //
-// Fluid kernels: MulC
+// Fluid kernels: MulC, DivC
 //
 //-------------------------

-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, void>::type
-mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
-                        const v_float32& s3, const v_float32& scale, const int nlanes)
+arithmOpScalarScaled_simd_c3_impl(oper_tag op, SRC* inx, DST* outx, const v_float32& s1,
+                                  const v_float32& s2, const v_float32& s3,
+                                  const v_float32& v_scale, const int nlanes)
 {
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
@ -1312,62 +1353,64 @@ mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_
    v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
    v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);

-    arithmOpScalar_pack_store_c3(outx, v_round(scale*a1*s1),
-                                       v_round(scale*a2*s2),
-                                       v_round(scale*a3*s3),
-                                       v_round(scale*a4*s1),
-                                       v_round(scale*a5*s2),
-                                       v_round(scale*a6*s3));
+    arithmOpScalar_pack_store_c3(outx, v_round(oper_scaled(op, a1, s1, v_scale)),
+                                       v_round(oper_scaled(op, a2, s2, v_scale)),
+                                       v_round(oper_scaled(op, a3, s3, v_scale)),
+                                       v_round(oper_scaled(op, a4, s1, v_scale)),
+                                       v_round(oper_scaled(op, a5, s2, v_scale)),
+                                       v_round(oper_scaled(op, a6, s3, v_scale)));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* inx, uchar* outx,
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_c3_impl(oper_tag op, const SRC* inx, uchar* outx,
                                                        const v_float32& s1, const v_float32& s2,
-                                              const v_float32& s3, const v_float32& scale, const int nlanes)
+                                                        const v_float32& s3, const v_float32& v_scale,
+                                                        const int nlanes)
 {
    vx_store(outx,
-               v_pack_u(v_pack(v_round(scale * vg_load_f32(inx)* s1),
-                               v_round(scale * vg_load_f32(&inx[nlanes/4])* s2)),
-                        v_pack(v_round(scale * vg_load_f32(&inx[nlanes/2])* s3),
-                               v_round(scale * vg_load_f32(&inx[3*nlanes/4])* s1))));
+               v_pack_u(v_pack(v_round(oper_scaled(op, vg_load_f32(inx), s1, v_scale)),
+                               v_round(oper_scaled(op, vg_load_f32(&inx[nlanes/4]), s2, v_scale))),
+                        v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[nlanes/2]), s3, v_scale)),
+                               v_round(oper_scaled(op, vg_load_f32(&inx[3*nlanes/4]), s1, v_scale)))));

    vx_store(&outx[nlanes],
-                v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[nlanes])* s2),
-                                v_round(scale * vg_load_f32(&inx[5*nlanes/4])* s3)),
-                         v_pack(v_round(scale * vg_load_f32(&inx[3*nlanes/2])* s1),
-                                v_round(scale * vg_load_f32(&inx[7*nlanes/4])* s2))));
+                v_pack_u(v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[nlanes]), s2, v_scale)),
+                                v_round(oper_scaled(op, vg_load_f32(&inx[5*nlanes/4]), s3, v_scale))),
+                         v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[3*nlanes/2]), s1, v_scale)),
+                                v_round(oper_scaled(op, vg_load_f32(&inx[7*nlanes/4]), s2, v_scale)))));

    vx_store(&outx[2 * nlanes],
-                v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[2*nlanes])* s3),
-                                v_round(scale * vg_load_f32(&inx[9*nlanes/4])* s1)),
-                         v_pack(v_round(scale * vg_load_f32(&inx[5*nlanes/2])* s2),
-                                v_round(scale * vg_load_f32(&inx[11*nlanes/4])* s3))));
+                v_pack_u(v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[2*nlanes]), s3, v_scale)),
+                                v_round(oper_scaled(op, vg_load_f32(&inx[9*nlanes/4]), s1, v_scale))),
+                         v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[5*nlanes/2]), s2, v_scale)),
+                                v_round(oper_scaled(op, vg_load_f32(&inx[11*nlanes/4]), s3, v_scale)))));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* in, float* out,
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_c3_impl(oper_tag op, const SRC* in, float* out,
                                                        const v_float32& s1, const v_float32& s2,
-                                        const v_float32& s3, const v_float32& scale, const int nlanes)
+                                                        const v_float32& s3, const v_float32& v_scale,
+                                                        const int nlanes)
 {
    v_float32 a1 = vg_load_f32(in);
    v_float32 a2 = vg_load_f32(&in[nlanes]);
    v_float32 a3 = vg_load_f32(&in[2*nlanes]);

-    vx_store(out, scale * a1* s1);
-    vx_store(&out[nlanes], scale * a2* s2);
-    vx_store(&out[2*nlanes], scale * a3* s3);
+    vx_store(out, oper_scaled(op, a1, s1, v_scale));
+    vx_store(&out[nlanes], oper_scaled(op, a2, s2, v_scale));
+    vx_store(&out[2*nlanes], oper_scaled(op, a3, s3, v_scale));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_c3(oper_tag op, const SRC in[],
                                                  const float scalar[], DST out[],
-                                        const int length, const float _scale)
+                                                  const int length, const float scale)
 {
    constexpr int chan = 3;
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
@ -1376,7 +1419,7 @@ CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],
    if (length < lanes)
        return 0;

-    v_float32 scale = vx_setall_f32(_scale);
+    v_float32 v_scale = vx_setall_f32(scale);

    v_float32 s1 = vx_load(scalar);
 #if CV_SIMD_WIDTH == 32
@ -1392,7 +1435,7 @@ CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],
    {
        for (; x <= length - lanes; x += lanes)
        {
-            mulc_scale_simd_c3_impl(&in[x], &out[x], s1, s2, s3, scale, nlanes);
+            arithmOpScalarScaled_simd_c3_impl(op, &in[x], &out[x], s1, s2, s3, v_scale, nlanes);
        }

        if (x < length)
@ -1407,70 +1450,70 @@ CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<(std::is_same<DST, ushort>::value ||
                         std::is_same<DST, short>::value), void>::type
-mulc_scale_simd_common_impl(const SRC* inx, DST* outx,
-                            const v_float32& sc, const v_float32& scale,
+arithmOpScalarScaled_simd_common_impl(oper_tag op, const SRC* inx, DST* outx,
+                                      const v_float32& v_scalar, const v_float32& v_scale,
                                      const int nlanes)
 {
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes/2]);

-    v_store_i16(outx, v_round(scale * a1* sc), v_round(scale * a2* sc));
+    v_store_i16(outx, v_round(oper_scaled(op, a1, v_scalar, v_scale)), v_round(oper_scaled(op, a2, v_scalar, v_scale)));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
-                                                  uchar* outx, const v_float32& sc,
-                                                  const v_float32& scale, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_common_impl(oper_tag op, const SRC* inx,
+                                                            uchar* outx, const v_float32& v_scalar,
+                                                            const v_float32& v_scale, const int nlanes)
 {
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
    v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
    v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);

-    vx_store(outx, v_pack_u(v_pack(v_round(scale * a1* sc),
-                                   v_round(scale * a2* sc)),
-                            v_pack(v_round(scale * a3* sc),
-                                   v_round(scale * a4* sc))));
+    vx_store(outx, v_pack_u(v_pack(v_round(oper_scaled(op, a1, v_scalar, v_scale)),
+                                   v_round(oper_scaled(op, a2, v_scalar, v_scale))),
+                            v_pack(v_round(oper_scaled(op, a3, v_scalar, v_scale)),
+                                   v_round(oper_scaled(op, a4, v_scalar, v_scale)))));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
-                                                  float* outx, const v_float32& sc,
-                                                  const v_float32& scale, const int)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_common_impl(oper_tag op, const SRC* inx,
+                                                            float* outx, const v_float32& v_scalar,
+                                                            const v_float32& v_scale, const int)
 {
-    v_float32 a1 = vg_load_f32(inx);
-    vx_store(outx, scale * a1* sc);
+    v_float32 a = vg_load_f32(inx);
+    vx_store(outx, oper_scaled(op, a, v_scalar, v_scale));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[],
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_common(oper_tag op, const SRC in[],
                                                      const float scalar[], DST out[],
-                                            const int length, const float _scale)
+                                                      const int length, const float scale)
 {
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
        return 0;

-    v_float32 _scalar = vx_load(scalar);
-    v_float32 scale = vx_setall_f32(_scale);
+    v_float32 v_scalar = vx_load(scalar);
+    v_float32 v_scale = vx_setall_f32(scale);

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
-            mulc_scale_simd_common_impl(&in[x], &out[x], _scalar, scale, nlanes);
+            arithmOpScalarScaled_simd_common_impl(op, &in[x], &out[x], v_scalar, v_scale, nlanes);
        }

        if (x < length)
@ -1483,6 +1526,8 @@ CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[],
    return x;
 }

+//-------------------------------------------------------------------------------------------------
+
 #define MULC_SIMD(SRC, DST)                                                    \
 int mulc_simd(const SRC in[], const float scalar[], DST out[],                 \
              const int length, const int chan, const float scale)             \
@ -1501,7 +1546,8 @@ int mulc_simd(const SRC in[], const float scalar[], DST out[],                 \
        }                                                                      \
        else                                                                   \
        {                                                                      \
-            return mulc_scale_simd_common(in, scalar, out, length, scale);     \
+            return arithmOpScalarScaled_simd_common(op_t, in, scalar, out,     \
+                   length, scale);                                             \
        }                                                                      \
    }                                                                          \
    case 3:                                                                    \
@ -1513,7 +1559,8 @@ int mulc_simd(const SRC in[], const float scalar[], DST out[],                 \
        }                                                                      \
        else                                                                   \
        {                                                                      \
-            return mulc_scale_simd_c3(in, scalar, out, length, scale);         \
+            return arithmOpScalarScaled_simd_c3(op_t, in, scalar, out,         \
+                                                length, scale);                \
        }                                                                      \
    }                                                                          \
    default:                                                                   \
@ -1542,6 +1589,355 @@ MULC_SIMD(float, float)

 #undef MULC_SIMD

+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<DST, ushort>::value ||
+                         std::is_same<DST, short>::value), int>::type
+divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], DST out[],
+                      const v_float32& v_scalar, const v_float32& v_scale,
+                      const int length)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_float32 v_zero = vx_setzero_f32();
+    v_float32 v_mask = (v_scalar == v_zero);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in[x]);
+            v_float32 a2 = vg_load_f32(&in[x + nlanes/2]);
+
+            v_store_i16(&out[x], v_round(v_select(v_mask, v_zero, div_op(s_tag, a1, v_scalar, v_scale))),
+                                 v_round(v_select(v_mask, v_zero, div_op(s_tag, a2, v_scalar, v_scale))));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[],
+                                           uchar out[], const v_float32& v_scalar,
+                                           const v_float32& v_scale, const int length)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_float32 v_zero = vx_setzero_f32();
+    v_float32 v_mask = (v_scalar == v_zero);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in[x]);
+            v_float32 a2 = vg_load_f32(&in[x + nlanes/4]);
+            v_float32 a3 = vg_load_f32(&in[x + nlanes/2]);
+            v_float32 a4 = vg_load_f32(&in[x + 3 * nlanes/4]);
+
+            vx_store(&out[x], v_pack_u(v_pack(v_round(v_select(v_mask, v_zero, div_op(s_tag, a1, v_scalar, v_scale))),
+                                              v_round(v_select(v_mask, v_zero, div_op(s_tag, a2, v_scalar, v_scale)))),
+                                       v_pack(v_round(v_select(v_mask, v_zero, div_op(s_tag, a3, v_scalar, v_scale))),
+                                              v_round(v_select(v_mask, v_zero, div_op(s_tag, a4, v_scalar, v_scale))))));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[],
+                                           float out[], const v_float32& v_scalar,
+                                           const v_float32& v_scale, const int length)
+{
+    constexpr int nlanes = v_float32::nlanes;
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in[x]);
+            vx_store(&out[x], div_op(s_tag, a1, v_scalar, v_scale));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divc_mask_simd_common(scale_tag_t tag, const SRC in[],
+                                           const float scalar[], DST out[],
+                                           const int length, const float scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 v_scalar = vx_load(scalar);
+    v_float32 v_scale = vx_setall_f32(scale);
+    return divc_simd_common_impl(tag, in, out, v_scalar, v_scale, length);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1,
+                  const v_float32& s2, const v_float32& s3,
+                  const v_float32& v_scale, const int length,
+                  const int nlanes, const int lanes)
+{
+    v_float32 v_zero = vx_setzero_f32();
+    v_float32 v_mask1 = (s1 == v_zero);
+    v_float32 v_mask2 = (s2 == v_zero);
+    v_float32 v_mask3 = (s3 == v_zero);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            v_float32 a1 = vg_load_f32(&in[x]);
+            v_float32 a2 = vg_load_f32(&in[x + nlanes / 2]);
+            v_float32 a3 = vg_load_f32(&in[x + nlanes]);
+            v_float32 a4 = vg_load_f32(&in[x + 3 * nlanes / 2]);
+            v_float32 a5 = vg_load_f32(&in[x + 2 * nlanes]);
+            v_float32 a6 = vg_load_f32(&in[x + 5 * nlanes / 2]);
+
+            arithmOpScalar_pack_store_c3(&out[x], v_round(v_select(v_mask1, v_zero, div_op(s_tag, a1, s1, v_scale))),
+                                               v_round(v_select(v_mask2, v_zero, div_op(s_tag, a2, s2, v_scale))),
+                                               v_round(v_select(v_mask3, v_zero, div_op(s_tag, a3, s3, v_scale))),
+                                               v_round(v_select(v_mask1, v_zero, div_op(s_tag, a4, s1, v_scale))),
+                                               v_round(v_select(v_mask2, v_zero, div_op(s_tag, a5, s2, v_scale))),
+                                               v_round(v_select(v_mask3, v_zero, div_op(s_tag, a6, s3, v_scale))));
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, uchar* out,
+                                       const v_float32& s1, const v_float32& s2,
+                                       const v_float32& s3, const v_float32& v_scale,
+                                       const int length, const int nlanes, const int lanes)
+{
+    v_float32 v_zero = vx_setzero_f32();
+    v_float32 v_mask1 = (s1 == v_zero);
+    v_float32 v_mask2 = (s2 == v_zero);
+    v_float32 v_mask3 = (s3 == v_zero);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            vx_store(&out[x],
+                       v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x]), s1, v_scale))),
+                                       v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + nlanes/4]), s2, v_scale)))),
+                                v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + nlanes/2]), s3, v_scale))),
+                                       v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x + 3*nlanes/4]), s1, v_scale))))));
+
+            vx_store(&out[x + nlanes],
+                        v_pack_u(v_pack(v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + nlanes]), s2, v_scale))),
+                                        v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + 5*nlanes/4]), s3, v_scale)))),
+                                 v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x + 3*nlanes/2]), s1, v_scale))),
+                                        v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + 7*nlanes/4]), s2, v_scale))))));
+
+            vx_store(&out[x + 2 * nlanes],
+                        v_pack_u(v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + 2*nlanes]), s3, v_scale))),
+                                        v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x + 9*nlanes/4]), s1, v_scale)))),
+                                 v_pack(v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + 5*nlanes/2]), s2, v_scale))),
+                                        v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + 11*nlanes/4]), s3, v_scale))))));
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, float* out,
+                                       const v_float32& s1, const v_float32& s2,
+                                       const v_float32& s3, const v_float32& v_scale, const int length,
+                                       const int nlanes, const int lanes)
+{
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            v_float32 a1 = vg_load_f32(&in[x]);
+            v_float32 a2 = vg_load_f32(&in[x + nlanes]);
+            v_float32 a3 = vg_load_f32(&in[x + 2*nlanes]);
+
+            vx_store(&out[x], div_op(s_tag, a1, s1, v_scale));
+            vx_store(&out[x + nlanes], div_op(s_tag, a2, s2, v_scale));
+            vx_store(&out[x + 2*nlanes], div_op(s_tag, a3, s3, v_scale));
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divc_mask_simd_c3(scale_tag_t s_tag, const SRC in[],
+                                       const float scalar[], DST out[],
+                                       const int length, const float scale)
+{
+    constexpr int chan = 3;
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    constexpr int lanes = chan * nlanes;
+
+    if (length < lanes)
+        return 0;
+
+    v_float32 v_scale = vx_setall_f32(scale);
+
+    v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+    v_float32 s2 = vx_load(&scalar[2]);
+    v_float32 s3 = vx_load(&scalar[1]);
+#else
+    v_float32 s2 = vx_load(&scalar[1]);
+    v_float32 s3 = vx_load(&scalar[2]);
+#endif
+     return divc_simd_c3_impl(s_tag, in, out, s1, s2, s3, v_scale, length, nlanes, lanes);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+#define DIVC_SIMD(SRC, DST)                                                    \
+int divc_simd(const SRC in[], const float scalar[], DST out[],                 \
+              const int length, const int chan, const float scale,             \
+              const int set_mask_flag)                                         \
+{                                                                              \
+    switch (chan)                                                              \
+    {                                                                          \
+    case 1:                                                                    \
+    case 2:                                                                    \
+    case 4:                                                                    \
+    {                                                                          \
+        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                            \
+        {                                                                      \
+            if (set_mask_flag == 1)                                            \
+                return divc_mask_simd_common(not_scale_tag{}, in, scalar,      \
+                                             out, length, scale);              \
+            else                                                               \
+                return arithmOpScalar_simd_common(div_tag{}, in, scalar,       \
+                                                  out, length);                \
+        }                                                                      \
+        else                                                                   \
+        {   if (set_mask_flag == 1)                                            \
+                return divc_mask_simd_common(scale_tag{}, in, scalar,          \
+                                             out, length, scale);              \
+            else                                                               \
+                return arithmOpScalarScaled_simd_common(div_tag{}, in, scalar, \
+                                                        out, length, scale);   \
+        }                                                                      \
+    }                                                                          \
+    case 3:                                                                    \
+    {                                                                          \
+        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                            \
+        {                                                                      \
+            if (set_mask_flag == 1)                                            \
+                return divc_mask_simd_c3(not_scale_tag{}, in, scalar,          \
+                                             out, length, scale);              \
+            else                                                               \
+                return arithmOpScalar_simd_c3(div_tag{}, in, scalar,           \
+                                              out, length);                    \
+        }                                                                      \
+        else                                                                   \
+        {                                                                      \
+            if (set_mask_flag == 1)                                            \
+                return divc_mask_simd_c3(scale_tag{}, in, scalar,              \
+                                         out, length, scale);                  \
+            else                                                               \
+                return arithmOpScalarScaled_simd_c3(div_tag{}, in, scalar, out,\
+                                                    length, scale);            \
+        }                                                                      \
+    }                                                                          \
+    default:                                                                   \
+        GAPI_Assert(chan <= 4);                                                \
+        break;                                                                 \
+    }                                                                          \
+    return 0;                                                                  \
+}
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short)
+DIVC_SIMD(uchar, short)
+DIVC_SIMD(float, short)
+DIVC_SIMD(ushort, ushort)
+DIVC_SIMD(uchar, ushort)
+DIVC_SIMD(short, ushort)
+DIVC_SIMD(float, ushort)
+DIVC_SIMD(uchar, float)
+DIVC_SIMD(ushort, float)
+DIVC_SIMD(short, float)
+DIVC_SIMD(float, float)
+
+#undef DIVC_SIMD
+
 //-------------------------
 //
 // Fluid kernels: AbsDiffC