Merge pull request #20914 from anna-khakimova:ak/simd_div

GAPI Fluid: SIMD Div kernel. * HAL implementation for Div kernel * Removed dbg lines * Applied comments. * Reworked * Final version
2025-06-16 23:00:51 +08:00 · 2021-11-15 20:16:25 +03:00 · 2021-11-15 20:16:25 +03:00 · b19697e3ac
commit b19697e3ac
parent 2b2e515a30
11 changed files with 648 additions and 18 deletions
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@ -123,6 +123,7 @@ set(gapi_srcs
    src/backends/fluid/gfluidimgproc.cpp
    src/backends/fluid/gfluidimgproc_func.dispatch.cpp
    src/backends/fluid/gfluidcore.cpp
 	  src/backends/fluid/gfluidcore_func.dispatch.cpp
    # OCL Backend (currently built-in)
    src/backends/ocl/goclbackend.cpp
@ -188,6 +189,7 @@ set(gapi_srcs
    )
 ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2)
 ocv_add_dispatched_file(backends/fluid/gfluidcore_func SSE4_1 AVX2)
 ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/")
--- a/modules/gapi/include/opencv2/gapi/core.hpp
+++ b/modules/gapi/include/opencv2/gapi/core.hpp
@ -770,7 +770,10 @@ GAPI_EXPORTS GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth =
 The function divides one matrix by another:
 \f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f]
-When src2(I) is zero, dst(I) will also be zero. Different channels of
+For integer types when src2(I) is zero, dst(I) will also be zero.
 For floating-point types, division by zero returns Inf/NaN (according to IEEE 754).
 Different channels of
 multi-channel matrices are processed independently.
 The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@ -35,7 +35,7 @@ namespace opencv_test
    class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
    class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
    class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
-    class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
    class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
    class DivRCPerfTest : public TestPerfParams<tuple<compare_f,cv::Size, MatType, int, cv::GCompileArgs>> {};
    class MaskPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@ -323,17 +323,23 @@ PERF_TEST_P_(DivPerfTest, TestPerformance)
    Size sz = get<1>(GetParam());
    MatType type = get<2>(GetParam());
    int dtype = get<3>(GetParam());
-    cv::GCompileArgs compile_args = get<4>(GetParam());
+    double scale = get<4>(GetParam());
    cv::GCompileArgs compile_args = get<5>(GetParam());
    // FIXIT Unstable input data for divide
    initMatsRandU(type, sz, dtype, false);
    // This condition is needed to work around a bug in OpenCV:
    // it reinitializes the divider matrix so that it contains no zero values.
    if (dtype == CV_16S && dtype != type)
        cv::randu(in_mat2, cv::Scalar::all(1), cv::Scalar::all(255));
    // OpenCV code ///////////////////////////////////////////////////////////
-    cv::divide(in_mat1, in_mat2, out_mat_ocv, dtype);
+    cv::divide(in_mat1, in_mat2, out_mat_ocv, scale, dtype);
    // G-API code ////////////////////////////////////////////////////////////
    cv::GMat in1, in2, out;
-    out = cv::gapi::div(in1, in2, dtype);
+    out = cv::gapi::div(in1, in2, scale, dtype);
    cv::GComputation c(GIn(in1, in2), GOut(out));
    // Warm-up graph engine:
@ -347,8 +353,9 @@ PERF_TEST_P_(DivPerfTest, TestPerformance)
    }
    // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    {
-    EXPECT_EQ(out_mat_gapi.size(), sz);
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
    }
    SANITY_CHECK_NOTHING();
 }
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@ -67,7 +67,8 @@ INSTANTIATE_TEST_CASE_P(DivPerfTestCPU, DivPerfTest,
    Combine(Values(AbsExact().to_compare_f()),
        Values(szSmall128, szVGA, sz720p, sz1080p),
        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
        Values(2.3),
        Values(cv::compile_args(CORE_CPU))));
 INSTANTIATE_TEST_CASE_P(DivCPerfTestCPU, DivCPerfTest,
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@ -60,12 +60,13 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
 //         Values(-1, CV_8U, CV_16U, CV_32F),
 //         Values(cv::compile_args(CORE_FLUID))));
-// INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
+ INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
-//     Combine(Values(AbsExact().to_compare_f()),
+     Combine(Values(AbsExact().to_compare_f()),
-//         Values(szSmall128, szVGA, sz720p, sz1080p),
+         Values(szSmall128, szVGA, sz720p, sz1080p),
-//         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-//         Values(-1, CV_8U, CV_16U, CV_32F),
+         Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
-//         Values(cv::compile_args(CORE_FLUID))));
+         Values(2.3),
         Values(cv::compile_args(CORE_FLUID))));
 // INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest,
 //     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@ -62,10 +62,11 @@ INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest,
                                Values(cv::compile_args(CORE_GPU))));
 INSTANTIATE_TEST_CASE_P(DivPerfTestGPU, DivPerfTest,
-                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                        Combine(Values(AbsTolerance(2).to_compare_f()),
                                Values( szSmall128, szVGA, sz720p, sz1080p ),
                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
                                Values( -1, CV_8U, CV_16U, CV_32F ),
                                Values(2.3),
                                Values(cv::compile_args(CORE_GPU))));
 INSTANTIATE_TEST_CASE_P(DivCPerfTestGPU, DivCPerfTest,
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@ -13,6 +13,10 @@
 #include <opencv2/core/hal/hal.hpp>
 #include <opencv2/core/hal/intrin.hpp>
 #if CV_SIMD
 #include "gfluidcore_func.hpp"
 #endif
 #include <opencv2/gapi/core.hpp>
 #include <opencv2/gapi/fluid/gfluidbuffer.hpp>
@ -82,13 +86,25 @@ static inline DST mul(SRC1 x, SRC2 y, float scale=1)
 }
 template<typename DST, typename SRC1, typename SRC2>
-static inline DST div(SRC1 x, SRC2 y, float scale=1)
+static inline
 typename std::enable_if<!std::is_same<DST, float>::value, DST>::type
 div(SRC1 x, SRC2 y, float scale=1)
 {
-    // like OpenCV: returns 0, if y=0
+    // like OpenCV: returns 0, if DST type=uchar/short/ushort and divider(y)=0
    auto result = y? scale * x / y: 0;
    return saturate<DST>(result, rintf);
 }
 // Scalar division for a float destination.
 // Unlike the integer-destination overload there is no zero-divider guard, so
 // y == 0 produces inf/nan (same as OpenCV for floating-point output).
 template<typename DST, typename SRC1, typename SRC2>
 static inline
 typename std::enable_if<std::is_same<DST, float>::value, DST>::type
 div(SRC1 x, SRC2 y, float scale = 1)
 {
    auto quotient = (scale * x) / y;
    return saturate<DST>(quotient, rintf);
 }
 template<typename DST, typename SRC1, typename SRC2>
 static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 {
@ -626,7 +642,7 @@ CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int l
    return 0;
 }
-#endif
+#endif // CV_SIMD
 template<typename DST, typename SRC1, typename SRC2>
 static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm,
@ -672,9 +688,14 @@ static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm a
                out[x] = mul<DST>(in1[x], in2[x], _scale);
            break;
        case ARITHM_DIVIDE:
        {
 #if CV_SIMD
            x = div_simd(in1, in2, out, length, scale);
 #endif
            for (; x < length; ++x)
                out[x] = div<DST>(in1[x], in2[x], _scale);
            break;
        }
        default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
    }
 }
@ -744,10 +765,19 @@ GAPI_FLUID_KERNEL(GFluidDiv, cv::gapi::core::GDiv, false)
    {
        //      DST     SRC1    SRC2    OP          __VA_ARGS__
        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(uchar,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( short, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( short,  uchar,  uchar, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( short,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(ushort, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(ushort,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(ushort,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( float, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@ -0,0 +1,63 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
 // Copyright (C) 2021 Intel Corporation
 #if !defined(GAPI_STANDALONE)
 #include "gfluidcore_func.hpp"
 #include "gfluidcore_func.simd.hpp"
 #include "backends/fluid/gfluidcore_func.simd_declarations.hpp"
 #include "gfluidutils.hpp"
 #include <opencv2/core/cvdef.h>
 #include <opencv2/core/hal/intrin.hpp>
 #include <cmath>
 #include <cstdlib>
 #ifdef __GNUC__
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
 #endif
 namespace cv {
 namespace gapi {
 namespace fluid {
 // Defines cv::gapi::fluid::div_simd() for one (SRC, DST) element-type pair.
 // The generated function has no logic of its own: it forwards every argument
 // through CV_CPU_DISPATCH to the div_simd implementation selected from
 // CV_CPU_DISPATCH_MODES_ALL.
 #define DIV_SIMD(SRC, DST)                                                  \
 int div_simd(const SRC in1[], const SRC in2[], DST out[],                   \
             const int length, double _scale)                               \
 {                                                                           \
    CV_CPU_DISPATCH(div_simd, (in1, in2, out, length, _scale),              \
                    CV_CPU_DISPATCH_MODES_ALL);                             \
 }
 DIV_SIMD(uchar, uchar)
 DIV_SIMD(ushort, uchar)
 DIV_SIMD(short, uchar)
 DIV_SIMD(float, uchar)
 DIV_SIMD(short, short)
 DIV_SIMD(ushort, short)
 DIV_SIMD(uchar, short)
 DIV_SIMD(float, short)
 DIV_SIMD(ushort, ushort)
 DIV_SIMD(uchar, ushort)
 DIV_SIMD(short, ushort)
 DIV_SIMD(float, ushort)
 DIV_SIMD(uchar, float)
 DIV_SIMD(ushort, float)
 DIV_SIMD(short, float)
 DIV_SIMD(float, float)
 #undef DIV_SIMD
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
 #endif // !defined(GAPI_STANDALONE)
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@ -0,0 +1,44 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
 // Copyright (C) 2021 Intel Corporation
 #pragma once
 #if !defined(GAPI_STANDALONE)
 #include <opencv2/core.hpp>
 namespace cv {
 namespace gapi {
 namespace fluid {
 // Declares div_simd() for one (SRC, DST) element-type pair.
 //   in1, in2 - dividend and divider rows
 //   out      - destination row
 //   length   - number of elements in each row
 //   _scale   - scale factor applied to the quotient
 // The returned int is used by the caller as the index from which the scalar
 // fallback loop continues.
 #define DIV_SIMD(SRC, DST)                                       \
 int div_simd(const SRC in1[], const SRC in2[], DST out[],        \
             const int length, double _scale);
 DIV_SIMD(uchar, uchar)
 DIV_SIMD(ushort, uchar)
 DIV_SIMD(short, uchar)
 DIV_SIMD(float, uchar)
 DIV_SIMD(short, short)
 DIV_SIMD(ushort, short)
 DIV_SIMD(uchar, short)
 DIV_SIMD(float, short)
 DIV_SIMD(ushort, ushort)
 DIV_SIMD(uchar, ushort)
 DIV_SIMD(short, ushort)
 DIV_SIMD(float, ushort)
 DIV_SIMD(uchar, float)
 DIV_SIMD(ushort, float)
 DIV_SIMD(short, float)
 DIV_SIMD(float, float)
 #undef DIV_SIMD
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
 #endif // !defined(GAPI_STANDALONE)
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@ -0,0 +1,478 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
 // Copyright (C) 2021 Intel Corporation
 // NB: allow including this *.hpp several times!
 // #pragma once -- don't: this file is NOT once!
 #if !defined(GAPI_STANDALONE)
 #include "opencv2/gapi/own/saturate.hpp"
 #include "opencv2/core.hpp"
 #include <opencv2/core/hal/intrin.hpp>
 #include <opencv2/core/hal/hal.hpp>
 #include <cstdint>
 #include <cstring>
 #include <algorithm>
 #include <limits>
 #include <vector>
 #ifdef __GNUC__
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
 #endif
 using cv::gapi::own::saturate;
 namespace cv {
 namespace gapi {
 namespace fluid {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 // Forward declaration of div_simd() for one (SRC, DST) element-type pair.
 // It is emitted inside the CV_CPU_OPTIMIZATION_NAMESPACE block, so this header
 // can be included several times (see the note at the top of the file); the
 // definitions follow under !CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY.
 #define DIV_SIMD(SRC, DST)                                     \
 int div_simd(const SRC in1[], const SRC in2[], DST out[],      \
             const int length, double _scale);
 DIV_SIMD(uchar, uchar)
 DIV_SIMD(ushort, uchar)
 DIV_SIMD(short, uchar)
 DIV_SIMD(float, uchar)
 DIV_SIMD(short, short)
 DIV_SIMD(ushort, short)
 DIV_SIMD(uchar, short)
 DIV_SIMD(float, short)
 DIV_SIMD(ushort, ushort)
 DIV_SIMD(uchar, ushort)
 DIV_SIMD(short, ushort)
 DIV_SIMD(float, ushort)
 DIV_SIMD(uchar, float)
 DIV_SIMD(ushort, float)
 DIV_SIMD(short, float)
 DIV_SIMD(float, float)
 #undef DIV_SIMD
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 // Compile-time tags selecting which div_op() overload is used:
 // scale_tag -> a * scale / div, not_scale_tag -> a / div.
 struct scale_tag {};
 struct not_scale_tag {};
 // Maps a scalar element type to the universal-intrinsic vector type holding it.
 // Only the specializations listed below are defined.
 template<typename T>
 struct vector_type_of;
 // Shorthand for vector_type_of<T>::type.
 template<typename T>
 using vector_type_of_t = typename vector_type_of<T>::type;
 template<>
 struct vector_type_of<uchar>
 {
    using type = v_uint8;
 };
 template<>
 struct vector_type_of<ushort>
 {
    using type = v_uint16;
 };
 template<>
 struct vector_type_of<short>
 {
    using type = v_int16;
 };
 // vg_load_f32: load one v_float32 worth of elements from `in`, converting the
 // source element type to float where necessary.
 // float source: plain vector load, no conversion.
 CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
 {
    const v_float32 loaded = vx_load(in);
    return loaded;
 }
 // ushort source: widen to 32-bit lanes, then convert to float.
 CV_ALWAYS_INLINE v_float32 vg_load_f32(const ushort* in)
 {
    const v_uint32 widened = vx_load_expand(in);
    return v_cvt_f32(v_reinterpret_as_s32(widened));
 }
 // short source: widen to 32-bit lanes, then convert to float.
 CV_ALWAYS_INLINE v_float32 vg_load_f32(const short* in)
 {
    const v_int32 widened = vx_load_expand(in);
    return v_cvt_f32(v_reinterpret_as_s32(widened));
 }
 // uchar source: quad-widen to 32-bit lanes, then convert to float.
 CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
 {
    const v_uint32 widened = vx_load_expand_q(in);
    return v_cvt_f32(v_reinterpret_as_s32(widened));
 }
 // Scaled division: computes (a * scale) / div lane-wise.
 CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
 {
    const v_float32 scaled = a * scale;
    return scaled / div;
 }
 // Unscaled division: computes a / div lane-wise; the scale argument exists only
 // so both overloads share one call shape and is ignored here.
 CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&)
 {
    const v_float32 quotient = a / div;
    return quotient;
 }
 // v_store_div: narrow two vectors of rounded 32-bit quotients into a single
 // 16-bit vector and store it at dst.
 //   dst        - destination pointer; one full 16-bit vector is written
 //   res1, res2 - quotients for the low and high halves of the output vector
 // The inputs are only read, so they are taken by const reference; this matches
 // v_store_select and also allows callers to pass temporaries.
 // short destination: signed pack.
 CV_ALWAYS_INLINE void v_store_div(short* dst, const v_int32& res1, const v_int32& res2)
 {
    vx_store(dst, v_pack(res1, res2));
 }
 // ushort destination: unsigned pack (v_pack_u).
 CV_ALWAYS_INLINE void v_store_div(ushort* dst, const v_int32& res1, const v_int32& res2)
 {
    vx_store(dst, v_pack_u(res1, res2));
 }
 // v_store_select: narrow two vectors of rounded 32-bit quotients to 16 bits,
 // force every lane whose divider is zero to 0, and store the result at dst.
 //   dst        - destination pointer; one full 16-bit vector is written
 //   div        - the 16-bit dividers; used only to build the "== 0" lane mask
 //   v_zero     - all-zero vector (comparison operand and replacement value)
 //   res1, res2 - quotients for the low and high halves of the output vector
 // short destination: signed pack.
 CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero,
                                     const v_int32& res1, const v_int32& res2)
 {
    vx_store(dst, v_select(div == v_zero, v_zero, v_pack(res1, res2)));
 }
 // ushort destination: pack with v_pack_u so quotients are clamped to the
 // unsigned 16-bit range, exactly as v_store_div(ushort*) does. A signed
 // v_pack followed by a reinterpret would clamp at the short range instead and
 // let negative quotients wrap around to large unsigned values.
 CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
                                     const v_int32& res1, const v_int32& res2)
 {
    // The zero test only looks at the bit pattern, so reinterpreting the mask
    // and the zero vector as unsigned is lossless.
    const v_uint16 zero_mask = v_reinterpret_as_u16(div == v_zero);
    const v_uint16 packed = v_pack_u(res1, res2);
    vx_store(dst, v_select(zero_mask, v_reinterpret_as_u16(v_zero), packed));
 }
 //=================================================================================================
 // SIMD division for 16-bit sources with a 16-bit destination of different
 // signedness combinations: short->ushort, ushort->ushort, ushort->short.
 //   t        - scale_tag / not_scale_tag, selects the div_op() overload
 //   in1, in2 - dividend and divider rows
 //   out      - destination row
 //   length   - number of elements per row
 //   _scale   - scale factor (narrowed to float)
 // Returns 0 when the row is shorter than one vector (nothing is written),
 // otherwise `length`. Lanes whose divider is zero are written as 0.
 template<typename scale_tag_t, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
 div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
 {
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
    if (length < nlanes)
        return 0;
    v_int16 v_zero = vx_setall_s16(0);
    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            v_float32 a1 = vg_load_f32(&in1[x]);
            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
            // Raw 16-bit dividers, used only for the "== 0" mask. Reinterpreting
            // ushort as s16 is harmless here because only the bit pattern matters.
            v_int16 div = v_reinterpret_as_s16(vx_load(&in2[x]));
            // Convert the dividers with the SRC-aware loader, the same way the
            // dividends are converted. Expanding the reinterpreted s16 vector
            // would sign-extend and turn ushort dividers >= 32768 negative.
            v_float32 fdiv1 = vg_load_f32(&in2[x]);
            v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 2]);
            v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
            v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
            v_store_select(&out[x], div, v_zero, r1, r2);
        }
        if (x < length)
        {
            // Fewer than nlanes elements remain: step back so the last full
            // vector ends exactly at `length` and run the loop body once more.
            x = length - nlanes;
            continue;  // process one more time (unaligned tail)
        }
        break;
    }
    return x;
 }
 //-------------------------------------------------------------------------------------------------
 // SIMD division for short/ushort sources with a uchar destination.
 //   t        - scale_tag / not_scale_tag, selects the div_op() overload
 //   in1, in2 - dividend and divider rows
 //   out      - destination row
 //   length   - number of elements per row
 //   _scale   - scale factor (narrowed to float)
 // Returns 0 when the row is shorter than one v_uint8 vector (nothing is
 // written), otherwise `length`. Lanes whose divider is zero are written as 0.
 template<typename scale_tag_t, typename SRC>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, short>::value ||
                        std::is_same<SRC, ushort>::value, int>::type
 div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
 {
    constexpr int nlanes = v_uint8::nlanes;
    if (length < nlanes)
        return 0;
    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
    v_int16 v_zero = vx_setall_s16(0);
    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            v_float32 a1 = vg_load_f32(&in1[x]);
            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
            // Raw 16-bit dividers, used only for the "== 0" masks. Reinterpreting
            // ushort as s16 is harmless here because only the bit pattern matters.
            v_int16 div1 = v_reinterpret_as_s16(vx_load(&in2[x]));
            v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes/2]));
            // Convert the dividers with the SRC-aware loader, the same way the
            // dividends are converted. Expanding the reinterpreted s16 vectors
            // would sign-extend and turn ushort dividers >= 32768 negative.
            v_float32 fdiv1 = vg_load_f32(&in2[x]);
            v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 4]);
            v_float32 fdiv3 = vg_load_f32(&in2[x + nlanes / 2]);
            v_float32 fdiv4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
            v_int32 sum1 = v_round(div_op(t, a1, fdiv1, scale)),
                    sum2 = v_round(div_op(t, a2, fdiv2, scale)),
                    sum3 = v_round(div_op(t, a3, fdiv3, scale)),
                    sum4 = v_round(div_op(t, a4, fdiv4, scale));
            v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
            v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
            vx_store(&out[x], v_pack_u(res1, res2));
        }
        if (x < length)
        {
            // Fewer than nlanes elements remain: step back so the last full
            // vector ends exactly at `length` and run the loop body once more.
            x = length - nlanes;
            continue;  // process one more time (unaligned tail)
        }
        break;
    }
    return x;
 }
 //-------------------------------------------------------------------------------------------------
 // SIMD division for float sources with a uchar destination.
 // Each iteration handles one v_uint8 vector of output, built from four
 // v_float32 quotients. Lanes whose divider equals zero are replaced by 0
 // before rounding. Returns 0 if the row is shorter than one vector,
 // otherwise `length`.
 template<typename scale_tag_t>
 CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
                             const int length, double _scale)
 {
    constexpr int nlanes = v_uint8::nlanes;
    if (length < nlanes)
        return 0;
    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    const v_float32 fzero = vx_setall_f32(0);
    int x = 0;
    while (true)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            // Dividers for the four quarters of the output vector.
            const v_float32 d1 = vg_load_f32(&in2[x]);
            const v_float32 d2 = vg_load_f32(&in2[x + nlanes / 4]);
            const v_float32 d3 = vg_load_f32(&in2[x + nlanes / 2]);
            const v_float32 d4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
            // Quotients for the matching dividends.
            const v_float32 q1 = div_op(t, vg_load_f32(&in1[x]), d1, vscale);
            const v_float32 q2 = div_op(t, vg_load_f32(&in1[x + nlanes / 4]), d2, vscale);
            const v_float32 q3 = div_op(t, vg_load_f32(&in1[x + nlanes / 2]), d3, vscale);
            const v_float32 q4 = div_op(t, vg_load_f32(&in1[x + 3 * nlanes / 4]), d4, vscale);
            // Zero out division-by-zero lanes, then round to integers.
            const v_int32 i1 = v_round(v_select((d1 == fzero), fzero, q1));
            const v_int32 i2 = v_round(v_select((d2 == fzero), fzero, q2));
            const v_int32 i3 = v_round(v_select((d3 == fzero), fzero, q3));
            const v_int32 i4 = v_round(v_select((d4 == fzero), fzero, q4));
            vx_store(&out[x], v_pack_u(v_pack(i1, i2), v_pack(i3, i4)));
        }
        if (x >= length)
            break;
        // Unaligned tail: redo the last full vector so it ends at `length`.
        x = length - nlanes;
    }
    return x;
 }
 //-------------------------------------------------------------------------------------------------
 // SIMD division for uchar sources with a short/ushort destination.
 // The dividers are widened to 16 bits both to build the "== 0" mask and to
 // feed the float conversion; zero-divider lanes are written as 0 by
 // v_store_select. Returns 0 if the row is shorter than one vector,
 // otherwise `length`.
 template<typename scale_tag_t, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, int>::type
 div_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
 {
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
    if (length < nlanes)
        return 0;
    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    const v_int16 zero16 = vx_setall_s16(0);
    int x = 0;
    while (true)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            const v_float32 num_lo = vg_load_f32(&in1[x]);
            const v_float32 num_hi = vg_load_f32(&in1[x + nlanes / 2]);
            const v_int16 div16 = v_reinterpret_as_s16(vx_load_expand(&in2[x]));
            const v_float32 den_lo = v_cvt_f32(v_expand_low(div16));
            const v_float32 den_hi = v_cvt_f32(v_expand_high(div16));
            const v_int32 q_lo = v_round(div_op(t, num_lo, den_lo, vscale));
            const v_int32 q_hi = v_round(div_op(t, num_hi, den_hi, vscale));
            v_store_select(&out[x], div16, zero16, q_lo, q_hi);
        }
        if (x >= length)
            break;
        // Unaligned tail: redo the last full vector so it ends at `length`.
        x = length - nlanes;
    }
    return x;
 }
 //-------------------------------------------------------------------------------------------------
 // SIMD division for float sources with a short/ushort destination.
 // Lanes whose divider equals zero are replaced by 0 before rounding; the
 // rounded halves are packed and stored by v_store_div. Returns 0 if the row is
 // shorter than one vector, otherwise `length`.
 template<typename scale_tag_t, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, int>::type
 div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
 {
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
    if (length < nlanes)
        return 0;
    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    const v_float32 fzero = vx_setall_f32(0);
    int x = 0;
    while (true)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            const v_float32 den_lo = vg_load_f32(&in2[x]);
            const v_float32 den_hi = vg_load_f32(&in2[x + nlanes / 2]);
            const v_float32 q_lo = div_op(t, vg_load_f32(&in1[x]), den_lo, vscale);
            const v_float32 q_hi = div_op(t, vg_load_f32(&in1[x + nlanes / 2]), den_hi, vscale);
            // Kept as non-const lvalues: v_store_div takes its inputs by reference.
            v_int32 lo = v_round(v_select((den_lo == fzero), fzero, q_lo));
            v_int32 hi = v_round(v_select((den_hi == fzero), fzero, q_hi));
            v_store_div(&out[x], lo, hi);
        }
        if (x >= length)
            break;
        // Unaligned tail: redo the last full vector so it ends at `length`.
        x = length - nlanes;
    }
    return x;
 }
 //-------------------------------------------------------------------------------------------------
 // SIMD division for any supported source type with a float destination.
 // No zero-divider masking is applied here: the quotient produced by div_op()
 // is stored as is. Returns 0 if the row is shorter than one v_float32 vector,
 // otherwise `length`.
 template<typename scale_tag_t, typename SRC>
 CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
                             const int length, double _scale)
 {
    constexpr int nlanes = v_float32::nlanes;
    if (length < nlanes)
        return 0;
    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    int x = 0;
    while (true)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            const v_float32 num = vg_load_f32(&in1[x]);
            const v_float32 den = vg_load_f32(&in2[x]);
            vx_store(&out[x], div_op(t, num, den, vscale));
        }
        if (x >= length)
            break;
        // Unaligned tail: redo the last full vector so it ends at `length`.
        x = length - nlanes;
    }
    return x;
 }
 //-------------------------------------------------------------------------------------------------
 // uchar / uchar -> uchar: the whole row is handed to hal::div8u, so the tag
 // argument is unused and `length` is always returned.
 // NOTE(review): `length` is passed for the three size_t arguments and 1 for
 // the argument after it - presumably strides and a single-row height; confirm
 // against the hal::div8u declaration.
 template<typename scale_tag_t>
 CV_ALWAYS_INLINE int div_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
                             const int length, double scale)
 {
    const size_t step = static_cast<size_t>(length);
    hal::div8u(in1, step, in2, step, out, step, length, 1, &scale);
    return length;
 }
 // short / short -> short: same delegation, using hal::div16s.
 template<typename scale_tag_t>
 CV_ALWAYS_INLINE int div_hal(scale_tag_t, const short in1[], const short in2[], short out[],
                             const int length, double scale)
 {
    const size_t step = static_cast<size_t>(length);
    hal::div16s(in1, step, in2, step, out, step, length, 1, &scale);
    return length;
 }
 //-------------------------------------------------------------------------------------------------
 // Defines div_simd() for one (SRC, DST) element-type pair.
 // The scale is narrowed to float; when it is within FLT_EPSILON of 1 the
 // not_scale_tag variant of div_hal() is used, otherwise the scale_tag variant.
 // The value returned by div_hal() is passed through unchanged.
 #define DIV_SIMD(SRC, DST)                                                      \
 int div_simd(const SRC in1[], const SRC in2[], DST out[],                       \
                              const int length, double _scale)                  \
 {                                                                               \
    const float fscale = static_cast<float>(_scale);                            \
    const bool unit_scale = std::fabs(fscale - 1.0f) <= FLT_EPSILON;            \
    if (unit_scale)                                                             \
        return div_hal(not_scale_tag{}, in1, in2, out, length, _scale);         \
    return div_hal(scale_tag{}, in1, in2, out, length, _scale);                 \
 }
 DIV_SIMD(uchar, uchar)
 DIV_SIMD(ushort, uchar)
 DIV_SIMD(short, uchar)
 DIV_SIMD(float, uchar)
 DIV_SIMD(short, short)
 DIV_SIMD(ushort, short)
 DIV_SIMD(uchar, short)
 DIV_SIMD(float, short)
 DIV_SIMD(ushort, ushort)
 DIV_SIMD(uchar, ushort)
 DIV_SIMD(short, ushort)
 DIV_SIMD(float, ushort)
 DIV_SIMD(uchar, float)
 DIV_SIMD(ushort, float)
 DIV_SIMD(short, float)
 DIV_SIMD(float, float)
 #undef DIV_SIMD
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
 #endif // !defined(GAPI_STANDALONE)