diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp index c644fd1587..83ef13008c 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp @@ -436,7 +436,7 @@ PERF_TEST_P_(DivPerfTest, TestPerformance) // FIXIT Unstable input data for divide initMatsRandU(type, sz, dtype, false); - //This condition need to workaround issue in the OpenCV. + //This condition is needed to work around issue #21044 in OpenCV. //It reinitializes divider matrix without zero values for CV_16S DST type. if (dtype == CV_16S && dtype != type) cv::randu(in_mat2, cv::Scalar::all(1), cv::Scalar::all(255)); @@ -482,7 +482,7 @@ PERF_TEST_P_(DivCPerfTest, TestPerformance) // FIXIT Unstable input data for divide initMatsRandU(type, sz, dtype, false); - //This condition need as workaround the issue in the OpenCV. + //This condition is needed to work around issue #21044 in OpenCV. //It reinitializes divider scalar without zero values for CV_16S DST type. if (dtype == CV_16S || (type == CV_16S && dtype == -1)) cv::randu(sc, cv::Scalar::all(1), cv::Scalar::all(SHRT_MAX)); @@ -528,7 +528,7 @@ PERF_TEST_P_(DivRCPerfTest, TestPerformance) // FIXIT Unstable input data for divide initMatsRandU(type, sz, dtype, false); - //This condition need as workaround the bug in the OpenCV. + //This condition is needed to work around issue #21044 in OpenCV. //It reinitializes divider matrix without zero values for CV_16S DST type. 
if (dtype == CV_16S || (type == CV_16S && dtype == -1)) cv::randu(in_mat1, cv::Scalar::all(1), cv::Scalar::all(255)); diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index d91ce65fff..e4b8c0b490 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -40,10 +40,10 @@ INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest, Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest, - Combine(Values(AbsExact().to_compare_f()), + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 0).to_compare_f()), Values(szSmall128, szVGA, sz720p, sz1080p), - Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1), - Values(-1, CV_8U, CV_32F), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest, diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index 866381f39b..c5cfc19d48 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -378,141 +378,11 @@ CV_ALWAYS_INLINE int absdiff_simd(const T in1[], const T in2[], T out[], int len return 0; } - -template -CV_ALWAYS_INLINE int sub_simd_sametype(const T in1[], const T in2[], T out[], int length) -{ - constexpr int nlanes = static_cast(VT::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - VT a = vx_load(&in1[x]); - VT b = vx_load(&in2[x]); - vx_store(&out[x], a - b); - } - - if (x < length && (in1 != out) && (in2 != out)) - { - x = length - nlanes; - continue; // process one more time (unaligned tail) - } - break; - } - - return x; -} - -template -CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int length) -{ - if 
(std::is_same::value && !std::is_same::value) - return 0; - - if (std::is_same::value) - { - if (std::is_same::value) - { - return sub_simd_sametype(reinterpret_cast(in1), - reinterpret_cast(in2), - reinterpret_cast(out), length); - } - else if (std::is_same::value) - { - return sub_simd_sametype(reinterpret_cast(in1), - reinterpret_cast(in2), - reinterpret_cast(out), length); - } - else if (std::is_same::value) - { - return sub_simd_sametype(reinterpret_cast(in1), - reinterpret_cast(in2), - reinterpret_cast(out), length); - } - } - else if (std::is_same::value && std::is_same::value) - { - constexpr int nlanes = static_cast(v_uint8::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - v_int16 a1 = vx_load(reinterpret_cast(&in1[x])); - v_int16 a2 = vx_load(reinterpret_cast(&in1[x + nlanes / 2])); - v_int16 b1 = vx_load(reinterpret_cast(&in2[x])); - v_int16 b2 = vx_load(reinterpret_cast(&in2[x + nlanes / 2])); - - vx_store(reinterpret_cast(&out[x]), v_pack_u(a1 - b1, a2 - b2)); - } - - if (x < length) - { - CV_DbgAssert((reinterpret_cast(in1) != reinterpret_cast(out)) && - (reinterpret_cast(in2) != reinterpret_cast(out))); - x = length - nlanes; - continue; // process one more time (unaligned tail) - } - break; - } - - return x; - } - else if (std::is_same::value && std::is_same::value) - { - constexpr int nlanes = static_cast(v_uint8::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - v_float32 a1 = vx_load(reinterpret_cast(&in1[x])); - v_float32 a2 = vx_load(reinterpret_cast(&in1[x + nlanes / 4])); - v_float32 a3 = vx_load(reinterpret_cast(&in1[x + 2 * nlanes / 4])); - v_float32 a4 = vx_load(reinterpret_cast(&in1[x + 3 * nlanes / 4])); - - v_float32 b1 = vx_load(reinterpret_cast(&in2[x])); - v_float32 b2 = vx_load(reinterpret_cast(&in2[x + nlanes / 4])); - v_float32 b3 = vx_load(reinterpret_cast(&in2[x + 2 * nlanes / 
4])); - v_float32 b4 = vx_load(reinterpret_cast(&in2[x + 3 * nlanes / 4])); - - vx_store(reinterpret_cast(&out[x]), v_pack_u(v_pack(v_round(a1 - b1), v_round(a2 - b2)), - v_pack(v_round(a3 - b3), v_round(a4 - b4)))); - } - - if (x < length) - { - CV_DbgAssert((reinterpret_cast(in1) != reinterpret_cast(out)) && - (reinterpret_cast(in2) != reinterpret_cast(out))); - x = length - nlanes; - continue; // process one more time (unaligned tail) - } - break; - } - - return x; - } - - return 0; -} #endif // CV_SIMD template CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2, - Arithm arithm, double scale=1) + Arithm arithm, double scale=1) { static_assert(std::is_same::value, "wrong types"); @@ -607,10 +477,19 @@ GAPI_FLUID_KERNEL(GFluidSub, cv::gapi::core::GSub, false) { // DST SRC1 SRC2 OP __VA_ARGS__ BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT); - BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); - BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); - BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); - BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(uchar, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(uchar, short, short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(uchar, float, float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(short, short, short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(short, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(short, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(short, float, float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(ushort, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(ushort, short, short, run_arithm, dst, src1, 
src2, ARITHM_SUBTRACT); + BINARY_(ushort, float, float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(float, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); + BINARY_(float, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT); diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index d80a6b29c0..c235991fba 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -317,6 +317,33 @@ ADD_SIMD(float, float) #undef ADD_SIMD +#define SUB_SIMD(SRC, DST) \ +int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \ +{ \ + \ + CV_CPU_DISPATCH(sub_simd, (in1, in2, out, length), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +SUB_SIMD(uchar, uchar) +SUB_SIMD(ushort, uchar) +SUB_SIMD(short, uchar) +SUB_SIMD(float, uchar) +SUB_SIMD(short, short) +SUB_SIMD(ushort, short) +SUB_SIMD(uchar, short) +SUB_SIMD(float, short) +SUB_SIMD(ushort, ushort) +SUB_SIMD(uchar, ushort) +SUB_SIMD(short, ushort) +SUB_SIMD(float, ushort) +SUB_SIMD(uchar, float) +SUB_SIMD(ushort, float) +SUB_SIMD(short, float) +SUB_SIMD(float, float) + +#undef SUB_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 052adbe2fd..3a5d70a045 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -244,6 +244,28 @@ ADD_SIMD(float, float) #undef ADD_SIMD +#define SUB_SIMD(SRC, DST) \ +int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length); + +SUB_SIMD(uchar, uchar) +SUB_SIMD(ushort, uchar) +SUB_SIMD(short, uchar) +SUB_SIMD(float, 
uchar) +SUB_SIMD(short, short) +SUB_SIMD(ushort, short) +SUB_SIMD(uchar, short) +SUB_SIMD(float, short) +SUB_SIMD(ushort, ushort) +SUB_SIMD(uchar, ushort) +SUB_SIMD(short, ushort) +SUB_SIMD(float, ushort) +SUB_SIMD(uchar, float) +SUB_SIMD(ushort, float) +SUB_SIMD(short, float) +SUB_SIMD(float, float) + +#undef SUB_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index 4c324daa25..c148f81e77 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -253,6 +253,28 @@ ADD_SIMD(float, float) #undef ADD_SIMD +#define SUB_SIMD(SRC, DST) \ +int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length); + +SUB_SIMD(uchar, uchar) +SUB_SIMD(ushort, uchar) +SUB_SIMD(short, uchar) +SUB_SIMD(float, uchar) +SUB_SIMD(short, short) +SUB_SIMD(ushort, short) +SUB_SIMD(uchar, short) +SUB_SIMD(float, short) +SUB_SIMD(ushort, ushort) +SUB_SIMD(uchar, ushort) +SUB_SIMD(short, ushort) +SUB_SIMD(float, ushort) +SUB_SIMD(uchar, float) +SUB_SIMD(ushort, float) +SUB_SIMD(short, float) +SUB_SIMD(float, float) + +#undef SUB_SIMD + int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], const int width); @@ -2530,32 +2552,43 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], // Fluid kernels: Add // //------------------------- +template +CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b) +{ + return a + b; +} -CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_uint16& c1, const v_uint16& c2) +template +CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b) +{ + return a - b; +} + +CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2) { vx_store(outx, v_pack(c1, c2)); } -CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_int16& c1, const v_int16& c2) 
+CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_int16& c1, const v_int16& c2) { vx_store(outx, v_pack_u(c1, c2)); } -template +template CV_ALWAYS_INLINE typename std::enable_if::value, void>::type -add_simd_impl(const SRC* in1x, const SRC* in2x, DST* outx) +arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, DST* outx) { vector_type_of_t a = vx_load(in1x); vector_type_of_t b = vx_load(in2x); - vx_store(outx, a + b); + vx_store(outx, oper(op, a, b)); } -template +template CV_ALWAYS_INLINE typename std::enable_if::value || std::is_same::value, void>::type -add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx) +arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx) { constexpr int nlanes = v_uint8::nlanes; @@ -2564,10 +2597,12 @@ add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx) vector_type_of_t b1 = vx_load(in2x); vector_type_of_t b2 = vx_load(&in2x[nlanes / 2]); - add_uchar_store(outx, a1 + b1, a2 + b2); + pack_store_uchar(outx, oper(op, a1, b1), oper(op, a2, b2)); } -CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar* outx) +template +CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x, + const float* in2x, uchar* outx) { constexpr int nlanes = v_uint8::nlanes; @@ -2581,31 +2616,35 @@ CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar* v_float32 b3 = vx_load(&in2x[2 * nlanes / 4]); v_float32 b4 = vx_load(&in2x[3 * nlanes / 4]); - vx_store(outx, v_pack_u(v_pack(v_round(a1 + b1), v_round(a2 + b2)), - v_pack(v_round(a3 + b3), v_round(a4 + b4)))); + vx_store(outx, v_pack_u(v_pack(v_round(oper(op, a1, b1)), v_round(oper(op, a2, b2))), + v_pack(v_round(oper(op, a3, b3)), v_round(oper(op, a4, b4))))); } -CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, short* outx) +template +CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const uchar* in1x, + const uchar* in2x, short* outx) { v_int16 a = 
v_reinterpret_as_s16(vx_load_expand(in1x)); v_int16 b = v_reinterpret_as_s16(vx_load_expand(in2x)); - vx_store(outx, a + b); + vx_store(outx, oper(op, a, b)); } -CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, ushort* outx) +template +CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const uchar* in1x, + const uchar* in2x, ushort* outx) { v_uint16 a = vx_load_expand(in1x); v_uint16 b = vx_load_expand(in2x); - vx_store(outx, a + b); + vx_store(outx, oper(op, a, b)); } -template +template CV_ALWAYS_INLINE typename std::enable_if::value || std::is_same::value, void>::type -add_simd_impl(const float* in1x, const float* in2x, DST* outx) +arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx) { constexpr int nlanes = vector_type_of_t::nlanes; v_float32 a1 = vx_load(in1x); @@ -2613,10 +2652,12 @@ add_simd_impl(const float* in1x, const float* in2x, DST* outx) v_float32 b1 = vx_load(in2x); v_float32 b2 = vx_load(&in2x[nlanes/2]); - v_store_i16(outx, v_round(a1 + b1), v_round(a2 + b2)); + v_store_i16(outx, v_round(oper(op, a1, b1)), v_round(oper(op, a2, b2))); } -CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort* outx) +template +CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const short* in1x, + const short* in2x, ushort* outx) { v_int16 a = vx_load(in1x); v_int32 a1 = v_expand_low(a); @@ -2626,57 +2667,66 @@ CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort v_int32 b1 = v_expand_low(b); v_int32 b2 = v_expand_high(b); - vx_store(outx, v_pack_u(a1 + b1, a2 + b2)); + vx_store(outx, v_pack_u(oper(op, a1, b1), oper(op, a2, b2))); } -CV_ALWAYS_INLINE void add_simd_impl(const ushort* in1x, const ushort* in2x, short* outx) +template +CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const ushort* in1x, + const ushort* in2x, short* outx) { - v_uint16 a = vx_load(in1x); - v_uint32 a1 = v_expand_low(a); - v_uint32 a2 = v_expand_high(a); + v_int16 a = 
v_reinterpret_as_s16(vx_load(in1x)); + v_int32 a1 = v_expand_low(a); + v_int32 a2 = v_expand_high(a); - v_uint16 b = vx_load(in2x); - v_uint32 b1 = v_expand_low(b); - v_uint32 b2 = v_expand_high(b); + v_int16 b = v_reinterpret_as_s16(vx_load(in2x)); + v_int32 b1 = v_expand_low(b); + v_int32 b2 = v_expand_high(b); - vx_store(outx, v_reinterpret_as_s16(v_pack(a1 + b1, a2 + b2))); + vx_store(outx, v_pack(oper(op, a1, b1), oper(op, a2, b2))); } -template -CV_ALWAYS_INLINE void add_simd_impl(const SRC* in1x, const SRC* in2x, float* outx) +template +CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, float* outx) { v_float32 a = vg_load_f32(in1x); v_float32 b = vg_load_f32(in2x); - vx_store(outx, a + b); + vx_store(outx, oper(op, a, b)); +} + +template +CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[], + DST out[], const int length) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + arithmOp_simd_impl(op, &in1[x], &in2[x], &out[x]); + } + + if (x < length) + { + x = length - nlanes; + continue; + } + break; + } + + return x; } #define ADD_SIMD(SRC, DST) \ int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ - \ - if (length < nlanes) \ - return 0; \ - \ - int x = 0; \ - for (;;) \ - { \ - for (; x <= length - nlanes; x += nlanes) \ - { \ - add_simd_impl(&in1[x], &in2[x], &out[x]); \ - } \ - \ - if (x < length) \ - { \ - x = length - nlanes; \ - continue; \ - } \ - break; \ - } \ - \ - return x; \ -} + return arithmOp_simd(add_tag{}, in1, in2, out, length); \ +} \ ADD_SIMD(uchar, uchar) ADD_SIMD(ushort, uchar) @@ -2697,6 +2747,37 @@ ADD_SIMD(float, float) #undef ADD_SIMD +//------------------------- +// +// Fluid kernels: Sub +// +//------------------------- + +#define SUB_SIMD(SRC, DST) \ +int 
sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \ +{ \ + return arithmOp_simd(sub_tag{}, in1, in2, out, length); \ +} \ + +SUB_SIMD(uchar, uchar) +SUB_SIMD(ushort, uchar) +SUB_SIMD(short, uchar) +SUB_SIMD(float, uchar) +SUB_SIMD(short, short) +SUB_SIMD(ushort, short) +SUB_SIMD(uchar, short) +SUB_SIMD(float, short) +SUB_SIMD(ushort, ushort) +SUB_SIMD(uchar, ushort) +SUB_SIMD(short, ushort) +SUB_SIMD(float, ushort) +SUB_SIMD(uchar, float) +SUB_SIMD(ushort, float) +SUB_SIMD(short, float) +SUB_SIMD(float, float) + +#undef SUB_SIMD + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END