diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index cd72932780..796d05101e 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -93,8 +93,8 @@ INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest, INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest, Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), Values(szSmall128, szVGA, sz720p, sz1080p), - Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1), - Values(-1, CV_8U, CV_32F), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), Values(1.0), Values(cv::compile_args(CORE_FLUID)))); diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index 403bcf252d..590589eaaa 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -886,25 +886,6 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan, CV_Error(cv::Error::StsBadArg, "unsupported number of channels"); } -template -static void run_absdiffc(Buffer &dst, const View &src, const float scalar[]) -{ - const auto *in = src.InLine(0); - auto *out = dst.OutLine(); - - int width = dst.length(); - int chan = dst.meta().chan; - const int length = width * chan; - - int w = 0; -#if CV_SIMD - w = absdiffc_simd(in, scalar, out, length, chan); -#endif - - for (; w < length; ++w) - out[w] = absdiff(in[w], scalar[w%chan]); -} - template CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float scalar[], Arithm arithm, float scale=1) @@ -950,11 +931,6 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca out[chan * w + c] = mul(in[chan * w + c], scalar[c], scale); break; } - case ARITHM_DIVIDE: - for (int w=0; w < width; w++) - for (int c=0; c < chan; c++) - out[chan*w + c] = div(in[chan*w + c], scalar[c], scale); - break; default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation"); } } @@ -992,6 +968,14 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A } } +CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen) +{ + cv::Size bufsize(buflen, 1); + GMatDesc bufdesc = { CV_32F, 1, bufsize }; + Buffer buffer(bufdesc); + scratch = std::move(buffer); +} + CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) { #if CV_SIMD @@ -1012,25 +996,47 @@ CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) #else constexpr int buflen = 4; #endif - cv::Size bufsize(buflen, 1); - GMatDesc bufdesc = { CV_32F, 1, bufsize }; - Buffer buffer(bufdesc); - scratch = std::move(buffer); + setScratchSize(scratch, buflen); +} + +CV_ALWAYS_INLINE void scalar_to_scratch(const cv::Scalar& scalar, + float scratch[], const int length, const int chan) +{ + for (int i = 0; i < length; ++i) + scratch[i] = static_cast(scalar[i % chan]); +} + +template +CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float scalar[]) +{ + const auto* in = src.InLine(0); + auto* out = dst.OutLine(); + + int width = dst.length(); + int chan = dst.meta().chan; + const int length = width * chan; + + int w = 0; +#if CV_SIMD + w = absdiffc_simd(in, scalar, out, length, chan); +#endif + + for (; w < length; ++w) + out[w] = absdiff(in[w], scalar[w % chan]); } GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true) { static const int Window = 1; - static void run(const View &src, 
const cv::Scalar& _scalar, Buffer &dst, Buffer& scratch) + static void run(const View& src, const cv::Scalar& _scalar, Buffer& dst, Buffer& scratch) { if (dst.y() == 0) { const int chan = src.meta().chan; - float* sc = scratch.OutLine(); + float* _scratch = scratch.OutLine(); - for (int i = 0; i < scratch.length(); ++i) - sc[i] = static_cast(_scalar[i % chan]); + scalar_to_scratch(_scalar, _scratch, scratch.length(), chan); } const float* scalar = scratch.OutLine(); @@ -1058,17 +1064,16 @@ GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true) { static const int Window = 1; - static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst, Buffer &scratch) + static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/, Buffer& dst, Buffer& scratch) { GAPI_Assert(src.meta().chan <= 4); if (dst.y() == 0) { const int chan = src.meta().chan; - float* sc = scratch.OutLine(); + float* _scratch = scratch.OutLine(); - for (int i = 0; i < scratch.length(); ++i) - sc[i] = static_cast(_scalar[i % chan]); + scalar_to_scratch(_scalar, _scratch, scratch.length(), chan); } const float* scalar = scratch.OutLine(); @@ -1115,10 +1120,9 @@ GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true) if (dst.y() == 0) { const int chan = src.meta().chan; - float* sc = scratch.OutLine(); + float* _scratch = scratch.OutLine(); - for (int i = 0; i < scratch.length(); ++i) - sc[i] = static_cast(_scalar[i % chan]); + scalar_to_scratch(_scalar, _scratch, scratch.length(), chan); } const float* scalar = scratch.OutLine(); @@ -1165,10 +1169,9 @@ GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, true) if (dst.y() == 0) { const int chan = src.meta().chan; - float* sc = scratch.OutLine(); + float* _scratch = scratch.OutLine(); - for (int i = 0; i < scratch.length(); ++i) - sc[i] = static_cast(_scalar[i % chan]); + scalar_to_scratch(_scalar, _scratch, scratch.length(), chan); } const float* scalar = scratch.OutLine(); @@ -1216,10 +1219,9 @@ GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true) if (dst.y() == 0) { const int chan = src.meta().chan; - float* sc = scratch.OutLine(); + float* _scratch = scratch.OutLine(); - for (int i = 0; i < scratch.length(); ++i) - sc[i] = static_cast(_scalar[i % chan]); + scalar_to_scratch(_scalar, _scratch, scratch.length(), chan); } const float* scalar = scratch.OutLine(); const float scale = 1.0; @@ -1259,7 +1261,7 @@ GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, true) { static const int Window = 1; - static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst, Buffer& scratch) + static void run(const View& src, double _scalar, int /*dtype*/, Buffer& dst, Buffer& scratch) { GAPI_Assert(src.meta().chan <= 4); @@ -1295,32 +1297,109 @@ GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, true) } }; -GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, false) +template +CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch, + float scale) +{ + const auto* in = src.InLine(0); + auto* out = dst.OutLine(); + const float* scalar = scratch.OutLine(); + + int width = dst.length(); + int chan = dst.meta().chan; + const int length = width * chan; + + int w = 0; +#if CV_SIMD + int scratch_length = scratch.length(); + int indicator_offset = scratch_length - 1; + const int set_mask_indicator = static_cast(*(scratch.OutLine() + (indicator_offset))); + + w = divc_simd(in, scalar, out, length, chan, scale, set_mask_indicator); +#endif + + for (; w < length; ++w) + out[w] = div(in[w], scalar[w % 
chan], scale);
+}
+
+GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)
 {
     static const int Window = 1;
-    static void run(const View &src, const cv::Scalar &_scalar, double _scale, int /*dtype*/,
-                    Buffer &dst)
+    static void run(const View& src, const cv::Scalar& _scalar, double _scale, int /*dtype*/,
+                    Buffer& dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
-        const float scale = static_cast<float>(_scale);
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* _scratch = scratch.OutLine<float>();
+            int scratch_length = scratch.length();
+
+            scalar_to_scratch(_scalar, _scratch, scratch_length - 1, chan);
+
+            _scratch[scratch_length - 1] = 0.0;
+            for (int j = 0; j < chan; ++j)
+            {
+                if (std::fabs(static_cast<float>(_scalar[j])) <= FLT_EPSILON)
+                {
+                    _scratch[scratch_length - 1] = 1.0;
+                    break;
+                }
+            }
+        }
+
+        float scale = static_cast<float>(_scale);
         // DST SRC OP __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar, uchar, run_divc, dst, src, scratch, scale);
+        UNARY_(uchar, ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(uchar, short, run_divc, dst, src, scratch, scale);
+        UNARY_(uchar, float, run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, uchar, run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, short, run_divc, dst, src, scratch, scale);
+        UNARY_(ushort, float, run_divc, dst, src, scratch, scale);
+        UNARY_(short, short, run_divc, dst, src, scratch, scale);
+        UNARY_(short, ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(short, uchar, run_divc, dst, src, scratch, scale);
+        UNARY_(short, float, run_divc, dst, src, scratch, scale);
+        UNARY_(float, uchar, run_divc, dst, src, scratch, scale);
+        UNARY_(float, short, run_divc, dst, src, scratch, scale);
+        UNARY_(float, ushort, run_divc, dst, src, scratch, scale);
+        UNARY_(float, float, run_divc, dst, src, scratch, scale);
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch)
+    {
+#if CV_SIMD
+        // 512 bits / 32 bits = 16 float32 elements that an AVX512 SIMD vector can contain.
+        constexpr int maxNlanes = 16;
+
+        // +2 is the offset for the 3-channel case.
+        // The offset is needed to load the coefficients from the scalar array into SIMD vectors correctly in the 3-channel case.
+        // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
+        // The first scalar SIMD vector should look like:
+        // C1 C2 C3 C1
+        // The second:
+        // C2 C3 C1 C2
+        // The third:
+        // C3 C1 C2 C3
+        constexpr int offset = 2;
+        constexpr int zero_scalar_elem_indicator = 1;
+        constexpr int buflen = maxNlanes + offset + zero_scalar_elem_indicator;
+#else
+        constexpr int buflen = 4;
+#endif
+        setScratchSize(scratch, buflen);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidDivRC, cv::gapi::core::GDivRC, false)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index 30e3d1f5ea..a3be5e3857 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -192,6 +192,34 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+#define DIVC_SIMD(SRC, DST) \
+int divc_simd(const SRC in[], const float scalar[], DST out[], \
+              const int length, const int chan, const float scale, \
+              const int set_mask_flag) \
+{ \
+    CV_CPU_DISPATCH(divc_simd, (in, scalar, out, length, chan, scale, set_mask_flag), \
+                    CV_CPU_DISPATCH_MODES_ALL); \
+}
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short)
+DIVC_SIMD(uchar, short)
+DIVC_SIMD(float, short)
+DIVC_SIMD(ushort, ushort)
+DIVC_SIMD(uchar, ushort)
+DIVC_SIMD(short, ushort)
+DIVC_SIMD(float, ushort)
+DIVC_SIMD(uchar, float)
+DIVC_SIMD(ushort, float)
+DIVC_SIMD(short, float)
+DIVC_SIMD(float, float)
+
+#undef DIVC_SIMD
+
 #define ABSDIFFC_SIMD(SRC) \
 int absdiffc_simd(const SRC in[], const float scalar[], SRC out[], \
                   const int length, const int chan) \
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index e0fdf812f2..9c71d7e3d5 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -152,6 +152,30 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+#define DIVC_SIMD(SRC, DST) \
+int divc_simd(const SRC in[], const float scalar[], DST out[], \
+              const int length, const int chan, const float scale, \
+              const int set_mask_flag);
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short)
+DIVC_SIMD(uchar, short)
+DIVC_SIMD(float, short)
+DIVC_SIMD(ushort, ushort)
+DIVC_SIMD(uchar, ushort)
+DIVC_SIMD(short, ushort)
+DIVC_SIMD(float, ushort)
+DIVC_SIMD(uchar, float)
+DIVC_SIMD(ushort, float)
+DIVC_SIMD(short, float)
+DIVC_SIMD(float, float)
+
+#undef DIVC_SIMD
+
 #define ABSDIFFC_SIMD(T) \
 int absdiffc_simd(const T in[], const float scalar[], T out[], \
                   const int length, const int chan);
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 9f7886f9b0..cdf09a9426 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -173,6 +173,30 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+#define DIVC_SIMD(SRC, DST) \
+int divc_simd(const SRC in[], const float scalar[], DST out[], \
+              const int length, const int chan, const float scale, \
+              const int set_mask_flag);
+
+DIVC_SIMD(uchar, uchar)
+DIVC_SIMD(ushort, uchar)
+DIVC_SIMD(short, uchar)
+DIVC_SIMD(float, uchar)
+DIVC_SIMD(short, short)
+DIVC_SIMD(ushort, short) +DIVC_SIMD(uchar, short) +DIVC_SIMD(float, short) +DIVC_SIMD(ushort, ushort) +DIVC_SIMD(uchar, ushort) +DIVC_SIMD(short, ushort) +DIVC_SIMD(float, ushort) +DIVC_SIMD(uchar, float) +DIVC_SIMD(ushort, float) +DIVC_SIMD(short, float) +DIVC_SIMD(float, float) + +#undef DIVC_SIMD + #define ABSDIFFC_SIMD(T) \ int absdiffc_simd(const T in[], const float scalar[], T out[], \ const int length, const int chan); @@ -941,6 +965,7 @@ struct add_tag {}; struct sub_tag {}; struct subr_tag {}; struct mul_tag {}; +struct div_tag {}; struct absdiff_tag {}; CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1, @@ -985,6 +1010,21 @@ CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc return a * sc; } +CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) +{ + return v_scale * a * v_scalar; +} + +CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc) +{ + return a / sc; +} + +CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) +{ + return a*v_scale / v_scalar; +} + CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc) { return v_absdiff(a, sc); @@ -1294,16 +1334,17 @@ SUBRC_SIMD(float, float) //------------------------- // -// Fluid kernels: MulC +// Fluid kernels: MulC, DivC // //------------------------- -template +template CV_ALWAYS_INLINE typename std::enable_if::value || std::is_same::value, void>::type -mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2, - const v_float32& s3, const v_float32& scale, const int nlanes) +arithmOpScalarScaled_simd_c3_impl(oper_tag op, SRC* inx, DST* outx, const v_float32& s1, + const v_float32& s2, const v_float32& s3, + const v_float32& v_scale, const int nlanes) { v_float32 a1 = vg_load_f32(inx); v_float32 a2 = vg_load_f32(&inx[nlanes / 2]); @@ -1312,62 +1353,64 @@ mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_ v_float32 a5 = vg_load_f32(&inx[2 * nlanes]); v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]); - arithmOpScalar_pack_store_c3(outx, v_round(scale*a1*s1), - v_round(scale*a2*s2), - v_round(scale*a3*s3), - v_round(scale*a4*s1), - v_round(scale*a5*s2), - v_round(scale*a6*s3)); + arithmOpScalar_pack_store_c3(outx, v_round(oper_scaled(op, a1, s1, v_scale)), + v_round(oper_scaled(op, a2, s2, v_scale)), + v_round(oper_scaled(op, a3, s3, v_scale)), + v_round(oper_scaled(op, a4, s1, v_scale)), + v_round(oper_scaled(op, a5, s2, v_scale)), + v_round(oper_scaled(op, a6, s3, v_scale))); } //------------------------------------------------------------------------------------------------- -template -CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* inx, uchar* outx, - const v_float32& s1, const v_float32& s2, - const v_float32& s3, const v_float32& scale, const int nlanes) +template +CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_c3_impl(oper_tag op, const SRC* inx, uchar* outx, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& v_scale, + const int nlanes) { vx_store(outx, - v_pack_u(v_pack(v_round(scale * vg_load_f32(inx)* s1), - v_round(scale * vg_load_f32(&inx[nlanes/4])* s2)), - v_pack(v_round(scale * vg_load_f32(&inx[nlanes/2])* s3), - v_round(scale * vg_load_f32(&inx[3*nlanes/4])* s1)))); + v_pack_u(v_pack(v_round(oper_scaled(op, vg_load_f32(inx), s1, v_scale)), + 
v_round(oper_scaled(op, vg_load_f32(&inx[nlanes/4]), s2, v_scale))), + v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[nlanes/2]), s3, v_scale)), + v_round(oper_scaled(op, vg_load_f32(&inx[3*nlanes/4]), s1, v_scale))))); vx_store(&outx[nlanes], - v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[nlanes])* s2), - v_round(scale * vg_load_f32(&inx[5*nlanes/4])* s3)), - v_pack(v_round(scale * vg_load_f32(&inx[3*nlanes/2])* s1), - v_round(scale * vg_load_f32(&inx[7*nlanes/4])* s2)))); + v_pack_u(v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[nlanes]), s2, v_scale)), + v_round(oper_scaled(op, vg_load_f32(&inx[5*nlanes/4]), s3, v_scale))), + v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[3*nlanes/2]), s1, v_scale)), + v_round(oper_scaled(op, vg_load_f32(&inx[7*nlanes/4]), s2, v_scale))))); vx_store(&outx[2 * nlanes], - v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[2*nlanes])* s3), - v_round(scale * vg_load_f32(&inx[9*nlanes/4])* s1)), - v_pack(v_round(scale * vg_load_f32(&inx[5*nlanes/2])* s2), - v_round(scale * vg_load_f32(&inx[11*nlanes/4])* s3)))); + v_pack_u(v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[2*nlanes]), s3, v_scale)), + v_round(oper_scaled(op, vg_load_f32(&inx[9*nlanes/4]), s1, v_scale))), + v_pack(v_round(oper_scaled(op, vg_load_f32(&inx[5*nlanes/2]), s2, v_scale)), + v_round(oper_scaled(op, vg_load_f32(&inx[11*nlanes/4]), s3, v_scale))))); } //------------------------------------------------------------------------------------------------- -template -CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* in, float* out, - const v_float32& s1, const v_float32& s2, - const v_float32& s3, const v_float32& scale, const int nlanes) +template +CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_c3_impl(oper_tag op, const SRC* in, float* out, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& v_scale, + const int nlanes) { v_float32 a1 = vg_load_f32(in); v_float32 a2 = vg_load_f32(&in[nlanes]); v_float32 a3 = vg_load_f32(&in[2*nlanes]); - vx_store(out, scale * a1* s1); - vx_store(&out[nlanes], scale * a2* s2); - vx_store(&out[2*nlanes], scale * a3* s3); + vx_store(out, oper_scaled(op, a1, s1, v_scale)); + vx_store(&out[nlanes], oper_scaled(op, a2, s2, v_scale)); + vx_store(&out[2*nlanes], oper_scaled(op, a3, s3, v_scale)); } //------------------------------------------------------------------------------------------------- -template -CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[], - const float scalar[], DST out[], - const int length, const float _scale) +template +CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_c3(oper_tag op, const SRC in[], + const float scalar[], DST out[], + const int length, const float scale) { constexpr int chan = 3; constexpr int nlanes = vector_type_of_t::nlanes; @@ -1376,7 +1419,7 @@ CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[], if (length < lanes) return 0; - v_float32 scale = vx_setall_f32(_scale); + v_float32 v_scale = vx_setall_f32(scale); v_float32 s1 = vx_load(scalar); #if CV_SIMD_WIDTH == 32 @@ -1392,7 +1435,7 @@ CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[], { for (; x <= length - lanes; x += lanes) { - mulc_scale_simd_c3_impl(&in[x], &out[x], s1, s2, s3, scale, nlanes); + arithmOpScalarScaled_simd_c3_impl(op, &in[x], &out[x], s1, s2, s3, v_scale, nlanes); } if (x < length) @@ -1407,70 +1450,70 @@ CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[], //------------------------------------------------------------------------------------------------- -template +template CV_ALWAYS_INLINE 
typename std::enable_if<(std::is_same::value || std::is_same::value), void>::type -mulc_scale_simd_common_impl(const SRC* inx, DST* outx, - const v_float32& sc, const v_float32& scale, - const int nlanes) +arithmOpScalarScaled_simd_common_impl(oper_tag op, const SRC* inx, DST* outx, + const v_float32& v_scalar, const v_float32& v_scale, + const int nlanes) { v_float32 a1 = vg_load_f32(inx); v_float32 a2 = vg_load_f32(&inx[nlanes/2]); - v_store_i16(outx, v_round(scale * a1* sc), v_round(scale * a2* sc)); + v_store_i16(outx, v_round(oper_scaled(op, a1, v_scalar, v_scale)), v_round(oper_scaled(op, a2, v_scalar, v_scale))); } //------------------------------------------------------------------------------------------------- -template -CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx, - uchar* outx, const v_float32& sc, - const v_float32& scale, const int nlanes) +template +CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_common_impl(oper_tag op, const SRC* inx, + uchar* outx, const v_float32& v_scalar, + const v_float32& v_scale, const int nlanes) { v_float32 a1 = vg_load_f32(inx); v_float32 a2 = vg_load_f32(&inx[nlanes/4]); v_float32 a3 = vg_load_f32(&inx[nlanes/2]); v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]); - vx_store(outx, v_pack_u(v_pack(v_round(scale * a1* sc), - v_round(scale * a2* sc)), - v_pack(v_round(scale * a3* sc), - v_round(scale * a4* sc)))); + vx_store(outx, v_pack_u(v_pack(v_round(oper_scaled(op, a1, v_scalar, v_scale)), + v_round(oper_scaled(op, a2, v_scalar, v_scale))), + v_pack(v_round(oper_scaled(op, a3, v_scalar, v_scale)), + v_round(oper_scaled(op, a4, v_scalar, v_scale))))); } //------------------------------------------------------------------------------------------------- -template -CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx, - float* outx, const v_float32& sc, - const v_float32& scale, const int) +template +CV_ALWAYS_INLINE void arithmOpScalarScaled_simd_common_impl(oper_tag op, const SRC* inx, + float* outx, const v_float32& v_scalar, + const v_float32& v_scale, const int) { - v_float32 a1 = vg_load_f32(inx); - vx_store(outx, scale * a1* sc); + v_float32 a = vg_load_f32(inx); + vx_store(outx, oper_scaled(op, a, v_scalar, v_scale)); } //------------------------------------------------------------------------------------------------- -template -CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[], - const float scalar[], DST out[], - const int length, const float _scale) +template +CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_common(oper_tag op, const SRC in[], + const float scalar[], DST out[], + const int length, const float scale) { constexpr int nlanes = vector_type_of_t::nlanes; if (length < nlanes) return 0; - v_float32 _scalar = vx_load(scalar); - v_float32 scale = vx_setall_f32(_scale); + v_float32 v_scalar = vx_load(scalar); + v_float32 v_scale = vx_setall_f32(scale); int x = 0; for (;;) { for (; x <= length - nlanes; x += nlanes) { - mulc_scale_simd_common_impl(&in[x], &out[x], _scalar, scale, nlanes); + arithmOpScalarScaled_simd_common_impl(op, &in[x], &out[x], v_scalar, v_scale, nlanes); } if (x < length) @@ -1483,6 +1526,8 @@ CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[], return x; } +//------------------------------------------------------------------------------------------------- + #define MULC_SIMD(SRC, DST) \ int mulc_simd(const SRC in[], const float scalar[], DST out[], \ const int length, const int chan, const float scale) \ @@ -1501,7 +1546,8 @@ int mulc_simd(const SRC in[], const 
float scalar[], DST out[], \ } \ else \ { \ - return mulc_scale_simd_common(in, scalar, out, length, scale); \ + return arithmOpScalarScaled_simd_common(op_t, in, scalar, out, \ + length, scale); \ } \ } \ case 3: \ @@ -1513,7 +1559,8 @@ int mulc_simd(const SRC in[], const float scalar[], DST out[], \ } \ else \ { \ - return mulc_scale_simd_c3(in, scalar, out, length, scale); \ + return arithmOpScalarScaled_simd_c3(op_t, in, scalar, out, \ + length, scale); \ } \ } \ default: \ @@ -1542,6 +1589,355 @@ MULC_SIMD(float, float) #undef MULC_SIMD +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if<(std::is_same::value || + std::is_same::value), int>::type +divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], DST out[], + const v_float32& v_scalar, const v_float32& v_scale, + const int length) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + v_float32 v_zero = vx_setzero_f32(); + v_float32 v_mask = (v_scalar == v_zero); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in[x]); + v_float32 a2 = vg_load_f32(&in[x + nlanes/2]); + + v_store_i16(&out[x], v_round(v_select(v_mask, v_zero, div_op(s_tag, a1, v_scalar, v_scale))), + v_round(v_select(v_mask, v_zero, div_op(s_tag, a2, v_scalar, v_scale)))); + } + + if (x < length) + { + x = length - nlanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], + uchar out[], const v_float32& v_scalar, + const v_float32& v_scale, const int length) +{ + constexpr int nlanes = v_uint8::nlanes; + + v_float32 v_zero = vx_setzero_f32(); + v_float32 v_mask = (v_scalar == v_zero); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in[x]); + v_float32 a2 = vg_load_f32(&in[x + nlanes/4]); + v_float32 a3 = vg_load_f32(&in[x + nlanes/2]); + v_float32 a4 = vg_load_f32(&in[x + 3 * nlanes/4]); + + vx_store(&out[x], v_pack_u(v_pack(v_round(v_select(v_mask, v_zero, div_op(s_tag, a1, v_scalar, v_scale))), + v_round(v_select(v_mask, v_zero, div_op(s_tag, a2, v_scalar, v_scale)))), + v_pack(v_round(v_select(v_mask, v_zero, div_op(s_tag, a3, v_scalar, v_scale))), + v_round(v_select(v_mask, v_zero, div_op(s_tag, a4, v_scalar, v_scale)))))); + } + + if (x < length) + { + x = length - nlanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], + float out[], const v_float32& v_scalar, + const v_float32& v_scale, const int length) +{ + constexpr int nlanes = v_float32::nlanes; + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in[x]); + vx_store(&out[x], div_op(s_tag, a1, v_scalar, v_scale)); + } + + if (x < length) + { + x = length - nlanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int divc_mask_simd_common(scale_tag_t tag, const SRC in[], + const float scalar[], DST out[], + const int length, const float scale) +{ + constexpr 
int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + v_float32 v_scalar = vx_load(scalar); + v_float32 v_scale = vx_setall_f32(scale); + return divc_simd_common_impl(tag, in, out, v_scalar, v_scale, length); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if::value || + std::is_same::value, int>::type +divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1, + const v_float32& s2, const v_float32& s3, + const v_float32& v_scale, const int length, + const int nlanes, const int lanes) +{ + v_float32 v_zero = vx_setzero_f32(); + v_float32 v_mask1 = (s1 == v_zero); + v_float32 v_mask2 = (s2 == v_zero); + v_float32 v_mask3 = (s3 == v_zero); + + int x = 0; + for (;;) + { + for (; x <= length - lanes; x += lanes) + { + v_float32 a1 = vg_load_f32(&in[x]); + v_float32 a2 = vg_load_f32(&in[x + nlanes / 2]); + v_float32 a3 = vg_load_f32(&in[x + nlanes]); + v_float32 a4 = vg_load_f32(&in[x + 3 * nlanes / 2]); + v_float32 a5 = vg_load_f32(&in[x + 2 * nlanes]); + v_float32 a6 = vg_load_f32(&in[x + 5 * nlanes / 2]); + + arithmOpScalar_pack_store_c3(&out[x], v_round(v_select(v_mask1, v_zero, div_op(s_tag, a1, s1, v_scale))), + v_round(v_select(v_mask2, v_zero, div_op(s_tag, a2, s2, v_scale))), + v_round(v_select(v_mask3, v_zero, div_op(s_tag, a3, s3, v_scale))), + v_round(v_select(v_mask1, v_zero, div_op(s_tag, a4, s1, v_scale))), + v_round(v_select(v_mask2, v_zero, div_op(s_tag, a5, s2, v_scale))), + v_round(v_select(v_mask3, v_zero, div_op(s_tag, a6, s3, v_scale)))); + } + + if (x < length) + { + x = length - lanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, uchar* out, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& v_scale, + const int length, const int nlanes, const int lanes) +{ + v_float32 v_zero = vx_setzero_f32(); + v_float32 v_mask1 = (s1 == v_zero); + v_float32 v_mask2 = (s2 == v_zero); + v_float32 v_mask3 = (s3 == v_zero); + + int x = 0; + for (;;) + { + for (; x <= length - lanes; x += lanes) + { + vx_store(&out[x], + v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x]), s1, v_scale))), + v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + nlanes/4]), s2, v_scale)))), + v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + nlanes/2]), s3, v_scale))), + v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x + 3*nlanes/4]), s1, v_scale)))))); + + vx_store(&out[x + nlanes], + v_pack_u(v_pack(v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + nlanes]), s2, v_scale))), + v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + 5*nlanes/4]), s3, v_scale)))), + v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x + 3*nlanes/2]), s1, v_scale))), + v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + 7*nlanes/4]), s2, v_scale)))))); + + vx_store(&out[x + 2 * nlanes], + v_pack_u(v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + 2*nlanes]), s3, v_scale))), + v_round(v_select(v_mask1, v_zero, div_op(s_tag, vg_load_f32(&in[x + 9*nlanes/4]), s1, v_scale)))), + v_pack(v_round(v_select(v_mask2, v_zero, div_op(s_tag, vg_load_f32(&in[x + 
5*nlanes/2]), s2, v_scale))), + v_round(v_select(v_mask3, v_zero, div_op(s_tag, vg_load_f32(&in[x + 11*nlanes/4]), s3, v_scale)))))); + } + + if (x < length) + { + x = length - lanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, float* out, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& v_scale, const int length, + const int nlanes, const int lanes) +{ + int x = 0; + for (;;) + { + for (; x <= length - lanes; x += lanes) + { + v_float32 a1 = vg_load_f32(&in[x]); + v_float32 a2 = vg_load_f32(&in[x + nlanes]); + v_float32 a3 = vg_load_f32(&in[x + 2*nlanes]); + + vx_store(&out[x], div_op(s_tag, a1, s1, v_scale)); + vx_store(&out[x + nlanes], div_op(s_tag, a2, s2, v_scale)); + vx_store(&out[x + 2*nlanes], div_op(s_tag, a3, s3, v_scale)); + } + + if (x < length) + { + x = length - lanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int divc_mask_simd_c3(scale_tag_t s_tag, const SRC in[], + const float scalar[], DST out[], + const int length, const float scale) +{ + constexpr int chan = 3; + constexpr int nlanes = vector_type_of_t::nlanes; + constexpr int lanes = chan * nlanes; + + if (length < lanes) + return 0; + + v_float32 v_scale = vx_setall_f32(scale); + + v_float32 s1 = vx_load(scalar); +#if CV_SIMD_WIDTH == 32 + v_float32 s2 = vx_load(&scalar[2]); + v_float32 s3 = vx_load(&scalar[1]); +#else + v_float32 s2 = vx_load(&scalar[1]); + v_float32 s3 = vx_load(&scalar[2]); +#endif + return divc_simd_c3_impl(s_tag, in, out, s1, s2, s3, v_scale, length, nlanes, lanes); +} + +//------------------------------------------------------------------------------------------------- + +#define DIVC_SIMD(SRC, DST) \ +int divc_simd(const SRC in[], const float scalar[], DST out[], \ + const int length, const int chan, const float scale, \ + const int set_mask_flag) \ +{ \ + switch (chan) \ + { \ + case 1: \ + case 2: \ + case 4: \ + { \ + if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \ + { \ + if (set_mask_flag == 1) \ + return divc_mask_simd_common(not_scale_tag{}, in, scalar, \ + out, length, scale); \ + else \ + return arithmOpScalar_simd_common(div_tag{}, in, scalar, \ + out, length); \ + } \ + else \ + { if (set_mask_flag == 1) \ + return divc_mask_simd_common(scale_tag{}, in, scalar, \ + out, length, scale); \ + else \ + return arithmOpScalarScaled_simd_common(div_tag{}, in, scalar, \ + out, length, scale); \ + } \ + } \ + case 3: \ + { \ + if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \ + { \ + if (set_mask_flag == 1) \ + return divc_mask_simd_c3(not_scale_tag{}, in, scalar, \ + out, length, scale); \ + else \ + return arithmOpScalar_simd_c3(div_tag{}, in, scalar, \ + out, length); \ + } \ + else \ + { \ + if (set_mask_flag == 1) \ + return divc_mask_simd_c3(scale_tag{}, in, scalar, \ + out, length, scale); \ + else \ + return arithmOpScalarScaled_simd_c3(div_tag{}, in, scalar, out,\ + length, scale); \ + } \ + } \ + default: \ + GAPI_Assert(chan <= 4); \ + break; \ + } \ + return 0; \ +} + +DIVC_SIMD(uchar, uchar) +DIVC_SIMD(ushort, uchar) +DIVC_SIMD(short, uchar) +DIVC_SIMD(float, uchar) +DIVC_SIMD(short, short) +DIVC_SIMD(ushort, short) +DIVC_SIMD(uchar, short) +DIVC_SIMD(float, short) 
+DIVC_SIMD(ushort, ushort) +DIVC_SIMD(uchar, ushort) +DIVC_SIMD(short, ushort) +DIVC_SIMD(float, ushort) +DIVC_SIMD(uchar, float) +DIVC_SIMD(ushort, float) +DIVC_SIMD(short, float) +DIVC_SIMD(float, float) + +#undef DIVC_SIMD + //------------------------- // // Fluid kernels: AbsDiffC @@ -1550,7 +1946,7 @@ MULC_SIMD(float, float) #define ABSDIFFC_SIMD(SRC) \ int absdiffc_simd(const SRC in[], const float scalar[], SRC out[], \ - const int length, const int chan) \ + const int length, const int chan) \ { \ switch (chan) \ { \
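
A note on the scratch convention introduced above: initScratch fills the first buflen - 1 floats with the scalar repeated channel-wise (the extra offset lanes only exist so the three rotated scalar vectors of the 3-channel case can be loaded without reading past the end), and the very last float is a divide-by-zero indicator, set to 1.0 when any scalar channel is (near) zero. run_divc reads that flag back as set_mask_indicator, and the divc_mask_* paths then select 0 for lanes whose divisor is zero, while the float-output overloads stay unmasked. A plain scalar model of that behaviour, given as an illustrative sketch rather than the kernel code (divc_reference is a hypothetical name, and cv::saturate_cast stands in for the backend's own saturation helpers), could look like this:

#include <cmath>
#include <cfloat>
#include <opencv2/core/saturate.hpp>

// Reference semantics for "divide matrix by per-channel scalar":
// out = saturate(in * scale / scalar), with 0 written wherever the divisor is ~0
// (the masked SIMD paths reproduce this for the integral output types).
template<typename DST, typename SRC>
void divc_reference(const SRC in[], const float scalar[], DST out[],
                    int length, int chan, float scale)
{
    for (int i = 0; i < length; ++i)
    {
        const float d = scalar[i % chan];
        out[i] = (std::fabs(d) <= FLT_EPSILON)
                     ? DST(0)
                     : cv::saturate_cast<DST>(in[i] * scale / d);
    }
}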
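
For completeness, a minimal host-side sketch of how this kernel is reached from the public G-API interface, along the same lines as the perf test instantiation above (the image size, scalar values and the CV_16U output depth are illustrative only; the entry points assumed here are cv::gapi::divC and the fluid kernel package cv::gapi::core::fluid::kernels()):

#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>
#include <opencv2/gapi/fluid/core.hpp>

int main()
{
    // Graph: divide an 8UC3 input by a per-channel scalar, producing a CV_16U result.
    cv::GMat in;
    cv::GScalar divisor;
    cv::GMat out = cv::gapi::divC(in, divisor, /*scale=*/1.0, /*ddepth=*/CV_16U);
    cv::GComputation comp(cv::GIn(in, divisor), cv::GOut(out));

    // Compiling with the fluid kernel package makes GFluidDivC (and divc_simd) handle the op.
    cv::Mat src(720, 1280, CV_8UC3, cv::Scalar::all(128)), dst;
    comp.apply(cv::gin(src, cv::Scalar(2, 4, 8)), cv::gout(dst),
               cv::compile_args(cv::gapi::core::fluid::kernels()));
    return 0;
}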