Merge pull request #20914 from anna-khakimova:ak/simd_div

GAPI Fluid: SIMD Div kernel.

* HAL implementation for Div kernel

* Removed dbg lines

* Applied comments.

* Reworked

* Final version
This commit is contained in:
Anna Khakimova 2021-11-15 20:16:25 +03:00 committed by GitHub
parent 2b2e515a30
commit b19697e3ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 648 additions and 18 deletions

View File

@ -123,6 +123,7 @@ set(gapi_srcs
src/backends/fluid/gfluidimgproc.cpp src/backends/fluid/gfluidimgproc.cpp
src/backends/fluid/gfluidimgproc_func.dispatch.cpp src/backends/fluid/gfluidimgproc_func.dispatch.cpp
src/backends/fluid/gfluidcore.cpp src/backends/fluid/gfluidcore.cpp
src/backends/fluid/gfluidcore_func.dispatch.cpp
# OCL Backend (currently built-in) # OCL Backend (currently built-in)
src/backends/ocl/goclbackend.cpp src/backends/ocl/goclbackend.cpp
@ -188,6 +189,7 @@ set(gapi_srcs
) )
ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2) ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2)
ocv_add_dispatched_file(backends/fluid/gfluidcore_func SSE4_1 AVX2)
ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/") ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/")

View File

@ -770,7 +770,10 @@ GAPI_EXPORTS GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth =
The function divides one matrix by another: The function divides one matrix by another:
\f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f] \f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f]
When src2(I) is zero, dst(I) will also be zero. Different channels of For integer types when src2(I) is zero, dst(I) will also be zero.
Floating point case returns Inf/NaN (according to IEEE).
Different channels of
multi-channel matrices are processed independently. multi-channel matrices are processed independently.
The matrices can be single or multi channel. Output matrix must have the same size and depth as src. The matrices can be single or multi channel. Output matrix must have the same size and depth as src.

View File

@ -35,7 +35,7 @@ namespace opencv_test
class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {}; class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {}; class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {}; class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {}; class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {}; class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class DivRCPerfTest : public TestPerfParams<tuple<compare_f,cv::Size, MatType, int, cv::GCompileArgs>> {}; class DivRCPerfTest : public TestPerfParams<tuple<compare_f,cv::Size, MatType, int, cv::GCompileArgs>> {};
class MaskPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {}; class MaskPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};

View File

@ -323,17 +323,23 @@ PERF_TEST_P_(DivPerfTest, TestPerformance)
Size sz = get<1>(GetParam()); Size sz = get<1>(GetParam());
MatType type = get<2>(GetParam()); MatType type = get<2>(GetParam());
int dtype = get<3>(GetParam()); int dtype = get<3>(GetParam());
cv::GCompileArgs compile_args = get<4>(GetParam()); double scale = get<4>(GetParam());
cv::GCompileArgs compile_args = get<5>(GetParam());
// FIXIT Unstable input data for divide // FIXIT Unstable input data for divide
initMatsRandU(type, sz, dtype, false); initMatsRandU(type, sz, dtype, false);
// This condition is needed to work around a bug in OpenCV:
// it reinitializes the divider matrix so that it contains no zero values.
if (dtype == CV_16S && dtype != type)
cv::randu(in_mat2, cv::Scalar::all(1), cv::Scalar::all(255));
// OpenCV code /////////////////////////////////////////////////////////// // OpenCV code ///////////////////////////////////////////////////////////
cv::divide(in_mat1, in_mat2, out_mat_ocv, dtype); cv::divide(in_mat1, in_mat2, out_mat_ocv, scale, dtype);
// G-API code //////////////////////////////////////////////////////////// // G-API code ////////////////////////////////////////////////////////////
cv::GMat in1, in2, out; cv::GMat in1, in2, out;
out = cv::gapi::div(in1, in2, dtype); out = cv::gapi::div(in1, in2, scale, dtype);
cv::GComputation c(GIn(in1, in2), GOut(out)); cv::GComputation c(GIn(in1, in2), GOut(out));
// Warm-up graph engine: // Warm-up graph engine:
@ -347,8 +353,9 @@ PERF_TEST_P_(DivPerfTest, TestPerformance)
} }
// Comparison //////////////////////////////////////////////////////////// // Comparison ////////////////////////////////////////////////////////////
// FIXIT unrealiable check: EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); {
EXPECT_EQ(out_mat_gapi.size(), sz); EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
}
SANITY_CHECK_NOTHING(); SANITY_CHECK_NOTHING();
} }

View File

@ -67,7 +67,8 @@ INSTANTIATE_TEST_CASE_P(DivPerfTestCPU, DivPerfTest,
Combine(Values(AbsExact().to_compare_f()), Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p), Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(-1, CV_8U, CV_16U, CV_32F), Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
Values(2.3),
Values(cv::compile_args(CORE_CPU)))); Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(DivCPerfTestCPU, DivCPerfTest, INSTANTIATE_TEST_CASE_P(DivCPerfTestCPU, DivCPerfTest,

View File

@ -60,12 +60,13 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
// Values(-1, CV_8U, CV_16U, CV_32F), // Values(-1, CV_8U, CV_16U, CV_32F),
// Values(cv::compile_args(CORE_FLUID)))); // Values(cv::compile_args(CORE_FLUID))));
// INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest, INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
// Combine(Values(AbsExact().to_compare_f()), Combine(Values(AbsExact().to_compare_f()),
// Values(szSmall128, szVGA, sz720p, sz1080p), Values(szSmall128, szVGA, sz720p, sz1080p),
// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
// Values(-1, CV_8U, CV_16U, CV_32F), Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
// Values(cv::compile_args(CORE_FLUID)))); Values(2.3),
Values(cv::compile_args(CORE_FLUID))));
// INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest, // INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest,
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p), // Combine(Values(szSmall128, szVGA, sz720p, sz1080p),

View File

@ -62,10 +62,11 @@ INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest,
Values(cv::compile_args(CORE_GPU)))); Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(DivPerfTestGPU, DivPerfTest, INSTANTIATE_TEST_CASE_P(DivPerfTestGPU, DivPerfTest,
Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()), Combine(Values(AbsTolerance(2).to_compare_f()),
Values( szSmall128, szVGA, sz720p, sz1080p ), Values( szSmall128, szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
Values( -1, CV_8U, CV_16U, CV_32F ), Values( -1, CV_8U, CV_16U, CV_32F ),
Values(2.3),
Values(cv::compile_args(CORE_GPU)))); Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(DivCPerfTestGPU, DivCPerfTest, INSTANTIATE_TEST_CASE_P(DivCPerfTestGPU, DivCPerfTest,

View File

@ -13,6 +13,10 @@
#include <opencv2/core/hal/hal.hpp> #include <opencv2/core/hal/hal.hpp>
#include <opencv2/core/hal/intrin.hpp> #include <opencv2/core/hal/intrin.hpp>
#if CV_SIMD
#include "gfluidcore_func.hpp"
#endif
#include <opencv2/gapi/core.hpp> #include <opencv2/gapi/core.hpp>
#include <opencv2/gapi/fluid/gfluidbuffer.hpp> #include <opencv2/gapi/fluid/gfluidbuffer.hpp>
@ -82,13 +86,25 @@ static inline DST mul(SRC1 x, SRC2 y, float scale=1)
} }
template<typename DST, typename SRC1, typename SRC2> template<typename DST, typename SRC1, typename SRC2>
static inline DST div(SRC1 x, SRC2 y, float scale=1) static inline
typename std::enable_if<!std::is_same<DST, float>::value, DST>::type
div(SRC1 x, SRC2 y, float scale=1)
{ {
// like OpenCV: returns 0, if y=0 // like OpenCV: returns 0, if DST type=uchar/short/ushort and divider(y)=0
auto result = y? scale * x / y: 0; auto result = y? scale * x / y: 0;
return saturate<DST>(result, rintf); return saturate<DST>(result, rintf);
} }
template<typename DST, typename SRC1, typename SRC2>
static inline
typename std::enable_if<std::is_same<DST, float>::value, DST>::type
div(SRC1 x, SRC2 y, float scale = 1)
{
// like OpenCV: returns inf/nan, if DST type=float and divider(y)=0
auto result = scale * x / y;
return saturate<DST>(result, rintf);
}
template<typename DST, typename SRC1, typename SRC2> template<typename DST, typename SRC1, typename SRC2>
static inline DST divr(SRC1 x, SRC2 y, float scale=1) static inline DST divr(SRC1 x, SRC2 y, float scale=1)
{ {
@ -626,7 +642,7 @@ CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int l
return 0; return 0;
} }
#endif #endif // CV_SIMD
template<typename DST, typename SRC1, typename SRC2> template<typename DST, typename SRC1, typename SRC2>
static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm, static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm,
@ -672,9 +688,14 @@ static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm a
out[x] = mul<DST>(in1[x], in2[x], _scale); out[x] = mul<DST>(in1[x], in2[x], _scale);
break; break;
case ARITHM_DIVIDE: case ARITHM_DIVIDE:
{
#if CV_SIMD
x = div_simd(in1, in2, out, length, scale);
#endif
for (; x < length; ++x) for (; x < length; ++x)
out[x] = div<DST>(in1[x], in2[x], _scale); out[x] = div<DST>(in1[x], in2[x], _scale);
break; break;
}
default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation"); default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
} }
} }
@ -744,10 +765,19 @@ GAPI_FLUID_KERNEL(GFluidDiv, cv::gapi::core::GDiv, false)
{ {
// DST SRC1 SRC2 OP __VA_ARGS__ // DST SRC1 SRC2 OP __VA_ARGS__
BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(uchar, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( short, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( short, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( short, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(ushort, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(ushort, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_(ushort, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( float, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);

View File

@ -0,0 +1,63 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2021 Intel Corporation
#if !defined(GAPI_STANDALONE)
#include "gfluidcore_func.hpp"
#include "gfluidcore_func.simd.hpp"
#include "backends/fluid/gfluidcore_func.simd_declarations.hpp"
#include "gfluidutils.hpp"
#include <opencv2/core/cvdef.h>
#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
#include <cstdlib>
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-overflow"
#endif
namespace cv {
namespace gapi {
namespace fluid {
// Generates the public cv::gapi::fluid::div_simd(in1, in2, out, length, _scale)
// overload for one (SRC, DST) element-type pair. Each generated function only
// forwards its arguments through CV_CPU_DISPATCH with CV_CPU_DISPATCH_MODES_ALL
// (presumably this picks the best CPU-specific build of div_simd at run time --
// confirm against the OpenCV dispatch macros).
#define DIV_SIMD(SRC, DST) \
int div_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale) \
{ \
CV_CPU_DISPATCH(div_simd, (in1, in2, out, length, _scale), \
CV_CPU_DISPATCH_MODES_ALL); \
}
// One instantiation per supported (source type, destination type) combination.
DIV_SIMD(uchar, uchar)
DIV_SIMD(ushort, uchar)
DIV_SIMD(short, uchar)
DIV_SIMD(float, uchar)
DIV_SIMD(short, short)
DIV_SIMD(ushort, short)
DIV_SIMD(uchar, short)
DIV_SIMD(float, short)
DIV_SIMD(ushort, ushort)
DIV_SIMD(uchar, ushort)
DIV_SIMD(short, ushort)
DIV_SIMD(float, ushort)
DIV_SIMD(uchar, float)
DIV_SIMD(ushort, float)
DIV_SIMD(short, float)
DIV_SIMD(float, float)
// The helper macro is local to this file section.
#undef DIV_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv
#endif // !defined(GAPI_STANDALONE)

View File

@ -0,0 +1,44 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2021 Intel Corporation
#pragma once
#if !defined(GAPI_STANDALONE)
#include <opencv2/core.hpp>
namespace cv {
namespace gapi {
namespace fluid {
// Declares the public div_simd overload for one (SRC, DST) element-type pair:
//   in1, in2 - input rows (dividend and divisor), both of element type SRC
//   out      - output row of element type DST
//   length   - number of elements in the row
//   _scale   - scale factor of the division
// The int result is used by the caller as the index from which the scalar
// loop continues.
#define DIV_SIMD(SRC, DST) \
int div_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale);
// One declaration per supported (source type, destination type) combination.
DIV_SIMD(uchar, uchar)
DIV_SIMD(ushort, uchar)
DIV_SIMD(short, uchar)
DIV_SIMD(float, uchar)
DIV_SIMD(short, short)
DIV_SIMD(ushort, short)
DIV_SIMD(uchar, short)
DIV_SIMD(float, short)
DIV_SIMD(ushort, ushort)
DIV_SIMD(uchar, ushort)
DIV_SIMD(short, ushort)
DIV_SIMD(float, ushort)
DIV_SIMD(uchar, float)
DIV_SIMD(ushort, float)
DIV_SIMD(short, float)
DIV_SIMD(float, float)
// The helper macro must not leak to includers of this header.
#undef DIV_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv
#endif // !defined(GAPI_STANDALONE)

View File

@ -0,0 +1,478 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2021 Intel Corporation
// NB: allow including this *.hpp several times!
// #pragma once -- don't: this file is NOT once!
#if !defined(GAPI_STANDALONE)
#include "opencv2/gapi/own/saturate.hpp"
#include "opencv2/core.hpp"
#include <opencv2/core/hal/intrin.hpp>
#include <opencv2/core/hal/hal.hpp>
#include <cstdint>
#include <cstring>
#include <algorithm>
#include <limits>
#include <vector>
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-overflow"
#endif
using cv::gapi::own::saturate;
namespace cv {
namespace gapi {
namespace fluid {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// Forward declarations of the div_simd overloads inside the CPU-optimization
// namespace. They are emitted unconditionally (before the
// CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY guard below) so that this header can
// also be included purely for its declarations.
#define DIV_SIMD(SRC, DST) \
int div_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale);
// One declaration per supported (source type, destination type) combination.
DIV_SIMD(uchar, uchar)
DIV_SIMD(ushort, uchar)
DIV_SIMD(short, uchar)
DIV_SIMD(float, uchar)
DIV_SIMD(short, short)
DIV_SIMD(ushort, short)
DIV_SIMD(uchar, short)
DIV_SIMD(float, short)
DIV_SIMD(ushort, ushort)
DIV_SIMD(uchar, ushort)
DIV_SIMD(short, ushort)
DIV_SIMD(float, ushort)
DIV_SIMD(uchar, float)
DIV_SIMD(ushort, float)
DIV_SIMD(short, float)
DIV_SIMD(float, float)
// The macro is redefined further down to generate the definitions.
#undef DIV_SIMD
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// Tag types used to pick a div_op overload at compile time:
// scale_tag     - the quotient is multiplied by the scale factor;
// not_scale_tag - the scale factor is ignored (plain a / div).
struct scale_tag {};
struct not_scale_tag {};
// Compile-time map from a scalar element type to the universal-intrinsics
// vector type holding that element. Only the specializations listed below
// exist; using any other scalar type is a compile error (primary template is
// declared but not defined).
template<typename scalar_t>
struct vector_type_of;
// Convenience alias: vector_type_of_t<T> is vector_type_of<T>::type.
template<typename scalar_t>
using vector_type_of_t = typename vector_type_of<scalar_t>::type;
template<> struct vector_type_of<uchar> { using type = v_uint8; };
template<> struct vector_type_of<ushort> { using type = v_uint16; };
template<> struct vector_type_of<short> { using type = v_int16; };
// Loads one v_float32 worth of consecutive floats starting at `in`.
CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
{
    const v_float32 loaded = vx_load(in);
    return loaded;
}
// Loads consecutive ushort values starting at `in`, widens them to 32 bits
// and converts them to float.
CV_ALWAYS_INLINE v_float32 vg_load_f32(const ushort* in)
{
    const v_int32 widened = v_reinterpret_as_s32(vx_load_expand(in));
    return v_cvt_f32(widened);
}
// Loads consecutive short values starting at `in`, widens them to 32 bits
// and converts them to float.
CV_ALWAYS_INLINE v_float32 vg_load_f32(const short* in)
{
    const v_int32 widened = v_reinterpret_as_s32(vx_load_expand(in));
    return v_cvt_f32(widened);
}
// Loads consecutive uchar values starting at `in`, widens them to 32 bits
// (quad expansion) and converts them to float.
CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
{
    const v_int32 widened = v_reinterpret_as_s32(vx_load_expand_q(in));
    return v_cvt_f32(widened);
}
// Scaled division: multiplies the dividend by `scale` first, then divides
// the product by `div` (lane-wise).
CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
{
    const v_float32 scaled = a * scale;
    return scaled / div;
}
// Unscaled division: the scale argument is accepted only to keep the same
// call shape as the scale_tag overload and is ignored.
CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&)
{
    const v_float32 quotient = a / div;
    return quotient;
}
// Packs two 32-bit result vectors into one signed 16-bit vector and stores
// it at `dst`.
CV_ALWAYS_INLINE void v_store_div(short* dst, v_int32& res1, v_int32& res2)
{
    const v_int16 packed = v_pack(res1, res2);
    vx_store(dst, packed);
}
// Packs two 32-bit result vectors into one unsigned 16-bit vector and stores
// it at `dst`.
CV_ALWAYS_INLINE void v_store_div(ushort* dst, v_int32& res1, v_int32& res2)
{
    const v_uint16 packed = v_pack_u(res1, res2);
    vx_store(dst, packed);
}
// Packs the two 32-bit quotient vectors to signed 16 bits, replaces every
// lane whose divisor equals zero with zero, and stores the result at `dst`.
CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero,
                                     const v_int32& res1, const v_int32& res2)
{
    const v_int16 packed = v_pack(res1, res2);
    const v_int16 zero_divisor = (div == v_zero);
    vx_store(dst, v_select(zero_divisor, v_zero, packed));
}
// Packs the two 32-bit quotient vectors to unsigned 16 bits, replaces every
// lane whose divisor equals zero with zero, and stores the result at `dst`.
//
// dst        - destination row (ushort)
// div        - raw 16-bit divisor lanes; only compared against zero here
// v_zero     - all-zero vector of the same type as `div`
// res1, res2 - rounded 32-bit quotients for the low/high half of the lanes
//
// The pack must saturate to the *unsigned* range: packing with signed
// saturation and reinterpreting the bits afterwards would clamp quotients
// above 32767 to 32767 and turn negative quotients into large values, which
// disagrees with the scalar saturate<ushort>() path ([0, 65535]).
CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
                                     const v_int32& res1, const v_int32& res2)
{
    const v_uint16 packed = v_pack_u(res1, res2);
    // The comparison mask is all-ones/all-zeros per lane, so its bit pattern
    // can be reused directly as an unsigned mask.
    const v_uint16 zero_divisor = v_reinterpret_as_u16(div == v_zero);
    vx_store(dst, v_select(zero_divisor, vx_setall_u16(0), packed));
}
//=================================================================================================
// SIMD division for 16-bit sources and 16-bit destinations
// (short->ushort, ushort->ushort, ushort->short).
//
// t       - scale_tag / not_scale_tag, selects whether `_scale` is applied
// in1     - dividend row
// in2     - divisor row
// out     - output row; lanes whose divisor is zero are written as zero
// length  - number of elements in the row
// _scale  - scale factor (converted to float)
// Returns 0 when the row is shorter than one vector (nothing processed),
// otherwise the index of the first element that was not processed.
template<typename scale_tag_t, typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
        return 0;

    const v_int16 v_zero = vx_setall_s16(0);
    const v_float32 scale = vx_setall_f32(static_cast<float>(_scale));

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            const v_float32 a1 = vg_load_f32(&in1[x]);
            const v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);

            // Raw divisor bits: used only for the "divisor == 0" test, for
            // which the signed/unsigned interpretation does not matter.
            const v_int16 div = v_reinterpret_as_s16(vx_load(&in2[x]));

            // The numeric value of the divisor must honour the signedness of
            // SRC. Sign-extending the reinterpreted bits would turn ushort
            // divisors >= 32768 into negative numbers, so convert through the
            // type-aware loader instead.
            const v_float32 fdiv1 = vg_load_f32(&in2[x]);
            const v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 2]);

            const v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
            const v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));

            v_store_select(&out[x], div, v_zero, r1, r2);
        }

        if (x < length)
        {
            x = length - nlanes;
            continue;  // process one more time (unaligned tail)
        }
        break;
    }
    return x;
}
//-------------------------------------------------------------------------------------------------
// SIMD division for 16-bit sources (short / ushort) and uchar destination.
//
// t       - scale_tag / not_scale_tag, selects whether `_scale` is applied
// in1     - dividend row
// in2     - divisor row
// out     - output row; lanes whose divisor is zero are written as zero
// length  - number of elements in the row
// _scale  - scale factor (converted to float)
// Returns 0 when the row is shorter than one vector (nothing processed),
// otherwise the index of the first element that was not processed.
template<typename scale_tag_t, typename SRC>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<SRC, short>::value ||
                        std::is_same<SRC, ushort>::value, int>::type
div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
{
    constexpr int nlanes = v_uint8::nlanes;

    if (length < nlanes)
        return 0;

    const v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
    const v_int16 v_zero = vx_setall_s16(0);

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            const v_float32 a1 = vg_load_f32(&in1[x]);
            const v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
            const v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
            const v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);

            // Raw divisor bits: used only for the "divisor == 0" test, for
            // which the signed/unsigned interpretation does not matter.
            const v_int16 div1 = v_reinterpret_as_s16(vx_load(&in2[x]));
            const v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes / 2]));

            // The numeric value of the divisor must honour the signedness of
            // SRC. Sign-extending the reinterpreted bits would turn ushort
            // divisors >= 32768 into negative numbers, so convert through the
            // type-aware loader instead.
            const v_float32 fdiv1 = vg_load_f32(&in2[x]);
            const v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 4]);
            const v_float32 fdiv3 = vg_load_f32(&in2[x + nlanes / 2]);
            const v_float32 fdiv4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);

            const v_int32 sum1 = v_round(div_op(t, a1, fdiv1, scale));
            const v_int32 sum2 = v_round(div_op(t, a2, fdiv2, scale));
            const v_int32 sum3 = v_round(div_op(t, a3, fdiv3, scale));
            const v_int32 sum4 = v_round(div_op(t, a4, fdiv4, scale));

            const v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
            const v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));

            vx_store(&out[x], v_pack_u(res1, res2));
        }

        if (x < length)
        {
            x = length - nlanes;
            continue;  // process one more time (unaligned tail)
        }
        break;
    }
    return x;
}
//-------------------------------------------------------------------------------------------------
// SIMD division for float sources and uchar destination.
// Lanes whose divisor equals zero are forced to zero before rounding.
// Returns 0 when the row is shorter than one vector, otherwise the index of
// the first element that was not processed.
template<typename scale_tag_t>
CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
                             const int length, double _scale)
{
    constexpr int nlanes = v_uint8::nlanes;

    if (length < nlanes)
    {
        return 0;
    }

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    const v_float32 zeros = vx_setall_f32(0);

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            // Divisors for the four quarters of the output vector.
            const v_float32 den1 = vg_load_f32(&in2[pos]);
            const v_float32 den2 = vg_load_f32(&in2[pos + nlanes / 4]);
            const v_float32 den3 = vg_load_f32(&in2[pos + nlanes / 2]);
            const v_float32 den4 = vg_load_f32(&in2[pos + 3 * nlanes / 4]);

            // Matching dividends.
            const v_float32 num1 = vg_load_f32(&in1[pos]);
            const v_float32 num2 = vg_load_f32(&in1[pos + nlanes / 4]);
            const v_float32 num3 = vg_load_f32(&in1[pos + nlanes / 2]);
            const v_float32 num4 = vg_load_f32(&in1[pos + 3 * nlanes / 4]);

            const v_float32 quot1 = div_op(t, num1, den1, vscale);
            const v_float32 quot2 = div_op(t, num2, den2, vscale);
            const v_float32 quot3 = div_op(t, num3, den3, vscale);
            const v_float32 quot4 = div_op(t, num4, den4, vscale);

            // Zero out division-by-zero lanes, then round to integers.
            const v_int32 int1 = v_round(v_select((den1 == zeros), zeros, quot1));
            const v_int32 int2 = v_round(v_select((den2 == zeros), zeros, quot2));
            const v_int32 int3 = v_round(v_select((den3 == zeros), zeros, quot3));
            const v_int32 int4 = v_round(v_select((den4 == zeros), zeros, quot4));

            const v_int16 lower = v_pack(int1, int2);
            const v_int16 upper = v_pack(int3, int4);
            vx_store(&out[pos], v_pack_u(lower, upper));

            pos += nlanes;
        }

        if (pos >= length)
        {
            break;
        }
        // Re-run the last full vector so the unaligned tail gets covered.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// SIMD division for uchar sources and 16-bit destination (short / ushort).
// Lanes whose divisor equals zero are written as zero.
// Returns 0 when the row is shorter than one vector, otherwise the index of
// the first element that was not processed.
template<typename scale_tag_t, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, int>::type
div_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
    {
        return 0;
    }

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    const v_int16 zeros = vx_setall_s16(0);

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            // Divisors widened to 16 bits; also used for the zero test.
            const v_int16 denom = v_reinterpret_as_s16(vx_load_expand(&in2[pos]));
            const v_float32 denom_lo = v_cvt_f32(v_expand_low(denom));
            const v_float32 denom_hi = v_cvt_f32(v_expand_high(denom));

            const v_float32 num_lo = vg_load_f32(&in1[pos]);
            const v_float32 num_hi = vg_load_f32(&in1[pos + nlanes / 2]);

            const v_int32 quot_lo = v_round(div_op(t, num_lo, denom_lo, vscale));
            const v_int32 quot_hi = v_round(div_op(t, num_hi, denom_hi, vscale));

            v_store_select(&out[pos], denom, zeros, quot_lo, quot_hi);

            pos += nlanes;
        }

        if (pos >= length)
        {
            break;
        }
        // Re-run the last full vector so the unaligned tail gets covered.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// SIMD division for float sources and 16-bit destination (short / ushort).
// Lanes whose divisor equals zero are forced to zero before rounding.
// Returns 0 when the row is shorter than one vector, otherwise the index of
// the first element that was not processed.
template<typename scale_tag_t, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, int>::type
div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
    {
        return 0;
    }

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));
    const v_float32 zeros = vx_setall_f32(0);

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            const v_float32 den_lo = vg_load_f32(&in2[pos]);
            const v_float32 den_hi = vg_load_f32(&in2[pos + nlanes / 2]);

            const v_float32 num_lo = vg_load_f32(&in1[pos]);
            const v_float32 num_hi = vg_load_f32(&in1[pos + nlanes / 2]);

            const v_float32 quot_lo = div_op(t, num_lo, den_lo, vscale);
            const v_float32 quot_hi = div_op(t, num_hi, den_hi, vscale);

            // Zero out division-by-zero lanes, then round to integers.
            v_int32 int_lo = v_round(v_select((den_lo == zeros), zeros, quot_lo));
            v_int32 int_hi = v_round(v_select((den_hi == zeros), zeros, quot_hi));

            v_store_div(&out[pos], int_lo, int_hi);

            pos += nlanes;
        }

        if (pos >= length)
        {
            break;
        }
        // Re-run the last full vector so the unaligned tail gets covered.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// SIMD division for any supported source type and float destination.
// No zero-divisor masking is applied here: the quotient is stored as
// computed.
// Returns 0 when the row is shorter than one vector, otherwise the index of
// the first element that was not processed.
template<typename scale_tag_t, typename SRC>
CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
                             const int length, double _scale)
{
    constexpr int nlanes = v_float32::nlanes;

    if (length < nlanes)
    {
        return 0;
    }

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            const v_float32 numer = vg_load_f32(&in1[pos]);
            const v_float32 denom = vg_load_f32(&in2[pos]);
            const v_float32 quot = div_op(t, numer, denom, vscale);
            vx_store(&out[pos], quot);

            pos += nlanes;
        }

        if (pos >= length)
        {
            break;
        }
        // Re-run the last full vector so the unaligned tail gets covered.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// uchar / uchar -> uchar: delegates the whole row to hal::div8u, treating it
// as a single-row image of width `length`. The tag argument is unused.
// Always reports the full row as processed.
template<typename scale_tag_t>
CV_ALWAYS_INLINE int div_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
                             const int length, double scale)
{
    const size_t row_step = static_cast<size_t>(length);
    const int rows = 1;
    hal::div8u(in1, row_step, in2, row_step, out, row_step, length, rows, &scale);
    return length;
}
// short / short -> short: delegates the whole row to hal::div16s, treating it
// as a single-row image of width `length`. The tag argument is unused.
// Always reports the full row as processed.
template<typename scale_tag_t>
CV_ALWAYS_INLINE int div_hal(scale_tag_t, const short in1[], const short in2[], short out[],
                             const int length, double scale)
{
    const size_t row_step = static_cast<size_t>(length);
    const int rows = 1;
    hal::div16s(in1, row_step, in2, row_step, out, row_step, length, rows, &scale);
    return length;
}
//-------------------------------------------------------------------------------------------------
// Generates the definition of div_simd for one (SRC, DST) element-type pair.
// The scale is narrowed to float; when it is within FLT_EPSILON of 1 the
// not_scale_tag variant of div_hal is used (no multiplication by the scale),
// otherwise the scale_tag variant. The value returned by div_hal is passed
// back unchanged, so the caller can continue with a scalar loop from it.
#define DIV_SIMD(SRC, DST) \
int div_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale) \
{ \
int x = 0; \
float fscale = static_cast<float>(_scale); \
if (std::fabs(fscale - 1.0f) <= FLT_EPSILON) \
{ \
not_scale_tag t; \
x = div_hal(t, in1, in2, out, length, _scale); \
} \
else \
{ \
scale_tag t; \
x = div_hal(t, in1, in2, out, length, _scale); \
} \
return x; \
}
// One definition per supported (source type, destination type) combination.
DIV_SIMD(uchar, uchar)
DIV_SIMD(ushort, uchar)
DIV_SIMD(short, uchar)
DIV_SIMD(float, uchar)
DIV_SIMD(short, short)
DIV_SIMD(ushort, short)
DIV_SIMD(uchar, short)
DIV_SIMD(float, short)
DIV_SIMD(ushort, ushort)
DIV_SIMD(uchar, ushort)
DIV_SIMD(short, ushort)
DIV_SIMD(float, ushort)
DIV_SIMD(uchar, float)
DIV_SIMD(ushort, float)
DIV_SIMD(short, float)
DIV_SIMD(float, float)
// The helper macro is local to this header.
#undef DIV_SIMD
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace fluid
} // namespace gapi
} // namespace cv
#endif // !defined(GAPI_STANDALONE)