mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
Merge pull request #21119 from anna-khakimova:ak/simd_addc
* GAPI Fluid: SIMD for AddC kernel * Final version * Applied comments.
This commit is contained in:
parent
f044037ec5
commit
d58b5ef74b
@ -57,6 +57,7 @@ namespace core {
|
|||||||
|
|
||||||
G_TYPED_KERNEL(GAddC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.addC") {
|
G_TYPED_KERNEL(GAddC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.addC") {
|
||||||
static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
|
static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
|
||||||
|
GAPI_Assert(a.chan <= 4);
|
||||||
return a.withDepth(ddepth);
|
return a.withDepth(ddepth);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -28,7 +28,7 @@ namespace opencv_test
|
|||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
class AddPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class AddPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class AddCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class AddCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
|
@ -61,10 +61,13 @@ PERF_TEST_P_(AddPerfTest, TestPerformance)
|
|||||||
|
|
||||||
PERF_TEST_P_(AddCPerfTest, TestPerformance)
|
PERF_TEST_P_(AddCPerfTest, TestPerformance)
|
||||||
{
|
{
|
||||||
Size sz = get<0>(GetParam());
|
compare_f cmpF;
|
||||||
MatType type = get<1>(GetParam());
|
cv::Size sz;
|
||||||
int dtype = get<2>(GetParam());
|
MatType type = -1;
|
||||||
cv::GCompileArgs compile_args = get<3>(GetParam());
|
int dtype = -1;
|
||||||
|
cv::GCompileArgs compile_args;
|
||||||
|
|
||||||
|
std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();
|
||||||
|
|
||||||
initMatsRandU(type, sz, dtype, false);
|
initMatsRandU(type, sz, dtype, false);
|
||||||
|
|
||||||
@ -88,8 +91,9 @@ PERF_TEST_P_(AddCPerfTest, TestPerformance)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Comparison ////////////////////////////////////////////////////////////
|
// Comparison ////////////////////////////////////////////////////////////
|
||||||
// FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
|
{
|
||||||
EXPECT_EQ(out_mat_gapi.size(), sz);
|
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
|
||||||
|
}
|
||||||
|
|
||||||
SANITY_CHECK_NOTHING();
|
SANITY_CHECK_NOTHING();
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,8 @@ INSTANTIATE_TEST_CASE_P(AddPerfTestCPU, AddPerfTest,
|
|||||||
Values(cv::compile_args(CORE_CPU))));
|
Values(cv::compile_args(CORE_CPU))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(AddCPerfTestCPU, AddCPerfTest,
|
INSTANTIATE_TEST_CASE_P(AddCPerfTestCPU, AddCPerfTest,
|
||||||
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
|
Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||||
Values(-1, CV_8U, CV_16U, CV_32F),
|
Values(-1, CV_8U, CV_16U, CV_32F),
|
||||||
Values(cv::compile_args(CORE_CPU))));
|
Values(cv::compile_args(CORE_CPU))));
|
||||||
|
@ -18,11 +18,12 @@ INSTANTIATE_TEST_CASE_P(AddPerfTestFluid, AddPerfTest,
|
|||||||
Values(-1, CV_8U, CV_32F),
|
Values(-1, CV_8U, CV_32F),
|
||||||
Values(cv::compile_args(CORE_FLUID))));
|
Values(cv::compile_args(CORE_FLUID))));
|
||||||
|
|
||||||
// INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest,
|
INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest,
|
||||||
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
|
||||||
// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
// Values(-1, CV_8U, CV_16U, CV_32F),
|
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||||
// Values(cv::compile_args(CORE_FLUID))));
|
Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
|
||||||
|
Values(cv::compile_args(CORE_FLUID))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
|
INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
|
||||||
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
|
@ -20,7 +20,8 @@ INSTANTIATE_TEST_CASE_P(AddPerfTestGPU, AddPerfTest,
|
|||||||
Values(cv::compile_args(CORE_GPU))));
|
Values(cv::compile_args(CORE_GPU))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(AddCPerfTestGPU, AddCPerfTest,
|
INSTANTIATE_TEST_CASE_P(AddCPerfTestGPU, AddCPerfTest,
|
||||||
Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
|
Values( szSmall128, szVGA, sz720p, sz1080p ),
|
||||||
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
|
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
|
||||||
Values( -1, CV_8U, CV_16U, CV_32F ),
|
Values( -1, CV_8U, CV_16U, CV_32F ),
|
||||||
Values(cv::compile_args(CORE_GPU))));
|
Values(cv::compile_args(CORE_GPU))));
|
||||||
|
@ -645,8 +645,8 @@ CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int l
|
|||||||
#endif // CV_SIMD
|
#endif // CV_SIMD
|
||||||
|
|
||||||
template<typename DST, typename SRC1, typename SRC2>
|
template<typename DST, typename SRC1, typename SRC2>
|
||||||
static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm,
|
static CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2,
|
||||||
double scale=1)
|
Arithm arithm, double scale=1)
|
||||||
{
|
{
|
||||||
static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
|
static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
|
||||||
|
|
||||||
@ -844,19 +844,15 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
|
|||||||
//
|
//
|
||||||
//--------------------------------------
|
//--------------------------------------
|
||||||
|
|
||||||
static inline v_uint16x8 v_add_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x + y; }
|
|
||||||
static inline v_uint16x8 v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; }
|
static inline v_uint16x8 v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; }
|
||||||
static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }
|
static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }
|
||||||
|
|
||||||
static inline v_float32x4 v_add_32f(const v_float32x4 &x, const v_float32x4 &y) { return x + y; }
|
|
||||||
static inline v_float32x4 v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; }
|
static inline v_float32x4 v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; }
|
||||||
static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }
|
static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }
|
||||||
|
|
||||||
static inline int s_add_8u(uchar x, uchar y) { return x + y; }
|
|
||||||
static inline int s_sub_8u(uchar x, uchar y) { return x - y; }
|
static inline int s_sub_8u(uchar x, uchar y) { return x - y; }
|
||||||
static inline int s_subr_8u(uchar x, uchar y) { return y - x; }
|
static inline int s_subr_8u(uchar x, uchar y) { return y - x; }
|
||||||
|
|
||||||
static inline float s_add_32f(float x, float y) { return x + y; }
|
|
||||||
static inline float s_sub_32f(float x, float y) { return x - y; }
|
static inline float s_sub_32f(float x, float y) { return x - y; }
|
||||||
static inline float s_subr_32f(float x, float y) { return y - x; }
|
static inline float s_subr_32f(float x, float y) { return y - x; }
|
||||||
|
|
||||||
@ -946,11 +942,6 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void run_arithm_s_add3(uchar out[], const uchar in[], int width, const uchar scalar[])
|
|
||||||
{
|
|
||||||
run_arithm_s3(out, in, width, scalar, v_add_16u, s_add_8u);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
|
static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
|
||||||
{
|
{
|
||||||
run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
|
run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
|
||||||
@ -961,11 +952,6 @@ static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const u
|
|||||||
run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
|
run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
|
||||||
}
|
}
|
||||||
|
|
||||||
static void run_arithm_s_add1(uchar out[], const float in[], int width, const float scalar[])
|
|
||||||
{
|
|
||||||
run_arithm_s1(out, in, width, scalar, v_add_32f, s_add_32f);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
|
static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
|
||||||
{
|
{
|
||||||
run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
|
run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
|
||||||
@ -1279,8 +1265,8 @@ static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename DST, typename SRC>
|
template<typename DST, typename SRC>
|
||||||
static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
|
CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float scalar[],
|
||||||
float scale=1)
|
Arithm arithm, float scale=1)
|
||||||
{
|
{
|
||||||
const auto *in = src.InLine<SRC>(0);
|
const auto *in = src.InLine<SRC>(0);
|
||||||
auto *out = dst.OutLine<DST>();
|
auto *out = dst.OutLine<DST>();
|
||||||
@ -1288,48 +1274,45 @@ static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Ar
|
|||||||
int width = dst.length();
|
int width = dst.length();
|
||||||
int chan = dst.meta().chan;
|
int chan = dst.meta().chan;
|
||||||
|
|
||||||
// What if we cast the scalar into the SRC type?
|
|
||||||
const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
|
|
||||||
static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
|
|
||||||
bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
|
|
||||||
(myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
|
|
||||||
|
|
||||||
switch (arithm)
|
switch (arithm)
|
||||||
{
|
{
|
||||||
case ARITHM_ADD:
|
case ARITHM_ADD:
|
||||||
if (usemyscal)
|
{
|
||||||
{
|
int w = 0;
|
||||||
if (std::is_same<DST,uchar>::value &&
|
#if CV_SIMD
|
||||||
std::is_same<SRC,uchar>::value &&
|
w = addc_simd(in, scalar, out, width, chan);
|
||||||
chan == 3)
|
#endif
|
||||||
run_arithm_s_add3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
|
|
||||||
else if (std::is_same<DST,uchar>::value &&
|
for (; w < width * chan; ++w)
|
||||||
std::is_same<SRC,float>::value &&
|
out[w] = add<DST>(in[w], scalar[w % chan]);
|
||||||
chan == 1)
|
|
||||||
run_arithm_s_add1((uchar*)out, (const float*)in, width, (const float*)myscal);
|
|
||||||
else
|
|
||||||
run_arithm_s(out, in, width, chan, myscal, add<DST,SRC,SRC>);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
run_arithm_s(out, in, width, chan, scalar, add<DST,SRC,float>);
|
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case ARITHM_SUBTRACT:
|
case ARITHM_SUBTRACT:
|
||||||
|
{
|
||||||
|
// What if we cast the scalar into the SRC type?
|
||||||
|
const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
|
||||||
|
static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
|
||||||
|
bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
|
||||||
|
(myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
|
||||||
|
|
||||||
if (usemyscal)
|
if (usemyscal)
|
||||||
{
|
{
|
||||||
if (std::is_same<DST,uchar>::value &&
|
if (std::is_same<DST, uchar>::value &&
|
||||||
std::is_same<SRC,uchar>::value &&
|
std::is_same<SRC, uchar>::value &&
|
||||||
chan == 3)
|
chan == 3)
|
||||||
run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
|
run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
|
||||||
else if (std::is_same<DST,uchar>::value &&
|
else if (std::is_same<DST, uchar>::value &&
|
||||||
std::is_same<SRC,float>::value &&
|
std::is_same<SRC, float>::value &&
|
||||||
chan == 1)
|
chan == 1)
|
||||||
run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
|
run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
|
||||||
else
|
else
|
||||||
run_arithm_s(out, in, width, chan, myscal, sub<DST,SRC,SRC>);
|
run_arithm_s(out, in, width, chan, myscal, sub<DST, SRC, SRC>);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
run_arithm_s(out, in, width, chan, scalar, sub<DST,SRC,float>);
|
run_arithm_s(out, in, width, chan, scalar, sub<DST, SRC, float>);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
// TODO: optimize miltiplication and division
|
// TODO: optimize miltiplication and division
|
||||||
case ARITHM_MULTIPLY:
|
case ARITHM_MULTIPLY:
|
||||||
for (int w=0; w < width; w++)
|
for (int w=0; w < width; w++)
|
||||||
@ -1433,30 +1416,75 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)
|
GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
|
||||||
{
|
{
|
||||||
static const int Window = 1;
|
static const int Window = 1;
|
||||||
|
|
||||||
static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
|
static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst, Buffer &scratch)
|
||||||
{
|
{
|
||||||
const float scalar[4] = {
|
GAPI_Assert(src.meta().chan <= 4);
|
||||||
static_cast<float>(_scalar[0]),
|
|
||||||
static_cast<float>(_scalar[1]),
|
if (dst.y() == 0)
|
||||||
static_cast<float>(_scalar[2]),
|
{
|
||||||
static_cast<float>(_scalar[3])
|
const int chan = src.meta().chan;
|
||||||
};
|
float* sc = scratch.OutLine<float>();
|
||||||
|
|
||||||
|
for (int i = 0; i < scratch.length(); ++i)
|
||||||
|
sc[i] = static_cast<float>(_scalar[i % chan]);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float* scalar = scratch.OutLine<float>();
|
||||||
|
|
||||||
// DST SRC OP __VA_ARGS__
|
// DST SRC OP __VA_ARGS__
|
||||||
UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(uchar, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(uchar, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(uchar, short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(uchar, float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(ushort, short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
UNARY_(ushort, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(ushort, float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(short, short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(short, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(short, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(short, float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(float, uchar, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(float, ushort, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(float, short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
UNARY_(float, float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
|
||||||
|
|
||||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
|
||||||
|
{
|
||||||
|
#if CV_SIMD
|
||||||
|
// 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
|
||||||
|
constexpr int maxNlanes = 16;
|
||||||
|
|
||||||
|
// +2 is offset for 3-channel case.
|
||||||
|
// Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
|
||||||
|
// Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
|
||||||
|
// The first scalar SIMD vector should looks like:
|
||||||
|
// C1 C2 C3 C1
|
||||||
|
// The second:
|
||||||
|
// C2 C3 C1 C2
|
||||||
|
// The third:
|
||||||
|
// C3 C1 C2 C3
|
||||||
|
constexpr int offset = 2;
|
||||||
|
constexpr int buflen = maxNlanes + offset;
|
||||||
|
#else
|
||||||
|
constexpr int buflen = 4;
|
||||||
|
#endif
|
||||||
|
cv::Size bufsize(buflen, 1);
|
||||||
|
GMatDesc bufdesc = { CV_32F, 1, bufsize };
|
||||||
|
Buffer buffer(bufdesc);
|
||||||
|
scratch = std::move(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Does nothing: run() rewrites the whole scratch line whenever dst.y() == 0.
static void resetScratch(Buffer& /* scratch */) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
|
GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
|
||||||
|
@ -85,6 +85,33 @@ MUL_SIMD(float, float)
|
|||||||
|
|
||||||
#undef MUL_SIMD
|
#undef MUL_SIMD
|
||||||
|
|
||||||
|
#define ADDC_SIMD(SRC, DST) \
|
||||||
|
int addc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int width, const int chan) \
|
||||||
|
{ \
|
||||||
|
CV_CPU_DISPATCH(addc_simd, (in, scalar, out, width, chan), \
|
||||||
|
CV_CPU_DISPATCH_MODES_ALL); \
|
||||||
|
}
|
||||||
|
|
||||||
|
ADDC_SIMD(uchar, uchar)
|
||||||
|
ADDC_SIMD(ushort, uchar)
|
||||||
|
ADDC_SIMD(short, uchar)
|
||||||
|
ADDC_SIMD(float, uchar)
|
||||||
|
ADDC_SIMD(short, short)
|
||||||
|
ADDC_SIMD(ushort, short)
|
||||||
|
ADDC_SIMD(uchar, short)
|
||||||
|
ADDC_SIMD(float, short)
|
||||||
|
ADDC_SIMD(ushort, ushort)
|
||||||
|
ADDC_SIMD(uchar, ushort)
|
||||||
|
ADDC_SIMD(short, ushort)
|
||||||
|
ADDC_SIMD(float, ushort)
|
||||||
|
ADDC_SIMD(uchar, float)
|
||||||
|
ADDC_SIMD(ushort, float)
|
||||||
|
ADDC_SIMD(short, float)
|
||||||
|
ADDC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef ADDC_SIMD
|
||||||
|
|
||||||
} // namespace fluid
|
} // namespace fluid
|
||||||
} // namespace gapi
|
} // namespace gapi
|
||||||
} // namespace cv
|
} // namespace cv
|
||||||
|
@ -60,6 +60,29 @@ MUL_SIMD(float, float)
|
|||||||
|
|
||||||
#undef MUL_SIMD
|
#undef MUL_SIMD
|
||||||
|
|
||||||
|
#define ADDC_SIMD(SRC, DST) \
|
||||||
|
int addc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int width, const int chan);
|
||||||
|
|
||||||
|
ADDC_SIMD(uchar, uchar)
|
||||||
|
ADDC_SIMD(ushort, uchar)
|
||||||
|
ADDC_SIMD(short, uchar)
|
||||||
|
ADDC_SIMD(float, uchar)
|
||||||
|
ADDC_SIMD(short, short)
|
||||||
|
ADDC_SIMD(ushort, short)
|
||||||
|
ADDC_SIMD(uchar, short)
|
||||||
|
ADDC_SIMD(float, short)
|
||||||
|
ADDC_SIMD(ushort, ushort)
|
||||||
|
ADDC_SIMD(uchar, ushort)
|
||||||
|
ADDC_SIMD(short, ushort)
|
||||||
|
ADDC_SIMD(float, ushort)
|
||||||
|
ADDC_SIMD(uchar, float)
|
||||||
|
ADDC_SIMD(ushort, float)
|
||||||
|
ADDC_SIMD(short, float)
|
||||||
|
ADDC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef ADDC_SIMD
|
||||||
|
|
||||||
} // namespace fluid
|
} // namespace fluid
|
||||||
} // namespace gapi
|
} // namespace gapi
|
||||||
} // namespace cv
|
} // namespace cv
|
||||||
|
@ -81,6 +81,29 @@ MUL_SIMD(float, float)
|
|||||||
|
|
||||||
#undef MUL_SIMD
|
#undef MUL_SIMD
|
||||||
|
|
||||||
|
#define ADDC_SIMD(SRC, DST) \
|
||||||
|
int addc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int width, const int chan);
|
||||||
|
|
||||||
|
ADDC_SIMD(uchar, uchar)
|
||||||
|
ADDC_SIMD(ushort, uchar)
|
||||||
|
ADDC_SIMD(short, uchar)
|
||||||
|
ADDC_SIMD(float, uchar)
|
||||||
|
ADDC_SIMD(short, short)
|
||||||
|
ADDC_SIMD(ushort, short)
|
||||||
|
ADDC_SIMD(uchar, short)
|
||||||
|
ADDC_SIMD(float, short)
|
||||||
|
ADDC_SIMD(ushort, ushort)
|
||||||
|
ADDC_SIMD(uchar, ushort)
|
||||||
|
ADDC_SIMD(short, ushort)
|
||||||
|
ADDC_SIMD(float, ushort)
|
||||||
|
ADDC_SIMD(uchar, float)
|
||||||
|
ADDC_SIMD(ushort, float)
|
||||||
|
ADDC_SIMD(short, float)
|
||||||
|
ADDC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef ADDC_SIMD
|
||||||
|
|
||||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||||
|
|
||||||
struct scale_tag {};
|
struct scale_tag {};
|
||||||
@ -95,6 +118,7 @@ using vector_type_of_t = typename vector_type_of<scalar_t>::type;
|
|||||||
template<> struct vector_type_of<uchar> { using type = v_uint8; };
|
template<> struct vector_type_of<uchar> { using type = v_uint8; };
|
||||||
template<> struct vector_type_of<ushort> { using type = v_uint16; };
|
template<> struct vector_type_of<ushort> { using type = v_uint16; };
|
||||||
template<> struct vector_type_of<short> { using type = v_int16; };
|
template<> struct vector_type_of<short> { using type = v_int16; };
|
||||||
|
template<> struct vector_type_of<float> { using type = v_float32; };
|
||||||
|
|
||||||
CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
|
CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
|
||||||
{
|
{
|
||||||
@ -136,12 +160,12 @@ CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_flo
|
|||||||
return a / div;
|
return a / div;
|
||||||
}
|
}
|
||||||
|
|
||||||
CV_ALWAYS_INLINE void v_store_i16(short* dst, v_int32& res1, v_int32& res2)
|
CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2)
|
||||||
{
|
{
|
||||||
vx_store(dst, v_pack(res1, res2));
|
vx_store(dst, v_pack(res1, res2));
|
||||||
}
|
}
|
||||||
|
|
||||||
CV_ALWAYS_INLINE void v_store_i16(ushort* dst, v_int32& res1, v_int32& res2)
|
CV_ALWAYS_INLINE void v_store_i16(ushort* dst, const v_int32& res1, const v_int32& res2)
|
||||||
{
|
{
|
||||||
vx_store(dst, v_pack_u(res1, res2));
|
vx_store(dst, v_pack_u(res1, res2));
|
||||||
}
|
}
|
||||||
@ -821,6 +845,243 @@ MUL_SIMD(float, float)
|
|||||||
|
|
||||||
#undef MUL_SIMD
|
#undef MUL_SIMD
|
||||||
|
|
||||||
|
//-------------------------
|
||||||
|
//
|
||||||
|
// Fluid kernels: AddC
|
||||||
|
//
|
||||||
|
//-------------------------
|
||||||
|
|
||||||
|
// Packs six 32-bit integer vectors pairwise with v_pack and writes the three
// resulting vectors back to back at outx, v_int16::nlanes elements apart.
CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx,       const v_int32& c1,
                                         const v_int32& c2, const v_int32& c3,
                                         const v_int32& c4, const v_int32& c5,
                                         const v_int32& c6)
{
    constexpr int step = v_int16::nlanes;

    short* const second = outx + step;
    short* const third  = outx + 2 * step;

    vx_store(outx,   v_pack(c1, c2));
    vx_store(second, v_pack(c3, c4));
    vx_store(third,  v_pack(c5, c6));
}
|
||||||
|
|
||||||
|
// Unsigned counterpart of the short* overload: packs six 32-bit integer
// vectors pairwise with v_pack_u and writes the three resulting vectors back
// to back at outx, v_uint16::nlanes elements apart.
CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx,      const v_int32& c1,
                                         const v_int32& c2, const v_int32& c3,
                                         const v_int32& c4, const v_int32& c5,
                                         const v_int32& c6)
{
    constexpr int step = v_uint16::nlanes;

    ushort* const second = outx + step;
    ushort* const third  = outx + 2 * step;

    vx_store(outx,   v_pack_u(c1, c2));
    vx_store(second, v_pack_u(c3, c4));
    vx_store(third,  v_pack_u(c5, c6));
}
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE
|
||||||
|
typename std::enable_if<(std::is_same<DST, ushort>::value ||
|
||||||
|
std::is_same<DST, short>::value), void>::type
|
||||||
|
addc_simd_common_impl(const SRC* inx, DST* outx, const v_float32& sc, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
v_float32 a2 = vg_load_f32(&inx[nlanes/2]);
|
||||||
|
|
||||||
|
v_store_i16(outx, v_round(a1 + sc), v_round(a2 + sc));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, uchar* outx, const v_float32& sc, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
|
||||||
|
v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
|
||||||
|
v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);
|
||||||
|
|
||||||
|
vx_store(outx, v_pack_u(v_pack(v_round(a1 + sc),
|
||||||
|
v_round(a2 + sc)),
|
||||||
|
v_pack(v_round(a3 + sc),
|
||||||
|
v_round(a4 + sc))));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, float* outx, const v_float32& sc, const int)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
vx_store(outx, a1 + sc);
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE
|
||||||
|
typename std::enable_if<std::is_same<DST, short>::value ||
|
||||||
|
std::is_same<DST, ushort>::value, void>::type
|
||||||
|
addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
|
||||||
|
const v_float32& s3, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
|
||||||
|
v_float32 a3 = vg_load_f32(&inx[nlanes]);
|
||||||
|
v_float32 a4 = vg_load_f32(&inx[3 * nlanes / 2]);
|
||||||
|
v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
|
||||||
|
v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);
|
||||||
|
|
||||||
|
addc_pack_store_c3(outx, v_round(a1 + s1),
|
||||||
|
v_round(a2 + s2),
|
||||||
|
v_round(a3 + s3),
|
||||||
|
v_round(a4 + s1),
|
||||||
|
v_round(a5 + s2),
|
||||||
|
v_round(a6 + s3));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* inx, uchar* outx,
|
||||||
|
const v_float32& s1, const v_float32& s2,
|
||||||
|
const v_float32& s3, const int nlanes)
|
||||||
|
{
|
||||||
|
vx_store(outx,
|
||||||
|
v_pack_u(v_pack(v_round(vg_load_f32(inx) + s1),
|
||||||
|
v_round(vg_load_f32(&inx[nlanes/4]) + s2)),
|
||||||
|
v_pack(v_round(vg_load_f32(&inx[nlanes/2]) + s3),
|
||||||
|
v_round(vg_load_f32(&inx[3*nlanes/4]) + s1))));
|
||||||
|
|
||||||
|
vx_store(&outx[nlanes],
|
||||||
|
v_pack_u(v_pack(v_round(vg_load_f32(&inx[nlanes]) + s2),
|
||||||
|
v_round(vg_load_f32(&inx[5*nlanes/4]) + s3)),
|
||||||
|
v_pack(v_round(vg_load_f32(&inx[3*nlanes/2]) + s1),
|
||||||
|
v_round(vg_load_f32(&inx[7*nlanes/4]) + s2))));
|
||||||
|
|
||||||
|
vx_store(&outx[2 * nlanes],
|
||||||
|
v_pack_u(v_pack(v_round(vg_load_f32(&inx[2*nlanes]) + s3),
|
||||||
|
v_round(vg_load_f32(&inx[9*nlanes/4]) + s1)),
|
||||||
|
v_pack(v_round(vg_load_f32(&inx[5*nlanes/2]) + s2),
|
||||||
|
v_round(vg_load_f32(&inx[11*nlanes/4]) + s3))));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* in, float* out,
|
||||||
|
const v_float32& s1, const v_float32& s2,
|
||||||
|
const v_float32& s3, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(in);
|
||||||
|
v_float32 a2 = vg_load_f32(&in[nlanes]);
|
||||||
|
v_float32 a3 = vg_load_f32(&in[2*nlanes]);
|
||||||
|
|
||||||
|
vx_store(out, a1 + s1);
|
||||||
|
vx_store(&out[nlanes], a2 + s2);
|
||||||
|
vx_store(&out[2*nlanes], a3 + s3);
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[], const int length)
|
||||||
|
{
|
||||||
|
constexpr int chan = 3;
|
||||||
|
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
|
||||||
|
constexpr int lanes = chan * nlanes;
|
||||||
|
|
||||||
|
if (length < lanes)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
v_float32 s1 = vx_load(scalar);
|
||||||
|
#if CV_SIMD_WIDTH == 32
|
||||||
|
v_float32 s2 = vx_load(&scalar[2]);
|
||||||
|
v_float32 s3 = vx_load(&scalar[1]);
|
||||||
|
#else
|
||||||
|
v_float32 s2 = vx_load(&scalar[1]);
|
||||||
|
v_float32 s3 = vx_load(&scalar[2]);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int x = 0;
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
for (; x <= length - lanes; x += lanes)
|
||||||
|
{
|
||||||
|
addc_simd_c3_impl(&in[x], &out[x], s1, s2, s3, nlanes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x < length)
|
||||||
|
{
|
||||||
|
x = length - lanes;
|
||||||
|
continue; // process unaligned tail
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST out[], const int length)
|
||||||
|
{
|
||||||
|
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
|
||||||
|
|
||||||
|
if (length < nlanes)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
v_float32 sc = vx_load(scalar);
|
||||||
|
|
||||||
|
int x = 0;
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
for (; x <= length - nlanes; x += nlanes)
|
||||||
|
{
|
||||||
|
addc_simd_common_impl(&in[x], &out[x], sc, nlanes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x < length)
|
||||||
|
{
|
||||||
|
x = length - nlanes;
|
||||||
|
continue; // process unaligned tail
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ADDC_SIMD(SRC, DST) \
|
||||||
|
int addc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int width, const int chan) \
|
||||||
|
{ \
|
||||||
|
const int length = width * chan; \
|
||||||
|
switch (chan) \
|
||||||
|
{ \
|
||||||
|
case 1: \
|
||||||
|
case 2: \
|
||||||
|
case 4: \
|
||||||
|
return addc_simd_common(in, scalar, out, length); \
|
||||||
|
case 3: \
|
||||||
|
return addc_simd_c3(in, scalar, out, length); \
|
||||||
|
default: \
|
||||||
|
GAPI_Assert(chan <= 4); \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
return 0; \
|
||||||
|
}
|
||||||
|
|
||||||
|
ADDC_SIMD(uchar, uchar)
|
||||||
|
ADDC_SIMD(ushort, uchar)
|
||||||
|
ADDC_SIMD(short, uchar)
|
||||||
|
ADDC_SIMD(float, uchar)
|
||||||
|
ADDC_SIMD(short, short)
|
||||||
|
ADDC_SIMD(ushort, short)
|
||||||
|
ADDC_SIMD(uchar, short)
|
||||||
|
ADDC_SIMD(float, short)
|
||||||
|
ADDC_SIMD(ushort, ushort)
|
||||||
|
ADDC_SIMD(uchar, ushort)
|
||||||
|
ADDC_SIMD(short, ushort)
|
||||||
|
ADDC_SIMD(float, ushort)
|
||||||
|
ADDC_SIMD(uchar, float)
|
||||||
|
ADDC_SIMD(ushort, float)
|
||||||
|
ADDC_SIMD(short, float)
|
||||||
|
ADDC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef ADDC_SIMD
|
||||||
|
|
||||||
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||||
|
|
||||||
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||||
|
Loading…
Reference in New Issue
Block a user