mirror of
https://github.com/opencv/opencv.git
synced 2025-06-07 17:44:04 +08:00
Merge pull request #21177 from anna-khakimova:ak/simd_mulc
* GAPI Fluid: SIMD for MulC kernel. * Changes for MulDouble kernel.
This commit is contained in:
parent
c5b8b5687f
commit
c3910807c5
@ -33,8 +33,8 @@ namespace opencv_test
|
|||||||
class SubCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class SubCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
|
class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
|
||||||
class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class MulDoublePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class MulCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
|
class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
|
||||||
class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
class DivRCPerfTest : public TestPerfParams<tuple<compare_f,cv::Size, MatType, int, cv::GCompileArgs>> {};
|
class DivRCPerfTest : public TestPerfParams<tuple<compare_f,cv::Size, MatType, int, cv::GCompileArgs>> {};
|
||||||
|
@ -257,17 +257,21 @@ PERF_TEST_P_(MulPerfTest, TestPerformance)
|
|||||||
|
|
||||||
PERF_TEST_P_(MulDoublePerfTest, TestPerformance)
|
PERF_TEST_P_(MulDoublePerfTest, TestPerformance)
|
||||||
{
|
{
|
||||||
Size sz = get<0>(GetParam());
|
compare_f cmpF;
|
||||||
MatType type = get<1>(GetParam());
|
cv::Size sz;
|
||||||
int dtype = get<2>(GetParam());
|
MatType type = -1;
|
||||||
cv::GCompileArgs compile_args = get<3>(GetParam());
|
int dtype = -1;
|
||||||
|
double scale = 1.0;
|
||||||
|
cv::GCompileArgs compile_args;
|
||||||
|
|
||||||
|
std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();
|
||||||
|
|
||||||
auto& rng = cv::theRNG();
|
auto& rng = cv::theRNG();
|
||||||
double d = rng.uniform(0.0, 10.0);
|
double d = rng.uniform(0.0, 10.0);
|
||||||
initMatrixRandU(type, sz, dtype, false);
|
initMatrixRandU(type, sz, dtype, false);
|
||||||
|
|
||||||
// OpenCV code ///////////////////////////////////////////////////////////
|
// OpenCV code ///////////////////////////////////////////////////////////
|
||||||
cv::multiply(in_mat1, d, out_mat_ocv, 1, dtype);
|
cv::multiply(in_mat1, d, out_mat_ocv, scale, dtype);
|
||||||
|
|
||||||
// G-API code ////////////////////////////////////////////////////////////
|
// G-API code ////////////////////////////////////////////////////////////
|
||||||
cv::GMat in1, out;
|
cv::GMat in1, out;
|
||||||
@ -285,8 +289,9 @@ PERF_TEST_P_(MulDoublePerfTest, TestPerformance)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Comparison ////////////////////////////////////////////////////////////
|
// Comparison ////////////////////////////////////////////////////////////
|
||||||
// FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
|
{
|
||||||
EXPECT_EQ(out_mat_gapi.size(), sz);
|
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
|
||||||
|
}
|
||||||
|
|
||||||
SANITY_CHECK_NOTHING();
|
SANITY_CHECK_NOTHING();
|
||||||
}
|
}
|
||||||
@ -295,15 +300,19 @@ PERF_TEST_P_(MulDoublePerfTest, TestPerformance)
|
|||||||
|
|
||||||
PERF_TEST_P_(MulCPerfTest, TestPerformance)
|
PERF_TEST_P_(MulCPerfTest, TestPerformance)
|
||||||
{
|
{
|
||||||
Size sz = get<0>(GetParam());
|
compare_f cmpF;
|
||||||
MatType type = get<1>(GetParam());
|
cv::Size sz;
|
||||||
int dtype = get<2>(GetParam());
|
MatType type = -1;
|
||||||
cv::GCompileArgs compile_args = get<3>(GetParam());
|
int dtype = -1;
|
||||||
|
double scale = 1.0;
|
||||||
|
cv::GCompileArgs compile_args;
|
||||||
|
|
||||||
|
std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();
|
||||||
|
|
||||||
initMatsRandU(type, sz, dtype, false);
|
initMatsRandU(type, sz, dtype, false);
|
||||||
|
|
||||||
// OpenCV code ///////////////////////////////////////////////////////////
|
// OpenCV code ///////////////////////////////////////////////////////////
|
||||||
cv::multiply(in_mat1, sc, out_mat_ocv, 1, dtype);
|
cv::multiply(in_mat1, sc, out_mat_ocv, scale, dtype);
|
||||||
|
|
||||||
// G-API code ////////////////////////////////////////////////////////////
|
// G-API code ////////////////////////////////////////////////////////////
|
||||||
cv::GMat in1, out;
|
cv::GMat in1, out;
|
||||||
@ -322,8 +331,9 @@ PERF_TEST_P_(MulCPerfTest, TestPerformance)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Comparison ////////////////////////////////////////////////////////////
|
// Comparison ////////////////////////////////////////////////////////////
|
||||||
// FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
|
{
|
||||||
EXPECT_EQ(out_mat_gapi.size(), sz);
|
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
|
||||||
|
}
|
||||||
|
|
||||||
SANITY_CHECK_NOTHING();
|
SANITY_CHECK_NOTHING();
|
||||||
}
|
}
|
||||||
|
@ -56,13 +56,15 @@ INSTANTIATE_TEST_CASE_P(MulPerfTestCPU, MulPerfTest,
|
|||||||
Values(cv::compile_args(CORE_CPU))));
|
Values(cv::compile_args(CORE_CPU))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,
|
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,
|
||||||
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
Values(-1, CV_8U, CV_16U, CV_32F),
|
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||||
Values(cv::compile_args(CORE_CPU))));
|
Values(-1, CV_8U, CV_16U, CV_32F),
|
||||||
|
Values(cv::compile_args(CORE_CPU))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(MulCPerfTestCPU, MulCPerfTest,
|
INSTANTIATE_TEST_CASE_P(MulCPerfTestCPU, MulCPerfTest,
|
||||||
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
|
Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||||
Values(-1, CV_8U, CV_16U, CV_32F),
|
Values(-1, CV_8U, CV_16U, CV_32F),
|
||||||
Values(cv::compile_args(CORE_CPU))));
|
Values(cv::compile_args(CORE_CPU))));
|
||||||
|
@ -52,17 +52,19 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
|
|||||||
Values(2.0),
|
Values(2.0),
|
||||||
Values(cv::compile_args(CORE_FLUID))));
|
Values(cv::compile_args(CORE_FLUID))));
|
||||||
|
|
||||||
// INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
|
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
|
||||||
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
|
||||||
// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
// Values(-1, CV_8U, CV_16U, CV_32F),
|
Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1),
|
||||||
// Values(cv::compile_args(CORE_FLUID))));
|
Values(-1, CV_8U, CV_32F),
|
||||||
|
Values(cv::compile_args(CORE_FLUID))));
|
||||||
|
|
||||||
// INSTANTIATE_TEST_CASE_P(MulCPerfTestFluid, MulCPerfTest,
|
INSTANTIATE_TEST_CASE_P(MulCPerfTestFluid, MulCPerfTest,
|
||||||
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
|
Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
|
||||||
// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
Values(szSmall128, szVGA, sz720p, sz1080p),
|
||||||
// Values(-1, CV_8U, CV_16U, CV_32F),
|
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||||
// Values(cv::compile_args(CORE_FLUID))));
|
Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
|
||||||
|
Values(cv::compile_args(CORE_FLUID))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
|
INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
|
||||||
Combine(Values(AbsExact().to_compare_f()),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
|
@ -54,13 +54,15 @@ INSTANTIATE_TEST_CASE_P(MulPerfTestGPU, MulPerfTest,
|
|||||||
Values(cv::compile_args(CORE_GPU))));
|
Values(cv::compile_args(CORE_GPU))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,
|
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,
|
||||||
Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
|
Values( szSmall128, szVGA, sz720p, sz1080p ),
|
||||||
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
|
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
|
||||||
Values( -1, CV_8U, CV_16U, CV_32F ),
|
Values( -1, CV_8U, CV_16U, CV_32F ),
|
||||||
Values(cv::compile_args(CORE_GPU))));
|
Values(cv::compile_args(CORE_GPU))));
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest,
|
INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest,
|
||||||
Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
|
Combine(Values(AbsExact().to_compare_f()),
|
||||||
|
Values( szSmall128, szVGA, sz720p, sz1080p ),
|
||||||
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
|
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
|
||||||
Values( -1, CV_8U, CV_16U, CV_32F ),
|
Values( -1, CV_8U, CV_16U, CV_32F ),
|
||||||
Values(cv::compile_args(CORE_GPU))));
|
Values(cv::compile_args(CORE_GPU))));
|
||||||
|
@ -1265,12 +1265,12 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
|
|||||||
{
|
{
|
||||||
case ARITHM_ADD:
|
case ARITHM_ADD:
|
||||||
{
|
{
|
||||||
int w = 0;
|
int w = 0;
|
||||||
#if CV_SIMD
|
#if CV_SIMD
|
||||||
w = addc_simd(in, scalar, out, length, chan);
|
w = addc_simd(in, scalar, out, length, chan);
|
||||||
#endif
|
#endif
|
||||||
for (; w < length; ++w)
|
for (; w < length; ++w)
|
||||||
out[w] = add<DST>(in[w], scalar[w % chan]);
|
out[w] = add<DST>(in[w], scalar[w % chan]);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1284,12 +1284,17 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
|
|||||||
out[w] = sub<DST>(in[w], scalar[w % chan]);
|
out[w] = sub<DST>(in[w], scalar[w % chan]);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// TODO: optimize miltiplication and division
|
|
||||||
case ARITHM_MULTIPLY:
|
case ARITHM_MULTIPLY:
|
||||||
for (int w=0; w < width; w++)
|
{
|
||||||
for (int c=0; c < chan; c++)
|
int w = 0;
|
||||||
out[chan*w + c] = mul<DST>(in[chan*w + c], scalar[c], scale);
|
#if CV_SIMD
|
||||||
|
w = mulc_simd(in, scalar, out, length, chan, scale);
|
||||||
|
#endif
|
||||||
|
for (; w < width; ++w)
|
||||||
|
for (int c = 0; c < chan; ++c)
|
||||||
|
out[chan * w + c] = mul<DST>(in[chan * w + c], scalar[c], scale);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case ARITHM_DIVIDE:
|
case ARITHM_DIVIDE:
|
||||||
for (int w=0; w < width; w++)
|
for (int w=0; w < width; w++)
|
||||||
for (int c=0; c < chan; c++)
|
for (int c=0; c < chan; c++)
|
||||||
@ -1539,18 +1544,73 @@ GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, false)
|
GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true)
|
||||||
{
|
{
|
||||||
static const int Window = 1;
|
static const int Window = 1;
|
||||||
|
|
||||||
static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
|
static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/,
|
||||||
|
Buffer& dst, Buffer& scratch)
|
||||||
{
|
{
|
||||||
const float scalar[4] = {
|
GAPI_Assert(src.meta().chan <= 4);
|
||||||
static_cast<float>(_scalar[0]),
|
|
||||||
static_cast<float>(_scalar[1]),
|
if (dst.y() == 0)
|
||||||
static_cast<float>(_scalar[2]),
|
{
|
||||||
static_cast<float>(_scalar[3])
|
const int chan = src.meta().chan;
|
||||||
};
|
float* sc = scratch.OutLine<float>();
|
||||||
|
|
||||||
|
for (int i = 0; i < scratch.length(); ++i)
|
||||||
|
sc[i] = static_cast<float>(_scalar[i % chan]);
|
||||||
|
}
|
||||||
|
const float* scalar = scratch.OutLine<float>();
|
||||||
|
const float scale = 1.0;
|
||||||
|
|
||||||
|
// DST SRC OP __VA_ARGS__
|
||||||
|
UNARY_(uchar, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(uchar, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(uchar, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(uchar, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(ushort, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(ushort, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(ushort, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(short, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(short, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(short, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(short, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(float, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(float, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(float, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
UNARY_(float, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
||||||
|
|
||||||
|
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
|
||||||
|
{
|
||||||
|
initScratchBuffer(scratch);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void resetScratch(Buffer& /*scratch*/)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, true)
|
||||||
|
{
|
||||||
|
static const int Window = 1;
|
||||||
|
|
||||||
|
static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst, Buffer& scratch)
|
||||||
|
{
|
||||||
|
GAPI_Assert(src.meta().chan <= 4);
|
||||||
|
|
||||||
|
if (dst.y() == 0)
|
||||||
|
{
|
||||||
|
float* sc = scratch.OutLine<float>();
|
||||||
|
|
||||||
|
for (int i = 0; i < scratch.length(); ++i)
|
||||||
|
sc[i] = static_cast<float>(_scalar);
|
||||||
|
}
|
||||||
|
const float* scalar = scratch.OutLine<float>();
|
||||||
const float scale = 1.f;
|
const float scale = 1.f;
|
||||||
|
|
||||||
// DST SRC OP __VA_ARGS__
|
// DST SRC OP __VA_ARGS__
|
||||||
@ -1564,32 +1624,14 @@ GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, false)
|
|||||||
|
|
||||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, false)
|
static void initScratch(const GMatDesc&, double, int, Buffer& scratch)
|
||||||
{
|
|
||||||
static const int Window = 1;
|
|
||||||
|
|
||||||
static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst)
|
|
||||||
{
|
{
|
||||||
const float scalar[4] = {
|
initScratchBuffer(scratch);
|
||||||
static_cast<float>(_scalar),
|
}
|
||||||
static_cast<float>(_scalar),
|
|
||||||
static_cast<float>(_scalar),
|
|
||||||
static_cast<float>(_scalar)
|
|
||||||
};
|
|
||||||
const float scale = 1.f;
|
|
||||||
|
|
||||||
// DST SRC OP __VA_ARGS__
|
static void resetScratch(Buffer& /*scratch*/)
|
||||||
UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
{
|
||||||
UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
|
||||||
UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
|
||||||
UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
|
||||||
UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
|
||||||
UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
|
||||||
UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
|
|
||||||
|
|
||||||
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -138,6 +138,33 @@ SUBC_SIMD(float, float)
|
|||||||
|
|
||||||
#undef SUBC_SIMD
|
#undef SUBC_SIMD
|
||||||
|
|
||||||
|
#define MULC_SIMD(SRC, DST) \
|
||||||
|
int mulc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int length, const int chan, const float scale) \
|
||||||
|
{ \
|
||||||
|
CV_CPU_DISPATCH(mulc_simd, (in, scalar, out, length, chan, scale), \
|
||||||
|
CV_CPU_DISPATCH_MODES_ALL); \
|
||||||
|
}
|
||||||
|
|
||||||
|
MULC_SIMD(uchar, uchar)
|
||||||
|
MULC_SIMD(ushort, uchar)
|
||||||
|
MULC_SIMD(short, uchar)
|
||||||
|
MULC_SIMD(float, uchar)
|
||||||
|
MULC_SIMD(short, short)
|
||||||
|
MULC_SIMD(ushort, short)
|
||||||
|
MULC_SIMD(uchar, short)
|
||||||
|
MULC_SIMD(float, short)
|
||||||
|
MULC_SIMD(ushort, ushort)
|
||||||
|
MULC_SIMD(uchar, ushort)
|
||||||
|
MULC_SIMD(short, ushort)
|
||||||
|
MULC_SIMD(float, ushort)
|
||||||
|
MULC_SIMD(uchar, float)
|
||||||
|
MULC_SIMD(ushort, float)
|
||||||
|
MULC_SIMD(short, float)
|
||||||
|
MULC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef MULC_SIMD
|
||||||
|
|
||||||
} // namespace fluid
|
} // namespace fluid
|
||||||
} // namespace gapi
|
} // namespace gapi
|
||||||
} // namespace cv
|
} // namespace cv
|
||||||
|
@ -106,6 +106,29 @@ SUBC_SIMD(float, float)
|
|||||||
|
|
||||||
#undef SUBC_SIMD
|
#undef SUBC_SIMD
|
||||||
|
|
||||||
|
#define MULC_SIMD(SRC, DST) \
|
||||||
|
int mulc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int length, const int chan, const float scale);
|
||||||
|
|
||||||
|
MULC_SIMD(uchar, uchar)
|
||||||
|
MULC_SIMD(ushort, uchar)
|
||||||
|
MULC_SIMD(short, uchar)
|
||||||
|
MULC_SIMD(float, uchar)
|
||||||
|
MULC_SIMD(short, short)
|
||||||
|
MULC_SIMD(ushort, short)
|
||||||
|
MULC_SIMD(uchar, short)
|
||||||
|
MULC_SIMD(float, short)
|
||||||
|
MULC_SIMD(ushort, ushort)
|
||||||
|
MULC_SIMD(uchar, ushort)
|
||||||
|
MULC_SIMD(short, ushort)
|
||||||
|
MULC_SIMD(float, ushort)
|
||||||
|
MULC_SIMD(uchar, float)
|
||||||
|
MULC_SIMD(ushort, float)
|
||||||
|
MULC_SIMD(short, float)
|
||||||
|
MULC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef MULC_SIMD
|
||||||
|
|
||||||
} // namespace fluid
|
} // namespace fluid
|
||||||
} // namespace gapi
|
} // namespace gapi
|
||||||
} // namespace cv
|
} // namespace cv
|
||||||
|
@ -127,6 +127,30 @@ SUBC_SIMD(float, float)
|
|||||||
|
|
||||||
#undef SUBC_SIMD
|
#undef SUBC_SIMD
|
||||||
|
|
||||||
|
|
||||||
|
#define MULC_SIMD(SRC, DST) \
|
||||||
|
int mulc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int length, const int chan, const float scale);
|
||||||
|
|
||||||
|
MULC_SIMD(uchar, uchar)
|
||||||
|
MULC_SIMD(ushort, uchar)
|
||||||
|
MULC_SIMD(short, uchar)
|
||||||
|
MULC_SIMD(float, uchar)
|
||||||
|
MULC_SIMD(short, short)
|
||||||
|
MULC_SIMD(ushort, short)
|
||||||
|
MULC_SIMD(uchar, short)
|
||||||
|
MULC_SIMD(float, short)
|
||||||
|
MULC_SIMD(ushort, ushort)
|
||||||
|
MULC_SIMD(uchar, ushort)
|
||||||
|
MULC_SIMD(short, ushort)
|
||||||
|
MULC_SIMD(float, ushort)
|
||||||
|
MULC_SIMD(uchar, float)
|
||||||
|
MULC_SIMD(ushort, float)
|
||||||
|
MULC_SIMD(short, float)
|
||||||
|
MULC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef MULC_SIMD
|
||||||
|
|
||||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||||
|
|
||||||
struct scale_tag {};
|
struct scale_tag {};
|
||||||
@ -870,12 +894,13 @@ MUL_SIMD(float, float)
|
|||||||
|
|
||||||
//-------------------------
|
//-------------------------
|
||||||
//
|
//
|
||||||
// Fluid kernels: AddC
|
// Fluid kernels: AddC, SubC
|
||||||
//
|
//
|
||||||
//-------------------------
|
//-------------------------
|
||||||
|
|
||||||
struct add_tag {};
|
struct add_tag {};
|
||||||
struct sub_tag {};
|
struct sub_tag {};
|
||||||
|
struct mul_tag {};
|
||||||
|
|
||||||
CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
|
CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
|
||||||
const v_int32& c2, const v_int32& c3,
|
const v_int32& c2, const v_int32& c3,
|
||||||
@ -909,6 +934,12 @@ CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc
|
|||||||
return a - sc;
|
return a - sc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Multiplication overload of oper(): returns the lane-wise product a * sc.
// The unnamed mul_tag parameter only selects this overload.
CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
{
    const v_float32 product = a * sc;
    return product;
}
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
template<typename oper_tag, typename SRC, typename DST>
|
template<typename oper_tag, typename SRC, typename DST>
|
||||||
CV_ALWAYS_INLINE
|
CV_ALWAYS_INLINE
|
||||||
typename std::enable_if<(std::is_same<DST, ushort>::value ||
|
typename std::enable_if<(std::is_same<DST, ushort>::value ||
|
||||||
@ -957,7 +988,7 @@ CV_ALWAYS_INLINE
|
|||||||
typename std::enable_if<std::is_same<DST, short>::value ||
|
typename std::enable_if<std::is_same<DST, short>::value ||
|
||||||
std::is_same<DST, ushort>::value, void>::type
|
std::is_same<DST, ushort>::value, void>::type
|
||||||
arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
|
arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
|
||||||
const v_float32& s3, const int nlanes)
|
const v_float32& s3, const int nlanes)
|
||||||
{
|
{
|
||||||
v_float32 a1 = vg_load_f32(inx);
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
|
v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
|
||||||
@ -1089,7 +1120,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
|
|||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
#define ADDC_SIMD(SRC, DST) \
|
#define ADDC_SIMD(SRC, DST) \
|
||||||
int addc_simd(const SRC in[], const float scalar[], DST out[], \
|
int addc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
@ -1129,6 +1160,8 @@ ADDC_SIMD(float, float)
|
|||||||
|
|
||||||
#undef ADDC_SIMD
|
#undef ADDC_SIMD
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
#define SUBC_SIMD(SRC, DST) \
|
#define SUBC_SIMD(SRC, DST) \
|
||||||
int subc_simd(const SRC in[], const float scalar[], DST out[], \
|
int subc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
const int length, const int chan) \
|
const int length, const int chan) \
|
||||||
@ -1167,6 +1200,256 @@ SUBC_SIMD(float, float)
|
|||||||
|
|
||||||
#undef SUBC_SIMD
|
#undef SUBC_SIMD
|
||||||
|
|
||||||
|
//-------------------------
|
||||||
|
//
|
||||||
|
// Fluid kernels: MulC
|
||||||
|
//
|
||||||
|
//-------------------------
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE
|
||||||
|
typename std::enable_if<std::is_same<DST, short>::value ||
|
||||||
|
std::is_same<DST, ushort>::value, void>::type
|
||||||
|
mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
|
||||||
|
const v_float32& s3, const v_float32& scale, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
|
||||||
|
v_float32 a3 = vg_load_f32(&inx[nlanes]);
|
||||||
|
v_float32 a4 = vg_load_f32(&inx[3 * nlanes / 2]);
|
||||||
|
v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
|
||||||
|
v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);
|
||||||
|
|
||||||
|
arithmOpScalar_pack_store_c3(outx, v_round(scale*a1*s1),
|
||||||
|
v_round(scale*a2*s2),
|
||||||
|
v_round(scale*a3*s3),
|
||||||
|
v_round(scale*a4*s1),
|
||||||
|
v_round(scale*a5*s2),
|
||||||
|
v_round(scale*a6*s3));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* inx, uchar* outx,
|
||||||
|
const v_float32& s1, const v_float32& s2,
|
||||||
|
const v_float32& s3, const v_float32& scale, const int nlanes)
|
||||||
|
{
|
||||||
|
vx_store(outx,
|
||||||
|
v_pack_u(v_pack(v_round(scale * vg_load_f32(inx)* s1),
|
||||||
|
v_round(scale * vg_load_f32(&inx[nlanes/4])* s2)),
|
||||||
|
v_pack(v_round(scale * vg_load_f32(&inx[nlanes/2])* s3),
|
||||||
|
v_round(scale * vg_load_f32(&inx[3*nlanes/4])* s1))));
|
||||||
|
|
||||||
|
vx_store(&outx[nlanes],
|
||||||
|
v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[nlanes])* s2),
|
||||||
|
v_round(scale * vg_load_f32(&inx[5*nlanes/4])* s3)),
|
||||||
|
v_pack(v_round(scale * vg_load_f32(&inx[3*nlanes/2])* s1),
|
||||||
|
v_round(scale * vg_load_f32(&inx[7*nlanes/4])* s2))));
|
||||||
|
|
||||||
|
vx_store(&outx[2 * nlanes],
|
||||||
|
v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[2*nlanes])* s3),
|
||||||
|
v_round(scale * vg_load_f32(&inx[9*nlanes/4])* s1)),
|
||||||
|
v_pack(v_round(scale * vg_load_f32(&inx[5*nlanes/2])* s2),
|
||||||
|
v_round(scale * vg_load_f32(&inx[11*nlanes/4])* s3))));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* in, float* out,
|
||||||
|
const v_float32& s1, const v_float32& s2,
|
||||||
|
const v_float32& s3, const v_float32& scale, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(in);
|
||||||
|
v_float32 a2 = vg_load_f32(&in[nlanes]);
|
||||||
|
v_float32 a3 = vg_load_f32(&in[2*nlanes]);
|
||||||
|
|
||||||
|
vx_store(out, scale * a1* s1);
|
||||||
|
vx_store(&out[nlanes], scale * a2* s2);
|
||||||
|
vx_store(&out[2*nlanes], scale * a3* s3);
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],
|
||||||
|
const float scalar[], DST out[],
|
||||||
|
const int length, const float _scale)
|
||||||
|
{
|
||||||
|
constexpr int chan = 3;
|
||||||
|
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
|
||||||
|
constexpr int lanes = chan * nlanes;
|
||||||
|
|
||||||
|
if (length < lanes)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
v_float32 scale = vx_setall_f32(_scale);
|
||||||
|
|
||||||
|
v_float32 s1 = vx_load(scalar);
|
||||||
|
#if CV_SIMD_WIDTH == 32
|
||||||
|
v_float32 s2 = vx_load(&scalar[2]);
|
||||||
|
v_float32 s3 = vx_load(&scalar[1]);
|
||||||
|
#else
|
||||||
|
v_float32 s2 = vx_load(&scalar[1]);
|
||||||
|
v_float32 s3 = vx_load(&scalar[2]);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int x = 0;
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
for (; x <= length - lanes; x += lanes)
|
||||||
|
{
|
||||||
|
mulc_scale_simd_c3_impl(&in[x], &out[x], s1, s2, s3, scale, nlanes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x < length)
|
||||||
|
{
|
||||||
|
x = length - lanes;
|
||||||
|
continue; // process unaligned tail
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE
|
||||||
|
typename std::enable_if<(std::is_same<DST, ushort>::value ||
|
||||||
|
std::is_same<DST, short>::value), void>::type
|
||||||
|
mulc_scale_simd_common_impl(const SRC* inx, DST* outx,
|
||||||
|
const v_float32& sc, const v_float32& scale,
|
||||||
|
const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
v_float32 a2 = vg_load_f32(&inx[nlanes/2]);
|
||||||
|
|
||||||
|
v_store_i16(outx, v_round(scale * a1* sc), v_round(scale * a2* sc));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
|
||||||
|
uchar* outx, const v_float32& sc,
|
||||||
|
const v_float32& scale, const int nlanes)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
|
||||||
|
v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
|
||||||
|
v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);
|
||||||
|
|
||||||
|
vx_store(outx, v_pack_u(v_pack(v_round(scale * a1* sc),
|
||||||
|
v_round(scale * a2* sc)),
|
||||||
|
v_pack(v_round(scale * a3* sc),
|
||||||
|
v_round(scale * a4* sc))));
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC>
|
||||||
|
CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
|
||||||
|
float* outx, const v_float32& sc,
|
||||||
|
const v_float32& scale, const int)
|
||||||
|
{
|
||||||
|
v_float32 a1 = vg_load_f32(inx);
|
||||||
|
vx_store(outx, scale * a1* sc);
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template<typename SRC, typename DST>
|
||||||
|
CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[],
|
||||||
|
const float scalar[], DST out[],
|
||||||
|
const int length, const float _scale)
|
||||||
|
{
|
||||||
|
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
|
||||||
|
|
||||||
|
if (length < nlanes)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
v_float32 _scalar = vx_load(scalar);
|
||||||
|
v_float32 scale = vx_setall_f32(_scale);
|
||||||
|
|
||||||
|
int x = 0;
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
for (; x <= length - nlanes; x += nlanes)
|
||||||
|
{
|
||||||
|
mulc_scale_simd_common_impl(&in[x], &out[x], _scalar, scale, nlanes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (x < length)
|
||||||
|
{
|
||||||
|
x = length - nlanes;
|
||||||
|
continue; // process unaligned tail
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define MULC_SIMD(SRC, DST) \
|
||||||
|
int mulc_simd(const SRC in[], const float scalar[], DST out[], \
|
||||||
|
const int length, const int chan, const float scale) \
|
||||||
|
{ \
|
||||||
|
mul_tag op_t; \
|
||||||
|
switch (chan) \
|
||||||
|
{ \
|
||||||
|
case 1: \
|
||||||
|
case 2: \
|
||||||
|
case 4: \
|
||||||
|
{ \
|
||||||
|
if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \
|
||||||
|
{ \
|
||||||
|
return arithmOpScalar_simd_common(op_t, in, scalar, \
|
||||||
|
out, length); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
return mulc_scale_simd_common(in, scalar, out, length, scale); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
case 3: \
|
||||||
|
{ \
|
||||||
|
if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \
|
||||||
|
{ \
|
||||||
|
return arithmOpScalar_simd_c3(op_t, in, scalar, \
|
||||||
|
out, length); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
return mulc_scale_simd_c3(in, scalar, out, length, scale); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
default: \
|
||||||
|
GAPI_Assert(chan <= 4); \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
return 0; \
|
||||||
|
}
|
||||||
|
|
||||||
|
MULC_SIMD(uchar, uchar)
|
||||||
|
MULC_SIMD(ushort, uchar)
|
||||||
|
MULC_SIMD(short, uchar)
|
||||||
|
MULC_SIMD(float, uchar)
|
||||||
|
MULC_SIMD(short, short)
|
||||||
|
MULC_SIMD(ushort, short)
|
||||||
|
MULC_SIMD(uchar, short)
|
||||||
|
MULC_SIMD(float, short)
|
||||||
|
MULC_SIMD(ushort, ushort)
|
||||||
|
MULC_SIMD(uchar, ushort)
|
||||||
|
MULC_SIMD(short, ushort)
|
||||||
|
MULC_SIMD(float, ushort)
|
||||||
|
MULC_SIMD(uchar, float)
|
||||||
|
MULC_SIMD(ushort, float)
|
||||||
|
MULC_SIMD(short, float)
|
||||||
|
MULC_SIMD(float, float)
|
||||||
|
|
||||||
|
#undef MULC_SIMD
|
||||||
|
|
||||||
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||||
|
|
||||||
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
CV_CPU_OPTIMIZATION_NAMESPACE_END
|
||||||
|
Loading…
Reference in New Issue
Block a user