Merge pull request #21024 from anna-khakimova:ak/simd_mul

This commit is contained in:
Alexander Alekhin 2021-11-17 18:41:52 +00:00
commit 4b6047e746
9 changed files with 457 additions and 26 deletions

View File

@ -32,7 +32,7 @@ namespace opencv_test
class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};

View File

@ -208,19 +208,23 @@ PERF_TEST_P_(SubRCPerfTest, TestPerformance)
PERF_TEST_P_(MulPerfTest, TestPerformance)
{
Size sz = get<0>(GetParam());
MatType type = get<1>(GetParam());
int dtype = get<2>(GetParam());
cv::GCompileArgs compile_args = get<3>(GetParam());
compare_f cmpF;
cv::Size sz;
MatType type = -1;
int dtype = -1;
double scale = 1.0;
cv::GCompileArgs compile_args;
std::tie(cmpF, sz, type, dtype, scale, compile_args) = GetParam();
initMatsRandU(type, sz, dtype, false);
// OpenCV code ///////////////////////////////////////////////////////////
cv::multiply(in_mat1, in_mat2, out_mat_ocv, 1.0, dtype);
cv::multiply(in_mat1, in_mat2, out_mat_ocv, scale, dtype);
// G-API code ////////////////////////////////////////////////////////////
cv::GMat in1, in2, out;
out = cv::gapi::mul(in1, in2, 1.0, dtype);
out = cv::gapi::mul(in1, in2, scale, dtype);
cv::GComputation c(GIn(in1, in2), GOut(out));
// Warm-up graph engine:
@ -234,8 +238,9 @@ PERF_TEST_P_(MulPerfTest, TestPerformance)
}
// Comparison ////////////////////////////////////////////////////////////
// FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
EXPECT_EQ(out_mat_gapi.size(), sz);
{
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
}
SANITY_CHECK_NOTHING();
}

View File

@ -46,9 +46,11 @@ INSTANTIATE_TEST_CASE_P(SubRCPerfTestCPU, SubRCPerfTest,
Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(MulPerfTestCPU, MulPerfTest,
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(-1, CV_8U, CV_16U, CV_32F),
Values(2.0),
Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,

View File

@ -42,11 +42,13 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
// Values(-1, CV_8U, CV_16U, CV_32F),
// Values(cv::compile_args(CORE_FLUID))));
// INSTANTIATE_TEST_CASE_P(MulPerfTestFluid, MulPerfTest,
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
// Values(-1, CV_8U, CV_16U, CV_32F),
// Values(cv::compile_args(CORE_FLUID))));
INSTANTIATE_TEST_CASE_P(MulPerfTestFluid, MulPerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
Values(2.0),
Values(cv::compile_args(CORE_FLUID))));
// INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),

View File

@ -44,9 +44,11 @@ INSTANTIATE_TEST_CASE_P(SubRCPerfTestGPU, SubRCPerfTest,
Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(MulPerfTestGPU, MulPerfTest,
Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
Combine(Values(AbsExact().to_compare_f()),
Values( szSmall128, szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
Values( -1, CV_8U, CV_16U, CV_32F ),
Values(2.0),
Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,

View File

@ -684,9 +684,14 @@ static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm a
break;
}
case ARITHM_MULTIPLY:
{
#if CV_SIMD
x = mul_simd(in1, in2, out, length, scale);
#endif
for (; x < length; ++x)
out[x] = mul<DST>(in1[x], in2[x], _scale);
break;
}
case ARITHM_DIVIDE:
{
#if CV_SIMD
@ -745,13 +750,22 @@ GAPI_FLUID_KERNEL(GFluidMul, cv::gapi::core::GMul, false)
static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
{
// DST SRC1 SRC2 OP __VA_ARGS__
BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(uchar, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(uchar, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(uchar, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(uchar, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(short, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(short, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(short, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(short, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(ushort, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(ushort, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(ushort, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(float, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(float, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(float, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
BINARY_(float, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}

View File

@ -56,6 +56,35 @@ DIV_SIMD(float, float)
#undef DIV_SIMD
// Generates one mul_simd() overload per (SRC, DST) pair listed below.
// Each overload only forwards its arguments to CV_CPU_DISPATCH with
// CV_CPU_DISPATCH_MODES_ALL; presumably that macro selects and returns the
// result of a CPU-specific mul_simd build (confirm against the dispatch headers).
// The macro is undefined again once the list has been expanded.
#define MUL_SIMD(SRC, DST) \
int mul_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale) \
{ \
CV_CPU_DISPATCH(mul_simd, (in1, in2, out, length, _scale), \
CV_CPU_DISPATCH_MODES_ALL); \
}
// MUL_SIMD(source element type, destination element type)
MUL_SIMD(uchar, uchar)
MUL_SIMD(ushort, uchar)
MUL_SIMD(short, uchar)
MUL_SIMD(float, uchar)
MUL_SIMD(short, short)
MUL_SIMD(ushort, short)
MUL_SIMD(uchar, short)
MUL_SIMD(float, short)
MUL_SIMD(ushort, ushort)
MUL_SIMD(uchar, ushort)
MUL_SIMD(short, ushort)
MUL_SIMD(float, ushort)
MUL_SIMD(uchar, float)
MUL_SIMD(ushort, float)
MUL_SIMD(short, float)
MUL_SIMD(float, float)
#undef MUL_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -37,6 +37,29 @@ DIV_SIMD(float, float)
#undef DIV_SIMD
// Declares one mul_simd() overload per (SRC, DST) pair listed below.
// Both inputs share the SRC element type; the output has the DST element type.
// The function returns an int; the fluid kernel assigns it to its loop index and
// continues element-wise from there.
#define MUL_SIMD(SRC, DST) \
int mul_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale);
// MUL_SIMD(source element type, destination element type)
MUL_SIMD(uchar, uchar)
MUL_SIMD(ushort, uchar)
MUL_SIMD(short, uchar)
MUL_SIMD(float, uchar)
MUL_SIMD(short, short)
MUL_SIMD(ushort, short)
MUL_SIMD(uchar, short)
MUL_SIMD(float, short)
MUL_SIMD(ushort, ushort)
MUL_SIMD(uchar, ushort)
MUL_SIMD(short, ushort)
MUL_SIMD(float, ushort)
MUL_SIMD(uchar, float)
MUL_SIMD(ushort, float)
MUL_SIMD(short, float)
MUL_SIMD(float, float)
#undef MUL_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -58,6 +58,29 @@ DIV_SIMD(float, float)
#undef DIV_SIMD
// Forward declarations of mul_simd() for every supported (SRC, DST) pair.
// These come before the CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY guard, so they are
// visible even when only declarations are requested; the matching definitions
// are generated by a second MUL_SIMD macro further down in this file.
#define MUL_SIMD(SRC, DST) \
int mul_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale);
// MUL_SIMD(source element type, destination element type)
MUL_SIMD(uchar, uchar)
MUL_SIMD(ushort, uchar)
MUL_SIMD(short, uchar)
MUL_SIMD(float, uchar)
MUL_SIMD(short, short)
MUL_SIMD(ushort, short)
MUL_SIMD(uchar, short)
MUL_SIMD(float, short)
MUL_SIMD(ushort, ushort)
MUL_SIMD(uchar, ushort)
MUL_SIMD(short, ushort)
MUL_SIMD(float, ushort)
MUL_SIMD(uchar, float)
MUL_SIMD(ushort, float)
MUL_SIMD(short, float)
MUL_SIMD(float, float)
#undef MUL_SIMD
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
struct scale_tag {};
@ -93,6 +116,16 @@ CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
}
// Scaled product of two float vectors: evaluates (scale * a) * b lane by lane.
// Chosen by overload resolution when the caller passes a scale_tag.
CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale)
{
    // Keep the original association: the scale is applied to `a` first.
    const v_float32 scaled_a = scale * a;
    return scaled_a * b;
}
// Plain lane-wise product of two float vectors.
// Chosen by overload resolution when the caller passes a not_scale_tag; the
// fourth argument exists only to keep both overloads call-compatible and is ignored.
CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&)
{
    const v_float32 product = a * b;
    return product;
}
CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
{
return (a*scale/div);
@ -103,12 +136,12 @@ CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_flo
return a / div;
}
CV_ALWAYS_INLINE void v_store_div(short* dst, v_int32& res1, v_int32& res2)
CV_ALWAYS_INLINE void v_store_i16(short* dst, v_int32& res1, v_int32& res2)
{
vx_store(dst, v_pack(res1, res2));
}
CV_ALWAYS_INLINE void v_store_div(ushort* dst, v_int32& res1, v_int32& res2)
CV_ALWAYS_INLINE void v_store_i16(ushort* dst, v_int32& res1, v_int32& res2)
{
vx_store(dst, v_pack_u(res1, res2));
}
@ -360,7 +393,7 @@ div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const in
v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
v_store_div(&out[x], res1, res2);
v_store_i16(&out[x], res1, res2);
}
if (x < length)
@ -467,6 +500,327 @@ DIV_SIMD(float, float)
#undef DIV_SIMD
//-------------------------
//
// Fluid kernels: Multiply
//
//-------------------------
// SIMD multiply of two rows of 16-bit values (short or ushort) into a row of
// 16-bit values (short or ushort).
//
//   t        - tag selecting the mul_op overload (scale_tag multiplies by `scale`,
//              not_scale_tag ignores it).
//   in1, in2 - input rows, `length` elements each.
//   out      - output row, `length` elements.
//   length   - number of elements in the row.
//   _scale   - multiplier; narrowed to float before it is broadcast.
//
// Returns 0 (and writes nothing) when the row is shorter than one vector of DST,
// so the caller can fall back to its scalar loop; otherwise the whole row is
// processed and `length` is returned.
//
// The inputs are widened to float with vg_load_f32(), the same loader the other
// multiply kernels in this file use for short/ushort sources. The previous code
// reinterpreted every loaded vector as v_int16 before v_expand_low/high, which
// sign-extended the lanes: ushort inputs above 32767 were multiplied as negative
// numbers.
template<typename scale_tag_t, typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
                        (std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
        return 0;

    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            // One 16-bit vector holds two float vectors worth of elements.
            v_float32 a1 = vg_load_f32(&in1[x]);
            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);

            v_float32 b1 = vg_load_f32(&in2[x]);
            v_float32 b2 = vg_load_f32(&in2[x + nlanes / 2]);

            v_int32 r1 = v_round(mul_op(t, a1, b1, scale));
            v_int32 r2 = v_round(mul_op(t, a2, b2, scale));

            // v_store_i16 packs both halves into one 16-bit vector for the DST type.
            v_store_i16(&out[x], r1, r2);
        }

        if (x < length)
        {
            // Re-run one vector that ends exactly at `length`; it overlaps
            // elements that were already written.
            x = length - nlanes;
            continue;  // process one more time (unaligned tail)
        }
        break;
    }
    return x;
}
//-------------------------------------------------------------------------------------------------
// SIMD multiply of two rows of 16-bit values (short or ushort) into a row of uchar.
//
//   t        - tag selecting the mul_op overload (scale_tag multiplies by `scale`,
//              not_scale_tag ignores it).
//   in1, in2 - input rows, `length` elements each.
//   out      - output row, `length` elements.
//   length   - number of elements in the row.
//   _scale   - multiplier; narrowed to float before it is broadcast.
//
// Returns 0 (and writes nothing) when the row is shorter than one v_uint8 vector,
// so the caller can fall back to its scalar loop; otherwise the whole row is
// processed and `length` is returned.
//
// The inputs are widened to float with vg_load_f32(), the same loader the other
// multiply kernels in this file use for short/ushort sources. The previous code
// reinterpreted every loaded vector as v_int16 before v_expand_low/high, which
// sign-extended the lanes: ushort inputs above 32767 were multiplied as negative
// numbers.
template<typename scale_tag_t, typename SRC>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<SRC, short>::value ||
                        std::is_same<SRC, ushort>::value, int>::type
mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
{
    constexpr int nlanes = v_uint8::nlanes;

    if (length < nlanes)
        return 0;

    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            // One uchar vector holds four float vectors worth of elements.
            v_float32 fa1 = vg_load_f32(&in1[x]);
            v_float32 fa2 = vg_load_f32(&in1[x + nlanes / 4]);
            v_float32 fa3 = vg_load_f32(&in1[x + nlanes / 2]);
            v_float32 fa4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);

            v_float32 fb1 = vg_load_f32(&in2[x]);
            v_float32 fb2 = vg_load_f32(&in2[x + nlanes / 4]);
            v_float32 fb3 = vg_load_f32(&in2[x + nlanes / 2]);
            v_float32 fb4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);

            v_int32 sum1 = v_round(mul_op(t, fa1, fb1, scale)),
                    sum2 = v_round(mul_op(t, fa2, fb2, scale)),
                    sum3 = v_round(mul_op(t, fa3, fb3, scale)),
                    sum4 = v_round(mul_op(t, fa4, fb4, scale));

            // Narrow 32 -> 16 -> 8 bits; the final pack targets unsigned bytes.
            v_int16 res1 = v_pack(sum1, sum2);
            v_int16 res2 = v_pack(sum3, sum4);

            vx_store(&out[x], v_pack_u(res1, res2));
        }

        if (x < length)
        {
            // Re-run one vector that ends exactly at `length`; it overlaps
            // elements that were already written.
            x = length - nlanes;
            continue;  // process one more time (unaligned tail)
        }
        break;
    }
    return x;
}
//-------------------------------------------------------------------------------------------------
// SIMD multiply of two float rows into a uchar row.
//
// `t` selects the mul_op overload (with or without the scale factor), `_scale`
// is narrowed to float and broadcast. Returns 0 without writing anything when
// `length` is smaller than one v_uint8 vector; otherwise processes the whole row
// and returns `length`.
template<typename scale_tag_t>
CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
                             const int length, double _scale)
{
    constexpr int nlanes = v_uint8::nlanes;
    // Offsets of the four float vectors that fill one uchar vector.
    constexpr int off1 = nlanes / 4;
    constexpr int off2 = nlanes / 2;
    constexpr int off3 = 3 * nlanes / 4;

    if (length < nlanes)
        return 0;

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            const float* lhs = &in1[pos];
            const float* rhs = &in2[pos];

            v_int32 q0 = v_round(mul_op(t, vg_load_f32(lhs),        vg_load_f32(rhs),        vscale));
            v_int32 q1 = v_round(mul_op(t, vg_load_f32(lhs + off1), vg_load_f32(rhs + off1), vscale));
            v_int32 q2 = v_round(mul_op(t, vg_load_f32(lhs + off2), vg_load_f32(rhs + off2), vscale));
            v_int32 q3 = v_round(mul_op(t, vg_load_f32(lhs + off3), vg_load_f32(rhs + off3), vscale));

            // Narrow 32 -> 16 -> 8 bits and store one full uchar vector.
            vx_store(&out[pos], v_pack_u(v_pack(q0, q1), v_pack(q2, q3)));

            pos += nlanes;
        }

        if (pos >= length)
            break;

        // Unaligned tail: redo one vector that ends exactly at `length`.
        pos = length - nlanes;
    }
    return pos;
}
// SIMD multiply of two uchar rows into a 16-bit row (short or ushort).
//
// `t` selects the mul_op overload (with or without the scale factor), `_scale`
// is narrowed to float and broadcast. Returns 0 without writing anything when
// `length` is smaller than one vector of DST; otherwise processes the whole row
// and returns `length`.
template<typename scale_tag_t, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, int>::type
mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
        return 0;

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            // Widen the bytes to 16-bit lanes, then split into two float vectors.
            const v_int16 lhs = v_reinterpret_as_s16(vx_load_expand(&in1[pos]));
            const v_int16 rhs = v_reinterpret_as_s16(vx_load_expand(&in2[pos]));

            const v_float32 lhs_lo = v_cvt_f32(v_expand_low(lhs));
            const v_float32 lhs_hi = v_cvt_f32(v_expand_high(lhs));
            const v_float32 rhs_lo = v_cvt_f32(v_expand_low(rhs));
            const v_float32 rhs_hi = v_cvt_f32(v_expand_high(rhs));

            v_int32 lo = v_round(mul_op(t, lhs_lo, rhs_lo, vscale));
            v_int32 hi = v_round(mul_op(t, lhs_hi, rhs_hi, vscale));

            v_store_i16(&out[pos], lo, hi);

            pos += nlanes;
        }

        if (pos >= length)
            break;

        // Unaligned tail: redo one vector that ends exactly at `length`.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// SIMD multiply of two float rows into a 16-bit row (short or ushort).
//
// `t` selects the mul_op overload (with or without the scale factor), `_scale`
// is narrowed to float and broadcast. Returns 0 without writing anything when
// `length` is smaller than one vector of DST; otherwise processes the whole row
// and returns `length`.
template<typename scale_tag_t, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, int>::type
mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
    // Two float vectors fill one 16-bit vector.
    constexpr int half = nlanes / 2;

    if (length < nlanes)
        return 0;

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            v_int32 lo = v_round(mul_op(t, vg_load_f32(&in1[pos]),
                                           vg_load_f32(&in2[pos]), vscale));
            v_int32 hi = v_round(mul_op(t, vg_load_f32(&in1[pos + half]),
                                           vg_load_f32(&in2[pos + half]), vscale));

            v_store_i16(&out[pos], lo, hi);

            pos += nlanes;
        }

        if (pos >= length)
            break;

        // Unaligned tail: redo one vector that ends exactly at `length`.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// SIMD multiply of two rows of SRC elements into a float row.
//
// Each input is loaded as a float vector through vg_load_f32(), multiplied via
// the mul_op overload chosen by `t`, and stored without rounding. Returns 0
// without writing anything when `length` is smaller than one v_float32 vector;
// otherwise processes the whole row and returns `length`.
template<typename scale_tag_t, typename SRC>
CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
                             const int length, double _scale)
{
    constexpr int nlanes = v_float32::nlanes;

    if (length < nlanes)
        return 0;

    const v_float32 vscale = vx_setall_f32(static_cast<float>(_scale));

    int pos = 0;
    while (true)
    {
        while (pos <= length - nlanes)
        {
            const v_float32 lhs = vg_load_f32(&in1[pos]);
            const v_float32 rhs = vg_load_f32(&in2[pos]);

            vx_store(&out[pos], mul_op(t, lhs, rhs, vscale));

            pos += nlanes;
        }

        if (pos >= length)
            break;

        // Unaligned tail: redo one vector that ends exactly at `length`.
        pos = length - nlanes;
    }
    return pos;
}
//-------------------------------------------------------------------------------------------------
// uchar x uchar -> uchar multiply: handed over entirely to hal::mul8u.
//
// The row is passed as a single-row image of width `length`, with `length` also
// used as the step of both inputs and the output. The scale tag is not used;
// the address of `scale` is forwarded as the last argument. Always reports the
// full row as processed by returning `length`.
template<typename scale_tag_t>
CV_ALWAYS_INLINE int mul_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
                             const int length, double scale)
{
    const size_t row_step = static_cast<size_t>(length);
    const int    width    = length;
    const int    height   = 1;

    hal::mul8u(in1, row_step,
               in2, row_step,
               out, row_step,
               width, height, &scale);

    return length;
}
#define MUL_SIMD(SRC, DST) \
int mul_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale) \
{ \
int x = 0; \
float fscale = static_cast<float>(_scale); \
if (std::fabs(fscale - 1.0f) <= FLT_EPSILON) \
{ \
not_scale_tag t; \
x = mul_hal(t, in1, in2, out, length, _scale); \
} \
else \
{ \
scale_tag t; \
x = mul_hal(t, in1, in2, out, length, _scale); \
} \
return x; \
}
MUL_SIMD(uchar, uchar)
MUL_SIMD(ushort, uchar)
MUL_SIMD(short, uchar)
MUL_SIMD(float, uchar)
MUL_SIMD(short, short)
MUL_SIMD(ushort, short)
MUL_SIMD(uchar, short)
MUL_SIMD(float, short)
MUL_SIMD(ushort, ushort)
MUL_SIMD(uchar, ushort)
MUL_SIMD(short, ushort)
MUL_SIMD(float, ushort)
MUL_SIMD(uchar, float)
MUL_SIMD(ushort, float)
MUL_SIMD(short, float)
MUL_SIMD(float, float)
#undef MUL_SIMD
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END