Merge pull request #21797 from anna-khakimova:ak/merge3_extend_supported_types

GAPI Fluid SIMD: Add support for several new types in Merge3

- Added support for several new types.
- Fixes for the Split/Merge and ConvertTo issues.
This commit is contained in:
Anna Khakimova 2023-05-31 14:59:39 +03:00 committed by GitHub
parent fc5d412ba7
commit 6d3dd24622
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 143 additions and 60 deletions

View File

@ -62,7 +62,7 @@ namespace opencv_test
class InRangePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {}; class InRangePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class Split3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {}; class Split3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class Split4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {}; class Split4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class Merge3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {}; class Merge3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class Merge4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {}; class Merge4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class RemapPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {}; class RemapPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class FlipPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {}; class FlipPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};

View File

@ -1577,11 +1577,12 @@ PERF_TEST_P_(Merge3PerfTest, TestPerformance)
{ {
compare_f cmpF; compare_f cmpF;
cv::Size sz; cv::Size sz;
MatType type = -1;
cv::GCompileArgs compile_args; cv::GCompileArgs compile_args;
std::tie(cmpF, sz, compile_args) = GetParam(); std::tie(cmpF, sz, type, compile_args) = GetParam();
initMatsRandU(CV_8UC1, sz, CV_8UC3); initMatsRandU(type, sz, CV_MAKETYPE(type, 3));
cv::Mat in_mat3(sz, CV_8UC1); cv::Mat in_mat3(sz, type);
cv::Scalar mean = cv::Scalar::all(127); cv::Scalar mean = cv::Scalar::all(127);
cv::Scalar stddev = cv::Scalar::all(40.f); cv::Scalar stddev = cv::Scalar::all(40.f);
cv::randn(in_mat3, mean, stddev); cv::randn(in_mat3, mean, stddev);

View File

@ -252,6 +252,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestCPU, Split4PerfTest,
INSTANTIATE_TEST_CASE_P(Merge3PerfTestCPU, Merge3PerfTest, INSTANTIATE_TEST_CASE_P(Merge3PerfTestCPU, Merge3PerfTest,
Combine(Values(AbsExact().to_compare_f()), Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p), Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8U),
Values(cv::compile_args(CORE_CPU)))); Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(Merge4PerfTestCPU, Merge4PerfTest, INSTANTIATE_TEST_CASE_P(Merge4PerfTestCPU, Merge4PerfTest,

View File

@ -253,6 +253,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestFluid, Split4PerfTest,
INSTANTIATE_TEST_CASE_P(Merge3PerfTestFluid, Merge3PerfTest, INSTANTIATE_TEST_CASE_P(Merge3PerfTestFluid, Merge3PerfTest,
Combine(Values(AbsExact().to_compare_f()), Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p), Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8U, CV_16S, CV_16U, CV_32F),
Values(cv::compile_args(CORE_FLUID)))); Values(cv::compile_args(CORE_FLUID))));
INSTANTIATE_TEST_CASE_P(Merge4PerfTestFluid, Merge4PerfTest, INSTANTIATE_TEST_CASE_P(Merge4PerfTestFluid, Merge4PerfTest,

View File

@ -242,6 +242,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestGPU, Split4PerfTest,
INSTANTIATE_TEST_CASE_P(Merge3PerfTestGPU, Merge3PerfTest, INSTANTIATE_TEST_CASE_P(Merge3PerfTestGPU, Merge3PerfTest,
Combine(Values(AbsExact().to_compare_f()), Combine(Values(AbsExact().to_compare_f()),
Values( szSmall128, szVGA, sz720p, sz1080p ), Values( szSmall128, szVGA, sz720p, sz1080p ),
Values(CV_8U),
Values(cv::compile_args(CORE_GPU)))); Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(Merge4PerfTestGPU, Merge4PerfTest, INSTANTIATE_TEST_CASE_P(Merge4PerfTestGPU, Merge4PerfTest,

View File

@ -2320,12 +2320,15 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3) static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3)
{ {
GAPI_Assert((src.meta().depth == CV_8U) && (dst1.meta().depth == CV_8U) &&
(dst2.meta().depth == CV_8U) && (dst3.meta().depth == CV_8U) &&
(3 == src.meta().chan));
const auto *in = src.InLine<uchar>(0); const auto *in = src.InLine<uchar>(0);
auto *out1 = dst1.OutLine<uchar>(); auto *out1 = dst1.OutLine<uchar>();
auto *out2 = dst2.OutLine<uchar>(); auto *out2 = dst2.OutLine<uchar>();
auto *out3 = dst3.OutLine<uchar>(); auto *out3 = dst3.OutLine<uchar>();
GAPI_Assert(3 == src.meta().chan);
int width = src.length(); int width = src.length();
int w = 0; int w = 0;
@ -2348,13 +2351,16 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4) static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
{ {
GAPI_Assert((src.meta().depth == CV_8U) && (dst1.meta().depth == CV_8U) &&
(dst2.meta().depth == CV_8U) && (dst3.meta().depth == CV_8U) &&
(dst4.meta().depth == CV_8U) && (4 == src.meta().chan));
const auto *in = src.InLine<uchar>(0); const auto *in = src.InLine<uchar>(0);
auto *out1 = dst1.OutLine<uchar>(); auto *out1 = dst1.OutLine<uchar>();
auto *out2 = dst2.OutLine<uchar>(); auto *out2 = dst2.OutLine<uchar>();
auto *out3 = dst3.OutLine<uchar>(); auto *out3 = dst3.OutLine<uchar>();
auto *out4 = dst4.OutLine<uchar>(); auto *out4 = dst4.OutLine<uchar>();
GAPI_Assert(4 == src.meta().chan);
int width = src.length(); int width = src.length();
int w = 0; int w = 0;
@ -2372,31 +2378,46 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
} }
}; };
// Merges one line of three separate inputs into a single interleaved output
// line: output element 3*i + k is copied from the k-th input at position i.
// T is the element type used to view both the input lines and the output line.
template<typename T>
CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2, const View& src3)
{
    const auto* plane0 = src1.InLine<T>(0);
    const auto* plane1 = src2.InLine<T>(0);
    const auto* plane2 = src3.InLine<T>(0);
    auto* packed = dst.OutLine<T>();

    const int length = dst.length();

    // Index of the first element that has not been written yet.
    int done = 0;
#if CV_SIMD
    // The vectorized routine reports how many elements it has already handled.
    done = merge3_simd(plane0, plane1, plane2, packed, length);
#endif

    // Scalar path: finishes whatever the SIMD call (when compiled in) left over.
    auto* triple = packed + 3 * done;
    while (done < length)
    {
        triple[0] = plane0[done];
        triple[1] = plane1[done];
        triple[2] = plane2[done];
        ++done;
        triple += 3;
    }
}
GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false) GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false)
{ {
static const int Window = 1; static const int Window = 1;
static void run(const View &src1, const View &src2, const View &src3, Buffer &dst) static void run(const View& src1, const View& src2, const View& src3, Buffer& dst)
{ {
const auto *in1 = src1.InLine<uchar>(0); GAPI_Assert((src1.meta().depth == dst.meta().depth) &&
const auto *in2 = src2.InLine<uchar>(0); (src1.meta().depth == src2.meta().depth) &&
const auto *in3 = src3.InLine<uchar>(0); (src1.meta().depth == src3.meta().depth));
auto *out = dst.OutLine<uchar>();
GAPI_Assert(3 == dst.meta().chan); // SRC/DST TYPE OP __VA_ARGS__
int width = dst.length(); MERGE3_(uchar, run_merge3, dst, src1, src2, src3);
int w = 0; MERGE3_(ushort, run_merge3, dst, src1, src2, src3);
MERGE3_(short, run_merge3, dst, src1, src2, src3);
MERGE3_(float, run_merge3, dst, src1, src2, src3);
#if CV_SIMD CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
w = merge3_simd(in1, in2, in3, out, width);
#endif
for (; w < width; w++)
{
out[3*w ] = in1[w];
out[3*w + 1] = in2[w];
out[3*w + 2] = in3[w];
}
} }
}; };
@ -2407,13 +2428,16 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
static void run(const View &src1, const View &src2, const View &src3, const View &src4, static void run(const View &src1, const View &src2, const View &src3, const View &src4,
Buffer &dst) Buffer &dst)
{ {
GAPI_Assert((dst.meta().depth == CV_8U) && (src1.meta().depth == CV_8U) &&
(src2.meta().depth == CV_8U) && (src3.meta().depth == CV_8U) &&
(4 == dst.meta().chan));
const auto *in1 = src1.InLine<uchar>(0); const auto *in1 = src1.InLine<uchar>(0);
const auto *in2 = src2.InLine<uchar>(0); const auto *in2 = src2.InLine<uchar>(0);
const auto *in3 = src3.InLine<uchar>(0); const auto *in3 = src3.InLine<uchar>(0);
const auto *in4 = src4.InLine<uchar>(0); const auto *in4 = src4.InLine<uchar>(0);
auto *out = dst.OutLine<uchar>(); auto *out = dst.OutLine<uchar>();
GAPI_Assert(4 == dst.meta().chan);
int width = dst.length(); int width = dst.length();
int w = 0; // cycle counter int w = 0; // cycle counter

View File

@ -277,13 +277,21 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
CV_CPU_DISPATCH_MODES_ALL); CV_CPU_DISPATCH_MODES_ALL);
} }
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], #define MERGE3_SIMD(T) \
uchar out[], const int width) int merge3_simd(const T in1[], const T in2[], const T in3[], \
{ T out[], const int width) \
CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width), { \
CV_CPU_DISPATCH_MODES_ALL); CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width), \
CV_CPU_DISPATCH_MODES_ALL); \
} }
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width) const uchar in4[], uchar out[], const int width)
{ {

View File

@ -216,8 +216,16 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
int split4_simd(const uchar in[], uchar out1[], uchar out2[], int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width); uchar out3[], uchar out4[], const int width);
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], #define MERGE3_SIMD(T) \
uchar out[], const int width); int merge3_simd(const T in1[], const T in2[], const T in3[], \
T out[], const int width);
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width); const uchar in4[], uchar out[], const int width);

View File

@ -322,12 +322,21 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
int split4_simd(const uchar in[], uchar out1[], uchar out2[], int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width); uchar out3[], uchar out4[], const int width);
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], #define MERGE3_SIMD(T) \
uchar out[], const int width); int merge3_simd(const T in1[], const T in2[], const T in3[], \
T out[], const int width);
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width); const uchar in4[], uchar out[], const int width);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#define SRC_SHORT_OR_USHORT std::is_same<SRC, short>::value || std::is_same<SRC, ushort>::value #define SRC_SHORT_OR_USHORT std::is_same<SRC, short>::value || std::is_same<SRC, ushort>::value
@ -2530,34 +2539,42 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
// //
//------------------------- //-------------------------
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], #define MERGE3_SIMD(T) \
uchar out[], const int width) int merge3_simd(const T in1[], const T in2[], const T in3[], \
{ T out[], const int width) \
constexpr int nlanes = v_uint8::nlanes; { \
if (width < nlanes) constexpr int nlanes = vector_type_of_t<T>::nlanes; \
return 0; if (width < nlanes) \
return 0; \
int x = 0; \
for (;;) int x = 0; \
{ for (;;) \
for (; x <= width - nlanes; x += nlanes) { \
{ for (; x <= width - nlanes; x += nlanes) \
v_uint8 a, b, c; { \
a = vx_load(&in1[x]); vector_type_of_t<T> a, b, c; \
b = vx_load(&in2[x]); a = vx_load(&in1[x]); \
c = vx_load(&in3[x]); b = vx_load(&in2[x]); \
v_store_interleave(&out[3 * x], a, b, c); c = vx_load(&in3[x]); \
} v_store_interleave(&out[3 * x], a, b, c); \
if (x < width) } \
{ if (x < width) \
x = width - nlanes; { \
continue; x = width - nlanes; \
} continue; \
break; } \
} break; \
return x; } \
return x; \
} }
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
//------------------------- //-------------------------
// //
// Fluid kernels: Merge4 // Fluid kernels: Merge4
@ -2926,6 +2943,8 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
int convertto_simd(const SRC in[], DST out[], const int length) \ int convertto_simd(const SRC in[], DST out[], const int length) \
{ \ { \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \ constexpr int nlanes = vector_type_of_t<DST>::nlanes; \
if (length < nlanes) \
return 0; \
\ \
int x = 0; \ int x = 0; \
for (;;) \ for (;;) \
@ -3093,6 +3112,9 @@ int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \
const float beta, const int length) \ const float beta, const int length) \
{ \ { \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \ constexpr int nlanes = vector_type_of_t<DST>::nlanes; \
if (length < nlanes) \
return 0; \
\
v_float32 v_alpha = vx_setall_f32(alpha); \ v_float32 v_alpha = vx_setall_f32(alpha); \
v_float32 v_beta = vx_setall_f32(beta); \ v_float32 v_beta = vx_setall_f32(beta); \
\ \

View File

@ -86,6 +86,23 @@ using cv::gapi::own::rintd;
return; \ return; \
} }
// MERGE3_(T, OP, ...): one step of a type-dispatch chain for 3-channel merge.
//
// If the depth associated with T (cv::DataType<T>::depth) equals the depth of
// both `dst` and `src1`, the macro runs OP<T>(__VA_ARGS__) and then returns
// from the enclosing function; otherwise it expands to a no-op so that the
// next MERGE3_ line (or a trailing error) can be tried.
//
// NOTE: the macro is not hygienic - it refers to `dst`, `src1`, `src2` and
// `src3` by name, so those identifiers must exist in the calling scope no
// matter what is passed through __VA_ARGS__. Because it executes a bare
// `return;`, it is only usable inside a function returning void.
//
// Only src1's depth is compared here; src2/src3 depths are not checked by the
// macro itself (presumably the caller asserts that all inputs share one depth
// - confirm at the call site). Length and channel-count checks use
// GAPI_DbgAssert.
#define MERGE3_(T, OP, ...) \
if (cv::DataType<T>::depth == dst.meta().depth && \
cv::DataType<T>::depth == src1.meta().depth) \
{ \
GAPI_DbgAssert(dst.length() == src1.length()); \
GAPI_DbgAssert(dst.length() == src2.length()); \
GAPI_DbgAssert(dst.length() == src3.length()); \
\
GAPI_DbgAssert(1 == src1.meta().chan); \
GAPI_DbgAssert(1 == src2.meta().chan); \
GAPI_DbgAssert(1 == src3.meta().chan); \
GAPI_DbgAssert(3 == dst.meta().chan); \
\
OP<T>(__VA_ARGS__); \
return; \
}
} // namespace fluid } // namespace fluid
} // namespace gapi } // namespace gapi
} // namespace cv } // namespace cv