mirror of
https://github.com/opencv/opencv.git
synced 2025-07-24 22:16:27 +08:00
Merge pull request #13174 from elatkin:el/gapi_perf_rgb2yuv
GAPI (fluid): RGB to YUV optimization (#13174) * GAPI (fluid): RGB to YUV: activate performance tests * GAPI (fluid): speedup 4-8x RGB-to-YUV, 2.5x YUV to RGB with int16 arithmetic * GAPI (fluid): RGB <--> YUV: fixed compiler warning * GAPI (fluid): RGB <--> YUV: additional speedup 2-3x times (10-15x over original) via manual CV_SIMD * GAPI (fluid): RGB <--> YUV: dynamic CV_SIMD dispatching
This commit is contained in:
parent
59e2ca16d9
commit
6757c2c5a6
@ -678,7 +678,7 @@ PERF_TEST_P_(RGB2YUVPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -716,7 +716,7 @@ PERF_TEST_P_(YUV2RGBPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
// Comparison //////////////////////////////////////////////////////////////
|
||||
@ -864,7 +864,7 @@ PERF_TEST_P_(BGR2YUVPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
|
||||
@ -894,7 +894,7 @@ PERF_TEST_P_(YUV2BGRPerfTest, TestPerformance)
|
||||
|
||||
TEST_CYCLE()
|
||||
{
|
||||
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
|
||||
c.apply(in_mat1, out_mat_gapi);
|
||||
}
|
||||
|
||||
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
|
||||
|
@ -43,4 +43,24 @@ namespace opencv_test
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
|
||||
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
|
||||
Values(szVGA, sz720p, sz1080p),
|
||||
Values(cv::compile_args(IMGPROC_FLUID))));
|
||||
|
||||
}
|
||||
|
@ -126,19 +126,7 @@ static void run_rgb2yuv(Buffer &dst, const View &src, const float coef[5])
|
||||
|
||||
int width = dst.length();
|
||||
|
||||
// TODO: Vectorize for SIMD
|
||||
for (int w=0; w < width; w++)
|
||||
{
|
||||
uchar r = in[3*w ];
|
||||
uchar g = in[3*w + 1];
|
||||
uchar b = in[3*w + 2];
|
||||
float y = coef[0]*r + coef[1]*g + coef[2]*b;
|
||||
float u = coef[3]*(b - y) + 128;
|
||||
float v = coef[4]*(r - y) + 128;
|
||||
out[3*w ] = saturate<uchar>(y, roundf);
|
||||
out[3*w + 1] = saturate<uchar>(u, roundf);
|
||||
out[3*w + 2] = saturate<uchar>(v, roundf);
|
||||
}
|
||||
run_rgb2yuv_impl(out, in, width, coef);
|
||||
}
|
||||
|
||||
static void run_yuv2rgb(Buffer &dst, const View &src, const float coef[4])
|
||||
@ -154,19 +142,7 @@ static void run_yuv2rgb(Buffer &dst, const View &src, const float coef[4])
|
||||
|
||||
int width = dst.length();
|
||||
|
||||
// TODO: Vectorize for SIMD
|
||||
for (int w=0; w < width; w++)
|
||||
{
|
||||
uchar y = in[3*w ];
|
||||
int u = in[3*w + 1] - 128;
|
||||
int v = in[3*w + 2] - 128;
|
||||
float r = y + coef[0]*v;
|
||||
float g = y + coef[1]*u + coef[2]*v;
|
||||
float b = y + coef[3]*u;
|
||||
out[3*w ] = saturate<uchar>(r, roundf);
|
||||
out[3*w + 1] = saturate<uchar>(g, roundf);
|
||||
out[3*w + 2] = saturate<uchar>(b, roundf);
|
||||
}
|
||||
run_yuv2rgb_impl(out, in, width, coef);
|
||||
}
|
||||
|
||||
GAPI_FLUID_KERNEL(GFluidRGB2YUV, cv::gapi::imgproc::GRGB2YUV, false)
|
||||
|
@ -41,6 +41,22 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
|
||||
CV_CPU_DISPATCH_MODES_ALL);
|
||||
}
|
||||
|
||||
//--------------------------------------
|
||||
//
|
||||
// Fluid kernels: RGB-to-YUV, YUV-to-RGB
|
||||
//
|
||||
//--------------------------------------
|
||||
|
||||
// Dispatching entry point for the RGB-to-YUV row kernel.
//
// Hands its arguments unchanged to CV_CPU_DISPATCH, which forwards the call to
// one of the run_rgb2yuv_impl variants built for the modes listed in
// CV_CPU_DISPATCH_MODES_ALL.
void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
{
    CV_CPU_DISPATCH(run_rgb2yuv_impl,
                    (out, in, width, coef),
                    CV_CPU_DISPATCH_MODES_ALL);
}
|
||||
|
||||
// Dispatching entry point for the YUV-to-RGB row kernel.
//
// Hands its arguments unchanged to CV_CPU_DISPATCH, which forwards the call to
// one of the run_yuv2rgb_impl variants built for the modes listed in
// CV_CPU_DISPATCH_MODES_ALL.
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
{
    CV_CPU_DISPATCH(run_yuv2rgb_impl,
                    (out, in, width, coef),
                    CV_CPU_DISPATCH_MODES_ALL);
}
|
||||
|
||||
//---------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
|
@ -23,6 +23,16 @@ namespace fluid {
|
||||
void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
|
||||
float coef_r, float coef_g, float coef_b);
|
||||
|
||||
//--------------------------------------
|
||||
//
|
||||
// Fluid kernels: RGB-to-YUV, YUV-to-RGB
|
||||
//
|
||||
//--------------------------------------
|
||||
|
||||
// Converts `width` interleaved 3-channel pixels from `in` (R, G, B) to `out` (Y, U, V).
// coef[0..2] weight R, G, B when forming Y; coef[3] and coef[4] scale (B - Y) and (R - Y)
// to form U and V, both of which are then offset by 128.
void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
|
||||
|
||||
// Converts `width` interleaved 3-channel pixels from `in` (Y, U, V) to `out` (R, G, B).
// With U and V first reduced by 128: R = Y + coef[0]*V, G = Y + coef[1]*U + coef[2]*V,
// B = Y + coef[3]*U.
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
|
||||
|
||||
//---------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
|
@ -21,6 +21,8 @@
|
||||
# pragma GCC diagnostic ignored "-Wstrict-overflow"
|
||||
#endif
|
||||
|
||||
using cv::gapi::own::saturate;
|
||||
|
||||
namespace cv {
|
||||
namespace gapi {
|
||||
namespace fluid {
|
||||
@ -36,6 +38,16 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
|
||||
void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
|
||||
float coef_r, float coef_g, float coef_b);
|
||||
|
||||
//--------------------------------------
|
||||
//
|
||||
// Fluid kernels: RGB-to-YUV, YUV-to-RGB
|
||||
//
|
||||
//--------------------------------------
|
||||
|
||||
// Forward declaration of the RGB-to-YUV row kernel defined further down in this file.
// Reads `width` interleaved R, G, B triplets from `in` and writes Y, U, V triplets to `out`;
// `coef` holds the five conversion coefficients.
void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
|
||||
|
||||
// Forward declaration of the YUV-to-RGB row kernel defined further down in this file.
// Reads `width` interleaved Y, U, V triplets from `in` and writes R, G, B triplets to `out`;
// `coef` holds the four conversion coefficients.
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
|
||||
|
||||
//---------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
@ -142,6 +154,161 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
|
||||
}
|
||||
}
|
||||
|
||||
//--------------------------------------
|
||||
//
|
||||
// Fluid kernels: RGB-to-YUV, YUV-to-RGB
|
||||
//
|
||||
//--------------------------------------
|
||||
|
||||
// RGB-to-YUV conversion of one row, done entirely in 16-bit fixed point.
//
//   in    - `width` interleaved pixels, 3 bytes each (R, G, B)
//   out   - `width` interleaved pixels, 3 bytes each (Y, U, V)
//   coef  - coef[0..2]: weights of R, G, B in Y;
//           coef[3], coef[4]: scale of (B - Y) and (R - Y) for U and V
//
// Computes  Y = c0*R + c1*G + c2*B,  U = c3*(B - Y) + 128,  V = c4*(R - Y) + 128.
//
// Fixed-point layout: the colour samples carry 7 fractional bits, the luma
// weights 16 and the chroma scales 12. Taking the high 16 bits of each product
// therefore leaves Y with 7 and U, V with 3 fractional bits.
//
// NOTE: the vector loop truncates every product separately (v_mul_hi), while
// the scalar tail sums the full products before shifting, so the two paths may
// differ in the lowest fractional bits.
void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
{
    // Luma weights: unsigned, 16 fractional bits (0.5 added before truncation to round).
    const ushort wr = static_cast<ushort>(coef[0]*(1 << 16) + 0.5f);
    const ushort wg = static_cast<ushort>(coef[1]*(1 << 16) + 0.5f);
    const ushort wb = static_cast<ushort>(coef[2]*(1 << 16) + 0.5f);

    // Chroma scales: signed, 12 fractional bits.
    const short su = static_cast<short>(coef[3]*(1 << 12) + 0.5f);
    const short sv = static_cast<short>(coef[4]*(1 << 12) + 0.5f);

    int x = 0;

#if CV_SIMD
    const int step = v_uint8::nlanes;

    // Loop-invariant broadcasts.
    const v_uint16 vwr = vx_setall_u16(wr);
    const v_uint16 vwg = vx_setall_u16(wg);
    const v_uint16 vwb = vx_setall_u16(wb);
    const v_int16  vsu = vx_setall_s16(su);
    const v_int16  vsv = vx_setall_s16(sv);
    const v_uint16 y_half  = vx_setall_u16(1 << 6);    // 0.5 with 7 fractional bits
    const v_int16  uv_bias = vx_setall_s16(257 << 2);  // 128.5 with 3 fractional bits

    for ( ; x <= width - step; x += step)
    {
        v_uint8 r8, g8, b8;
        v_load_deinterleave(&in[3*x], r8, g8, b8);

        // Widen every channel to two 16-bit halves.
        v_uint16 r_lo, r_hi, g_lo, g_hi, b_lo, b_hi;
        v_expand(r8, r_lo, r_hi);
        v_expand(g8, g_lo, g_hi);
        v_expand(b8, b_lo, b_hi);

        // Give the samples 7 fractional bits (255 << 7 still fits 15 bits).
        r_lo = r_lo << 7;
        r_hi = r_hi << 7;
        g_lo = g_lo << 7;
        g_hi = g_hi << 7;
        b_lo = b_lo << 7;
        b_hi = b_hi << 7;

        // Luma, 7 fractional bits.
        const v_uint16 y_lo = v_mul_hi(vwr, r_lo)
                            + v_mul_hi(vwg, g_lo)
                            + v_mul_hi(vwb, b_lo);
        const v_uint16 y_hi = v_mul_hi(vwr, r_hi)
                            + v_mul_hi(vwg, g_hi)
                            + v_mul_hi(vwb, b_hi);

        // Signed views of the same bits for the chroma differences.
        const v_int16 sr_lo = v_reinterpret_as_s16(r_lo);
        const v_int16 sr_hi = v_reinterpret_as_s16(r_hi);
        const v_int16 sb_lo = v_reinterpret_as_s16(b_lo);
        const v_int16 sb_hi = v_reinterpret_as_s16(b_hi);
        const v_int16 sy_lo = v_reinterpret_as_s16(y_lo);
        const v_int16 sy_hi = v_reinterpret_as_s16(y_hi);

        // Chroma without the +128 offset, 3 fractional bits.
        const v_int16 u_lo = v_mul_hi(vsu, sb_lo - sy_lo);
        const v_int16 u_hi = v_mul_hi(vsu, sb_hi - sy_hi);
        const v_int16 v_lo = v_mul_hi(vsv, sr_lo - sy_lo);
        const v_int16 v_hi = v_mul_hi(vsv, sr_hi - sy_hi);

        // Round, drop the fractional bits and narrow back to bytes.
        const v_uint8 y8 = v_pack((y_lo + y_half) >> 7,
                                  (y_hi + y_half) >> 7);
        const v_uint8 u8 = v_pack_u((u_lo + uv_bias) >> 3,
                                    (u_hi + uv_bias) >> 3);
        const v_uint8 v8 = v_pack_u((v_lo + uv_bias) >> 3,
                                    (v_hi + uv_bias) >> 3);

        v_store_interleave(&out[3*x], y8, u8, v8);
    }
#endif

    // Scalar path: whole row without SIMD, otherwise the leftover pixels.
    for ( ; x < width; ++x)
    {
        const uchar *src = in  + 3*x;
        uchar       *dst = out + 3*x;

        const short r = src[0] << 7;  // 7 fractional bits
        const short g = src[1] << 7;
        const short b = src[2] << 7;

        const short y = (wr*r + wg*g + wb*b) >> 16;  // 7 fractional bits
        const short u = su*(b - y) >> 16;            // 3 fractional bits
        const short v = sv*(r - y) >> 16;

        dst[0] = static_cast<uchar>((y + (1 << 6)) >> 7);
        dst[1] = saturate<uchar>((u + (128 << 3) + (1 << 2)) >> 3);
        dst[2] = saturate<uchar>((v + (128 << 3) + (1 << 2)) >> 3);
    }
}
|
||||
|
||||
// YUV-to-RGB conversion of one row, done entirely in 16-bit fixed point.
//
//   in    - `width` interleaved pixels, 3 bytes each (Y, U, V)
//   out   - `width` interleaved pixels, 3 bytes each (R, G, B)
//   coef  - four conversion coefficients, used as shown below
//
// With U and V first reduced by 128:
//   R = Y + c0*V,   G = Y + c1*U + c2*V,   B = Y + c3*U
//
// Fixed-point layout: coefficients carry 12 fractional bits, U/V carry 7 and
// Y carries 3. The high 16 bits of coefficient*chroma therefore have 3
// fractional bits and can be added to Y directly.
//
// NOTE: for G the vector loop truncates both products separately (v_mul_hi),
// while the scalar tail sums the full products before shifting, so the two
// paths may differ in the lowest fractional bits.
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
{
    // Quantize a coefficient to signed Q1.3.12, rounding to nearest.
    // The cast truncates toward zero, so the half must be added with the sign
    // of the value; coefficients here may be negative (the G terms normally
    // are), and adding +0.5 to those would round them the wrong way.
    const auto to_q12 = [](float c) -> short
    {
        return static_cast<short>(c * (1 << 12) + (c < 0 ? -0.5f : 0.5f));
    };

    short c0 = to_q12(coef[0]);  // Q1.3.12
    short c1 = to_q12(coef[1]);
    short c2 = to_q12(coef[2]);
    short c3 = to_q12(coef[3]);

    int w = 0;

#if CV_SIMD
    static const int nlanes = v_uint8::nlanes;
    for ( ; w <= width - nlanes; w += nlanes)
    {
        v_uint8 y, u, v;
        v_load_deinterleave(&in[3*w], y, u, v);

        // Widen every channel to two 16-bit halves.
        v_uint16 _y0, _y1, _u0, _u1, _v0, _v1;
        v_expand(y, _y0, _y1);
        v_expand(u, _u0, _u1);
        v_expand(v, _v0, _v1);

        // Values are at most 255, so the signed view holds the same numbers.
        v_int16 y0, y1, u0, u1, v0, v1;
        y0 = v_reinterpret_as_s16(_y0);
        y1 = v_reinterpret_as_s16(_y1);
        u0 = v_reinterpret_as_s16(_u0);
        u1 = v_reinterpret_as_s16(_u1);
        v0 = v_reinterpret_as_s16(_v0);
        v1 = v_reinterpret_as_s16(_v1);

        y0 = y0 << 3;                         // Q1.12.3
        y1 = y1 << 3;
        u0 = (u0 - vx_setall_s16(128)) << 7;  // Q1.8.7
        u1 = (u1 - vx_setall_s16(128)) << 7;
        v0 = (v0 - vx_setall_s16(128)) << 7;
        v1 = (v1 - vx_setall_s16(128)) << 7;

        v_int16 r0, r1, g0, g1, b0, b1;
        r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0);  // Q1.12.3
        r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1);
        g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0)
                + v_mul_hi(vx_setall_s16(c2), v0);
        g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1)
                + v_mul_hi(vx_setall_s16(c2), v1);
        b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0);
        b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1);

        // Add 0.5 (in Q.3), drop the fractional bits and narrow to bytes.
        v_uint8 r, g, b;
        r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3,
                     (r1 + vx_setall_s16(1 << 2)) >> 3);
        g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3,
                     (g1 + vx_setall_s16(1 << 2)) >> 3);
        b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3,
                     (b1 + vx_setall_s16(1 << 2)) >> 3);

        v_store_interleave(&out[3*w], r, g, b);
    }
#endif

    // Scalar path: whole row without SIMD, otherwise the leftover pixels.
    for ( ; w < width; w++)
    {
        short y = in[3*w] << 3;  // Q1.12.3

        // (sample - 128) can be negative; scale by multiplication because
        // left-shifting a negative signed value is undefined before C++20.
        short u = static_cast<short>((in[3*w + 1] - 128) * (1 << 7));  // Q1.8.7
        short v = static_cast<short>((in[3*w + 2] - 128) * (1 << 7));

        short r = y + ( c0*v         >> 16);  // Q1.12.3
        short g = y + ((c1*u + c2*v) >> 16);
        short b = y + ((c3*u       ) >> 16);

        out[3*w    ] = saturate<uchar>((r + (1 << 2)) >> 3);
        out[3*w + 1] = saturate<uchar>((g + (1 << 2)) >> 3);
        out[3*w + 2] = saturate<uchar>((b + (1 << 2)) >> 3);
    }
}
|
||||
|
||||
//---------------------
|
||||
//
|
||||
// Fluid kernels: Sobel
|
||||
|
Loading…
Reference in New Issue
Block a user