Merge pull request #13290 from elatkin:el/gapi_perf_filter2d

GAPI (fluid): Filter 2D optimization (#13290)

* GAPI (fluid): Filter 2D optimization: speedup 13x if float, 2x if integral

* GAPI (fluid): Filter 2D speedup 8x if output is short/ushort

* GAPI (fluid): Filter 2D speedup 7x if output is uchar

* GAPI (fluid): Filter 2D optimization: fixed compiler warnings

* GAPI (fluid): fix compiler warnings on Mac

* GAPI (fluid): fix compiler warnings on Mac

* GAPI (fluid): fix compiler errors on VS2015

* GAPI (fluid): fix compiler errors on VS2015

* GAPI (fluid): fix compiler errors on VS2015
This commit is contained in:
Evgeny Latkin 2018-11-27 19:12:14 +03:00 committed by Alexander Alekhin
parent 966f27df34
commit 6808d33b2f
4 changed files with 328 additions and 15 deletions

View File

@ -1052,24 +1052,30 @@ static void run_filter2d(Buffer& dst, const View& src,
int width = dst.length();
int chan = dst.meta().chan;
int length = width * chan;
for (int w=0; w < width; w++)
// manually optimized for 3x3
if (k_rows == 3 && k_cols == 3)
{
// TODO: make this cycle innermost
for (int c=0; c < chan; c++)
float scale = 1;
run_filter2d_3x3_impl(out, in, width, chan, k, scale, delta);
return;
}
// reference: any kernel size
for (int l=0; l < length; l++)
{
float sum = 0;
for (int i=0; i < k_rows; i++)
for (int j=0; j < k_cols; j++)
{
float sum = 0;
for (int i=0; i < k_rows; i++)
for (int j=0; j < k_cols; j++)
{
sum += in[i][(w + j - border_x)*chan + c] * k[k_cols*i + j];
}
float result = sum + delta;
out[w*chan + c] = saturate<DST>(result, rintf);
sum += in[i][l + (j - border_x)*chan] * k[k_cols*i + j];
}
float result = sum + delta;
out[l] = saturate<DST>(result, rintf);
}
}
@ -1097,6 +1103,7 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
int k_rows = kernel.rows;
int k_cols = kernel.cols;
const float *k = scratch.OutLine<float>(); // copy of kernel.data
// DST SRC OP __VA_ARGS__
@ -1120,7 +1127,12 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
const cv::Scalar & /* borderValue */,
Buffer & scratch)
{
cv::gapi::own::Size bufsize(kernel.rows * kernel.cols, 1);
int krows = kernel.rows;
int kcols = kernel.cols;
int buflen = krows * kcols; // kernel size
cv::gapi::own::Size bufsize(buflen, 1);
GMatDesc bufdesc = {CV_32F, 1, bufsize};
Buffer buffer(bufdesc);
scratch = std::move(buffer);

View File

@ -86,6 +86,31 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
//
//-------------------------
// Generates one run_filter2d_3x3_impl() overload per (DST, SRC) type pair.
// Each generated function only forwards its arguments, unchanged, through
// CV_CPU_DISPATCH to a function of the same name.
// NOTE(review): CV_CPU_DISPATCH is presumably OpenCV's CPU-dispatch helper that
// picks one of the CV_CPU_DISPATCH_MODES_ALL builds -- confirm against its definition.
#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \
void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kernel[], float scale, float delta) \
{ \
CV_CPU_DISPATCH(run_filter2d_3x3_impl, \
(out, in, width, chan, kernel, scale, delta), \
CV_CPU_DISPATCH_MODES_ALL); \
}
// Supported (DST, SRC) combinations: same-type integral filtering, plus any
// listed source type filtered into float.
RUN_FILTER2D_3X3_IMPL(uchar , uchar )
RUN_FILTER2D_3X3_IMPL(ushort, ushort)
RUN_FILTER2D_3X3_IMPL( short, short)
RUN_FILTER2D_3X3_IMPL( float, uchar )
RUN_FILTER2D_3X3_IMPL( float, ushort)
RUN_FILTER2D_3X3_IMPL( float, short)
RUN_FILTER2D_3X3_IMPL( float, float)
// The helper macro is local to this section.
#undef RUN_FILTER2D_3X3_IMPL
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -57,6 +57,26 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
//
//-------------------------
// Declares one run_filter2d_3x3_impl() overload per (DST, SRC) type pair:
//   out    - output row
//   in     - array of source row pointers
//   width  - row width; chan - number of channels
//   kernel - filter coefficients; scale, delta - applied to the filtered sum
// NOTE(review): the list of type pairs below must stay in sync with the
// definitions, which are not part of this header -- confirm when editing.
#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \
void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kernel[], float scale, float delta);
RUN_FILTER2D_3X3_IMPL(uchar , uchar )
RUN_FILTER2D_3X3_IMPL(ushort, ushort)
RUN_FILTER2D_3X3_IMPL( short, short)
RUN_FILTER2D_3X3_IMPL( float, uchar )
RUN_FILTER2D_3X3_IMPL( float, ushort)
RUN_FILTER2D_3X3_IMPL( float, short)
RUN_FILTER2D_3X3_IMPL( float, float)
// The helper macro is local to this section.
#undef RUN_FILTER2D_3X3_IMPL
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -17,6 +17,7 @@
#include "opencv2/core/hal/intrin.hpp"
#include <cstdint>
#include <cstring>
#include <vector>
@ -76,6 +77,26 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
//
//-------------------------
// Forward declarations of run_filter2d_3x3_impl(), one overload per
// (DST, SRC) type pair. These are emitted unconditionally; the matching
// definitions appear later, inside the section guarded by
// #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY.
// NOTE(review): keep this list of type pairs identical to the definition list.
#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \
void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kernel[], float scale, float delta);
RUN_FILTER2D_3X3_IMPL(uchar , uchar )
RUN_FILTER2D_3X3_IMPL(ushort, ushort)
RUN_FILTER2D_3X3_IMPL( short, short)
RUN_FILTER2D_3X3_IMPL( float, uchar )
RUN_FILTER2D_3X3_IMPL( float, ushort)
RUN_FILTER2D_3X3_IMPL( float, short)
RUN_FILTER2D_3X3_IMPL( float, float)
// The helper macro is local to this section.
#undef RUN_FILTER2D_3X3_IMPL
//----------------------------------------------------------------------
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@ -843,6 +864,241 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
//
//-------------------------
// Portable (no intrinsics) 3x3 filter producing one output row.
//
//   out    - destination row, width*chan elements are written
//   in     - three source row pointers; every row is read at indices
//            [-chan, width*chan + chan), so one pixel of border must be
//            addressable on both sides of each row
//   kernel - nine coefficients, row-major
//   scale, delta - result is sum*scale + delta; ignored entirely if noscale
//
// The final value is converted to DST via saturate<DST>(..., rintf).
template<bool noscale, typename DST, typename SRC>
static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, int chan,
                                       const float kernel[], float scale, float delta)
{
    static constexpr int kernel_side = 3;
    static constexpr int radius = (kernel_side - 1) / 2;

    const int total = width * chan;   // number of elements in a row
    const int step  = radius * chan;  // element offset to the neighbouring pixel

    const SRC *row0 = in[0];
    const SRC *row1 = in[1];
    const SRC *row2 = in[2];

    const float k00 = kernel[0], k01 = kernel[1], k02 = kernel[2];
    const float k10 = kernel[3], k11 = kernel[4], k12 = kernel[5];
    const float k20 = kernel[6], k21 = kernel[7], k22 = kernel[8];

    for (int x = 0; x < total; ++x)
    {
        // Terms are accumulated strictly left-to-right, row after row, which
        // is the same association as a single nine-term sum expression.
        float acc = row0[x - step] * k00;
        acc += row0[x       ] * k01;
        acc += row0[x + step] * k02;
        acc += row1[x - step] * k10;
        acc += row1[x       ] * k11;
        acc += row1[x + step] * k12;
        acc += row2[x - step] * k20;
        acc += row2[x       ] * k21;
        acc += row2[x + step] * k22;

        if (!noscale)
        {
            acc = acc*scale + delta;
        }

        out[x] = saturate<DST>(acc, rintf);
    }
}
#if CV_SIMD
// assume DST is short or ushort
template<bool noscale, typename DST, typename SRC>
static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan,
const float kernel[], float scale, float delta)
{
static constexpr int ksize = 3;
static constexpr int border = (ksize - 1) / 2;
const int length = width * chan;
const int shift = border * chan;
const float k[3][3] = {
{ kernel[0], kernel[1], kernel[2] },
{ kernel[3], kernel[4], kernel[5] },
{ kernel[6], kernel[7], kernel[8] }
};
for (int l=0; l < length;)
{
static constexpr int nlanes = v_int16::nlanes;
// main part of output row
for (; l <= length - nlanes; l += nlanes)
{
auto sumx = [in, shift, &k](int i, int j)
{
v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s);
s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s);
return s;
};
int l0 = l;
int l1 = l + nlanes/2;
v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
if (!noscale)
{
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
}
v_int32 res0 = v_round(sum0);
v_int32 res1 = v_round(sum1);
if (std::is_same<DST, ushort>::value)
{
v_uint16 res = v_pack_u(res0, res1);
v_store(reinterpret_cast<ushort*>(&out[l]), res);
}
else // if DST == short
{
v_int16 res = v_pack(res0, res1);
v_store(reinterpret_cast<short*>(&out[l]), res);
}
}
// tail (if any)
if (l < length)
{
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
}
}
}
// Vectorized 3x3 filter for one output row with an 8-bit unsigned result.
//
// The row is processed v_uint8::nlanes elements at a time. If the row length
// (width*chan) is not a multiple of that, the last step is rewound so that it
// ends exactly at the end of the row, recomputing a few elements. Hence the
// requirement length >= nlanes, which is verified in debug builds only.
template<bool noscale, typename SRC>
static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
                                      const float kernel[], float scale, float delta)
{
    static constexpr int kernel_side = 3;
    static constexpr int radius = (kernel_side - 1) / 2;
    static constexpr int nlanes = v_uint8::nlanes;

    const int length = width * chan;
    const int shift = radius * chan;

    const float k[3][3] = {
        { kernel[0], kernel[1], kernel[2] },
        { kernel[3], kernel[4], kernel[5] },
        { kernel[6], kernel[7], kernel[8] }
    };

    // Weighted sum of the left / centre / right neighbours taken from source
    // row `row`, for one float vector of elements starting at `pos`.
    auto row_sum = [in, shift, &k](int row, int pos)
    {
        v_float32 acc = vx_load_f32(&in[row][pos - shift]) * vx_setall_f32(k[row][0]);
        acc = v_fma(vx_load_f32(&in[row][pos        ]), vx_setall_f32(k[row][1]), acc);
        acc = v_fma(vx_load_f32(&in[row][pos + shift]), vx_setall_f32(k[row][2]), acc);
        return acc;
    };

    for (int pos = 0; pos < length; pos += nlanes)
    {
        // Tail: fewer than nlanes elements remain, so step back to the last
        // position at which a full vector still fits into the row.
        if (pos > length - nlanes)
        {
            GAPI_DbgAssert(length >= nlanes);
            pos = length - nlanes;
        }

        // One 8-bit vector is assembled from four float vectors.
        const int pos0 = pos;
        const int pos1 = pos +   nlanes/4;
        const int pos2 = pos + 2*nlanes/4;
        const int pos3 = pos + 3*nlanes/4;

        v_float32 acc0 = row_sum(0, pos0) + row_sum(1, pos0) + row_sum(2, pos0);
        v_float32 acc1 = row_sum(0, pos1) + row_sum(1, pos1) + row_sum(2, pos1);
        v_float32 acc2 = row_sum(0, pos2) + row_sum(1, pos2) + row_sum(2, pos2);
        v_float32 acc3 = row_sum(0, pos3) + row_sum(1, pos3) + row_sum(2, pos3);

        if (!noscale)
        {
            acc0 = v_fma(acc0, vx_setall_f32(scale), vx_setall_f32(delta));
            acc1 = v_fma(acc1, vx_setall_f32(scale), vx_setall_f32(delta));
            acc2 = v_fma(acc2, vx_setall_f32(scale), vx_setall_f32(delta));
            acc3 = v_fma(acc3, vx_setall_f32(scale), vx_setall_f32(delta));
        }

        // float -> int32 (v_round) -> int16 (v_pack) -> uint8 (v_pack_u)
        const v_int32 int0 = v_round(acc0);
        const v_int32 int1 = v_round(acc1);
        const v_int32 int2 = v_round(acc2);
        const v_int32 int3 = v_round(acc3);

        const v_int16 lower = v_pack(int0, int1);
        const v_int16 upper = v_pack(int2, int3);

        const v_uint8 packed = v_pack_u(lower, upper);
        v_store(&out[pos], packed);
    }
}
#endif
// Chooses the implementation of the 3x3 filter for one output row.
//
// With CV_SIMD enabled, uchar / short / ushort destinations use the
// hand-vectorized routines, provided the row (width*chan elements) holds at
// least one full vector of the destination type. Every other case, including
// all float destinations and rows that are too short, goes to the reference code.
template<bool noscale, typename DST, typename SRC>
static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan,
                                  const float kernel[], float scale, float delta)
{
#if CV_SIMD
    const int length = width * chan;
    // length variable may be unused if types do not match at 'if' statements below
    (void) length;

    // At most one of these holds for a given DST, so the order of the checks
    // below does not matter. The reinterpret_casts are no-ops for the matching
    // DST and only exist to keep the non-matching instantiations compilable.
    const bool dst_is_uchar  = std::is_same<DST, uchar >::value;
    const bool dst_is_short  = std::is_same<DST, short >::value;
    const bool dst_is_ushort = std::is_same<DST, ushort>::value;

    if (dst_is_uchar && length >= v_uint8::nlanes)
    {
        uchar *dst = reinterpret_cast<uchar*>(out);
        run_filter2d_3x3_any2char<noscale>(dst, in, width, chan, kernel, scale, delta);
        return;
    }

    if (dst_is_short && length >= v_int16::nlanes)
    {
        short *dst = reinterpret_cast<short*>(out);
        run_filter2d_3x3_any2short<noscale>(dst, in, width, chan, kernel, scale, delta);
        return;
    }

    if (dst_is_ushort && length >= v_uint16::nlanes)
    {
        ushort *dst = reinterpret_cast<ushort*>(out);
        run_filter2d_3x3_any2short<noscale>(dst, in, width, chan, kernel, scale, delta);
        return;
    }
#endif  // CV_SIMD

    run_filter2d_3x3_reference<noscale>(out, in, width, chan, kernel, scale, delta);
}
#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \
void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kernel[], float scale, float delta) \
{ \
if (scale == 1 && delta == 0) \
{ \
constexpr bool noscale = true; \
run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta); \
} \
else \
{ \
constexpr bool noscale = false; \
run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta); \
} \
}
RUN_FILTER2D_3X3_IMPL(uchar , uchar )
RUN_FILTER2D_3X3_IMPL(ushort, ushort)
RUN_FILTER2D_3X3_IMPL( short, short)
RUN_FILTER2D_3X3_IMPL( float, uchar )
RUN_FILTER2D_3X3_IMPL( float, ushort)
RUN_FILTER2D_3X3_IMPL( float, short)
RUN_FILTER2D_3X3_IMPL( float, float)
#undef RUN_FILTER2D_3X3_IMPL
//------------------------------------------------------------------------------
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY