Merge pull request #13329 from elatkin:el/gapi_perf_medblur

GAPI (fluid): Median blur optimization (#13329)

* GAPI (fluid): Median blur optimization: reference 3x3

* GAPI (fluid): Median blur optimization: CPU dispatcher

* GAPI (fluid): Median blur optimization: manual CV_SIMD
This commit is contained in:
Evgeny Latkin 2018-11-29 18:02:29 +03:00 committed by Alexander Alekhin
parent 6374b99a1a
commit ab430b8c87
4 changed files with 253 additions and 16 deletions

View File

@ -1442,7 +1442,9 @@ static void run_medianblur( Buffer& dst,
const View & src,
int ksize)
{
static const int kmax = 9;
static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");
constexpr int kmax = 9;
GAPI_Assert(ksize <= kmax);
const SRC *in[ kmax ];
@ -1460,24 +1462,33 @@ static void run_medianblur( Buffer& dst,
int width = dst.length();
int chan = dst.meta().chan;
for (int w=0; w < width; w++)
// optimized: if 3x3
if (3 == ksize)
{
// TODO: make this cycle innermost
for (int c=0; c < chan; c++)
run_medblur3x3_impl(out, in, width, chan);
return;
}
// reference: any ksize
int length = width * chan;
int klength = ksize * ksize;
int klenhalf = klength / 2;
for (int l=0; l < length; l++)
{
SRC neighbours[kmax * kmax];
for (int i=0; i < ksize; i++)
for (int j=0; j < ksize; j++)
{
SRC neighbours[kmax * kmax];
for (int i=0; i < ksize; i++)
for (int j=0; j < ksize; j++)
{
neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c];
}
int length = ksize * ksize;
std::nth_element(neighbours, neighbours + length/2, neighbours + length);
out[w*chan + c] = saturate<DST>(neighbours[length/2], rintf);
neighbours[i*ksize + j] = in[i][l + (j - border)*chan];
}
std::nth_element(neighbours, neighbours + klenhalf, neighbours + klength);
out[l] = saturate<DST>(neighbours[klenhalf], rintf);
}
}

View File

@ -134,6 +134,26 @@ RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------
// Dispatching entry points: one run_medblur3x3_impl overload per element type
// (uchar, ushort, short, float). Each overload only forwards its arguments
// through CV_CPU_DISPATCH with CV_CPU_DISPATCH_MODES_ALL.
// NOTE(review): presumably this routes the call to the variant compiled for the
// best CPU instruction set available at run time -- confirm against the
// CV_CPU_DISPATCH documentation.
#define RUN_MEDBLUR3X3_IMPL(T) \
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
{ \
CV_CPU_DISPATCH(run_medblur3x3_impl, (out, in, width, chan), \
CV_CPU_DISPATCH_MODES_ALL); \
}
RUN_MEDBLUR3X3_IMPL(uchar )
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL( short)
RUN_MEDBLUR3X3_IMPL( float)
// The helper macro is local to this section, so it is removed right after use.
#undef RUN_MEDBLUR3X3_IMPL
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -99,6 +99,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------
// Prototypes of the 3x3 median blur row kernel, one overload per element type:
//   out   - output row
//   in    - array of input row pointers
//   width - row width, chan - number of channels
// NOTE(review): the units of width (pixels) and the number of rows expected in
// `in` (three) are taken from the implementation elsewhere -- confirm there.
#define RUN_MEDBLUR3X3_IMPL(T) \
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
RUN_MEDBLUR3X3_IMPL(uchar )
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL( short)
RUN_MEDBLUR3X3_IMPL( float)
// The helper macro is local to this section, so it is removed right after use.
#undef RUN_MEDBLUR3X3_IMPL
} // namespace fluid
} // namespace gapi
} // namespace cv

View File

@ -117,6 +117,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------
// Forward declarations of run_medblur3x3_impl for every supported element type
// (uchar, ushort, short, float). They are placed before the
// CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY guard below, so they stay visible even
// when only declarations are requested from this file.
#define RUN_MEDBLUR3X3_IMPL(T) \
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
RUN_MEDBLUR3X3_IMPL(uchar )
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL( short)
RUN_MEDBLUR3X3_IMPL( float)
// The helper macro is local to this section, so it is removed right after use.
#undef RUN_MEDBLUR3X3_IMPL
//----------------------------------------------------------------------
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@ -1580,6 +1596,180 @@ RUN_MORPHOLOGY3X3_IMPL( float)
#undef RUN_MORPHOLOGY3X3_IMPL
//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------
// Median blur 3x3, plain C++ variant.
//
// For every element of the output row the median of its 3x3 neighbourhood is
// selected with a fixed sequence of compare-exchange steps (no general sort).
//
//   out   - output row; width*chan elements are written
//   in    - three input rows; each one is read at indices
//           [-chan, width*chan + chan), i.e. one pixel of border on both sides
//   width - number of pixels in the output row
//   chan  - number of interleaved channels per pixel
template<typename T>
static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)
{
    constexpr int ksize  = 3;
    constexpr int border = (ksize - 1) / 2;

    const int total  = width * chan;   // elements in the output row
    const int offset = border * chan;  // distance to the horizontal neighbour

    // compare-exchange: afterwards `lo` holds the smaller value, `hi` the larger
    const auto minmax = [](T& lo, T& hi)
    {
        const T x = lo;
        const T y = hi;
        lo = (std::min)(x, y);
        hi = (std::max)(x, y);
    };

    for (int pos = 0; pos < total; ++pos)
    {
        // gather the 3x3 window: left / centre / right element of every row
        T win[ksize][ksize];
        for (int r = 0; r < ksize; ++r)
        {
            const T *row = in[r];
            win[r][0] = row[pos - offset];
            win[r][1] = row[pos];
            win[r][2] = row[pos + offset];
        }

        // step 1: order every row ascending
        for (int r = 0; r < ksize; ++r)
        {
            minmax(win[r][0], win[r][1]);
            minmax(win[r][1], win[r][2]);
            minmax(win[r][0], win[r][1]);
        }

        // step 2: per column, only as much ordering as the median needs
        // column 0 (row minima): largest one goes to the bottom
        minmax(win[0][0], win[1][0]);
        minmax(win[1][0], win[2][0]);
        // column 1 (row medians): fully ordered, median lands in the middle
        minmax(win[0][1], win[1][1]);
        minmax(win[1][1], win[2][1]);
        minmax(win[0][1], win[1][1]);
        // column 2 (row maxima): smallest one goes to the top
        minmax(win[1][2], win[2][2]);
        minmax(win[0][2], win[1][2]);

        // step 3: median of the anti-diagonal {win[2][0], win[1][1], win[0][2]}
        minmax(win[1][1], win[0][2]);
        minmax(win[2][0], win[1][1]);
        minmax(win[1][1], win[0][2]);

        out[pos] = win[1][1];
    }
}
#if CV_SIMD
// Median blur 3x3, vectorized variant.
//
// Same compare-exchange sequence as the scalar reference, but applied to whole
// vectors of type VT: every step advances by VT::nlanes elements of the row.
// If the row length is not a multiple of VT::nlanes, the remaining elements are
// produced by one extra step positioned so that it ends exactly at the end of
// the row; this is why the row must hold at least VT::nlanes elements (checked
// with GAPI_DbgAssert only).
//
//   out   - output row; width*chan elements are written
//   in    - three input rows, read with one pixel (chan elements) of border
//   width - number of pixels in the output row
//   chan  - number of interleaved channels per pixel
template<typename VT, typename T>
static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
{
    constexpr int ksize  = 3;
    constexpr int border = (ksize - 1) / 2;
    constexpr int nlanes = VT::nlanes;

    const int length = width * chan;
    const int offset = border * chan;

    // lane-wise compare-exchange: `lo` gets the minima, `hi` the maxima
    const auto minmax = [](VT& lo, VT& hi)
    {
        const VT x = lo;
        const VT y = hi;
        lo = v_min(x, y);
        hi = v_max(x, y);
    };

    int pos = 0;
    while (pos < length)
    {
        // whole vectors
        for (; pos <= length - nlanes; pos += nlanes)
        {
            // 3x3 window: left / centre / right neighbours of every row
            VT v00 = vx_load(&in[0][pos - offset]);
            VT v01 = vx_load(&in[0][pos         ]);
            VT v02 = vx_load(&in[0][pos + offset]);

            VT v10 = vx_load(&in[1][pos - offset]);
            VT v11 = vx_load(&in[1][pos         ]);
            VT v12 = vx_load(&in[1][pos + offset]);

            VT v20 = vx_load(&in[2][pos - offset]);
            VT v21 = vx_load(&in[2][pos         ]);
            VT v22 = vx_load(&in[2][pos + offset]);

            // step 1: order every row ascending
            minmax(v00, v01); minmax(v01, v02); minmax(v00, v01);
            minmax(v10, v11); minmax(v11, v12); minmax(v10, v11);
            minmax(v20, v21); minmax(v21, v22); minmax(v20, v21);

            // step 2: per column, only as much ordering as the median needs
            // column 0: largest of the row minima ends up in v20
            minmax(v00, v10);
            minmax(v10, v20);
            // column 1: fully ordered, median of row medians ends up in v11
            minmax(v01, v11);
            minmax(v11, v21);
            minmax(v01, v11);
            // column 2: smallest of the row maxima ends up in v02
            minmax(v12, v22);
            minmax(v02, v12);

            // step 3: median of the anti-diagonal {v20, v11, v02}
            minmax(v11, v02);
            minmax(v20, v11);
            minmax(v11, v02);

            v_store(&out[pos], v11);
        }

        // leftover shorter than one vector: step back so that the next
        // (and last) vector ends exactly at the end of the row
        if (pos < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            pos = length - nlanes;
        }
    }
}
#endif
// Picks the 3x3 median blur implementation for element type T.
//
// When CV_SIMD is enabled and the row (width*chan elements) is at least one
// vector long, the vectorized kernel matching T is used; in every other case
// the scalar reference kernel runs.
//
// The reinterpret_casts below never change the actual type: each one is only
// reached at run time when T already is the target type. They are needed just
// to make every branch compile for every T.
template<typename T>
static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
{
#if CV_SIMD
    const int length = width * chan;

    // silences "unused variable" diagnostics for instantiations in which
    // the type checks below are folded away
    (void) length;

    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
    {
        uchar        *dst = reinterpret_cast<uchar*>(out);
        const uchar **src = reinterpret_cast<const uchar**>(in);
        run_medblur3x3_simd<v_uint8>(dst, src, width, chan);
        return;
    }

    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
    {
        ushort        *dst = reinterpret_cast<ushort*>(out);
        const ushort **src = reinterpret_cast<const ushort**>(in);
        run_medblur3x3_simd<v_uint16>(dst, src, width, chan);
        return;
    }

    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
    {
        short        *dst = reinterpret_cast<short*>(out);
        const short **src = reinterpret_cast<const short**>(in);
        run_medblur3x3_simd<v_int16>(dst, src, width, chan);
        return;
    }

    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
    {
        float        *dst = reinterpret_cast<float*>(out);
        const float **src = reinterpret_cast<const float**>(in);
        run_medblur3x3_simd<v_float32>(dst, src, width, chan);
        return;
    }
#endif

    // no SIMD, or the row is shorter than one vector
    run_medblur3x3_reference(out, in, width, chan);
}
// Definitions of run_medblur3x3_impl, one non-template overload per element
// type (uchar, ushort, short, float). Each overload simply forwards to the
// run_medblur3x3_code<T> template, which chooses between the SIMD and the
// reference kernel.
#define RUN_MEDBLUR3X3_IMPL(T) \
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
{ \
run_medblur3x3_code(out, in, width, chan); \
}
RUN_MEDBLUR3X3_IMPL(uchar )
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL( short)
RUN_MEDBLUR3X3_IMPL( float)
// The helper macro is local to this section, so it is removed right after use.
#undef RUN_MEDBLUR3X3_IMPL
//------------------------------------------------------------------------------
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY