mirror of
https://github.com/opencv/opencv.git
synced 2025-01-19 06:53:50 +08:00
Merge pull request #13329 from elatkin:el/gapi_perf_medblur
GAPI (fluid): Median blur optimization (#13329) * GAPI (fluid): Median blur optimization: reference 3x3 * GAPI (fluid): Median blur optimization: CPU dispatcher * GAPI (fluid): Median blur optimization: manual CV_SIMD
This commit is contained in:
parent
6374b99a1a
commit
ab430b8c87
@ -1442,7 +1442,9 @@ static void run_medianblur( Buffer& dst,
|
||||
const View & src,
|
||||
int ksize)
|
||||
{
|
||||
static const int kmax = 9;
|
||||
static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");
|
||||
|
||||
constexpr int kmax = 9;
|
||||
GAPI_Assert(ksize <= kmax);
|
||||
|
||||
const SRC *in[ kmax ];
|
||||
@ -1460,24 +1462,33 @@ static void run_medianblur( Buffer& dst,
|
||||
int width = dst.length();
|
||||
int chan = dst.meta().chan;
|
||||
|
||||
for (int w=0; w < width; w++)
|
||||
// optimized: if 3x3
|
||||
|
||||
if (3 == ksize)
|
||||
{
|
||||
// TODO: make this cycle innermost
|
||||
for (int c=0; c < chan; c++)
|
||||
run_medblur3x3_impl(out, in, width, chan);
|
||||
return;
|
||||
}
|
||||
|
||||
// reference: any ksize
|
||||
|
||||
int length = width * chan;
|
||||
int klength = ksize * ksize;
|
||||
int klenhalf = klength / 2;
|
||||
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
SRC neighbours[kmax * kmax];
|
||||
|
||||
for (int i=0; i < ksize; i++)
|
||||
for (int j=0; j < ksize; j++)
|
||||
{
|
||||
SRC neighbours[kmax * kmax];
|
||||
|
||||
for (int i=0; i < ksize; i++)
|
||||
for (int j=0; j < ksize; j++)
|
||||
{
|
||||
neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c];
|
||||
}
|
||||
|
||||
int length = ksize * ksize;
|
||||
std::nth_element(neighbours, neighbours + length/2, neighbours + length);
|
||||
|
||||
out[w*chan + c] = saturate<DST>(neighbours[length/2], rintf);
|
||||
neighbours[i*ksize + j] = in[i][l + (j - border)*chan];
|
||||
}
|
||||
|
||||
std::nth_element(neighbours, neighbours + klenhalf, neighbours + klength);
|
||||
|
||||
out[l] = saturate<DST>(neighbours[klenhalf], rintf);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -134,6 +134,26 @@ RUN_MORPHOLOGY3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MORPHOLOGY3X3_IMPL
|
||||
|
||||
//---------------------------
|
||||
//
|
||||
// Fluid kernels: Median blur
|
||||
//
|
||||
//---------------------------
|
||||
|
||||
#define RUN_MEDBLUR3X3_IMPL(T) \
|
||||
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
|
||||
{ \
|
||||
CV_CPU_DISPATCH(run_medblur3x3_impl, (out, in, width, chan), \
|
||||
CV_CPU_DISPATCH_MODES_ALL); \
|
||||
}
|
||||
|
||||
RUN_MEDBLUR3X3_IMPL(uchar )
|
||||
RUN_MEDBLUR3X3_IMPL(ushort)
|
||||
RUN_MEDBLUR3X3_IMPL( short)
|
||||
RUN_MEDBLUR3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MEDBLUR3X3_IMPL
|
||||
|
||||
} // namespace fliud
|
||||
} // namespace gapi
|
||||
} // namespace cv
|
||||
|
@ -99,6 +99,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MORPHOLOGY3X3_IMPL
|
||||
|
||||
//---------------------------
|
||||
//
|
||||
// Fluid kernels: Median blur
|
||||
//
|
||||
//---------------------------
|
||||
|
||||
#define RUN_MEDBLUR3X3_IMPL(T) \
|
||||
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
|
||||
|
||||
RUN_MEDBLUR3X3_IMPL(uchar )
|
||||
RUN_MEDBLUR3X3_IMPL(ushort)
|
||||
RUN_MEDBLUR3X3_IMPL( short)
|
||||
RUN_MEDBLUR3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MEDBLUR3X3_IMPL
|
||||
|
||||
} // namespace fluid
|
||||
} // namespace gapi
|
||||
} // namespace cv
|
||||
|
@ -117,6 +117,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MORPHOLOGY3X3_IMPL
|
||||
|
||||
//---------------------------
|
||||
//
|
||||
// Fluid kernels: Median blur
|
||||
//
|
||||
//---------------------------
|
||||
|
||||
#define RUN_MEDBLUR3X3_IMPL(T) \
|
||||
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
|
||||
|
||||
RUN_MEDBLUR3X3_IMPL(uchar )
|
||||
RUN_MEDBLUR3X3_IMPL(ushort)
|
||||
RUN_MEDBLUR3X3_IMPL( short)
|
||||
RUN_MEDBLUR3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MEDBLUR3X3_IMPL
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
@ -1580,6 +1596,180 @@ RUN_MORPHOLOGY3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MORPHOLOGY3X3_IMPL
|
||||
|
||||
//---------------------------
|
||||
//
|
||||
// Fluid kernels: Median blur
|
||||
//
|
||||
//---------------------------
|
||||
|
||||
template<typename T>
|
||||
static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)
|
||||
{
|
||||
constexpr int ksize = 3;
|
||||
constexpr int border = (ksize - 1) / 2;
|
||||
|
||||
const int length = width * chan;
|
||||
const int shift = border * chan;
|
||||
|
||||
for (int l=0; l < length; l++)
|
||||
{
|
||||
T t[3][3];
|
||||
|
||||
// neighbourhood 3x3
|
||||
t[0][0] = in[0][l - shift]; t[0][1] = in[0][l]; t[0][2] = in[0][l + shift];
|
||||
t[1][0] = in[1][l - shift]; t[1][1] = in[1][l]; t[1][2] = in[1][l + shift];
|
||||
t[2][0] = in[2][l - shift]; t[2][1] = in[2][l]; t[2][2] = in[2][l + shift];
|
||||
|
||||
// sort 2 values
|
||||
auto sort = [](T& a, T& b)
|
||||
{
|
||||
T u=a, v=b;
|
||||
a = (std::min)(u, v);
|
||||
b = (std::max)(u, v);
|
||||
};
|
||||
|
||||
// horizontal: 3-elements bubble-sort per each row
|
||||
sort(t[0][0], t[0][1]); sort(t[0][1], t[0][2]); sort(t[0][0], t[0][1]);
|
||||
sort(t[1][0], t[1][1]); sort(t[1][1], t[1][2]); sort(t[1][0], t[1][1]);
|
||||
sort(t[2][0], t[2][1]); sort(t[2][1], t[2][2]); sort(t[2][0], t[2][1]);
|
||||
|
||||
// vertical: columns bubble-sort (although partial)
|
||||
sort(t[0][0], t[1][0]); sort(t[0][1], t[1][1]); /*sort(t[0][2], t[1][2]);*/
|
||||
sort(t[1][0], t[2][0]); sort(t[1][1], t[2][1]); sort(t[1][2], t[2][2]);
|
||||
/*sort(t[0][0], t[1][0]);*/ sort(t[0][1], t[1][1]); sort(t[0][2], t[1][2]);
|
||||
|
||||
// diagonal: bubble-sort (in opposite order!)
|
||||
sort(t[1][1], t[0][2]); sort(t[2][0], t[1][1]); sort(t[1][1], t[0][2]);
|
||||
|
||||
out[l] = t[1][1];
|
||||
}
|
||||
}
|
||||
|
||||
#if CV_SIMD
|
||||
template<typename VT, typename T>
|
||||
static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
|
||||
{
|
||||
constexpr int ksize = 3;
|
||||
constexpr int border = (ksize - 1) / 2;
|
||||
|
||||
const int length = width * chan;
|
||||
const int shift = border * chan;
|
||||
|
||||
for (int l=0; l < length;)
|
||||
{
|
||||
constexpr int nlanes = VT::nlanes;
|
||||
|
||||
// main part of output row
|
||||
for (; l <= length - nlanes; l += nlanes)
|
||||
{
|
||||
VT t00, t01, t02, t10, t11, t12, t20, t21, t22;
|
||||
|
||||
// neighbourhood 3x3
|
||||
|
||||
t00 = vx_load(&in[0][l - shift]);
|
||||
t01 = vx_load(&in[0][l ]);
|
||||
t02 = vx_load(&in[0][l + shift]);
|
||||
|
||||
t10 = vx_load(&in[1][l - shift]);
|
||||
t11 = vx_load(&in[1][l ]);
|
||||
t12 = vx_load(&in[1][l + shift]);
|
||||
|
||||
t20 = vx_load(&in[2][l - shift]);
|
||||
t21 = vx_load(&in[2][l ]);
|
||||
t22 = vx_load(&in[2][l + shift]);
|
||||
|
||||
// sort 2 values
|
||||
auto sort = [](VT& a, VT& b)
|
||||
{
|
||||
VT u=a, v=b;
|
||||
a = v_min(u, v);
|
||||
b = v_max(u, v);
|
||||
};
|
||||
|
||||
// horizontal: 3-elements bubble-sort per each row
|
||||
sort(t00, t01); sort(t01, t02); sort(t00, t01);
|
||||
sort(t10, t11); sort(t11, t12); sort(t10, t11);
|
||||
sort(t20, t21); sort(t21, t22); sort(t20, t21);
|
||||
|
||||
// vertical: columns bubble-sort (although partial)
|
||||
sort(t00, t10); sort(t01, t11); /*sort(t02, t12);*/
|
||||
sort(t10, t20); sort(t11, t21); sort(t12, t22);
|
||||
/*sort(t00, t10);*/ sort(t01, t11); sort(t02, t12);
|
||||
|
||||
// diagonal: bubble-sort (in opposite order!)
|
||||
sort(t11, t02); sort(t20, t11); sort(t11, t02);
|
||||
|
||||
v_store(&out[l], t11);
|
||||
}
|
||||
|
||||
// tail (if any)
|
||||
if (l < length)
|
||||
{
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
l = length - nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
|
||||
{
|
||||
#if CV_SIMD
|
||||
int length = width * chan;
|
||||
|
||||
// length variable may be unused if types do not match at 'if' statements below
|
||||
(void) length;
|
||||
|
||||
if (std::is_same<T, float>::value && length >= v_float32::nlanes)
|
||||
{
|
||||
run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
|
||||
reinterpret_cast<const float**>(in),
|
||||
width, chan);
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<T, short>::value && length >= v_int16::nlanes)
|
||||
{
|
||||
run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
|
||||
reinterpret_cast<const short**>(in),
|
||||
width, chan);
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
|
||||
{
|
||||
run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
|
||||
reinterpret_cast<const ushort**>(in),
|
||||
width, chan);
|
||||
return;
|
||||
}
|
||||
|
||||
if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
|
||||
{
|
||||
run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
|
||||
reinterpret_cast<const uchar**>(in),
|
||||
width, chan);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
run_medblur3x3_reference(out, in, width, chan);
|
||||
}
|
||||
|
||||
#define RUN_MEDBLUR3X3_IMPL(T) \
|
||||
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
|
||||
{ \
|
||||
run_medblur3x3_code(out, in, width, chan); \
|
||||
}
|
||||
|
||||
RUN_MEDBLUR3X3_IMPL(uchar )
|
||||
RUN_MEDBLUR3X3_IMPL(ushort)
|
||||
RUN_MEDBLUR3X3_IMPL( short)
|
||||
RUN_MEDBLUR3X3_IMPL( float)
|
||||
|
||||
#undef RUN_MEDBLUR3X3_IMPL
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
|
||||
|
Loading…
Reference in New Issue
Block a user