Merge pull request #13329 from elatkin:el/gapi_perf_medblur

GAPI (fluid): Median blur optimization (#13329) * GAPI (fluid): Median blur optimization: reference 3x3 * GAPI (fluid): Median blur optimization: CPU dispatcher * GAPI (fluid): Median blur optimization: manual CV_SIMD
2025-08-05 22:19:14 +08:00 · 2018-11-29 18:02:29 +03:00 · 2018-11-29 18:02:29 +03:00 · ab430b8c87
commit ab430b8c87
parent 6374b99a1a
4 changed files with 253 additions and 16 deletions
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@ -1442,7 +1442,9 @@ static void run_medianblur(      Buffer& dst,
                           const View  & src,
                                 int     ksize)
 {
-    static const int kmax = 9;
+    static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");
+
+    constexpr int kmax = 9;
    GAPI_Assert(ksize <= kmax);

    const SRC *in[ kmax ];
@ -1460,24 +1462,33 @@ static void run_medianblur(      Buffer& dst,
    int width = dst.length();
    int chan  = dst.meta().chan;

-    for (int w=0; w < width; w++)
+    // optimized: if 3x3
+
+    if (3 == ksize)
    {
-        // TODO: make this cycle innermost
-        for (int c=0; c < chan; c++)
+        run_medblur3x3_impl(out, in, width, chan);
+        return;
+    }
+
+    // reference: any ksize
+
+    int length = width * chan;
+    int klength = ksize * ksize;
+    int klenhalf = klength / 2;
+
+    for (int l=0; l < length; l++)
+    {
+        SRC neighbours[kmax * kmax];
+
+        for (int i=0; i < ksize; i++)
+        for (int j=0; j < ksize; j++)
        {
-            SRC neighbours[kmax * kmax];
-
-            for (int i=0; i < ksize; i++)
-            for (int j=0; j < ksize; j++)
-            {
-                neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c];
-            }
-
-            int length = ksize * ksize;
-            std::nth_element(neighbours, neighbours + length/2, neighbours + length);
-
-            out[w*chan + c] = saturate<DST>(neighbours[length/2], rintf);
+            neighbours[i*ksize + j] = in[i][l + (j - border)*chan];
        }
+
+        std::nth_element(neighbours, neighbours + klenhalf, neighbours + klength);
+
+        out[l] = saturate<DST>(neighbours[klenhalf], rintf);
    }
 }

--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@ -134,6 +134,26 @@ RUN_MORPHOLOGY3X3_IMPL( float)

 #undef RUN_MORPHOLOGY3X3_IMPL

+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define RUN_MEDBLUR3X3_IMPL(T)                                        \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
+{                                                                     \
+    CV_CPU_DISPATCH(run_medblur3x3_impl, (out, in, width, chan),      \
+        CV_CPU_DISPATCH_MODES_ALL);                                   \
+}
+
+RUN_MEDBLUR3X3_IMPL(uchar )
+RUN_MEDBLUR3X3_IMPL(ushort)
+RUN_MEDBLUR3X3_IMPL( short)
+RUN_MEDBLUR3X3_IMPL( float)
+
+#undef RUN_MEDBLUR3X3_IMPL
+
 } // namespace fliud
 } // namespace gapi
 } // namespace cv
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@ -99,6 +99,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)

 #undef RUN_MORPHOLOGY3X3_IMPL

+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define RUN_MEDBLUR3X3_IMPL(T) \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
+
+RUN_MEDBLUR3X3_IMPL(uchar )
+RUN_MEDBLUR3X3_IMPL(ushort)
+RUN_MEDBLUR3X3_IMPL( short)
+RUN_MEDBLUR3X3_IMPL( float)
+
+#undef RUN_MEDBLUR3X3_IMPL
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@ -117,6 +117,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)

 #undef RUN_MORPHOLOGY3X3_IMPL

+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define RUN_MEDBLUR3X3_IMPL(T) \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
+
+RUN_MEDBLUR3X3_IMPL(uchar )
+RUN_MEDBLUR3X3_IMPL(ushort)
+RUN_MEDBLUR3X3_IMPL( short)
+RUN_MEDBLUR3X3_IMPL( float)
+
+#undef RUN_MEDBLUR3X3_IMPL
+
 //----------------------------------------------------------------------

 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@ -1580,6 +1596,180 @@ RUN_MORPHOLOGY3X3_IMPL( float)

 #undef RUN_MORPHOLOGY3X3_IMPL

+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+template<typename T>
+static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)
+{
+    constexpr int ksize = 3;
+    constexpr int border = (ksize - 1) / 2;
+
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    for (int l=0; l < length; l++)
+    {
+        T t[3][3];
+
+        // neighbourhood 3x3
+        t[0][0] = in[0][l - shift];    t[0][1] = in[0][l];    t[0][2] = in[0][l + shift];
+        t[1][0] = in[1][l - shift];    t[1][1] = in[1][l];    t[1][2] = in[1][l + shift];
+        t[2][0] = in[2][l - shift];    t[2][1] = in[2][l];    t[2][2] = in[2][l + shift];
+
+        // sort 2 values
+        auto sort = [](T& a, T& b)
+        {
+            T u=a, v=b;
+            a = (std::min)(u, v);
+            b = (std::max)(u, v);
+        };
+
+        // horizontal: 3-elements bubble-sort per each row
+        sort(t[0][0], t[0][1]);    sort(t[0][1], t[0][2]);    sort(t[0][0], t[0][1]);
+        sort(t[1][0], t[1][1]);    sort(t[1][1], t[1][2]);    sort(t[1][0], t[1][1]);
+        sort(t[2][0], t[2][1]);    sort(t[2][1], t[2][2]);    sort(t[2][0], t[2][1]);
+
+        // vertical: columns bubble-sort (although partial)
+        sort(t[0][0], t[1][0]);    sort(t[0][1], t[1][1]);  /*sort(t[0][2], t[1][2]);*/
+        sort(t[1][0], t[2][0]);    sort(t[1][1], t[2][1]);    sort(t[1][2], t[2][2]);
+      /*sort(t[0][0], t[1][0]);*/  sort(t[0][1], t[1][1]);    sort(t[0][2], t[1][2]);
+
+        // diagonal: bubble-sort (in opposite order!)
+        sort(t[1][1], t[0][2]);    sort(t[2][0], t[1][1]);    sort(t[1][1], t[0][2]);
+
+        out[l] = t[1][1];
+    }
+}
+
+#if CV_SIMD
+template<typename VT, typename T>
+static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
+{
+    constexpr int ksize = 3;
+    constexpr int border = (ksize - 1) / 2;
+
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = VT::nlanes;
+
+        // main part of output row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            VT t00, t01, t02, t10, t11, t12, t20, t21, t22;
+
+            // neighbourhood 3x3
+
+            t00 = vx_load(&in[0][l - shift]);
+            t01 = vx_load(&in[0][l        ]);
+            t02 = vx_load(&in[0][l + shift]);
+
+            t10 = vx_load(&in[1][l - shift]);
+            t11 = vx_load(&in[1][l        ]);
+            t12 = vx_load(&in[1][l + shift]);
+
+            t20 = vx_load(&in[2][l - shift]);
+            t21 = vx_load(&in[2][l        ]);
+            t22 = vx_load(&in[2][l + shift]);
+
+            // sort 2 values
+            auto sort = [](VT& a, VT& b)
+            {
+                VT u=a, v=b;
+                a = v_min(u, v);
+                b = v_max(u, v);
+            };
+
+            // horizontal: 3-elements bubble-sort per each row
+            sort(t00, t01);    sort(t01, t02);    sort(t00, t01);
+            sort(t10, t11);    sort(t11, t12);    sort(t10, t11);
+            sort(t20, t21);    sort(t21, t22);    sort(t20, t21);
+
+            // vertical: columns bubble-sort (although partial)
+            sort(t00, t10);    sort(t01, t11);  /*sort(t02, t12);*/
+            sort(t10, t20);    sort(t11, t21);    sort(t12, t22);
+          /*sort(t00, t10);*/  sort(t01, t11);    sort(t02, t12);
+
+            // diagonal: bubble-sort (in opposite order!)
+            sort(t11, t02);    sort(t20, t11);    sort(t11, t02);
+
+            v_store(&out[l], t11);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+}
+#endif
+
+template<typename T>
+static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
+{
+#if CV_SIMD
+    int length = width * chan;
+
+    // length variable may be unused if types do not match at 'if' statements below
+    (void) length;
+
+    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
+    {
+        run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
+                                       reinterpret_cast<const float**>(in),
+                                       width, chan);
+        return;
+    }
+
+    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
+    {
+        run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
+                                     reinterpret_cast<const short**>(in),
+                                     width, chan);
+        return;
+    }
+
+    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
+    {
+        run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
+                                      reinterpret_cast<const ushort**>(in),
+                                      width, chan);
+        return;
+    }
+
+    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
+    {
+        run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
+                                     reinterpret_cast<const uchar**>(in),
+                                     width, chan);
+        return;
+    }
+#endif
+
+    run_medblur3x3_reference(out, in, width, chan);
+}
+
+#define RUN_MEDBLUR3X3_IMPL(T)                                        \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
+{                                                                     \
+    run_medblur3x3_code(out, in, width, chan);                        \
+}
+
+RUN_MEDBLUR3X3_IMPL(uchar )
+RUN_MEDBLUR3X3_IMPL(ushort)
+RUN_MEDBLUR3X3_IMPL( short)
+RUN_MEDBLUR3X3_IMPL( float)
+
+#undef RUN_MEDBLUR3X3_IMPL
+
 //------------------------------------------------------------------------------

 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY