Merge pull request #13221 from elatkin:el/gapi_perf_sepfilter

GAPI (fluid): optimization of Separable filter (#13221)

* GAPI (fluid): Separable filter: performance test
* GAPI (fluid): enable all performance tests
* GAPI: separable filters: alternative code for Sobel
* GAPI (fluid): hide unused old code for Sobel filter
* GAPI (fluid): especial code for Sobel if U8 into S16
* GAPI (fluid): back to old code for Sobel
* GAPI (fluid): run_sepfilter3x3_impl() with CPU dispatcher
* GAPI (fluid): run_sepfilter3x3_impl(): fix compiler warnings
* GAPI (fluid): new engine for separable filters (but Sobel)
* GAPI (fluid): new performance engine for Sobel
* GAPI (fluid): Sepfilters performance: fixed compilation error
2025-08-06 14:36:36 +08:00 · 2018-11-26 15:05:35 +03:00 · 2018-11-26 15:05:35 +03:00 · f07856eab9
commit f07856eab9
parent dd952f6d68
6 changed files with 902 additions and 362 deletions
--- a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
@ -52,7 +52,7 @@ PERF_TEST_P_(SepFilterPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-      c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+      c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -100,7 +100,7 @@ PERF_TEST_P_(Filter2DPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -145,7 +145,7 @@ PERF_TEST_P_(BoxFilterPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -188,7 +188,7 @@ PERF_TEST_P_(BlurPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -230,7 +230,7 @@ PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -271,7 +271,7 @@ PERF_TEST_P_(MedianBlurPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -314,7 +314,7 @@ PERF_TEST_P_(ErodePerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -357,7 +357,7 @@ PERF_TEST_P_(Erode3x3PerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -400,7 +400,7 @@ PERF_TEST_P_(DilatePerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -443,7 +443,7 @@ PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -526,7 +526,7 @@ PERF_TEST_P_(CannyPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -564,7 +564,7 @@ PERF_TEST_P_(EqHistPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
@ -830,7 +830,7 @@ PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance)

    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }

    // Comparison //////////////////////////////////////////////////////////////
--- a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
@ -13,9 +13,101 @@
 namespace opencv_test
 {

-    INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
-        Combine(Values(AbsExact().to_compare_f()),
-            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),  // add CV_32FC1 when ready
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_8U, SepFilterPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3),
+            Values(3),
+            Values(szVGA, sz720p, sz1080p),
+            Values(-1, CV_16S, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_other, SepFilterPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),
+            Values(szVGA, sz720p, sz1080p),
+            Values(-1, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Filter2DPerfTestFluid, Filter2DPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add 4, 5, 7 when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::BORDER_DEFAULT),
+            Values(-1, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestFluid, BoxFilterPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::BORDER_DEFAULT),
+            Values(-1, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BlurPerfTestFluid, BlurPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::BORDER_DEFAULT),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestFluid, GaussianBlurPerfTest,
+    Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestFluid, MedianBlurPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(ErodePerfTestFluid, ErodePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::MorphShapes::MORPH_RECT,
+                   cv::MorphShapes::MORPH_CROSS,
+                   cv::MorphShapes::MORPH_ELLIPSE),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+// GAPI/fluid does not support iterations parameter for the Erode kernel
+INSTANTIATE_TEST_CASE_P(DISABLED_Erode3x3PerfTestFluid, Erode3x3PerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(szVGA, sz720p, sz1080p),
+            Values(1, 2, 4),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(DilatePerfTestFluid, DilatePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::MorphShapes::MORPH_RECT,
+                   cv::MorphShapes::MORPH_CROSS,
+                   cv::MorphShapes::MORPH_ELLIPSE),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+// GAPI/fluid does not support iterations parameter for the Dilate kernel
+INSTANTIATE_TEST_CASE_P(DISABLED_Dilate3x3PerfTestFluid, Dilate3x3PerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(szVGA, sz720p, sz1080p),
+            Values(1, 2, 4),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
            Values(3),                                     // add 5x5 once supported
            Values(szVGA, sz720p, sz1080p),
            Values(-1, CV_16S, CV_32F),
@ -23,8 +115,8 @@ namespace opencv_test
            Values(1, 2),
            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
-        Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
+INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
+    Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
            Values(CV_32FC1),
            Values(3),                                     // add 5x5 once supported
            Values(szVGA, sz720p, sz1080p),
@ -33,44 +125,44 @@ namespace opencv_test
            Values(1, 2),
            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
-        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
+    Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

-    INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
-        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
+    Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));

 }
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@ -344,7 +344,7 @@ static const int maxKernelSize = 9;

 template<typename DST, typename SRC>
 static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize,
-                          const cv::Point& /* anchor */, bool normalize)
+                          const cv::Point& /* anchor */, bool normalize, float *buf[])
 {
    GAPI_Assert(kernelSize.width <= maxKernelSize);
    GAPI_Assert(kernelSize.width == kernelSize.height);
@ -365,36 +365,53 @@ static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSi
    int width = dst.length();
    int chan  = dst.meta().chan;

-    GAPI_DbgAssert(chan <= 4);
-
-    for (int w=0; w < width; w++)
+    if (kernelSize.width == 3 && kernelSize.height == 3)
    {
-        float sum[4] = {0, 0, 0, 0};
+        int y  = dst.y();
+        int y0 = dst.priv().writeStart();

-        for (int i=0; i < kernel; i++)
+        float  kx[3] = {1, 1, 1};
+        float *ky = kx;
+
+        float scale=1, delta=0;
+        if (normalize)
+            scale = 1/9.f;
+
+        run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    } else
+    {
+        GAPI_DbgAssert(chan <= 4);
+
+        for (int w=0; w < width; w++)
        {
-            for (int j=0; j < kernel; j++)
+            float sum[4] = {0, 0, 0, 0};
+
+            for (int i=0; i < kernel; i++)
            {
-                for (int c=0; c < chan; c++)
-                    sum[c] += in[i][(w + j - border)*chan + c];
+                for (int j=0; j < kernel; j++)
+                {
+                    for (int c=0; c < chan; c++)
+                        sum[c] += in[i][(w + j - border)*chan + c];
+                }
            }
-        }

-        for (int c=0; c < chan; c++)
-        {
-            float result = normalize? sum[c]/(kernel * kernel) : sum[c];
+            for (int c=0; c < chan; c++)
+            {
+                float result = normalize? sum[c]/(kernel * kernel) : sum[c];

-            out[w*chan + c] = saturate<DST>(result, rintf);
+                out[w*chan + c] = saturate<DST>(result, rintf);
+            }
        }
    }
 }

-GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
+GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, true)
 {
    static const int Window = 3;

    static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor,
-                    int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst)
+                    int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst,
+                    Buffer& scratch)
    {
        // TODO: support sizes 3, 5, 7, 9, ...
        GAPI_Assert(kernelSize.width  == 3 && kernelSize.height == 3);
@ -404,14 +421,46 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)

        static const bool normalize = true;

+        int width = src.length();
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = scratch.OutLine<float>();
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
        //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float,  float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }

+    static void initScratch(const GMatDesc   & in,
+                            const cv::Size   & /* ksize */,
+                            const cv::Point  & /* anchor */,
+                                  int          /* borderType */,
+                            const cv::Scalar & /* borderValue */,
+                                  Buffer     & scratch)
+    {
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
    static Border getBorder(const cv::GMatDesc& /* src */,
                            const cv::Size    & /* kernelSize */,
                            const cv::Point   & /* anchor */,
@ -422,18 +471,19 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
    }
 };

-GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
+GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, true)
 {
    static const int Window = 3;

    static void run(const     View  &    src,
                              int     /* ddepth */,
                    const cv::Size  &    kernelSize,
-                    const cv::Point &   anchor,
+                    const cv::Point &    anchor,
                              bool       normalize,
                              int     /* borderType */,
                    const cv::Scalar& /* borderValue */,
-                              Buffer&    dst)
+                              Buffer&    dst,
+                              Buffer&    scratch)
    {
        // TODO: support sizes 3, 5, 7, 9, ...
        GAPI_Assert(kernelSize.width  == 3 && kernelSize.height == 3);
@ -441,17 +491,51 @@ GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
        // TODO: suport non-trivial anchor
        GAPI_Assert(anchor.x == -1 && anchor.y == -1);

+        int width = src.length();
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = scratch.OutLine<float>();
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
        //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( float,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float,  float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }

+    static void initScratch(const GMatDesc  & in,
+                                      int     /* ddepth */,
+                            const cv::Size  & /* kernelSize */,
+                            const cv::Point & /* anchor */,
+                                      bool    /*  normalize */,
+                                      int     /* borderType */,
+                            const cv::Scalar& /* borderValue */,
+                                  Buffer    &  scratch)
+    {
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
    static Border getBorder(const cv::GMatDesc& /* src */,
                                      int       /* ddepth */,
                            const cv::Size    & /* kernelSize */,
@ -510,18 +594,21 @@ static void run_sepfilter(Buffer& dst, const View& src,
                          const float kx[], int kxLen,
                          const float ky[], int kyLen,
                          const cv::Point& /* anchor */,
-                          float delta=0)
+                          float scale, float delta,
+                          float *buf[])
 {
-    static const int maxLines = 9;
-    GAPI_Assert(kyLen <= maxLines);
+    constexpr int kMax = 11;
+    GAPI_Assert(kxLen <= kMax && kyLen <= kMax);

-    const SRC *in[ maxLines ];
+    const SRC *in[kMax];
          DST *out;

-    int border = (kyLen - 1) / 2;
+    int xborder = (kxLen - 1) / 2;
+    int yborder = (kyLen - 1) / 2;
+
    for (int i=0; i < kyLen; i++)
    {
-        in[i] = src.InLine<SRC>(i - border);
+        in[i] = src.InLine<SRC>(i - yborder);
    }

    out = dst.OutLine<DST>();
@ -529,28 +616,52 @@ static void run_sepfilter(Buffer& dst, const View& src,
    int width = dst.length();
    int chan  = dst.meta().chan;

-    for (int w=0; w < width; w++)
+    // optimized 3x3 vs reference
+    if (kxLen == 3 && kyLen == 3)
    {
-        // TODO: make this cycle innermost
-        for (int c=0; c < chan; c++)
+        int y  = dst.y();
+        int y0 = dst.priv().writeStart();
+
+        int border = xborder;
+        run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    }
+    else
+    {
+        int length = chan * width;
+        int xshift = chan * xborder;
+
+        // horizontal pass
+
+        for (int k=0; k < kyLen; k++)
        {
-            float sum=0;
+            const SRC *inp[kMax] = {nullptr};

-            for (int i=0; i < kyLen; i++)
+            for (int j=0; j < kxLen; j++)
            {
-                float sumi=0;
-
-                for (int j=0; j < kxLen; j++)
-                {
-                    sumi += in[i][(w + j - border)*chan + c] * kx[j];
-                }
-
-                sum += sumi * ky[i];
+                inp[j] = in[k] + (j - xborder)*xshift;
            }

-            float result = sum + delta;
+            for (int l=0; l < length; l++)
+            {
+                float sum = 0;
+                for (int j=0; j < kxLen; j++)
+                {
+                    sum += inp[j][l] * kx[j];
+                }
+                buf[k][l] = sum;
+            }
+        }

-            out[w*chan + c] = saturate<DST>(result, rintf);
+        // vertical pass
+
+        for (int l=0; l < length; l++)
+        {
+            float sum = 0;
+            for (int k=0; k < kyLen; k++)
+            {
+                sum += buf[k][l] * ky[k];
+            }
+            out[l] = saturate<DST>(sum*scale + delta, rintf);
        }
    }
 }
@ -580,21 +691,37 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
        int kxLen = kernX.rows * kernX.cols;
        int kyLen = kernY.rows * kernY.cols;

+        GAPI_Assert(kyLen == 3);
+
        float *kx = scratch.OutLine<float>();
        float *ky = kx + kxLen;

+        int width = src.meta().size.width;
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = ky + kyLen;
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
+        float scale = 1;
        float delta = static_cast<float>(delta_[0]);

        //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
-        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
-        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
-        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
+        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( short, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }

-    static void initScratch(const GMatDesc& /* in */,
+    static void initScratch(const GMatDesc&    in,
                                  int       /* ddepth */,
                            const Mat     &    kernX,
                            const Mat     &    kernY,
@ -607,7 +734,13 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
        int kxLen = kernX.rows * kernX.cols;
        int kyLen = kernY.rows * kernY.cols;

-        cv::gapi::own::Size bufsize(kxLen + kyLen, 1);
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = kxLen + kyLen +         // x, y kernels
+                     width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
        GMatDesc bufdesc = {CV_32F, 1, bufsize};
        Buffer buffer(bufdesc);
        scratch = std::move(buffer);
@ -664,29 +797,47 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
        auto *kx = scratch.OutLine<float>(); // cached kernX data
        auto *ky = kx + kxsize;              // cached kernY data

+        int width = src.meta().size.width;
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = ky + kysize;
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
        auto  anchor = cv::Point(-1, -1);
-        float delta = 0.f;
+
+        float scale = 1;
+        float delta = 0;

        //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
-        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
-        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
+        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
+        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
+        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
+        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }

-    static void initScratch(const GMatDesc& /* in */,
+    static void initScratch(const GMatDesc&    in,
                            const cv::Size &   ksize,
                                  double       sigmaX,
                                  double       sigmaY,
-                                  int       /* borderType */,
-                            const cv::Scalar  & /* borderValue */,
+                                  int          /* borderType */,
+                            const cv::Scalar & /* borderValue */,
                                  Buffer  &    scratch)
    {
        int kxsize = ksize.width;
        int kysize = ksize.height;

-        cv::gapi::own::Size bufsize(kxsize + kysize, 1);
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = kxsize + kysize +       // x, y kernels
+                     width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
        GMatDesc bufdesc = {CV_32F, 1, bufsize};
        Buffer buffer(bufdesc);
        scratch = std::move(buffer);
@ -767,7 +918,7 @@ static void run_sobel(Buffer& dst,
    int y0 = dst.priv().writeStart();
 //  int y1 = dst.priv().writeEnd();

-    run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
 }

 GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true)
@ -1102,6 +1253,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
+        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
@ -1109,7 +1261,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
    static void initScratch(const GMatDesc& /* in */,
                            const Mat     &    kernel,
                            const Point   & /* anchor */,
-                              int           /* iterations */,
+                                  int       /* iterations */,
                                  int       /* borderType */,
                            const cv::Scalar  & /* borderValue */,
                                  Buffer  &    scratch)
@ -1179,6 +1331,7 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
+        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
@ -1290,6 +1443,7 @@ GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false)
        UNARY_(uchar , uchar , run_medianblur, dst, src, ksize);
        UNARY_(ushort, ushort, run_medianblur, dst, src, ksize);
        UNARY_( short,  short, run_medianblur, dst, src, ksize);
+        UNARY_( float,  float, run_medianblur, dst, src, ksize);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@ -57,34 +57,34 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
    CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
 }

-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
+//-------------------------

-#define RUN_SOBEL_ROW(DST, SRC)                                          \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan,      \
-                   const float kx[], const float ky[], int border,       \
-                   float scale, float delta, float *buf[],               \
-                   int y, int y0)                                        \
-{                                                                        \
-    CV_CPU_DISPATCH(run_sobel_row,                                       \
-        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
-        CV_CPU_DISPATCH_MODES_ALL);                                      \
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0)                     \
+{                                                                           \
+    CV_CPU_DISPATCH(run_sepfilter3x3_impl,                                  \
+        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0),    \
+        CV_CPU_DISPATCH_MODES_ALL);                                         \
 }

-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )  // arguments are (DST, SRC): uchar  input -> uchar  output
+RUN_SEPFILTER3X3_IMPL( short, uchar )  // uchar  input -> short  output
+RUN_SEPFILTER3X3_IMPL( float, uchar )  // uchar  input -> float  output
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)  // ushort input -> ushort output
+RUN_SEPFILTER3X3_IMPL( short, ushort)  // ushort input -> short  output
+RUN_SEPFILTER3X3_IMPL( float, ushort)  // ushort input -> float  output
+RUN_SEPFILTER3X3_IMPL( short,  short)  // short  input -> short  output
+RUN_SEPFILTER3X3_IMPL( float,  short)  // short  input -> float  output
+RUN_SEPFILTER3X3_IMPL( float,  float)  // float  input -> float  output

-#undef RUN_SOBEL_ROW
+#undef RUN_SEPFILTER3X3_IMPL  // the generator macro is not needed past this point

 } // namespace fliud
 } // namespace gapi
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@ -33,29 +33,29 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef

 void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);

-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
+//-------------------------

-#define RUN_SOBEL_ROW(DST, SRC)                                     \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
-                   const float kx[], const float ky[], int border,  \
-                   float scale, float delta, float *buf[],          \
-                   int y, int y0);
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);

-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )  // each line declares one overload; arguments are (DST, SRC)
+RUN_SEPFILTER3X3_IMPL( short, uchar )  // uchar  input -> short  output
+RUN_SEPFILTER3X3_IMPL( float, uchar )  // uchar  input -> float  output
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)  // ushort input -> ushort output
+RUN_SEPFILTER3X3_IMPL( short, ushort)  // ushort input -> short  output
+RUN_SEPFILTER3X3_IMPL( float, ushort)  // ushort input -> float  output
+RUN_SEPFILTER3X3_IMPL( short,  short)  // short  input -> short  output
+RUN_SEPFILTER3X3_IMPL( float,  short)  // short  input -> float  output
+RUN_SEPFILTER3X3_IMPL( float,  float)  // float  input -> float  output

-#undef RUN_SOBEL_ROW
+#undef RUN_SEPFILTER3X3_IMPL  // the generator macro is not needed past this point

 }  // namespace fluid
 }  // namespace gapi
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@ -9,6 +9,8 @@

 #if !defined(GAPI_STANDALONE)

+#include "gfluidimgproc_func.hpp"
+
 #include "opencv2/gapi/own/saturate.hpp"

 #include "opencv2/core.hpp"
@ -16,6 +18,8 @@

 #include <cstdint>

+#include <vector>
+
 #ifdef __GNUC__
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
@ -48,34 +52,66 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef

 void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);

-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
+//-------------------------

-#define RUN_SOBEL_ROW(DST, SRC)                                     \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
-                  const float kx[], const float ky[], int border,   \
-                  float scale, float delta, float *buf[],           \
-                  int y, int y0);
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);

-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )  // each line declares one overload; arguments are (DST, SRC)
+RUN_SEPFILTER3X3_IMPL( short, uchar )  // uchar  input -> short  output
+RUN_SEPFILTER3X3_IMPL( float, uchar )  // uchar  input -> float  output
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)  // ushort input -> ushort output
+RUN_SEPFILTER3X3_IMPL( short, ushort)  // ushort input -> short  output
+RUN_SEPFILTER3X3_IMPL( float, ushort)  // ushort input -> float  output
+RUN_SEPFILTER3X3_IMPL( short,  short)  // short  input -> short  output
+RUN_SEPFILTER3X3_IMPL( float,  short)  // short  input -> float  output
+RUN_SEPFILTER3X3_IMPL( float,  float)  // float  input -> float  output

-#undef RUN_SOBEL_ROW
+#undef RUN_SEPFILTER3X3_IMPL  // the generator macro is not needed past this point

 //----------------------------------------------------------------------

 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

+#if CV_SIMD  // vx_load_f32(): load one v_float32 vector of SRC elements starting at ptr, converted to float
+template<typename SRC>
+static inline v_float32 vx_load_f32(const SRC* ptr)  // the type tests are ordinary 'if's, so all branches compile for every SRC; the casts below make that possible
+{
+    if (std::is_same<SRC,uchar>::value)  // uchar: widen to unsigned 32-bit lanes, reinterpret as signed, convert to float
+    {
+        v_uint32 tmp = vx_load_expand_q(reinterpret_cast<const uchar*>(ptr));
+        return v_cvt_f32(v_reinterpret_as_s32(tmp));
+    }
+    // ushort: widen to unsigned 32-bit lanes, reinterpret as signed, convert to float
+    if (std::is_same<SRC,ushort>::value)
+    {
+        v_uint32 tmp = vx_load_expand(reinterpret_cast<const ushort*>(ptr));
+        return v_cvt_f32(v_reinterpret_as_s32(tmp));
+    }
+    // short: widen to signed 32-bit lanes, convert to float
+    if (std::is_same<SRC,short>::value)
+    {
+        v_int32 tmp = vx_load_expand(reinterpret_cast<const short*>(ptr));
+        return v_cvt_f32(tmp);
+    }
+    // float: loaded without conversion
+    if (std::is_same<SRC,float>::value)
+    {
+        v_float32 tmp = vx_load(reinterpret_cast<const float*>(ptr));
+        return tmp;
+    }
+    // reached only when SRC is none of uchar, ushort, short, float
+    CV_Error(cv::Error::StsBadArg, "unsupported type");
+}
+#endif  // CV_SIMD
+
 //----------------------------------
 //
 // Fluid kernels: RGB2Gray, BGR2Gray
@ -309,187 +345,359 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
    }
 }

-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
-
-// Sobel 3x3: vertical pass
-template<bool noscale, typename DST>
-static void run_sobel3x3_vert(DST out[], int length, const float ky[],
-                float scale, float delta, const int r[], float *buf[])
-{
-    float ky0 = ky[0],
-          ky1 = ky[1],
-          ky2 = ky[2];
-
-    int r0 = r[0],
-        r1 = r[1],
-        r2 = r[2];
+//-------------------------

 #if CV_SIMD
-    // for floating-point output,
-    // manual vectoring may be not better than compiler's optimization
-#define EXPLICIT_SIMD_32F 0  // 1=vectorize 32f case explicitly, 0=don't
-#if     EXPLICIT_SIMD_32F
-    if (std::is_same<DST, float>::value && length >= v_int16::nlanes)
+// 3x3 separable filter from any SRC to float: horizontal (kx) and vertical (ky) sums are fused per vector, so buf[] is not used; per the author this appears 15% faster than reference any-2-float code below
+template<bool noscale, typename SRC>
+static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta)
+{
+    const int length = width * chan;
+    const int shift = border * chan;
+    // length: number of output elements in the row; shift: element offset of the left/right taps
+    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
+    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
+    // the outer loop runs the vector loop, then once more after 'l' is rewound for the tail
+    for (int l=0; l < length; )
     {
-        constexpr static int nlanes = v_float32::nlanes;
+        static const int nlanes = v_float32::nlanes;
 
-        for (int l=0; l < length; )
+        // main part: v_float32::nlanes outputs per iteration
+        for ( ; l <= length - nlanes; l += nlanes)
         {
-            for (; l <= length - nlanes; l += nlanes)
+            auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[])  // horizontal 3-tap sum of one input row at position l
             {
-                v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
-                    sum = v_fma(vx_load(&buf[r1][l]),  vx_setall_f32(ky1), sum);
-                    sum = v_fma(vx_load(&buf[r2][l]),  vx_setall_f32(ky2), sum);
+                v_float32 t0 = vx_load_f32(&i[l - shift]);
+                v_float32 t1 = vx_load_f32(&i[l        ]);
+                v_float32 t2 = vx_load_f32(&i[l + shift]);
+                v_float32 t = t0 * vx_setall_f32(kx0);
+                    t = v_fma(t1,  vx_setall_f32(kx1), t);
+                    t = v_fma(t2,  vx_setall_f32(kx2), t);
+                return t;
+            };
+            // horizontal sums of the three input rows, combined vertically with ky0..ky2
-                if (!noscale)
-                {
-                    sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
-                }
+            v_float32 s0 = xsum(in[0]);
+            v_float32 s1 = xsum(in[1]);
+            v_float32 s2 = xsum(in[2]);
+            v_float32 s = s0 * vx_setall_f32(ky0);
+                s = v_fma(s1,  vx_setall_f32(ky1), s);
+                s = v_fma(s2,  vx_setall_f32(ky2), s);
+            // scaling is compiled in only for the noscale == false instantiation: s*scale + delta
-                v_store(reinterpret_cast<float*>(&out[l]), sum);
+            if (!noscale)
+            {
+                s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta));
             }
 
-            if (l < length)
-            {
-                // tail: recalculate last pixels
-                GAPI_DbgAssert(length >= nlanes);
-                l = length - nlanes;
-            }
+            v_store(&out[l], s);
         }
 
-        return;
-    }
-#endif
-
-    if ((std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
-        && length >= v_int16::nlanes)
-    {
-        constexpr static int nlanes = v_int16::nlanes;
-
-        for (int l=0; l < length; )
+        // tail (if any): rewind so that one more full vector ends exactly at length (overlapping outputs are recomputed)
+        if (l < length)
         {
-            for (; l <= length - nlanes; l += nlanes)
-            {
-                v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
-                    sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
-                    sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
-
-                v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
-                    sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
-                    sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
-
-                if (!noscale)
-                {
-                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
-                }
-
-                v_int32 isum0 = v_round(sum0),
-                        isum1 = v_round(sum1);
-
-                if (std::is_same<DST, short>::value)
-                {
-                    // signed short
-                    v_int16 res = v_pack(isum0, isum1);
-                    v_store(reinterpret_cast<short*>(&out[l]), res);
-                } else
-                {
-                    // unsigned short
-                    v_uint16 res = v_pack_u(isum0, isum1);
-                    v_store(reinterpret_cast<ushort*>(&out[l]), res);
-                }
-            }
-
-            if (l < length)
-            {
-                // tail: recalculate last pixels
-                GAPI_DbgAssert(length >= nlanes);
-                l = length - nlanes;
-            }
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
         }
-
-        return;
-    }
-
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
-    {
-        constexpr static int nlanes = v_uint8::nlanes;
-
-        for (int l=0; l < length; )
-        {
-            for (; l <= length - nlanes; l += nlanes)
-            {
-                v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
-                    sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
-                    sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
-
-                v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
-                    sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
-                    sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
-
-                v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
-                    sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
-                    sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
-
-                v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
-                    sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
-                    sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
-
-                if (!noscale)
-                {
-                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
-                }
-
-                v_int32 isum0 = v_round(sum0),
-                        isum1 = v_round(sum1),
-                        isum2 = v_round(sum2),
-                        isum3 = v_round(sum3);
-
-                v_int16 ires0 = v_pack(isum0, isum1),
-                        ires1 = v_pack(isum2, isum3);
-
-                v_uint8 res = v_pack_u(ires0, ires1);
-                v_store(reinterpret_cast<uchar*>(&out[l]), res);
-            }
-
-            if (l < length)
-            {
-                // tail: recalculate last pixels
-                GAPI_DbgAssert(length >= nlanes);
-                l = length - nlanes;
-            }
-        }
-
-        return;
-    }
-#endif
-
-    // reference code
-    for (int l=0; l < length; l++)
-    {
-        float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2;
-
-        if (!noscale)
-        {
-            sum = sum*scale + delta;
-        }
-
-        out[l] = cv::gapi::own::saturate<DST>(sum, rintf);
     }
 }

-template<typename DST, typename SRC>
-static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
-                           const float kx[], const float ky[], int border,
-                           float scale, float delta, float *buf[],
-                           int y, int y0)
+// 3x3 separable filter with short/ushort output: scalar horizontal pass (kx) into three float rows of buf[], then a vertical pass (ky)
+// with manually vectored rounding to short/ushort; per the author this appears 10-40x faster than reference code below
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
+{
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+    // length: number of output elements in the row; shift: element offset of the left/right taps
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
+    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
+
+    // horizontal pass: all three input rows when y == y0, otherwise only in[2] --
+    // the other two rows of buf[] are taken as already filled by earlier calls
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        //                      previous , this , next pixel
+        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
+
+        // rely on compiler vectoring
+        for (int l=0; l < length; l++)
+        {
+            buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
+        }
+    }
+
+    // vertical pass: weighted sum of the three buf[] rows, optional scale/delta, round, pack to 16 bit
+
+    const int r0=r[0], r1=r[1], r2=r[2];
+    // the outer loop runs the vector loop, then once more after 'l' is rewound for the tail
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = v_int16::nlanes;
+
+        // main part of row: v_int16::nlanes outputs per iteration, computed as two v_float32 halves
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
+                sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
+                sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
+
+            v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
+                sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
+                sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
+
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1);
+            // DST is tested with an ordinary 'if', hence the reinterpret_casts so both branches compile
+            if (std::is_same<DST, short>::value)
+            {
+                // signed short
+                v_int16 res = v_pack(isum0, isum1);
+                v_store(reinterpret_cast<short*>(&out[l]), res);
+            } else
+            {
+                // unsigned short
+                v_uint16 res = v_pack_u(isum0, isum1);
+                v_store(reinterpret_cast<ushort*>(&out[l]), res);
+            }
+        }
+
+        // tail (if any): rewind so that one more full vector ends exactly at length (overlapping outputs are recomputed)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+}
+
+// 3x3 separable filter with uchar output: scalar horizontal pass into buf[], then a vertical pass with manually vectored rounding to uchar; per the author it is 10-40x faster than reference
+template<bool noscale, typename SRC>
+static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
+                                      const float kx[], const float ky[], int border,
+                                      float scale, float delta,
+                                      float *buf[], int y, int y0)
+{
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+    // length: number of output elements in the row; shift: element offset of the left/right taps
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
+    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
+
+    // horizontal pass: all three input rows when y == y0, otherwise only in[2] --
+    // the other two rows of buf[] are taken as already filled by earlier calls
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        //                      previous , this , next pixel
+        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
+
+        // rely on compiler vectoring
+        for (int l=0; l < length; l++)
+        {
+            buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
+        }
+    }
+
+    // vertical pass: weighted sum of the three buf[] rows, optional scale/delta, round, pack down to 8 bit
+
+    const int r0=r[0], r1=r[1], r2=r[2];
+    // the outer loop runs the vector loop, then once more after 'l' is rewound for the tail
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = v_uint8::nlanes;
+
+        // main part of row: v_uint8::nlanes outputs per iteration, computed as four v_float32 quarters
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
+                sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
+                sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
+
+            v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
+                sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
+                sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
+
+            v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
+                sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
+                sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
+
+            v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
+                sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
+                sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
+
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1),
+                    isum2 = v_round(sum2),
+                    isum3 = v_round(sum3);
+            // narrow in two steps: 32 -> 16 bit (signed), then 16 -> 8 bit (unsigned)
+            v_int16 ires0 = v_pack(isum0, isum1),
+                    ires1 = v_pack(isum2, isum3);
+
+            v_uint8 res = v_pack_u(ires0, ires1);
+            v_store(reinterpret_cast<uchar*>(&out[l]), res);
+        }
+
+        // tail (if any): rewind so that one more full vector ends exactly at length (overlapping outputs are recomputed)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+}
+
+// uchar-to-short variant manually vectorized in int16; per the author it is not much faster than the generic any-to-short code above
+#define USE_SEPFILTER3X3_CHAR2SHORT 1  // 1 = compile the int16 variant below, 0 = leave it out
+// Falls back to run_sepfilter3x3_any2short() unless the kernel taps fit schar exactly, delta fits short exactly, and 0.01 <= |scale| <= 1
+#if USE_SEPFILTER3X3_CHAR2SHORT
+template<bool noscale>
+static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan,
+                                        const float kx[], const float ky[], int border,
+                                        float scale, float delta,
+                                        float *buf[], int y, int y0)
+{
+    const schar ikx0 = saturate<schar>(kx[0], rintf);
+    const schar ikx1 = saturate<schar>(kx[1], rintf);
+    const schar ikx2 = saturate<schar>(kx[2], rintf);
+
+    const schar iky0 = saturate<schar>(ky[0], rintf);
+    const schar iky1 = saturate<schar>(ky[1], rintf);
+    const schar iky2 = saturate<schar>(ky[2], rintf);
+    // scale is turned into a 16-bit fixed-point factor (scale * 2^15); delta is rounded to short
+    const short iscale = saturate<short>(scale * (1 << 15), rintf);
+    const short idelta = saturate<short>(delta            , rintf);
+
+    // check if this code is applicable: taps and delta must be unchanged by the rounding above, |scale| within [0.01, 1]
+    if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] ||
+        iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] ||
+        idelta != delta ||
+        std::abs(scale) > 1 || std::abs(scale) < 0.01)
+    {
+        run_sepfilter3x3_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+    // the float scratch rows are reused as storage for the int16 horizontal sums
+    short *ibuf[3];
+    ibuf[0] = reinterpret_cast<short*>(buf[0]);
+    ibuf[1] = reinterpret_cast<short*>(buf[1]);
+    ibuf[2] = reinterpret_cast<short*>(buf[2]);
+
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+    // length: number of output elements in the row; shift: element offset of the left/right taps
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    // horizontal pass: all three input rows when y == y0, otherwise only in[2] --
+    // the other two rows of ibuf[] are taken as already filled by earlier calls
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        for (int l=0; l < length;)
+        {
+            constexpr int nlanes = v_int16::nlanes;
+
+            // main part of output row
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_uint16 t0 = vx_load_expand(&in[k][l - shift]);  // previous
+                v_uint16 t1 = vx_load_expand(&in[k][l        ]);  // current
+                v_uint16 t2 = vx_load_expand(&in[k][l + shift]);  // next pixel
+                v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
+                            v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
+                            v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
+                v_store(&ibuf[r[k]][l], t);
+            }
+
+            // tail (if any): rewind so that one more full vector ends exactly at length (overlapping sums are recomputed)
+            if (l < length)
+            {
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+    }
+
+    // vertical pass: int16 weighted sum of the three ibuf[] rows, then optional fixed-point scale and delta
+
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = v_int16::nlanes;
+
+        // main part of output row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_int16 s0 = vx_load(&ibuf[r[0]][l]);  // previous
+            v_int16 s1 = vx_load(&ibuf[r[1]][l]);  // current
+            v_int16 s2 = vx_load(&ibuf[r[2]][l]);  // next row
+            v_int16 s = s0 * vx_setall_s16(iky0) +
+                        s1 * vx_setall_s16(iky1) +
+                        s2 * vx_setall_s16(iky2);
+            // NOTE(review): v_mul_hi(s << 1, iscale) presumably yields (s * iscale) >> 15, i.e. s*scale truncated rather than rounded -- confirm
+            if (!noscale)
+            {
+                s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+            }
+
+            v_store(&out[l], s);
+        }
+
+        // tail (if any): rewind so that one more full vector ends exactly at length (overlapping outputs are recomputed)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+}
+#endif
+
+#endif  // CV_SIMD
+
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
 {
    int r[3];
    r[0] = (y - y0)     % 3;  // buf[r[0]]: previous
@ -497,19 +705,21 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
    r[2] = (y - y0 + 2) % 3;  //            next row

    int length = width * chan;
+    int shift = border * chan;

    // horizontal pass

    // full horizontal pass is needed only if very 1st row in ROI;
    // for 2nd and further rows, it is enough to convolve only the
    // "next" row - as we can reuse buffers from previous calls to
-    // this kernel (note that Fluid processes rows consequently)
+    // this kernel (Fluid does rows consecutively: y=y0, y0+1, ...)
+
    int k0 = (y == y0)? 0: 2;

    for (int k = k0; k < 3; k++)
    {
-        //                             previous, this , next pixel
-        const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan};
+        //                      previous , this , next pixel
+        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};

        // rely on compiler vectoring
        for (int l=0; l < length; l++)
@ -519,37 +729,121 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
    }

    // vertical pass
-    if (scale == 1 && delta == 0)
+
+    for (int l=0; l < length; l++)
    {
-        constexpr static bool noscale = true;  // omit scaling
-        run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
-    } else
-    {
-        constexpr static bool noscale = false;  // do scaling
-        run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
+        float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2];
+
+        if (!noscale)
+        {
+            sum = sum*scale + delta;
+        }
+
+        out[l] = saturate<DST>(sum, rintf);
    }
 }

-#define RUN_SOBEL_ROW(DST, SRC)                                                    \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan,                \
-                   const float kx[], const float ky[], int border,                 \
-                   float scale, float delta, float *buf[],                         \
-                   int y, int y0)                                                  \
-{                                                                                  \
-    run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \
+template<bool noscale, typename DST, typename SRC>  // noscale is passed through to whichever implementation is chosen
+static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan,
+                                  const float kx[], const float ky[], int border,
+                                  float scale, float delta,
+                                  float *buf[], int y, int y0)
+{  // dispatcher: tries the SIMD variants in order (when CV_SIMD), otherwise falls back to the reference code
+#if CV_SIMD
+    int length = width * chan;  // compared against the vector lane counts below
+
+    // length variable may be unused if types do not match at 'if' statements below
+    (void) length;
+
+#if USE_SEPFILTER3X3_CHAR2SHORT
+    if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&  // uchar -> short special case
+        length >= v_int16::nlanes)  // taken only if length is at least v_int16::nlanes
+    {
+        // only slightly faster than more generic any-to-short (see below)
+        run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),  // casts are identities when this branch runs;
+                                             reinterpret_cast<const uchar**>(in),  // they let other DST/SRC instantiations compile
+                                             width, chan, kx, ky, border, scale, delta,
+                                             buf, y, y0);
+        return;
+    }
+#endif
+
+    if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&  // NOTE(review): helper is named any2float, yet this guard also requires SRC == float - confirm intent
+        length >= v_float32::nlanes)
+    {
+        // appears 15% faster than reference any-to-float code (called below)
+        run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
+                                            width, chan, kx, ky, border, scale, delta);  // unlike the other variants, not given buf, y, y0
+        return;
+    }
+
+    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)  // any SRC, short output
+    {
+        // appears 10-40x faster than reference due to much faster rounding
+        run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
+                                            width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)  // any SRC, ushort output; same helper as for short
+    {
+        // appears 10-40x faster than reference due to much faster rounding
+        run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
+                                            width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)  // any SRC, uchar output
+    {
+        // appears 10-40x faster than reference due to much faster rounding
+        run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
+                                           width, chan, kx, ky, border, scale, delta,
+                                           buf, y, y0);
+        return;
+    }
+#endif  // CV_SIMD
+
+    // reference code is quite fast for any-to-float case,
+    // but not for any-to-integral due to very slow rounding
+    run_sepfilter3x3_reference<noscale>(out, in, width, chan, kx, ky, border,
+                                        scale, delta, buf, y, y0);  // also reached when CV_SIMD is off or no guard above matched
 }

-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)  /* one overload per DST/SRC pair */ \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
+                           const float kx[], const float ky[], int border,   \
+                           float scale, float delta,                         \
+                           float *buf[], int y, int y0)                      \
+{                                                                            \
+    if (scale == 1 && delta == 0)  /* scaling would be a no-op */            \
+    {                                                                        \
+        constexpr bool noscale = true;                                       \
+        run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        constexpr bool noscale = false;                                      \
+        run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+}

-#undef RUN_SOBEL_ROW
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )  // macro arguments are (DST, SRC)
+RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)
+RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL( short,  short)
+RUN_SEPFILTER3X3_IMPL( float,  short)
+RUN_SEPFILTER3X3_IMPL( float,  float)
+
+#undef RUN_SEPFILTER3X3_IMPL  // helper macro is only needed for the overloads above
+
+//------------------------------------------------------------------------------

 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY