Merge pull request #15852 from akhakim:gauss_blur_kernel_5x5

2025-06-08 01:53:19 +08:00 · 2019-11-14 14:55:24 +00:00 · 2019-11-14 14:55:24 +00:00 · ac2dc29525
commit ac2dc29525
parent d1c4e4b5a5 beb14c70da
4 changed files with 640 additions and 14 deletions
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@ -599,6 +599,7 @@ static void run_sepfilter(Buffer& dst, const View& src,
 {
    constexpr int kMax = 11;
    GAPI_Assert(kxLen <= kMax && kyLen <= kMax);
    GAPI_Assert(kxLen == kyLen);
    const SRC *in[kMax];
          DST *out;
@ -625,6 +626,13 @@ static void run_sepfilter(Buffer& dst, const View& src,
        int border = xborder;
        run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
    }
    else if (kxLen == 5 && kyLen == 5)
    {
        int y = dst.y();
        int y0 = dst.priv().writeStart();
        run_sepfilter5x5_impl(out, in, width, chan, kx, ky, xborder, scale, delta, buf, y, y0);
    }
    else
    {
        int length = chan * width;
@ -788,7 +796,9 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
                              Buffer&    dst,
                              Buffer&    scratch)
    {
-        int kxsize = ksize.width;
+        GAPI_Assert(ksize.height == ksize.width);
        GAPI_Assert((ksize.height == 3) || (ksize.height == 5));
        const int kxsize = ksize.width;
        int kysize = ksize.height;
        auto *kx = scratch.OutLine<float>(); // cached kernX data
@ -801,7 +811,7 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
        constexpr int buffSize = 5;
        GAPI_Assert(ksize.height <= buffSize);
-        float *buf[buffSize]{};
+        float *buf[buffSize] = { nullptr };
        buf[0] = ky + kysize;
        for (int i = 1; i < ksize.height; ++i)
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@ -119,6 +119,28 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 #undef RUN_SEPFILTER3X3_IMPL
 // Generates the dispatching definition of run_sepfilter5x5_impl for one
 // (DST, SRC) pair: every argument is forwarded unchanged to the function of
 // the same name through CV_CPU_DISPATCH with CV_CPU_DISPATCH_MODES_ALL.
 // The macro is instantiated for each listed type pair and undefined afterwards.
 #define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
 void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
                           const float kx[], const float ky[], int border,  \
                           float scale, float delta,                        \
                           float *buf[], int y, int y0)                     \
 {                                                                           \
    CV_CPU_DISPATCH(run_sepfilter5x5_impl,                                  \
        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0),    \
        CV_CPU_DISPATCH_MODES_ALL);                                         \
 }
 RUN_SEPFILTER5x5_IMPL(uchar, uchar)
 RUN_SEPFILTER5x5_IMPL(short, uchar)
 RUN_SEPFILTER5x5_IMPL(float, uchar)
 RUN_SEPFILTER5x5_IMPL(ushort, ushort)
 RUN_SEPFILTER5x5_IMPL(short, ushort)
 RUN_SEPFILTER5x5_IMPL(float, ushort)
 RUN_SEPFILTER5x5_IMPL(short, short)
 RUN_SEPFILTER5x5_IMPL(float, short)
 RUN_SEPFILTER5x5_IMPL(float, float)
 #undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@ -78,6 +78,25 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 #undef RUN_SEPFILTER3X3_IMPL
 // Declares run_sepfilter5x5_impl for one (DST, SRC) pair; the macro is
 // expanded once per listed type pair and undefined right after use.
 #define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
 void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
                           const float kx[], const float ky[], int border,  \
                           float scale, float delta,                        \
                           float *buf[], int y, int y0);
 RUN_SEPFILTER5x5_IMPL(uchar, uchar)
 RUN_SEPFILTER5x5_IMPL(short, uchar)
 RUN_SEPFILTER5x5_IMPL(float, uchar)
 RUN_SEPFILTER5x5_IMPL(ushort, ushort)
 RUN_SEPFILTER5x5_IMPL(short, ushort)
 RUN_SEPFILTER5x5_IMPL(float, ushort)
 RUN_SEPFILTER5x5_IMPL(short, short)
 RUN_SEPFILTER5x5_IMPL(float, short)
 RUN_SEPFILTER5x5_IMPL(float, float)
 #undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@ -100,6 +100,23 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 #undef RUN_SEPFILTER3X3_IMPL
 // Forward declarations of run_sepfilter5x5_impl, one per listed (DST, SRC)
 // pair; the helper macro is undefined right after the last expansion.
 #define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
 void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
                           const float kx[], const float ky[], int border,  \
                           float scale, float delta,                        \
                           float *buf[], int y, int y0);
 RUN_SEPFILTER5x5_IMPL(uchar, uchar)
 RUN_SEPFILTER5x5_IMPL(short, uchar)
 RUN_SEPFILTER5x5_IMPL(float, uchar)
 RUN_SEPFILTER5x5_IMPL(ushort, ushort)
 RUN_SEPFILTER5x5_IMPL(short, ushort)
 RUN_SEPFILTER5x5_IMPL(float, ushort)
 RUN_SEPFILTER5x5_IMPL(short, short)
 RUN_SEPFILTER5x5_IMPL(float, short)
 RUN_SEPFILTER5x5_IMPL(float, float)
 #undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
@ -978,11 +995,11 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
    }
 }
-//-------------------------
+//-----------------------------
 //
-// Fluid kernels: sepFilter
+// Fluid kernels: sepFilter 3x3
 //
-//-------------------------
+//-----------------------------
 #if CV_SIMD
 // this variant not using buf[] appears 15% faster than reference any-2-float code below
@ -1322,7 +1339,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
        }
    }
 }
-#endif
+#endif //USE_SEPFILTER3X3_CHAR2SHORT
 #endif  // CV_SIMD
@ -1464,18 +1481,576 @@ void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
    }                                                                        \
 }
-RUN_SEPFILTER3X3_IMPL(uchar , uchar )
+RUN_SEPFILTER3X3_IMPL(uchar, uchar)
-RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL(short, uchar)
-RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(float, uchar)
 RUN_SEPFILTER3X3_IMPL(ushort, ushort)
-RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL(short, ushort)
-RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL(float, ushort)
-RUN_SEPFILTER3X3_IMPL( short,  short)
+RUN_SEPFILTER3X3_IMPL(short, short)
-RUN_SEPFILTER3X3_IMPL( float,  short)
+RUN_SEPFILTER3X3_IMPL(float, short)
-RUN_SEPFILTER3X3_IMPL( float,  float)
+RUN_SEPFILTER3X3_IMPL(float, float)
 #undef RUN_SEPFILTER3X3_IMPL
 //-----------------------------
 //
 // Fluid kernels: sepFilter 5x5
 //
 //-----------------------------
 #if CV_SIMD
 // Separable 5x5 filter with uchar output. The horizontal pass is scalar code
 // left to the compiler to vectorize; the vertical pass, the optional
 // scale/delta step and the rounding/packing to uchar are vectorized by hand.
 //
 //  out          - output row, width*chan elements
 //  in           - five input rows of the current window
 //  kx, ky       - horizontal / vertical kernel coefficients (5 each)
 //  border       - horizontal offset (in pixels) of the first kernel tap
 //  scale, delta - applied as sum*scale + delta unless noscale is set
 //  buf          - five float rows caching the horizontal pass results
 //  y, y0        - current row and the first row handled by this kernel
 template<bool noscale, typename SRC>
 static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta,
                                       float *buf[], int y, int y0)
 {
    constexpr int kxLen = 5;
    constexpr int kyLen = kxLen;
    constexpr int buffSize = 5;

    const int length = width * chan;
    const int shift = chan;

    // slot[n]: which row of buf[] keeps the horizontal sums for the n-th row
    // of the current window; the mapping rotates as y moves away from y0
    int slot[buffSize];
    for (int n = 0; n < buffSize; ++n)
    {
        slot[n] = (y - y0 + n) % 5;
    }

    // horizontal pass: all five rows when y == y0, only the last row otherwise
    const int firstRow = (y == y0) ? 0 : 4;
    for (int k = firstRow; k < kxLen; ++k)
    {
        // tap[i] addresses the i-th horizontal neighbour of each pixel in row k
        const SRC *tap[kxLen] = { nullptr };
        for (int i = 0; i < kxLen; ++i)
        {
            tap[i] = in[k] + (i - border)*shift;
        }

        float *row = buf[slot[k]];

        // rely on compiler vectoring
        for (int l = 0; l < length; ++l)
        {
            float acc = 0;
            for (int j = 0; j < kxLen; ++j)
            {
                acc += tap[j][l] * kx[j];
            }
            row[l] = acc;
        }
    }

    // vertical pass: each uchar vector is assembled from four float vectors
    constexpr int nlanes = v_uint8::nlanes;

    const float *rows[kyLen];
    v_float32 wy[kyLen];
    for (int n = 0; n < kyLen; ++n)
    {
        rows[n] = buf[slot[n]];
        wy[n] = vx_setall_f32(ky[n]);
    }
    const v_float32 vscale = vx_setall_f32(scale);
    const v_float32 vdelta = vx_setall_f32(delta);

    for (int l = 0; l < length;)
    {
        // main part of row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_float32 f0 = vx_load(&rows[0][l]) * wy[0];
            v_float32 f1 = vx_load(&rows[0][l + nlanes / 4]) * wy[0];
            v_float32 f2 = vx_load(&rows[0][l + 2 * nlanes / 4]) * wy[0];
            v_float32 f3 = vx_load(&rows[0][l + 3 * nlanes / 4]) * wy[0];

            for (int n = 1; n < kyLen; ++n)
            {
                const float *src = rows[n];
                f0 = v_fma(vx_load(&src[l]), wy[n], f0);
                f1 = v_fma(vx_load(&src[l + nlanes / 4]), wy[n], f1);
                f2 = v_fma(vx_load(&src[l + 2 * nlanes / 4]), wy[n], f2);
                f3 = v_fma(vx_load(&src[l + 3 * nlanes / 4]), wy[n], f3);
            }

            if (!noscale)
            {
                f0 = v_fma(f0, vscale, vdelta);
                f1 = v_fma(f1, vscale, vdelta);
                f2 = v_fma(f2, vscale, vdelta);
                f3 = v_fma(f3, vscale, vdelta);
            }

            const v_int16 lo = v_pack(v_round(f0), v_round(f1));
            const v_int16 hi = v_pack(v_round(f2), v_round(f3));
            v_store(&out[l], v_pack_u(lo, hi));
        }

        // tail (if any): step back so that one more full vector ends exactly
        // at the end of the row
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
 }
 // this variant with manually vectored rounding to short/ushort
 template<bool noscale, typename DST, typename SRC>
 static void run_sepfilter5x5_any2short(DST out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta,
                                       float *buf[], int y, int y0)
 {
    constexpr int kxLen = 5;
    constexpr int kyLen = kxLen;
    constexpr int buffSize = 5;
    int r[buffSize];
    for (int n = 0; n < buffSize; ++n)
    {
        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
    }
    const int length = width * chan;
    const int shift = chan;
    // horizontal pass
    int k0 = (y == y0) ? 0 : 4;
    for (int k = k0; k < kyLen; ++k)
    {
        const SRC *s[kxLen] = { nullptr };
        for (int i = 0; i < kxLen; ++i)
        {
            //  previous , this , next pixels
            s[i] = in[k] + (i - border)*shift;
        }
        // rely on compiler vectoring
        for (int l = 0; l < length; ++l)
        {
            float sum = 0;
            for (int j = 0; j < kxLen; ++j)
            {
                sum += s[j][l] * kx[j];
            }
            buf[r[k]][l] = sum;
        }
    }
    // vertical pass
    constexpr int nlanes = v_int16::nlanes;
    for (int l = 0; l < length;)
    {
        //GAPI_Assert(length >= nlanes);
        // main part of row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]);
            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 2]) * vx_setall_f32(ky[0]);
            for (int j = 1; j < kyLen; ++j)
            {
                sum0 = v_fma(vx_load(&buf[r[j]][l]), vx_setall_f32(ky[j]), sum0);
                sum1 = v_fma(vx_load(&buf[r[j]][l + nlanes / 2]), vx_setall_f32(ky[j]), sum1);
            }
            if (!noscale)
            {
                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
            }
            v_int32 isum0 = v_round(sum0),
                    isum1 = v_round(sum1);
            if (std::is_same<DST, short>::value)
            {
                // signed short
                v_int16 res = v_pack(isum0, isum1);
                v_store(reinterpret_cast<short*>(&out[l]), res);
            }
            else
            {
                // unsigned short
                v_uint16 res = v_pack_u(isum0, isum1);
                v_store(reinterpret_cast<ushort*>(&out[l]), res);
            }
        }
        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
    return;
 }
 // this variant not using buf[]
 template<bool noscale, typename SRC>
 static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta)
 {
    constexpr int kxLen = 5;
    constexpr int kyLen = kxLen;
    constexpr int buffSize = 5;
    const int length = width * chan;
    const int shift = chan;
    static const int nlanes = v_float32::nlanes;
    for (int l = 0; l < length; )
    {
        //GAPI_Assert(length >= nlanes);
        // main part
        for (; l <= length - nlanes; l += nlanes)
        {
            auto xsum = [l, border, shift, kx](const SRC inp[])
            {
                v_float32 t[5];
                for (int i = 0; i < 5; ++i)
                {
                    t[i] = vx_load_f32(&inp[l + (i - border)*shift]);
                }
                v_float32 sum = t[0] * vx_setall_f32(kx[0]);
                for (int j = 1; j < 5; ++j)
                {
                    sum = v_fma(t[j], vx_setall_f32(kx[j]), sum);
                }
                return sum;
            };
            v_float32 s[buffSize];
            for (int m = 0; m < buffSize; ++m)
            {
                s[m] = xsum(in[m]);
            }
            v_float32 sum = s[0] * vx_setall_f32(ky[0]);
            for (int n = 1; n < kyLen; ++n)
            {
                sum = v_fma(s[n], vx_setall_f32(ky[n]), sum);
            }
            if (!noscale)
            {
                sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
            }
            v_store(&out[l], sum);
        }
        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
    return;
 }
 #define USE_SEPFILTER5X5_CHAR2SHORT 1
 #if USE_SEPFILTER5X5_CHAR2SHORT
 // 16-bit integer variant of the separable 5x5 filter for uchar input and
 // short output.
 //
 // It is used only when every kernel coefficient is exactly representable as
 // schar, delta is exactly representable as short and 0.01 <= |scale| <= 1;
 // otherwise the call is forwarded to run_sepfilter5x5_any2short.
 //
 //  out          - output row, width*chan elements
 //  in           - five input rows of the current window
 //  kx, ky       - horizontal / vertical kernel coefficients (5 each)
 //  border       - horizontal offset (in pixels) of the first kernel tap
 //  scale, delta - applied to the vertical sum unless noscale is set
 //  buf          - five rows of scratch memory; on the integer path they are
 //                 reinterpreted as short storage
 //  y, y0        - current row and the first row handled by this kernel
 template<bool noscale>
 static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int width, int chan,
                                         const float kx[], const float ky[], int border,
                                         float scale, float delta,
                                         float *buf[], int y, int y0)
 {
    constexpr int kxLen = 5;
    constexpr int kyLen = kxLen;
    constexpr int buffSize = 5;

    // integer copies of the kernel coefficients
    schar ikx[kxLen];
    schar iky[kyLen];
    for (int i = 0; i < kxLen; ++i)
    {
        ikx[i] = saturate<schar>(kx[i], rintf);
        iky[i] = saturate<schar>(ky[i], rintf);
    }

    // iscale holds scale * 2^15, rounded and saturated to short
    const short iscale = saturate<short>(scale * (1 << 15), rintf);
    const short idelta = saturate<short>(delta, rintf);

    // check if this code is applicable
    if (ikx[0] != kx[0] || ikx[1] != kx[1] || ikx[2] != kx[2] || ikx[3] != kx[3] || ikx[4] != kx[4] ||
        iky[0] != ky[0] || iky[1] != ky[1] || iky[2] != ky[2] || iky[3] != ky[3] || iky[4] != ky[4] ||
        idelta != delta ||
        std::abs(scale) > 1 || std::abs(scale) < 0.01)
    {
        run_sepfilter5x5_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    short *ibuf[buffSize];
    int r[buffSize];
    for (int n = 0; n < buffSize; ++n)
    {
        ibuf[n] = reinterpret_cast<short*>(buf[n]);
        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
    }

    const int length = width * chan;
    const int shift = chan;

    // horizontal pass
    // full horizontal pass is needed only if the very 1st row in ROI is handled;
    // for 2nd and further rows, it's enough to convolve only the
    // "next" row - as we can reuse buffers from previous calls to
    // this kernel (Fluid does rows consecutively: y=y0, y0+1, ...)
    int k0 = (y == y0) ? 0 : 4;

    constexpr int nlanes = v_int16::nlanes;

    for (int k = k0; k < kyLen; ++k)
    {
        for (int l = 0; l < length;)
        {
            GAPI_Assert(length >= nlanes);

            // main part of output row
            for (; l <= length - nlanes; l += nlanes)
            {
                v_uint16 t[kxLen];
                // the accumulator must start from zero explicitly: a
                // default-constructed vector is not guaranteed to be zeroed
                v_int16 sum = vx_setall_s16(0);
                for (int i = 0; i < kxLen; ++i)
                {
                    // previous, current, next pixels
                    t[i] = vx_load_expand(&in[k][l + (i - border)*shift]);
                    sum += v_reinterpret_as_s16(t[i]) * vx_setall_s16(ikx[i]);
                }
                v_store(&ibuf[r[k]][l], sum);
            }

            // tail (if any): step back so that one more full vector ends
            // exactly at the end of the row
            if (l < length)
            {
                GAPI_DbgAssert(length >= nlanes);
                l = length - nlanes;
            }
        }
    }

    // vertical pass
    for (int l = 0; l < length;)
    {
        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_int16 s[buffSize];
            // explicit zero start, for the same reason as in the horizontal pass
            v_int16 sum = vx_setall_s16(0);
            for (int i = 0; i < kyLen; ++i)
            {
                // previous, current, next rows
                s[i] = vx_load(&ibuf[r[i]][l]);
                sum += s[i] * vx_setall_s16(iky[i]);
            }

            if (!noscale)
            {
                sum = v_mul_hi(sum << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
            }

            v_store(&out[l], sum);
        }

        // tail (if any): step back so that one more full vector ends exactly
        // at the end of the row
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
 }
 #endif //USE_SEPFILTER5X5_CHAR2SHORT
 #endif //CV_SIMD
 // Plain (non-intrinsic) separable 5x5 filter for any DST/SRC pair.
 //
 //  out          - output row, width*chan elements
 //  in           - five input rows of the current window
 //  kx, ky       - horizontal / vertical kernel coefficients (5 each)
 //  border       - horizontal offset (in pixels) of the first kernel tap
 //  scale, delta - applied as sum*scale + delta unless noscale is set
 //  buf          - five float rows caching the horizontal pass results
 //  y, y0        - current row and the first row handled by this kernel
 template<bool noscale, typename DST, typename SRC>
 static void run_sepfilter5x5_reference(DST out[], const SRC *in[], int width, int chan,
                                        const float kx[], const float ky[], int border,
                                        float scale, float delta, float *buf[], int y, int y0)
 {
    constexpr int kxLen = 5; // kernel size
    constexpr int kyLen = kxLen;

    const int length = width * chan;
    const int shift = chan;

    // slot[n]: which row of buf[] keeps the horizontal sums for the n-th row
    // of the current window; the mapping rotates as y moves away from y0
    int slot[kyLen];
    for (int n = 0; n < kyLen; ++n)
    {
        slot[n] = (y - y0 + n) % 5;
    }

    // horizontal pass
    // all five rows are convolved only for the very 1st row in ROI;
    // for the 2nd and further rows only the newest row is convolved,
    // the others are taken from buf[] as left by previous calls to
    // this kernel (Fluid does rows consecutively: y=y0, y0+1, ...)
    const int firstRow = (y == y0) ? 0 : 4;
    for (int k = firstRow; k < kyLen; ++k)
    {
        // tap[i] addresses the i-th horizontal neighbour of each pixel in row k
        const SRC *tap[kxLen] = { nullptr };
        for (int i = 0; i < kxLen; ++i)
        {
            tap[i] = in[k] + (i - border)*shift;
        }

        float *row = buf[slot[k]];

        // rely on compiler vectoring
        for (int l = 0; l < length; ++l)
        {
            float acc = 0;
            for (int i = 0; i < kxLen; ++i)
            {
                acc += tap[i][l] * kx[i];
            }
            row[l] = acc;
        }
    }

    // vertical pass
    const float *rows[kyLen];
    for (int j = 0; j < kyLen; ++j)
    {
        rows[j] = buf[slot[j]];
    }

    for (int l = 0; l < length; ++l)
    {
        float acc = 0;
        for (int j = 0; j < kyLen; ++j)
        {
            acc += rows[j][l] * ky[j];
        }

        if (!noscale)
        {
            acc = acc * scale + delta;
        }

        out[l] = saturate<DST>(acc, rintf);
    }
 }
 // Picks the implementation of the separable 5x5 filter for a DST/SRC pair.
 // With CV_SIMD enabled and a row of at least one full vector, the order is:
 //   uchar -> short : run_sepfilter5x5_char2short
 //   float -> float : run_sepfilter5x5_any2float
 //   any   -> short / ushort : run_sepfilter5x5_any2short
 //   any   -> uchar : run_sepfilter5x5_any2char
 // Every other case is served by run_sepfilter5x5_reference.
 template<bool noscale, typename DST, typename SRC>
 static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int chan,
                                   const float kx[], const float ky[], int border,
                                   float scale, float delta, float *buf[], int y, int y0)
 {
 #if CV_SIMD
    constexpr bool dstIsShort  = std::is_same<DST, short>::value;
    constexpr bool dstIsUshort = std::is_same<DST, ushort>::value;
    constexpr bool dstIsUchar  = std::is_same<DST, uchar>::value;
    constexpr bool dstIsFloat  = std::is_same<DST, float>::value;
    constexpr bool srcIsUchar  = std::is_same<SRC, uchar>::value;
    constexpr bool srcIsFloat  = std::is_same<SRC, float>::value;

    const int length = width * chan;

    // length variable may be unused if types do not match at 'if' statements below
    (void)length;

    // the casts below are needed because every branch is compiled for every
    // DST/SRC pair, even when it can never be taken
    if (dstIsShort && srcIsUchar && length >= v_int16::nlanes)
    {
        run_sepfilter5x5_char2short<noscale>(reinterpret_cast<short*>(out),
                                             reinterpret_cast<const uchar**>(in),
                                             width, chan, kx, ky, border, scale, delta,
                                             buf, y, y0);
        return;
    }

    if (dstIsFloat && srcIsFloat && length >= v_float32::nlanes)
    {
        run_sepfilter5x5_any2float<noscale>(reinterpret_cast<float*>(out), in, width,
                                            chan, kx, ky, border, scale, delta);
        return;
    }

    if (dstIsShort && length >= v_int16::nlanes)
    {
        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<short*>(out), in, width,
                                            chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    if (dstIsUshort && length >= v_uint16::nlanes)
    {
        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<ushort*>(out), in, width,
                                            chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    if (dstIsUchar && length >= v_uint8::nlanes)
    {
        run_sepfilter5x5_any2char<noscale>(reinterpret_cast<uchar*>(out), in, width,
                                           chan, kx, ky, border, scale, delta,
                                           buf, y, y0);
        return;
    }
 #endif  // CV_SIMD

    // reference code is quite fast for any-to-float case,
    // but not for any-to-integral due to very slow rounding
    run_sepfilter5x5_reference<noscale>(out, in, width, chan, kx, ky, border,
                                        scale, delta, buf, y, y0);
 }
 #define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                                                        \
 void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, const float kx[],                  \
                           const float ky[], int border, float scale, float delta,                             \
                           float *buf[], int y, int y0)                                                        \
 {                                                                                                              \
    if (scale == 1 && delta == 0)                                                                              \
    {                                                                                                          \
        constexpr bool noscale = true;                                                                         \
        run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border,                                   \
                                       scale, delta, buf, y, y0);                                              \
    }                                                                                                          \
    else                                                                                                       \
    {                                                                                                          \
        constexpr bool noscale = false;                                                                        \
        run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border,                                   \
                                       scale, delta, buf, y, y0);                                              \
    }                                                                                                          \
    return;                                                                                                    \
 }
 RUN_SEPFILTER5x5_IMPL(uchar, uchar)
 RUN_SEPFILTER5x5_IMPL(short, uchar)
 RUN_SEPFILTER5x5_IMPL(float, uchar)
 RUN_SEPFILTER5x5_IMPL(ushort, ushort)
 RUN_SEPFILTER5x5_IMPL(short, ushort)
 RUN_SEPFILTER5x5_IMPL(float, ushort)
 RUN_SEPFILTER5x5_IMPL(short, short)
 RUN_SEPFILTER5x5_IMPL(float, short)
 RUN_SEPFILTER5x5_IMPL(float, float)
 #undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D