cuda resize onnx done

2025-06-23 04:01:31 +08:00 · 2024-06-10 13:50:54 +08:00 · 2024-06-10 13:50:54 +08:00 · 358b64ad0e
commit 358b64ad0e
parent 379c16e106
6 changed files with 168 additions and 226 deletions
--- a/modules/core/include/opencv2/core/cuda/vec_math.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_math.hpp
@ -458,12 +458,12 @@ CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)

-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , char, char)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , short, short)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , int, int)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)

 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -281,14 +281,11 @@ enum InterpolationFlags {
 //! ONNX Resize Flags
 enum ResizeONNXFlags
 {
-    // static_assert((1 << INTER_COORDINATE_SHIFT) > INTER_MAX, "");
-    // https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
-
    INTER_SAMPLER_SHIFT        = 0,
-    INTER_SAMPLER_BIT          = 4,
+    INTER_SAMPLER_BIT          = 3,
    INTER_SAMPLER_MASK         = ((1 << INTER_SAMPLER_BIT) - 1) << INTER_SAMPLER_SHIFT,

-    INTER_COORDINATE_SHIFT     = 4,
+    INTER_COORDINATE_SHIFT     = INTER_SAMPLER_SHIFT + INTER_SAMPLER_BIT,
    INTER_COORDINATE_BIT       = 3,
    INTER_COORDINATE_MASK      = ((1 << INTER_COORDINATE_BIT) - 1) << INTER_COORDINATE_SHIFT,
    /** x_original = (x_resized + 0.5) / scale - 0.5 */
@ -305,8 +302,8 @@ enum ResizeONNXFlags
    /** x_original = x_resized / scale */
    INTER_ASYMMETRIC           = 4 << INTER_COORDINATE_SHIFT,
    /** x_original = length_resized > 1
-     * ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1)
-     * : 0.5 * (start_x + end_x) * (length_original - 1) */
+            ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1)
+            : 0.5 * (start_x + end_x) * (length_original - 1) */
    INTER_TF_CROP_RESIZE       = 5 << INTER_COORDINATE_SHIFT,

    INTER_NEAREST_MODE_SHIFT   = INTER_COORDINATE_SHIFT + INTER_COORDINATE_BIT,
@ -2468,12 +2465,12 @@ CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
                          int interpolation = INTER_LINEAR );

 /** @brief onnx resize op
+
 https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
 https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
-
 Not support `exclude_outside` and `extrapolation_value` yet.

-To get a similar result to resize, give dsize and:
+To get a similar result to `cv::resize`, give dsize and:
    INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR
    INTER_LINEAR  : HALF_PIXEL
    INTER_CUBIC   : HALF_PIXEL + cubicCoeff(-0.75)
@ -2490,9 +2487,8 @@ To get a similar result to resize, give dsize and:

@sa  resize
 */
-CV_EXPORTS_W void resizeOnnx(
-    InputArray src, OutputArray dst, Size dsize, Point2d scale = Point2d(),
-    int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
+CV_EXPORTS_W void resizeOnnx(InputArray src, OutputArray dst, Size dsize,
+    Point2d scale = Point2d(), int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
    float cubicCoeff = -0.75f, Rect2d const& roi = Rect2d());

 /** @brief Applies an affine transformation to an image.
--- a/modules/imgproc/src/opencl/resize_onnx.cl
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@ -67,22 +67,20 @@ __kernel void resizeOnnx_nearest(

 #if PIXEL_SIZE == 1
        *D = *S;
-#elif PIXEL_SIZE == 2 || PIXEL_SIZE == 4 || PIXEL_SIZE == 8 || PIXEL_SIZE == 16
-        *(__global VT*)(D) = *(__global const VT*)(S);
+#elif PIXEL_SIZE == 2
+        *(__global ushort*)(D) = *(__global const ushort*)(S);
 #elif PIXEL_SIZE == 3
        vstore3(vload3(0, S), 0, D);
+#elif PIXEL_SIZE == 4
+        *(__global uint*)(D) = *(__global const uint*)(S);
 #elif PIXEL_SIZE == 6
        vstore3(vload3(0, (__global ushort const*)(S)), 0, (__global ushort*)(D));
+#elif PIXEL_SIZE == 8
+        *(__global uint2*)(D) = *(__global const uint2*)(S);
 #elif PIXEL_SIZE == 12
        vstore3(vload3(0, (__global const uint*)(S)), 0, (__global uint*)(D));
-#elif PIXEL_SIZE == 24
-        vstore3(vload3(0, (__global ulong const*)(S)), 0, (__global ulong*)(D));
-#elif PIXEL_SIZE == 32
-        *(__global uint8*)(D) = *(__global uint8 const*)(S);
-#elif PIXEL_SIZE == 64
-        *(__global uint16*)(D) = *(__global uint16 const*)(S);
-#elif PIXEL_SIZE == 128
-        *(__global ulong16*)(D) = *(__global ulong16 const*)(S);
+#elif PIXEL_SIZE == 16
+        *(__global uint4*)(D) = *(__global const uint4*)(S);
 #else
        for (int i = 0; i < pixel_size; ++i)
            D[i] = S[i];
@ -114,7 +112,7 @@ __kernel void resizeOnnx_linear(
        __global uchar const* S2 = srcptr + (y1 * src_step + mad24(x0, pixel_size, src_offset));
        __global uchar const* S3 = srcptr + (y1 * src_step + mad24(x1, pixel_size, src_offset));
        __global uchar      * D  = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
        VW s0 = TO_VEC_WORK(loadpix(S0)); VW s1 = TO_VEC_WORK(loadpix(S1));
        VW s2 = TO_VEC_WORK(loadpix(S2)); VW s3 = TO_VEC_WORK(loadpix(S3));
        VT d0 = TO_VEC_TYPE((u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3);
@ -154,7 +152,7 @@ __kernel void resizeOnnx_linear_antialias(
        int ix = convert_int_rtn(fx), iy = convert_int_rtn(fy);
        float rx = fx - ix, ry = fy - iy;
        __global uchar* D = dstptr + dy * dst_step + mad24(dx, pixel_size, dst_offset);
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
        VW sumval = (VW)(0);
        float weight = 0;
        for (int h = ystart; h < yend; ++h)
@ -266,7 +264,7 @@ __kernel void resizeOnnx_cubic(
            xcoeff [x - xstart] = cubicCoeff(A, A2, A3, x - fx);
        }
        __global uchar* D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
        VW sum = (VW)(0);
        for (int y = ystart; y <= ylimit; ++y)
        {
@ -322,13 +320,14 @@ __kernel void resizeOnnx_table(
        __global int const* yoffset = xoffset + xstride;
        __global float const* xcoeff = (__global float const*)(yoffset + ystride);
        __global float const* ycoeff = (__global float const*)(xcoeff + xstride);
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
        VW sum = (VW)(0);
        // exact ykanti / xkanti loops
        for (int y = dy; y < ystride; y += dst_rows)
        {
-            // offset is already clamped. xoffset is given by uchar
-            __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
+            // offset is already clamped
+            // xoffset is a byte offset; yoffset is already multiplied by src_step
+            __global const uchar* S = srcptr + yoffset[y] + src_offset;
            VW sline = (VW)(0);
            for (int x = dx; x < xstride; x += dst_cols)
                sline += xcoeff[x] * TO_VEC_WORK(loadpix(S + xoffset[x]));
@ -341,7 +340,7 @@ __kernel void resizeOnnx_table(
            W sum = 0;
            for (int y = dy; y < ystride; y += dst_rows)
            {
-                __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
+                __global const uchar* S = (srcptr + yoffset[y] + src_offset);
                W sline = 0;
                for (int x = dx; x < xstride; x += dst_cols)
                    sline += xcoeff[x] * TO_WORK(((__global T const*)(S + xoffset[x]))[i]);
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@ -1356,36 +1356,36 @@ public:
                break;
            case 2:
                for (; x < width; ++x)
-                    reinterpret_cast<short*>(D)[x] = *(reinterpret_cast<short const*>(S + x_ofs[x]));
+                    reinterpret_cast<ushort*>(D)[x] = *(reinterpret_cast<ushort const*>(S + x_ofs[x]));
                break;
            case 3:
                for (; x < width; ++x, D += 3)
                {
-                    const uchar* _tS = S + x_ofs[x];
+                    uchar const* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for (; x < width; ++x)
-                    reinterpret_cast<int*>(D)[x] = *(reinterpret_cast<int const*>(S + x_ofs[x]));
+                    reinterpret_cast<uint*>(D)[x] = *(reinterpret_cast<uint const*>(S + x_ofs[x]));
                break;
            case 6:
                for (; x < width; ++x, D += 6)
                {
-                    short const* _tS = reinterpret_cast<short const*>(S + x_ofs[x]);
-                    short* _tD = reinterpret_cast<short*>(D);
+                    ushort const* _tS = reinterpret_cast<ushort const*>(S + x_ofs[x]);
+                    ushort* _tD = reinterpret_cast<ushort*>(D);
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for (; x < width; ++x)
-                    reinterpret_cast<int64*>(D)[x] = *(reinterpret_cast<int64 const*>(S + x_ofs[x]));
+                    reinterpret_cast<uint64*>(D)[x] = *(reinterpret_cast<uint64 const*>(S + x_ofs[x]));
                break;
            case 12:
                for (; x < width; ++x, D += 12)
                {
-                    int const* _tS = reinterpret_cast<int const*>(S + x_ofs[x]);
-                    int* _tD = reinterpret_cast<int*>(D);
+                    uint const* _tS = reinterpret_cast<uint const*>(S + x_ofs[x]);
+                    uint* _tD = reinterpret_cast<uint*>(D);
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
@ -3465,7 +3465,8 @@ public:
    int xmin, xmax;

 private:
-    void cubic_coeffs_antialias(int dstlen, int cn, float srcpos, float scale, int srclen, float A, TabIdx* elem)
+    void cubicCoeffsAntiAlias(
+        int dstlen, int cn, float srcpos, float scale, int srclen, float A, TabIdx* elem)
    {
        scale = min(scale, 1.f);
        int index = cvFloor(srcpos);
@ -3497,7 +3498,7 @@ private:
        }
    }

-    void cubic_coeffs(float x, float A, float* coeffs)
+    void cubicCoeffs(float x, float A, float* coeffs)
    {
        coeffs[0] = A * ((((x + 1) - 5) * (x + 1) + 8) * (x + 1) - 4);
        coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1;
@ -3505,7 +3506,8 @@ private:
        coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
    }

-    void linear_coeffs_antialias(int dstlen, int cn, float srcpos, float scale, int srclen, TabIdx* elem)
+    void linearCoeffsAntialias(
+        int dstlen, int cn, float srcpos, float scale, int srclen, TabIdx* elem)
    {
        scale = min(scale, 1.f);
        int index = cvFloor(srcpos);
@ -3532,7 +3534,7 @@ private:
        }
    }

-    void linear_coeffs(float x, float* coeffs)
+    void linearCoeffs(float x, float* coeffs)
    {
        coeffs[0] = 1.f - x;
        coeffs[1] = x;
@ -3570,16 +3572,17 @@ private:
        area.commit();
        CV_CheckLE(ksize, MAX_ESIZE, "resampler kernel's size is too larger");

-        if (antialias)
+        // when upsampling, `antialias` gives the same result as `generic`, so take the faster `generic` path
+        if (antialias && scaled.x < 1.0)
        {
            float a = M(0, 0), b = M(0, 1);
            for (int d = 0; d < dsize.width; ++d)
            {
                float f = fmaf(static_cast<float>(d), a, b);
                if (sampler == INTER_LINEAR)
-                    linear_coeffs_antialias(d, cn, f, scale.x, ssize.width, xtab + d * xkanti);
+                    linearCoeffsAntialias(d, cn, f, scale.x, ssize.width, xtab + d * xkanti);
                else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs_antialias(d, cn, f, scale.x, ssize.width, cubicCoeff, xtab + d * xkanti);
+                    cubicCoeffsAntiAlias(d, cn, f, scale.x, ssize.width, cubicCoeff, xtab + d * xkanti);
            }
        }
        else
@ -3608,9 +3611,9 @@ private:
                for (int k = 0; k < cn; ++k)
                    xofs[cn * d + k] = cn * s + k;
                if (sampler == INTER_LINEAR)
-                    linear_coeffs(f, cbuf);
+                    linearCoeffs(f, cbuf);
                else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs(f, cubicCoeff, cbuf);
+                    cubicCoeffs(f, cubicCoeff, cbuf);
                if (is_fixpt)
                {
                    short* coeffs = reinterpret_cast<short*>(xcoeffs) + cn * ksize * d;
@ -3638,16 +3641,16 @@ private:
            }
        }

-        if (antialias)
+        if (antialias && scaled.y < 1.0)
        {
            float a = M(1, 0), b = M(1, 1);
            for (int d = 0; d < dsize.height; ++d)
            {
                float f = fmaf(static_cast<float>(d), a, b);
                if (sampler == INTER_LINEAR)
-                    linear_coeffs_antialias(d, 1, f, scale.y, ssize.height, ytab + d * ykanti);
+                    linearCoeffsAntialias(d, 1, f, scale.y, ssize.height, ytab + d * ykanti);
                else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs_antialias(d, 1, f, scale.y, ssize.height, cubicCoeff, ytab + d * ykanti);
+                    cubicCoeffsAntiAlias(d, 1, f, scale.y, ssize.height, cubicCoeff, ytab + d * ykanti);
            }
        }
        else
@ -3662,9 +3665,9 @@ private:
                f -= s;
                yofs[d] = s;
                if (sampler == INTER_LINEAR)
-                    linear_coeffs(f, cbuf);
+                    linearCoeffs(f, cbuf);
                else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs(f, cubicCoeff, cbuf);
+                    cubicCoeffs(f, cubicCoeff, cbuf);
                if (is_fixpt)
                {
                    short* coeffs = reinterpret_cast<short*>(ycoeffs) + 1 * ksize * d;
@ -3755,7 +3758,7 @@ public:
            "something wrong");
    }

-    void hori_antialias_accumulate(T const* S, IdxT* L) const
+    void horiAntialiasAccumulate(T const* S, IdxT* L) const
    {
        IdxT alpha;
        int const cn = dst.channels();
@ -3809,15 +3812,11 @@ public:
            }
    }

-    void hori_antialias_lines(T const** srcptr, WT** dstptr, IdxT* L, int count) const
+    void horiAntialiasLines(T const** srcptr, WT** dstptr, IdxT* L, int count) const
    {
        int cn = dst.channels();
        int dwidth = dst.cols * cn;
-#ifdef CV_CXX11
-        constexpr bool same_wt_idxt = std::is_same<WT, IdxT>::value;
-#else
-        bool const same_wt_idxt = false;
-#endif
+        bool const same_wt_idxt = std::is_same<WT, IdxT>::value;
        for (int i = 0; i < count; ++i)
        {
            T const* S = srcptr[i];
@ -3825,7 +3824,7 @@ public:
            if (same_wt_idxt)
                L = reinterpret_cast<IdxT*>(dstptr[i]);
            memset(L, 0, sizeof(IdxT) * dwidth);
-            hori_antialias_accumulate(S, L);
+            horiAntialiasAccumulate(S, L);
            if (!same_wt_idxt)
            {
                WT* D = dstptr[i];
@ -3844,7 +3843,7 @@ public:
        }
    }

-    void hori_generic_lines(T const** srcptr, WT** dstptr, int count) const
+    void horiGenericLines(T const** srcptr, WT** dstptr, int count) const
    {
        int cn = src.channels();
        int ssize = src.cols * cn;
@ -3857,53 +3856,7 @@ public:
            ssize, dsize, cn, xmin, xmax);
    }

-    void vert_antialias_hori_antialias(int dy, IdxT* L, IdxT* A) const
-    {
-        // the start and end of ytab
-        int dwidth = dst.channels() * dst.cols;
-        int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
-        memset(A, 0, dwidth * sizeof(IdxT));
-        for (int t = tstart; t < tend; ++t)
-        {
-            IdxT beta;
-            int sy = ctrl.ytab[t].si;
-            CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
-            ctrl.ytab[t].as(beta);
-            memset(L, 0, dwidth * sizeof(IdxT));
-            hori_antialias_accumulate(src.template ptr<T>(sy), L);
-            for (int w = 0; w < dwidth; ++w)
-                A[w] += L[w] * beta;
-        }
-        T* D = dst.template ptr<T>(dy);
-        for (int w = 0; w < dwidth; ++w)
-            D[w] = saturate_cast<T>(A[w]);
-    }
-
-    void vert_antialias_hori_generic(int dy, WT* L, IdxT* A) const
-    {
-        // FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS> cast;
-        int dwidth = dst.channels() * dst.cols;
-        int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
-        memset(A, 0, dwidth * sizeof(IdxT));
-        for (int t = tstart; t < tend; ++t)
-        {
-            IdxT beta;
-            int sy = ctrl.ytab[t].si;
-            CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
-            ctrl.ytab[t].as(beta);
-            T const* S = src.template ptr<T>(sy);
-            hori_generic_lines(&S, &L, 1);
-            if (ctrl.is_fixpt)
-                beta /= INTER_RESIZE_COEF_SCALE;
-            for (int w = 0; w < dwidth; ++w)
-                A[w] += L[w] * beta;
-        }
-        T* D = dst.template ptr<T>(dy);
-        for (int w = 0; w < dwidth; ++w)
-            D[w] = saturate_cast<T>(A[w]);
-    }
-
-    void vert_antialias(Range const& range) const
+    void vertAntialias(Range const& range) const
    {
        int cn = dst.channels();
        int dwidth = dst.cols * cn;
@ -3913,14 +3866,38 @@ public:
        WT* Lw = reinterpret_cast<WT*>(L);
        for (int dy = range.start; dy < range.end; ++dy)
        {
-            if (ctrl.xkanti)
-                vert_antialias_hori_antialias(dy, L, A);
-            else
-                vert_antialias_hori_generic(dy, Lw, A);
+            int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
+            memset(A, 0, dwidth * sizeof(IdxT));
+            for (int t = tstart; t < tend; ++t)
+            {
+                IdxT beta;
+                int sy = ctrl.ytab[t].si;
+                CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
+                ctrl.ytab[t].as(beta);
+                T const* S = src.template ptr<T>(sy);
+                if (ctrl.xkanti)
+                {
+                    memset(L, 0, dwidth * sizeof(IdxT));
+                    horiAntialiasAccumulate(S, L);
+                    for (int w = 0; w < dwidth; ++w)
+                        A[w] += L[w] * beta;
+                }
+                else
+                {
+                    horiGenericLines(&S, &Lw, 1);
+                    if (ctrl.is_fixpt)
+                        beta /= INTER_RESIZE_COEF_SCALE;
+                    for (int w = 0; w < dwidth; ++w)
+                        A[w] += Lw[w] * beta;
+                }
+            }
+            T* D = dst.template ptr<T>(dy);
+            for (int w = 0; w < dwidth; ++w)
+                D[w] = saturate_cast<T>(A[w]);
        }
    }

-    void vert_generic(Range const& range) const
+    void vertGeneric(Range const& range) const
    {
        int ksize = ctrl.ksize, ksize2 = ksize / 2;
        int cn = src.channels();
@ -3963,9 +3940,9 @@ public:
            if (k0 < ksize)
            {
                if (ctrl.xkanti)
-                    hori_antialias_lines(srows + k0, rows + k0, L, ksize - k0);
+                    horiAntialiasLines(srows + k0, rows + k0, L, ksize - k0);
                else
-                    hori_generic_lines(srows + k0, rows + k0, ksize - k0);
+                    horiGenericLines(srows + k0, rows + k0, ksize - k0);
            }
            vresize(const_cast<WT const**>(rows), dst.template ptr<T>(dy), beta, dwidth);
        }
@ -3974,9 +3951,9 @@ public:
    virtual void operator() (Range const& range) const CV_OVERRIDE
    {
        if (ctrl.ykanti)
-            vert_antialias(range);
+            vertAntialias(range);
        else
-            vert_generic(range);
+            vertGeneric(range);
    }
 };

@ -4003,7 +3980,7 @@ typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
                                const DecimateAlpha* ytab, int ytab_size,
                                const int* yofs);

-typedef void (*ResizeOnnxFunc)(Mat const& src, Mat& dst, ResizeOnnxCtrl const&);
+typedef void (*ResizeOnnxFunc)(Mat const& src, Mat& dst, ResizeOnnxCtrl const& ctrl);


 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
@ -4517,7 +4494,7 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
        float* ycoeff = reinterpret_cast<float*>(xcoeff + xstride);
        ocl_resizeOnnxTable(src.cols, dst.cols, pixel_size,
            sampler, M(0, 0), M(0, 1), cubicCoeff, scale.x, xoffset, xcoeff);
-        ocl_resizeOnnxTable(src.rows, dst.rows, 1,
+        ocl_resizeOnnxTable(src.rows, dst.rows, static_cast<int>(src.step[0]),
            sampler, M(1, 0), M(1, 1), cubicCoeff, scale.y, yoffset, ycoeff);
        UMat utable;
        Mat(1, tabsize, CV_32S, table.data()).copyTo(utable);
@ -5175,6 +5152,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
 void cv::resizeOnnx(InputArray _src, OutputArray _dst,
    Size dsize, Point2d scale, int interpolation, float cubicCoeff, Rect2d const& roi)
 {
+    static_assert((1 << INTER_SAMPLER_BIT) >= INTER_MAX, "");
    CV_INSTRUMENT_REGION();

    Size ssize = _src.size();
@ -5185,7 +5163,7 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
    {
        CV_CheckGT(scale.x, 0.0, "scale must > 0 if no dsize given");
        CV_CheckGT(scale.y, 0.0, "scale must > 0 if no dsize given");
-        // https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py#L365
+        // https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
        // output_size = (scale_factors * np.array(data.shape)).astype(int)
        dsize.width  = static_cast<int>(scale.x * ssize.width );
        dsize.height = static_cast<int>(scale.y * ssize.height);
@ -5196,8 +5174,8 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
        scale.y = static_cast<double>(dsize.height) / ssize.height;
    }
    CV_CheckFalse(dsize.empty(), "dst size must not empty");
-    CV_CheckGT(scale.x, 0.0, "computed scale <= 0 with given dsize");
-    CV_CheckGT(scale.y, 0.0, "computed scale <= 0 with given dsize");
+    CV_CheckGT(scale.x, 0.0, "require computed or given scale > 0");
+    CV_CheckGT(scale.y, 0.0, "require computed or given scale > 0");

    int sampler = interpolation & INTER_SAMPLER_MASK;
    int nearest = interpolation & INTER_NEAREST_MODE_MASK;
@ -5237,6 +5215,9 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
        _src.copyTo(_dst);
        return;
    }
+    // Antialiasing only applies when downsampling, so drop the flag if neither axis shrinks
+    if (scale.x >= 1.0 && scale.y >= 1.0)
+        interpolation &= ~INTER_ANTIALIAS_MASK;

    // Fake reference to source. Resolves issue 13577 in case of src == dst.
    UMat srcUMat;
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@ -338,18 +338,18 @@ OCL_TEST(Resize, overflow_21198)

 PARAM_TEST_CASE(ResizeOnnx, MatType, double, double, int, bool, int)
 {
-    int type, interpolation;
+    int depth, interpolation;
    int widthMultiple;
    double fx, fy;
    bool useRoi;
-    Mat middle;

-    TEST_DECLARE_INPUT_PARAMETER(src);
-    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+    Rect src_loc, dst_loc;
+    Mat src, dst, src_roi, dst_roi;
+    UMat gsrc, gdst, gsrc_roi, gdst_roi;

    virtual void SetUp()
    {
-        type = GET_PARAM(0);
+        depth = GET_PARAM(0);
        fx = GET_PARAM(1);
        fy = GET_PARAM(2);
        interpolation = GET_PARAM(3);
@ -357,96 +357,70 @@ PARAM_TEST_CASE(ResizeOnnx, MatType, double, double, int, bool, int)
        widthMultiple = GET_PARAM(5);
    }

-    void random_roi()
+    void random_submat(int type,
+        Size& size, Rect& roi, Mat& mat, Mat& sub, UMat& gmat, UMat& gsub)
    {
-        CV_Assert(fx > 0 && fy > 0);
+        int border = useRoi ? 65 : 0;
+        roi.x = randomInt(0, border);
+        roi.y = randomInt(0, border);
+        roi.width = size.width;
+        roi.height = size.height;
+        size.width += roi.x + randomInt(0, border);
+        size.height += roi.y + randomInt(0, border);
+        mat = randomMat(size, type, -127, 127);
+        mat.copyTo(gmat);
+        sub = mat(roi);
+        gsub = gmat(roi);
+    }

-        Size srcRoiSize = randomSize(10, MAX_VALUE), dstRoiSize;
-        // Make sure the width is a multiple of the requested value, and no more
-        srcRoiSize.width += widthMultiple - 1 - (srcRoiSize.width - 1) % widthMultiple;
-        dstRoiSize.width = cvRound(srcRoiSize.width * fx);
-        dstRoiSize.height = cvRound(srcRoiSize.height * fy);
-
-        if (dstRoiSize.empty())
+    void random_roi(int type)
+    {
+        Size srcSize, dstSize;
+        int minSize = min(fx, fy) < 1.0 ? 10 : 1;
+        while (dstSize.empty())
        {
-            random_roi();
-            return;
+            srcSize = randomSize(minSize, 129);
+            srcSize.width += widthMultiple - 1 - (srcSize.width - 1) % widthMultiple;
+            dstSize.width = cvRound(srcSize.width * fx);
+            dstSize.height = cvRound(srcSize.height * fy);
        }

-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, srcRoiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-
-#if 0
-        // if nearest test failed, maybe the fma issue, try open this #if
-        // set pixels' value to their coordinate
-        if ((interpolation & INTER_SAMPLER_MASK) == INTER_NEAREST)
-        {
-            int channel = CV_MAT_CN(type);
-            middle.create(src.rows, src.cols, CV_16SC(channel));
-            for (int h = 0; h < src.rows; ++h)
-            {
-                for (int c = 0; c < channel; c += 2)
-                {
-                    // even x; odd y
-                    short* S = middle.ptr<short>(h) + c;
-                    for (int w = 0; w < src.cols; ++w, S += channel)
-                        S[0] = static_cast<short>(w);
-                }
-                for (int c = 1; c < channel; c += 2)
-                {
-                    // even x; odd y
-                    short* S = middle.ptr<short>(h) + c;
-                    for (int w = 0; w < src.cols; ++w, S += channel)
-                        S[0] = static_cast<short>(h);
-                }
-            }
-            middle.convertTo(src, type);
-            src_roi = src(Rect(srcBorder.lef, srcBorder.top, srcRoiSize.width, srcRoiSize.height));
-        }
-#endif
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        UMAT_UPLOAD_INPUT_PARAMETER(src);
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+        random_submat(type, srcSize, src_loc, src, src_roi, gsrc, gsrc_roi);
+        random_submat(type, dstSize, dst_loc, dst, dst_roi, gdst, gdst_roi);
    }
 };

 OCL_TEST_P(ResizeOnnx, Mat)
 {
-    Size whole;
-    Point offset;
    Mat host, host_roi;
-    int cn = CV_MAT_CN(type);
-    int depth = CV_MAT_DEPTH(type);
    double eps = depth <= CV_32S ? integerEps : 5e-2;

-    for (int j = 0; j < test_loop_times; j++)
+    // loop over the channel count here to reduce the number of parameterized tests
+    for (int cn = 1; cn <= 6; ++cn)
    {
-        random_roi();
-
-        OCL_OFF(cv::resizeOnnx(src_roi, dst_roi,
-            dst_roi.size(), Point2d(fx, fy), interpolation));
-        OCL_ON(cv::resizeOnnx(usrc_roi, udst_roi,
-            dst_roi.size(), Point2d(fx, fy), interpolation));
-
-        dst_roi.locateROI(whole, offset);
-        udst.copyTo(host);
-        host_roi = host(Rect(offset, dst_roi.size()));
-        if (cn <= 4 && depth != CV_8S && depth != CV_32S)
-            OCL_EXPECT_MAT_N_DIFF(dst, eps);
-        else
+        int type = CV_MAKETYPE(depth, cn);
+        for (int j = 0; j < test_loop_times; ++j)
        {
-            // more strict than OCL_EXPECT_MAT_N_DIFF
-            double dif = cv::norm(dst_roi, host_roi, NORM_INF);
-            EXPECT_LE(dif, eps)
-                << "Size: " << src_roi.size()
-                << ", NormInf: " << dif << std::endl;
+            random_roi(type);
+
+            OCL_OFF(cv::resizeOnnx(src_roi, dst_roi,
+                dst_roi.size(), Point2d(fx, fy), interpolation));
+            OCL_ON(cv::resizeOnnx(gsrc_roi, gdst_roi,
+                dst_roi.size(), Point2d(fx, fy), interpolation));
+
+            // copy the whole gdst to make sure that
+            // the given roi memory was really used and no new buffer was allocated
+            gdst.copyTo(host);
+            host_roi = host(dst_loc);
+            string info = cv::format(
+                "fail on type %sC%d src %dx%d dst %dx%d src_roi %dx%d dst_roi %dx%d",
+                depthToString(depth), cn, src.cols, src.rows, dst.cols, dst.rows,
+                src_roi.cols, src_roi.rows, dst_roi.cols, dst_roi.rows);
+            EXPECT_LE(cv::norm(dst_roi, host_roi, NORM_INF), eps) << info;
        }
    }
 }

-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // remap

@ -689,23 +663,18 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine(
                            Bool(),
                            Values(1, 16)));

-OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAlias, ResizeOnnx, Combine(
-                            Values(
-                                CV_8UC1, CV_8SC2, CV_8UC(5), CV_8SC(7),
-                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_16SC(10),
-                                CV_32FC1, CV_32FC4, CV_32FC(11)),
-                            Values(0.5, 0.31, 1.4),
-                            Values(0.5, 0.73, 3.7),
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, ResizeOnnx, Combine(
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
                            Values((int)(INTER_LINEAR), (int)(INTER_CUBIC)),
                            Bool(),
                            Values(1, 16)));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAntiAlias, ResizeOnnx, Combine(
-                            Values(
-                                CV_8UC1, CV_8SC2, CV_8UC(5), CV_8SC(7),
-                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_16SC(10),
-                                CV_32FC1, CV_32FC4, CV_32FC(11)),
-                            Values(0.5, 0.27, 2.6),
-                            Values(0.5, 0.71, 4.1),
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
                            Values(
                                (int)(INTER_ANTIALIAS | INTER_LINEAR),
                                (int)(INTER_ANTIALIAS | INTER_CUBIC )),
@ -713,12 +682,9 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAntiAlias, ResizeOnnx, Combine(
                            Values(1, 16)));

 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpNearest, ResizeOnnx, Combine(
-                            Values(
-                                CV_8UC1, CV_8SC2, CV_8UC4, CV_8SC(7),
-                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_32SC(10),
-                                CV_32FC1, CV_32FC4, CV_32FC(11)),
-                            Values(0.5, 0.27, 2.6),
-                            Values(0.5, 0.71, 4.1),
+                            Values(CV_8S, CV_16S, CV_32F, CV_64F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
                            Values(
                                (int)(INTER_NEAREST | INTER_NEAREST_PREFER_FLOOR),
                                (int)(INTER_NEAREST | INTER_NEAREST_PREFER_CEIL),
--- a/modules/imgproc/test/test_resize_onnx.cpp
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@ -43,10 +43,10 @@ struct ResizeOnnx
        Mat iS(szsrc, CV_64F, insrc.data());
        Mat iR(szref, CV_64F, inref.data());
        Mat S = iS, R = iR, nS, nR;
-        double alpha[6] = {1, 1, 5, 5, -1, -3};
-        double  beta[6] = {0, 7, 0, 7, +0, -7};
+        double alpha[8] = {1, -1, 5, 5, 0, -3, -2, +4};
+        double  beta[8] = {0, -0, 0, 7, 7, -7, -6, +6};
        RNG rng;
-        for (int cn = 1; cn <= 6; ++cn)
+        for (int cn = 1; cn <= 8; ++cn)
        {
            if (cn > 1)
            {
@ -59,7 +59,7 @@ struct ResizeOnnx
            {
                double eps = (depth <= CV_32S) ? 1.0 : 1e-3;
                int type = CV_MAKETYPE(depth, cn);
-                string errinfo = "failed on type " + typeToString(type);
+                string errinfo = "fail on type " + typeToString(type);
                Mat src, ref, dst;
                rand_roi(rng, src, szsrc, type);
                if (szdst.area())