From 6dd93a82edec0870d53020d961c2a5e5ce56e1af Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sat, 1 Jun 2024 16:04:36 +0800
Subject: [PATCH 01/12] cpu + ocl resize onnx done - rebase to 4.x - squash
 commit history due to so many conflicts

---
 modules/imgproc/include/opencv2/imgproc.hpp |   89 +-
 modules/imgproc/src/opencl/resize_onnx.cl   |  360 ++++++
 modules/imgproc/src/resize.cpp              | 1217 ++++++++++++++++++-
 modules/imgproc/test/ocl/test_warp.cpp      |  154 ++-
 modules/imgproc/test/test_resize_onnx.cpp   |  498 ++++++++
 modules/ts/include/opencv2/ts/ocl_test.hpp  |    4 +-
 6 files changed, 2302 insertions(+), 20 deletions(-)
 create mode 100644 modules/imgproc/src/opencl/resize_onnx.cl
 create mode 100644 modules/imgproc/test/test_resize_onnx.cpp

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 471a857f63..8e6eecb667 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -245,7 +245,7 @@ enum MorphShapes {
 //! @{
 
 //! interpolation algorithm
-enum InterpolationFlags{
+enum InterpolationFlags {
     /** nearest neighbor interpolation */
     INTER_NEAREST        = 0,
     /** bilinear interpolation */
@@ -278,6 +278,55 @@ enum InterpolationFlags{
     WARP_RELATIVE_MAP    = 32
 };
 
+//! ONNX Resize flags: bit fields that are OR-ed into the `interpolation` argument of resizeOnnx
+enum ResizeONNXFlags
+{
+    // static_assert((1 << INTER_COORDINATE_SHIFT) > INTER_MAX, "");
+    // https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
+
+    INTER_SAMPLER_SHIFT        = 0, //!< lowest bit of the sampler field
+    INTER_SAMPLER_BIT          = 4, //!< width of the sampler field, in bits
+    INTER_SAMPLER_MASK         = ((1 << INTER_SAMPLER_BIT) - 1) << INTER_SAMPLER_SHIFT, //!< mask of the sampler field
+
+    INTER_COORDINATE_SHIFT     = 4, //!< lowest bit of the coordinate transformation field
+    INTER_COORDINATE_BIT       = 3, //!< width of the coordinate transformation field, in bits
+    INTER_COORDINATE_MASK      = ((1 << INTER_COORDINATE_BIT) - 1) << INTER_COORDINATE_SHIFT, //!< mask of the coordinate transformation field
+    /** x_original = (x_resized + 0.5) / scale - 0.5 */
+    INTER_HALF_PIXEL           = 0 << INTER_COORDINATE_SHIFT,
+    /** adjustment = output_width_int / output_width
+        center = input_width / 2
+        offset = center * (1 - adjustment)
+        x_ori = offset + (x + 0.5) / scale - 0.5 */
+    INTER_HALF_PIXEL_SYMMETRIC = 1 << INTER_COORDINATE_SHIFT,
+    /** x_original = length_resized > 1 ? (x_resized + 0.5) / scale - 0.5 : 0 */
+    INTER_HALF_PIXEL_PYTORCH   = 2 << INTER_COORDINATE_SHIFT,
+    /** x_original = x_resized * (length_original - 1) / (length_resized - 1) */
+    INTER_ALIGN_CORNERS        = 3 << INTER_COORDINATE_SHIFT,
+    /** x_original = x_resized / scale */
+    INTER_ASYMMETRIC           = 4 << INTER_COORDINATE_SHIFT,
+    /** x_original = length_resized > 1
+     * ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1)
+     * : 0.5 * (start_x + end_x) * (length_original - 1) */
+    INTER_TF_CROP_RESIZE       = 5 << INTER_COORDINATE_SHIFT,
+
+    INTER_NEAREST_MODE_SHIFT   = INTER_COORDINATE_SHIFT + INTER_COORDINATE_BIT, //!< the nearest mode field starts right above the coordinate field
+    INTER_NEAREST_MODE_BIT     = 2, //!< width of the nearest mode field, in bits
+    INTER_NEAREST_MODE_MASK    = ((1 << INTER_NEAREST_MODE_BIT) - 1) << INTER_NEAREST_MODE_SHIFT, //!< mask of the nearest mode field
+    /** round half down: x =  ceil(x - 0.5) */
+    INTER_NEAREST_PREFER_FLOOR = 0 << INTER_NEAREST_MODE_SHIFT,
+    /** round half up  : x = floor(x + 0.5) */
+    INTER_NEAREST_PREFER_CEIL  = 1 << INTER_NEAREST_MODE_SHIFT,
+    /** x = floor(x) */
+    INTER_NEAREST_FLOOR        = 2 << INTER_NEAREST_MODE_SHIFT,
+    /** x =  ceil(x) */
+    INTER_NEAREST_CEIL         = 3 << INTER_NEAREST_MODE_SHIFT,
+
+    INTER_ANTIALIAS_SHIFT      = INTER_NEAREST_MODE_SHIFT + INTER_NEAREST_MODE_BIT, //!< the antialias bit sits right above the nearest mode field
+    INTER_ANTIALIAS_BIT        = 1, //!< width of the antialias field, in bits
+    INTER_ANTIALIAS_MASK       = ((1 << INTER_ANTIALIAS_BIT) - 1) << INTER_ANTIALIAS_SHIFT, //!< mask of the antialias field
+    INTER_ANTIALIAS            = 1 << INTER_ANTIALIAS_SHIFT, //!< the single flag value of the antialias field
+};
+
 /** \brief Specify the polar mapping mode
 @sa warpPolar
 */
@@ -288,11 +337,11 @@ enum WarpPolarMode
 };
 
 enum InterpolationMasks {
-       INTER_BITS      = 5,
-       INTER_BITS2     = INTER_BITS * 2,
-       INTER_TAB_SIZE  = 1 << INTER_BITS,
-       INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE
-     };
+    INTER_BITS      = 5,
+    INTER_BITS2     = INTER_BITS * 2,
+    INTER_TAB_SIZE  = 1 << INTER_BITS,
+    INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE
+};
 
 //! @} imgproc_transform
 
@@ -2418,6 +2467,34 @@ CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
                           Size dsize, double fx = 0, double fy = 0,
                           int interpolation = INTER_LINEAR );
 
+/** @brief Resizes an image following the ONNX Resize operator.
+https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
+https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
+
+The ONNX attributes `exclude_outside` and `extrapolation_value` are not supported yet.
+
+To get a result similar to resize, give dsize and:
+    INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR
+    INTER_LINEAR  : HALF_PIXEL
+    INTER_CUBIC   : HALF_PIXEL + cubicCoeff(-0.75)
+
+@param src input image.
+@param dst output image; it has the size dsize (when it is non-zero) or the size computed from src.size() and scale; the type of dst is the same as of src.
+@param dsize output image size; if it equals zero, it is computed as:
+ \f[\texttt{dsize = Size(int(scale.x * src.cols), int(scale.y * src.rows))}\f]
+ Either dsize or scale must be non-zero.
+@param scale scale factor, defined as in ONNX: scale > 1 means upsampling.
+@param interpolation sampler combined with the coordinate transformation, nearest mode and antialias flags, see #InterpolationFlags and #ResizeONNXFlags
+@param cubicCoeff coefficient of the cubic sampler; range \f$[-1.0, 0)\f$
+@param roi crop region; if provided, its coordinates are normalized in the coordinate system of the input image; it only takes effect with INTER_TF_CROP_RESIZE (ONNX tf_crop_and_resize)
+
+@sa  resize
+ */
+CV_EXPORTS_W void resizeOnnx(
+    InputArray src, OutputArray dst, Size dsize, Point2d scale = Point2d(),
+    int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
+    float cubicCoeff = -0.75f, Rect2d const& roi = Rect2d());
+
 /** @brief Applies an affine transformation to an image.
 
 The function warpAffine transforms the source image using the specified matrix:
diff --git a/modules/imgproc/src/opencl/resize_onnx.cl b/modules/imgproc/src/opencl/resize_onnx.cl
new file mode 100644
index 0000000000..03a64e2bc1
--- /dev/null
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@@ -0,0 +1,360 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifdef DOUBLE_SUPPORT // enable whichever fp64 extension macro is defined
+#   ifdef cl_amd_fp64
+#       pragma OPENCL EXTENSION cl_amd_fp64:enable
+#   elif defined (cl_khr_fp64)
+#       pragma OPENCL EXTENSION cl_khr_fp64:enable
+#   endif
+#endif
+
+#define noconvert(x) (x) // identity; presumably used as a TO_* conversion when none is needed - TODO confirm against the host code
+
+#ifndef T // fallback definitions, active only when T is not defined from outside
+#   define INTER_NEAREST1 // the `1' suffix keeps this selector inactive; only INTER_CUBIC below is enabled
+#   define INTER_LINEAR1
+#   define INTER_CUBIC
+#   define INTER_ANTIALIAS1
+#   define T int
+#   define W double
+#   define CN 3
+#   define PIXEL_SIZE 12
+#   define VT int3
+#   define VW double3
+#   define TO_WORK     convert_double
+#   define TO_VEC_WORK convert_double3
+#   define TO_TYPE     convert_int_sat_rte
+#   define TO_VEC_TYPE convert_int3_sat_rte
+#endif
+
+// pixel access: CN == 3 goes through vload3 / vstore3 on T, any other CN accesses the pixel as one VT; the kernels also take a runtime `channel' parameter to reduce the number of kernels
+#if CN != 3
+#   define loadpix(addr)        *(__global const VT*)(addr)
+#   define storepix(val, addr)  *(__global VT*)(addr) = val
+#else
+#   define loadpix(addr)       vload3(0, (__global const T*)(addr))
+#   define storepix(val, addr) vstore3(val, 0, (__global T*)(addr))
+#endif
+
+#if defined(INTER_NEAREST)
+// Nearest-neighbour kernel: one work-item copies one destination pixel from its nearest source pixel.
+__kernel void resizeOnnx_nearest(
+    __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+    __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+    int pixel_size, float offset, float m00, float m01, float m10, float m11)
+{
+    int dx = get_global_id(0); // destination column
+    int dy = get_global_id(1); // destination row
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        float fx = fma(dx, m00 , m01), fy = fma(dy, m10, m11); // source position: fx = dx * m00 + m01, fy = dy * m10 + m11
+        // `offset' is added to the source position just before rounding
+#if defined(INTER_NEAREST_PREFER_FLOOR) || defined(INTER_NEAREST_CEIL)
+        // fx, fy are expected to be >= 0, so `round toward positive infinity' is equivalent to ceil
+        int sx = convert_int_rtp(fx + offset);
+        int sy = convert_int_rtp(fy + offset);
+#else
+        // fx, fy are expected to be >= 0, so `round toward negative infinity' is equivalent to floor
+        int sx = convert_int_rtn(fx + offset);
+        int sy = convert_int_rtn(fy + offset);
+#endif
+        sx = clamp(sx, 0, src_cols - 1); // replicate the border
+        sy = clamp(sy, 0, src_rows - 1);
+        // maybe step >= 8M, so do not use `mad24' for y
+        __global uchar const* S = srcptr + (sy * src_step + mad24(sx, pixel_size, src_offset));
+        __global uchar      * D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
+        // copy one pixel with an access width chosen by the compile-time PIXEL_SIZE; other sizes use the byte loop
+#if PIXEL_SIZE == 1
+        *D = *S;
+#elif PIXEL_SIZE == 2 || PIXEL_SIZE == 4 || PIXEL_SIZE == 8 || PIXEL_SIZE == 16
+        *(__global VT*)(D) = *(__global const VT*)(S);
+#elif PIXEL_SIZE == 3
+        vstore3(vload3(0, S), 0, D);
+#elif PIXEL_SIZE == 6
+        vstore3(vload3(0, (__global ushort const*)(S)), 0, (__global ushort*)(D));
+#elif PIXEL_SIZE == 12
+        vstore3(vload3(0, (__global const uint*)(S)), 0, (__global uint*)(D));
+#elif PIXEL_SIZE == 24
+        vstore3(vload3(0, (__global ulong const*)(S)), 0, (__global ulong*)(D));
+#elif PIXEL_SIZE == 32
+        *(__global uint8*)(D) = *(__global uint8 const*)(S);
+#elif PIXEL_SIZE == 64
+        *(__global uint16*)(D) = *(__global uint16 const*)(S);
+#elif PIXEL_SIZE == 128
+        *(__global ulong16*)(D) = *(__global ulong16 const*)(S);
+#else
+        for (int i = 0; i < pixel_size; ++i) // generic fallback driven by the runtime pixel_size
+            D[i] = S[i];
+#endif
+    }
+}
+
+#elif defined(INTER_LINEAR) && !defined(INTER_ANTIALIAS)
+// Bilinear kernel: one work-item blends the 2x2 source neighbourhood of one destination pixel.
+__kernel void resizeOnnx_linear(
+    __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+    __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+    int pixel_size, int channel, float m00, float m01, float m10, float m11)
+{
+    int dx = get_global_id(0); // destination column
+    int dy = get_global_id(1); // destination row
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        float fx = fma(dx, m00, m01), fy = fma(dy, m10, m11); // source position: fx = dx * m00 + m01, fy = dy * m10 + m11
+        int ix = convert_int_rtn(fx), iy = convert_int_rtn(fy); // floor of the source position
+        float u1 = fx - ix, v1 = fy - iy; // weights of the right / bottom taps
+        float u0 = 1.f - u1, v0 = 1.f - v1; // weights of the left / top taps
+        int x0 = clamp(ix, 0, src_cols - 1); // clamp every tap on BOTH sides (replicate border):
+        int y0 = clamp(iy, 0, src_rows - 1); // a position outside the image (e.g. scale not matching
+        int x1 = clamp(ix + 1, 0, src_cols - 1); // the dst size) must not read out of bounds
+        int y1 = clamp(iy + 1, 0, src_rows - 1);
+        __global uchar const* S0 = srcptr + (y0 * src_step + mad24(x0, pixel_size, src_offset));
+        __global uchar const* S1 = srcptr + (y0 * src_step + mad24(x1, pixel_size, src_offset));
+        __global uchar const* S2 = srcptr + (y1 * src_step + mad24(x0, pixel_size, src_offset));
+        __global uchar const* S3 = srcptr + (y1 * src_step + mad24(x1, pixel_size, src_offset));
+        __global uchar      * D  = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+        VW s0 = TO_VEC_WORK(loadpix(S0)); VW s1 = TO_VEC_WORK(loadpix(S1)); // whole pixel handled as one vector
+        VW s2 = TO_VEC_WORK(loadpix(S2)); VW s3 = TO_VEC_WORK(loadpix(S3));
+        VT d0 = TO_VEC_TYPE((u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3);
+        storepix(d0, D);
+#else
+        for (int i = 0; i < channel; ++i) // other channel counts: one scalar blend per channel
+        {
+            W s0 = TO_WORK(((__global T const*)(S0))[i]);
+            W s1 = TO_WORK(((__global T const*)(S1))[i]);
+            W s2 = TO_WORK(((__global T const*)(S2))[i]);
+            W s3 = TO_WORK(((__global T const*)(S3))[i]);
+            W d0 = (u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3;
+            ((__global T*)(D))[i] = TO_TYPE(d0);
+        }
+#endif
+    }
+}
+
+#elif defined(INTER_LINEAR) && defined(INTER_ANTIALIAS)
+// Antialiased linear kernel: one work-item computes one destination pixel as a normalized, triangle-weighted sum.
+__kernel void resizeOnnx_linear_antialias(
+    __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+    __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+    int pixel_size, int channel, float m00, float m01, float m10, float m11,
+    float xscale, float yscale)
+{
+    int dx = get_global_id(0); // destination column
+    int dy = get_global_id(1); // destination row
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        int xstart = convert_int_rtn(-1.f / xscale) + 1; // horizontal taps are w in [xstart, xend), relative to ix
+        int xend = 2 - xstart;
+        int ystart = convert_int_rtn(-1.f / yscale) + 1; // vertical taps are h in [ystart, yend), relative to iy
+        int yend = 2 - ystart;
+        float fx = fma(dx, m00, m01), fy = fma(dy, m10, m11); // source position
+        int ix = convert_int_rtn(fx), iy = convert_int_rtn(fy); // floor of the source position
+        float rx = fx - ix, ry = fy - iy; // fractional parts
+        __global uchar* D = dstptr + dy * dst_step + mad24(dx, pixel_size, dst_offset);
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+        VW sumval = (VW)(0); // weighted pixel sum
+        float weight = 0;    // sum of all tap weights, used to normalize
+        for (int h = ystart; h < yend; ++h)
+        {
+            VW sline = (VW)(0);
+            float wline = 0;
+            int sy = clamp(iy + h, 0, src_rows - 1); // replicate the border
+            __global uchar const* S = srcptr + sy * src_step + src_offset;
+            for (int w = xstart; w < xend; ++w)
+            {
+                // the computation of linear's weights is trivial, so do it in kernel
+                int sx = clamp(ix + w, 0, src_cols - 1);
+                float t = fabs(w - rx) * xscale;
+                t = clamp(1.f - t, 0.f, 1.f); // triangle weight of this column
+                wline += t;
+                sline += t * TO_VEC_WORK(loadpix(S + sx * pixel_size));
+            }
+            float u = fabs(h - ry) * yscale;
+            u = clamp(1.f - u, 0.f, 1.f); // triangle weight of this row
+            weight += u * wline;
+            sumval += u * sline;
+        }
+        VT d0 = TO_VEC_TYPE(sumval / weight);
+        storepix(d0, D);
+#else
+        W sumval = 0;
+        float weight = 0; // computed once with channel 0 and reused for the other channels
+        for (int h = ystart; h < yend; ++h)
+        {
+            W sline = 0;
+            float wline = 0;
+            int sy = clamp(iy + h, 0, src_rows - 1);
+            __global uchar const* S = srcptr + sy * src_step + src_offset;
+            for (int w = xstart; w < xend; ++w)
+            {
+                int sx = clamp(ix + w, 0, src_cols - 1);
+                float t = fabs(w - rx) * xscale;
+                t = clamp(1.f - t, 0.f, 1.f);
+                wline += t;
+                sline += t * TO_WORK(((__global T const*)(S + sx * pixel_size))[0]);
+            }
+            float u = fabs(h - ry) * yscale;
+            u = clamp(1.f - u, 0.f, 1.f);
+            weight += u * wline;
+            sumval += u * sline;
+        }
+        ((__global T*)(D))[0] = TO_TYPE(sumval / weight);
+
+        for (int i = 1; i < channel; ++i) // remaining channels: same taps, weights recomputed, `weight' reused
+        {
+            sumval = 0;
+            for (int h = ystart; h < yend; ++h)
+            {
+                W sline = 0;
+                int sy = clamp(iy + h, 0, src_rows - 1);
+               __global uchar const* S = srcptr + sy * src_step + src_offset;
+                for (int w = xstart; w < xend; ++w)
+                {
+                    int sx = clamp(ix + w, 0, src_cols - 1);
+                    float t = fabs(w - rx) * xscale;
+                    t = clamp(1.f - t, 0.f, 1.f);
+                    sline += t * TO_WORK(((__global T const*)(S + sx * pixel_size))[i]);
+                }
+                float u = fabs(h - ry) * yscale;
+                u = clamp(1.f - u, 0.f, 1.f);
+                sumval += u * sline;
+            }
+            ((__global T*)(D))[i] = TO_TYPE(sumval / weight);
+        }
+#endif
+    }
+}
+
+
+#elif defined(INTER_CUBIC) && !defined(INTER_ANTIALIAS)
+// Cubic weight for a tap at signed distance x; callers pass A2 = A + 2 and A3 = A + 3 precomputed.
+float cubicCoeff(float A, float A2, float A3, float x)
+{
+    x = fabs(x);
+    if (x <= 1)
+        x = (A2 * x - A3) * x * x + 1; // |x| <= 1: (A + 2) x^3 - (A + 3) x^2 + 1
+    else if (x <= 2)
+        x = A * (((x - 5) * x + 8) * x - 4); // 1 < |x| <= 2: A (x^3 - 5 x^2 + 8 x - 4)
+    else
+        x = 0; // no contribution beyond distance 2
+    return x;
+}
+// Cubic kernel: one work-item filters the 4x4 source neighbourhood of one destination pixel.
+__kernel void resizeOnnx_cubic(
+    __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+    __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+    int pixel_size, int channel, float m00, float m01, float m10, float m11, float A)
+{
+    int dx = get_global_id(0); // destination column
+    int dy = get_global_id(1); // destination row
+    float A2 = A + 2, A3 = A + 3; // precomputed terms for cubicCoeff
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        float fx = fma(dx, m00, m01), fy = fma(dy, m10, m11); // source position
+        int xstart = convert_int_rtn(fx) - 1; // taps cover [floor(f) - 1, floor(f) + 2] on each axis
+        int ystart = convert_int_rtn(fy) - 1;
+        int xlimit = xstart + 3;
+        int ylimit = ystart + 3;
+        int xoffset[4];  // byte offset of each horizontal tap inside a row (border replicated)
+        float xcoeff[4]; // weight of each horizontal tap
+        for (int x = xstart; x <= xlimit; ++x)
+        {
+            xoffset[x - xstart] = clamp(x, 0, src_cols - 1) * pixel_size;
+            xcoeff [x - xstart] = cubicCoeff(A, A2, A3, x - fx);
+        }
+        __global uchar* D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+        VW sum = (VW)(0);
+        for (int y = ystart; y <= ylimit; ++y)
+        {
+            int yoffset = clamp(y, 0, src_rows - 1) * src_step + src_offset; // byte offset of the (clamped) source row
+            VW line = (VW)(0);
+            for (int x = 0; x < 4; ++x)
+                line += (VW)(xcoeff[x]) * TO_VEC_WORK(loadpix(srcptr + yoffset + xoffset[x]));
+            sum += line * (VW)(cubicCoeff(A, A2, A3, y - fy));
+        }
+        storepix(TO_VEC_TYPE(sum), D);
+#else
+        int yoffset[4];  // byte offset of each (clamped) source row
+        float ycoeff[4]; // weight of each vertical tap
+        for (int y = ystart; y <= ylimit; ++y)
+        {
+            yoffset[y - ystart] = clamp(y, 0, src_rows - 1) * src_step + src_offset;
+            ycoeff [y - ystart] = cubicCoeff(A, A2, A3, y - fy);
+        }
+        for (int i = 0; i < channel; ++i) // other channel counts: one scalar 4x4 filter per channel
+        {
+            W sum = 0;
+            for (int y = 0; y < 4; ++y)
+            {
+                W line = 0;
+                for (int x = 0; x < 4; ++x)
+                    line += xcoeff[x] * TO_WORK(((__global T const*)
+                                                (srcptr + yoffset[y] + xoffset[x]))[i]);
+                sum += line * ycoeff[y];
+            }
+            ((__global T*)(D))[i] = TO_TYPE(sum);
+        }
+#endif
+    }
+}
+
+#elif defined(INTER_CUBIC) && defined(INTER_ANTIALIAS)
+
+// Table-driven kernel (built for cubic + antialias): tap offsets and weights are computed outside
+// the kernel because cubic weights are heavier(?); maybe it is also ok for linear antialias resize?
+__kernel void resizeOnnx_table(
+    __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+    __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+    int pixel_size, int channel, int xkanti, int ykanti, __global int const* table)
+{
+    int dx = get_global_id(0); // destination column
+    int dy = get_global_id(1); // destination row
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        int xstride = xkanti * dst_cols; // number of horizontal table entries: xkanti taps per column
+        int ystride = ykanti * dst_rows; // number of vertical table entries: ykanti taps per row
+        __global uchar* D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
+        __global int const* xoffset = table; // table layout: [xoffset | yoffset | xcoeff | ycoeff]
+        __global int const* yoffset = xoffset + xstride;
+        __global float const* xcoeff = (__global float const*)(yoffset + ystride);
+        __global float const* ycoeff = (__global float const*)(xcoeff + xstride);
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+        VW sum = (VW)(0);
+        // exactly ykanti / xkanti iterations: tap k of row dy is entry dy + k * dst_rows (same scheme for columns)
+        for (int y = dy; y < ystride; y += dst_rows)
+        {
+            // offsets are already clamped; xoffset is in bytes (added to a uchar pointer), yoffset is a row index
+            __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
+            VW line = (VW)(0);
+            for (int x = dx; x < xstride; x += dst_cols)
+                line += xcoeff[x] * TO_VEC_WORK(loadpix(S + xoffset[x]));
+            sum += line * ycoeff[y];
+        }
+        storepix(TO_VEC_TYPE(sum), D);
+#else
+        for (int i = 0; i < channel; ++i) // other channel counts: one scalar pass per channel
+        {
+            W sum = 0;
+            for (int y = dy; y < ystride; y += dst_rows)
+            {
+                __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
+                W line = 0;
+                for (int x = dx; x < xstride; x += dst_cols)
+                    line += xcoeff[x] * TO_WORK(((__global T const*)(S + xoffset[x]))[i]);
+                sum += line * ycoeff[y];
+            }
+            ((__global T*)(D))[i] = TO_TYPE(sum);
+        }
+#endif
+    }
+}
+
+#else 
+
+#error "empty kernel"
+
+#endif
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 7e45f1e0f4..9fa892de98 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -946,6 +946,60 @@ static inline void interpolateLanczos4( float x, float* coeffs )
         coeffs[i] *= sum;
 }
 
+/**
+ * Builds the linear map from a dst index x to a src position for one axis: x_org = a * x + b,
+ * returned as Vec2f(a, b). `start' / `end' (normalized roi) are only used by INTER_TF_CROP_RESIZE.
+ * note: scale may be user input and need not match the ratio between dst and src;
+ * following onnx, length_resized is src * scale (float), not dst (int). Unknown modes raise StsBadArg.
+ */
+static Vec2f interCoordinate(int coordinate, int dst, int src, double scale, double start, double end)
+{
+    float a, b;
+    if (coordinate == INTER_HALF_PIXEL
+        || coordinate == INTER_HALF_PIXEL_SYMMETRIC
+        || coordinate == INTER_HALF_PIXEL_PYTORCH)
+    {
+        a = static_cast<float>(1.0 / scale);
+        b = static_cast<float>(0.5 / scale - 0.5);
+        if (coordinate == INTER_HALF_PIXEL_SYMMETRIC)
+            b += static_cast<float>(0.5 * (src - dst / scale)); // centre the sampling grid on the source
+        if (coordinate == INTER_HALF_PIXEL_PYTORCH && dst <= 1)
+        {
+            a = 0.f;
+            b = -0.5f; // NOTE(review): the enum doc says 0 here; -0.5 presumably follows the onnx reference code - confirm
+        }
+    }
+    else if (coordinate == INTER_ALIGN_CORNERS)
+    {
+        a = dst <= 1 ? 0.f : static_cast<float>((src - 1.0) / (src * scale - 1.0)); // a single output sample maps to 0; avoids dividing by (1 - 1)
+        b = 0.f;
+    }
+    else if (coordinate == INTER_ASYMMETRIC)
+    {
+        a = static_cast<float>(1.0 / scale);
+        b = 0.f;
+    }
+    else if (coordinate == INTER_TF_CROP_RESIZE)
+    {
+        CV_CheckGE(start, 0.0, "roi's start is out of image");
+        CV_CheckLE(end  , 1.0, "roi's end   is out of image");
+        CV_CheckLT(start, end, "roi's start must be less than its end");
+        if (dst <= 1)
+        {
+            a = 0.f;
+            b = static_cast<float>(0.5 * (start + end) * (src - 1.0)); // a single output sample maps to the roi centre
+        }
+        else
+        {
+            a = static_cast<float>((end - start) * (src - 1.0) / (src * scale - 1.0));
+            b = static_cast<float>(start * (src - 1.0));
+        }
+    }
+    else
+        CV_Error(Error::StsBadArg, format("Unknown coordinate transformation mode %d", coordinate));
+    return Vec2f(a, b);
+}
+
 template<typename ST, typename DT> struct Cast
 {
     typedef ST type1;
@@ -1231,6 +1285,128 @@ static void resizeNN_bitexact( const Mat& src, Mat& dst, double /*fx*/, double /
     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }
 
+class ResizeOnnxNNInvoker : public ParallelLoopBody // nearest-neighbour resizeOnnx body: fills the dst rows of a given Range
+{
+    Mat src;
+    Mat& dst;
+    Matx22f M;    // row 0 = (a, b) of the x mapping, row 1 = (a, b) of the y mapping
+    int mode;     // nearest mode, compared against the INTER_NEAREST_* values
+    float offset; // added before rounding: -0.5 for PREFER_FLOOR, +0.5 for PREFER_CEIL, otherwise 0
+    AutoBuffer<size_t> x_ofs; // per dst column: byte offset of the chosen source pixel inside a source row
+    ResizeOnnxNNInvoker(const ResizeOnnxNNInvoker&);            // private, only declared here
+    ResizeOnnxNNInvoker& operator=(const ResizeOnnxNNInvoker&); // private, only declared here
+
+    int srcIndex(int x, float a, float b) const // unclamped source index for dst index x under x_org = a * x + b
+    {
+        // offset can not add to M(0, 1) and M(1, 1) directly
+        // due to the small float error near integer
+        float f = fmaf(static_cast<float>(x), a, b);
+        if (mode == INTER_NEAREST_PREFER_FLOOR ||
+            mode == INTER_NEAREST_CEIL)
+            x = cvCeil(f + offset);  // PREFER_FLOOR: ceil(f - 0.5); CEIL: ceil(f)
+        else
+            x = cvFloor(f + offset); // PREFER_CEIL: floor(f + 0.5); FLOOR: floor(f)
+        return x;
+    }
+
+public:
+    ResizeOnnxNNInvoker(Mat const& _src, Mat& _dst, const Matx22f& _M, int _mode)
+        : src(_src), dst(_dst), M(_M), mode(_mode)
+    {
+        offset = 0.f;
+        if (mode == INTER_NEAREST_PREFER_FLOOR)
+            offset = -0.5f;
+        if (mode == INTER_NEAREST_PREFER_CEIL)
+            offset = +0.5f;
+
+        x_ofs.allocate(dst.cols); // the column table is built once and shared by all rows
+        size_t pix_size = src.elemSize();
+        for (int x = 0; x < dst.cols; ++x)
+        {
+            int sx = srcIndex(x, M(0, 0), M(0, 1));
+            sx = min(max(sx, 0), src.cols - 1); // replicate the border
+            x_ofs[x] = sx * pix_size;
+        }
+    }
+
+    virtual void operator() (const Range& range) const CV_OVERRIDE // copies rows [range.start, range.end) of dst
+    {
+        int width = dst.cols;
+        size_t pix_size = src.elemSize();
+        for (int y = range.start; y < range.end; ++y)
+        {
+            uchar* D = dst.ptr(y);
+            int sy = srcIndex(y, M(1, 0), M(1, 1));
+            sy = min(max(sy, 0), src.rows - 1); // replicate the border
+            uchar const* S = src.ptr(sy);
+            int x = 0;
+
+            switch (pix_size) // specialised copies for common pixel sizes, generic byte loop otherwise
+            {
+            case 1:
+                for (; x <= width - 2; x += 2) // two pixels per iteration
+                {
+                    uchar t0 = S[x_ofs[x    ]];
+                    uchar t1 = S[x_ofs[x + 1]];
+                    D[x    ] = t0;
+                    D[x + 1] = t1;
+                }
+                for (; x < width; ++x)
+                    D[x] = S[x_ofs[x]];
+                break;
+            case 2:
+                for (; x < width; ++x)
+                    reinterpret_cast<short*>(D)[x] = *(reinterpret_cast<short const*>(S + x_ofs[x]));
+                break;
+            case 3:
+                for (; x < width; ++x, D += 3)
+                {
+                    const uchar* _tS = S + x_ofs[x];
+                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
+                }
+                break;
+            case 4:
+                for (; x < width; ++x)
+                    reinterpret_cast<int*>(D)[x] = *(reinterpret_cast<int const*>(S + x_ofs[x]));
+                break;
+            case 6:
+                for (; x < width; ++x, D += 6)
+                {
+                    short const* _tS = reinterpret_cast<short const*>(S + x_ofs[x]);
+                    short* _tD = reinterpret_cast<short*>(D);
+                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
+                }
+                break;
+            case 8:
+                for (; x < width; ++x)
+                    reinterpret_cast<int64*>(D)[x] = *(reinterpret_cast<int64 const*>(S + x_ofs[x]));
+                break;
+            case 12:
+                for (; x < width; ++x, D += 12)
+                {
+                    int const* _tS = reinterpret_cast<int const*>(S + x_ofs[x]);
+                    int* _tD = reinterpret_cast<int*>(D);
+                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
+                }
+                break;
+#if CV_SIMD128
+            case 16: // without CV_SIMD128, 16-byte pixels take the default branch
+                for (; x < width; ++x, D += 16)
+                    v_store(D, v_load(S + x_ofs[x]));
+                break;
+#endif
+            default:
+                for (; x < width; ++x, D += pix_size)
+                {
+                    uchar const* _tS = S + x_ofs[x];
+                    for (size_t k = 0; k < pix_size; ++k)
+                        D[k] = _tS[k];
+                }
+            }
+        }
+    }
+};
+
 struct VResizeNoVec
 {
     template<typename WT, typename T, typename BT>
@@ -2924,16 +3100,15 @@ public:
         int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
         ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
         scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
-    {
-    }
+    {}
 
     virtual void operator() (const Range& range) const CV_OVERRIDE
     {
         Size ssize = src.size(), dsize = dst.size();
         int cn = src.channels();
-        int area = scale_x*scale_y;
-        float scale = 1.f/(area);
-        int dwidth1 = (ssize.width/scale_x)*cn;
+        int area = scale_x * scale_y;
+        float scale = 1.f / area;
+        int dwidth1 = ssize.width / scale_x * cn;
         dsize.width *= cn;
         ssize.width *= cn;
         int dy, dx, k = 0;
@@ -2989,8 +3164,9 @@ public:
                         count++;
                     }
                 }
-
-                D[dx] = saturate_cast<T>((float)sum/count);
+                // sum may be a double, so it is not cast to float (that would lose precision);
+                // count is converted instead, which is exact as long as count < 2^23
+                D[dx] = saturate_cast<T>(sum / static_cast<float>(count));
             }
         }
     }
@@ -3260,6 +3436,561 @@ static void resizeArea_( const Mat& src, Mat& dst,
 }
 
 
+class ResizeOnnxCtrl
+{
+   utils::BufferArea area;
+
+public:
+    struct TabIdx // one tap of an antialias table: dst[di] accumulates src[si] * weight
+    {
+        int si, di; // index on src / dst, already multiplied by the cn given to the table builder
+        union { float f; double d; }; // coefficient / weight, stored in the member matching the working type
+        // the accessors only read the weight, so they are usable on const entries as well
+        void as(float&  v) const { v = f; }
+        void as(double& v) const { v = d; }
+    };
+
+    /* resize parameter */
+    bool is_fixpt, is_double;
+    int ksize, xkanti, ykanti;
+
+    /* for antialias resize */
+    TabIdx* xtab;
+    TabIdx* ytab;
+    /* for generic resize */
+    int* xofs;
+    int* yofs;
+    double* xcoeffs;
+    double* ycoeffs;
+    int xmin, xmax;
+
+private:
+    void cubic_coeffs_antialias(int dstlen, int cn, float srcpos, float scale, int srclen, float A, TabIdx* elem)
+    { // fills elem[0 .. len) with the normalized cubic taps of one destination sample
+        scale = min(scale, 1.f); // values above 1 are clamped, so the kernel is never narrowed
+        int index = cvFloor(srcpos);
+        float ratio = srcpos - index; // fractional part of the source position
+        int start = cvFloor(-2.f / scale) + 1; // first tap offset, relative to index
+        int end = 2 - start; // one past the last tap offset
+        int len = end - start; // number of entries written to elem
+        float sum = 0;
+        for (int i = start; i < end; ++i)
+        {
+            float x = fabsf(i - ratio) * scale; // distance to the tap, multiplied by scale
+            if (x <= 1)
+                x = ((A + 2) * x - (A + 3)) * x * x + 1;
+            else if (x <= 2)
+                x = A * (((x - 5) * x + 8) * x - 4);
+            else
+                x = 0;
+            elem[i - start].di = cn * dstlen; // callers in this class pass the destination index as dstlen
+            elem[i - start].si = cn * min(max(index + i, 0), srclen - 1); // clamped to [0, srclen - 1]
+            elem[i - start].f = x;
+            sum += x;
+        }
+        for (int i = 0; i < len; ++i) // divide by the sum so the weights add up to 1
+        {
+            if (is_double)
+                elem[i].d = elem[i].f / sum; // the float weight is replaced by its double form
+            else
+                elem[i].f = elem[i].f / sum;
+        }
+    }
+
+    void cubic_coeffs(float x, float A, float* coeffs)
+    { // writes the 4 cubic weights for the fraction x into coeffs; A is the cubic coefficient
+        coeffs[0] = A * ((((x + 1) - 5) * (x + 1) + 8) * (x + 1) - 4); // tap at distance x + 1
+        coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1; // tap at distance x
+        coeffs[2] = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1; // tap at distance 1 - x
+        coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; // remainder, so the four weights sum to 1
+    }
+
+    void linear_coeffs_antialias(int dstlen, int cn, float srcpos, float scale, int srclen, TabIdx* elem)
+    { // fills elem[0 .. len) with the normalized linear (triangle) taps of one destination sample
+        scale = min(scale, 1.f); // values above 1 are clamped, so the kernel is never narrowed
+        int index = cvFloor(srcpos);
+        float ratio = srcpos - index; // fractional part of the source position
+        int start = cvFloor(-1.f / scale) + 1; // first tap offset, relative to index
+        int end = 2 - start; // one past the last tap offset
+        int len = end - start; // number of entries written to elem
+        float sum = 0.f;
+        for (int i = start; i < end; ++i)
+        {
+            float x = fabsf(i - ratio) * scale; // distance to the tap, multiplied by scale
+            x = min(max(1.f - x, 0.f), 1.f); // weight 1 - distance, clamped to [0, 1]
+            elem[i - start].di = cn * dstlen; // callers in this class pass the destination index as dstlen
+            elem[i - start].si = cn * min(max(index + i, 0), srclen - 1); // clamped to [0, srclen - 1]
+            elem[i - start].f = x;
+            sum += x;
+        }
+        for (int i = 0; i < len; ++i) // divide by the sum so the weights add up to 1
+        {
+            if (is_double)
+                elem[i].d = elem[i].f / sum; // the float weight is replaced by its double form
+            else
+                elem[i].f = elem[i].f / sum;
+        }
+    }
+
+    void linear_coeffs(float x, float* coeffs)
+    {
+        // two-tap linear weights: the second tap gets the fraction x, the first one the rest
+        coeffs[1] = x, coeffs[0] = 1.f - x;
+    }
+
+   public:
+    ResizeOnnxCtrl(int interpolation, int type, float cubicCoeff,
+        Size ssize, Size dsize, Point2d const& scaled, Matx22f const& M)
+    { // precomputes the x / y tables; row i of M maps a dst index to src as M(i, 0) * dst + M(i, 1)
+        int sampler = interpolation & INTER_SAMPLER_MASK;
+        int antialias = interpolation & INTER_ANTIALIAS_MASK;
+        Point2f scale = static_cast<Point2f>(scaled);
+        CV_CheckGE(cubicCoeff, -1.f, "cubic coefficient should range [-1, 0)");
+        CV_CheckLT(cubicCoeff, +0.f, "cubic coefficient should range [-1, 0)");
+        CV_Check(sampler, sampler == INTER_LINEAR || sampler == INTER_CUBIC,
+            "should not error");
+        // weight storage: short for 8-bit depths, double for CV_32S / CV_64F, float otherwise
+        int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
+        ksize = (sampler == INTER_LINEAR ? 2 : 4);
+        xmin = 0; xmax = dsize.width; // defined for every mode; only the generic x table narrows them
+        is_fixpt = (depth == CV_8U || depth == CV_8S);
+        is_double = (depth == CV_32S || depth == CV_64F);
+        xtab = ytab = nullptr;
+        xofs = yofs = nullptr;
+        xcoeffs = ycoeffs = nullptr;
+        int khalf = ksize / 2;
+        xkanti = 2 * cvCeil(khalf / min(scale.x, 1.f)); // taps per destination column in xtab
+        ykanti = 2 * cvCeil(khalf / min(scale.y, 1.f)); // taps per destination row in ytab
+        area.allocate(xtab, xkanti * dsize.width );
+        area.allocate(ytab, ykanti * dsize.height);
+        area.allocate(xofs, dsize.width * cn + 1);
+        area.allocate(yofs, dsize.height * 1 + 1);
+        area.allocate(xcoeffs, ksize * dsize.width * cn);
+        area.allocate(ycoeffs, ksize * dsize.height * 1);
+        area.commit();
+        CV_CheckLE(ksize, MAX_ESIZE, "resampler kernel's size is too larger");
+        // horizontal direction
+        if (antialias)
+        {
+            float a = M(0, 0), b = M(0, 1);
+            for (int d = 0; d < dsize.width; ++d)
+            {
+                float f = fmaf(static_cast<float>(d), a, b);
+                if (sampler == INTER_LINEAR)
+                    linear_coeffs_antialias(d, cn, f, scale.x, ssize.width, xtab + d * xkanti);
+                else // if (sampler == INTER_CUBIC)
+                    cubic_coeffs_antialias(d, cn, f, scale.x, ssize.width, cubicCoeff, xtab + d * xkanti);
+            }
+        }
+        else
+        {
+            xkanti = 0; // zero makes the invoker take the generic horizontal path
+            // xmin / xmax were preset to the whole row above; the loop below shrinks them
+            // to the columns whose whole kernel footprint lies inside the source row
+            float cbuf[MAX_ESIZE];
+            float a = M(0, 0), b = M(0, 1);
+            for (int d = 0; d < dsize.width; ++d)
+            {
+                float f = fmaf(static_cast<float>(d), a, b);
+                int s = cvFloor(f);
+                f -= s;
+                if (s < khalf - 1) {
+                    xmin = d + 1;
+                    if (s < 0 && sampler == INTER_LINEAR)
+                        f = 0, s = 0;
+                }
+                if (s + khalf >= ssize.width)
+                {
+                    xmax = min(xmax, d);
+                    if (s >= ssize.width - 1 && sampler == INTER_LINEAR)
+                        f = 0, s = ssize.width - 1;
+                }
+                for (int k = 0; k < cn; ++k)
+                    xofs[cn * d + k] = cn * s + k;
+                if (sampler == INTER_LINEAR)
+                    linear_coeffs(f, cbuf);
+                else // if (sampler == INTER_CUBIC)
+                    cubic_coeffs(f, cubicCoeff, cbuf);
+                if (is_fixpt)
+                {
+                    short* coeffs = reinterpret_cast<short*>(xcoeffs) + cn * ksize * d;
+                    for (int k = 0; k < ksize; ++k)
+                        coeffs[k] = saturate_cast<short>(cbuf[k] * INTER_RESIZE_COEF_SCALE);
+                    for (int k = ksize; k < cn * ksize; ++k) // repeat the weights for every channel
+                        coeffs[k] = coeffs[k - ksize];
+                }
+                else if (is_double)
+                {
+                    double* coeffs = xcoeffs + cn * ksize * d;
+                    for (int k = 0; k < ksize; ++k)
+                        coeffs[k] = cbuf[k];
+                    for (int k = ksize; k < cn * ksize; ++k) // repeat the weights for every channel
+                        coeffs[k] = coeffs[k - ksize];
+                }
+                else
+                {
+                    float* coeffs = reinterpret_cast<float*>(xcoeffs) + cn * ksize * d;
+                    for (int k = 0; k < ksize; ++k)
+                        coeffs[k] = cbuf[k];
+                    for (int k = ksize; k < cn * ksize; ++k) // repeat the weights for every channel
+                        coeffs[k] = coeffs[k - ksize];
+                }
+            }
+        }
+        // vertical direction
+        if (antialias)
+        {
+            float a = M(1, 0), b = M(1, 1);
+            for (int d = 0; d < dsize.height; ++d)
+            {
+                float f = fmaf(static_cast<float>(d), a, b);
+                if (sampler == INTER_LINEAR)
+                    linear_coeffs_antialias(d, 1, f, scale.y, ssize.height, ytab + d * ykanti);
+                else // if (sampler == INTER_CUBIC)
+                    cubic_coeffs_antialias(d, 1, f, scale.y, ssize.height, cubicCoeff, ytab + d * ykanti);
+            }
+        }
+        else
+        {
+            ykanti = 0; // zero makes the invoker take the generic vertical path
+            float cbuf[MAX_ESIZE];
+            float a = M(1, 0), b = M(1, 1);
+            for (int d = 0; d < dsize.height; ++d)
+            {
+                float f = fmaf(static_cast<float>(d), a, b);
+                int s = cvFloor(f);
+                f -= s;
+                yofs[d] = s; // stored unclamped; the invoker clamps each row index itself
+                if (sampler == INTER_LINEAR)
+                    linear_coeffs(f, cbuf);
+                else // if (sampler == INTER_CUBIC)
+                    cubic_coeffs(f, cubicCoeff, cbuf);
+                if (is_fixpt)
+                {
+                    short* coeffs = reinterpret_cast<short*>(ycoeffs) + 1 * ksize * d;
+                    for (int k = 0; k < ksize; ++k)
+                        coeffs[k] = saturate_cast<short>(cbuf[k] * INTER_RESIZE_COEF_SCALE);
+                }
+                else if (is_double)
+                {
+                    double* coeffs = ycoeffs + 1 * ksize * d;
+                    for (int k = 0; k < ksize; ++k)
+                        coeffs[k] = cbuf[k];
+                }
+                else
+                {
+                    float* coeffs = reinterpret_cast<float*>(ycoeffs) + 1 * ksize * d;
+                    for (int k = 0; k < ksize; ++k)
+                        coeffs[k] = cbuf[k];
+                }
+            }
+        }
+    }
+};
+
+template <typename HResize, typename VResize, typename IdxT>
+class ResizeOnnxInvoker : public ParallelLoopBody
+{
+    Mat const& src;
+    Mat& dst;
+    ResizeOnnxCtrl const& ctrl;
+    HResize hresize;
+    VResize vresize;
+
+    ResizeOnnxInvoker& operator =(ResizeOnnxInvoker const&);
+    
+public:
+    typedef typename HResize::value_type T;
+    typedef typename HResize::buf_type WT;
+    typedef typename HResize::alpha_type AT;
+
+    ResizeOnnxInvoker(const Mat& _src, Mat& _dst, ResizeOnnxCtrl const& _ctrl) :
+        src(_src), dst(_dst), ctrl(_ctrl)
+    {
+        CV_CheckLE(ctrl.ksize, MAX_ESIZE, "resampler kernel's size is too larger");
+        CV_Check(ctrl.is_fixpt, !(ctrl.is_fixpt && ctrl.is_double), "can not be both types");
+        // the expected types depend on run-time flags of ctrl, so static_assert cannot express this
+#ifdef CV_CXX11
+        // check generic resize: T / WT / AT must match the coefficient storage chosen by ctrl
+        if (ctrl.is_fixpt)
+        {
+            CV_Check(ctrl.is_fixpt, (std::is_same<AT, short>::value),
+                "when use fixpt / short coeffs, AT is expected to be short");
+            CV_Check(sizeof(T) * 10 + sizeof(WT),
+                (std::is_same<WT, int>::value
+                    && (std::is_same<T, uchar>::value || std::is_same<T, schar>::value)),
+                "something wrong");
+        }
+        else if (ctrl.is_double)
+        {
+            CV_Check(ctrl.is_double, (std::is_same<AT, double>::value),
+                "when use double coeffs, AT is expected to be double");
+            CV_Check(sizeof(T) * 10 + sizeof(WT),
+                (std::is_same<WT, double>::value &&
+                    (std::is_same<T, int>::value || std::is_same<T, double>::value)),
+                "something wrong");
+        }
+        else
+        {
+            CV_Check(sizeof(AT), (std::is_same<AT, float>::value),
+                "when use float coeffs, AT is expected to be float");
+            CV_Check(sizeof(T) * 10 + sizeof(WT),
+                (std::is_same<WT, float>::value
+                    && (std::is_same<T, short>::value || std::is_same<T, ushort>::value
+                        || std::is_same<T, float>::value)),
+                "something wrong");
+        }
+        // check antialias resize: IdxT is the type the TabIdx weights are read as
+        if (ctrl.is_double)
+        {
+            CV_Check(ctrl.is_double, (std::is_same<IdxT, double>::value),
+                "when use double coeffs, IdxT is expected to be double");
+        }
+        else
+        {
+            CV_Check(ctrl.is_double, (std::is_same<IdxT, float>::value),
+                "when use float / fixpt coeffs, IdxT is expected to be float");
+        }
+        CV_Check(sizeof(IdxT) * 10 + sizeof(WT),
+            (std::is_same<IdxT, typename std::common_type<IdxT, WT>::type>::value),
+            "something wrong");
+#endif
+    }
+
+    void hori_antialias_accumulate(T const* S, IdxT* L) const
+    { // adds the taps of ctrl.xtab applied to the source row S into L; L is not cleared here
+        IdxT alpha;
+        int const cn = dst.channels();
+        int const len = ctrl.xkanti * dst.cols; // number of xtab entries that are walked
+        if (cn == 1)
+            for (int k = 0; k < len; ++k)
+            {
+                int di = ctrl.xtab[k].di;
+                int si = ctrl.xtab[k].si;
+                ctrl.xtab[k].as(alpha); // read the weight as IdxT
+                L[di] += S[si] * alpha;
+            }
+        else if (cn == 2) // channel loop unrolled by hand
+            for (int k = 0; k < len; ++k)
+            {
+                int di = ctrl.xtab[k].di;
+                int si = ctrl.xtab[k].si;
+                ctrl.xtab[k].as(alpha);
+                L[di    ] += S[si    ] * alpha;
+                L[di + 1] += S[si + 1] * alpha;
+            }
+        else if (cn == 3) // channel loop unrolled by hand
+            for (int k = 0; k < len; ++k)
+            {
+                int di = ctrl.xtab[k].di;
+                int si = ctrl.xtab[k].si;
+                ctrl.xtab[k].as(alpha);
+                L[di    ] += S[si    ] * alpha;
+                L[di + 1] += S[si + 1] * alpha;
+                L[di + 2] += S[si + 2] * alpha;
+            }
+        else if (cn == 4) // channel loop unrolled by hand
+            for (int k = 0; k < len; ++k)
+            {
+                int di = ctrl.xtab[k].di;
+                int si = ctrl.xtab[k].si;
+                ctrl.xtab[k].as(alpha);
+                L[di    ] += S[si    ] * alpha;
+                L[di + 1] += S[si + 1] * alpha;
+                L[di + 2] += S[si + 2] * alpha;
+                L[di + 3] += S[si + 3] * alpha;
+            }
+        else 
+            for (int k = 0; k < len; ++k) // any other channel count
+            {
+                int di = ctrl.xtab[k].di;
+                int si = ctrl.xtab[k].si;
+                ctrl.xtab[k].as(alpha);
+                for (int c = 0; c < cn; ++c)
+                    L[di + c] += S[si + c] * alpha;
+            }
+    }
+
+    void hori_antialias_lines(T const** srcptr, WT** dstptr, IdxT* L, int count) const
+    { // filters count source rows horizontally with the antialias table; row i goes to dstptr[i]
+        int cn = dst.channels();
+        int dwidth = dst.cols * cn;
+#ifdef CV_CXX11
+        constexpr bool same_wt_idxt = std::is_same<WT, IdxT>::value;
+#else 
+        bool const same_wt_idxt = false;
+#endif
+        for (int i = 0; i < count; ++i)
+        {
+            T const* S = srcptr[i];
+            // when WT and IdxT are the same type, accumulate straight into the destination row
+            if (same_wt_idxt)
+                L = reinterpret_cast<IdxT*>(dstptr[i]);
+            memset(L, 0, sizeof(IdxT) * dwidth); // the accumulate call only adds, so start from zero
+            hori_antialias_accumulate(S, L);
+            if (!same_wt_idxt) // otherwise L is a scratch row that still has to be converted to WT
+            {
+                WT* D = dstptr[i];
+                if (ctrl.is_fixpt)
+                {
+                    float const alpha = INTER_RESIZE_COEF_SCALE; // scale up before the cast to WT
+                    for (int k = 0; k < dwidth; ++k)
+                        D[k] = saturate_cast<WT>(L[k] * alpha);
+                }
+                else
+                {
+                    for (int k = 0; k < dwidth; ++k)
+                        D[k] = saturate_cast<WT>(L[k]);
+                }
+            }
+        }
+    }
+
+    void hori_generic_lines(T const** srcptr, WT** dstptr, int count) const
+    {
+        // the HResize functor does all the work; the widths and the xmin / xmax
+        // range are handed over in elements, hence the scaling by the channel count
+        int const channels = src.channels();
+        AT const* const alpha = reinterpret_cast<AT const*>(ctrl.xcoeffs);
+        int const swidth = src.cols * channels;
+        int const dwidth = dst.cols * channels;
+        hresize(srcptr, dstptr, count, ctrl.xofs, alpha,
+            swidth, dwidth, channels,
+            ctrl.xmin * channels, ctrl.xmax * channels);
+    }
+
+    void vert_antialias_hori_antialias(int dy, IdxT* L, IdxT* A) const
+    { // writes destination row dy using antialias tables in both directions; L and A are scratch rows
+        // [tstart, tend) is the slice of ytab that belongs to destination row dy
+        int dwidth = dst.channels() * dst.cols;
+        int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
+        memset(A, 0, dwidth * sizeof(IdxT)); // A accumulates the vertically weighted rows
+        for (int t = tstart; t < tend; ++t)
+        {
+            IdxT beta;
+            int sy = ctrl.ytab[t].si; // source row of this tap
+            CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
+            ctrl.ytab[t].as(beta); // vertical weight of this tap
+            memset(L, 0, dwidth * sizeof(IdxT)); // the accumulate call only adds, so start from zero
+            hori_antialias_accumulate(src.template ptr<T>(sy), L);
+            for (int w = 0; w < dwidth; ++w)
+                A[w] += L[w] * beta;
+        }
+        T* D = dst.template ptr<T>(dy);
+        for (int w = 0; w < dwidth; ++w)
+            D[w] = saturate_cast<T>(A[w]);
+    }
+
+    void vert_antialias_hori_generic(int dy, WT* L, IdxT* A) const
+    { // writes destination row dy: generic horizontal pass, antialias table vertically
+        // FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS> cast;
+        int dwidth = dst.channels() * dst.cols;
+        int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti; // slice of ytab for row dy
+        memset(A, 0, dwidth * sizeof(IdxT)); // A accumulates the vertically weighted rows
+        for (int t = tstart; t < tend; ++t)
+        {
+            IdxT beta;
+            int sy = ctrl.ytab[t].si; // source row of this tap
+            CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
+            ctrl.ytab[t].as(beta); // vertical weight of this tap
+            T const* S = src.template ptr<T>(sy);
+            hori_generic_lines(&S, &L, 1); // one horizontally resized row in L
+            if (ctrl.is_fixpt)
+                beta /= INTER_RESIZE_COEF_SCALE; // presumably undoes the fixed-point scale of L; confirm against HResize
+            for (int w = 0; w < dwidth; ++w)
+                A[w] += L[w] * beta;
+        }
+        T* D = dst.template ptr<T>(dy);
+        for (int w = 0; w < dwidth; ++w)
+            D[w] = saturate_cast<T>(A[w]);
+    }
+
+    void vert_antialias(Range const& range) const
+    {
+        // scratch memory: one horizontally filtered row followed by one accumulator row
+        int const width = dst.channels() * dst.cols;
+        AutoBuffer<IdxT> scratch(width * 2);
+        IdxT* const row = scratch.data();
+        IdxT* const acc = row + width;
+        for (int dy = range.start; dy < range.end; ++dy)
+        {
+            // the generic horizontal pass produces WT values, so it gets the row viewed as WT
+            if (!ctrl.xkanti)
+                vert_antialias_hori_generic(dy, reinterpret_cast<WT*>(row), acc);
+            else
+                vert_antialias_hori_antialias(dy, row, acc);
+        }
+    }
+
+    void vert_generic(Range const& range) const
+    { // generic vertical pass: keeps ksize horizontally resized rows and combines them with vresize
+        int ksize = ctrl.ksize, ksize2 = ksize / 2;
+        int cn = src.channels();
+        int dwidth = dst.cols * cn;
+        size_t bufstep = alignSize(dwidth, CV_SIMD_WIDTH / sizeof(IdxT)); // length of one buffered row
+        AutoBuffer<IdxT> _buffer(bufstep * (ksize + 1)); // ksize cached rows plus one scratch row
+        T const* srows[MAX_ESIZE] = {0};
+        WT* rows[MAX_ESIZE] = {0};
+        int prev_sy[MAX_ESIZE]; // source row currently held in each cache slot, -1 when empty
+        IdxT* L = _buffer.data() + bufstep * ksize; // scratch row for hori_antialias_lines
+        for (int k = 0; k < ksize; ++k)
+        {
+            prev_sy[k] = -1;
+            rows[k] = reinterpret_cast<WT*>(_buffer.data() + bufstep * k);
+        }
+        AT const* beta = reinterpret_cast<AT const*>(ctrl.ycoeffs) + ksize * range.start;
+        for (int dy = range.start; dy < range.end; ++dy, beta += ksize)
+        {
+            int sy0 = ctrl.yofs[dy], k0 = ksize, k1 = 0;
+            for(int k = 0; k < ksize; k++ )
+            {
+                int sy = min(max(sy0 - ksize2 + 1 + k, 0), src.rows - 1); // clamp to the image rows
+                for (k1 = max(k1, k); k1 < ksize; ++k1)
+                {
+                    // if the sy-th row has been computed already, reuse it.
+                    if (sy == prev_sy[k1])
+                    {
+                        if (k1 > k)
+                            memcpy(rows[k], rows[k1], bufstep * sizeof(WT));
+                        break;
+                    }
+                }
+                // remember the first row that needs to be computed
+                if( k1 == ksize )
+                    k0 = min(k0, k);
+                srows[k] = src.template ptr<T>(sy);
+                prev_sy[k] = sy;
+            }
+            // rows k0 .. ksize - 1 are recomputed; k0 == ksize means every row was reused
+            if (k0 < ksize)
+            {
+                if (ctrl.xkanti)
+                    hori_antialias_lines(srows + k0, rows + k0, L, ksize - k0);
+                else 
+                    hori_generic_lines(srows + k0, rows + k0, ksize - k0);
+            }
+            vresize(const_cast<WT const**>(rows), dst.template ptr<T>(dy), beta, dwidth);
+        }
+    }
+
+    virtual void operator() (Range const& range) const CV_OVERRIDE
+    {
+        if (!ctrl.ykanti)
+            return vert_generic(range);
+        // a non-zero ykanti selects the table driven (antialias) vertical pass
+        vert_antialias(range);
+    }
+};
+
+template <typename HResize, typename VResize, typename IdxT>
+static void resizeOnnx_(Mat const& src, Mat& dst, ResizeOnnxCtrl const& ctrl)
+{
+    // run the invoker over all destination rows; the last argument is the pixel count / 2^16
+    ResizeOnnxInvoker<HResize, VResize, IdxT> const invoker(src, dst, ctrl);
+    parallel_for_(Range(0, dst.rows), invoker, static_cast<double>(dst.rows) * dst.cols / (1 << 16));
+}
+
+
 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
                             const int* xofs, const void* alpha,
                             const int* yofs, const void* beta,
@@ -3274,6 +4005,8 @@ typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
                                 const DecimateAlpha* ytab, int ytab_size,
                                 const int* yofs);
 
+typedef void (*ResizeOnnxFunc)(Mat const& src, Mat& dst, ResizeOnnxCtrl const&);
+
 
 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
 {
@@ -3316,7 +4049,9 @@ static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, Dec
     return k;
 }
 
+
 #ifdef HAVE_OPENCL
+
 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
                                       float * const alpha_tab, int * const ofs_tab)
 {
@@ -3569,6 +4304,255 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     return k.run(2, globalsize, 0, false);
 }
 
+static void ocl_resizeOnnxTable(int srclen, int dstlen, int esz,
+    int sampler, float a, float b, float A, float scale, int* offset, float* coeff)
+{
+    // antialias tap table of one axis: clamped source offsets (times esz) and normalized weights
+    CV_Assert(sampler == INTER_LINEAR || sampler == INTER_CUBIC);
+    scale = min(scale, 1.f);
+    int start = cvFloor((sampler == INTER_LINEAR ? -1.f : -2.f) / scale) + 1;
+    int end = 2 - start;
+    int kanti = end - start; // taps per destination sample
+    for (int d = 0; d < dstlen; ++d)
+    {
+        float spos = fmaf(static_cast<float>(d), a, b); // source position of destination sample d
+        int index = cvFloor(spos);
+        float ratio = spos - index;
+        float sum = 0.f;
+        for (int i = start; i < end; ++i)
+        {
+            float x = fabsf(i - ratio) * scale;
+            if (sampler == INTER_LINEAR)
+                x = min(max(1.f - x, 0.f), 1.f); // weight is 1 - distance, as in linear_coeffs_antialias
+            else
+            {
+                if (x <= 1)
+                    x = ((A + 2) * x - (A + 3)) * x * x + 1;
+                else if (x <= 2)
+                    x = A * (((x - 5) * x + 8) * x - 4);
+                else
+                    x = 0;
+            }
+            // make work-item(s) in a work-group load offset / coeff in one / fewer memory transaction
+            // offsets & coeffs are arranged like
+            //      00     10     20     ... n0
+            //      01     11     21     ... n1     ...
+            //      0(k-1) 1(k-1) 2(k-1) ... n(k-1)
+            int to = d + (i - start) * dstlen;
+            offset[to] = min(max(index + i, 0), srclen - 1) * esz;
+            coeff [to] = x;
+            sum += x;
+        }
+        for (int i = 0; i < kanti; ++i) // normalize the taps of sample d so they sum to 1
+            coeff[d + i * dstlen] /= sum;
+    }
+}
+
+static char const* ocl_resizeOnnx_typeToString(int type, char* buf, size_t size)
+{
+    // used instead of typeToStr, whose CV_Assert would fail here; buf is only written when cn > 1
+    static char const* const tab[CV_64F + 1] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert(depth <= CV_64F); // tab has no entry beyond CV_64F
+    if (cn == 1)
+        return tab[depth];
+    CV_Assert(buf != nullptr && size >= 18); // room for the type name followed by the channel count
+    snprintf(buf, size, "%s%d", tab[depth], cn);
+    return buf;
+}
+
+static char const* ocl_resizeOnnx_convertTypeString(int sdepth, int ddepth, int cn, char* buf, size_t size)
+{
+    if (sdepth == ddepth)
+        return "noconvert";
+    char name[32];
+    char const* vtype = ocl_resizeOnnx_typeToString(CV_MAKETYPE(ddepth, cn), name, sizeof(name));
+    // these depth pairs get a bare convert_<type>; every other pair gets a _sat and / or _rte suffix
+    bool const plain = ddepth >= CV_32F
+        || (ddepth == CV_32S && sdepth < CV_32S)
+        || (ddepth == CV_16S && sdepth <= CV_8S)
+        || (ddepth == CV_16U && sdepth == CV_8U);
+    char const* suffix = plain ? "" : "_sat";
+    if (!plain && sdepth >= CV_32F)
+        suffix = (ddepth < CV_32S ? "_sat_rte" : "_rte");
+    snprintf(buf, size, "convert_%s%s", vtype, suffix);
+    return buf;
+}
+
+
+static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
+    Matx22f const& M, Point2d const& scaled, int interpolation, float cubicCoeff)
+{ // OpenCL path: picks one kernel per sampler / antialias combination; false when it cannot be built
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int sampler = interpolation & INTER_SAMPLER_MASK;
+    int nearest = interpolation & INTER_NEAREST_MODE_MASK;
+    int antialias = interpolation & INTER_ANTIALIAS_MASK;
+    Point2f scale = static_cast<Point2f>(scaled);
+    if (depth > CV_64F) // the type name table of ocl_resizeOnnx_typeToString ends at CV_64F
+        return false;
+
+    ocl::Kernel k;
+    UMat src = _src.getUMat(), dst = _dst.getUMat();
+    size_t globalsize[] = {static_cast<size_t>(dst.cols), static_cast<size_t>(dst.rows)};
+    char buf[6][64]; // storage for the vector type names and conversion names built below
+    int pixel_size = static_cast<int>(src.elemSize());
+    int T = depth, VT = type; // T / VT: scalar / vector storage type, W / VW: working type
+    String buildopts, errmsg;
+
+    if (sampler == INTER_NEAREST)
+    {
+        int W = depth, VW = type; // nearest copies pixels, so the working type is the storage type
+        float offset = (nearest == INTER_NEAREST_PREFER_FLOOR) ? -0.5f :
+                (nearest == INTER_NEAREST_PREFER_CEIL) ? 0.5f : 0.f;
+        static char const *nearest_name[4] = {
+            "INTER_NEAREST_PREFER_FLOOR", "INTER_NEAREST_PREFER_CEIL",
+            "INTER_NEAREST_FLOOR", "INTER_NEAREST_CEIL"
+        };
+        buildopts = format(
+            "-D INTER_NEAREST -D %s "
+            "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
+            "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            nearest_name[nearest >> INTER_NEAREST_MODE_SHIFT],
+            ocl_resizeOnnx_typeToString(T, nullptr, 0),
+            ocl_resizeOnnx_typeToString(W, nullptr, 0),
+            cn, pixel_size,
+            ocl_resizeOnnx_typeToString(VT, buf[0], sizeof(buf[0])),
+            ocl_resizeOnnx_typeToString(VW, buf[1], sizeof(buf[1])),
+            ocl_resizeOnnx_convertTypeString(T, W, 1, buf[2], sizeof(buf[2])),
+            ocl_resizeOnnx_convertTypeString(T, W, cn, buf[3], sizeof(buf[3])),
+            ocl_resizeOnnx_convertTypeString(W, T, 1, buf[4], sizeof(buf[4])),
+            ocl_resizeOnnx_convertTypeString(W, T, cn, buf[5], sizeof(buf[5]))
+        );
+        k.create("resizeOnnx_nearest", ocl::imgproc::resize_onnx_oclsrc,
+            buildopts, &errmsg);
+        if (k.empty())
+            return false;
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+            pixel_size, offset, M(0, 0), M(0, 1), M(1, 0), M(1, 1));
+    }
+    else if (sampler == INTER_LINEAR && !antialias)
+    {
+        // working type: float for depths below CV_32S and for CV_32F, double for CV_32S / CV_64F
+        int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
+        buildopts = format(
+            "-D INTER_LINEAR "
+            "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
+            "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            ocl_resizeOnnx_typeToString(T, nullptr, 0),
+            ocl_resizeOnnx_typeToString(W, nullptr, 0),
+            cn, pixel_size,
+            ocl_resizeOnnx_typeToString(VT, buf[0], sizeof(buf[0])),
+            ocl_resizeOnnx_typeToString(VW, buf[1], sizeof(buf[1])),
+            ocl_resizeOnnx_convertTypeString(T, W, 1, buf[2], sizeof(buf[2])),
+            ocl_resizeOnnx_convertTypeString(T, W, cn, buf[3], sizeof(buf[3])),
+            ocl_resizeOnnx_convertTypeString(W, T, 1, buf[4], sizeof(buf[4])),
+            ocl_resizeOnnx_convertTypeString(W, T, cn, buf[5], sizeof(buf[5]))
+        );
+        k.create("resizeOnnx_linear", ocl::imgproc::resize_onnx_oclsrc,
+            buildopts, &errmsg);
+        if (k.empty())
+            return false;
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+            pixel_size, cn, M(0, 0), M(0, 1), M(1, 0), M(1, 1));
+    }
+    else if (sampler == INTER_LINEAR && antialias)
+    {
+        // same build options as the plain linear branch plus INTER_ANTIALIAS; the clamped scales are extra kernel arguments
+        int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
+        buildopts = format(
+            "-D INTER_LINEAR -D INTER_ANTIALIAS "
+            "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
+            "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            ocl_resizeOnnx_typeToString(T, nullptr, 0),
+            ocl_resizeOnnx_typeToString(W, nullptr, 0),
+            cn, pixel_size,
+            ocl_resizeOnnx_typeToString(VT, buf[0], sizeof(buf[0])),
+            ocl_resizeOnnx_typeToString(VW, buf[1], sizeof(buf[1])),
+            ocl_resizeOnnx_convertTypeString(T, W, 1, buf[2], sizeof(buf[2])),
+            ocl_resizeOnnx_convertTypeString(T, W, cn, buf[3], sizeof(buf[3])),
+            ocl_resizeOnnx_convertTypeString(W, T, 1, buf[4], sizeof(buf[4])),
+            ocl_resizeOnnx_convertTypeString(W, T, cn, buf[5], sizeof(buf[5]))
+        );
+        k.create("resizeOnnx_linear_antialias", ocl::imgproc::resize_onnx_oclsrc,
+            buildopts, &errmsg);
+        if (k.empty())
+            return false;
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+            pixel_size, cn, M(0, 0), M(0, 1), M(1, 0), M(1, 1),
+            min(scale.x, 1.f), min(scale.y, 1.f));
+    }
+    else if (sampler == INTER_CUBIC && !antialias)
+    {
+        // same pattern again; the cubic coefficient is passed as the last kernel argument
+        int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
+        buildopts = format(
+            "-D INTER_CUBIC "
+            "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
+            "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            ocl_resizeOnnx_typeToString(T, nullptr, 0),
+            ocl_resizeOnnx_typeToString(W, nullptr, 0),
+            cn, pixel_size,
+            ocl_resizeOnnx_typeToString(VT, buf[0], sizeof(buf[0])),
+            ocl_resizeOnnx_typeToString(VW, buf[1], sizeof(buf[1])),
+            ocl_resizeOnnx_convertTypeString(T, W, 1, buf[2], sizeof(buf[2])),
+            ocl_resizeOnnx_convertTypeString(T, W, cn, buf[3], sizeof(buf[3])),
+            ocl_resizeOnnx_convertTypeString(W, T, 1, buf[4], sizeof(buf[4])),
+            ocl_resizeOnnx_convertTypeString(W, T, cn, buf[5], sizeof(buf[5]))
+        );
+        k.create("resizeOnnx_cubic", ocl::imgproc::resize_onnx_oclsrc,
+            buildopts, &errmsg);
+        if (k.empty())
+            return false;
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+            pixel_size, cn, M(0, 0), M(0, 1), M(1, 0), M(1, 1), cubicCoeff);
+    }
+    else if (sampler == INTER_CUBIC && antialias)
+    {
+        // the taps are precomputed on the host and uploaded as a single table
+        int ksize = 4;
+        int khalf = ksize / 2;
+        int xkanti = 2 * cvCeil(khalf / min(scale.x, 1.f)); // taps per destination column
+        int ykanti = 2 * cvCeil(khalf / min(scale.y, 1.f)); // taps per destination row
+        int xstride = xkanti * dst.cols;
+        int ystride = ykanti * dst.rows;
+        int tabsize = (xstride + ystride) * 2; // offsets plus coefficients for both axes
+        AutoBuffer<int> table(tabsize);
+        // table layout: x offsets | y offsets | x coefficients | y coefficients
+        int* xoffset = table.data();
+        int* yoffset = xoffset + xstride;
+        float* xcoeff = reinterpret_cast<float*>(yoffset + ystride);
+        float* ycoeff = reinterpret_cast<float*>(xcoeff + xstride);
+        ocl_resizeOnnxTable(src.cols, dst.cols, pixel_size,
+            sampler, M(0, 0), M(0, 1), cubicCoeff, scale.x, xoffset, xcoeff);
+        ocl_resizeOnnxTable(src.rows, dst.rows, 1,
+            sampler, M(1, 0), M(1, 1), cubicCoeff, scale.y, yoffset, ycoeff);
+        UMat utable;
+        Mat(1, tabsize, CV_32S, table.data()).copyTo(utable); // the float weights travel as raw 32-bit words
+        int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
+        buildopts = format(
+            "-D INTER_CUBIC -D INTER_ANTIALIAS "
+            "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
+            "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            ocl_resizeOnnx_typeToString(T, nullptr, 0),
+            ocl_resizeOnnx_typeToString(W, nullptr, 0),
+            cn, pixel_size,
+            ocl_resizeOnnx_typeToString(VT, buf[0], sizeof(buf[0])),
+            ocl_resizeOnnx_typeToString(VW, buf[1], sizeof(buf[1])),
+            ocl_resizeOnnx_convertTypeString(T, W, 1, buf[2], sizeof(buf[2])),
+            ocl_resizeOnnx_convertTypeString(T, W, cn, buf[3], sizeof(buf[3])),
+            ocl_resizeOnnx_convertTypeString(W, T, 1, buf[4], sizeof(buf[4])),
+            ocl_resizeOnnx_convertTypeString(W, T, cn, buf[5], sizeof(buf[5]))
+        );
+        k.create("resizeOnnx_table", ocl::imgproc::resize_onnx_oclsrc,
+            buildopts, &errmsg);
+        if (k.empty())
+            return false;
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+            pixel_size, cn, xkanti, ykanti, ocl::KernelArg::PtrReadOnly(utable));
+    }
+    else
+        CV_Error(cv::Error::StsError, "should not got here");
+
+    if (errmsg.size()) // a kernel was created but create() still left a message; echo it to stderr
+        fputs(errmsg.data(), stderr);
+    return k.run(2, globalsize, 0, false);
+}
+
 #endif
 
 #ifdef HAVE_IPP
@@ -4190,6 +5174,225 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
 }
 
 
+// ONNX Resize on a 2D image. Either dsize or scale may be empty/zero and is then derived
+// from the other; interpolation packs the sampler, nearest-mode and coordinate-mode bits.
+void cv::resizeOnnx(InputArray _src, OutputArray _dst,
+    Size dsize, Point2d scale, int interpolation, float cubicCoeff, Rect2d const& roi)
+{
+    CV_INSTRUMENT_REGION();
+
+    Size ssize = _src.size();
+    CV_CheckEQ(_src.dims(), 2, "only 2 dim image is support now");
+    CV_CheckFalse(ssize.empty(), "src size must not be empty");
+    // allow user input both dsize and scale
+    if (dsize.empty())
+    {
+        CV_CheckGT(scale.x, 0.0, "scale must > 0 if no dsize given");
+        CV_CheckGT(scale.y, 0.0, "scale must > 0 if no dsize given");
+        // https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py#L365
+        // output_size = (scale_factors * np.array(data.shape)).astype(int)
+        dsize.width  = static_cast<int>(scale.x * ssize.width );
+        dsize.height = static_cast<int>(scale.y * ssize.height);
+    }
+    if (scale.x == 0 || scale.y == 0)
+    {
+        scale.x = static_cast<double>(dsize.width ) / ssize.width;
+        scale.y = static_cast<double>(dsize.height) / ssize.height;
+    }
+    CV_CheckFalse(dsize.empty(), "dst size must not empty");
+    CV_CheckGT(scale.x, 0.0, "computed scale <= 0 with given dsize");
+    CV_CheckGT(scale.y, 0.0, "computed scale <= 0 with given dsize");
+
+    int sampler = interpolation & INTER_SAMPLER_MASK;
+    int nearest = interpolation & INTER_NEAREST_MODE_MASK;
+    int coordinate = interpolation & INTER_COORDINATE_MASK;
+    CV_Assert(
+        sampler == INTER_NEAREST ||
+        sampler == INTER_LINEAR ||
+        sampler == INTER_CUBIC);
+    CV_Assert(
+        nearest == INTER_NEAREST_PREFER_FLOOR ||
+        nearest == INTER_NEAREST_PREFER_CEIL ||
+        nearest == INTER_NEAREST_FLOOR ||
+        nearest == INTER_NEAREST_CEIL);
+    CV_Assert(
+        coordinate == INTER_HALF_PIXEL ||
+        coordinate == INTER_HALF_PIXEL_PYTORCH ||
+        coordinate == INTER_HALF_PIXEL_SYMMETRIC ||
+        coordinate == INTER_ALIGN_CORNERS ||
+        coordinate == INTER_ASYMMETRIC ||
+        coordinate == INTER_TF_CROP_RESIZE);
+
+    // affine transformation matrix: x' = ax + b (row 0: a, b for x; row 1: a, b for y)
+    Vec2f xcoef = interCoordinate(
+        coordinate, dsize.width, ssize.width, scale.x, roi.x, roi.x + roi.width);
+    Vec2f ycoef = interCoordinate(
+        coordinate, dsize.height, ssize.height, scale.y, roi.y, roi.y + roi.height);
+    Matx22f M(xcoef[0], xcoef[1], ycoef[0], ycoef[1]);
+
+    // Reference the source BEFORE (re)allocating dst: for an in-place call
+    // (src == dst, issue 13577) _dst.create() would otherwise free its pixels.
+    UMat srcUMat;
+    Mat src;
+    if (_src.isUMat())
+        srcUMat = _src.getUMat();
+    else
+        src = _src.getMat();
+    _dst.create(dsize, _src.type());
+    if (dsize == ssize && coordinate != INTER_TF_CROP_RESIZE)
+    {
+        // Source and destination are of same size. Use simple copy.
+        _src.copyTo(_dst);
+        return;
+    }
+
+    CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
+        ocl_resizeOnnx(srcUMat, _dst, M, scale, interpolation, cubicCoeff))
+
+    if (_src.isUMat())  // map to host only after the OpenCL attempt
+        src = srcUMat.getMat(ACCESS_READ);
+    Mat dst = _dst.getMat();
+
+    if (sampler == INTER_NEAREST)
+    {
+        parallel_for_(Range(0, dsize.height),
+            ResizeOnnxNNInvoker(src, dst, M, nearest),
+            static_cast<double>(dsize.height) * dsize.width / (1 << 16));
+        return;
+    }
+
+    static ResizeOnnxFunc linear_tab[] =
+    {
+        resizeOnnx_<
+            HResizeLinear<uchar, int, short, INTER_RESIZE_COEF_SCALE, HResizeLinearVec_8u32s>,
+            VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS * 2>,
+                VResizeLinearVec_32s8u>,
+            float>,
+        resizeOnnx_<
+            HResizeLinear<schar, int, short, INTER_RESIZE_COEF_SCALE, HResizeNoVec>,
+            VResizeLinear<schar, int, short, FixedPtCast<int, schar, INTER_RESIZE_COEF_BITS * 2>,
+                VResizeNoVec>,
+            float>,
+        resizeOnnx_<
+            HResizeLinear<ushort, float, float, 1, HResizeLinearVec_16u32f>,
+            VResizeLinear<ushort, float, float, Cast<float, ushort>, VResizeLinearVec_32f16u>,
+            float>,
+        resizeOnnx_<
+            HResizeLinear<short, float, float, 1, HResizeLinearVec_16s32f>,
+            VResizeLinear<short, float, float, Cast<float, short>, VResizeLinearVec_32f16s>,
+            float>,
+        resizeOnnx_<
+            HResizeLinear<int, double, double, 1, HResizeNoVec>,
+            VResizeLinear<int, double, double, Cast<double, int>, VResizeNoVec>,
+            double>,
+        resizeOnnx_<
+            HResizeLinear<float, float, float, 1, HResizeLinearVec_32f>,
+            VResizeLinear<float, float, float, Cast<float, float>, VResizeLinearVec_32f>,
+            float>,
+        resizeOnnx_<
+            HResizeLinear<double, double, double, 1, HResizeNoVec>,
+            VResizeLinear<double, double, double, Cast<double, double>, VResizeNoVec>,
+            double>,
+        nullptr
+    };
+
+    static ResizeOnnxFunc cubic_tab[] =
+    {
+        resizeOnnx_<
+            HResizeCubic<uchar, int, short>,
+            VResizeCubic<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS * 2>,
+                VResizeCubicVec_32s8u>,
+            float>,
+        resizeOnnx_<
+            HResizeCubic<schar, int, short>,
+            VResizeCubic<schar, int, short, FixedPtCast<int, schar, INTER_RESIZE_COEF_BITS * 2>,
+                VResizeNoVec>,
+            float>,
+        resizeOnnx_<
+            HResizeCubic<ushort, float, float>,
+            VResizeCubic<ushort, float, float, Cast<float, ushort>, VResizeCubicVec_32f16u>,
+            float>,
+        resizeOnnx_<
+            HResizeCubic<short, float, float>,
+            VResizeCubic<short, float, float, Cast<float, short>, VResizeCubicVec_32f16s>,
+            float>,
+        resizeOnnx_<
+            HResizeCubic<int, double, double>,
+            VResizeCubic<int, double, double, Cast<double, int>, VResizeNoVec>,
+            double>,
+        resizeOnnx_<
+            HResizeCubic<float, float, float>,
+            VResizeCubic<float, float, float, Cast<float, float>, VResizeCubicVec_32f>,
+            float>,
+        resizeOnnx_<
+            HResizeCubic<double, double, double>,
+            VResizeCubic<double, double, double, Cast<double, double>, VResizeNoVec>,
+            double>,
+        nullptr
+    };
+
+#if 0
+    static ResizeAreaFastFunc areafast_tab[] =
+    {
+        resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
+        resizeAreaFast_<schar, int, ResizeAreaFastNoVec<schar, float> >,
+        resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
+        resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
+        resizeAreaFast_<int, double, ResizeAreaFastNoVec<int, double> >,
+        resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
+        resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
+        nullptr
+    };
+
+    // check if can use area fast
+    Point2d inv_scale(1.0 / scale.x, 1.0 / scale.y);
+    bool areafast_scale = fabs(inv_scale.y - 2.0) + fabs(inv_scale.x - 2.0) <= DBL_EPSILON;
+    bool areafast_size = (fabs(ssize.height - dsize.height * inv_scale.y) <= DBL_EPSILON)
+        && (fabs(ssize.width - dsize.width * inv_scale.x) <= DBL_EPSILON);
+    bool areafast_coordiante = (coordinate == INTER_HALF_PIXEL)
+        || (coordinate == INTER_HALF_PIXEL_SYMMETRIC)
+        || (coordinate == INTER_HALF_PIXEL_PYTORCH && min(dsize.height, dsize.width) > 1);
+    bool areafast_sampler = (sampler == INTER_LINEAR) && !(interpolation & INTER_ANTIALIAS_MASK);
+    if (areafast_scale && areafast_size && areafast_coordiante && areafast_sampler)
+    {
+        int iiy = static_cast<int>(inv_scale.y);
+        int iix = static_cast<int>(inv_scale.x);
+        int area = iiy * iix;
+        int srcstep = static_cast<int>(src.step1());
+        AutoBuffer<int> _ofs(area + dsize.width * cn);
+        int* ofs = _ofs.data();
+        int* xofs = ofs + area;
+        ResizeAreaFastFunc func = areafast_tab[depth];
+        CV_Check(0, func, "empty implementation in area fast");
+        // offsets of a pixel's sources to its left-top
+        for (int sy = 0, k = 0; sy < iiy; ++sy)
+            for (int sx = 0; sx < iix; ++sx)
+                ofs[k++] = sy * srcstep + sx * cn;
+        // left-top offsets of all pixels on a row
+        for (int dx = 0; dx < dsize.width; ++dx)
+        {
+            int j = dx * cn;
+            int sx = iix * j;
+            for(int k = 0; k < cn; k++ )
+                xofs[j + k] = sx + k;
+        }
+        func(src, dst, ofs, xofs, iix, iiy);
+        return;
+    }
+#endif
+
+    int depth = src.depth();
+    ResizeOnnxCtrl ctrl(interpolation, src.type(), cubicCoeff, ssize, dsize, scale, M);
+    // INTER_NEAREST returned above and the sampler was validated on entry,
+    // so only the linear and cubic tables can apply here.
+    ResizeOnnxFunc func = (sampler == INTER_CUBIC) ? cubic_tab[depth] : linear_tab[depth];
+    // both tables end with nullptr for a depth that has no implementation
+    CV_Check(depth, func != nullptr, "resizeOnnx is not implemented for this depth");
+
+    func(src, dst, ctrl);
+}
+
+
 CV_IMPL void
 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
 {
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index 852dc465ab..963fd8bc0c 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -160,11 +160,7 @@ PARAM_TEST_CASE(WarpTest_cols4_Base, MatType, Interpolation, bool, bool)
     }
 };
 
-/////warpAffine
-
-typedef WarpTestBase WarpAffine;
-
-/////warpAffine
+//// warpAffine
 
 typedef WarpTestBase WarpAffine;
 
@@ -340,6 +336,116 @@ OCL_TEST(Resize, overflow_21198)
     EXPECT_LE(cv::norm(dst_u, dst, NORM_INF), 1.0f);
 }
 
+PARAM_TEST_CASE(ResizeOnnx, MatType, double, double, int, bool, int) // type, fx, fy, interpolation, useRoi, widthMultiple
+{
+    int type, interpolation;
+    int widthMultiple;
+    double fx, fy;
+    bool useRoi;
+    Mat middle; // scratch image, only used by the disabled coordinate-fill debug code below
+
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+
+    virtual void SetUp() // copies the test parameters into the members above
+    {
+        type = GET_PARAM(0);
+        fx = GET_PARAM(1);
+        fy = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+        widthMultiple = GET_PARAM(5);
+    }
+
+    void random_roi() // builds random src/dst (sub)matrices for the current (fx, fy), then runs the UMAT_UPLOAD_* macros
+    {
+        CV_Assert(fx > 0 && fy > 0);
+
+        Size srcRoiSize = randomSize(10, MAX_VALUE), dstRoiSize;
+        // Round the width up to the next multiple of widthMultiple (unchanged if it already is one)
+        srcRoiSize.width += widthMultiple - 1 - (srcRoiSize.width - 1) % widthMultiple;
+        dstRoiSize.width = cvRound(srcRoiSize.width * fx);
+        dstRoiSize.height = cvRound(srcRoiSize.height * fy);
+
+        if (dstRoiSize.empty()) // degenerate destination: draw another random size
+        {
+            random_roi();
+            return;
+        }
+
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); // no border unless useRoi
+        randomSubMat(src, src_roi, srcRoiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
+
+#if 0
+        // If a nearest test fails (possibly an FMA rounding difference), enable this #if to debug:
+        // it sets every pixel's value to its own coordinate
+        if ((interpolation & INTER_SAMPLER_MASK) == INTER_NEAREST)
+        {
+            int channel = CV_MAT_CN(type);
+            middle.create(src.rows, src.cols, CV_16SC(channel));
+            for (int h = 0; h < src.rows; ++h)
+            {
+                for (int c = 0; c < channel; c += 2)
+                {
+                    // even channels store the x coordinate
+                    short* S = middle.ptr<short>(h) + c;
+                    for (int w = 0; w < src.cols; ++w, S += channel)
+                        S[0] = static_cast<short>(w);
+                }
+                for (int c = 1; c < channel; c += 2)
+                {
+                    // odd channels store the y coordinate
+                    short* S = middle.ptr<short>(h) + c;
+                    for (int w = 0; w < src.cols; ++w, S += channel)
+                        S[0] = static_cast<short>(h);
+                }
+            }
+            middle.convertTo(src, type);
+            src_roi = src(Rect(srcBorder.lef, srcBorder.top, srcRoiSize.width, srcRoiSize.height));
+        }
+#endif
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); // no border unless useRoi
+        randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+    }
+};
+
+OCL_TEST_P(ResizeOnnx, Mat) // compares the OCL_OFF and OCL_ON resizeOnnx results on random ROIs
+{
+    Size whole;
+    Point offset;
+    Mat host, host_roi;
+    int cn = CV_MAT_CN(type);
+    int depth = CV_MAT_DEPTH(type);
+    double eps = depth <= CV_32S ? integerEps : 5e-2; // depths up to CV_32S use integerEps, the others 5e-2
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        OCL_OFF(cv::resizeOnnx(src_roi, dst_roi,
+            dst_roi.size(), Point2d(fx, fy), interpolation));
+        OCL_ON(cv::resizeOnnx(usrc_roi, udst_roi,
+            dst_roi.size(), Point2d(fx, fy), interpolation));
+
+        dst_roi.locateROI(whole, offset);
+        udst.copyTo(host); // copy the whole UMat result to a Mat, then cut out the same ROI
+        host_roi = host(Rect(offset, dst_roi.size()));
+        if (cn <= 4 && depth != CV_8S && depth != CV_32S)
+            OCL_EXPECT_MAT_N_DIFF(dst, eps);
+        else
+        {
+            // stricter than OCL_EXPECT_MAT_N_DIFF: NORM_INF bounds every element by eps
+            double dif = cv::norm(dst_roi, host_roi, NORM_INF);
+            EXPECT_LE(dif, eps)
+                << "Size: " << src_roi.size()
+                << ", NormInf: " << dif << std::endl;
+        }
+    }
+}
+
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // remap
@@ -583,6 +689,44 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine(
                             Bool(),
                             Values(1, 16)));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAlias, ResizeOnnx, Combine( // linear / cubic without INTER_ANTIALIAS
+                            Values(
+                                CV_8UC1, CV_8SC2, CV_8UC(5), CV_8SC(7),
+                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_16SC(10),
+                                CV_32FC1, CV_32FC4, CV_32FC(11)), // type
+                            Values(0.5, 0.31, 1.4), // fx
+                            Values(0.5, 0.73, 3.7), // fy
+                            Values((int)(INTER_LINEAR), (int)(INTER_CUBIC)), // interpolation
+                            Bool(), // useRoi
+                            Values(1, 16))); // widthMultiple
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAntiAlias, ResizeOnnx, Combine( // linear / cubic with INTER_ANTIALIAS
+                            Values(
+                                CV_8UC1, CV_8SC2, CV_8UC(5), CV_8SC(7),
+                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_16SC(10),
+                                CV_32FC1, CV_32FC4, CV_32FC(11)), // type
+                            Values(0.5, 0.27, 2.6), // fx
+                            Values(0.5, 0.71, 4.1), // fy
+                            Values(
+                                (int)(INTER_ANTIALIAS | INTER_LINEAR),
+                                (int)(INTER_ANTIALIAS | INTER_CUBIC )), // interpolation
+                            Bool(), // useRoi
+                            Values(1, 16))); // widthMultiple
+
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpNearest, ResizeOnnx, Combine( // nearest sampler, all four rounding modes
+                            Values(
+                                CV_8UC1, CV_8SC2, CV_8UC4, CV_8SC(7),
+                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_32SC(10),
+                                CV_32FC1, CV_32FC4, CV_32FC(11)), // type
+                            Values(0.5, 0.27, 2.6), // fx
+                            Values(0.5, 0.71, 4.1), // fy
+                            Values(
+                                (int)(INTER_NEAREST | INTER_NEAREST_PREFER_FLOOR),
+                                (int)(INTER_NEAREST | INTER_NEAREST_PREFER_CEIL),
+                                (int)(INTER_NEAREST | INTER_NEAREST_CEIL),
+                                (int)(INTER_NEAREST | INTER_NEAREST_FLOOR)), // interpolation
+                            Bool(), // useRoi
+                            Values(1, 16))); // widthMultiple
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_LINEAR, Combine(
                             Values(CV_8U, CV_16U, CV_32F),
                             Values(1, 3, 4),
diff --git a/modules/imgproc/test/test_resize_onnx.cpp b/modules/imgproc/test/test_resize_onnx.cpp
new file mode 100644
index 0000000000..a9a4271e72
--- /dev/null
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@@ -0,0 +1,498 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+struct ResizeOnnx // one reference case: inputs for resizeOnnx plus the expected output
+{
+    int interpolate;           // interpolation flags forwarded to resizeOnnx
+    Size szsrc, szref, szdst;  // source size, expected result size, dsize argument (may be empty)
+    Point2d scale;             // scale argument
+    float cubic;               // cubicCoeff argument
+    Rect2d roi;                // roi argument
+    /* insrc must hold:
+     *   (1) integers
+     *   (2) in the range [-127, 127]
+     *   (3) all non-positive or all non-negative */
+    vector<double> insrc, inref; // single-channel source pixels and expected pixels, row by row
+
+    void rand_roi(RNG& rng, Mat& src, Size size, int type) // allocate src, randomly as a view into a larger 127-filled Mat
+    {
+        int const border = 16;
+        int t = rng.next() % border;
+        int b = rng.next() % border;
+        int l = rng.next() % border;
+        int r = rng.next() % border;
+        if (rng.next() & 1)
+        {
+            src.create(size.height + t + b, size.width + l + r, type);
+            src.setTo(127);
+            src = src(Rect(l, t, size.width, size.height));
+        }
+        else
+            src.create(size, type);
+    }
+
+    void run() // checks resizeOnnx against inref for 1..6 channels and every depth CV_8U..CV_64F
+    {
+        CV_CheckGE(static_cast<int>(insrc.size()), szsrc.area(), "unexpected src size");
+        CV_CheckEQ(static_cast<int>(inref.size()), szref.area(), "unexpected ref size");
+        Mat iS(szsrc, CV_64F, insrc.data());
+        Mat iR(szref, CV_64F, inref.data());
+        Mat S = iS, R = iR, nS, nR;
+        double alpha[6] = {1, 1, 5, 5, -1, -3}; // channel cn (cn >= 2) is base * alpha[cn-1] + beta[cn-1]
+        double  beta[6] = {0, 7, 0, 7, +0, -7};
+        RNG rng;
+        for (int cn = 1; cn <= 6; ++cn)
+        {
+            if (cn > 1) // append one more transformed channel to both source and reference
+            {
+                iS.convertTo(nS, -1, alpha[cn - 1], beta[cn - 1]);
+                iR.convertTo(nR, -1, alpha[cn - 1], beta[cn - 1]);
+                merge(vector<Mat>{S, nS}, S);
+                merge(vector<Mat>{R, nR}, R);
+            }
+            for (int depth = CV_8U; depth <= CV_64F; ++depth)
+            {
+                double eps = (depth <= CV_32S) ? 1.0 : 1e-3;
+                int type = CV_MAKETYPE(depth, cn);
+                string errinfo = "failed on type " + typeToString(type);
+                Mat src, ref, dst;
+                rand_roi(rng, src, szsrc, type);
+                if (szdst.area()) // pre-allocate dst only when an explicit dsize is given
+                    rand_roi(rng, dst, szdst, type);
+                S.convertTo(src, type);
+                R.convertTo(ref, type);
+                resizeOnnx(src, dst, szdst, scale, interpolate, cubic, roi);
+                ASSERT_EQ(ref.size(), dst.size()) << errinfo; // fatal: cv::norm below needs equal sizes
+                // nearest must give a bit-exact result
+                if ((interpolate & INTER_SAMPLER_MASK) == INTER_NEAREST)
+                    EXPECT_EQ(cv::norm(ref, dst, NORM_INF), 0.0) << errinfo;
+                // ties may round differently: cvRound(4.5) = 4, but integer resize may give 5
+                else
+                    EXPECT_LE(cv::norm(ref, dst, NORM_INF), eps) << errinfo;
+            }
+        }
+    }
+};
+
+// https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-128
+
+TEST(ResizeOnnx, downsample_scales_cubic)
+{
+    ResizeOnnx{
+        INTER_CUBIC,
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             1.47119141, 2.78125   ,  4.08251953,
+             6.71142578, 8.02148438,  9.32275391,
+            11.91650391, 13.2265625, 14.52783203,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_cubic_align_corners)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_ALIGN_CORNERS,
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             1.0       ,  2.39519159,  3.79038317,
+             6.58076634,  7.97595793,  9.37114951,
+            12.16153268, 13.55672427, 14.95191585,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_cubic_antialias)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_ANTIALIAS,
+        Size(4, 4), Size(2, 2), Size(), Point2d(0.6, 0.6),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            2.5180721,  4.2858863,
+            9.589329 , 11.357142 ,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_linear)
+{
+    ResizeOnnx{
+        INTER_LINEAR,
+        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8},
+        {2.6666665, 4.3333331}
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_linear_align_corners)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_ALIGN_CORNERS,
+        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8},
+        {1.0, 3.142857}
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_linear_antialias)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_ANTIALIAS,
+        Size(4, 4), Size(2, 2), Size(), Point2d(0.6, 0.6),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            2.875,  4.5,
+            9.375, 11.0,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_linear_half_pixel_symmetric)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_HALF_PIXEL_SYMMETRIC,
+        Size(4, 1), Size(2, 1), Size(), Point2d(0.6, 1.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4},
+        {1.6666667, 3.3333333}
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_scales_nearest)
+{
+    ResizeOnnx{
+        INTER_NEAREST,
+        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8},
+        {1, 3}
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_sizes_cubic)
+{
+    ResizeOnnx{
+        INTER_CUBIC,
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             1.63078704,  3.00462963,  4.37847222,
+             7.12615741,  8.5       ,  9.87384259,
+            12.62152778, 13.99537037, 15.36921296,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_sizes_cubic_antialias)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_ANTIALIAS,
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            1.7750092,   3.1200073,  4.4650054,
+            7.1550016,   8.5      ,  9.844998 ,
+            12.534994,  13.8799925, 15.224991 ,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_sizes_linear_antialias)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_ANTIALIAS,
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             2.3636363,  3.590909,   4.818182,
+             7.2727275,  8.5      ,  9.727273,
+            12.181818 , 13.409091,  14.636364,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_sizes_linear_pytorch_half_pixel)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_HALF_PIXEL_PYTORCH,
+        Size(4, 4), Size(1, 3), Size(1, 3), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {   
+            1.6666666,
+            7.0      ,
+            12.333333,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, downsample_sizes_nearest)
+{
+    ResizeOnnx{
+        INTER_NEAREST,
+        Size(4, 2), Size(3, 1), Size(3, 1), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8},
+        {1, 2, 4}
+    }.run();
+}
+
+TEST(ResizeOnnx, tf_crop_and_resize)
+{
+    // Note: for some rois, the result may be different with that of TF for inaccurate floating point
+    ResizeOnnx{
+        INTER_LINEAR | INTER_TF_CROP_RESIZE,
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
+        -0.75f, Rect2d(0.6, 0.4, 0.2, 0.2),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            7.6000004,  7.9,  8.2     ,
+            8.8      ,  9.1,  9.400001,
+            10.0     , 10.3, 10.6     ,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_cubic)
+{
+    ResizeOnnx{
+        INTER_CUBIC,
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            0.47265625, 0.76953125, 1.24609375, 1.875, 2.28125, 2.91015625, 3.38671875, 3.68359375,
+            1.66015625, 1.95703125, 2.43359375, 3.0625, 3.46875, 4.09765625, 4.57421875, 4.87109375,
+            3.56640625, 3.86328125, 4.33984375, 4.96875, 5.375, 6.00390625, 6.48046875, 6.77734375,
+            6.08203125, 6.37890625, 6.85546875, 7.484375, 7.890625, 8.51953125, 8.99609375, 9.29296875,
+            7.70703125, 8.00390625, 8.48046875, 9.109375, 9.515625, 10.14453125, 10.62109375, 10.91796875,
+            10.22265625, 10.51953125, 10.99609375, 11.625, 12.03125, 12.66015625, 13.13671875, 13.43359375,
+            12.12890625, 12.42578125, 12.90234375, 13.53125, 13.9375, 14.56640625, 15.04296875, 15.33984375,
+            13.31640625, 13.61328125, 14.08984375, 14.71875, 15.125, 15.75390625, 16.23046875, 16.52734375,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_cubic_align_corners)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_ALIGN_CORNERS,
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            1.0, 1.34110787, 1.80029155, 2.32944606, 2.67055394, 3.19970845, 3.65889213, 4.0,
+            2.36443149, 2.70553936, 3.16472303, 3.69387755, 4.03498542, 4.56413994, 5.02332362, 5.36443149,
+            4.20116618, 4.54227405, 5.00145773, 5.53061224, 5.87172012, 6.40087464, 6.86005831, 7.20116618,
+            6.31778426, 6.65889213, 7.1180758, 7.64723032, 7.98833819, 8.51749271, 8.97667638, 9.31778426,
+            7.68221574, 8.02332362, 8.48250729, 9.01166181, 9.35276968, 9.8819242, 10.34110787, 10.68221574,
+            9.79883382, 10.13994169, 10.59912536, 11.12827988, 11.46938776, 11.99854227, 12.45772595, 12.79883382,
+            11.63556851, 11.97667638, 12.43586006, 12.96501458, 13.30612245, 13.83527697, 14.29446064, 14.63556851,
+            13.0, 13.34110787, 13.80029155, 14.32944606, 14.67055394, 15.19970845, 15.65889213, 16.0,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_cubic_asymmetric)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_ASYMMETRIC,
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            1.0, 1.40625, 2.0, 2.5, 3.0, 3.59375, 4.0, 4.09375,
+            2.625, 3.03125, 3.625, 4.125, 4.625, 5.21875, 5.625, 5.71875,
+            5.0, 5.40625, 6.0, 6.5, 7.0, 7.59375, 8.0, 8.09375,
+            7.0, 7.40625, 8.0, 8.5, 9.0, 9.59375, 10.0, 10.09375,
+            9.0, 9.40625, 10.0, 10.5, 11.0, 11.59375, 12.0, 12.09375,
+            11.375, 11.78125, 12.375, 12.875, 13.375, 13.96875, 14.375, 14.46875,
+            13.0, 13.40625, 14.0, 14.5, 15.0, 15.59375, 16.0, 16.09375,
+            13.375, 13.78125, 14.375, 14.875, 15.375, 15.96875, 16.375, 16.46875,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_linear)
+{
+    ResizeOnnx{
+        INTER_LINEAR,
+        Size(2, 2), Size(4, 4), Size(), Point2d(2.0, 2.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4},
+        {
+            1.0, 1.25, 1.75, 2.0,
+            1.5, 1.75, 2.25, 2.5,
+            2.5, 2.75, 3.25, 3.5,
+            3.0, 3.25, 3.75, 4.0,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_linear_align_corners)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_ALIGN_CORNERS,
+        Size(2, 2), Size(4, 4), Size(), Point2d(2.0, 2.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4},
+        {
+            1.0, 1.33333333, 1.66666667, 2.0,
+            1.66666667, 2.0, 2.33333333, 2.66666667,
+            2.33333333, 2.66666667, 3.0, 3.33333333,
+            3.0, 3.33333333, 3.66666667, 4.0,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_linear_half_pixel_symmetric)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_HALF_PIXEL_SYMMETRIC,
+        Size(2, 2), Size(5, 4), Size(), Point2d(2.94, 2.3),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4},
+        {
+            1.0       , 1.15986395, 1.5       , 1.84013605, 2.0       ,
+            1.56521738, 1.72508133, 2.06521738, 2.40535343, 2.56521738,
+            2.43478262, 2.59464657, 2.93478262, 3.27491867, 3.43478262,
+            3.0       , 3.15986395, 3.5       , 3.84013605, 4.0       ,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_scales_nearest)
+{
+    ResizeOnnx{
+        INTER_NEAREST,
+        Size(2, 2), Size(6, 4), Size(), Point2d(3.0, 2.0),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4},
+        {
+            1, 1, 1, 2, 2, 2,
+            1, 1, 1, 2, 2, 2,
+            3, 3, 3, 4, 4, 4,
+            3, 3, 3, 4, 4, 4,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_sizes_cubic)
+{
+    ResizeOnnx{
+        INTER_CUBIC,
+        Size(4, 4), Size(10, 9), Size(10, 9), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            0.45507922, 0.64057922, 0.97157922, 1.42257922, 1.90732922, 2.22332922, 2.70807922, 3.15907922, 3.49007922, 3.67557922,
+            1.39437963, 1.57987963, 1.91087963, 2.36187963, 2.84662963, 3.16262963, 3.64737963, 4.09837963, 4.42937963, 4.61487963,
+            2.95130693, 3.13680693, 3.46780693, 3.91880693, 4.40355693, 4.71955693, 5.20430693, 5.65530693, 5.98630693, 6.17180693,
+            5.20525069, 5.39075069, 5.72175069, 6.17275069, 6.65750069, 6.97350069, 7.45825069, 7.90925069, 8.24025069, 8.42575069,
+            6.88975, 7.07525, 7.40625, 7.85725, 8.342, 8.658, 9.14275, 9.59375, 9.92475, 10.11025,
+            8.57424931, 8.75974931, 9.09074931, 9.54174931, 10.02649931, 10.34249931, 10.82724931, 11.27824931, 11.60924931, 11.79474931,
+            10.82819307, 11.01369307, 11.34469307, 11.79569307, 12.28044307, 12.59644307, 13.08119307, 13.53219307, 13.86319307, 14.04869307,
+            12.38512037, 12.57062037, 12.90162037, 13.35262037, 13.83737037, 14.15337037, 14.63812037, 15.08912037, 15.42012037, 15.60562037,
+            13.32442078, 13.50992078, 13.84092078, 14.29192078, 14.77667078, 15.09267078, 15.57742078, 16.02842078, 16.35942078, 16.54492078,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_sizes_nearest)
+{
+    ResizeOnnx{
+        INTER_NEAREST,
+        Size(2, 2), Size(8, 7), Size(8, 7), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4},
+        {
+            1, 1, 1, 1, 2, 2, 2, 2,
+            1, 1, 1, 1, 2, 2, 2, 2,
+            1, 1, 1, 1, 2, 2, 2, 2,
+            1, 1, 1, 1, 2, 2, 2, 2,
+            3, 3, 3, 3, 4, 4, 4, 4,
+            3, 3, 3, 3, 4, 4, 4, 4,
+            3, 3, 3, 3, 4, 4, 4, 4,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_sizes_nearest_ceil_half_pixel)
+{
+    ResizeOnnx{
+        INTER_NEAREST | INTER_NEAREST_CEIL,
+        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            1, 2, 2, 3, 3, 4, 4, 4,
+            5, 6, 6, 7, 7, 8, 8, 8,
+            5, 6, 6, 7, 7, 8, 8, 8,
+            9, 10, 10, 11, 11, 12, 12, 12,
+            9, 10, 10, 11, 11, 12, 12, 12,
+            13, 14, 14, 15, 15, 16, 16, 16,
+            13, 14, 14, 15, 15, 16, 16, 16,
+            13, 14, 14, 15, 15, 16, 16, 16,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_sizes_nearest_floor_align_corners)
+{
+    ResizeOnnx{
+        INTER_NEAREST | INTER_NEAREST_FLOOR | INTER_ALIGN_CORNERS,
+        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            1, 1, 1, 2, 2, 3, 3, 4,
+            1, 1, 1, 2, 2, 3, 3, 4,
+            1, 1, 1, 2, 2, 3, 3, 4,
+            5, 5, 5, 6, 6, 7, 7, 8,
+            5, 5, 5, 6, 6, 7, 7, 8,
+            9, 9, 9, 10, 10, 11, 11, 12,
+            9, 9, 9, 10, 10, 11, 11, 12,
+            13, 13, 13, 14, 14, 15, 15, 16,
+        }
+    }.run();
+}
+
+TEST(ResizeOnnx, upsample_sizes_nearest_round_prefer_ceil_asymmetric)
+{
+    ResizeOnnx{
+        INTER_NEAREST | INTER_NEAREST_PREFER_CEIL | INTER_ASYMMETRIC,
+        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(),
+        -0.75f, Rect2d(),
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            1, 2, 2, 3, 3, 4, 4, 4,
+            5, 6, 6, 7, 7, 8, 8, 8,
+            5, 6, 6, 7, 7, 8, 8, 8,
+            9, 10, 10, 11, 11, 12, 12, 12,
+            9, 10, 10, 11, 11, 12, 12, 12,
+            13, 14, 14, 15, 15, 16, 16, 16,
+            13, 14, 14, 15, 15, 16, 16, 16,
+            13, 14, 14, 15, 15, 16, 16, 16,
+        }
+    }.run();
+}
+
+}}
+
diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp
index 717eb7b14c..ed456385b9 100644
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -134,11 +134,11 @@ do \
         mask(cv::Rect(1, 1, mask.cols - 2, mask.rows - 2)).setTo(0); \
     cv::threshold(diff, binary, (double)eps, 255, cv::THRESH_BINARY); \
     EXPECT_LE(countNonZero(binary.reshape(1)), (int)(binary.cols*binary.rows*5/1000)) \
-        << "Size: " << name ## _roi.size() << std::endl; \
+        << "Size: " << name ## _roi.size() << ", NormInf: " << cv::norm(diff, NORM_INF) <<  std::endl; \
     binary.convertTo(binary_8, mask.type()); \
     binary_8 = binary_8 & mask; \
     EXPECT_LE(countNonZero(binary_8.reshape(1)), (int)((binary_8.cols+binary_8.rows)/100)) \
-        << "Size: " << name ## _roi.size() << std::endl; \
+        << "Size: " << name ## _roi.size() << ", NormInf: " << cv::norm(diff, NORM_INF)  << std::endl; \
 } while ((void)0, 0)
 
 #define OCL_EXPECT_MATS_NEAR(name, eps) \

From 379c16e10613902160cd4a6ca03d00e71ff2f1f2 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sat, 8 Jun 2024 19:03:43 +0800
Subject: [PATCH 02/12] delete trailing whitespace; add CUDA vec compound
 assignment operators; reject oversized src in ocl_resizeOnnx

---
 .../include/opencv2/core/cuda/vec_math.hpp    | 69 +++++++++++++------
 modules/imgproc/src/opencl/resize_onnx.cl     | 30 ++++----
 modules/imgproc/src/resize.cpp                | 24 +++----
 modules/imgproc/test/test_resize_onnx.cpp     |  3 +-
 4 files changed, 74 insertions(+), 52 deletions(-)

diff --git a/modules/core/include/opencv2/core/cuda/vec_math.hpp b/modules/core/include/opencv2/core/cuda/vec_math.hpp
index 80b1303681..f22205fcc0 100644
--- a/modules/core/include/opencv2/core/cuda/vec_math.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_math.hpp
@@ -374,6 +374,26 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
 
 #undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
 
+// a += b
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN(op, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 & operator op ## = (output_type ## 1 & a, const input_type ## 1 & b) \
+    { \
+    return a = VecTraits<output_type ## 1>::make(a.x op b.x); \
+    } \
+    __device__ __forceinline__ output_type ## 2 & operator op ## = (output_type ## 2 & a, const input_type ## 2 & b) \
+    { \
+    return a = VecTraits<output_type ## 2>::make(a.x op b.x, a.y op b.y); \
+    } \
+    __device__ __forceinline__ output_type ## 3 & operator op ## = (output_type ## 3 & a, const input_type ## 3 & b) \
+    { \
+    return a = VecTraits<output_type ## 3>::make(a.x op b.x, a.y op b.y, a.z op b.z); \
+    } \
+    __device__ __forceinline__ output_type ## 4 & operator op ## = (output_type ## 4 & a, const input_type ## 4 & b) \
+    { \
+    return a = VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
+    }
+
 // binary operators (vec & vec)
 
 #define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(op, input_type, output_type) \
@@ -392,7 +412,8 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
     __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, const input_type ## 4 & b) \
     { \
         return VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
-    }
+    } \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN(op, input_type, output_type)
 
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uchar, int)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, char, int)
@@ -430,6 +451,30 @@ CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uint, uint)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, float, float)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, double, double)
 
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN(op, input_type, output_type)
+
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uchar, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, char, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, ushort, uchar)
@@ -502,28 +547,8 @@ CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uint, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, float, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, double, uchar)
 
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, char, char)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
-
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)
-
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
-
 #undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN
 
 // binary operators (vec & scalar)
 
diff --git a/modules/imgproc/src/opencl/resize_onnx.cl b/modules/imgproc/src/opencl/resize_onnx.cl
index 03a64e2bc1..19c6c69cb4 100644
--- a/modules/imgproc/src/opencl/resize_onnx.cl
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@@ -50,7 +50,6 @@ __kernel void resizeOnnx_nearest(
     if (dx < dst_cols && dy < dst_rows)
     {
         float fx = fma(dx, m00 , m01), fy = fma(dy, m10, m11);
-    
 #if defined(INTER_NEAREST_PREFER_FLOOR) || defined(INTER_NEAREST_CEIL)
         // x, y will >= 0, so `round toward positive infinity' is equivalent to ceil
         int sx = convert_int_rtp(fx + offset);
@@ -121,13 +120,14 @@ __kernel void resizeOnnx_linear(
         VT d0 = TO_VEC_TYPE((u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3);
         storepix(d0, D);
 #else
+        W coeff[4] = { u0 * v0, u1 * v0, u0 * v1, u1 * v1 };
         for (int i = 0; i < channel; ++i)
         {
             W s0 = TO_WORK(((__global T const*)(S0))[i]);
             W s1 = TO_WORK(((__global T const*)(S1))[i]);
             W s2 = TO_WORK(((__global T const*)(S2))[i]);
             W s3 = TO_WORK(((__global T const*)(S3))[i]);
-            W d0 = (u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3;
+            W d0 = coeff[0] * s0 + coeff[1] * s1 + coeff[2] * s2 + coeff[3] * s3;
             ((__global T*)(D))[i] = TO_TYPE(d0);
         }
 #endif
@@ -271,10 +271,10 @@ __kernel void resizeOnnx_cubic(
         for (int y = ystart; y <= ylimit; ++y)
         {
             int yoffset = clamp(y, 0, src_rows - 1) * src_step + src_offset;
-            VW line = (VW)(0);
+            VW sline = (VW)(0);
             for (int x = 0; x < 4; ++x)
-                line += (VW)(xcoeff[x]) * TO_VEC_WORK(loadpix(srcptr + yoffset + xoffset[x]));
-            sum += line * (VW)(cubicCoeff(A, A2, A3, y - fy));
+                sline += (VW)(xcoeff[x]) * TO_VEC_WORK(loadpix(srcptr + yoffset + xoffset[x]));
+            sum += sline * (VW)(cubicCoeff(A, A2, A3, y - fy));
         }
         storepix(TO_VEC_TYPE(sum), D);
 #else
@@ -290,11 +290,11 @@ __kernel void resizeOnnx_cubic(
             W sum = 0;
             for (int y = 0; y < 4; ++y)
             {
-                W line = 0;
+                W sline = 0;
                 for (int x = 0; x < 4; ++x)
-                    line += xcoeff[x] * TO_WORK(((__global T const*)
+                    sline += xcoeff[x] * TO_WORK(((__global T const*)
                                                 (srcptr + yoffset[y] + xoffset[x]))[i]);
-                sum += line * ycoeff[y];
+                sum += sline * ycoeff[y];
             }
             ((__global T*)(D))[i] = TO_TYPE(sum);
         }
@@ -329,10 +329,10 @@ __kernel void resizeOnnx_table(
         {
             // offset is already clamped. xoffset is given by uchar
             __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
-            VW line = (VW)(0);
+            VW sline = (VW)(0);
             for (int x = dx; x < xstride; x += dst_cols)
-                line += xcoeff[x] * TO_VEC_WORK(loadpix(S + xoffset[x]));
-            sum += line * ycoeff[y];
+                sline += xcoeff[x] * TO_VEC_WORK(loadpix(S + xoffset[x]));
+            sum += sline * ycoeff[y];
         }
         storepix(TO_VEC_TYPE(sum), D);
 #else
@@ -342,10 +342,10 @@ __kernel void resizeOnnx_table(
             for (int y = dy; y < ystride; y += dst_rows)
             {
                 __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
-                W line = 0;
+                W sline = 0;
                 for (int x = dx; x < xstride; x += dst_cols)
-                    line += xcoeff[x] * TO_WORK(((__global T const*)(S + xoffset[x]))[i]);
-                sum += line * ycoeff[y];
+                    sline += xcoeff[x] * TO_WORK(((__global T const*)(S + xoffset[x]))[i]);
+                sum += sline * ycoeff[y];
             }
             ((__global T*)(D))[i] = TO_TYPE(sum);
         }
@@ -353,7 +353,7 @@ __kernel void resizeOnnx_table(
     }
 }
 
-#else 
+#else
 
 #error "empty kernel"
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 9fa892de98..324b4034f6 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3698,7 +3698,7 @@ class ResizeOnnxInvoker : public ParallelLoopBody
     VResize vresize;
 
     ResizeOnnxInvoker& operator =(ResizeOnnxInvoker const&);
-    
+
 public:
     typedef typename HResize::value_type T;
     typedef typename HResize::buf_type WT;
@@ -3710,7 +3710,6 @@ public:
         CV_CheckLE(ctrl.ksize, MAX_ESIZE, "resampler kernel's size is too larger");
         CV_Check(ctrl.is_fixpt, !(ctrl.is_fixpt && ctrl.is_double), "can not be both types");
         // prefer static_assert, but how ?
-#ifdef CV_CXX11
         // check generic resize
         if (ctrl.is_fixpt)
         {
@@ -3754,7 +3753,6 @@ public:
         CV_Check(sizeof(IdxT) * 10 + sizeof(WT),
             (std::is_same<IdxT, typename std::common_type<IdxT, WT>::type>::value),
             "something wrong");
-#endif
     }
 
     void hori_antialias_accumulate(T const* S, IdxT* L) const
@@ -3800,7 +3798,7 @@ public:
                 L[di + 2] += S[si + 2] * alpha;
                 L[di + 3] += S[si + 3] * alpha;
             }
-        else 
+        else
             for (int k = 0; k < len; ++k)
             {
                 int di = ctrl.xtab[k].di;
@@ -3817,7 +3815,7 @@ public:
         int dwidth = dst.cols * cn;
 #ifdef CV_CXX11
         constexpr bool same_wt_idxt = std::is_same<WT, IdxT>::value;
-#else 
+#else
         bool const same_wt_idxt = false;
 #endif
         for (int i = 0; i < count; ++i)
@@ -3917,7 +3915,7 @@ public:
         {
             if (ctrl.xkanti)
                 vert_antialias_hori_antialias(dy, L, A);
-            else 
+            else
                 vert_antialias_hori_generic(dy, Lw, A);
         }
     }
@@ -3966,7 +3964,7 @@ public:
             {
                 if (ctrl.xkanti)
                     hori_antialias_lines(srows + k0, rows + k0, L, ksize - k0);
-                else 
+                else
                     hori_generic_lines(srows + k0, rows + k0, ksize - k0);
             }
             vresize(const_cast<WT const**>(rows), dst.template ptr<T>(dy), beta, dwidth);
@@ -4321,10 +4319,10 @@ static void ocl_resizeOnnxTable(int srclen, int dstlen, int esz,
         float sum = 0.f;
         for (int i = start; i < end; ++i)
         {
-            float x = fabs(i - ratio) * scale;
+            float x = fabsf(i - ratio) * scale;
             if (sampler == INTER_LINEAR)
                 x = min(max(x, 0.f), 1.f);
-            else 
+            else
             {
                 if (x <= 1)
                     x = ((A + 2) * x - (A + 3)) * x * x + 1;
@@ -4351,7 +4349,7 @@ static void ocl_resizeOnnxTable(int srclen, int dstlen, int esz,
 static char const* ocl_resizeOnnx_typeToString(int type, char* buf, size_t size)
 {
     // typeToStr CV_Assert will failed
-    static char const* tab[CV_64F + 1] = 
+    static char const* tab[CV_64F + 1] =
         { "uchar", "char", "ushort", "short", "int", "float", "double" };
     int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     if (cn == 1)
@@ -4388,9 +4386,6 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     int nearest = interpolation & INTER_NEAREST_MODE_MASK;
     int antialias = interpolation & INTER_ANTIALIAS_MASK;
     Point2f scale = static_cast<Point2f>(scaled);
-    if (depth > CV_64F)
-        return false;
-
     ocl::Kernel k;
     UMat src = _src.getUMat(), dst = _dst.getUMat();
     size_t globalsize[] = {static_cast<size_t>(dst.cols), static_cast<size_t>(dst.rows)};
@@ -4398,6 +4393,9 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     int pixel_size = static_cast<int>(src.elemSize());
     int T = depth, VT = type;
     String buildopts, errmsg;
+    // opencv ocl kernel use int for step and offset
+    if (depth > CV_64F || src.size[0] * src.step[0] > INT_MAX)
+        return false;
 
     if (sampler == INTER_NEAREST)
     {
diff --git a/modules/imgproc/test/test_resize_onnx.cpp b/modules/imgproc/test/test_resize_onnx.cpp
index a9a4271e72..0de233c63c 100644
--- a/modules/imgproc/test/test_resize_onnx.cpp
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@@ -235,7 +235,7 @@ TEST(ResizeOnnx, downsample_sizes_linear_pytorch_half_pixel)
         Size(4, 4), Size(1, 3), Size(1, 3), Point2d(),
         -0.75f, Rect2d(),
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {   
+        {
             1.6666666,
             7.0      ,
             12.333333,
@@ -495,4 +495,3 @@ TEST(ResizeOnnx, upsample_sizes_nearest_round_prefer_ceil_asymmetric)
 }
 
 }}
-

From 358b64ad0e844f9621b7cde1c9d371da770d7a1d Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Mon, 10 Jun 2024 13:50:54 +0800
Subject: [PATCH 03/12] cuda resize onnx done

---
 .../include/opencv2/core/cuda/vec_math.hpp    |  12 +-
 modules/imgproc/include/opencv2/imgproc.hpp   |  20 +--
 modules/imgproc/src/opencl/resize_onnx.cl     |  33 ++--
 modules/imgproc/src/resize.cpp                | 165 ++++++++----------
 modules/imgproc/test/ocl/test_warp.cpp        | 156 +++++++----------
 modules/imgproc/test/test_resize_onnx.cpp     |   8 +-
 6 files changed, 168 insertions(+), 226 deletions(-)

diff --git a/modules/core/include/opencv2/core/cuda/vec_math.hpp b/modules/core/include/opencv2/core/cuda/vec_math.hpp
index f22205fcc0..0a1205e25b 100644
--- a/modules/core/include/opencv2/core/cuda/vec_math.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_math.hpp
@@ -458,12 +458,12 @@ CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
 
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , char, char)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , short, short)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , int, int)
-CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(| , uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)
 
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 8e6eecb667..ba002bbc70 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -281,14 +281,11 @@ enum InterpolationFlags {
 //! ONNX Resize Flags
 enum ResizeONNXFlags
 {
-    // static_assert((1 << INTER_COORDINATE_SHIFT) > INTER_MAX, "");
-    // https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
-
     INTER_SAMPLER_SHIFT        = 0,
-    INTER_SAMPLER_BIT          = 4,
+    INTER_SAMPLER_BIT          = 3,
     INTER_SAMPLER_MASK         = ((1 << INTER_SAMPLER_BIT) - 1) << INTER_SAMPLER_SHIFT,
 
-    INTER_COORDINATE_SHIFT     = 4,
+    INTER_COORDINATE_SHIFT     = INTER_SAMPLER_SHIFT + INTER_SAMPLER_BIT,
     INTER_COORDINATE_BIT       = 3,
     INTER_COORDINATE_MASK      = ((1 << INTER_COORDINATE_BIT) - 1) << INTER_COORDINATE_SHIFT,
     /** x_original = (x_resized + 0.5) / scale - 0.5 */
@@ -305,8 +302,8 @@ enum ResizeONNXFlags
     /** x_original = x_resized / scale */
     INTER_ASYMMETRIC           = 4 << INTER_COORDINATE_SHIFT,
     /** x_original = length_resized > 1
-     * ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1)
-     * : 0.5 * (start_x + end_x) * (length_original - 1) */
+            ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1)
+            : 0.5 * (start_x + end_x) * (length_original - 1) */
     INTER_TF_CROP_RESIZE       = 5 << INTER_COORDINATE_SHIFT,
 
     INTER_NEAREST_MODE_SHIFT   = INTER_COORDINATE_SHIFT + INTER_COORDINATE_BIT,
@@ -2468,12 +2465,12 @@ CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
                           int interpolation = INTER_LINEAR );
 
 /** @brief onnx resize op
+
 https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
 https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
-
 Not support `exclude_outside` and `extrapolation_value` yet.
 
-To get a similar result to resize, give dsize and:
+To get a similar result to `cv::resize`, give dsize and:
     INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR
     INTER_LINEAR  : HALF_PIXEL
     INTER_CUBIC   : HALF_PIXEL + cubicCoeff(-0.75)
@@ -2490,9 +2487,8 @@ To get a similar result to resize, give dsize and:
 
 @sa  resize
  */
-CV_EXPORTS_W void resizeOnnx(
-    InputArray src, OutputArray dst, Size dsize, Point2d scale = Point2d(),
-    int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
+CV_EXPORTS_W void resizeOnnx(InputArray src, OutputArray dst, Size dsize,
+    Point2d scale = Point2d(), int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
     float cubicCoeff = -0.75f, Rect2d const& roi = Rect2d());
 
 /** @brief Applies an affine transformation to an image.
diff --git a/modules/imgproc/src/opencl/resize_onnx.cl b/modules/imgproc/src/opencl/resize_onnx.cl
index 19c6c69cb4..8b7c96cea0 100644
--- a/modules/imgproc/src/opencl/resize_onnx.cl
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@@ -67,22 +67,20 @@ __kernel void resizeOnnx_nearest(
 
 #if PIXEL_SIZE == 1
         *D = *S;
-#elif PIXEL_SIZE == 2 || PIXEL_SIZE == 4 || PIXEL_SIZE == 8 || PIXEL_SIZE == 16
-        *(__global VT*)(D) = *(__global const VT*)(S);
+#elif PIXEL_SIZE == 2
+        *(__global ushort*)(D) = *(__global const ushort*)(S);
 #elif PIXEL_SIZE == 3
         vstore3(vload3(0, S), 0, D);
+#elif PIXEL_SIZE == 4
+        *(__global uint*)(D) = *(__global const uint*)(S);
 #elif PIXEL_SIZE == 6
         vstore3(vload3(0, (__global ushort const*)(S)), 0, (__global ushort*)(D));
+#elif PIXEL_SIZE == 8
+        *(__global uint2*)(D) = *(__global const uint2*)(S);
 #elif PIXEL_SIZE == 12
         vstore3(vload3(0, (__global const uint*)(S)), 0, (__global uint*)(D));
-#elif PIXEL_SIZE == 24
-        vstore3(vload3(0, (__global ulong const*)(S)), 0, (__global ulong*)(D));
-#elif PIXEL_SIZE == 32
-        *(__global uint8*)(D) = *(__global uint8 const*)(S);
-#elif PIXEL_SIZE == 64
-        *(__global uint16*)(D) = *(__global uint16 const*)(S);
-#elif PIXEL_SIZE == 128
-        *(__global ulong16*)(D) = *(__global ulong16 const*)(S);
+#elif PIXEL_SIZE == 16
+        *(__global uint4*)(D) = *(__global const uint4*)(S);
 #else
         for (int i = 0; i < pixel_size; ++i)
             D[i] = S[i];
@@ -114,7 +112,7 @@ __kernel void resizeOnnx_linear(
         __global uchar const* S2 = srcptr + (y1 * src_step + mad24(x0, pixel_size, src_offset));
         __global uchar const* S3 = srcptr + (y1 * src_step + mad24(x1, pixel_size, src_offset));
         __global uchar      * D  = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
         VW s0 = TO_VEC_WORK(loadpix(S0)); VW s1 = TO_VEC_WORK(loadpix(S1));
         VW s2 = TO_VEC_WORK(loadpix(S2)); VW s3 = TO_VEC_WORK(loadpix(S3));
         VT d0 = TO_VEC_TYPE((u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3);
@@ -154,7 +152,7 @@ __kernel void resizeOnnx_linear_antialias(
         int ix = convert_int_rtn(fx), iy = convert_int_rtn(fy);
         float rx = fx - ix, ry = fy - iy;
         __global uchar* D = dstptr + dy * dst_step + mad24(dx, pixel_size, dst_offset);
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
         VW sumval = (VW)(0);
         float weight = 0;
         for (int h = ystart; h < yend; ++h)
@@ -266,7 +264,7 @@ __kernel void resizeOnnx_cubic(
             xcoeff [x - xstart] = cubicCoeff(A, A2, A3, x - fx);
         }
         __global uchar* D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
         VW sum = (VW)(0);
         for (int y = ystart; y <= ylimit; ++y)
         {
@@ -322,13 +320,14 @@ __kernel void resizeOnnx_table(
         __global int const* yoffset = xoffset + xstride;
         __global float const* xcoeff = (__global float const*)(yoffset + ystride);
         __global float const* ycoeff = (__global float const*)(xcoeff + xstride);
-#if CN == 1 || CN == 2 || CN == 3 || CN == 4 || CN == 8 || CN == 16
+#if CN == 1 || CN == 2 || CN == 3 || CN == 4
         VW sum = (VW)(0);
         // exact ykanti / xkanti loops
         for (int y = dy; y < ystride; y += dst_rows)
         {
-            // offset is already clamped. xoffset is given by uchar
-            __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
+            // offset is already clamped
+            // xoffset is given by uchar, yoffset already multiply by src_step
+            __global const uchar* S = srcptr + yoffset[y] + src_offset;
             VW sline = (VW)(0);
             for (int x = dx; x < xstride; x += dst_cols)
                 sline += xcoeff[x] * TO_VEC_WORK(loadpix(S + xoffset[x]));
@@ -341,7 +340,7 @@ __kernel void resizeOnnx_table(
             W sum = 0;
             for (int y = dy; y < ystride; y += dst_rows)
             {
-                __global const uchar* S = (srcptr + yoffset[y] * src_step + src_offset);
+                __global const uchar* S = (srcptr + yoffset[y] + src_offset);
                 W sline = 0;
                 for (int x = dx; x < xstride; x += dst_cols)
                     sline += xcoeff[x] * TO_WORK(((__global T const*)(S + xoffset[x]))[i]);
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 324b4034f6..c763379bb9 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -1356,36 +1356,36 @@ public:
                 break;
             case 2:
                 for (; x < width; ++x)
-                    reinterpret_cast<short*>(D)[x] = *(reinterpret_cast<short const*>(S + x_ofs[x]));
+                    reinterpret_cast<ushort*>(D)[x] = *(reinterpret_cast<ushort const*>(S + x_ofs[x]));
                 break;
             case 3:
                 for (; x < width; ++x, D += 3)
                 {
-                    const uchar* _tS = S + x_ofs[x];
+                    uchar const* _tS = S + x_ofs[x];
                     D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                 }
                 break;
             case 4:
                 for (; x < width; ++x)
-                    reinterpret_cast<int*>(D)[x] = *(reinterpret_cast<int const*>(S + x_ofs[x]));
+                    reinterpret_cast<uint*>(D)[x] = *(reinterpret_cast<uint const*>(S + x_ofs[x]));
                 break;
             case 6:
                 for (; x < width; ++x, D += 6)
                 {
-                    short const* _tS = reinterpret_cast<short const*>(S + x_ofs[x]);
-                    short* _tD = reinterpret_cast<short*>(D);
+                    ushort const* _tS = reinterpret_cast<ushort const*>(S + x_ofs[x]);
+                    ushort* _tD = reinterpret_cast<ushort*>(D);
                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                 }
                 break;
             case 8:
                 for (; x < width; ++x)
-                    reinterpret_cast<int64*>(D)[x] = *(reinterpret_cast<int64 const*>(S + x_ofs[x]));
+                    reinterpret_cast<uint64*>(D)[x] = *(reinterpret_cast<uint64 const*>(S + x_ofs[x]));
                 break;
             case 12:
                 for (; x < width; ++x, D += 12)
                 {
-                    int const* _tS = reinterpret_cast<int const*>(S + x_ofs[x]);
-                    int* _tD = reinterpret_cast<int*>(D);
+                    uint const* _tS = reinterpret_cast<uint const*>(S + x_ofs[x]);
+                    uint* _tD = reinterpret_cast<uint*>(D);
                     _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                 }
                 break;
@@ -3465,7 +3465,8 @@ public:
     int xmin, xmax;
 
 private:
-    void cubic_coeffs_antialias(int dstlen, int cn, float srcpos, float scale, int srclen, float A, TabIdx* elem)
+    void cubicCoeffsAntiAlias(
+        int dstlen, int cn, float srcpos, float scale, int srclen, float A, TabIdx* elem)
     {
         scale = min(scale, 1.f);
         int index = cvFloor(srcpos);
@@ -3497,7 +3498,7 @@ private:
         }
     }
 
-    void cubic_coeffs(float x, float A, float* coeffs)
+    void cubicCoeffs(float x, float A, float* coeffs)
     {
         coeffs[0] = A * ((((x + 1) - 5) * (x + 1) + 8) * (x + 1) - 4);
         coeffs[1] = ((A + 2) * x - (A + 3)) * x * x + 1;
@@ -3505,7 +3506,8 @@ private:
         coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
     }
 
-    void linear_coeffs_antialias(int dstlen, int cn, float srcpos, float scale, int srclen, TabIdx* elem)
+    void linearCoeffsAntialias(
+        int dstlen, int cn, float srcpos, float scale, int srclen, TabIdx* elem)
     {
         scale = min(scale, 1.f);
         int index = cvFloor(srcpos);
@@ -3532,7 +3534,7 @@ private:
         }
     }
 
-    void linear_coeffs(float x, float* coeffs)
+    void linearCoeffs(float x, float* coeffs)
     {
         coeffs[0] = 1.f - x;
         coeffs[1] = x;
@@ -3570,16 +3572,17 @@ private:
         area.commit();
         CV_CheckLE(ksize, MAX_ESIZE, "resampler kernel's size is too larger");
 
-        if (antialias)
+        // when upsampling, `antialias` is same to `generic`, so use `generic` to speed up
+        if (antialias && scaled.x < 1.0)
         {
             float a = M(0, 0), b = M(0, 1);
             for (int d = 0; d < dsize.width; ++d)
             {
                 float f = fmaf(static_cast<float>(d), a, b);
                 if (sampler == INTER_LINEAR)
-                    linear_coeffs_antialias(d, cn, f, scale.x, ssize.width, xtab + d * xkanti);
+                    linearCoeffsAntialias(d, cn, f, scale.x, ssize.width, xtab + d * xkanti);
                 else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs_antialias(d, cn, f, scale.x, ssize.width, cubicCoeff, xtab + d * xkanti);
+                    cubicCoeffsAntiAlias(d, cn, f, scale.x, ssize.width, cubicCoeff, xtab + d * xkanti);
             }
         }
         else
@@ -3608,9 +3611,9 @@ private:
                 for (int k = 0; k < cn; ++k)
                     xofs[cn * d + k] = cn * s + k;
                 if (sampler == INTER_LINEAR)
-                    linear_coeffs(f, cbuf);
+                    linearCoeffs(f, cbuf);
                 else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs(f, cubicCoeff, cbuf);
+                    cubicCoeffs(f, cubicCoeff, cbuf);
                 if (is_fixpt)
                 {
                     short* coeffs = reinterpret_cast<short*>(xcoeffs) + cn * ksize * d;
@@ -3638,16 +3641,16 @@ private:
             }
         }
 
-        if (antialias)
+        if (antialias && scaled.y < 1.0)
         {
             float a = M(1, 0), b = M(1, 1);
             for (int d = 0; d < dsize.height; ++d)
             {
                 float f = fmaf(static_cast<float>(d), a, b);
                 if (sampler == INTER_LINEAR)
-                    linear_coeffs_antialias(d, 1, f, scale.y, ssize.height, ytab + d * ykanti);
+                    linearCoeffsAntialias(d, 1, f, scale.y, ssize.height, ytab + d * ykanti);
                 else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs_antialias(d, 1, f, scale.y, ssize.height, cubicCoeff, ytab + d * ykanti);
+                    cubicCoeffsAntiAlias(d, 1, f, scale.y, ssize.height, cubicCoeff, ytab + d * ykanti);
             }
         }
         else
@@ -3662,9 +3665,9 @@ private:
                 f -= s;
                 yofs[d] = s;
                 if (sampler == INTER_LINEAR)
-                    linear_coeffs(f, cbuf);
+                    linearCoeffs(f, cbuf);
                 else // if (sampler == INTER_CUBIC)
-                    cubic_coeffs(f, cubicCoeff, cbuf);
+                    cubicCoeffs(f, cubicCoeff, cbuf);
                 if (is_fixpt)
                 {
                     short* coeffs = reinterpret_cast<short*>(ycoeffs) + 1 * ksize * d;
@@ -3755,7 +3758,7 @@ public:
             "something wrong");
     }
 
-    void hori_antialias_accumulate(T const* S, IdxT* L) const
+    void horiAntialiasAccumulate(T const* S, IdxT* L) const
     {
         IdxT alpha;
         int const cn = dst.channels();
@@ -3809,15 +3812,11 @@ public:
             }
     }
 
-    void hori_antialias_lines(T const** srcptr, WT** dstptr, IdxT* L, int count) const
+    void horiAntialiasLines(T const** srcptr, WT** dstptr, IdxT* L, int count) const
     {
         int cn = dst.channels();
         int dwidth = dst.cols * cn;
-#ifdef CV_CXX11
-        constexpr bool same_wt_idxt = std::is_same<WT, IdxT>::value;
-#else
-        bool const same_wt_idxt = false;
-#endif
+        bool const same_wt_idxt = std::is_same<WT, IdxT>::value;
         for (int i = 0; i < count; ++i)
         {
             T const* S = srcptr[i];
@@ -3825,7 +3824,7 @@ public:
             if (same_wt_idxt)
                 L = reinterpret_cast<IdxT*>(dstptr[i]);
             memset(L, 0, sizeof(IdxT) * dwidth);
-            hori_antialias_accumulate(S, L);
+            horiAntialiasAccumulate(S, L);
             if (!same_wt_idxt)
             {
                 WT* D = dstptr[i];
@@ -3844,7 +3843,7 @@ public:
         }
     }
 
-    void hori_generic_lines(T const** srcptr, WT** dstptr, int count) const
+    void horiGenericLines(T const** srcptr, WT** dstptr, int count) const
     {
         int cn = src.channels();
         int ssize = src.cols * cn;
@@ -3857,53 +3856,7 @@ public:
             ssize, dsize, cn, xmin, xmax);
     }
 
-    void vert_antialias_hori_antialias(int dy, IdxT* L, IdxT* A) const
-    {
-        // the start and end of ytab
-        int dwidth = dst.channels() * dst.cols;
-        int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
-        memset(A, 0, dwidth * sizeof(IdxT));
-        for (int t = tstart; t < tend; ++t)
-        {
-            IdxT beta;
-            int sy = ctrl.ytab[t].si;
-            CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
-            ctrl.ytab[t].as(beta);
-            memset(L, 0, dwidth * sizeof(IdxT));
-            hori_antialias_accumulate(src.template ptr<T>(sy), L);
-            for (int w = 0; w < dwidth; ++w)
-                A[w] += L[w] * beta;
-        }
-        T* D = dst.template ptr<T>(dy);
-        for (int w = 0; w < dwidth; ++w)
-            D[w] = saturate_cast<T>(A[w]);
-    }
-
-    void vert_antialias_hori_generic(int dy, WT* L, IdxT* A) const
-    {
-        // FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS> cast;
-        int dwidth = dst.channels() * dst.cols;
-        int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
-        memset(A, 0, dwidth * sizeof(IdxT));
-        for (int t = tstart; t < tend; ++t)
-        {
-            IdxT beta;
-            int sy = ctrl.ytab[t].si;
-            CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
-            ctrl.ytab[t].as(beta);
-            T const* S = src.template ptr<T>(sy);
-            hori_generic_lines(&S, &L, 1);
-            if (ctrl.is_fixpt)
-                beta /= INTER_RESIZE_COEF_SCALE;
-            for (int w = 0; w < dwidth; ++w)
-                A[w] += L[w] * beta;
-        }
-        T* D = dst.template ptr<T>(dy);
-        for (int w = 0; w < dwidth; ++w)
-            D[w] = saturate_cast<T>(A[w]);
-    }
-
-    void vert_antialias(Range const& range) const
+    void vertAntialias(Range const& range) const
     {
         int cn = dst.channels();
         int dwidth = dst.cols * cn;
@@ -3913,14 +3866,38 @@ public:
         WT* Lw = reinterpret_cast<WT*>(L);
         for (int dy = range.start; dy < range.end; ++dy)
         {
-            if (ctrl.xkanti)
-                vert_antialias_hori_antialias(dy, L, A);
-            else
-                vert_antialias_hori_generic(dy, Lw, A);
+            int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
+            memset(A, 0, dwidth * sizeof(IdxT));
+            for (int t = tstart; t < tend; ++t)
+            {
+                IdxT beta;
+                int sy = ctrl.ytab[t].si;
+                CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
+                ctrl.ytab[t].as(beta);
+                T const* S = src.template ptr<T>(sy);
+                if (ctrl.xkanti)
+                {
+                    memset(L, 0, dwidth * sizeof(IdxT));
+                    horiAntialiasAccumulate(S, L);
+                    for (int w = 0; w < dwidth; ++w)
+                        A[w] += L[w] * beta;
+                }
+                else
+                {
+                    horiGenericLines(&S, &Lw, 1);
+                    if (ctrl.is_fixpt)
+                        beta /= INTER_RESIZE_COEF_SCALE;
+                    for (int w = 0; w < dwidth; ++w)
+                        A[w] += Lw[w] * beta;
+                }
+            }
+            T* D = dst.template ptr<T>(dy);
+            for (int w = 0; w < dwidth; ++w)
+                D[w] = saturate_cast<T>(A[w]);
         }
     }
 
-    void vert_generic(Range const& range) const
+    void vertGeneric(Range const& range) const
     {
         int ksize = ctrl.ksize, ksize2 = ksize / 2;
         int cn = src.channels();
@@ -3963,9 +3940,9 @@ public:
             if (k0 < ksize)
             {
                 if (ctrl.xkanti)
-                    hori_antialias_lines(srows + k0, rows + k0, L, ksize - k0);
+                    horiAntialiasLines(srows + k0, rows + k0, L, ksize - k0);
                 else
-                    hori_generic_lines(srows + k0, rows + k0, ksize - k0);
+                    horiGenericLines(srows + k0, rows + k0, ksize - k0);
             }
             vresize(const_cast<WT const**>(rows), dst.template ptr<T>(dy), beta, dwidth);
         }
@@ -3974,9 +3951,9 @@ public:
     virtual void operator() (Range const& range) const CV_OVERRIDE
     {
         if (ctrl.ykanti)
-            vert_antialias(range);
+            vertAntialias(range);
         else
-            vert_generic(range);
+            vertGeneric(range);
     }
 };
 
@@ -4003,7 +3980,7 @@ typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
                                 const DecimateAlpha* ytab, int ytab_size,
                                 const int* yofs);
 
-typedef void (*ResizeOnnxFunc)(Mat const& src, Mat& dst, ResizeOnnxCtrl const&);
+typedef void (*ResizeOnnxFunc)(Mat const& src, Mat& dst, ResizeOnnxCtrl const& ctrl);
 
 
 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
@@ -4517,7 +4494,7 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
         float* ycoeff = reinterpret_cast<float*>(xcoeff + xstride);
         ocl_resizeOnnxTable(src.cols, dst.cols, pixel_size,
             sampler, M(0, 0), M(0, 1), cubicCoeff, scale.x, xoffset, xcoeff);
-        ocl_resizeOnnxTable(src.rows, dst.rows, 1,
+        ocl_resizeOnnxTable(src.rows, dst.rows, static_cast<int>(src.step[0]),
             sampler, M(1, 0), M(1, 1), cubicCoeff, scale.y, yoffset, ycoeff);
         UMat utable;
         Mat(1, tabsize, CV_32S, table.data()).copyTo(utable);
@@ -5175,6 +5152,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
 void cv::resizeOnnx(InputArray _src, OutputArray _dst,
     Size dsize, Point2d scale, int interpolation, float cubicCoeff, Rect2d const& roi)
 {
+    static_assert((1 << INTER_SAMPLER_BIT) >= INTER_MAX, "");
     CV_INSTRUMENT_REGION();
 
     Size ssize = _src.size();
@@ -5185,7 +5163,7 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
     {
         CV_CheckGT(scale.x, 0.0, "scale must > 0 if no dsize given");
         CV_CheckGT(scale.y, 0.0, "scale must > 0 if no dsize given");
-        // https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py#L365
+        // https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
         // output_size = (scale_factors * np.array(data.shape)).astype(int)
         dsize.width  = static_cast<int>(scale.x * ssize.width );
         dsize.height = static_cast<int>(scale.y * ssize.height);
@@ -5196,8 +5174,8 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         scale.y = static_cast<double>(dsize.height) / ssize.height;
     }
     CV_CheckFalse(dsize.empty(), "dst size must not empty");
-    CV_CheckGT(scale.x, 0.0, "computed scale <= 0 with given dsize");
-    CV_CheckGT(scale.y, 0.0, "computed scale <= 0 with given dsize");
+    CV_CheckGT(scale.x, 0.0, "require computed or given scale > 0");
+    CV_CheckGT(scale.y, 0.0, "require computed or given scale > 0");
 
     int sampler = interpolation & INTER_SAMPLER_MASK;
     int nearest = interpolation & INTER_NEAREST_MODE_MASK;
@@ -5237,6 +5215,9 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         _src.copyTo(_dst);
         return;
     }
+    // Antialiasing only matters when downsampling: drop the flag unless some axis shrinks
+    if (scale.x >= 1.0 && scale.y >= 1.0)
+        interpolation &= ~INTER_ANTIALIAS_MASK;
 
     // Fake reference to source. Resolves issue 13577 in case of src == dst.
     UMat srcUMat;
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index 963fd8bc0c..5c33b03b97 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -338,18 +338,18 @@ OCL_TEST(Resize, overflow_21198)
 
 PARAM_TEST_CASE(ResizeOnnx, MatType, double, double, int, bool, int)
 {
-    int type, interpolation;
+    int depth, interpolation;
     int widthMultiple;
     double fx, fy;
     bool useRoi;
-    Mat middle;
 
-    TEST_DECLARE_INPUT_PARAMETER(src);
-    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+    Rect src_loc, dst_loc;
+    Mat src, dst, src_roi, dst_roi;
+    UMat gsrc, gdst, gsrc_roi, gdst_roi;
 
     virtual void SetUp()
     {
-        type = GET_PARAM(0);
+        depth = GET_PARAM(0);
         fx = GET_PARAM(1);
         fy = GET_PARAM(2);
         interpolation = GET_PARAM(3);
@@ -357,96 +357,70 @@ PARAM_TEST_CASE(ResizeOnnx, MatType, double, double, int, bool, int)
         widthMultiple = GET_PARAM(5);
     }
 
-    void random_roi()
+    void random_submat(int type, // builds a random host matrix, its UMat copy and ROI views into both
+        Size& size, Rect& roi, Mat& mat, Mat& sub, UMat& gmat, UMat& gsub) // size: in = ROI size, out = full matrix size
     {
+        int border = useRoi ? 65 : 0; // no padding around the ROI unless useRoi is set
+        roi.x = randomInt(0, border); // random left offset of the ROI
+        roi.y = randomInt(0, border); // random top offset of the ROI
+        roi.width = size.width; // the ROI keeps the requested size
+        roi.height = size.height;
+        size.width += roi.x + randomInt(0, border); // grow by the left offset plus a random right margin
+        size.height += roi.y + randomInt(0, border); // grow by the top offset plus a random bottom margin
+        mat = randomMat(size, type, -127, 127); // random content; -127 and 127 are the bounds given to randomMat
+        mat.copyTo(gmat); // UMat copy starts from the same data as the host matrix
+        sub = mat(roi); // ROI of the host matrix
+        gsub = gmat(roi); // same ROI of the UMat
+    }
 
-        Size srcRoiSize = randomSize(10, MAX_VALUE), dstRoiSize;
-        // Make sure the width is a multiple of the requested value, and no more
-        srcRoiSize.width += widthMultiple - 1 - (srcRoiSize.width - 1) % widthMultiple;
-        dstRoiSize.width = cvRound(srcRoiSize.width * fx);
-        dstRoiSize.height = cvRound(srcRoiSize.height * fy);
-
-        if (dstRoiSize.empty())
+    void random_roi(int type)
+    {
+        Size srcSize, dstSize;
+        int minSize = min(fx, fy) < 1.0 ? 10 : 1;
+        while (dstSize.empty())
         {
-            random_roi();
-            return;
+            srcSize = randomSize(minSize, 129);
+            srcSize.width += widthMultiple - 1 - (srcSize.width - 1) % widthMultiple;
+            dstSize.width = cvRound(srcSize.width * fx);
+            dstSize.height = cvRound(srcSize.height * fy);
         }
 
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, srcRoiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-
-#if 0
-        // if nearest test failed, maybe the fma issue, try open this #if
-        // set pixels' value to their coordinate
-        if ((interpolation & INTER_SAMPLER_MASK) == INTER_NEAREST)
-        {
-            int channel = CV_MAT_CN(type);
-            middle.create(src.rows, src.cols, CV_16SC(channel));
-            for (int h = 0; h < src.rows; ++h)
-            {
-                for (int c = 0; c < channel; c += 2)
-                {
-                    // even x; odd y
-                    short* S = middle.ptr<short>(h) + c;
-                    for (int w = 0; w < src.cols; ++w, S += channel)
-                        S[0] = static_cast<short>(w);
-                }
-                for (int c = 1; c < channel; c += 2)
-                {
-                    // even x; odd y
-                    short* S = middle.ptr<short>(h) + c;
-                    for (int w = 0; w < src.cols; ++w, S += channel)
-                        S[0] = static_cast<short>(h);
-                }
-            }
-            middle.convertTo(src, type);
-            src_roi = src(Rect(srcBorder.lef, srcBorder.top, srcRoiSize.width, srcRoiSize.height));
-        }
-#endif
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        UMAT_UPLOAD_INPUT_PARAMETER(src);
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+        random_submat(type, srcSize, src_loc, src, src_roi, gsrc, gsrc_roi);
+        random_submat(type, dstSize, dst_loc, dst, dst_roi, gdst, gdst_roi);
     }
 };
 
 OCL_TEST_P(ResizeOnnx, Mat)
 {
-    Size whole;
-    Point offset;
     Mat host, host_roi;
-    int cn = CV_MAT_CN(type);
-    int depth = CV_MAT_DEPTH(type);
     double eps = depth <= CV_32S ? integerEps : 5e-2;
 
-    for (int j = 0; j < test_loop_times; j++)
+    // loop over the channel count here to reduce the number of test instances
+    for (int cn = 1; cn <= 6; ++cn)
     {
-        random_roi();
-
-        OCL_OFF(cv::resizeOnnx(src_roi, dst_roi,
-            dst_roi.size(), Point2d(fx, fy), interpolation));
-        OCL_ON(cv::resizeOnnx(usrc_roi, udst_roi,
-            dst_roi.size(), Point2d(fx, fy), interpolation));
-
-        dst_roi.locateROI(whole, offset);
-        udst.copyTo(host);
-        host_roi = host(Rect(offset, dst_roi.size()));
-        if (cn <= 4 && depth != CV_8S && depth != CV_32S)
-            OCL_EXPECT_MAT_N_DIFF(dst, eps);
-        else
+        int type = CV_MAKETYPE(depth, cn);
+        for (int j = 0; j < test_loop_times; ++j)
         {
-            // more strict than OCL_EXPECT_MAT_N_DIFF
-            double dif = cv::norm(dst_roi, host_roi, NORM_INF);
-            EXPECT_LE(dif, eps)
-                << "Size: " << src_roi.size()
-                << ", NormInf: " << dif << std::endl;
+            random_roi(type);
+
+            OCL_OFF(cv::resizeOnnx(src_roi, dst_roi,
+                dst_roi.size(), Point2d(fx, fy), interpolation));
+            OCL_ON(cv::resizeOnnx(gsrc_roi, gdst_roi,
+                dst_roi.size(), Point2d(fx, fy), interpolation));
+
+            // copy the whole gdst to make sure that
+            // the given roi memory was really used and no new buffer was allocated
+            gdst.copyTo(host);
+            host_roi = host(dst_loc);
+            string info = cv::format(
+                "fail on type %sC%d src %dx%d dst %dx%d src_roi %dx%d dst_roi %dx%d",
+                depthToString(depth), cn, src.cols, src.rows, dst.cols, dst.rows,
+                src_roi.cols, src_roi.rows, dst_roi.cols, dst_roi.rows);
+            EXPECT_LE(cv::norm(dst_roi, host_roi, NORM_INF), eps) << info;
         }
     }
 }
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // remap
 
@@ -689,23 +663,18 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine(
                             Bool(),
                             Values(1, 16)));
 
-OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAlias, ResizeOnnx, Combine(
-                            Values(
-                                CV_8UC1, CV_8SC2, CV_8UC(5), CV_8SC(7),
-                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_16SC(10),
-                                CV_32FC1, CV_32FC4, CV_32FC(11)),
-                            Values(0.5, 0.31, 1.4),
-                            Values(0.5, 0.73, 3.7),
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, ResizeOnnx, Combine(
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
                             Values((int)(INTER_LINEAR), (int)(INTER_CUBIC)),
                             Bool(),
                             Values(1, 16)));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAntiAlias, ResizeOnnx, Combine(
-                            Values(
-                                CV_8UC1, CV_8SC2, CV_8UC(5), CV_8SC(7),
-                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_16SC(10),
-                                CV_32FC1, CV_32FC4, CV_32FC(11)),
-                            Values(0.5, 0.27, 2.6),
-                            Values(0.5, 0.71, 4.1),
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
                             Values(
                                 (int)(INTER_ANTIALIAS | INTER_LINEAR),
                                 (int)(INTER_ANTIALIAS | INTER_CUBIC )),
@@ -713,12 +682,9 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAntiAlias, ResizeOnnx, Combine(
                             Values(1, 16)));
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpNearest, ResizeOnnx, Combine(
-                            Values(
-                                CV_8UC1, CV_8SC2, CV_8UC4, CV_8SC(7),
-                                CV_16UC1, CV_16SC3, CV_16UC(9), CV_32SC(10),
-                                CV_32FC1, CV_32FC4, CV_32FC(11)),
-                            Values(0.5, 0.27, 2.6),
-                            Values(0.5, 0.71, 4.1),
+                            Values(CV_8S, CV_16S, CV_32F, CV_64F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
                             Values(
                                 (int)(INTER_NEAREST | INTER_NEAREST_PREFER_FLOOR),
                                 (int)(INTER_NEAREST | INTER_NEAREST_PREFER_CEIL),
diff --git a/modules/imgproc/test/test_resize_onnx.cpp b/modules/imgproc/test/test_resize_onnx.cpp
index 0de233c63c..4a4a8b143c 100644
--- a/modules/imgproc/test/test_resize_onnx.cpp
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@@ -43,10 +43,10 @@ struct ResizeOnnx
         Mat iS(szsrc, CV_64F, insrc.data());
         Mat iR(szref, CV_64F, inref.data());
         Mat S = iS, R = iR, nS, nR;
-        double alpha[6] = {1, 1, 5, 5, -1, -3};
-        double  beta[6] = {0, 7, 0, 7, +0, -7};
+        double alpha[8] = {1, -1, 5, 5, 0, -3, -2, +4};
+        double  beta[8] = {0, -0, 0, 7, 7, -7, -6, +6};
         RNG rng;
-        for (int cn = 1; cn <= 6; ++cn)
+        for (int cn = 1; cn <= 8; ++cn)
         {
             if (cn > 1)
             {
@@ -59,7 +59,7 @@ struct ResizeOnnx
             {
                 double eps = (depth <= CV_32S) ? 1.0 : 1e-3;
                 int type = CV_MAKETYPE(depth, cn);
-                string errinfo = "failed on type " + typeToString(type);
+                string errinfo = "fail on type " + typeToString(type);
                 Mat src, ref, dst;
                 rand_roi(rng, src, szsrc, type);
                 if (szdst.area())

From 47d086916929c611463148ddddc668212722d321 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Tue, 11 Jun 2024 10:01:32 +0800
Subject: [PATCH 04/12] fix some typos

---
 modules/imgproc/src/opencl/resize_onnx.cl  | 11 +--
 modules/imgproc/src/resize.cpp             | 93 +++++-----------------
 modules/ts/include/opencv2/ts/ocl_test.hpp |  4 +-
 3 files changed, 27 insertions(+), 81 deletions(-)

diff --git a/modules/imgproc/src/opencl/resize_onnx.cl b/modules/imgproc/src/opencl/resize_onnx.cl
index 8b7c96cea0..611e0d6b35 100644
--- a/modules/imgproc/src/opencl/resize_onnx.cl
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@@ -138,16 +138,12 @@ __kernel void resizeOnnx_linear_antialias(
     __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
     __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
     int pixel_size, int channel, float m00, float m01, float m10, float m11,
-    float xscale, float yscale)
+    float xscale, float yscale, int xstart, int ystart, int xend, int yend)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
     if (dx < dst_cols && dy < dst_rows)
     {
-        int xstart = convert_int_rtn(-1.f / xscale) + 1;
-        int xend = 2 - xstart;
-        int ystart = convert_int_rtn(-1.f / yscale) + 1;
-        int yend = 2 - ystart;
         float fx = fma(dx, m00, m01), fy = fma(dy, m10, m11);
         int ix = convert_int_rtn(fx), iy = convert_int_rtn(fy);
         float rx = fx - ix, ry = fy - iy;
@@ -307,14 +303,13 @@ __kernel void resizeOnnx_cubic(
 __kernel void resizeOnnx_table(
     __global uchar const* srcptr, int src_step, int src_offset, int src_rows, int src_cols,
     __global uchar      * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-    int pixel_size, int channel, int xkanti, int ykanti, __global int const* table)
+    int pixel_size, int channel, int xkanti, int ykanti, int xstride, int ystride,
+    __global int const* table)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
     if (dx < dst_cols && dy < dst_rows)
     {
-        int xstride = xkanti * dst_cols;
-        int ystride = ykanti * dst_rows;
         __global uchar* D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
         __global int const* xoffset = table;
         __global int const* yoffset = xoffset + xstride;
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index c763379bb9..5d8fb0d30e 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3735,7 +3735,7 @@ public:
         else
         {
             CV_Check(sizeof(AT), (std::is_same<AT, float>::value),
-                "when use float coeffs, AT is expected to be short");
+                "when use float coeffs, AT is expected to be float");
             CV_Check(sizeof(T) * 10 + sizeof(WT),
                 (std::is_same<WT, float>::value
                     && (std::is_same<T, short>::value || std::is_same<T, ushort>::value
@@ -3751,11 +3751,11 @@ public:
         else
         {
             CV_Check(ctrl.is_double, (std::is_same<IdxT, float>::value),
-                "when use double coeffs, AT is expected to be double");
+                "when use float coeffs, IdxT is expected to be float");
         }
         CV_Check(sizeof(IdxT) * 10 + sizeof(WT),
             (std::is_same<IdxT, typename std::common_type<IdxT, WT>::type>::value),
-            "something wrong");
+            "we need that IdxT is same or more accurate than WT");
     }
 
     void horiAntialiasAccumulate(T const* S, IdxT* L) const
@@ -3851,9 +3851,8 @@ public:
         int xmin = ctrl.xmin * cn;
         int xmax = ctrl.xmax * cn;
         // just call hresize
-        hresize(srcptr, dstptr, count,
-            ctrl.xofs, reinterpret_cast<AT const*>(ctrl.xcoeffs),
-            ssize, dsize, cn, xmin, xmax);
+        hresize(srcptr, dstptr, count, ctrl.xofs,
+            reinterpret_cast<AT const*>(ctrl.xcoeffs), ssize, dsize, cn, xmin, xmax);
     }
 
     void vertAntialias(Range const& range) const
@@ -3870,9 +3869,9 @@ public:
             memset(A, 0, dwidth * sizeof(IdxT));
             for (int t = tstart; t < tend; ++t)
             {
+                CV_DbgCheckEQ(dy, ctrl.ytab[t].di, "something wrong");
                 IdxT beta;
                 int sy = ctrl.ytab[t].si;
-                CV_CheckEQ(dy, ctrl.ytab[t].di, "something wrong");
                 ctrl.ytab[t].as(beta);
                 T const* S = src.template ptr<T>(sy);
                 if (ctrl.xkanti)
@@ -3931,7 +3930,7 @@ public:
                     }
                 }
                 // remember the first row that needs to be computed
-                if( k1 == ksize )
+                if (k1 == ksize)
                     k0 = min(k0, k);
                 srows[k] = src.template ptr<T>(sy);
                 prev_sy[k] = sy;
@@ -4363,6 +4362,11 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     int nearest = interpolation & INTER_NEAREST_MODE_MASK;
     int antialias = interpolation & INTER_ANTIALIAS_MASK;
     Point2f scale = static_cast<Point2f>(scaled);
+    int khalf = (sampler == INTER_LINEAR ? 2 : 4) / 2;
+    float xscale = min(scale.x, 1.f), yscale = min(scale.y, 1.f);
+    int xstart = cvFloor(-khalf / xscale) + 1, xend = 2 - xstart;
+    int ystart = cvFloor(-khalf / yscale) + 1, yend = 2 - ystart;
+
     ocl::Kernel k;
     UMat src = _src.getUMat(), dst = _dst.getUMat();
     size_t globalsize[] = {static_cast<size_t>(dst.cols), static_cast<size_t>(dst.rows)};
@@ -4452,7 +4456,7 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
             return false;
         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
             pixel_size, cn, M(0, 0), M(0, 1), M(1, 0), M(1, 1),
-            min(scale.x, 1.f), min(scale.y, 1.f));
+            xscale, yscale, xstart, ystart, xend, yend);
     }
     else if (sampler == INTER_CUBIC && !antialias)
     {
@@ -4480,12 +4484,8 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     }
     else if (sampler == INTER_CUBIC && antialias)
     {
-        int ksize = 4;
-        int khalf = ksize / 2;
-        int xkanti = 2 * cvCeil(khalf / min(scale.x, 1.f));
-        int ykanti = 2 * cvCeil(khalf / min(scale.y, 1.f));
-        int xstride = xkanti * dst.cols;
-        int ystride = ykanti * dst.rows;
+        int xkanti = xend - xstart, xstride = xkanti * dst.cols;
+        int ykanti = yend - ystart, ystride = ykanti * dst.rows;
         int tabsize = (xstride + ystride) * 2;
         AutoBuffer<int> table(tabsize);
         int* xoffset = table.data();
@@ -4518,7 +4518,8 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
         if (k.empty())
             return false;
         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
-            pixel_size, cn, xkanti, ykanti, ocl::KernelArg::PtrReadOnly(utable));
+            pixel_size, cn, xkanti, ykanti, xstride, ystride,
+            ocl::KernelArg::PtrReadOnly(utable));
     }
     else
         CV_Error(cv::Error::StsError, "should not got here");
@@ -5197,7 +5198,7 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         coordinate == INTER_ASYMMETRIC ||
         coordinate == INTER_TF_CROP_RESIZE);
 
-    // affine transformation matrix: x' = ax + b
+    // x_org = x * a + b
     Matx22f M;
     Vec2f xcoef = interCoordinate(
         coordinate, dsize.width, ssize.width, scale.x, roi.x, roi.x + roi.width);
@@ -5237,7 +5238,7 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         return;
     }
 
-    static ResizeOnnxFunc linear_tab[] =
+    static ResizeOnnxFunc linear_tab[CV_DEPTH_MAX] =
     {
         resizeOnnx_<
             HResizeLinear<uchar, int, short, INTER_RESIZE_COEF_SCALE, HResizeLinearVec_8u32s>,
@@ -5272,7 +5273,7 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         nullptr
     };
 
-    static ResizeOnnxFunc cubic_tab[] =
+    static ResizeOnnxFunc cubic_tab[CV_DEPTH_MAX] =
     {
         resizeOnnx_<
             HResizeCubic<uchar, int, short>,
@@ -5307,58 +5308,8 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         nullptr
     };
 
-#if 0
-    static ResizeAreaFastFunc areafast_tab[] =
-    {
-        resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
-        resizeAreaFast_<schar, int, ResizeAreaFastNoVec<schar, float> >,
-        resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
-        resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
-        resizeAreaFast_<int, double, ResizeAreaFastNoVec<int, double> >,
-        resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
-        resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
-        nullptr
-    };
-
-    // check if can use area fast
-    Point2d inv_scale(1.0 / scale.x, 1.0 / scale.y);
-    bool areafast_scale = fabs(inv_scale.y - 2.0) + fabs(inv_scale.x - 2.0) <= DBL_EPSILON;
-    bool areafast_size = (fabs(ssize.height - dsize.height * inv_scale.y) <= DBL_EPSILON)
-        && (fabs(ssize.width - dsize.width * inv_scale.x) <= DBL_EPSILON);
-    bool areafast_coordiante = (coordinate == INTER_HALF_PIXEL)
-        || (coordinate == INTER_HALF_PIXEL_SYMMETRIC)
-        || (coordinate == INTER_HALF_PIXEL_PYTORCH && min(dsize.height, dsize.width) > 1);
-    bool areafast_sampler = (sampler == INTER_LINEAR) && !(interpolation & INTER_ANTIALIAS_MASK);
-    if (areafast_scale && areafast_size && areafast_coordiante && areafast_sampler)
-    {
-        int iiy = static_cast<int>(inv_scale.y);
-        int iix = static_cast<int>(inv_scale.x);
-        int area = iiy * iix;
-        int srcstep = static_cast<int>(src.step1());
-        AutoBuffer<int> _ofs(area + dsize.width * cn);
-        int* ofs = _ofs.data();
-        int* xofs = ofs + area;
-        ResizeAreaFastFunc func = areafast_tab[depth];
-        CV_Check(0, func, "empty implementation in area fast");
-        // offsets of a pixel's sources to its left-top
-        for (int sy = 0, k = 0; sy < iiy; ++sy)
-            for (int sx = 0; sx < iix; ++sx)
-                ofs[k++] = sy * srcstep + sx * cn;
-        // left-top offsets of all pixels on a row
-        for (int dx = 0; dx < dsize.width; ++dx)
-        {
-            int j = dx * cn;
-            int sx = iix * j;
-            for(int k = 0; k < cn; k++ )
-                xofs[j + k] = sx + k;
-        }
-        func(src, dst, ofs, xofs, iix, iiy);
-        return;
-    }
-#endif
-
-    int depth = src.depth();
-    ResizeOnnxCtrl ctrl(interpolation, src.type(), cubicCoeff, ssize, dsize, scale, M);
+    int depth = src.depth(), type = src.type();
+    ResizeOnnxCtrl ctrl(interpolation, type, cubicCoeff, ssize, dsize, scale, M);
     ResizeOnnxFunc func = linear_tab[depth];
     if (sampler == INTER_LINEAR)
         func = linear_tab[depth];
diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp
index ed456385b9..717eb7b14c 100644
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -134,11 +134,11 @@ do \
         mask(cv::Rect(1, 1, mask.cols - 2, mask.rows - 2)).setTo(0); \
     cv::threshold(diff, binary, (double)eps, 255, cv::THRESH_BINARY); \
     EXPECT_LE(countNonZero(binary.reshape(1)), (int)(binary.cols*binary.rows*5/1000)) \
-        << "Size: " << name ## _roi.size() << ", NormInf: " << cv::norm(diff, NORM_INF) <<  std::endl; \
+        << "Size: " << name ## _roi.size() << std::endl; \
     binary.convertTo(binary_8, mask.type()); \
     binary_8 = binary_8 & mask; \
     EXPECT_LE(countNonZero(binary_8.reshape(1)), (int)((binary_8.cols+binary_8.rows)/100)) \
-        << "Size: " << name ## _roi.size() << ", NormInf: " << cv::norm(diff, NORM_INF)  << std::endl; \
+        << "Size: " << name ## _roi.size() << std::endl; \
 } while ((void)0, 0)
 
 #define OCL_EXPECT_MATS_NEAR(name, eps) \

From 971df57c41a29dbe8fd9b92f9632123de6d2d3ae Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sat, 15 Jun 2024 18:02:02 +0800
Subject: [PATCH 05/12] use simd for vertical antialias - but no significant
 improvement - add perf_test for cpu / ocl

---
 modules/imgproc/perf/opencl/perf_imgwarp.cpp | 23 +++++
 modules/imgproc/perf/perf_resize.cpp         | 27 ++++++
 modules/imgproc/src/resize.cpp               | 95 ++++++++++++++------
 modules/imgproc/test/test_resize_onnx.cpp    | 20 ++---
 4 files changed, 126 insertions(+), 39 deletions(-)

diff --git a/modules/imgproc/perf/opencl/perf_imgwarp.cpp b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
index d13b54bdce..477e17d085 100644
--- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@@ -192,6 +192,29 @@ OCL_PERF_TEST_P(ResizeLinearExactFixture, Resize,
     SANITY_CHECK(dst, eps);
 }
 
+typedef tuple<Size, MatType, double> ResizeOnnxParams; // (source size, matrix type, scale factor)
+typedef TestBaseWithParam<ResizeOnnxParams> ResizeOnnxFixture;
+
+OCL_PERF_TEST_P(ResizeOnnxFixture, ResizeAntialias, // perf test of linear antialiased resizeOnnx on UMat
+    Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, Values(0.3, 0.5, 0.6))) // every scale factor is < 1 (downsampling)
+{
+    const ResizeOnnxParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    double scale = get<2>(params);
+    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale)); // same factor on both axes
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type); // NOTE(review): presumably guards against buffers the device cannot allocate - confirm
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::resizeOnnx(src, dst, dstSize, Point2d(), INTER_LINEAR | INTER_ANTIALIAS); // scale left as Point2d(); only dstSize is given
+
+    SANITY_CHECK_NOTHING(); // no output validation is requested here
+}
+
 ///////////// Remap ////////////////////////
 
 typedef tuple<Size, MatType, InterType> RemapParams;
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 0f470a5f81..22bf75434d 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -280,4 +280,31 @@ PERF_TEST_P(MatInfo_Size_Scale_NN, ResizeNNExact,
     SANITY_CHECK_NOTHING();
 }
 
+// The complexity of non-antialias and NN resize is same as that in cv::resize
+// The complexity of antialias resize is ralated to dst size and ceil(1.0 / scale)
+PERF_TEST_P(MatInfo_Size_Size, ResizeOnnxDownLinearAntialias, Combine(
+    Values(CV_8UC1, CV_16UC1, CV_16UC4, CV_32SC1, CV_32FC1, CV_32FC3),
+    Values(sz1440p),
+    Values(szVGA, szqHD, sz720p, sz1080p)
+))
+{
+    int matType = get<0>(GetParam());
+    Size from = get<1>(GetParam());
+    Size to = get<2>(GetParam());
+
+    cv::Mat src(from, matType), dst(to, matType);
+    switch(src.depth())
+    {
+        case CV_8U: cvtest::fillGradient(src); break;
+        case CV_16U: fillFPGradient<ushort>(src); break;
+        case CV_32S: fillFPGradient<int>(src); break;
+        case CV_32F: fillFPGradient<float>(src); break;
+    }
+    declare.in(src).out(dst);
+
+    TEST_CYCLE_MULTIRUN(10) resizeOnnx(src, dst, to, Point2d(), INTER_LINEAR | INTER_ANTIALIAS);
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 5d8fb0d30e..e3f3067117 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3205,6 +3205,14 @@ inline void saturate_store(const float* src, uchar* dst) {
     v_store(dst, v_pack(v_pack_u(tmp0, tmp1), v_pack_u(tmp2, tmp3)));
 }
 
+inline void saturate_store(const float* src, schar* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp2 = v_round(vx_load(src + 2 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp3 = v_round(vx_load(src + 3 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(v_pack(tmp0, tmp1), v_pack(tmp2, tmp3)));
+}
+
 inline void saturate_store(const float* src, ushort* dst) {
     const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
     const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
@@ -3236,6 +3244,18 @@ struct VArea<double> {
     typedef v_float64 vWT;
 };
 
+inline void saturate_store(const double* sum, int width, int* D) {
+    const int step = VTraits<v_float64>::vlanes() * sizeof(double) / sizeof(int);
+    int dx = 0, limit = width - step;
+    for (; dx <= limit; dx += step)
+    {
+        v_store(D + dx, v_round(
+            vx_load(sum + dx + 0 * VTraits<v_float64>::vlanes()),
+            vx_load(sum + dx + 1 * VTraits<v_float64>::vlanes())));
+    }
+    for (; dx < width; ++dx)
+        D[dx] = saturate_cast<int>(sum[dx]);
+}
 #else
 inline void mul(const double* buf, int width, double beta, double* sum) {
     for (int dx = 0; dx < width; ++dx) {
@@ -3248,6 +3268,11 @@ inline void muladd(const double* buf, int width, double beta, double* sum) {
         sum[dx] += beta * buf[dx];
     }
 }
+
+inline void saturate_store(const double* sum, int width, int* D) {
+    for (int dx = 0; dx < width; ++dx)
+        D[dx] = saturate_cast<int>(sum[dx]);
+}
 #endif
 
 template <typename T, typename WT>
@@ -3275,8 +3300,10 @@ inline void mul(const WT* buf, int width, WT beta, WT* sum) {
     int dx = 0;
 #if (CV_SIMD || CV_SIMD_SCALABLE)
     const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
-    for (; dx + step < width; dx += step) {
-        vx_store(sum + dx, v_mul(vx_setall(beta), vx_load(buf + dx)));
+    const typename VArea<WT>::vWT vbeta = vx_setall(beta);
+    int limit = width - step;
+    for (; dx <= limit; dx += step) {
+        vx_store(sum + dx, v_mul(vbeta, vx_load(buf + dx)));
     }
 #endif
     for (; dx < width; ++dx) {
@@ -3289,8 +3316,10 @@ inline void muladd(const WT* buf, int width, WT beta, WT* sum) {
     int dx = 0;
 #if (CV_SIMD || CV_SIMD_SCALABLE)
     const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
-    for (; dx + step < width; dx += step) {
-        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vx_setall(beta), vx_load(buf + dx))));
+    const typename VArea<WT>::vWT vbeta = vx_setall(beta);
+    int limit = width - step;
+    for (; dx <= limit; dx += step) {
+        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vbeta, vx_load(buf + dx))));
     }
 #endif
     for (; dx < width; ++dx) {
@@ -3721,7 +3750,7 @@ public:
             CV_Check(sizeof(T) * 10 + sizeof(WT),
                 (std::is_same<WT, int>::value
                     && (std::is_same<T, uchar>::value || std::is_same<T, schar>::value)),
-                "something wrong");
+                "fixpt works when T is uchar or schar");
         }
         else if (ctrl.is_double)
         {
@@ -3730,7 +3759,7 @@ public:
             CV_Check(sizeof(T) * 10 + sizeof(WT),
                 (std::is_same<WT, double>::value &&
                     (std::is_same<T, int>::value || std::is_same<T, double>::value)),
-                "something wrong");
+                "double WT works when T is int or double");
          }
         else
         {
@@ -3740,13 +3769,13 @@ public:
                 (std::is_same<WT, float>::value
                     && (std::is_same<T, short>::value || std::is_same<T, ushort>::value
                         || std::is_same<T, float>::value)),
-                "something wrong");
+                "float WT works for other types");
         }
         // check antialias resize
         if (ctrl.is_double)
         {
             CV_Check(ctrl.is_double, (std::is_same<IdxT, double>::value),
-                "when use double coeffs, AT is expected to be double");
+                "when use double coeffs, IdxT is expected to be double");
         }
         else
         {
@@ -3827,18 +3856,21 @@ public:
             horiAntialiasAccumulate(S, L);
             if (!same_wt_idxt)
             {
-                WT* D = dstptr[i];
-                if (ctrl.is_fixpt)
-                {
-                    float const alpha = INTER_RESIZE_COEF_SCALE;
-                    for (int k = 0; k < dwidth; ++k)
-                        D[k] = saturate_cast<WT>(L[k] * alpha);
-                }
-                else
-                {
-                    for (int k = 0; k < dwidth; ++k)
-                        D[k] = saturate_cast<WT>(L[k]);
-                }
+                // only when is_fixpt, wt (int) and idxt (float) can be different
+                CV_Check(ctrl.is_fixpt, ctrl.is_fixpt && (std::is_same<IdxT, float>::value)
+                    && (std::is_same<WT, int>::value), "");
+                float* Lf = reinterpret_cast<float*>(L);
+                int* D = reinterpret_cast<int*>(dstptr[i]);
+                float const alpha = INTER_RESIZE_COEF_SCALE;
+                int k = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                v_float32 valpha = vx_setall_f32(alpha);
+                int limit = dwidth - VTraits<v_float32>::vlanes();
+                for (; k <= limit; k += VTraits<v_float32>::vlanes())
+                    v_store(D + k, v_round(v_mul(vx_load(Lf + k), valpha)));
+#endif
+                for (; k < dwidth; ++k)
+                    D[k] = cvRound(Lf[k] * alpha);
             }
         }
     }
@@ -3866,7 +3898,6 @@ public:
         for (int dy = range.start; dy < range.end; ++dy)
         {
             int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
-            memset(A, 0, dwidth * sizeof(IdxT));
             for (int t = tstart; t < tend; ++t)
             {
                 CV_DbgCheckEQ(dy, ctrl.ytab[t].di, "something wrong");
@@ -3878,21 +3909,28 @@ public:
                 {
                     memset(L, 0, dwidth * sizeof(IdxT));
                     horiAntialiasAccumulate(S, L);
-                    for (int w = 0; w < dwidth; ++w)
-                        A[w] += L[w] * beta;
+                    if (t == tstart)
+                        inter_area::mul(L, dwidth, beta, A);
+                    else
+                        inter_area::muladd(L, dwidth, beta, A);
                 }
                 else
                 {
+                    // A & Lw maybe different type, can not use inter_area
+                    // A double : Lw double
+                    // A float  : Lw float / int
                     horiGenericLines(&S, &Lw, 1);
                     if (ctrl.is_fixpt)
                         beta /= INTER_RESIZE_COEF_SCALE;
-                    for (int w = 0; w < dwidth; ++w)
-                        A[w] += Lw[w] * beta;
+                    if (t == tstart)
+                        for (int w = 0; w < dwidth; ++w)
+                            A[w] = saturate_cast<IdxT>(Lw[w] * beta);
+                    else
+                        for (int w = 0; w < dwidth; ++w)
+                            A[w] += Lw[w] * beta;
                 }
             }
-            T* D = dst.template ptr<T>(dy);
-            for (int w = 0; w < dwidth; ++w)
-                D[w] = saturate_cast<T>(A[w]);
+            inter_area::saturate_store(A, dwidth, dst.template ptr<T>(dy));
         }
     }
 
@@ -5317,7 +5355,6 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         func = cubic_tab[depth];
     else
         CV_Error(CV_StsBadArg, format("Unknown sampler %d", sampler));
-    CV_Check(0, func, "empty implementation in area fast");
 
     func(src, dst, ctrl);
 }
diff --git a/modules/imgproc/test/test_resize_onnx.cpp b/modules/imgproc/test/test_resize_onnx.cpp
index 4a4a8b143c..6b8c4fc3de 100644
--- a/modules/imgproc/test/test_resize_onnx.cpp
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@@ -45,7 +45,7 @@ struct ResizeOnnx
         Mat S = iS, R = iR, nS, nR;
         double alpha[8] = {1, -1, 5, 5, 0, -3, -2, +4};
         double  beta[8] = {0, -0, 0, 7, 7, -7, -6, +6};
-        RNG rng;
+        RNG& rng = TS::ptr()->get_rng();
         for (int cn = 1; cn <= 8; ++cn)
         {
             if (cn > 1)
@@ -206,9 +206,9 @@ TEST(ResizeOnnx, downsample_sizes_cubic_antialias)
         -0.75f, Rect2d(),
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
-            1.7750092,   3.1200073,  4.4650054,
-            7.1550016,   8.5      ,  9.844998 ,
-            12.534994,  13.8799925, 15.224991 ,
+             1.7750092,   3.1200073,  4.4650054,
+             7.1550016,   8.5      ,  9.844998 ,
+            12.534994 ,  13.8799925, 15.224991 ,
         }
     }.run();
 }
@@ -236,9 +236,9 @@ TEST(ResizeOnnx, downsample_sizes_linear_pytorch_half_pixel)
         -0.75f, Rect2d(),
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
-            1.6666666,
-            7.0      ,
-            12.333333,
+             1.6666666,
+             7.0      ,
+            12.333333 ,
         }
     }.run();
 }
@@ -263,9 +263,9 @@ TEST(ResizeOnnx, tf_crop_and_resize)
         -0.75f, Rect2d(0.6, 0.4, 0.2, 0.2),
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
-            7.6000004,  7.9,  8.2     ,
-            8.8      ,  9.1,  9.400001,
-            10.0     , 10.3, 10.6     ,
+             7.6000004,  7.9,  8.2     ,
+             8.8      ,  9.1,  9.400001,
+            10.0      , 10.3, 10.6     ,
         }
     }.run();
 }

From 6a795ee253d75169daa9b4d68665b3ece7cfcda7 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sat, 15 Jun 2024 22:00:03 +0800
Subject: [PATCH 06/12] reduce the number of resizeOnnx perf tests - because
 they take too long

---
 modules/imgproc/perf/opencl/perf_imgwarp.cpp | 2 +-
 modules/imgproc/perf/perf_resize.cpp         | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/imgproc/perf/opencl/perf_imgwarp.cpp b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
index 477e17d085..97a35e8bd2 100644
--- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@@ -195,7 +195,7 @@ OCL_PERF_TEST_P(ResizeLinearExactFixture, Resize,
 typedef tuple<Size, MatType, double> ResizeOnnxParams;
 typedef TestBaseWithParam<ResizeOnnxParams> ResizeOnnxFixture;
 
-OCL_PERF_TEST_P(ResizeOnnxFixture, ResizeAntialias,
+OCL_PERF_TEST_P(ResizeOnnxFixture, LinearAntialias,
     Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, Values(0.3, 0.5, 0.6)))
 {
     const ResizeOnnxParams params = GetParam();
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 22bf75434d..39c1311e78 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -282,10 +282,10 @@ PERF_TEST_P(MatInfo_Size_Scale_NN, ResizeNNExact,
 
 // The complexity of non-antialias and NN resize is same as that in cv::resize
 // The complexity of antialias resize is ralated to dst size and ceil(1.0 / scale)
-PERF_TEST_P(MatInfo_Size_Size, ResizeOnnxDownLinearAntialias, Combine(
-    Values(CV_8UC1, CV_16UC1, CV_16UC4, CV_32SC1, CV_32FC1, CV_32FC3),
-    Values(sz1440p),
-    Values(szVGA, szqHD, sz720p, sz1080p)
+PERF_TEST_P(MatInfo_Size_Size, ResizeOnnxLinearAntialias, Combine(
+    Values(CV_8UC1, CV_8UC3, CV_8UC4),
+    Values(sz1080p),
+    Values(szVGA, szqHD, sz720p)
 ))
 {
     int matType = get<0>(GetParam());

From c7815826c4c1f389995b4a23bbcc135a557319b9 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Mon, 17 Jun 2024 00:44:27 +0800
Subject: [PATCH 07/12] optimize antialias row compute - cache the src-row
 results within adjacent dst-row

---
 modules/imgproc/perf/perf_resize.cpp |  8 +--
 modules/imgproc/src/resize.cpp       | 92 ++++++++++++++++++++--------
 2 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 39c1311e78..ead4696c22 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -32,7 +32,7 @@ static void fillFPGradient(Mat& img)
         }
     }
 }
-
+#if 0
 PERF_TEST_P(MatInfo_Size_Size, resizeUpLinear,
             testing::Values(
                 MatInfo_Size_Size_t(CV_8UC1, szVGA, szqHD),
@@ -279,13 +279,13 @@ PERF_TEST_P(MatInfo_Size_Scale_NN, ResizeNNExact,
     EXPECT_GT(countNonZero(dst.reshape(1)), 0);
     SANITY_CHECK_NOTHING();
 }
-
+#endif
 // The complexity of non-antialias and NN resize is same as that in cv::resize
 // The complexity of antialias resize is ralated to dst size and ceil(1.0 / scale)
 PERF_TEST_P(MatInfo_Size_Size, ResizeOnnxLinearAntialias, Combine(
     Values(CV_8UC1, CV_8UC3, CV_8UC4),
-    Values(sz1080p),
-    Values(szVGA, szqHD, sz720p)
+    Values(sz1440p),
+    Values(szVGA, szqHD, sz720p, sz1080p)
 ))
 {
     int matType = get<0>(GetParam());
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index e3f3067117..50a2d72d1a 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3482,6 +3482,7 @@ public:
     /* resize parameter */
     bool is_fixpt, is_double;
     int ksize, xkanti, ykanti;
+    Point2f scalef;
 
     /* for antialias resize */
     TabIdx* xtab;
@@ -3575,13 +3576,13 @@ private:
     {
         int sampler = interpolation & INTER_SAMPLER_MASK;
         int antialias = interpolation & INTER_ANTIALIAS_MASK;
-        Point2f scale = static_cast<Point2f>(scaled);
         CV_CheckGE(cubicCoeff, -1.f, "cubic coefficient should range [-1, 0)");
         CV_CheckLT(cubicCoeff, +0.f, "cubic coefficient should range [-1, 0)");
         CV_Check(sampler, sampler == INTER_LINEAR || sampler == INTER_CUBIC,
             "should not error");
 
         int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
+        scalef = static_cast<Point2f>(scaled);
         ksize = (sampler == INTER_LINEAR ? 2 : 4);
         is_double = (depth == CV_64F);
         is_fixpt = (depth == CV_8U || depth == CV_8S);
@@ -3590,8 +3591,8 @@ private:
         xofs = yofs = nullptr;
         xcoeffs = ycoeffs = nullptr;
         int khalf = ksize / 2;
-        xkanti = 2 * cvCeil(khalf / min(scale.x, 1.f));
-        ykanti = 2 * cvCeil(khalf / min(scale.y, 1.f));
+        xkanti = 2 * cvCeil(khalf / min(scalef.x, 1.f));
+        ykanti = 2 * cvCeil(khalf / min(scalef.y, 1.f));
         area.allocate(xtab, xkanti * dsize.width );
         area.allocate(ytab, ykanti * dsize.height);
         area.allocate(xofs, dsize.width * cn + 1);
@@ -3609,9 +3610,9 @@ private:
             {
                 float f = fmaf(static_cast<float>(d), a, b);
                 if (sampler == INTER_LINEAR)
-                    linearCoeffsAntialias(d, cn, f, scale.x, ssize.width, xtab + d * xkanti);
+                    linearCoeffsAntialias(d, cn, f, scalef.x, ssize.width, xtab + d * xkanti);
                 else // if (sampler == INTER_CUBIC)
-                    cubicCoeffsAntiAlias(d, cn, f, scale.x, ssize.width, cubicCoeff, xtab + d * xkanti);
+                    cubicCoeffsAntiAlias(d, cn, f, scalef.x, ssize.width, cubicCoeff, xtab + d * xkanti);
             }
         }
         else
@@ -3677,9 +3678,9 @@ private:
             {
                 float f = fmaf(static_cast<float>(d), a, b);
                 if (sampler == INTER_LINEAR)
-                    linearCoeffsAntialias(d, 1, f, scale.y, ssize.height, ytab + d * ykanti);
+                    linearCoeffsAntialias(d, 1, f, scalef.y, ssize.height, ytab + d * ykanti);
                 else // if (sampler == INTER_CUBIC)
-                    cubicCoeffsAntiAlias(d, 1, f, scale.y, ssize.height, cubicCoeff, ytab + d * ykanti);
+                    cubicCoeffsAntiAlias(d, 1, f, scalef.y, ssize.height, cubicCoeff, ytab + d * ykanti);
             }
         }
         else
@@ -3857,7 +3858,8 @@ public:
             if (!same_wt_idxt)
             {
                 // only when is_fixpt, wt (int) and idxt (float) can be different
-                CV_Check(ctrl.is_fixpt, ctrl.is_fixpt && (std::is_same<IdxT, float>::value)
+                CV_DbgCheck(ctrl.is_fixpt, ctrl.is_fixpt
+                    && (std::is_same<IdxT, float>::value)
                     && (std::is_same<WT, int>::value), "");
                 float* Lf = reinterpret_cast<float*>(L);
                 int* D = reinterpret_cast<int*>(dstptr[i]);
@@ -3891,44 +3893,84 @@ public:
     {
         int cn = dst.channels();
         int dwidth = dst.cols * cn;
-        AutoBuffer<IdxT> line(dwidth * 2);
-        IdxT* L = line.data();
-        IdxT* A = line.data() + dwidth;
-        WT* Lw = reinterpret_cast<WT*>(L);
+        // the sample lines on src of the i-th row (i + 1)-th dst-row
+        // will overlap at most these src-rows
+        int bufrow = ctrl.ykanti - cvFloor(1.f / ctrl.scalef.y);
+        Mat buffer(bufrow + 2, dwidth, DataType<IdxT>::depth);
+        AutoBuffer<IdxT*> line((bufrow + 1) * 2);
+        IdxT* A = buffer.template ptr<IdxT>(bufrow + 1);
+        int* ysrc = reinterpret_cast<int*>(line.data() + bufrow + 1);
+        size_t szcopy = (ctrl.xkanti ? sizeof(WT) : sizeof(IdxT)) * dwidth;
+        for (int i = 0; i <= bufrow; ++i)
+        {
+            line[i] = buffer.template ptr<IdxT>(i);
+            ysrc[i] = -1;
+        }
         for (int dy = range.start; dy < range.end; ++dy)
         {
-            int tstart = dy * ctrl.ykanti, tend = tstart + ctrl.ykanti;
-            for (int t = tstart; t < tend; ++t)
+            int tidx = dy * ctrl.ykanti;
+            for (int t = 0; t < ctrl.ykanti; ++t, ++tidx)
             {
-                CV_DbgCheckEQ(dy, ctrl.ytab[t].di, "something wrong");
+                CV_DbgCheckEQ(dy, ctrl.ytab[tidx].di, "something wrong");
                 IdxT beta;
-                int sy = ctrl.ytab[t].si;
-                ctrl.ytab[t].as(beta);
+                ctrl.ytab[tidx].as(beta);
+                int sy = ctrl.ytab[tidx].si;
                 T const* S = src.template ptr<T>(sy);
+                // if the sy-th row has been computed already, reuse it.
+                int y0 = -1;
+                IdxT* L = line[bufrow];
+                for (int i = 0; i < bufrow; ++i)
+                    if (ysrc[i] == sy)
+                    {
+                        y0 = i;
+                        break;
+                    }
+                // have found, reuse it
+                if (y0 != -1)
+                    L = line[y0];
+                else
+                {
+                    // not found, compute it
+                    if (ctrl.xkanti)
+                    {
+                        memset(L, 0, dwidth * sizeof(IdxT));
+                        horiAntialiasAccumulate(S, L);
+                    }
+                    else
+                    {
+                        // A & Lw maybe different type, can not use inter_area
+                        // A double : Lw double
+                        // A float  : Lw float / int
+                        WT* Lw = reinterpret_cast<WT*>(L);
+                        horiGenericLines(&S, &Lw, 1);
+                    }
+                }
                 if (ctrl.xkanti)
                 {
-                    memset(L, 0, dwidth * sizeof(IdxT));
-                    horiAntialiasAccumulate(S, L);
-                    if (t == tstart)
+                    if (t == 0)
                         inter_area::mul(L, dwidth, beta, A);
                     else
                         inter_area::muladd(L, dwidth, beta, A);
                 }
                 else
                 {
-                    // A & Lw maybe different type, can not use inter_area
-                    // A double : Lw double
-                    // A float  : Lw float / int
-                    horiGenericLines(&S, &Lw, 1);
+                    WT* Lw = reinterpret_cast<WT*>(L);
                     if (ctrl.is_fixpt)
                         beta /= INTER_RESIZE_COEF_SCALE;
-                    if (t == tstart)
+                    if (t == 0)
                         for (int w = 0; w < dwidth; ++w)
                             A[w] = saturate_cast<IdxT>(Lw[w] * beta);
                     else
                         for (int w = 0; w < dwidth; ++w)
                             A[w] += Lw[w] * beta;
                 }
+                // backup the last bufrow results
+                y0 = bufrow - (ctrl.ykanti - t);
+                if (y0 >= 0 && ysrc[y0] != sy /* line[y0] != L */)
+                {
+                    ysrc[y0] = sy;
+                    memcpy(line[y0], L, szcopy);
+                }
             }
             inter_area::saturate_store(A, dwidth, dst.template ptr<T>(dy));
         }

From 734dbbd984059956c690dc70282c04499062ae90 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Wed, 19 Jun 2024 15:35:50 +0800
Subject: [PATCH 08/12] avoid the copy between cache lines in vertical
 antialias - but no significant improvement seen

---
 modules/imgproc/perf/perf_resize.cpp | 14 ++----
 modules/imgproc/src/resize.cpp       | 66 +++++++++++++---------------
 2 files changed, 34 insertions(+), 46 deletions(-)

diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index 355a0482b6..1c133d7b88 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -280,8 +280,6 @@ PERF_TEST_P(MatInfo_Size_Scale_NN, ResizeNNExact,
     SANITY_CHECK_NOTHING();
 }
 
-// The complexity of non-antialias and NN resize is same as that in cv::resize
-// The complexity of antialias resize is ralated to dst size and ceil(1.0 / scale)
 PERF_TEST_P(MatInfo_Size_Size, ResizeOnnxLinearAntialias, Combine(
     Values(CV_8UC1, CV_8UC3, CV_8UC4),
     Values(sz1440p),
@@ -293,18 +291,12 @@ PERF_TEST_P(MatInfo_Size_Size, ResizeOnnxLinearAntialias, Combine(
     Size to = get<2>(GetParam());
 
     cv::Mat src(from, matType), dst(to, matType);
-    switch(src.depth())
-    {
-        case CV_8U: cvtest::fillGradient(src); break;
-        case CV_16U: fillFPGradient<ushort>(src); break;
-        case CV_32S: fillFPGradient<int>(src); break;
-        case CV_32F: fillFPGradient<float>(src); break;
-    }
     declare.in(src).out(dst);
+    declare.time(100);
 
-    TEST_CYCLE_MULTIRUN(10) resizeOnnx(src, dst, to, Point2d(), INTER_LINEAR | INTER_ANTIALIAS);
+    TEST_CYCLE() resizeOnnx(src, dst, to, Point2d(), INTER_LINEAR | INTER_ANTIALIAS);
 
     SANITY_CHECK_NOTHING();
 }
 
-} // namespace
+}
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 3632d40aaa..ffc28da20d 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3740,6 +3740,10 @@ public:
     ResizeOnnxInvoker(const Mat& _src, Mat& _dst, ResizeOnnxCtrl const& _ctrl) :
         src(_src), dst(_dst), ctrl(_ctrl)
     {
+        static_assert(sizeof(WT) == sizeof(IdxT), "expected");
+        static_assert(std::is_same<IdxT, typename std::common_type<IdxT, WT>::type>::value,
+            "IdxT double : WT double | IdxT float : WT float / int");
+
         CV_CheckLE(ctrl.ksize, MAX_ESIZE, "resampler kernel's size is too larger");
         CV_Check(ctrl.is_fixpt, !(ctrl.is_fixpt && ctrl.is_double), "can not be both types");
         // prefer static_assert, but how ?
@@ -3783,9 +3787,6 @@ public:
             CV_Check(ctrl.is_double, (std::is_same<IdxT, float>::value),
                 "when use float coeffs, IdxT is expected to be float");
         }
-        CV_Check(sizeof(IdxT) * 10 + sizeof(WT),
-            (std::is_same<IdxT, typename std::common_type<IdxT, WT>::type>::value),
-            "we need that IdxT is same or more accurate than WT");
     }
 
     void horiAntialiasAccumulate(T const* S, IdxT* L) const
@@ -3894,18 +3895,14 @@ public:
         int cn = dst.channels();
         int dwidth = dst.cols * cn;
         // the sample lines on src of the i-th and (i + 1)-th dst-row
-        // will overlap at most these src-rows
-        int bufrow = ctrl.ykanti - cvFloor(1.f / ctrl.scalef.y);
-        Mat buffer(bufrow + 2, dwidth, DataType<IdxT>::depth);
-        AutoBuffer<IdxT*> line((bufrow + 1) * 2);
-        IdxT* A = buffer.template ptr<IdxT>(bufrow + 1);
-        int* ysrc = reinterpret_cast<int*>(line.data() + bufrow + 1);
-        size_t szcopy = (ctrl.xkanti ? sizeof(WT) : sizeof(IdxT)) * dwidth;
-        for (int i = 0; i <= bufrow; ++i)
-        {
-            line[i] = buffer.template ptr<IdxT>(i);
+        // will overlap at most bufrow src-rows
+        int bstart = 0, bufrow = ctrl.ykanti - cvFloor(1.f / ctrl.scalef.y);
+        // a ring buffer, have bufrow lines, begin with bstart
+        Mat buffer(bufrow + 1, dwidth * sizeof(IdxT), CV_8U);
+        AutoBuffer<int> ysrc(bufrow);
+        IdxT* A = buffer.template ptr<IdxT>(bufrow);
+        for (int i = 0; i < bufrow; ++i)
             ysrc[i] = -1;
-        }
         for (int dy = range.start; dy < range.end; ++dy)
         {
             int tidx = dy * ctrl.ykanti;
@@ -3915,25 +3912,24 @@ public:
                 IdxT beta;
                 ctrl.ytab[tidx].as(beta);
                 int sy = ctrl.ytab[tidx].si;
-                T const* S = src.template ptr<T>(sy);
+                IdxT* L = nullptr;
                 // if the sy-th row has been computed already, reuse it.
-                int y0 = -1;
-                IdxT* L = line[bufrow];
                 for (int i = 0; i < bufrow; ++i)
                     if (ysrc[i] == sy)
                     {
-                        y0 = i;
+                        L = buffer.template ptr<IdxT>(i);
                         break;
                     }
-                // have found, reuse it
-                if (y0 != -1)
-                    L = line[y0];
-                else
+                // else, compute and save to the buffer line with the minimum ysrc
+                if (!L)
                 {
-                    // not found, compute it
+                    T const* S = src.template ptr<T>(sy);
+                    L = buffer.template ptr<IdxT>(bstart);
+                    ysrc[bstart] = sy;
+                    bstart = (bstart + 1) % bufrow;
                     if (ctrl.xkanti)
                     {
-                        memset(L, 0, dwidth * sizeof(IdxT));
+                        memset(L, 0, buffer.cols * sizeof(uchar));
                         horiAntialiasAccumulate(S, L);
                     }
                     else
@@ -3951,8 +3947,7 @@ public:
                 }
                 else
                 {
-                    // A & Lw maybe different type, can not use inter_area
-                    // A double : Lw double | A float : Lw float / int
+                    // A & Lw (IdxT / WT) maybe different type, can not use inter_area
                     WT* Lw = reinterpret_cast<WT*>(L);
                     if (ctrl.is_fixpt)
                         beta /= INTER_RESIZE_COEF_SCALE;
@@ -3963,13 +3958,6 @@ public:
                         for (int w = 0; w < dwidth; ++w)
                             A[w] += Lw[w] * beta;
                 }
-                // backup the last bufrow results
-                y0 = bufrow - (ctrl.ykanti - t);
-                if (y0 >= 0 && ysrc[y0] != sy /* line[y0] != L */)
-                {
-                    ysrc[y0] = sy;
-                    memcpy(line[y0], L, szcopy);
-                }
             }
             inter_area::saturate_store(A, dwidth, dst.template ptr<T>(dy));
         }
@@ -4038,9 +4026,17 @@ public:
 template <typename HResize, typename VResize, typename IdxT>
 static void resizeOnnx_(Mat const& src, Mat& dst, ResizeOnnxCtrl const& ctrl)
 {
+    /* The complexity of resize is relate to ksize and:
+    - non-antialias and NN: dstsize, same as that in cv::resize.
+    - antialias: dstsize and ceil(1.0 / scale). */
+    double nstripes = static_cast<double>(dst.rows) * dst.cols / (1 << 16);
+    // only parallel by rows
+    if (ctrl.ykanti)
+        nstripes *= ceil(1.0 / ctrl.scalef.y);
+    // do not wake too many threads, really use the cache lines
+    nstripes = min(nstripes, 2.0 * getNumberOfCPUs());
     parallel_for_(Range(0, dst.rows),
-        ResizeOnnxInvoker<HResize, VResize, IdxT>(src, dst, ctrl),
-        static_cast<double>(dst.rows) * dst.cols / (1 << 16));
+        ResizeOnnxInvoker<HResize, VResize, IdxT>(src, dst, ctrl), nstripes);
 }
 
 

From 7395881d2d634cae031e05c23bba2d1a8af6a00d Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sat, 27 Jul 2024 12:35:27 +0800
Subject: [PATCH 09/12] fix warning cbuf

---
 .../include/opencv2/core/cuda/vec_math.hpp     | 18 +++++++++---------
 modules/imgproc/src/resize.cpp                 |  8 +++-----
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/modules/core/include/opencv2/core/cuda/vec_math.hpp b/modules/core/include/opencv2/core/cuda/vec_math.hpp
index 0a1205e25b..6e5f1a3b6a 100644
--- a/modules/core/include/opencv2/core/cuda/vec_math.hpp
+++ b/modules/core/include/opencv2/core/cuda/vec_math.hpp
@@ -376,22 +376,22 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
 
 // a += b
 
-#define CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN(op, input_type, output_type) \
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ASSIGN(op, input_type, output_type) \
     __device__ __forceinline__ output_type ## 1 & operator op ## = (output_type ## 1 & a, const input_type ## 1 & b) \
     { \
-    return a = VecTraits<output_type ## 1>::make(a.x op b.x); \
+        return a = VecTraits<output_type ## 1>::make(a.x op b.x); \
     } \
     __device__ __forceinline__ output_type ## 2 & operator op ## = (output_type ## 2 & a, const input_type ## 2 & b) \
     { \
-    return a = VecTraits<output_type ## 2>::make(a.x op b.x, a.y op b.y); \
+        return a = VecTraits<output_type ## 2>::make(a.x op b.x, a.y op b.y); \
     } \
     __device__ __forceinline__ output_type ## 3 & operator op ## = (output_type ## 3 & a, const input_type ## 3 & b) \
     { \
-    return a = VecTraits<output_type ## 3>::make(a.x op b.x, a.y op b.y, a.z op b.z); \
+        return a = VecTraits<output_type ## 3>::make(a.x op b.x, a.y op b.y, a.z op b.z); \
     } \
     __device__ __forceinline__ output_type ## 4 & operator op ## = (output_type ## 4 & a, const input_type ## 4 & b) \
     { \
-    return a = VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
+        return a = VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
     }
 
 // binary operators (vec & vec)
@@ -413,7 +413,7 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
     { \
         return VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
     } \
-    CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN(op, input_type, output_type)
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ASSIGN(op, input_type, output_type)
 
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uchar, int)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, char, int)
@@ -472,8 +472,8 @@ CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
 
-#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN
-#define CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN(op, input_type, output_type)
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ASSIGN
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ASSIGN(op, input_type, output_type)
 
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uchar, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, char, uchar)
@@ -548,7 +548,7 @@ CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, float, uchar)
 CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, double, uchar)
 
 #undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP
-#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_ASSIGN
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ASSIGN
 
 // binary operators (vec & scalar)
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index ffc28da20d..c7fc534021 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3600,9 +3600,11 @@ private:
         area.allocate(xcoeffs, ksize * dsize.width * cn);
         area.allocate(ycoeffs, ksize * dsize.height * 1);
         area.commit();
+        float cbuf[MAX_ESIZE] = { 0 };
         CV_CheckLE(ksize, MAX_ESIZE, "resampler kernel's size is too larger");
 
-        // when upsampling, `antialias` is same to `generic`, so use `generic` to speed up
+        // when upsampling, `antialias` is same as `generic`
+        // so use `generic` to speed up
         if (antialias && scaled.x < 1.0)
         {
             float a = M(0, 0), b = M(0, 1);
@@ -3620,7 +3622,6 @@ private:
             xkanti = 0;
             xmin = 0;
             xmax = dsize.width;
-            float cbuf[MAX_ESIZE];
             float a = M(0, 0), b = M(0, 1);
             for (int d = 0; d < dsize.width; ++d)
             {
@@ -3686,7 +3687,6 @@ private:
         else
         {
             ykanti = 0;
-            float cbuf[MAX_ESIZE];
             float a = M(1, 0), b = M(1, 1);
             for (int d = 0; d < dsize.height; ++d)
             {
@@ -4599,8 +4599,6 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     else
         CV_Error(cv::Error::StsError, "should not got here");
 
-    if (errmsg.size())
-        fputs(errmsg.data(), stderr);
     return k.run(2, globalsize, 0, false);
 }
 

From 0630a39b53c1ffbff1f6e871da3f8211e3be2bbb Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sun, 4 Aug 2024 20:18:42 +0800
Subject: [PATCH 10/12] add exclude_outside and delete tf_crop_resize

---
 modules/imgproc/include/opencv2/imgproc.hpp |  75 ++++----
 modules/imgproc/src/opencl/resize_onnx.cl   |  83 +++++++--
 modules/imgproc/src/resize.cpp              | 111 ++++++++----
 modules/imgproc/test/ocl/test_warp.cpp      |  11 ++
 modules/imgproc/test/test_resize_onnx.cpp   | 191 ++++++++++++--------
 5 files changed, 312 insertions(+), 159 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index b9a1e9e54b..0ca89edf54 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -281,47 +281,51 @@ enum InterpolationFlags {
 //! ONNX Resize Flags
 enum ResizeONNXFlags
 {
-    INTER_SAMPLER_SHIFT        = 0,
-    INTER_SAMPLER_BIT          = 3,
-    INTER_SAMPLER_MASK         = ((1 << INTER_SAMPLER_BIT) - 1) << INTER_SAMPLER_SHIFT,
+    INTER_SAMPLER_SHIFT         = 0,
+    INTER_SAMPLER_BIT           = 3,
+    INTER_SAMPLER_MASK          = ((1 << INTER_SAMPLER_BIT) - 1) << INTER_SAMPLER_SHIFT,
 
-    INTER_COORDINATE_SHIFT     = INTER_SAMPLER_SHIFT + INTER_SAMPLER_BIT,
-    INTER_COORDINATE_BIT       = 3,
-    INTER_COORDINATE_MASK      = ((1 << INTER_COORDINATE_BIT) - 1) << INTER_COORDINATE_SHIFT,
+    INTER_COORDINATE_SHIFT      = INTER_SAMPLER_SHIFT + INTER_SAMPLER_BIT,
+    INTER_COORDINATE_BIT        = 3,
+    INTER_COORDINATE_MASK       = ((1 << INTER_COORDINATE_BIT) - 1) << INTER_COORDINATE_SHIFT,
     /** x_original = (x_resized + 0.5) / scale - 0.5 */
-    INTER_HALF_PIXEL           = 0 << INTER_COORDINATE_SHIFT,
+    INTER_HALF_PIXEL            = 0 << INTER_COORDINATE_SHIFT,
     /** adjustment = output_width_int / output_width
         center = input_width / 2
         offset = center * (1 - adjustment)
         x_ori = offset + (x + 0.5) / scale - 0.5 */
-    INTER_HALF_PIXEL_SYMMETRIC = 1 << INTER_COORDINATE_SHIFT,
+    INTER_HALF_PIXEL_SYMMETRIC  = 1 << INTER_COORDINATE_SHIFT,
     /** x_original = length_resized > 1 ? (x_resized + 0.5) / scale - 0.5 : 0 */
-    INTER_HALF_PIXEL_PYTORCH   = 2 << INTER_COORDINATE_SHIFT,
+    INTER_HALF_PIXEL_PYTORCH    = 2 << INTER_COORDINATE_SHIFT,
     /** x_original = x_resized * (length_original - 1) / (length_resized - 1) */
-    INTER_ALIGN_CORNERS        = 3 << INTER_COORDINATE_SHIFT,
+    INTER_ALIGN_CORNERS         = 3 << INTER_COORDINATE_SHIFT,
     /** x_original = x_resized / scale */
-    INTER_ASYMMETRIC           = 4 << INTER_COORDINATE_SHIFT,
-    /** x_original = length_resized > 1
-            ? start_x * (length_original - 1) + x_resized * (end_x - start_x) * (length_original - 1) / (length_resized - 1)
-            : 0.5 * (start_x + end_x) * (length_original - 1) */
-    INTER_TF_CROP_RESIZE       = 5 << INTER_COORDINATE_SHIFT,
+    INTER_ASYMMETRIC            = 4 << INTER_COORDINATE_SHIFT,
 
-    INTER_NEAREST_MODE_SHIFT   = INTER_COORDINATE_SHIFT + INTER_COORDINATE_BIT,
-    INTER_NEAREST_MODE_BIT     = 2,
-    INTER_NEAREST_MODE_MASK    = ((1 << INTER_NEAREST_MODE_BIT) - 1) << INTER_NEAREST_MODE_SHIFT,
+    INTER_NEAREST_MODE_SHIFT    = INTER_COORDINATE_SHIFT + INTER_COORDINATE_BIT,
+    INTER_NEAREST_MODE_BIT      = 2,
+    INTER_NEAREST_MODE_MASK     = ((1 << INTER_NEAREST_MODE_BIT) - 1) << INTER_NEAREST_MODE_SHIFT,
     /** round half down: x =  ceil(x - 0.5) */
-    INTER_NEAREST_PREFER_FLOOR = 0 << INTER_NEAREST_MODE_SHIFT,
+    INTER_NEAREST_PREFER_FLOOR  = 0 << INTER_NEAREST_MODE_SHIFT,
     /** round half up  : x = floor(x + 0.5) */
-    INTER_NEAREST_PREFER_CEIL  = 1 << INTER_NEAREST_MODE_SHIFT,
+    INTER_NEAREST_PREFER_CEIL   = 1 << INTER_NEAREST_MODE_SHIFT,
     /** x = floor(x) */
-    INTER_NEAREST_FLOOR        = 2 << INTER_NEAREST_MODE_SHIFT,
+    INTER_NEAREST_FLOOR         = 2 << INTER_NEAREST_MODE_SHIFT,
     /** x =  ceil(x) */
-    INTER_NEAREST_CEIL         = 3 << INTER_NEAREST_MODE_SHIFT,
+    INTER_NEAREST_CEIL          = 3 << INTER_NEAREST_MODE_SHIFT,
 
-    INTER_ANTIALIAS_SHIFT      = INTER_NEAREST_MODE_SHIFT + INTER_NEAREST_MODE_BIT,
-    INTER_ANTIALIAS_BIT        = 1,
-    INTER_ANTIALIAS_MASK       = ((1 << INTER_ANTIALIAS_BIT) - 1) << INTER_ANTIALIAS_SHIFT,
-    INTER_ANTIALIAS            = 1 << INTER_ANTIALIAS_SHIFT,
+    INTER_ANTIALIAS_SHIFT       = INTER_NEAREST_MODE_SHIFT + INTER_NEAREST_MODE_BIT,
+    INTER_ANTIALIAS_BIT         = 1,
+    INTER_ANTIALIAS_MASK        = ((1 << INTER_ANTIALIAS_BIT) - 1) << INTER_ANTIALIAS_SHIFT,
+    INTER_ANTIALIAS             = 1 << INTER_ANTIALIAS_SHIFT,
+
+    INTER_EXCLUDE_OUTSIDE_SHIFT = INTER_ANTIALIAS_SHIFT + INTER_ANTIALIAS_BIT,
+    INTER_EXCLUDE_OUTSIDE_BIT   = 1,
+    INTER_EXCLUDE_OUTSIDE_MASK  = ((1 << INTER_EXCLUDE_OUTSIDE_BIT) - 1) << INTER_EXCLUDE_OUTSIDE_SHIFT,
+    /** If set, the weights of sampling locations outside the image
+        are set to 0 and the remaining weights are renormalized so that they sum to 1.0.
+        Only available for antialias or bi-cubic resampling. */
+    INTER_EXCLUDE_OUTSIDE       = 1 << INTER_EXCLUDE_OUTSIDE_SHIFT,
 };
 
 /** \brief Specify the polar mapping mode
@@ -2452,8 +2456,8 @@ enlarge an image, it will generally look best with #INTER_CUBIC (slow) or #INTER
 @param dst output image; it has the size dsize (when it is non-zero) or the size computed from
 src.size(), fx, and fy; the type of dst is the same as of src.
 @param dsize output image size; if it equals zero (`None` in Python), it is computed as:
- \f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
- Either dsize or both fx and fy must be non-zero.
+\f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
+Either dsize or both fx and fy must be non-zero.
 @param fx scale factor along the horizontal axis; when it equals 0, it is computed as
 \f[\texttt{(double)dsize.width/src.cols}\f]
 @param fy scale factor along the vertical axis; when it equals 0, it is computed as
@@ -2470,7 +2474,7 @@ CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
 
 https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
 https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
-Not support `exclude_outside` and `extrapolation_value` yet.
+`tf_crop_resize` is not supported yet.
 
 To get a similar result to `cv::resize`, give dsize and:
     INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR
@@ -2480,18 +2484,17 @@ To get a similar result to `cv::resize`, give dsize and:
 @param src input image.
 @param dst output image; it has the size dsize (when it is non-zero) or the size computed from src.size(), scale; the type of dst is the same as of src.
 @param dsize output image size; if it equals to zero, it is computed as:
- \f[\texttt{dsize = Size(int(scale.x * src.cols), int(scale.y * src.rows))}\f]
- Either dsize or scale must be non-zero.
+\f[\texttt{dsize = Size((int)(scale.x * src.cols), (int)(scale.y * src.rows))}\f]
+Either dsize or scale must be non-zero.
 @param scale scale factor; use same definition as ONNX, if scale > 1, it's upsampling.
-@param interpolation interpolation / coordiante, see #InterpolationFlags and #ResizeONNXFlags
+@param interpolation interpolation flags, see #InterpolationFlags and #ResizeONNXFlags
 @param cubicCoeff cubic sampling coeff; range \f[[-1.0, 0)\f]
-@param roi crop region; if provided, the rois' coordinates are normalized in the coordinate system of the input image; it only takes effect with INTER_TF_CROP_RESIZE (ONNX tf_crop_and_resize)
 
 @sa  resize
  */
-CV_EXPORTS_W void resizeOnnx(InputArray src, OutputArray dst, Size dsize,
-    Point2d scale = Point2d(), int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
-    float cubicCoeff = -0.75f, Rect2d const& roi = Rect2d());
+CV_EXPORTS_W void resizeOnnx(
+    InputArray src, OutputArray dst, Size dsize, Point2d scale = Point2d(),
+    int interpolation = INTER_LINEAR, float cubicCoeff = -0.75f);
 
 /** @brief Applies an affine transformation to an image.
 
diff --git a/modules/imgproc/src/opencl/resize_onnx.cl b/modules/imgproc/src/opencl/resize_onnx.cl
index 611e0d6b35..731a98f503 100644
--- a/modules/imgproc/src/opencl/resize_onnx.cl
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@@ -12,11 +12,13 @@
 
 #define noconvert(x) (x)
 
+// for debug and intellisense
 #ifndef T
 #   define INTER_NEAREST1
 #   define INTER_LINEAR1
 #   define INTER_CUBIC
 #   define INTER_ANTIALIAS1
+#   define EXCLUDE_OUTSIDE 1
 #   define T int
 #   define W double
 #   define CN 3
@@ -155,12 +157,22 @@ __kernel void resizeOnnx_linear_antialias(
         {
             VW sline = (VW)(0);
             float wline = 0;
-            int sy = clamp(iy + h, 0, src_rows - 1);
+            int sy = iy + h;
+#if EXCLUDE_OUTSIDE
+            if ((unsigned)(sy) >= (unsigned)(src_rows))
+                continue;
+#endif
+            sy = clamp(sy, 0, src_rows - 1);
             __global uchar const* S = srcptr + sy * src_step + src_offset;
             for (int w = xstart; w < xend; ++w)
             {
+                int sx = ix + w;
+#if EXCLUDE_OUTSIDE
+                if ((unsigned)(sx) >= (unsigned)(src_cols))
+                    continue;
+#endif
+                sx = clamp(sx, 0, src_cols - 1);
                 // the computation of linear's weights is trival, so do it in kernel
-                int sx = clamp(ix + w, 0, src_cols - 1);
                 float t = fabs(w - rx) * xscale;
                 t = clamp(1.f - t, 0.f, 1.f);
                 wline += t;
@@ -171,8 +183,7 @@ __kernel void resizeOnnx_linear_antialias(
             weight += u * wline;
             sumval += u * sline;
         }
-        VT d0 = TO_VEC_TYPE(sumval / weight);
-        storepix(d0, D);
+        storepix(TO_VEC_TYPE(sumval / weight), D);
 #else
         W sumval = 0;
         float weight = 0;
@@ -180,11 +191,21 @@ __kernel void resizeOnnx_linear_antialias(
         {
             W sline = 0;
             float wline = 0;
-            int sy = clamp(iy + h, 0, src_rows - 1);
+            int sy = iy + h;
+#if EXCLUDE_OUTSIDE
+            if ((unsigned)(sy) >= (unsigned)(src_rows))
+                continue;
+#endif
+            sy = clamp(sy, 0, src_rows - 1);
             __global uchar const* S = srcptr + sy * src_step + src_offset;
             for (int w = xstart; w < xend; ++w)
             {
-                int sx = clamp(ix + w, 0, src_cols - 1);
+                int sx = ix + w;
+#if EXCLUDE_OUTSIDE
+                if ((unsigned)(sx) >= (unsigned)(src_cols))
+                    continue;
+#endif
+                sx = clamp(sx, 0, src_cols - 1);
                 float t = fabs(w - rx) * xscale;
                 t = clamp(1.f - t, 0.f, 1.f);
                 wline += t;
@@ -203,11 +224,21 @@ __kernel void resizeOnnx_linear_antialias(
             for (int h = ystart; h < yend; ++h)
             {
                 W sline = 0;
-                int sy = clamp(iy + h, 0, src_rows - 1);
+                int sy = iy + h;
+#if EXCLUDE_OUTSIDE
+                if ((unsigned)(sy) >= (unsigned)(src_rows))
+                    continue;
+#endif
+                sy = clamp(sy, 0, src_rows - 1);
                __global uchar const* S = srcptr + sy * src_step + src_offset;
                 for (int w = xstart; w < xend; ++w)
                 {
-                    int sx = clamp(ix + w, 0, src_cols - 1);
+                    int sx = ix + w;
+#if EXCLUDE_OUTSIDE
+                    if ((unsigned)(sx) >= (unsigned)(src_cols))
+                       continue;
+#endif
+                    sx = clamp(sx, 0, src_cols - 1);
                     float t = fabs(w - rx) * xscale;
                     t = clamp(1.f - t, 0.f, 1.f);
                     sline += t * TO_WORK(((__global T const*)(S + sx * pixel_size))[i]);
@@ -222,7 +253,6 @@ __kernel void resizeOnnx_linear_antialias(
     }
 }
 
-
 #elif defined(INTER_CUBIC) && !defined(INTER_ANTIALIAS)
 
 float cubicCoeff(float A, float A2, float A3, float x)
@@ -253,31 +283,56 @@ __kernel void resizeOnnx_cubic(
         int xlimit = xstart + 3;
         int ylimit = ystart + 3;
         int xoffset[4];
-        float xcoeff[4];
+        float xcoeff[4], xcoeffsum = 0;
         for (int x = xstart; x <= xlimit; ++x)
         {
             xoffset[x - xstart] = clamp(x, 0, src_cols - 1) * pixel_size;
             xcoeff [x - xstart] = cubicCoeff(A, A2, A3, x - fx);
+#if EXCLUDE_OUTSIDE
+            if ((unsigned)(x) >= (unsigned)(src_cols))
+                xcoeff[x - xstart] = 0;
+            xcoeffsum += xcoeff[x - xstart];
+#endif
         }
         __global uchar* D = dstptr + (dy * dst_step + mad24(dx, pixel_size, dst_offset));
 #if CN == 1 || CN == 2 || CN == 3 || CN == 4
         VW sum = (VW)(0);
+#if EXCLUDE_OUTSIDE
+        float ycoeffsum = 0;
+#endif
         for (int y = ystart; y <= ylimit; ++y)
         {
+#if EXCLUDE_OUTSIDE
+            if ((unsigned)(y) >= (unsigned)(src_rows))
+                continue;
+#endif
             int yoffset = clamp(y, 0, src_rows - 1) * src_step + src_offset;
             VW sline = (VW)(0);
             for (int x = 0; x < 4; ++x)
                 sline += (VW)(xcoeff[x]) * TO_VEC_WORK(loadpix(srcptr + yoffset + xoffset[x]));
-            sum += sline * (VW)(cubicCoeff(A, A2, A3, y - fy));
+            float u = cubicCoeff(A, A2, A3, y - fy);
+#if EXCLUDE_OUTSIDE
+            ycoeffsum += u;
+#endif
+            sum += sline * u;
         }
+#if EXCLUDE_OUTSIDE
+        storepix(TO_VEC_TYPE(sum / (ycoeffsum * xcoeffsum)), D);
+#else
         storepix(TO_VEC_TYPE(sum), D);
+#endif
 #else
         int yoffset[4];
-        float ycoeff[4];
+        float ycoeff[4], weight = 0;
         for (int y = ystart; y <= ylimit; ++y)
         {
             yoffset[y - ystart] = clamp(y, 0, src_rows - 1) * src_step + src_offset;
             ycoeff [y - ystart] = cubicCoeff(A, A2, A3, y - fy);
+#if EXCLUDE_OUTSIDE
+            if ((unsigned)(y) >= (unsigned)(src_rows))
+                ycoeff[y - ystart] = 0;
+            weight += ycoeff[y - ystart] * xcoeffsum;
+#endif
         }
         for (int i = 0; i < channel; ++i)
         {
@@ -290,7 +345,11 @@ __kernel void resizeOnnx_cubic(
                                                 (srcptr + yoffset[y] + xoffset[x]))[i]);
                 sum += sline * ycoeff[y];
             }
+#if EXCLUDE_OUTSIDE
+            ((__global T*)(D))[i] = TO_TYPE(sum / weight);
+#else
             ((__global T*)(D))[i] = TO_TYPE(sum);
+#endif
         }
 #endif
     }
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index c7fc534021..23d9e5eb23 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -952,7 +952,7 @@ static inline void interpolateLanczos4( float x, float* coeffs )
  * note: scale may be user input and not equal to (src / dst).
  * ref to onnx, length_resized is src * scale (float), not dst (int).
  */
-static Vec2f interCoordinate(int coordinate, int dst, int src, double scale, double start, double end)
+static Vec2f interCoordinate(int coordinate, int dst, int src, double scale)
 {
     float a, b;
     if (coordinate == INTER_HALF_PIXEL
@@ -979,22 +979,6 @@ static Vec2f interCoordinate(int coordinate, int dst, int src, double scale, dou
         a = static_cast<float>(1.0 / scale);
         b = 0.f;
     }
-    else if (coordinate == INTER_TF_CROP_RESIZE)
-    {
-        CV_CheckGE(start, 0.0, "roi's start is out of image");
-        CV_CheckLE(end  , 1.0, "roi's end   is out of image");
-        CV_CheckLT(start, end, "roi's start must be less than its end");
-        if (dst <= 1)
-        {
-            a = 0.f;
-            b = static_cast<float>(0.5 * (start + end) * (src - 1.0));
-        }
-        else
-        {
-            a = static_cast<float>((end - start) * (src - 1.0) / (src * scale - 1.0));
-            b = static_cast<float>(start * (src - 1.0));
-        }
-    }
     else
         CV_Error(Error::StsBadArg, format("Unknown coordinate transformation mode %d", coordinate));
     return Vec2f(a, b);
@@ -3481,6 +3465,14 @@ public:
 
     /* resize parameter */
     bool is_fixpt, is_double;
+    int sampler, antialias;
+    /* Only meaningful for bi-cubic or antialias resampling.
+        For nearest neighbor, excluding the sample would leave no pixel to select.
+        For linear without antialias, of the two sampled pixels
+            at least one is inside and at most one is outside,
+            so exclude_outside is simply equivalent to clamping.
+    */
+    int exclude_outside;
     int ksize, xkanti, ykanti;
     Point2f scalef;
 
@@ -3504,6 +3496,8 @@ private:
         int start = cvFloor(-2.f / scale) + 1;
         int end = 2 - start;
         int len = end - start;
+        // no need to add FLT_EPSILON.
+        // in antialias cubic resize, we will have at least ceil(2 / scale) pixels inside
         float sum = 0;
         for (int i = start; i < end; ++i)
         {
@@ -3514,8 +3508,11 @@ private:
                 x = A * (((x - 5) * x + 8) * x - 4);
             else
                 x = 0;
+            int sx = index + i;
+            if (exclude_outside && static_cast<unsigned>(sx) >= static_cast<unsigned>(srclen))
+                x = 0;
             elem[i - start].di = cn * dstlen;
-            elem[i - start].si = cn * min(max(index + i, 0), srclen - 1);
+            elem[i - start].si = cn * min(max(sx, 0), srclen - 1);
             elem[i - start].f = x;
             sum += x;
         }
@@ -3550,8 +3547,11 @@ private:
         {
             float x = fabsf(i - ratio) * scale;
             x = min(max(1.f - x, 0.f), 1.f);
+            int sx = index + i;
+            if (exclude_outside && static_cast<unsigned>(sx) >= static_cast<unsigned>(srclen))
+                x = 0;
             elem[i - start].di = cn * dstlen;
-            elem[i - start].si = cn * min(max(index + i, 0), srclen - 1);
+            elem[i - start].si = cn * min(max(sx, 0), srclen - 1);
             elem[i - start].f = x;
             sum += x;
         }
@@ -3574,8 +3574,9 @@ private:
     ResizeOnnxCtrl(int interpolation, int type, float cubicCoeff,
         Size ssize, Size dsize, Point2d const& scaled, Matx22f const& M)
     {
-        int sampler = interpolation & INTER_SAMPLER_MASK;
-        int antialias = interpolation & INTER_ANTIALIAS_MASK;
+        sampler = interpolation & INTER_SAMPLER_MASK;
+        antialias = interpolation & INTER_ANTIALIAS_MASK;
+        exclude_outside = interpolation & INTER_EXCLUDE_OUTSIDE_MASK;
         CV_CheckGE(cubicCoeff, -1.f, "cubic coefficient should range [-1, 0)");
         CV_CheckLT(cubicCoeff, +0.f, "cubic coefficient should range [-1, 0)");
         CV_Check(sampler, sampler == INTER_LINEAR || sampler == INTER_CUBIC,
@@ -3603,8 +3604,7 @@ private:
         float cbuf[MAX_ESIZE] = { 0 };
         CV_CheckLE(ksize, MAX_ESIZE, "resampler kernel's size is too larger");
 
-        // when upsampling, `antialias` is same as `generic`
-        // so use `generic` to speed up
+        // when upsampling, `antialias` is same as `generic`, use `generic` to speed up
         if (antialias && scaled.x < 1.0)
         {
             float a = M(0, 0), b = M(0, 1);
@@ -3644,7 +3644,23 @@ private:
                 if (sampler == INTER_LINEAR)
                     linearCoeffs(f, cbuf);
                 else // if (sampler == INTER_CUBIC)
+                {
                     cubicCoeffs(f, cubicCoeff, cbuf);
+                    if (exclude_outside && (s < 1 || s + 2 >= ssize.width))
+                    {
+                        // no need to add FLT_EPSILON.
+                        // in cubic without antialias, we will have at least 2 pixels inside
+                        float sum = 0;
+                        for (int k = 0; k < 4; ++k)
+                        {
+                            if (static_cast<unsigned>(s + k - 1) >= static_cast<unsigned>(ssize.width))
+                                cbuf[k] = 0;
+                            sum += cbuf[k];
+                        }
+                        for (int k = 0; k < 4; ++k)
+                            cbuf[k] /= sum;
+                    }
+                }
                 if (is_fixpt)
                 {
                     short* coeffs = reinterpret_cast<short*>(xcoeffs) + cn * ksize * d;
@@ -3697,7 +3713,21 @@ private:
                 if (sampler == INTER_LINEAR)
                     linearCoeffs(f, cbuf);
                 else // if (sampler == INTER_CUBIC)
+                {
                     cubicCoeffs(f, cubicCoeff, cbuf);
+                    if (exclude_outside && (s < 1 || s + 2 >= ssize.height))
+                    {
+                        float sum = 0;
+                        for (int k = 0; k < 4; ++k)
+                        {
+                            if (static_cast<unsigned>(s + k - 1) >= static_cast<unsigned>(ssize.height))
+                                cbuf[k] = 0;
+                            sum += cbuf[k];
+                        }
+                        for (int k = 0; k < 4; ++k)
+                            cbuf[k] /= sum;
+                    }
+                }
                 if (is_fixpt)
                 {
                     short* coeffs = reinterpret_cast<short*>(ycoeffs) + 1 * ksize * d;
@@ -4353,7 +4383,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     return k.run(2, globalsize, 0, false);
 }
 
-static void ocl_resizeOnnxTable(int srclen, int dstlen, int esz,
+static void ocl_resizeOnnxTable(int srclen, int dstlen, int esz, int exclude_outside,
     int sampler, float a, float b, float A, float scale, int* offset, float* coeff)
 {
     // maybe want do linear resize in this way?
@@ -4382,13 +4412,16 @@ static void ocl_resizeOnnxTable(int srclen, int dstlen, int esz,
                 else
                     x = 0;
             }
+            int sx = index + i;
+            if (exclude_outside && static_cast<unsigned>(sx) >= static_cast<unsigned>(srclen))
+                x = 0;
             // make work-item(s) in a work-group load offset / coeff in one / fewer memory transaction
             // offsets & coeffs are arranged like
             //      00     10     20     ... n0
             //      01     11     21     ... n1     ...
             //      0(k-1) 1(k-1) 2(k-1) ... n(k-1)
             int to = d + (i - start) * dstlen;
-            offset[to] = min(max(index + i, 0), srclen - 1) * esz;
+            offset[to] = min(max(sx, 0), srclen - 1) * esz;
             coeff [to] = x;
             sum += x;
         }
@@ -4428,7 +4461,6 @@ static char const* ocl_resizeOnnx_convertTypeString(int sdepth, int ddepth, int
     return buf;
 }
 
-
 static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     Matx22f const& M, Point2d const& scaled, int interpolation, float cubicCoeff)
 {
@@ -4436,6 +4468,7 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     int sampler = interpolation & INTER_SAMPLER_MASK;
     int nearest = interpolation & INTER_NEAREST_MODE_MASK;
     int antialias = interpolation & INTER_ANTIALIAS_MASK;
+    int exclude_outside = interpolation & INTER_EXCLUDE_OUTSIDE_MASK;
     Point2f scale = static_cast<Point2f>(scaled);
     int khalf = (sampler == INTER_LINEAR ? 2 : 4) / 2;
     float xscale = min(scale.x, 1.f), yscale = min(scale.y, 1.f);
@@ -4512,9 +4545,10 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     {
         int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
         buildopts = format(
-            "-D INTER_LINEAR -D INTER_ANTIALIAS "
+            "-D INTER_LINEAR -D INTER_ANTIALIAS -D EXCLUDE_OUTSIDE=%d "
             "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
             "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            exclude_outside,
             ocl_resizeOnnx_typeToString(T, nullptr, 0),
             ocl_resizeOnnx_typeToString(W, nullptr, 0),
             cn, pixel_size,
@@ -4537,9 +4571,10 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
     {
         int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
         buildopts = format(
-            "-D INTER_CUBIC "
+            "-D INTER_CUBIC -D EXCLUDE_OUTSIDE=%d "
             "-D T=%s -D W=%s -D CN=%d -D PIXEL_SIZE=%d -D VT=%s -D VW=%s "
             "-D TO_WORK=%s -D TO_VEC_WORK=%s -D TO_TYPE=%s -D TO_VEC_TYPE=%s ",
+            exclude_outside,
             ocl_resizeOnnx_typeToString(T, nullptr, 0),
             ocl_resizeOnnx_typeToString(W, nullptr, 0),
             cn, pixel_size,
@@ -4567,10 +4602,11 @@ static bool ocl_resizeOnnx(InputArray _src, OutputArray _dst,
         int* yoffset = xoffset + xstride;
         float* xcoeff = reinterpret_cast<float*>(yoffset + ystride);
         float* ycoeff = reinterpret_cast<float*>(xcoeff + xstride);
+        // use table coeffs, no need to define `-D EXCLUDE_OUTSIDE=%d`
         ocl_resizeOnnxTable(src.cols, dst.cols, pixel_size,
-            sampler, M(0, 0), M(0, 1), cubicCoeff, scale.x, xoffset, xcoeff);
+            exclude_outside, sampler, M(0, 0), M(0, 1), cubicCoeff, scale.x, xoffset, xcoeff);
         ocl_resizeOnnxTable(src.rows, dst.rows, static_cast<int>(src.step[0]),
-            sampler, M(1, 0), M(1, 1), cubicCoeff, scale.y, yoffset, ycoeff);
+            exclude_outside, sampler, M(1, 0), M(1, 1), cubicCoeff, scale.y, yoffset, ycoeff);
         UMat utable;
         Mat(1, tabsize, CV_32S, table.data()).copyTo(utable);
         int W = (T < CV_32S || T == CV_32F) ? CV_32F : CV_64F, VW = CV_MAKETYPE(W, cn);
@@ -5224,7 +5260,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
 
 
 void cv::resizeOnnx(InputArray _src, OutputArray _dst,
-    Size dsize, Point2d scale, int interpolation, float cubicCoeff, Rect2d const& roi)
+    Size dsize, Point2d scale, int interpolation, float cubicCoeff)
 {
     static_assert((1 << INTER_SAMPLER_BIT) >= INTER_MAX, "");
     CV_INSTRUMENT_REGION();
@@ -5268,22 +5304,19 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
         coordinate == INTER_HALF_PIXEL_PYTORCH ||
         coordinate == INTER_HALF_PIXEL_SYMMETRIC ||
         coordinate == INTER_ALIGN_CORNERS ||
-        coordinate == INTER_ASYMMETRIC ||
-        coordinate == INTER_TF_CROP_RESIZE);
+        coordinate == INTER_ASYMMETRIC);
 
     // x_org = x * a + b
     Matx22f M;
-    Vec2f xcoef = interCoordinate(
-        coordinate, dsize.width, ssize.width, scale.x, roi.x, roi.x + roi.width);
-    Vec2f ycoef = interCoordinate(
-        coordinate, dsize.height, ssize.height, scale.y, roi.y, roi.y + roi.height);
+    Vec2f xcoef = interCoordinate(coordinate, dsize.width, ssize.width, scale.x);
+    Vec2f ycoef = interCoordinate(coordinate, dsize.height, ssize.height, scale.y);
     M(0, 0) = xcoef[0];
     M(0, 1) = xcoef[1];
     M(1, 0) = ycoef[0];
     M(1, 1) = ycoef[1];
 
     _dst.create(dsize, _src.type());
-    if (dsize == ssize && coordinate != INTER_TF_CROP_RESIZE)
+    if (dsize == ssize)
     {
         // Source and destination are of same size. Use simple copy.
         _src.copyTo(_dst);
@@ -5300,6 +5333,8 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
 
     CV_OCL_RUN(_src.isUMat() && _dst.isUMat(),
         ocl_resizeOnnx(_src, _dst, M, scale, interpolation, cubicCoeff))
+    // if (cv::ocl::isOpenCLActivated() && _src.isUMat() && _dst.isUMat())
+        // CV_Assert(ocl_resizeOnnx(_src, _dst, M, scale, interpolation, cubicCoeff));
 
     Mat src = _src.getMat(), dst = _dst.getMat();
 
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index 19682c8502..b964b0fdaa 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -701,6 +701,17 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpAntiAlias, ResizeOnnx, Combine(
                             Bool(),
                             Values(1, 16)));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpExcludeOutside, ResizeOnnx, Combine(
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+                            Values(0.4, 0.27, 1.6),
+                            Values(0.5, 0.71, 2.7),
+                            Values(
+                                (int)(                  INTER_CUBIC  | INTER_EXCLUDE_OUTSIDE),
+                                (int)(INTER_ANTIALIAS | INTER_CUBIC  | INTER_EXCLUDE_OUTSIDE),
+                                (int)(INTER_ANTIALIAS | INTER_LINEAR | INTER_EXCLUDE_OUTSIDE)),
+                            Bool(),
+                            Values(1, 16)));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpNearest, ResizeOnnx, Combine(
                             Values(CV_8S, CV_16S, CV_32F, CV_64F),
                             Values(0.4, 0.27, 1.6),
diff --git a/modules/imgproc/test/test_resize_onnx.cpp b/modules/imgproc/test/test_resize_onnx.cpp
index 6b8c4fc3de..86798a4152 100644
--- a/modules/imgproc/test/test_resize_onnx.cpp
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@@ -12,7 +12,6 @@ struct ResizeOnnx
     Size szsrc, szref, szdst;
     Point2d scale;
     float cubic;
-    Rect2d roi;
     /* make sure insrc is:
      *   (1) integer
      *   (2) range [-127, 127]
@@ -44,7 +43,7 @@ struct ResizeOnnx
         Mat iR(szref, CV_64F, inref.data());
         Mat S = iS, R = iR, nS, nR;
         double alpha[8] = {1, -1, 5, 5, 0, -3, -2, +4};
-        double  beta[8] = {0, -0, 0, 7, 7, -7, -6, +6};
+        double  beta[8] = {0, -0, 0, 7, 7, +7, -6, -6};
         RNG& rng = TS::ptr()->get_rng();
         for (int cn = 1; cn <= 8; ++cn)
         {
@@ -59,21 +58,19 @@ struct ResizeOnnx
             {
                 double eps = (depth <= CV_32S) ? 1.0 : 1e-3;
                 int type = CV_MAKETYPE(depth, cn);
-                string errinfo = "fail on type " + typeToString(type);
                 Mat src, ref, dst;
                 rand_roi(rng, src, szsrc, type);
                 if (szdst.area())
                     rand_roi(rng, dst, szdst, type);
                 S.convertTo(src, type);
                 R.convertTo(ref, type);
-                resizeOnnx(src, dst, szdst, scale, interpolate, cubic, roi);
-                EXPECT_EQ(ref.size(), dst.size()) << errinfo;
+                resizeOnnx(src, dst, szdst, scale, interpolate, cubic);
                 // nearest must give bit-same result
                 if ((interpolate & INTER_SAMPLER_MASK) == INTER_NEAREST)
-                    EXPECT_EQ(cv::norm(ref, dst, NORM_INF), 0.0) << errinfo;
+                    EXPECT_MAT_NEAR(ref, dst, 0.0);
                 // cvRound(4.5) = 4, but when doing resize with int, we may get 5
                 else
-                    EXPECT_LE(cv::norm(ref, dst, NORM_INF), eps) << errinfo;
+                    EXPECT_MAT_NEAR(ref, dst, eps);
             }
         }
     }
@@ -85,8 +82,7 @@ TEST(ResizeOnnx, downsample_scales_cubic)
 {
     ResizeOnnx{
         INTER_CUBIC,
-        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
              1.47119141, 2.78125   ,  4.08251953,
@@ -96,12 +92,25 @@ TEST(ResizeOnnx, downsample_scales_cubic)
     }.run();
 }
 
+TEST(ResizeOnnx, downsample_scales_cubic_A_n0p5_exclude_outside)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_EXCLUDE_OUTSIDE,
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8), -0.5f,
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             1.36812675,  2.6695014 ,  4.0133367 ,
+             6.57362535,  7.875     ,  9.2188353 ,
+            11.94896657, 13.25034122, 14.59417652,
+        }
+    }.run();
+}
+
 TEST(ResizeOnnx, downsample_scales_cubic_align_corners)
 {
     ResizeOnnx{
         INTER_CUBIC | INTER_ALIGN_CORNERS,
-        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
              1.0       ,  2.39519159,  3.79038317,
@@ -115,8 +124,7 @@ TEST(ResizeOnnx, downsample_scales_cubic_antialias)
 {
     ResizeOnnx{
         INTER_CUBIC | INTER_ANTIALIAS,
-        Size(4, 4), Size(2, 2), Size(), Point2d(0.6, 0.6),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(2, 2), Size(), Point2d(0.6, 0.6), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             2.5180721,  4.2858863,
@@ -129,8 +137,7 @@ TEST(ResizeOnnx, downsample_scales_linear)
 {
     ResizeOnnx{
         INTER_LINEAR,
-        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6),
-        -0.75f, Rect2d(),
+        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8},
         {2.6666665, 4.3333331}
     }.run();
@@ -140,8 +147,7 @@ TEST(ResizeOnnx, downsample_scales_linear_align_corners)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_ALIGN_CORNERS,
-        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6),
-        -0.75f, Rect2d(),
+        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8},
         {1.0, 3.142857}
     }.run();
@@ -151,8 +157,7 @@ TEST(ResizeOnnx, downsample_scales_linear_antialias)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_ANTIALIAS,
-        Size(4, 4), Size(2, 2), Size(), Point2d(0.6, 0.6),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(2, 2), Size(), Point2d(0.6, 0.6), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             2.875,  4.5,
@@ -165,8 +170,7 @@ TEST(ResizeOnnx, downsample_scales_linear_half_pixel_symmetric)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_HALF_PIXEL_SYMMETRIC,
-        Size(4, 1), Size(2, 1), Size(), Point2d(0.6, 1.0),
-        -0.75f, Rect2d(),
+        Size(4, 1), Size(2, 1), Size(), Point2d(0.6, 1.0), -0.75f,
         {1, 2, 3, 4},
         {1.6666667, 3.3333333}
     }.run();
@@ -176,8 +180,7 @@ TEST(ResizeOnnx, downsample_scales_nearest)
 {
     ResizeOnnx{
         INTER_NEAREST,
-        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6),
-        -0.75f, Rect2d(),
+        Size(4, 2), Size(2, 1), Size(), Point2d(0.6, 0.6), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8},
         {1, 3}
     }.run();
@@ -187,8 +190,7 @@ TEST(ResizeOnnx, downsample_sizes_cubic)
 {
     ResizeOnnx{
         INTER_CUBIC,
-        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
              1.63078704,  3.00462963,  4.37847222,
@@ -202,8 +204,7 @@ TEST(ResizeOnnx, downsample_sizes_cubic_antialias)
 {
     ResizeOnnx{
         INTER_CUBIC | INTER_ANTIALIAS,
-        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
              1.7750092,   3.1200073,  4.4650054,
@@ -217,8 +218,7 @@ TEST(ResizeOnnx, downsample_sizes_linear_antialias)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_ANTIALIAS,
-        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
              2.3636363,  3.590909,   4.818182,
@@ -232,8 +232,7 @@ TEST(ResizeOnnx, downsample_sizes_linear_pytorch_half_pixel)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_HALF_PIXEL_PYTORCH,
-        Size(4, 4), Size(1, 3), Size(1, 3), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(1, 3), Size(1, 3), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
              1.6666666,
@@ -247,35 +246,17 @@ TEST(ResizeOnnx, downsample_sizes_nearest)
 {
     ResizeOnnx{
         INTER_NEAREST,
-        Size(4, 2), Size(3, 1), Size(3, 1), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 2), Size(3, 1), Size(3, 1), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8},
         {1, 2, 4}
     }.run();
 }
 
-TEST(ResizeOnnx, tf_crop_and_resize)
-{
-    // Note: for some rois, the result may be different with that of TF for inaccurate floating point
-    ResizeOnnx{
-        INTER_LINEAR | INTER_TF_CROP_RESIZE,
-        Size(4, 4), Size(3, 3), Size(3, 3), Point2d(),
-        -0.75f, Rect2d(0.6, 0.4, 0.2, 0.2),
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-             7.6000004,  7.9,  8.2     ,
-             8.8      ,  9.1,  9.400001,
-            10.0      , 10.3, 10.6     ,
-        }
-    }.run();
-}
-
 TEST(ResizeOnnx, upsample_scales_cubic)
 {
     ResizeOnnx{
         INTER_CUBIC,
-        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             0.47265625, 0.76953125, 1.24609375, 1.875, 2.28125, 2.91015625, 3.38671875, 3.68359375,
@@ -290,12 +271,30 @@ TEST(ResizeOnnx, upsample_scales_cubic)
     }.run();
 }
 
+TEST(ResizeOnnx, upsample_scales_cubic_A_n0p5_exclude_outside)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_EXCLUDE_OUTSIDE,
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0), -0.5f,
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            0.55882353, 0.81494204, 1.35698249, 1.89705882, 2.39705882, 2.93713516, 3.47917561, 3.73529412,
+            1.58329755, 1.83941606, 2.38145651, 2.92153285, 3.42153285, 3.96160918, 4.50364964, 4.75976814,
+            3.75145936, 4.00757787, 4.54961832, 5.08969466, 5.58969466, 6.12977099, 6.67181144, 6.92792995,
+            5.91176471, 6.16788321, 6.70992366, 7.25, 7.75, 8.29007634, 8.83211679, 9.08823529,
+            7.91176471, 8.16788321, 8.70992366, 9.25, 9.75, 10.29007634, 10.83211679, 11.08823529,
+            10.07207005, 10.32818856, 10.87022901, 11.41030534, 11.91030534, 12.45038168, 12.99242213, 13.24854064,
+            12.24023186, 12.49635036, 13.03839082, 13.57846715, 14.07846715, 14.61854349, 15.16058394, 15.41670245,
+            13.26470588, 13.52082439, 14.06286484, 14.60294118, 15.10294118, 15.64301751, 16.18505796, 16.44117647,
+        }
+    }.run();
+}
+
 TEST(ResizeOnnx, upsample_scales_cubic_align_corners)
 {
     ResizeOnnx{
         INTER_CUBIC | INTER_ALIGN_CORNERS,
-        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             1.0, 1.34110787, 1.80029155, 2.32944606, 2.67055394, 3.19970845, 3.65889213, 4.0,
@@ -314,8 +313,7 @@ TEST(ResizeOnnx, upsample_scales_cubic_asymmetric)
 {
     ResizeOnnx{
         INTER_CUBIC | INTER_ASYMMETRIC,
-        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(8, 8), Size(), Point2d(2.0, 2.0), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             1.0, 1.40625, 2.0, 2.5, 3.0, 3.59375, 4.0, 4.09375,
@@ -334,8 +332,7 @@ TEST(ResizeOnnx, upsample_scales_linear)
 {
     ResizeOnnx{
         INTER_LINEAR,
-        Size(2, 2), Size(4, 4), Size(), Point2d(2.0, 2.0),
-        -0.75f, Rect2d(),
+        Size(2, 2), Size(4, 4), Size(), Point2d(2.0, 2.0), -0.75f,
         {1, 2, 3, 4},
         {
             1.0, 1.25, 1.75, 2.0,
@@ -350,8 +347,7 @@ TEST(ResizeOnnx, upsample_scales_linear_align_corners)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_ALIGN_CORNERS,
-        Size(2, 2), Size(4, 4), Size(), Point2d(2.0, 2.0),
-        -0.75f, Rect2d(),
+        Size(2, 2), Size(4, 4), Size(), Point2d(2.0, 2.0), -0.75f,
         {1, 2, 3, 4},
         {
             1.0, 1.33333333, 1.66666667, 2.0,
@@ -366,8 +362,7 @@ TEST(ResizeOnnx, upsample_scales_linear_half_pixel_symmetric)
 {
     ResizeOnnx{
         INTER_LINEAR | INTER_HALF_PIXEL_SYMMETRIC,
-        Size(2, 2), Size(5, 4), Size(), Point2d(2.94, 2.3),
-        -0.75f, Rect2d(),
+        Size(2, 2), Size(5, 4), Size(), Point2d(2.94, 2.3), -0.75f,
         {1, 2, 3, 4},
         {
             1.0       , 1.15986395, 1.5       , 1.84013605, 2.0       ,
@@ -382,8 +377,7 @@ TEST(ResizeOnnx, upsample_scales_nearest)
 {
     ResizeOnnx{
         INTER_NEAREST,
-        Size(2, 2), Size(6, 4), Size(), Point2d(3.0, 2.0),
-        -0.75f, Rect2d(),
+        Size(2, 2), Size(6, 4), Size(), Point2d(3.0, 2.0), -0.75f,
         {1, 2, 3, 4},
         {
             1, 1, 1, 2, 2, 2,
@@ -398,8 +392,7 @@ TEST(ResizeOnnx, upsample_sizes_cubic)
 {
     ResizeOnnx{
         INTER_CUBIC,
-        Size(4, 4), Size(10, 9), Size(10, 9), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(10, 9), Size(10, 9), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             0.45507922, 0.64057922, 0.97157922, 1.42257922, 1.90732922, 2.22332922, 2.70807922, 3.15907922, 3.49007922, 3.67557922,
@@ -419,8 +412,7 @@ TEST(ResizeOnnx, upsample_sizes_nearest)
 {
     ResizeOnnx{
         INTER_NEAREST,
-        Size(2, 2), Size(8, 7), Size(8, 7), Point2d(),
-        -0.75f, Rect2d(),
+        Size(2, 2), Size(8, 7), Size(8, 7), Point2d(), -0.75f,
         {1, 2, 3, 4},
         {
             1, 1, 1, 1, 2, 2, 2, 2,
@@ -438,8 +430,7 @@ TEST(ResizeOnnx, upsample_sizes_nearest_ceil_half_pixel)
 {
     ResizeOnnx{
         INTER_NEAREST | INTER_NEAREST_CEIL,
-        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             1, 2, 2, 3, 3, 4, 4, 4,
@@ -458,8 +449,7 @@ TEST(ResizeOnnx, upsample_sizes_nearest_floor_align_corners)
 {
     ResizeOnnx{
         INTER_NEAREST | INTER_NEAREST_FLOOR | INTER_ALIGN_CORNERS,
-        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             1, 1, 1, 2, 2, 3, 3, 4,
@@ -478,8 +468,7 @@ TEST(ResizeOnnx, upsample_sizes_nearest_round_prefer_ceil_asymmetric)
 {
     ResizeOnnx{
         INTER_NEAREST | INTER_NEAREST_PREFER_CEIL | INTER_ASYMMETRIC,
-        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(),
-        -0.75f, Rect2d(),
+        Size(4, 4), Size(8, 8), Size(8, 8), Point2d(), -0.75f,
         {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
         {
             1, 2, 2, 3, 3, 4, 4, 4,
@@ -494,4 +483,60 @@ TEST(ResizeOnnx, upsample_sizes_nearest_round_prefer_ceil_asymmetric)
     }.run();
 }
 
+/* Shared Python setup (imports, input data, scales) for the ONNX reference snippets quoted below:
+import numpy as np
+import onnx
+from onnx.reference.ops.op_resize import (
+    _interpolate_nd,
+    _cubic_coeffs, _cubic_coeffs_antialias,
+    _linear_coeffs, _linear_coeffs_antialias
+)
+data = np.arange(1, 17, dtype=np.float64).reshape(4, 4)
+scales = np.array([0.8, 0.8], dtype=np.float64)
+*/
+
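+/* ONNX reference call with the same settings as the next test (cubic, A=-0.5, antialias, exclude_outside):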
+output = _interpolate_nd(
+    data,
+    lambda x, s: _cubic_coeffs_antialias(x, s, A=-0.5),
+    scale_factors=scales,
+    exclude_outside=True,
+)
+*/
+TEST(ResizeOnnx, downsample_scales_cubic_antialias_A_n0p5_exclude_outside)
+{
+    ResizeOnnx{
+        INTER_CUBIC | INTER_ANTIALIAS | INTER_EXCLUDE_OUTSIDE,
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8), -0.5f,
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             1.68342335,  2.90749817,  4.22822584,
+             6.57972264,  7.80379747,  9.12452513,
+            11.86263331, 13.08670813, 14.4074358 ,
+        }
+    }.run();
+}
+
+/* ONNX reference call with the same settings as the next test (linear, antialias, exclude_outside):
+output = _interpolate_nd(
+    data,
+    _linear_coeffs_antialias,
+    scale_factors=scales,
+    exclude_outside=True,
+)
+*/
+TEST(ResizeOnnx, downsample_scales_linear_antialias_exclude_outside)
+{
+    ResizeOnnx{
+        INTER_LINEAR | INTER_ANTIALIAS | INTER_EXCLUDE_OUTSIDE,
+        Size(4, 4), Size(3, 3), Size(), Point2d(0.8, 0.8), -0.75f,
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+             2.25      ,  3.41666667,  4.58333333,
+             6.91666667,  8.08333333,  9.25      ,
+            11.58333333, 12.75      , 13.91666667,
+        }
+    }.run();
+}
+
 }}

From 820db2e236b4cab02eb61f33f2b5ff18b39142a5 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Sun, 4 Aug 2024 22:08:19 +0800
Subject: [PATCH 11/12] remove unnecessary clamp when exclude_outside

---
 modules/imgproc/include/opencv2/imgproc.hpp |  2 +-
 modules/imgproc/src/opencl/resize_onnx.cl   | 22 ++++++++++++++-------
 modules/imgproc/test/test_resize_onnx.cpp   |  5 +++--
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 0ca89edf54..0c04d6fa0a 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -2488,7 +2488,7 @@ To get a similar result to `cv::resize`, give dsize and:
 Either dsize or scale must be non-zero.
 @param scale scale factor; use same definition as ONNX, if scale > 1, it's upsampling.
 @param interpolation interpolation flags, see #InterpolationFlags and #ResizeONNXFlags
-@param cubicCoeff cubic sampling coeff; range \f[[-1.0, 0)\f]
+@param cubicCoeff cubic sampling coefficient, range \f$[-1.0, 0)\f$
 
 @sa  resize
  */
diff --git a/modules/imgproc/src/opencl/resize_onnx.cl b/modules/imgproc/src/opencl/resize_onnx.cl
index 731a98f503..d3e2476a1e 100644
--- a/modules/imgproc/src/opencl/resize_onnx.cl
+++ b/modules/imgproc/src/opencl/resize_onnx.cl
@@ -161,8 +161,9 @@ __kernel void resizeOnnx_linear_antialias(
 #if EXCLUDE_OUTSIDE
             if ((unsigned)(sy) >= (unsigned)(src_rows))
                 continue;
-#endif
+#else
             sy = clamp(sy, 0, src_rows - 1);
+#endif
             __global uchar const* S = srcptr + sy * src_step + src_offset;
             for (int w = xstart; w < xend; ++w)
             {
@@ -170,8 +171,9 @@ __kernel void resizeOnnx_linear_antialias(
 #if EXCLUDE_OUTSIDE
                 if ((unsigned)(sx) >= (unsigned)(src_cols))
                     continue;
-#endif
+#else
                 sx = clamp(sx, 0, src_cols - 1);
+#endif
                 // the computation of linear's weights is trival, so do it in kernel
                 float t = fabs(w - rx) * xscale;
                 t = clamp(1.f - t, 0.f, 1.f);
@@ -195,8 +197,9 @@ __kernel void resizeOnnx_linear_antialias(
 #if EXCLUDE_OUTSIDE
             if ((unsigned)(sy) >= (unsigned)(src_rows))
                 continue;
-#endif
+#else
             sy = clamp(sy, 0, src_rows - 1);
+#endif
             __global uchar const* S = srcptr + sy * src_step + src_offset;
             for (int w = xstart; w < xend; ++w)
             {
@@ -204,8 +207,9 @@ __kernel void resizeOnnx_linear_antialias(
 #if EXCLUDE_OUTSIDE
                 if ((unsigned)(sx) >= (unsigned)(src_cols))
                     continue;
-#endif
+#else
                 sx = clamp(sx, 0, src_cols - 1);
+#endif
                 float t = fabs(w - rx) * xscale;
                 t = clamp(1.f - t, 0.f, 1.f);
                 wline += t;
@@ -228,8 +232,9 @@ __kernel void resizeOnnx_linear_antialias(
 #if EXCLUDE_OUTSIDE
                 if ((unsigned)(sy) >= (unsigned)(src_rows))
                     continue;
-#endif
+#else
                 sy = clamp(sy, 0, src_rows - 1);
+#endif
                __global uchar const* S = srcptr + sy * src_step + src_offset;
                 for (int w = xstart; w < xend; ++w)
                 {
@@ -237,8 +242,9 @@ __kernel void resizeOnnx_linear_antialias(
 #if EXCLUDE_OUTSIDE
                     if ((unsigned)(sx) >= (unsigned)(src_cols))
                        continue;
-#endif
+#else
                     sx = clamp(sx, 0, src_cols - 1);
+#endif
                     float t = fabs(w - rx) * xscale;
                     t = clamp(1.f - t, 0.f, 1.f);
                     sline += t * TO_WORK(((__global T const*)(S + sx * pixel_size))[i]);
@@ -305,8 +311,10 @@ __kernel void resizeOnnx_cubic(
 #if EXCLUDE_OUTSIDE
             if ((unsigned)(y) >= (unsigned)(src_rows))
                 continue;
-#endif
+            int yoffset = y * src_step + src_offset;
+#else
             int yoffset = clamp(y, 0, src_rows - 1) * src_step + src_offset;
+#endif
             VW sline = (VW)(0);
             for (int x = 0; x < 4; ++x)
                 sline += (VW)(xcoeff[x]) * TO_VEC_WORK(loadpix(srcptr + yoffset + xoffset[x]));
diff --git a/modules/imgproc/test/test_resize_onnx.cpp b/modules/imgproc/test/test_resize_onnx.cpp
index 86798a4152..b995d24cbd 100644
--- a/modules/imgproc/test/test_resize_onnx.cpp
+++ b/modules/imgproc/test/test_resize_onnx.cpp
@@ -42,8 +42,9 @@ struct ResizeOnnx
         Mat iS(szsrc, CV_64F, insrc.data());
         Mat iR(szref, CV_64F, inref.data());
         Mat S = iS, R = iR, nS, nR;
-        double alpha[8] = {1, -1, 5, 5, 0, -3, -2, +4};
-        double  beta[8] = {0, -0, 0, 7, 7, +7, -6, -6};
+        // values after convertTo need to be all positive or all negative
+        double alpha[8] = {1, -1, 5, 5, 0, -3, -2, 4};
+        double  beta[8] = {0, -0, 0, 2, 7, -1, -6, 9};
         RNG& rng = TS::ptr()->get_rng();
         for (int cn = 1; cn <= 8; ++cn)
         {

From 61709877cdb5b1dd913e38f61b59269ea22c3de3 Mon Sep 17 00:00:00 2001
From: Ginkgo <ecrirelettre@outlook.com>
Date: Wed, 9 Apr 2025 00:22:39 +0800
Subject: [PATCH 12/12] move OPENCV_EXCLUDE_C_API down

---
 modules/imgproc/src/resize.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index f2a151d58c..6af3a12bc2 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -5314,8 +5314,6 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
     hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
 }
 
-#ifndef OPENCV_EXCLUDE_C_API
-
 void cv::resizeOnnx(InputArray _src, OutputArray _dst,
     Size dsize, Point2d scale, int interpolation, float cubicCoeff)
 {
@@ -5486,6 +5484,7 @@ void cv::resizeOnnx(InputArray _src, OutputArray _dst,
     func(src, dst, ctrl);
 }
 
+#ifndef OPENCV_EXCLUDE_C_API
 
 CV_IMPL void
 cvResize( const CvArr* srcarr, CvArr* dstarr, int method )