From 2ec1d1c986a139d399698bb9511d6033fa79798c Mon Sep 17 00:00:00 2001
From: kallaballa <amir@viel-zu.com>
Date: Mon, 13 Nov 2023 09:48:48 +0100
Subject: [PATCH] use precomputed coefficients

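---
Review note (not part of the commit message): this patch looks like work in
progress. The INTER_LINEAR_EXACT kernel now declares xofs/yofs/ialpha/ibeta
parameters, but its body still computes coordinates from ifx and ify, which
are no longer declared, so the OpenCL program will not build. The host side
passes a single packed `coeffs` buffer, whereas the kernel signature expects
four separate pointers. The host loop over dy also stores yofs without the
clamping that the dx loop applies to xofs.
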
 modules/imgproc/src/opencl/resize.cl | 106 ++++++++++++++++++---------
 modules/imgproc/src/resize.cpp       |  42 ++++++++++-
 2 files changed, 112 insertions(+), 36 deletions(-)

diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl
index 3cbd83db5b..197af5bc63 100644
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -128,6 +128,21 @@ __kernel void resizeSampler(__read_only image2d_t srcImage,
 
 #elif defined INTER_LINEAR_INTEGER
 
+#define FIXED_POINT_BITS 16
+#define FIXED_POINT_SCALE (1 << FIXED_POINT_BITS)
+
+// Fixed-point multiply
+#define FIXED_MUL(a, b) (((a) * (b)) >> FIXED_POINT_BITS)
+
+// Rounding methods
+#define ROUND_NEAREST_EVEN 0
+#define ROUND_DOWN 1
+#define ROUND_UP 2
+#define TRUNCATE 3
+
+// Choose rounding method
+#define ROUNDING_METHOD ROUND_NEAREST_EVEN
+
 __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                        __global const uchar * buffer)
@@ -162,6 +177,9 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
     }
 }
 
+
+
+
 #elif defined INTER_LINEAR
 
 __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
@@ -222,55 +240,77 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
 
 #elif defined INTER_LINEAR_EXACT
 
-#define FIXED_POINT_BITS 8
+#define FIXED_POINT_BITS 16
 #define FIXED_POINT_SCALE (1 << FIXED_POINT_BITS)
 
 // Fixed-point multiply
 #define FIXED_MUL(a, b) (((a) * (b)) >> FIXED_POINT_BITS)
 
+// Rounding methods
+#define ROUND_NEAREST_EVEN 0
+#define ROUND_DOWN 1
+#define ROUND_UP 2
+#define TRUNCATE 3
+
+// Choose rounding method (ROUND_NEAREST_EVEN is implemented below as add-half-then-shift, i.e. round half up)
+#define ROUNDING_METHOD ROUND_NEAREST_EVEN
+
 __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                                 __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                int ifx, int ify)
+                                __global const int * xofs, __global const int * yofs,
+                                __global const short * ialpha, __global const short * ibeta)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
 
-    if (dx < dst_cols && dy < dst_rows)
+    if (dx >= dst_cols || dy >= dst_rows)
     {
-        // Calculate source coordinates
-        int sx = (dx * ifx) >> 16;
-        int sy = (dy * ify) >> 16;
-
-        // Perform boundary checks
-        sx = clamp(sx, 0, src_cols - 1);
-        sy = clamp(sy, 0, src_rows - 1);
-
-        // Calculate interpolation coefficients
-        int u = (dx * ifx) & 0xFFFF;
-        int v = (dy * ify) & 0xFFFF;
-
-        int U = (0x10000 - u) >> 8;
-        int V = (0x10000 - v) >> 8;
-        int U1 = u >> 8;
-        int V1 = v >> 8;
-
-        // Load pixel values
-        WT data0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))));
-        WT data1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(INC(sx, src_cols), TSIZE, src_offset))));
-        WT data2 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(sx, TSIZE, src_offset))));
-        WT data3 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(INC(sx, src_cols), TSIZE, src_offset))));
-
-        // Perform fixed-point interpolation
-        WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
-                 mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
-
-        // Convert and store the result
-        T uval = convertToDT((val + 2) >> 2);
-        storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
+        return; // Exit if dx or dy is out of bounds
     }
+
+    // Calculate source coordinates
+    int sx = (dx * ifx) >> 16;
+    int sy = (dy * ify) >> 16;
+
+    // Perform boundary checks
+    sx = clamp(sx, 0, src_cols - 1);
+    sy = clamp(sy, 0, src_rows - 1);
+
+    // Calculate interpolation coefficients
+    int u = (dx * ifx) & 0xFFFF;
+    int v = (dy * ify) & 0xFFFF;
+
+    int U = (0x10000 - u) >> 8;
+    int V = (0x10000 - v) >> 8;
+    int U1 = u >> 8;
+    int V1 = v >> 8;
+
+    // Load pixel values
+    WT data0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))));
+    WT data1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(INC(sx, src_cols), TSIZE, src_offset))));
+    WT data2 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(sx, TSIZE, src_offset))));
+    WT data3 = convertToWT(loadpix(srcptr + mad24(INC(sy, src_rows), src_step, mad24(INC(sx, src_cols), TSIZE, src_offset))));
+
+    // Perform fixed-point interpolation
+    WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
+             mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
+
+    // Convert and store the result
+#if ROUNDING_METHOD == ROUND_NEAREST_EVEN
+    T uval = convertToDT((val + FIXED_POINT_SCALE / 2) >> FIXED_POINT_BITS);
+#elif ROUNDING_METHOD == ROUND_DOWN
+    T uval = convertToDT(val >> FIXED_POINT_BITS);
+#elif ROUNDING_METHOD == ROUND_UP
+    T uval = convertToDT((val + FIXED_POINT_SCALE - 1) >> FIXED_POINT_BITS);
+#elif ROUNDING_METHOD == TRUNCATE
+    T uval = convertToDT(val >> FIXED_POINT_BITS);
+#endif
+    storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
 }
 
 
+
+
 #elif defined INTER_NEAREST
 
 __kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 1226831c21..ac68a76c37 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3495,9 +3495,45 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         }
     }
     else if (interpolation == INTER_LINEAR_EXACT) {
+        AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
+        int* xofs = (int*)_buffer.data(), * yofs = xofs + dsize.width;
+        short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
+        float fxx, fyy;
+        int sx, sy;
+
+        for (int dx = 0; dx < dsize.width; dx++)
+        {
+            fxx = (float)((dx+0.5)*inv_fx - 0.5);
+            sx = cvFloor(fxx);
+            fxx -= sx;
+
+            if (sx < 0)
+                fxx = 0, sx = 0;
+
+            if (sx >= ssize.width-1)
+                fxx = 0, sx = ssize.width-1;
+
+            xofs[dx] = sx;
+            ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
+            ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
+        }
+
+        for (int dy = 0; dy < dsize.height; dy++)
+        {
+            fyy = (float)((dy+0.5)*inv_fy - 0.5);
+            sy = cvFloor(fyy);
+            fyy -= sy;
+
+            yofs[dy] = sy;
+            ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
+            ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
+        }
+
+        int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+        UMat coeffs;
+        Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);
+
         char buf[2][50];
-        int wdepth = depth <= CV_8S ? CV_32S : std::max(depth, CV_32F);
-        int wtype = CV_MAKETYPE(wdepth, cn);
         k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                  format("-D INTER_LINEAR_EXACT -D depth=%d -D T=%s -D T1=%s "
                         "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
@@ -3510,7 +3546,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
             return false;
 
         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
-               (float)inv_fx, (float)inv_fy);
+               ocl::KernelArg::PtrReadOnly(coeffs));
     }
     else if (interpolation == INTER_NEAREST)
     {