From 672e06c408407d5cd4520424e8ac5f09f19adbb5 Mon Sep 17 00:00:00 2001
From: kallaballa <amir@viel-zu.com>
Date: Fri, 17 Nov 2023 10:15:31 +0100
Subject: [PATCH] ocl_resize: move coeffienct calculation for
 INTER_LINEAR_EXACT from C++ to OpenCL

---
 modules/imgproc/src/opencl/resize.cl | 85 ++++++++++++++++++++--------
 modules/imgproc/src/resize.cpp       | 63 ++++++++-------------
 2 files changed, 83 insertions(+), 65 deletions(-)

diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl
index 88ed8ff095..99341e937e 100644
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -222,40 +222,75 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
 
 #elif defined INTER_LINEAR_EXACT
 
-__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
-                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                       __global const uchar * buffer)
-{
+__kernel void precomputeCoeffs(__read_only image2d_t src, __write_only image2d_t coeffsTex, float inv_fx, float inv_fy) {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
+    int sx, sy;
+    float fxx, fyy;
+    short ialpha, ibeta;
 
-    if (dx < dst_cols && dy < dst_rows)
-    {
-        __global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;
-        __global const short * ialpha = (__global const short *)(yofs + dst_rows);
-        __global const short * ibeta = ialpha + ((dst_cols + dy) << 1);
-        ialpha += dx << 1;
+    fxx = (float)((dx + 0.5) * inv_fx * 0.5 - 0.5);
+    fyy = (float)((dy + 0.5) * inv_fy * 0.5 - 0.5);
 
-        int sx0 = xofs[dx], sy0 = clamp(yofs[dy], 0, src_rows - 1),
-        sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
-        short a0 = ialpha[0], a1 = ialpha[1];
-        short b0 = ibeta[0], b1 = ibeta[1];
+    sx = floor(fxx);
+    fxx -= sx;
+    if (sx < 0)
+        fxx = 0, sx = 0;
+    if (sx >= get_image_width(src) - 1)
+        fxx = 0, sx = get_image_width(src) - 1;
 
-        int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),
-        src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));
-        WT data0 = convertToWT(loadpix(srcptr + src_index0));
-        WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));
-        WT data2 = convertToWT(loadpix(srcptr + src_index1));
-        WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));
+    sy = floor(fyy);
+    fyy -= sy;
+    if (sy < 0)
+        fyy = 0, sy = 0;
+    if (sy >= get_image_height(src) - 1)
+        fyy = 0, sy = get_image_height(src) - 1;
 
-        WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
-                 ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);
+    ialpha = convert_short_sat_rte((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
+    ibeta = convert_short_sat_rte((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
 
-        storepix(convertToDT((val + 2) >> 2),
-                 dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
-    }
+    write_imagei(coeffsTex, (int2)(dx, 0), (int4)(sx, ialpha, 0, 0));
+    write_imagei(coeffsTex, (int2)(0, dy), (int4)(sy, ibeta, 0, 0));
 }
 
+__kernel void resizeLN(__read_only image2d_t src, __write_only image2d_t dst, __read_only image2d_t coeffsTex) {
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+    int sx, sy, sx0, sx1, sy0, sy1;
+    short ialpha0, ialpha1, ibeta0, ibeta1;
+    int4 v0, v1, v2, v3, res;
+    float4 f0, f1, f2, f3, fres;
+
+    sx = read_imagei(coeffsTex, (int2)(dx, 0)).x;
+    ialpha0 = read_imagei(coeffsTex, (int2)(dx, 0)).y;
+    ialpha1 = INTER_RESIZE_COEF_SCALE - ialpha0;
+
+    sy = read_imagei(coeffsTex, (int2)(0, dy)).x;
+    ibeta0 = read_imagei(coeffsTex, (int2)(0, dy)).y;
+    ibeta1 = INTER_RESIZE_COEF_SCALE - ibeta0;
+
+    sx0 = sx * cn;
+    sx1 = sx0 + cn;
+    sy0 = sy * src_stride;
+    sy1 = sy0 + src_stride;
+
+    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+    v0 = read_imagei(src, sampler, (int2)(sx0, sy0));
+    v1 = read_imagei(src, sampler, (int2)(sx1, sy0));
+    v2 = read_imagei(src, sampler, (int2)(sx0, sy1));
+    v3 = read_imagei(src, sampler, (int2)(sx1, sy1));
+
+    f0 = convert_float4(v0);
+    f1 = convert_float4(v1);
+    f2 = convert_float4(v2);
+    f3 = convert_float4(v3);
+
+    fres = (f0 * ialpha0 + f1 * ialpha1) * ibeta0 + (f2 * ialpha0 + f3 * ialpha1) * ibeta1;
+    fres = fres * (1.0f / (INTER_RESIZE_COEF_SCALE * INTER_RESIZE_COEF_SCALE));
+
+    res = convert_int4_sat_rte(fres);
+    write_imagei(dst, (int2)(dx + dst_offset, dy + dst_offset), res);
+}
 
 #elif defined INTER_NEAREST
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index c7e487ea94..5feebb644a 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -3494,47 +3494,14 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         }
     }
     else if (interpolation == INTER_LINEAR_EXACT) {
-        AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
-        int* xofs = (int*)_buffer.data(), * yofs = xofs + dsize.width;
-        short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
-        float fxx, fyy;
-        int sx, sy;
-
-        for (int dx = 0; dx < dsize.width; dx++)
-        {
-            fxx = (float)((dx+0.5)*inv_fx - 0.5);
-            sx = cvFloor(fxx);
-            fxx -= sx;
-
-            if (sx < 0)
-                fxx = 0, sx = 0;
-
-            if (sx >= ssize.width-1)
-                fxx = 0, sx = ssize.width-1;
-
-            xofs[dx] = sx;
-            ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
-            ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
-        }
-
-        for (int dy = 0; dy < dsize.height; dy++)
-        {
-            fyy = (float)((dy+0.5)*inv_fy - 0.5);
-            sy = cvFloor(fyy);
-            fyy -= sy;
-
-            yofs[dy] = sy;
-            ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
-            ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
-        }
-
         int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
-        UMat coeffs;
-        Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);
 
         char buf[2][50];
-        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_LINEAR_EXACT -D depth=%d -D T=%s -D T1=%s "
+        // Precompute the coefficients and store them in a texture
+        UMat coeffsTex(dsize.height + dsize.width, 2, CV_16SC1);
+        ocl::KernelArg coeffsTexArg = ocl::KernelArg::WriteOnlyNoSize(coeffsTex);
+        k.create("precomputeCoeffs", ocl::imgproc::resize_oclsrc,
+                 format("-D PRECOMPUTE_COEFFS -D depth=%d -D T=%s -D T1=%s "
                         "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
                         "-D INTER_RESIZE_COEF_BITS=%d",
                         depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
@@ -3543,9 +3510,25 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
                         cn, INTER_RESIZE_COEF_BITS));
         if (k.empty())
             return false;
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), coeffsTexArg, (float)inv_fx * 0.5, (float)inv_fy * 0.5);
+        size_t globalThreads[2] = { (size_t)dsize.width, (size_t)dsize.height };
+        if (!k.run(2, globalThreads, NULL, false))
+            return false;
 
-        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
-               ocl::KernelArg::PtrReadOnly(coeffs));
+        // Use the texture in the interpolation kernel
+        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_LINEAR_EXACT -D depth=%d -D T=%s -D T1=%s "
+                        "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                        "-D INTER_RESIZE_COEF_BITS=%d -D USE_TEXTURE",
+                        depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                        ocl::convertTypeStr(depth, wdepth, cn, buf[0], sizeof(buf[0])),
+                        ocl::convertTypeStr(wdepth, depth, cn, buf[1], sizeof(buf[1])),
+                        cn, INTER_RESIZE_COEF_BITS));
+        if (k.empty())
+            return false;
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), coeffsTexArg);
+        if (!k.run(2, globalThreads, NULL, false))
+            return false;
     }
     else if (interpolation == INTER_NEAREST)
     {