optimized INTER_LINEAR mode

2025-07-26 07:07:37 +08:00 · 2014-06-17 19:41:53 +04:00 · 2014-06-17 19:41:53 +04:00 · 87f4b47a4f
commit 87f4b47a4f
parent c41a134394
3 changed files with 60 additions and 8 deletions
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -3640,10 +3640,9 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
    }
    int scalarcn = cn == 3 ? 4 : cn;
    int sctype = CV_MAKETYPE(depth, scalarcn);
-    buildOptions += format(" -D T=%s -D T1=%s"
-                           " -D cn=%d -D ST=%s",
+    buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
                           ocl::typeToStr(type), ocl::typeToStr(depth),
-                           cn, ocl::typeToStr(sctype));
+                           cn, ocl::typeToStr(sctype), depth);

    ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);

--- a/modules/imgproc/src/opencl/remap.cl
+++ b/modules/imgproc/src/opencl/remap.cl
@@ -274,7 +274,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                                ST nVal)
 {
    int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * rowsPerWI;

    if (x < dst_cols)
    {
@@ -313,7 +313,15 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
    }
 }

-#elif INTER_LINEAR
+#elif defined INTER_LINEAR
+
+__constant float coeffs[64] =
+{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
+  0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
+  0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
+  0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
+  0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
+  0.062500f, 0.937500f, 0.031250f, 0.968750f };

 __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                                __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
@@ -326,6 +334,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int

    if (x < dst_cols)
    {
+        WT scalar = convertToWT(convertScalar(nVal));
        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
        int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
        int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
@@ -347,7 +356,6 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1));
                WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);

-                WT scalar = convertToWT(convertScalar(nVal));
                WT a = scalar, b = scalar, c = scalar, d = scalar;

                if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
@@ -390,6 +398,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src

    if (x < dst_cols)
    {
+        WT scalar = convertToWT(convertScalar(nVal));
        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
        int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
        int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
@@ -403,6 +412,49 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                __global const float * map2 = (__global const float *)(map2ptr + map2_index);
                __global T * dst = (__global T *)(dstptr + dst_index);

+#if defined BORDER_CONSTANT
+
+                float xf = map1[0], yf = map2[0];
+                int sx = convert_int_sat_rtn(xf), sy = convert_int_sat_rtn(yf);
+
+                __constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
+                __constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
+
+                WT sum = (WT)(0), xsum;
+                int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));
+
+                #pragma unroll
+                for (int yp = 0; yp < 2; ++yp, src_index += src_step)
+                {
+                    if (sy + yp >= 0 && sy + yp < src_rows)
+                    {
+                        xsum = (WT)(0);
+                        if (sx >= 0 && sx + 2 < src_cols)
+                        {
+#if depth == 0 && cn == 1
+                            uchar2 value = vload2(0, srcptr + src_index);
+                            xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
+#else
+                            #pragma unroll
+                            for (int xp = 0; xp < 2; ++xp)
+                                xsum = fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
+#endif
+                        }
+                        else
+                        {
+                            #pragma unroll
+                            for (int xp = 0; xp < 2; ++xp)
+                                xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?
+                                           convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
+                        }
+                        sum = fma(xsum, coeffs_y[yp], sum);
+                    }
+                    else
+                        sum = fma(scalar, coeffs_y[yp], sum);
+                }
+
+                storepix(convertToT(sum), dst);
+#else
                float2 map_data = (float2)(map1[0], map2[0]);

                int2 map_dataA = convert_int2_sat_rtn(map_data);
@@ -440,6 +492,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                              c * (1 - u.x) * (u.y) +
                              d * (u.x)     * (u.y);
                storepix(convertToT(dst_data), dst);
+#endif
            }
    }
 }
@@ -454,6 +507,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o

    if (x < dst_cols)
    {
+        WT scalar = convertToWT(convertScalar(nVal));
        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
        int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));

@@ -473,7 +527,6 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o

                float2 _u = map_data - convert_float2(map_dataA);
                WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
-                WT scalar = convertToWT(convertScalar(nVal));
                WT a = scalar, b = scalar, c = scalar, d = scalar;

                if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -267,7 +267,7 @@ PARAM_TEST_CASE(Remap, MatDepth, Channels, std::pair<MatType, MatType>, BorderTy
        Border map1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(map1, map1_roi, dstROISize, map1Border, map1Type, -mapMaxValue, mapMaxValue);

-        Border map2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        Border map2Border = randomBorder(0, useRoi ? MAX_VALUE + 1 : 0);
        if (map2Type != noType)
        {
            int mapMinValue = -mapMaxValue;