Fixed calculation of l_stack_size

2024-11-29 05:29:54 +08:00 · 2014-09-02 17:25:25 +04:00 · 2014-09-02 17:25:25 +04:00 · 07d57db91c
commit 07d57db91c
parent a48c1c8248
6 changed files with 30 additions and 25 deletions
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@ -195,20 +195,20 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
            hysteresis (add weak edges if they are connected with strong edges)
    */

+    int sizey = lSizeY / PIX_PER_WI;
+    if (sizey == 0)
+        sizey = 1;
+
+    size_t globalsize[2] = { size.width, (size.height + PIX_PER_WI - 1) / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
+
    ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
-                                format("-D STAGE2 -D PIX_PER_WI=%d", PIX_PER_WI));
+                                format("-D STAGE2 -D PIX_PER_WI=%d -D LOCAL_X=%d -D LOCAL_Y=%d",
+                                PIX_PER_WI, lSizeX, sizey));

    if (edgesHysteresis.empty())
        return false;

    edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
-
-    int sizey = lSizeY / PIX_PER_WI;
-    if (sizey == 0)
-        sizey = 1;
-
-    size_t globalsize[2] = { size.width, size.height / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
-
    if (!edgesHysteresis.run(2, globalsize, localsize, false))
        return false;

--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@ -672,7 +672,8 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
    size_t wgs = dev.maxWorkGroupSize();
    size_t lmsz = dev.localMemSize();
    size_t src_step = _src.step(), src_offset = _src.offset();
-    
+    const size_t tileSizeYmax = wgs / tileSizeX;
+
    // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory
    int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;

@ -680,9 +681,9 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
        (
         (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
         ((borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101) &&
-          (_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
+          (_src.cols() >= (int) (kernelX.cols + tileSizeX) && _src.rows() >= (int) (kernelY.cols + tileSizeYmax)))
        ) &&
-        (tileSizeX * tileSizeYmin  <= wgs) &&
+        (tileSizeX * tileSizeYmin <= wgs) &&
        (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz)
       )
    {
@ -691,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
        int dtype = CV_MAKE_TYPE(ddepth, cn);
        int wdepth = CV_32F;

-        size_t tileSizeY = wgs / tileSizeX;
+        size_t tileSizeY = tileSizeYmax;
        while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz))
        {
            tileSizeY /= 2;
--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@ -375,7 +375,8 @@ __kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int

 #define loadpix(addr) *(__global int *)(addr)
 #define storepix(val, addr) *(__global int *)(addr) = (int)(val)
-#define l_stack_size 256
+#define LOCAL_TOTAL (LOCAL_X*LOCAL_Y)
+#define l_stack_size (4*LOCAL_TOTAL)
 #define p_stack_size 8

 __constant short move_dir[2][8] = {
@ -390,7 +391,7 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse
    int x = get_global_id(0);
    int y0 = get_global_id(1) * PIX_PER_WI;

-    int lid = get_local_id(0) + get_local_id(1) * 32;
+    int lid = get_local_id(0) + get_local_id(1) * LOCAL_X;

    __local ushort2 l_stack[l_stack_size];
    __local int l_counter;
@ -402,10 +403,13 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse
    #pragma unroll
    for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
    {
-        int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int)));
-        if (type == 2)
+        if (x < cols)
        {
-            l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
+            int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int)));
+            if (type == 2)
+            {
+                l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
+            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
@ -415,8 +419,8 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse

    while(l_counter != 0)
    {
-        int mod = l_counter % 64;
-        int pix_per_thr = l_counter / 64 + (lid < mod) ? 1 : 0;
+        int mod = l_counter % LOCAL_TOTAL;
+        int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 1 : 0);

        #pragma unroll
        for (int i = 0; i < pix_per_thr; ++i)
--- a/modules/imgproc/src/opencl/covardata.cl
+++ b/modules/imgproc/src/opencl/covardata.cl
@ -28,13 +28,13 @@
 //fedcba|abcdefgh|hgfedcb
 #define EXTRAPOLATE(x, maxV) \
    { \
-        (x) = clamp(min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ), 0, (maxV)-1); \
+        (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \
    }
 #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
 //gfedcb|abcdefgh|gfedcba
 #define EXTRAPOLATE(x, maxV) \
    { \
-        (x) = clamp(min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ), 0, (maxV)-1); \
+        (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \
    }
 #else
 #error No extrapolation method
--- a/modules/imgproc/src/opencl/filterSep_singlePass.cl
+++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl
@ -62,13 +62,13 @@
 // fedcba|abcdefgh|hgfedcb
 #define EXTRAPOLATE(x, maxV) \
    { \
-        (x) = clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1); \
+        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
    }
 #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
 // gfedcb|abcdefgh|gfedcba
 #define EXTRAPOLATE(x, maxV) \
    { \
-        (x) = clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1); \
+        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
    }
 #else
 #error No extrapolation method
--- a/modules/imgproc/src/opencl/laplacian5.cl
+++ b/modules/imgproc/src/opencl/laplacian5.cl
@ -61,13 +61,13 @@ __kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1
 // fedcba|abcdefgh|hgfedcb
 #define EXTRAPOLATE(x, maxV) \
    { \
-        (x) = clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1); \
+        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
    }
 #elif defined BORDER_REFLECT_101
 // gfedcb|abcdefgh|gfedcba
 #define EXTRAPOLATE(x, maxV) \
    { \
-        (x) = clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1); \
+        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
    }
 #else
 #error No extrapolation method