From 07d57db91c7080c568be5cabd98018c22ee5a591 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 2 Sep 2014 17:25:25 +0400 Subject: [PATCH] Fixed calculation of l_stack_size --- modules/imgproc/src/canny.cpp | 16 ++++++++-------- modules/imgproc/src/deriv.cpp | 9 +++++---- modules/imgproc/src/opencl/canny.cl | 18 +++++++++++------- modules/imgproc/src/opencl/covardata.cl | 4 ++-- .../imgproc/src/opencl/filterSep_singlePass.cl | 4 ++-- modules/imgproc/src/opencl/laplacian5.cl | 4 ++-- 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index cf2c6bb294..fa751c9108 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -195,20 +195,20 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float hysteresis (add weak edges if they are connected with strong edges) */ + int sizey = lSizeY / PIX_PER_WI; + if (sizey == 0) + sizey = 1; + + size_t globalsize[2] = { size.width, (size.height + PIX_PER_WI - 1) / PIX_PER_WI }, localsize[2] = { lSizeX, sizey }; + ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc, - format("-D STAGE2 -D PIX_PER_WI=%d", PIX_PER_WI)); + format("-D STAGE2 -D PIX_PER_WI=%d -D LOCAL_X=%d -D LOCAL_Y=%d", + PIX_PER_WI, lSizeX, sizey)); if (edgesHysteresis.empty()) return false; edgesHysteresis.args(ocl::KernelArg::ReadWrite(map)); - - int sizey = lSizeY / PIX_PER_WI; - if (sizey == 0) - sizey = 1; - - size_t globalsize[2] = { size.width, size.height / PIX_PER_WI }, localsize[2] = { lSizeX, sizey }; - if (!edgesHysteresis.run(2, globalsize, localsize, false)) return false; diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index 48e9a6f26d..068461a8c3 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -672,7 +672,8 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, size_t wgs = dev.maxWorkGroupSize(); size_t lmsz = dev.localMemSize(); size_t src_step = _src.step(), src_offset = _src.offset(); - + const size_t tileSizeYmax = wgs / tileSizeX; + // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn; @@ -680,9 +681,9 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, ( (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) || ((borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101) && - (_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols)) + (_src.cols() >= (int) (kernelX.cols + tileSizeX) && _src.rows() >= (int) (kernelY.cols + tileSizeYmax))) ) && - (tileSizeX * tileSizeYmin <= wgs) && + (tileSizeX * tileSizeYmin <= wgs) && (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz) ) { @@ -691,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, int dtype = CV_MAKE_TYPE(ddepth, cn); int wdepth = CV_32F; - size_t tileSizeY = wgs / tileSizeX; + size_t tileSizeY = tileSizeYmax; while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz)) { tileSizeY /= 2; diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl index da2750e348..dd455d04b5 100644 --- a/modules/imgproc/src/opencl/canny.cl +++ b/modules/imgproc/src/opencl/canny.cl @@ -375,7 +375,8 @@ __kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int #define loadpix(addr) *(__global int *)(addr) #define storepix(val, addr) *(__global int *)(addr) = (int)(val) -#define l_stack_size 256 +#define LOCAL_TOTAL (LOCAL_X*LOCAL_Y) +#define l_stack_size (4*LOCAL_TOTAL) #define p_stack_size 8 __constant short move_dir[2][8] = { @@ -390,7 +391,7 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse int x = get_global_id(0); int y0 = get_global_id(1) * PIX_PER_WI; - int lid = get_local_id(0) + get_local_id(1) * 32; + int lid = get_local_id(0) + get_local_id(1) * LOCAL_X; __local ushort2 l_stack[l_stack_size]; __local int l_counter; @@ -402,10 +403,13 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse #pragma unroll for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y) { - int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int))); - if (type == 2) + if (x < cols) { - l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y); + int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int))); + if (type == 2) + { + l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y); + } } } barrier(CLK_LOCAL_MEM_FENCE); @@ -415,8 +419,8 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse while(l_counter != 0) { - int mod = l_counter % 64; - int pix_per_thr = l_counter / 64 + (lid < mod) ? 1 : 0; + int mod = l_counter % LOCAL_TOTAL; + int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 1 : 0); #pragma unroll for (int i = 0; i < pix_per_thr; ++i) diff --git a/modules/imgproc/src/opencl/covardata.cl b/modules/imgproc/src/opencl/covardata.cl index f663e525eb..8f67d8ffe7 100644 --- a/modules/imgproc/src/opencl/covardata.cl +++ b/modules/imgproc/src/opencl/covardata.cl @@ -28,13 +28,13 @@ //fedcba|abcdefgh|hgfedcb #define EXTRAPOLATE(x, maxV) \ { \ - (x) = clamp(min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ), 0, (maxV)-1); \ + (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \ } #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 //gfedcb|abcdefgh|gfedcba #define EXTRAPOLATE(x, maxV) \ { \ - (x) = clamp(min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ), 0, (maxV)-1); \ + (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \ } #else #error No extrapolation method diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 2fbb0ece45..1f96d7d6e1 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -62,13 +62,13 @@ // fedcba|abcdefgh|hgfedcb #define EXTRAPOLATE(x, maxV) \ { \ - (x) = clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1); \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ } #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 // gfedcb|abcdefgh|gfedcba #define EXTRAPOLATE(x, maxV) \ { \ - (x) = clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1); \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ } #else #error No extrapolation method diff --git a/modules/imgproc/src/opencl/laplacian5.cl b/modules/imgproc/src/opencl/laplacian5.cl index 11c318f918..1404a8c51e 100644 --- a/modules/imgproc/src/opencl/laplacian5.cl +++ b/modules/imgproc/src/opencl/laplacian5.cl @@ -61,13 +61,13 @@ __kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1 // fedcba|abcdefgh|hgfedcb #define EXTRAPOLATE(x, maxV) \ { \ - (x) = clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1); \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ } #elif defined BORDER_REFLECT_101 // gfedcb|abcdefgh|gfedcba #define EXTRAPOLATE(x, maxV) \ { \ - (x) = clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1); \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ } #else #error No extrapolation method