diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 352fc8f524..26d499ee16 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -138,10 +138,10 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float */ char cvt[40]; ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc, - format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_intN=%s -D intN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s", + format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_floatN=%s -D floatN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s", cn, ocl::memopTypeToStr(_src.depth()), - ocl::convertTypeStr(_src.type(), CV_32SC(cn), cn, cvt), - ocl::memopTypeToStr(CV_32SC(cn)), + ocl::convertTypeStr(_src.depth(), CV_32F, cn, cvt), + ocl::typeToStr(CV_MAKE_TYPE(CV_32F, cn)), lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : "")); if (with_sobel.empty()) @@ -151,7 +151,7 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float map.create(size, CV_32S); with_sobel.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(map), - low, high); + (float) low, (float) high); size_t globalsize[2] = { size.width, size.height }, localsize[2] = { lSizeX, lSizeY }; diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl index dd455d04b5..caa7969032 100644 --- a/modules/imgproc/src/opencl/canny.cl +++ b/modules/imgproc/src/opencl/canny.cl @@ -49,9 +49,9 @@ #ifdef WITH_SOBEL #if cn == 1 -#define loadpix(addr) convert_intN(*(__global const TYPE *)(addr)) +#define loadpix(addr) convert_floatN(*(__global const TYPE *)(addr)) #else -#define loadpix(addr) convert_intN(vload3(0, (__global const TYPE *)(addr))) +#define loadpix(addr) convert_floatN(vload3(0, (__global const TYPE *)(addr))) #endif #define storepix(value, addr) *(__global int *)(addr) = (int)(value) @@ -77,23 +77,21 @@ __constant int next[4][2] = { { 1, 1 } }; -inline int3 sobel(int idx, __local const intN *smem) +inline float3 sobel(int idx, __local const floatN *smem) { // result: x, y, mag - int3 res; + float3 res; - intN dx = smem[idx + 2] - smem[idx] - + 2 * (smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4]) - + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]; + floatN dx = fma(2, smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4], + smem[idx + 2] - smem[idx] + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]); - intN dy = smem[idx] - smem[idx + 2 * GRP_SIZEX + 8] - + 2 * (smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9]) - + smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10]; + floatN dy = fma(2, smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9], + smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10] + smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]); #ifdef L2GRAD - intN magN = dx * dx + dy * dy; + floatN magN = fma(dx, dx, dy * dy); #else - intN magN = convert_intN(abs(dx) + abs(dy)); + floatN magN = fabs(dx) + fabs(dy); #endif #if cn == 1 res.z = magN; @@ -120,9 +118,9 @@ inline int3 sobel(int idx, __local const intN *smem) __kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols, __global uchar *map, int map_step, int map_offset, - int low_thr, int high_thr) + float low_thr, float high_thr) { - __local intN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)]; + __local floatN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)]; int lidx = get_local_id(0); int lidy = get_local_id(1); @@ -143,7 +141,7 @@ __kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src //// Sobel, Magnitude // - __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)]; + __local float mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)]; lidx++; lidy++; @@ -164,13 +162,13 @@ __kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src int idx = lidx + lidy * (GRP_SIZEX + 4); i = lidx + lidy * (GRP_SIZEX + 2); - int3 res = sobel(idx, smem); + float3 res = sobel(idx, smem); mag[i] = res.z; - int x = res.x; - int y = res.y; - barrier(CLK_LOCAL_MEM_FENCE); + int x = (int) res.x; + int y = (int) res.y; + //// Threshold + Non maxima suppression // @@ -218,7 +216,7 @@ __kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src if (gidx >= cols || gidy >= rows) return; - int mag0 = mag[i]; + float mag0 = mag[i]; int value = 1; if (mag0 > low_thr) @@ -235,8 +233,8 @@ __kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31); // if a = 1, b = 1, dy ^ dx < 0 int dir = a * b + 2 * dir3; - int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]]; - int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1); + float prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]]; + float next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1); if (mag0 > prev_mag && mag0 >= next_mag) { @@ -384,12 +382,12 @@ __constant short move_dir[2][8] = { { -1, 0, 1, -1, 1, -1, 0, 1 } }; -__kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offset, int rows, int cols) +__kernel void stage2_hysteresis(__global uchar *map_ptr, int map_step, int map_offset, int rows, int cols) { - map += map_offset; + map_ptr += map_offset; int x = get_global_id(0); - int y0 = get_global_id(1) * PIX_PER_WI; + int y = get_global_id(1) * PIX_PER_WI; int lid = get_local_id(0) + get_local_id(1) * LOCAL_X; @@ -400,15 +398,23 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse l_counter = 0; barrier(CLK_LOCAL_MEM_FENCE); - #pragma unroll - for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y) + if (x < cols) { - if (x < cols) + __global uchar* map = map_ptr + mad24(y, map_step, x * (int)sizeof(int)); + + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI; ++cy) { - int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int))); - if (type == 2) + if (y < rows) { - l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y); + int type = loadpix(map); + if (type == 2) + { + l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y); + } + + y++; + map += map_step; } } } @@ -422,7 +428,6 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse int mod = l_counter % LOCAL_TOTAL; int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 1 : 0); - #pragma unroll for (int i = 0; i < pix_per_thr; ++i) { ushort2 pos = l_stack[ atomic_dec(&l_counter) - 1 ]; @@ -434,7 +439,7 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse ushort posy = pos.y + move_dir[1][j]; if (posx < 0 || posy < 0 || posx >= cols || posy >= rows) continue; - __global uchar *addr = map + mad24(posy, map_step, posx * (int)sizeof(int)); + __global uchar *addr = map_ptr + mad24(posy, map_step, posx * (int)sizeof(int)); int type = loadpix(addr); if (type == 0) { @@ -463,16 +468,26 @@ __kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offse __global uchar *dst, int dst_step, int dst_offset) { int x = get_global_id(0); - int y0 = get_global_id(1) * PIX_PER_WI; + int y = get_global_id(1) * PIX_PER_WI; - #pragma unroll - for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y) + if (x < cols) { int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset)); - int dst_index = mad24(dst_step, y, x) + dst_offset; + int dst_index = mad24(dst_step, y, x + dst_offset); - __global const int * map = (__global const int *)(mapptr + map_index); - dst[dst_index] = (uchar)(-(map[0] >> 1)); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI; ++cy) + { + if (y < rows) + { + __global const int * map = (__global const int *)(mapptr + map_index); + dst[dst_index] = (uchar)(-(map[0] >> 1)); + + y++; + map_index += map_step; + dst_index += dst_step; + } + } } }