mirror of
https://github.com/opencv/opencv.git
synced 2024-11-29 13:47:32 +08:00
Merge pull request #3186 from akarsakov:ocl_fixes
This commit is contained in:
commit
ffd1df0cb5
@ -195,20 +195,20 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
|
|||||||
hysteresis (add weak edges if they are connected with strong edges)
|
hysteresis (add weak edges if they are connected with strong edges)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
int sizey = lSizeY / PIX_PER_WI;
|
||||||
|
if (sizey == 0)
|
||||||
|
sizey = 1;
|
||||||
|
|
||||||
|
size_t globalsize[2] = { size.width, (size.height + PIX_PER_WI - 1) / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
|
||||||
|
|
||||||
ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
|
ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
|
||||||
format("-D STAGE2 -D PIX_PER_WI=%d", PIX_PER_WI));
|
format("-D STAGE2 -D PIX_PER_WI=%d -D LOCAL_X=%d -D LOCAL_Y=%d",
|
||||||
|
PIX_PER_WI, lSizeX, sizey));
|
||||||
|
|
||||||
if (edgesHysteresis.empty())
|
if (edgesHysteresis.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
|
edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
|
||||||
|
|
||||||
int sizey = lSizeY / PIX_PER_WI;
|
|
||||||
if (sizey == 0)
|
|
||||||
sizey = 1;
|
|
||||||
|
|
||||||
size_t globalsize[2] = { size.width, size.height / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
|
|
||||||
|
|
||||||
if (!edgesHysteresis.run(2, globalsize, localsize, false))
|
if (!edgesHysteresis.run(2, globalsize, localsize, false))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -671,16 +671,20 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
|
|||||||
|
|
||||||
size_t wgs = dev.maxWorkGroupSize();
|
size_t wgs = dev.maxWorkGroupSize();
|
||||||
size_t lmsz = dev.localMemSize();
|
size_t lmsz = dev.localMemSize();
|
||||||
|
|
||||||
size_t src_step = _src.step(), src_offset = _src.offset();
|
size_t src_step = _src.step(), src_offset = _src.offset();
|
||||||
|
const size_t tileSizeYmax = wgs / tileSizeX;
|
||||||
|
|
||||||
|
// workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory
|
||||||
|
int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
|
||||||
|
|
||||||
if (((src_offset % src_step) % esz == 0) &&
|
if (((src_offset % src_step) % esz == 0) &&
|
||||||
(
|
(
|
||||||
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
|
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
|
||||||
((borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101) &&
|
((borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101) &&
|
||||||
(_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
|
(_src.cols() >= (int) (kernelX.cols + tileSizeX) && _src.rows() >= (int) (kernelY.cols + tileSizeYmax)))
|
||||||
) &&
|
) &&
|
||||||
(tileSizeX * tileSizeYmin <= wgs) &&
|
(tileSizeX * tileSizeYmin <= wgs) &&
|
||||||
(LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, cn * 4) <= lmsz)
|
(LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz)
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
Size size = _src.size(), wholeSize;
|
Size size = _src.size(), wholeSize;
|
||||||
@ -688,8 +692,8 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
|
|||||||
int dtype = CV_MAKE_TYPE(ddepth, cn);
|
int dtype = CV_MAKE_TYPE(ddepth, cn);
|
||||||
int wdepth = CV_32F;
|
int wdepth = CV_32F;
|
||||||
|
|
||||||
size_t tileSizeY = wgs / tileSizeX;
|
size_t tileSizeY = tileSizeYmax;
|
||||||
while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, cn * 4) > lmsz))
|
while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz))
|
||||||
{
|
{
|
||||||
tileSizeY /= 2;
|
tileSizeY /= 2;
|
||||||
}
|
}
|
||||||
|
@ -375,7 +375,8 @@ __kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int
|
|||||||
|
|
||||||
#define loadpix(addr) *(__global int *)(addr)
|
#define loadpix(addr) *(__global int *)(addr)
|
||||||
#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
|
#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
|
||||||
#define l_stack_size 256
|
#define LOCAL_TOTAL (LOCAL_X*LOCAL_Y)
|
||||||
|
#define l_stack_size (4*LOCAL_TOTAL)
|
||||||
#define p_stack_size 8
|
#define p_stack_size 8
|
||||||
|
|
||||||
__constant short move_dir[2][8] = {
|
__constant short move_dir[2][8] = {
|
||||||
@ -390,7 +391,7 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse
|
|||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y0 = get_global_id(1) * PIX_PER_WI;
|
int y0 = get_global_id(1) * PIX_PER_WI;
|
||||||
|
|
||||||
int lid = get_local_id(0) + get_local_id(1) * 32;
|
int lid = get_local_id(0) + get_local_id(1) * LOCAL_X;
|
||||||
|
|
||||||
__local ushort2 l_stack[l_stack_size];
|
__local ushort2 l_stack[l_stack_size];
|
||||||
__local int l_counter;
|
__local int l_counter;
|
||||||
@ -402,10 +403,13 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse
|
|||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
|
for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
|
||||||
{
|
{
|
||||||
int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int)));
|
if (x < cols)
|
||||||
if (type == 2)
|
|
||||||
{
|
{
|
||||||
l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
|
int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int)));
|
||||||
|
if (type == 2)
|
||||||
|
{
|
||||||
|
l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
@ -415,8 +419,8 @@ __kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offse
|
|||||||
|
|
||||||
while(l_counter != 0)
|
while(l_counter != 0)
|
||||||
{
|
{
|
||||||
int mod = l_counter % 64;
|
int mod = l_counter % LOCAL_TOTAL;
|
||||||
int pix_per_thr = l_counter / 64 + (lid < mod) ? 1 : 0;
|
int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 1 : 0);
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < pix_per_thr; ++i)
|
for (int i = 0; i < pix_per_thr; ++i)
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
//aaaaaa|abcdefgh|hhhhhhh
|
//aaaaaa|abcdefgh|hhhhhhh
|
||||||
#define EXTRAPOLATE(x, maxV) \
|
#define EXTRAPOLATE(x, maxV) \
|
||||||
{ \
|
{ \
|
||||||
(x) = max(min((x), (maxV) - 1), 0); \
|
(x) = clamp((x), 0, (maxV)-1); \
|
||||||
}
|
}
|
||||||
#elif defined BORDER_WRAP
|
#elif defined BORDER_WRAP
|
||||||
//cdefgh|abcdefgh|abcdefg
|
//cdefgh|abcdefgh|abcdefg
|
||||||
|
@ -50,7 +50,7 @@
|
|||||||
// aaaaaa|abcdefgh|hhhhhhh
|
// aaaaaa|abcdefgh|hhhhhhh
|
||||||
#define EXTRAPOLATE(x, maxV) \
|
#define EXTRAPOLATE(x, maxV) \
|
||||||
{ \
|
{ \
|
||||||
(x) = max(min((x), (maxV) - 1), 0); \
|
(x) = clamp((x), 0, (maxV)-1); \
|
||||||
}
|
}
|
||||||
#elif defined BORDER_WRAP
|
#elif defined BORDER_WRAP
|
||||||
// cdefgh|abcdefgh|abcdefg
|
// cdefgh|abcdefgh|abcdefg
|
||||||
|
@ -49,7 +49,7 @@ __kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1
|
|||||||
// aaaaaa|abcdefgh|hhhhhhh
|
// aaaaaa|abcdefgh|hhhhhhh
|
||||||
#define EXTRAPOLATE(x, maxV) \
|
#define EXTRAPOLATE(x, maxV) \
|
||||||
{ \
|
{ \
|
||||||
(x) = max(min((x), (maxV) - 1), 0); \
|
(x) = clamp((x), 0, (maxV)-1); \
|
||||||
}
|
}
|
||||||
#elif defined BORDER_WRAP
|
#elif defined BORDER_WRAP
|
||||||
// cdefgh|abcdefgh|abcdefg
|
// cdefgh|abcdefgh|abcdefg
|
||||||
|
Loading…
Reference in New Issue
Block a user