diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index e51986c394..d23de91ea1 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3492,7 +3492,7 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, return false; size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) }; + size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), optimizedSepFilterLocalSize}; char cvt[2][40]; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 3952577d77..6c3bbdc161 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -119,17 +119,15 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int int liy = get_local_id(1); int x = get_global_id(0); - int y = get_global_id(1); // calculate pixel position in source image taking image offset into account int srcX = x + srcOffsetX - RADIUSX; - int srcY = y + srcOffsetY - RADIUSY; // extrapolate coordinates, if needed // and read my own source pixel into local memory // with account for extra border pixels, which will be read by starting workitems int clocY = liy; - int cSrcY = srcY; + int cSrcY = liy + srcOffsetY - RADIUSY; do { int yb = cSrcY; @@ -154,48 +152,76 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int while (clocY < BLK_Y+(RADIUSY*2)); barrier(CLK_LOCAL_MEM_FENCE); - // do vertical filter pass - // and store intermediate results to second local memory array - int i, clocX = lix; - WT sum = (WT) 0; - do + for (int y = 0; y < dst_rows; y+=BLK_Y) { - sum = (WT) 0; - for (i=0; i<=2*RADIUSY; i++) + // do vertical filter pass + // and store intermediate results to second local memory array + int i, clocX = lix; + WT sum = (WT) 0; + do + { + sum = (WT) 0; + for (i=0; i<=2*RADIUSY; i++) #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) - sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum); + sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum); #else - sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); + sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum); #endif - lsmemDy[liy][clocX] = sum; - clocX += BLK_X; - } - while(clocX < BLK_X+(RADIUSX*2)); - barrier(CLK_LOCAL_MEM_FENCE); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); - // if this pixel happened to be out of image borders because of global size rounding, - // then just return - if( x >= dst_cols || y >=dst_rows ) - return; - - // do second horizontal filter pass - // and calculate final result - sum = 0.0f; - for (i=0; i<=2*RADIUSX; i++) + // if this pixel happened to be out of image borders because of global size rounding, + // then just return + if ((x < dst_cols) && (y + liy < dst_rows)) + { + // do second horizontal filter pass + // and calculate final result + sum = 0.0f; + for (i=0; i<=2*RADIUSX; i++) #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) - sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #else - sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #endif #ifdef INTEGER_ARITHMETIC #ifdef INTEL_DEVICE - sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); + sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); #else - sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; + sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; #endif #endif + // store result into destination image + storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); + } + + for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y) + { + int clocX = i % (BLK_X+(RADIUSX*2)); + int clocY = i / (BLK_X+(RADIUSX*2)); + lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int cSrcY = y + BLK_Y + liy + srcOffsetY + RADIUSY; + EXTRAPOLATE(cSrcY, (height)); + + clocX = lix; + int cSrcX = x + srcOffsetX - RADIUSX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, cSrcY, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + } - // store result into destination image - storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); }