diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index bb54471c07..ba2e347af0 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#define CV_OPENCL_RUN_ASSERT #include "opencl_kernels.hpp" #include @@ -3317,11 +3318,9 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, return kernel.run(2, globalsize, localsize, true); } -static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync) +static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType) { - int type = src.type(); - int cn = CV_MAT_CN(type); - int sdepth = CV_MAT_DEPTH(type); + int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); Size bufSize = buf.size(); #ifdef ANDROID @@ -3329,27 +3328,14 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, #else size_t localsize[2] = {16, 16}; #endif + size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; - if (CV_8U == sdepth) - { - switch (cn) - { - case 1: - globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; - break; - case 2: - globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0]; - break; - case 4: - globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0]; - break; - } - } + if (type == CV_8UC1) + globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; - int radiusX = anchor; - int radiusY = (int)((buf.rows - src.rows) >> 1); + int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; - bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + bool isolated = (borderType & BORDER_ISOLATED) != 0; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, * const btype = borderMap[borderType & ~BORDER_ISOLATED]; @@ -3358,49 +3344,38 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; extra_extrapolation |= src.cols < radiusX; - cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s", - radiusX, (int)localsize[0], (int)localsize[1], cn, - btype, - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + char cvt[40]; + cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" + " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s", + radiusX, (int)localsize[0], (int)localsize[1], cn, btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), + ocl::convertTypeStr(sdepth, CV_32F, cn, cvt), + ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F)); build_options += ocl::kernelToStr(kernelX, CV_32F); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); - std::stringstream strKernel; - strKernel << "row_filter"; - if (-1 != cn) - strKernel << "_C" << cn; - if (-1 != sdepth) - strKernel << "_D" << sdepth; + String kernelName("row_filter"); + if (type == CV_8UC1) + kernelName += "_C1_D0"; - ocl::Kernel kernelRow; - if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, - build_options)) + ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, + build_options); + if (k.empty()) return false; - int idxArg = 0; - idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize())); + k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, + srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, + ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), + buf.cols, buf.rows, radiusY); - idxArg = kernelRow.set(idxArg, srcOffset.x); - idxArg = kernelRow.set(idxArg, srcOffset.y); - idxArg = kernelRow.set(idxArg, src.cols); - idxArg = kernelRow.set(idxArg, src.rows); - idxArg = kernelRow.set(idxArg, srcWholeSize.width); - idxArg = kernelRow.set(idxArg, srcWholeSize.height); - - idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf)); - idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize())); - idxArg = kernelRow.set(idxArg, buf.cols); - idxArg = kernelRow.set(idxArg, buf.rows); - idxArg = kernelRow.set(idxArg, radiusY); - - return kernelRow.run(2, globalsize, localsize, sync); + return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync) +static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor) { #ifdef ANDROID size_t localsize[2] = {16, 10}; @@ -3420,28 +3395,23 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; char cvt[40]; - cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", - anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()), - ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); + cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" + " -D srcT=%s -D dstT=%s -D convertToDstT=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, + ocl::typeToStr(buf.type()), ocl::typeToStr(dtype), + ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); build_options += ocl::kernelToStr(kernelY, CV_32F); - ocl::Kernel kernelCol; - if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options)) + ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, + build_options); + if (k.empty()) return false; - int idxArg = 0; - idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf)); - idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize())); - idxArg = kernelCol.set(idxArg, buf.cols); - idxArg = kernelCol.set(idxArg, buf.rows); + k.args(ocl::KernelArg::PtrReadOnly(buf), (int)(buf.step / buf.elemSize()), buf.cols, + buf.rows, ocl::KernelArg::PtrWriteOnly(dst), (int)(dst.offset / dst.elemSize()), + (int)(dst.step / dst.elemSize()), dst.cols, dst.rows); - idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize())); - idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize())); - idxArg = kernelCol.set(idxArg, dst.cols); - idxArg = kernelCol.set(idxArg, dst.rows); - - return kernelCol.run(2, globalsize, localsize, sync); + return k.run(2, globalsize, localsize, false); } const int optimizedSepFilterLocalSize = 16; @@ -3473,12 +3443,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s", (int)lt2[0], (int)lt2[1], _row_kernel.size().height / 2, _col_kernel.size().height / 2, + " -D %s -D srcT1=%s -D dstT1=%s -D cn=%d", (int)lt2[0], (int)lt2[1], + _row_kernel.size().height / 2, _col_kernel.size().height / 2, ocl::kernelToStr(_row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), ocl::kernelToStr(_col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType]); + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); if (k.empty()) @@ -3529,10 +3501,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (ddepth < 0) ddepth = sdepth; - CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && - imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && - imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), - ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) +// printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1)); +// printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1)); + +// CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && +// imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && +// imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), +// ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) UMat src = _src.getUMat(); Size srcWholeSize; Point srcOffset; @@ -3546,12 +3521,12 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false)) + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType)) return false; _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); - return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false); + return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y); } #endif diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index 30a2221cf1..05717c6ad2 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -36,16 +36,6 @@ #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) #define RADIUS 1 -#if CN ==1 -#define ALIGN (((RADIUS)+3)>>2<<2) -#elif CN==2 -#define ALIGN (((RADIUS)+1)>>1<<1) -#elif CN==3 -#define ALIGN (((RADIUS)+3)>>2<<2) -#elif CN==4 -#define ALIGN (RADIUS) -#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0) -#endif #define noconvert @@ -65,16 +55,8 @@ The info above maybe obsolete. #define DIG(a) a, __constant float mat_kernel[] = { COEFF }; -__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter - (__global const GENTYPE_SRC * restrict src, - const int src_step_in_pixel, - const int src_whole_cols, - const int src_whole_rows, - __global GENTYPE_DST * dst, - const int dst_offset_in_pixel, - const int dst_step_in_pixel, - const int dst_cols, - const int dst_rows) +__kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int src_whole_cols, int src_whole_rows, + __global dstT * dst, int dst_offset_in_pixel, int dst_step_in_pixel, int dst_cols, int dst_rows) { int x = get_global_id(0); int y = get_global_id(1); @@ -85,35 +67,35 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter int start_addr = mad24(y, src_step_in_pixel, x); int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); - int i; - GENTYPE_SRC sum, temp[READ_TIMES_COL]; - __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; + srcT sum, temp[READ_TIMES_COL]; + __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; - //read pixels from src - for(i = 0;i>2<<2) -#elif CN==2 -#define ALIGN (((RADIUS)+1)>>1<<1) -#elif CN==3 -#define ALIGN (((RADIUS)+3)>>2<<2) -#elif CN==4 -#define ALIGN (RADIUS) -#endif #ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) #endif #ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +// BORDER_REFLECT: fedcba|abcdefgh|hgfedcb #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) #endif #ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +// BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) #endif -//blur function does not support BORDER_WRAP #ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +// BORDER_WRAP: cdefgh|abcdefgh|abcdefg #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) #endif @@ -127,65 +115,56 @@ #endif //BORDER_CONSTANT #endif //EXTRA_EXTRAPOLATION -/********************************************************************************** -These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. -Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle -kernel must be in the center. ROI is not supported either. -For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3, -the kernel read 4 pixels, save them to LDS and read the data needed from LDS to -calculate the result. -The length of the convovle kernel supported is related to the LSIZE0 and the MAX size -of LDS, which is HW related. -For channels = 1,3 the RADIUS is no more than LSIZE0*2 -For channels = 2, the RADIUS is no more than LSIZE0 -For channels = 4, arbitary RADIUS is supported unless the LDS is not enough -Niko -6/29/2011 -The info above maybe obsolete. -***********************************************************************************/ +#define noconvert + +#if cn != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define storepix(val, addr) *(__global dstT *)(addr) = val +#define SRCSIZE ((int)sizeof(srcT)) +#define DSTSIZE ((int)sizeof(dstT)) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) +#define SRCSIZE ((int)sizeof(srcT1)*3) +#define DSTSIZE ((int)sizeof(dstT1)*3) +#endif #define DIG(a) a, __constant float mat_kernel[] = { COEFF }; -__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 - (__global uchar * restrict src, - int src_step_in_pixel, - int src_offset_x, int src_offset_y, - int src_cols, int src_rows, - int src_whole_cols, int src_whole_rows, - __global float * dst, - int dst_step_in_pixel, - int dst_cols, int dst_rows, - int radiusy) +__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, + int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, + __global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, + int radiusy) { int x = get_global_id(0)<<2; int y = get_global_id(1); int l_x = get_local_id(0); int l_y = get_local_id(1); - int start_x = x+src_offset_x - RADIUSX & 0xfffffffc; + int start_x = x + src_offset_x - RADIUSX & 0xfffffffc; int offset = src_offset_x - RADIUSX & 3; int start_y = y + src_offset_y - radiusy; int start_addr = mad24(start_y, src_step_in_pixel, start_x); - int i; + float4 sum; uchar4 temp[READ_TIMES_ROW]; - __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; + __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; #ifdef BORDER_CONSTANT int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); // read pixels from src - for (i = 0; i < READ_TIMES_ROW; i++) + for (int i = 0; i < READ_TIMES_ROW; ++i) { - int current_addr = start_addr+i*LSIZE0*4; - current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; - temp[i] = *(__global uchar4*)&src[current_addr]; + int current_addr = mad24(i, LSIZE0 << 2, start_addr); + current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; + temp[i] = *(__global const uchar4 *)&src[current_addr]; } // judge if read out of boundary #ifdef BORDER_ISOLATED - for (i = 0; isrc_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); #endif - int4 index[READ_TIMES_ROW]; - int4 addr; + int4 index[READ_TIMES_ROW], addr; int s_y; if (not_all_in_range) { // judge if read out of boundary - for (i = 0; i < READ_TIMES_ROW; i++) + for (int i = 0; i < READ_TIMES_ROW; ++i) { - index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3); + index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3); #ifdef BORDER_ISOLATED EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols); EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols); @@ -231,6 +209,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ EXTRAPOLATE(index[i].w, 0, src_whole_cols); #endif } + s_y = start_y; #ifdef BORDER_ISOLATED EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); @@ -239,9 +218,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ #endif // read pixels from src - for (i = 0; i 0)) ? current_addr : 0; + int current_addr = mad24(i, LSIZE0, start_addr); + current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; temp[i] = src[current_addr]; } - //judge if read out of boundary + // judge if read out of boundary #ifdef BORDER_ISOLATED - for (i = 0; i 0)) ? current_addr : 0; - temp[i] = src[current_addr]; - } - - // judge if read out of boundary -#ifdef BORDER_ISOLATED - for (i = 0; i 0)) ? current_addr : 0; - temp[i] = src[current_addr]; - } - - // judge if read out of boundary -#ifdef BORDER_ISOLATED - for (i = 0; i