diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 70b22026ba..49c5c29d28 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -3411,76 +3411,76 @@ namespace cv { static bool ocl_reduce(InputArray _src, OutputArray _dst, int dim, int op, int op0, int stype, int dtype) { + const int min_opt_cols = 128, buf_cols = 32; int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth; - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, + useOptimized = 1 == dim && _src.cols() > min_opt_cols; if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; if (op == CV_REDUCE_AVG) { - op = CV_REDUCE_SUM; if (sdepth < CV_32S && ddepth < CV_32S) ddepth = CV_32S; } const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG", "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" }; - char cvt[40]; + char cvt[2][40]; - const int min_opt_cols = 128; - if ((1 == dim) && (_src.cols() > min_opt_cols)) + int wdepth = std::max(ddepth, CV_32F); + cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d" + " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s" + " -D convertToDT=%s -D convertToDT0=%s%s", + ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth), + ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0), + ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]), + ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + + if (useOptimized) { - int buf_cols = 32; - - cv::String build_opt_pre = format("-D BUF_COLS=%d -D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", - buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), - ocl::convertTypeStr(sdepth, ddepth, 1, cvt), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1" + " -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", + buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), + ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre); if (kpre.empty()) return false; - cv::String build_opt_main = format("-D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=noconvert%s", - ops[op], cn, ddepth, ocl::typeToStr(ddepth), ocl::typeToStr(ddepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt_main); + ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt); if (kmain.empty()) return false; UMat src = _src.getUMat(); Size dsize(1, src.rows); _dst.create(dsize, dtype); - UMat dst = _dst.getUMat(), temp = dst; + UMat dst = _dst.getUMat(); - if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S) - temp.create(dsize, CV_32SC(cn)); + UMat buf(src.rows, buf_cols, dst.type()); - UMat buf(src.rows, buf_cols, temp.type()); + kpre.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::WriteOnlyNoSize(buf)); size_t globalSize[2] = { buf_cols, src.rows }; - - kpre.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf)); if (!kpre.run(2, globalSize, NULL, false)) return false; - globalSize[0] = src.rows; - kmain.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnlyNoSize(temp)); - if (!kmain.run(1, globalSize, NULL, false)) - return false; - if (op0 == CV_REDUCE_AVG) - temp.convertTo(dst, ddepth0, 1. / src.cols); + kmain.args(ocl::KernelArg::ReadOnly(buf), + ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols); + else + kmain.args(ocl::KernelArg::ReadOnly(buf), + ocl::KernelArg::WriteOnlyNoSize(dst)); - return true; + globalSize[0] = src.rows; + return kmain.run(1, globalSize, NULL, false); } - cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", - ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), - ocl::convertTypeStr(sdepth, ddepth, 1, cvt), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt); if (k.empty()) return false; @@ -3488,22 +3488,18 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst, UMat src = _src.getUMat(); Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows); _dst.create(dsize, dtype); - UMat dst = _dst.getUMat(), temp = dst; + UMat dst = _dst.getUMat(); - if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S) - temp.create(dsize, CV_32SC(cn)); - - size_t globalsize = std::max(dsize.width, dsize.height); - - k.args(ocl::KernelArg::ReadOnly(src), - ocl::KernelArg::WriteOnlyNoSize(temp)); - if (!k.run(1, &globalsize, NULL, false)) - return false; + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), + temparg = ocl::KernelArg::WriteOnlyNoSize(dst); if (op0 == CV_REDUCE_AVG) - temp.convertTo(dst, ddepth0, 1. / (dim == 0 ? src.rows : src.cols)); + k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols)); + else + k.args(srcarg, temparg); - return true; + size_t globalsize = std::max(dsize.width, dsize.height); + return k.run(1, &globalsize, NULL, false); } } diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl index 6f3ad7bacb..7800e7a743 100644 --- a/modules/core/src/opencl/reduce2.cl +++ b/modules/core/src/opencl/reduce2.cl @@ -76,24 +76,20 @@ #define noconvert -#ifdef OCL_CV_REDUCE_SUM +#if defined OCL_CV_REDUCE_SUM || defined OCL_CV_REDUCE_AVG #define INIT_VALUE 0 #define PROCESS_ELEM(acc, value) acc += value -#elif defined(OCL_CV_REDUCE_MAX) +#elif defined OCL_CV_REDUCE_MAX #define INIT_VALUE MIN_VAL #define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc -#elif defined(OCL_CV_REDUCE_MIN) +#elif defined OCL_CV_REDUCE_MIN #define INIT_VALUE MAX_VAL #define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc -#elif defined(OCL_CV_REDUCE_AVG) -#error "This operation should be implemented through OCL_CV_REDUCE_SUM" #else #error "No operation is specified" #endif -#ifndef BUF_COLS -#define BUF_COLS 32 -#endif +#ifdef OP_REDUCE_PRE __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols, __global uchar * bufptr, int buf_step, int buf_offset) @@ -126,15 +122,23 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s } } +#else + __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols, - __global uchar * dstptr, int dst_step, int dst_offset) + __global uchar * dstptr, int dst_step, int dst_offset +#ifdef OCL_CV_REDUCE_AVG + , float fscale +#endif + ) { #if dim == 0 // reduce to a single row int x = get_global_id(0); if (x < cols) { int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset); - __global dstT * dst = (__global dstT *)(dstptr + dst_offset) + x * cn; + int dst_index = mad24(x, (int)sizeof(dstT0) * cn, dst_offset); + + __global dstT0 * dst = (__global dstT0 *)(dstptr + dst_index); dstT tmp[cn] = { INIT_VALUE }; for (int y = 0; y < rows; ++y, src_index += src_step) @@ -150,7 +154,11 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset #pragma unroll for (int c = 0; c < cn; ++c) - dst[c] = tmp[c]; +#ifdef OCL_CV_REDUCE_AVG + dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale); +#else + dst[c] = convertToDT0(tmp[c]); +#endif } #elif dim == 1 // reduce to a single column int y = get_global_id(0); @@ -175,9 +183,15 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset #pragma unroll for (int c = 0; c < cn; ++c) - dst[c] = tmp[c]; +#ifdef OCL_CV_REDUCE_AVG + dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale); +#else + dst[c] = convertToDT0(tmp[c]); +#endif } #else #error "Dims must be either 0 or 1" #endif } + +#endif