From fd5a8b3e9769c1b097e6246ac46f726a1204f1bd Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 4 Jun 2014 18:22:55 +0400 Subject: [PATCH 1/6] minmaxloc --- modules/core/src/opencl/minmaxloc.cl | 280 +++++++++++++++++++++++++++ modules/core/src/opencl/reduce.cl | 97 +--------- modules/core/src/stat.cpp | 175 +++++++++++------ 3 files changed, 399 insertions(+), 153 deletions(-) create mode 100644 modules/core/src/opencl/minmaxloc.cl diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl new file mode 100644 index 0000000000..558679efda --- /dev/null +++ b/modules/core/src/opencl/minmaxloc.cl @@ -0,0 +1,280 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Itseez, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + +#ifdef DEPTH_0 +#define MIN_VAL 0 +#define MAX_VAL 255 +#elif defined DEPTH_1 +#define MIN_VAL -128 +#define MAX_VAL 127 +#elif defined DEPTH_2 +#define MIN_VAL 0 +#define MAX_VAL 65535 +#elif defined DEPTH_3 +#define MIN_VAL -32768 +#define MAX_VAL 32767 +#elif defined DEPTH_4 +#define MIN_VAL INT_MIN +#define MAX_VAL INT_MAX +#elif defined DEPTH_5 +#define MIN_VAL (-FLT_MAX) +#define MAX_VAL FLT_MAX +#elif defined DEPTH_6 +#define MIN_VAL (-DBL_MAX) +#define MAX_VAL DBL_MAX +#endif + +#define INDEX_MAX UINT_MAX + +#ifdef NEED_MINLOC +#define CALC_MINLOC(inc) minloc = id + inc +#else +#define CALC_MINLOC(inc) +#endif + +#ifdef NEED_MAXLOC +#define CALC_MAXLOC(inc) maxloc = id + inc +#else +#define CALC_MAXLOC(inc) +#endif + +#ifdef NEED_MINVAL +#define CALC_MIN(p, inc) \ + if (minval > temp.p) \ + { \ + minval = temp.p; \ + CALC_MINLOC(inc); \ + } +#else +#define CALC_MIN(p, inc) +#endif + +#ifdef NEED_MAXVAL +#define CALC_MAX(p, inc) \ + if (maxval < temp.p) \ + { \ + maxval = temp.p; \ + CALC_MAXLOC(inc); \ + } +#else +#define CALC_MAX(p, inc) +#endif + +#define CALC_P(p, inc) \ + CALC_MIN(p, inc) \ + CALC_MAX(p, inc) + +__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols, + int total, int groupnum, __global uchar * dstptr +#ifdef HAVE_MASK + , __global const uchar * mask, int mask_step, int mask_offset +#endif + ) +{ + int lid = get_local_id(0); + int gid = get_group_id(0); + int id = get_global_id(0) * kercn; + + srcptr += src_offset; +#ifdef HAVE_MASK + mask += mask_offset; +#endif + +#ifdef NEED_MINVAL + __local srcT1 localmem_min[WGS2_ALIGNED]; +#ifdef NEED_MINLOC + __local uint localmem_minloc[WGS2_ALIGNED]; +#endif +#endif +#ifdef NEED_MAXVAL + __local srcT1 localmem_max[WGS2_ALIGNED]; +#ifdef NEED_MAXLOC + __local uint localmem_maxloc[WGS2_ALIGNED]; +#endif +#endif + + srcT1 minval = MAX_VAL, maxval = MIN_VAL; + srcT temp; + uint minloc = INDEX_MAX, maxloc = INDEX_MAX; + int src_index; +#ifdef HAVE_MASK + int mask_index; +#endif + + for (int grain = groupnum * WGS * kercn; id < total; id += grain) + { +#ifdef HAVE_SRC_CONT + src_index = mul24(id, (int)sizeof(srcT1)); +#else + src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); +#endif + +#ifdef HAVE_MASK +#ifdef HAVE_MASK_CONT + mask_index = id; +#else + mask_index = mad24(id / cols, mask_step, id % cols); +#endif + if (mask[mask_index]) +#endif + { + temp = *(__global const srcT *)(srcptr + src_index); +#if kercn == 1 +#ifdef NEED_MINVAL + if (minval > temp) + { + minval = temp; +#ifdef NEED_MINLOC + minloc = id; +#endif + } +#endif +#ifdef NEED_MAXVAL + if (maxval < temp) + { + maxval = temp; +#ifdef NEED_MAXLOC + maxloc = id; +#endif + } +#endif +#elif kercn >= 2 + CALC_P(s0, 0) + CALC_P(s1, 1) +#if kercn >= 4 + CALC_P(s2, 2) + CALC_P(s3, 3) +#if kercn >= 8 + CALC_P(s4, 4) + CALC_P(s5, 5) + CALC_P(s6, 6) + CALC_P(s7, 7) +#if kercn == 16 + CALC_P(s8, 8) + CALC_P(s9, 9) + CALC_P(sA, 10) + CALC_P(sB, 11) + CALC_P(sC, 12) + CALC_P(sD, 13) + CALC_P(sE, 14) + CALC_P(sF, 15) +#endif +#endif +#endif +#endif + } + } + + if (lid < WGS2_ALIGNED) + { +#ifdef NEED_MINVAL + localmem_min[lid] = minval; +#endif +#ifdef NEED_MAXVAL + localmem_max[lid] = maxval; +#endif +#ifdef NEED_MINLOC + localmem_minloc[lid] = minloc; +#endif +#ifdef NEED_MAXLOC + localmem_maxloc[lid] = maxloc; +#endif + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED) + { + int lid3 = lid - WGS2_ALIGNED; +#ifdef NEED_MINVAL + if (localmem_min[lid3] >= minval) + { +#ifdef NEED_MINLOC + if (localmem_min[lid3] == minval) + localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc); + else + localmem_minloc[lid3] = minloc, +#endif + localmem_min[lid3] = minval; + } +#endif +#ifdef NEED_MAXVAL + if (localmem_max[lid3] <= maxval) + { +#ifdef NEED_MAXLOC + if (localmem_max[lid3] == maxval) + localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc); + else + localmem_maxloc[lid3] = maxloc, +#endif + localmem_max[lid3] = maxval; + } +#endif + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + +#ifdef NEED_MINVAL + if (localmem_min[lid] >= localmem_min[lid2]) + { +#ifdef NEED_MINLOC + if (localmem_min[lid] == localmem_min[lid2]) + localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]); + else + localmem_minloc[lid] = localmem_minloc[lid2], +#endif + localmem_min[lid] = localmem_min[lid2]; + } +#endif +#ifdef NEED_MAXVAL + if (localmem_max[lid] <= localmem_max[lid2]) + { +#ifdef NEED_MAXLOC + if (localmem_max[lid] == localmem_max[lid2]) + localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]); + else + localmem_maxloc[lid] = localmem_maxloc[lid2], +#endif + localmem_max[lid] = localmem_max[lid2]; + } +#endif + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + int pos = 0; +#ifdef NEED_MINVAL + *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_min[0]; + pos = mad24(groupnum, (int)sizeof(srcT1), pos); +#endif +#ifdef NEED_MAXVAL + *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_max[0]; + pos = mad24(groupnum, (int)sizeof(srcT1), pos); +#endif +#ifdef NEED_MINLOC + *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_minloc[0]; + pos = mad24(groupnum, (int)sizeof(uint), pos); +#endif +#ifdef NEED_MAXLOC + *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; +#endif + } +} diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 851d36eb4d..038f132970 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -75,6 +75,8 @@ #define MAX_VAL DBL_MAX #endif +#define INDEX_MAX UINT_MAX + #define dstT srcT #define dstT1 srcT1 @@ -357,102 +359,11 @@ #define CALC_RESULT \ storepix(localmem_max[0], dstptr + dstTSIZE * gid) -// minMaxLoc stuff -#elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK - -#define DECLARE_LOCAL_MEM \ - __local srcT localmem_min[WGS2_ALIGNED]; \ - __local srcT localmem_max[WGS2_ALIGNED]; \ - __local int localmem_minloc[WGS2_ALIGNED]; \ - __local int localmem_maxloc[WGS2_ALIGNED] -#define DEFINE_ACCUMULATOR \ - srcT minval = MAX_VAL; \ - srcT maxval = MIN_VAL; \ - int negative = -1; \ - int minloc = negative; \ - int maxloc = negative; \ - srcT temp; \ - int temploc -#define REDUCE_GLOBAL \ - temp = loadpix(srcptr + src_index); \ - temploc = id; \ - srcT temp_minval = minval, temp_maxval = maxval; \ - minval = min(minval, temp); \ - maxval = max(maxval, temp); \ - minloc = (minval == temp_minval) ? (temp_minval == MAX_VAL) ? temploc : minloc : temploc; \ - maxloc = (maxval == temp_maxval) ? (temp_maxval == MIN_VAL) ? temploc : maxloc : temploc -#define SET_LOCAL_1 \ - localmem_min[lid] = minval; \ - localmem_max[lid] = maxval; \ - localmem_minloc[lid] = minloc; \ - localmem_maxloc[lid] = maxloc -#define REDUCE_LOCAL_1 \ - srcT oldmin = localmem_min[lid-WGS2_ALIGNED]; \ - srcT oldmax = localmem_max[lid-WGS2_ALIGNED]; \ - localmem_min[lid - WGS2_ALIGNED] = min(minval, localmem_min[lid-WGS2_ALIGNED]); \ - localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid-WGS2_ALIGNED]); \ - srcT minv = localmem_min[lid - WGS2_ALIGNED], maxv = localmem_max[lid - WGS2_ALIGNED]; \ - localmem_minloc[lid - WGS2_ALIGNED] = (minv == minval) ? (minv == oldmin) ? \ - min(minloc, localmem_minloc[lid-WGS2_ALIGNED]) : minloc : localmem_minloc[lid-WGS2_ALIGNED]; \ - localmem_maxloc[lid - WGS2_ALIGNED] = (maxv == maxval) ? (maxv == oldmax) ? \ - min(maxloc, localmem_maxloc[lid-WGS2_ALIGNED]) : maxloc : localmem_maxloc[lid-WGS2_ALIGNED] -#define REDUCE_LOCAL_2 \ - srcT oldmin = localmem_min[lid]; \ - srcT oldmax = localmem_max[lid]; \ - localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); \ - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); \ - srcT min1 = localmem_min[lid], min2 = localmem_min[lid2]; \ - localmem_minloc[lid] = (localmem_minloc[lid] == negative) ? localmem_minloc[lid2] : (localmem_minloc[lid2] == negative) ? \ - localmem_minloc[lid] : (min1 == min2) ? (min1 == oldmin) ? min(localmem_minloc[lid2],localmem_minloc[lid]) : \ - localmem_minloc[lid2] : localmem_minloc[lid]; \ - srcT max1 = localmem_max[lid], max2 = localmem_max[lid2]; \ - localmem_maxloc[lid] = (localmem_maxloc[lid] == negative) ? localmem_maxloc[lid2] : (localmem_maxloc[lid2] == negative) ? \ - localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \ - localmem_maxloc[lid2] : localmem_maxloc[lid] -#define CALC_RESULT \ - storepix(localmem_min[0], dstptr + dstTSIZE * gid); \ - storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \ - dstlocptr[gid] = localmem_minloc[0]; \ - dstlocptr2[gid] = localmem_maxloc[0] - -#if defined OP_MIN_MAX_LOC_MASK -#undef DEFINE_ACCUMULATOR -#define DEFINE_ACCUMULATOR \ - srcT minval = MAX_VAL; \ - srcT maxval = MIN_VAL; \ - int negative = -1; \ - int minloc = negative; \ - int maxloc = negative; \ - srcT temp, temp_mask, zeroVal = (srcT)(0); \ - int temploc -#undef REDUCE_GLOBAL -#define REDUCE_GLOBAL \ - temp = loadpix(srcptr + src_index); \ - temploc = id; \ - MASK_INDEX; \ - __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \ - temp_mask = mask[0]; \ - srcT temp_minval = minval, temp_maxval = maxval; \ - minval = (temp_mask == zeroVal) ? minval : min(minval, temp); \ - maxval = (temp_mask == zeroVal) ? maxval : max(maxval, temp); \ - minloc = (temp_mask == zeroVal) ? minloc : (minval == temp_minval) ? (temp_minval == MAX_VAL) ? temploc : minloc : temploc; \ - maxloc = (temp_mask == zeroVal) ? maxloc : (maxval == temp_maxval) ? (temp_maxval == MIN_VAL) ? temploc : maxloc : temploc -#endif - #else #error "No operation" -#endif // end of minMaxLoc stuff +#endif // end of norm (NORM_INF) with cn > 1 and mask -#ifdef OP_MIN_MAX_LOC -#undef EXTRA_PARAMS -#define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2 - -#elif defined OP_MIN_MAX_LOC_MASK -#undef EXTRA_PARAMS -#define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2, \ - __global const uchar * maskptr, int mask_step, int mask_offset - -#elif defined OP_DOT +#ifdef OP_DOT #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset #endif diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 0a16c064c6..9d78c0f107 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1311,104 +1311,157 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx) #ifdef HAVE_OPENCL template -void getMinMaxRes(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double* minVal, - double* maxVal, int* minLoc, int* maxLoc, const int groupnum, const int cn, const int cols) +void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, + int* minLoc, int* maxLoc, + int groupnum, int cn, int cols) { - T min = std::numeric_limits::max(); - T max = std::numeric_limits::min() > 0 ? -std::numeric_limits::max() : std::numeric_limits::min(); - int minloc = INT_MAX, maxloc = INT_MAX; - for (int i = 0; i < groupnum; i++) + uint index_max = std::numeric_limits::max(); + T minval = std::numeric_limits::max(); + T maxval = std::numeric_limits::min() > 0 ? -std::numeric_limits::max() : std::numeric_limits::min(); + uint minloc = index_max, maxloc = index_max; + + int index = 0; + const T * minptr = NULL, * maxptr = NULL; + const uint * minlocptr = NULL, * maxlocptr = NULL; + if (minVal || minLoc) { - T current_min = minv.at(0,i); - T current_max = maxv.at(0,i); - T oldmin = min, oldmax = max; - min = std::min(min, current_min); - max = std::max(max, current_max); - if (cn == 1) - { - int current_minloc = minl.at(0,i); - int current_maxloc = maxl.at(0,i); - if(current_minloc < 0 || current_maxloc < 0) continue; - minloc = (oldmin == current_min) ? std::min(minloc, current_minloc) : (oldmin < current_min) ? minloc : current_minloc; - maxloc = (oldmax == current_max) ? std::min(maxloc, current_maxloc) : (oldmax > current_max) ? maxloc : current_maxloc; - } + minptr = (const T *)db.data; + index += sizeof(T) * groupnum; + } + if (maxVal || maxLoc) + { + maxptr = (const T *)(db.data + index); + index += sizeof(T) * groupnum; } - bool zero_mask = (maxloc == INT_MAX) || (minloc == INT_MAX); - if (minVal) - *minVal = zero_mask ? 0 : (double)min; - if (maxVal) - *maxVal = zero_mask ? 0 : (double)max; if (minLoc) { - minLoc[0] = zero_mask ? -1 : minloc/cols; - minLoc[1] = zero_mask ? -1 : minloc%cols; + minlocptr = (uint *)(db.data + index); + index += sizeof(uint) * groupnum; + } + if (maxLoc) + maxlocptr = (uint *)(db.data + index); + + for (int i = 0; i < groupnum; i++) + { + if (minptr && minptr[i] <= minval) + { + if (minptr[i] == minval) + { + if (minlocptr) + minloc = std::min(minlocptr[i], minloc); + } + else + { + if (minlocptr) + minloc = minlocptr[i]; + minval = minptr[i]; + } + } + if (maxptr && maxptr[i] >= maxval) + { + if (maxptr[i] == maxval) + { + if (maxlocptr) + maxloc = std::min(maxlocptr[i], maxloc); + } + else + { + if (maxlocptr) + maxloc = maxlocptr[i]; + maxval = maxptr[i]; + } + } + } + bool zero_mask = (minLoc && minloc == index_max) || + (maxLoc && maxloc == index_max); + + if (minVal) + *minVal = zero_mask ? 0 : (double)minval; + if (maxVal) + *maxVal = zero_mask ? 0 : (double)maxval; + + if (minLoc) + { + minLoc[0] = zero_mask ? -1 : minloc / cols; + minLoc[1] = zero_mask ? -1 : minloc % cols; } if (maxLoc) { - maxLoc[0] = zero_mask ? -1 : maxloc/cols; - maxLoc[1] = zero_mask ? -1 : maxloc%cols; + maxLoc[0] = zero_mask ? -1 : maxloc / cols; + maxLoc[1] = zero_mask ? -1 : maxloc % cols; } } -typedef void (*getMinMaxResFunc)(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double *minVal, - double *maxVal, int *minLoc, int *maxLoc, const int gropunum, const int cn, const int cols); +typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal, + int *minLoc, int *maxLoc, + int gropunum, int cn, int cols); static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask) { CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); - int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = 1; - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + const ocl::Device & dev = ocl::Device::getDefault(); + bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); if (depth == CV_64F && !doubleSupport) return false; - int groupnum = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + int groupnum = dev.maxComputeUnits(); + size_t wgs = dev.maxWorkGroupSize(); int wgs2_aligned = 1; while (wgs2_aligned < (int)wgs) wgs2_aligned <<= 1; wgs2_aligned >>= 1; - String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d" - " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d", - depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs, - wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn); + bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL, + needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL; - ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); + // in case of mask we must know whether mask is filled with zeros or not + // so let's calculate min or max location, if it's undefined, so mask is zeros + if (!(needMaxLoc || needMinLoc) && haveMask) + if (needMinVal) + needMinLoc = true; + else + needMaxVal = true; + + String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" + " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s", + depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, + ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, + doubleSupport ? " -D DOUBLE_SUPPORT" : "", + _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", + _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, + needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", + needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : ""); + + ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) return false; - UMat src = _src.getUMat(), minval(1, groupnum, src.type()), - maxval(1, groupnum, src.type()), minloc( 1, groupnum, CV_32SC1), - maxloc( 1, groupnum, CV_32SC1), mask; - if (!_mask.empty()) - mask = _mask.getUMat(); + int esz = CV_ELEM_SIZE(depth), esz32s = CV_ELEM_SIZE1(CV_32S), + dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + + (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0)); + UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); - if (src.channels() > 1) + if (cn > 1) src = src.reshape(1); - if (mask.empty()) + if (!haveMask) k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - groupnum, ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval), - ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc)); + groupnum, ocl::KernelArg::PtrWriteOnly(db)); else - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), groupnum, - ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval), - ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc), ocl::KernelArg::ReadOnlyNoSize(mask)); + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); size_t globalsize = groupnum * wgs; if (!k.run(1, &globalsize, &wgs, false)) return false; - Mat minv = minval.getMat(ACCESS_READ), maxv = maxval.getMat(ACCESS_READ), - minl = minloc.getMat(ACCESS_READ), maxl = maxloc.getMat(ACCESS_READ); - - static getMinMaxResFunc functab[7] = + static const getMinMaxResFunc functab[7] = { getMinMaxRes, getMinMaxRes, @@ -1419,10 +1472,12 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* getMinMaxRes }; - getMinMaxResFunc func; + getMinMaxResFunc func = functab[depth]; - func = functab[depth]; - func(minv, maxv, minl, maxl, minVal, maxVal, minLoc, maxLoc, groupnum, src.channels(), src.cols); + int locTemp[2]; + func(db.getMat(ACCESS_READ), minVal, maxVal, + needMinLoc ? minLoc ? minLoc : locTemp : minLoc, + needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, cn, src.cols); return true; } From 1a7a262f7457171e2084f3b8c1af93c21e89c64b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 6 Jun 2014 17:15:19 +0400 Subject: [PATCH 2/6] optimized cv::norm with NORM_INF --- modules/core/src/opencl/minmaxloc.cl | 19 +++++----- modules/core/src/opencl/reduce.cl | 4 +- modules/core/src/stat.cpp | 57 ++++++++++------------------ 3 files changed, 30 insertions(+), 50 deletions(-) diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index 558679efda..2e48387c77 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -36,6 +36,7 @@ #define MAX_VAL DBL_MAX #endif +#define noconvert #define INDEX_MAX UINT_MAX #ifdef NEED_MINLOC @@ -93,20 +94,20 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #ifdef NEED_MINVAL - __local srcT1 localmem_min[WGS2_ALIGNED]; + __local dstT1 localmem_min[WGS2_ALIGNED]; #ifdef NEED_MINLOC __local uint localmem_minloc[WGS2_ALIGNED]; #endif #endif #ifdef NEED_MAXVAL - __local srcT1 localmem_max[WGS2_ALIGNED]; + __local dstT1 localmem_max[WGS2_ALIGNED]; #ifdef NEED_MAXLOC __local uint localmem_maxloc[WGS2_ALIGNED]; #endif #endif - srcT1 minval = MAX_VAL, maxval = MIN_VAL; - srcT temp; + dstT1 minval = MAX_VAL, maxval = MIN_VAL; + dstT temp; uint minloc = INDEX_MAX, maxloc = INDEX_MAX; int src_index; #ifdef HAVE_MASK @@ -130,7 +131,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off if (mask[mask_index]) #endif { - temp = *(__global const srcT *)(srcptr + src_index); + temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); #if kercn == 1 #ifdef NEED_MINVAL if (minval > temp) @@ -262,12 +263,12 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off { int pos = 0; #ifdef NEED_MINVAL - *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_min[0]; - pos = mad24(groupnum, (int)sizeof(srcT1), pos); + *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_min[0]; + pos = mad24(groupnum, (int)sizeof(dstT1), pos); #endif #ifdef NEED_MAXVAL - *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_max[0]; - pos = mad24(groupnum, (int)sizeof(srcT1), pos); + *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max[0]; + pos = mad24(groupnum, (int)sizeof(dstT1), pos); #endif #ifdef NEED_MINLOC *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_minloc[0]; diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 038f132970..21a5518883 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -50,7 +50,7 @@ #endif #endif -#if defined OP_NORM_INF_MASK || defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK +#if defined OP_NORM_INF_MASK #ifdef DEPTH_0 #define MIN_VAL 0 @@ -75,8 +75,6 @@ #define MAX_VAL DBL_MAX #endif -#define INDEX_MAX UINT_MAX - #define dstT srcT #define dstT1 srcT1 diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 9d78c0f107..8996c48015 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1313,7 +1313,7 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx) template void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, int* minLoc, int* maxLoc, - int groupnum, int cn, int cols) + int groupnum, int cols) { uint index_max = std::numeric_limits::max(); T minval = std::numeric_limits::max(); @@ -1393,10 +1393,10 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, } typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal, - int *minLoc, int *maxLoc, - int gropunum, int cn, int cols); + int *minLoc, int *maxLoc, int gropunum, int cols); -static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask) +static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask, + int ddepth = -1, bool absValues = false) { CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); @@ -1405,8 +1405,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); + if (ddepth < 0) + ddepth = depth; - if (depth == CV_64F && !doubleSupport) + if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport) return false; int groupnum = dev.maxComputeUnits(); @@ -1423,26 +1425,32 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* // in case of mask we must know whether mask is filled with zeros or not // so let's calculate min or max location, if it's undefined, so mask is zeros if (!(needMaxLoc || needMinLoc) && haveMask) + { if (needMinVal) needMinLoc = true; else needMaxVal = true; + } + char cvt[40]; String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" - " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s", + " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s" + " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s", depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", - needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : ""); + needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", + ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), + ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : ""); ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) return false; - int esz = CV_ELEM_SIZE(depth), esz32s = CV_ELEM_SIZE1(CV_32S), + int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S), dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0)); UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); @@ -1477,7 +1485,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* int locTemp[2]; func(db.getMat(ACCESS_READ), minVal, maxVal, needMinLoc ? minLoc ? minLoc : locTemp : minLoc, - needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, cn, src.cols); + needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, src.cols); return true; } @@ -2116,35 +2124,8 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & if (normType == NORM_INF) { if (cn == 1 || !haveMask) - { - UMat abssrc; - - if (depth != CV_8U && depth != CV_16U) - { - int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1; - char cvt[50]; - - ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc, - format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s" - " -D convertToDT=%s -D rowsPerWI=%d%s", - ocl::typeToStr(wdepth), ocl::typeToStr(depth), - ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI, - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); - if (kabs.empty()) - return false; - - abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn)); - kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn)); - - size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI }; - if (!kabs.run(2, globalsize, NULL, false)) - return false; - } - else - abssrc = src; - - cv::minMaxIdx(haveMask ? abssrc : abssrc.reshape(1), NULL, &result, NULL, NULL, _mask); - } + ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask, + std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U); else { int dbsize = d.maxComputeUnits(); From 2040995801b0c685c75c14fcd11cfaa974fc5b9c Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 7 Jun 2014 15:51:41 +0400 Subject: [PATCH 3/6] optimized cv::norm with 2 args --- modules/core/src/opencl/minmaxloc.cl | 81 ++++++++-- modules/core/src/opencl/reduce.cl | 219 ++++++++++++++++++++++++++- modules/core/src/stat.cpp | 148 ++++++++++++------ 3 files changed, 387 insertions(+), 61 deletions(-) diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index 2e48387c77..e3d87b04ce 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -73,14 +73,26 @@ #define CALC_MAX(p, inc) #endif +#ifdef OP_CALC2 +#define CALC_MAX2(p) \ + if (maxval2 < temp.p) \ + maxval2 = temp.p +#else +#define CALC_MAX2(p) +#endif + #define CALC_P(p, inc) \ CALC_MIN(p, inc) \ - CALC_MAX(p, inc) + CALC_MAX(p, inc) \ + CALC_MAX2(p) __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols, int total, int groupnum, __global uchar * dstptr #ifdef HAVE_MASK , __global const uchar * mask, int mask_step, int mask_offset +#endif +#ifdef HAVE_SRC2 + , __global const uchar * src2ptr, int src2_step, int src2_offset #endif ) { @@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #ifdef HAVE_MASK mask += mask_offset; #endif +#ifdef HAVE_SRC2 + src2ptr += src2_offset; +#endif #ifdef NEED_MINVAL __local dstT1 localmem_min[WGS2_ALIGNED]; + dstT1 minval = MAX_VAL; #ifdef NEED_MINLOC __local uint localmem_minloc[WGS2_ALIGNED]; + uint minloc = INDEX_MAX; #endif #endif #ifdef NEED_MAXVAL + dstT1 maxval = MIN_VAL; __local dstT1 localmem_max[WGS2_ALIGNED]; #ifdef NEED_MAXLOC __local uint localmem_maxloc[WGS2_ALIGNED]; + uint maxloc = INDEX_MAX; #endif +#endif +#ifdef OP_CALC2 + __local dstT1 localmem_max2[WGS2_ALIGNED]; + dstT1 maxval2 = MIN_VAL; #endif - dstT1 minval = MAX_VAL, maxval = MIN_VAL; - dstT temp; - uint minloc = INDEX_MAX, maxloc = INDEX_MAX; int src_index; #ifdef HAVE_MASK int mask_index; #endif +#ifdef HAVE_SRC2 + int src2_index; +#endif + + dstT temp; +#ifdef HAVE_SRC2 + dstT temp2; +#endif for (int grain = groupnum * WGS * kercn; id < total; id += grain) { -#ifdef HAVE_SRC_CONT - src_index = mul24(id, (int)sizeof(srcT1)); -#else - src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); -#endif - #ifdef HAVE_MASK #ifdef HAVE_MASK_CONT mask_index = id; @@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off if (mask[mask_index]) #endif { +#ifdef HAVE_SRC_CONT + src_index = mul24(id, (int)sizeof(srcT1)); +#else + src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); +#endif temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); +#ifdef OP_ABS + temp = temp >= (dstT)(0) ? temp : -temp; +#endif + +#ifdef HAVE_SRC2 +#ifdef HAVE_SRC2_CONT + src2_index = mul24(id, (int)sizeof(srcT1)); +#else + src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1))); +#endif + temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index)); + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); +#endif + #if kercn == 1 #ifdef NEED_MINVAL if (minval > temp) @@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off maxloc = id; #endif } +#ifdef OP_CALC2 + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; + if (maxval2 < temp2) + maxval2 = temp2; +#endif #endif #elif kercn >= 2 CALC_P(s0, 0) @@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #ifdef NEED_MAXLOC localmem_maxloc[lid] = maxloc; +#endif +#ifdef OP_CALC2 + localmem_max2[lid] = maxval2; #endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif localmem_max[lid3] = maxval; } +#endif +#ifdef OP_CALC2 + if (localmem_max2[lid3] < maxval2) + localmem_max2[lid3] = maxval2; #endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif localmem_max[lid] = localmem_max[lid2]; } +#endif +#ifdef OP_CALC2 + if (localmem_max2[lid] < localmem_max2[lid2]) + localmem_max2[lid] = localmem_max2[lid2]; #endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #ifdef NEED_MAXLOC *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; +#endif +#ifdef OP_CALC2 + pos = mad24(groupnum, (int)sizeof(uint), pos); + *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0]; #endif } } diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 21a5518883..d5350791e3 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -109,13 +109,22 @@ #endif #ifdef HAVE_MASK +#ifdef HAVE_SRC2 +#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset +#else #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset +#endif +#else +#ifdef HAVE_SRC2 +#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset #else #define EXTRA_PARAMS #endif +#endif // accumulative reduction stuff #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT + #ifdef OP_DOT #if ddepth <= 4 #define FUNC(a, b, c) a = mad24(b, c, a) @@ -137,18 +146,48 @@ #endif #endif +#ifdef OP_CALC2 +#define DECLARE_LOCAL_MEM \ + __local dstT localmem[WGS2_ALIGNED]; \ + __local dstT localmem2[WGS2_ALIGNED] +#define DEFINE_ACCUMULATOR \ + dstT accumulator = (dstT)(0); \ + dstT accumulator2 = (dstT)(0) +#else #define DECLARE_LOCAL_MEM \ __local dstT localmem[WGS2_ALIGNED] #define DEFINE_ACCUMULATOR \ dstT accumulator = (dstT)(0) +#endif + +#ifdef HAVE_SRC2 +#ifdef OP_CALC2 +#define PROCESS_ELEMS \ + dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \ + dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp -= temp2; \ + temp = temp > (dstT)(0) ? temp : -temp; \ + FUNC(accumulator2, temp2); \ + FUNC(accumulator, temp) +#else +#define PROCESS_ELEMS \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ + dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp) +#endif +#else +#define PROCESS_ELEMS \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp) +#endif #ifdef HAVE_MASK #define REDUCE_GLOBAL \ MASK_INDEX; \ if (mask[mask_index]) \ { \ - dstT temp = convertToDT(loadpix(srcptr + src_index)); \ - FUNC(accumulator, temp); \ + PROCESS_ELEMS; \ } #elif defined OP_DOT @@ -211,7 +250,158 @@ FUNC(accumulator, temp.sF, temp2.sF) #endif -#else +#else // sum or norm with 2 args +#ifdef HAVE_SRC2 +#ifdef OP_CALC2 // norm relative +#if kercn == 1 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp); \ + FUNC(accumulator2, temp2) +#elif kercn == 2 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1) +#elif kercn == 4 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1); \ + FUNC(accumulator2, temp2.s2); \ + FUNC(accumulator2, temp2.s3) +#elif kercn == 8 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1); \ + FUNC(accumulator2, temp2.s2); \ + FUNC(accumulator2, temp2.s3); \ + FUNC(accumulator2, temp2.s4); \ + FUNC(accumulator2, temp2.s5); \ + FUNC(accumulator2, temp2.s6); \ + FUNC(accumulator2, temp2.s7) +#elif kercn == 16 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator, temp.s8); \ + FUNC(accumulator, temp.s9); \ + FUNC(accumulator, temp.sA); \ + FUNC(accumulator, temp.sB); \ + FUNC(accumulator, temp.sC); \ + FUNC(accumulator, temp.sD); \ + FUNC(accumulator, temp.sE); \ + FUNC(accumulator, temp.sF); \ + FUNC(accumulator2, temp2.s0); \ + FUNC(accumulator2, temp2.s1); \ + FUNC(accumulator2, temp2.s2); \ + FUNC(accumulator2, temp2.s3); \ + FUNC(accumulator2, temp2.s4); \ + FUNC(accumulator2, temp2.s5); \ + FUNC(accumulator2, temp2.s6); \ + FUNC(accumulator2, temp2.s7); \ + FUNC(accumulator2, temp2.s8); \ + FUNC(accumulator2, temp2.s9); \ + FUNC(accumulator2, temp2.sA); \ + FUNC(accumulator2, temp2.sB); \ + FUNC(accumulator2, temp2.sC); \ + FUNC(accumulator2, temp2.sD); \ + FUNC(accumulator2, temp2.sE); \ + FUNC(accumulator2, temp2.sF) +#endif +#else // norm with 2 args +#if kercn == 1 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp) +#elif kercn == 2 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1) +#elif kercn == 4 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3) +#elif kercn == 8 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7) +#elif kercn == 16 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator, temp.s8); \ + FUNC(accumulator, temp.s9); \ + FUNC(accumulator, temp.sA); \ + FUNC(accumulator, temp.sB); \ + FUNC(accumulator, temp.sC); \ + FUNC(accumulator, temp.sD); \ + FUNC(accumulator, temp.sE); \ + FUNC(accumulator, temp.sF) +#endif +#endif + +#else // sum #if kercn == 1 #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ @@ -260,6 +450,7 @@ FUNC(accumulator, temp.sF) #endif #endif +#endif #define SET_LOCAL_1 \ localmem[lid] = accumulator @@ -325,6 +516,20 @@ accumulator += value.sF == zero ? zero : one #endif +#ifdef OP_CALC2 +#define SET_LOCAL_1 \ + localmem[lid] = accumulator; \ + localmem2[lid] = accumulator2; \ +#define REDUCE_LOCAL_1 \ + localmem[lid - WGS2_ALIGNED] += accumulator; \ + localmem2[lid - WGS2_ALIGNED] += accumulator2 +#define REDUCE_LOCAL_2 \ + localmem[lid] += localmem[lid2]; \ + localmem2[lid] += localmem2[lid2] +#define CALC_RESULT \ + storepix(localmem[0], dstptr + dstTSIZE * gid); \ + storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid)) +#else #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -333,6 +538,7 @@ localmem[lid] += localmem[lid2] #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) +#endif // norm (NORM_INF) with cn > 1 and mask #elif defined OP_NORM_INF_MASK @@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset int src_index = mul24(id, srcTSIZE); #else int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); +#endif +#ifdef HAVE_SRC2 +#ifdef HAVE_SRC2_CONT + int src2_index = mul24(id, srcTSIZE); +#else + int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE)); +#endif #endif REDUCE_GLOBAL; } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 8996c48015..b405d6f7b0 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -469,21 +469,25 @@ template Scalar ocl_part_sum(Mat m) enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS = 1, OCL_OP_SUM_SQR = 2 }; -static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() ) +static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(), + InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() ) { CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, - haveMask = _mask.kind() != _InputArray::NONE; + const ocl::Device & dev = ocl::Device::getDefault(); + bool doubleSupport = dev.doubleFPConfig() > 0, + haveMask = _mask.kind() != _InputArray::NONE, + haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1, mcn = std::max(cn, kercn); + CV_Assert(!haveSrc2 || _src2.type() == type); if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) return false; - int dbsize = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1); + size_t wgs = dev.maxWorkGroupSize(); int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth), dtype = CV_MAKE_TYPE(ddepth, cn); @@ -497,7 +501,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; char cvt[40]; String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" - " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d", + " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth), ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), ocl::typeToStr(ddepth), ddepth, cn, @@ -506,30 +510,49 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveMask ? " -D HAVE_MASK" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn); + haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, + haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", + haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); if (k.empty()) return false; - UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat(); + UMat src = _src.getUMat(), src2 = _src2.getUMat(), + db(1, dbsize, dtype), mask = _mask.getUMat(); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), dbarg = ocl::KernelArg::PtrWriteOnly(db), - maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); + maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), + src2arg = ocl::KernelArg::ReadOnlyNoSize(src2); if (haveMask) - k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg); + { + if (haveSrc2) + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg); + else + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg); + } else - k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg); + { + if (haveSrc2) + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg); + else + k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg); + } - size_t globalsize = dbsize * wgs; + size_t globalsize = ngroups * wgs; if (k.run(1, &globalsize, &wgs, false)) { typedef Scalar (*part_sum)(Mat m); part_sum funcs[3] = { ocl_part_sum, ocl_part_sum, ocl_part_sum }, func = funcs[ddepth - CV_32S]; - res = func(db.getMat(ACCESS_READ)); + + Mat mres = db.getMat(ACCESS_READ); + if (calc2) + const_cast(res2) = func(mres.colRange(dbsize, dbsize)); + + res = func(mres.colRange(0, dbsize)); return true; } return false; @@ -1396,18 +1419,21 @@ typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal, int *minLoc, int *maxLoc, int gropunum, int cols); static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask, - int ddepth = -1, bool absValues = false) + int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false) { CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); const ocl::Device & dev = ocl::Device::getDefault(); - bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(); + bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), + haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); if (ddepth < 0) ddepth = depth; + CV_Assert(!haveSrc2 || _src2.type() == type); + if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport) return false; @@ -1435,7 +1461,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* char cvt[40]; String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s" - " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s", + " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s", depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", @@ -1444,7 +1470,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), - ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : ""); + ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "", + haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", + haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) @@ -1452,18 +1480,35 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S), dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + - (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0)); - UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); + (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) + + (calc2 ? esz : 0)); + UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); if (cn > 1) + { src = src.reshape(1); + src2 = src2.reshape(1); + } - if (!haveMask) - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - groupnum, ocl::KernelArg::PtrWriteOnly(db)); + if (haveSrc2) + { + if (!haveMask) + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2)); + else + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask), + ocl::KernelArg::ReadOnlyNoSize(src2)); + } else - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); + { + if (!haveMask) + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db)); + else + k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), + groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); + } size_t globalsize = groupnum * wgs; if (!k.run(1, &globalsize, &wgs, false)) @@ -2498,38 +2543,45 @@ namespace cv { static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) { - const ocl::Device & d = ocl::Device::getDefault(); - int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1; - bool doubleSupport = d.doubleFPConfig() > 0; - bool relative = (normType & NORM_RELATIVE) != 0; + Scalar sc1, sc2; + int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + bool relative = (normType & NORM_RELATIVE) != 0, + normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; normType &= ~NORM_RELATIVE; - if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) || - (!doubleSupport && depth == CV_64F)) + if ( !(normType == NORM_INF || normsum) ) return false; - int wdepth = std::max(CV_32S, depth); - char cvt[50]; - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1" - " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s", - ocl::typeToStr(wdepth), ocl::typeToStr(depth), - ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI, - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); - if (k.empty()) - return false; + if (normsum) + { + if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ? + OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2)) + return false; + } + else + { + if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth), + false, _src2, relative)) + return false; + } - UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), diff(src1.size(), CV_MAKE_TYPE(wdepth, cn)); - k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2), - ocl::KernelArg::WriteOnly(diff, cn)); + double s2 = 0; + for (int i = 0; i < cn; ++i) + { + result += sc1[i]; + if (relative) + s2 += sc2[i]; + } - size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI }; - if (!k.run(2, globalsize, NULL, false)) - return false; + if (normType == NORM_L2) + { + result = std::sqrt(result); + if (relative) + s2 = std::sqrt(s2); + } - result = cv::norm(diff, normType, _mask); if (relative) - result /= cv::norm(src2, normType, _mask) + DBL_EPSILON; + result /= (s2 + DBL_EPSILON); return true; } From 5403bdd2286cd8a4fb2b2b688fd02f6d86361a4b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 7 Jun 2014 20:53:20 +0400 Subject: [PATCH 4/6] optimized cv::norm with NORM_RELATIVE --- modules/core/src/opencl/minmaxloc.cl | 6 ++-- modules/core/src/opencl/reduce.cl | 3 ++ modules/core/src/stat.cpp | 44 +++++++++++++++++---------- modules/core/test/ocl/test_arithm.cpp | 12 ++++++++ 4 files changed, 47 insertions(+), 18 deletions(-) diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index e3d87b04ce..11b6da949b 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -76,7 +76,7 @@ #ifdef OP_CALC2 #define CALC_MAX2(p) \ if (maxval2 < temp.p) \ - maxval2 = temp.p + maxval2 = temp.p; #else #define CALC_MAX2(p) #endif @@ -171,6 +171,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index)); temp = temp > temp2 ? temp - temp2 : (temp2 - temp); +#ifdef OP_CALC2 + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; +#endif #endif #if kercn == 1 @@ -192,7 +195,6 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif } #ifdef OP_CALC2 - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; if (maxval2 < temp2) maxval2 = temp2; #endif diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index d5350791e3..92818e356b 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -580,6 +580,9 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset int id = get_global_id(0) * kercn; srcptr += src_offset; +#ifdef HAVE_SRC2 + src2ptr += src2_offset; +#endif DECLARE_LOCAL_MEM; DEFINE_ACCUMULATOR; diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index b405d6f7b0..34c487ae31 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1334,17 +1334,17 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx) #ifdef HAVE_OPENCL template -void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, +void getMinMaxRes(const Mat & db, double * minVal, double * maxVal, int* minLoc, int* maxLoc, - int groupnum, int cols) + int groupnum, int cols, double * maxVal2) { uint index_max = std::numeric_limits::max(); T minval = std::numeric_limits::max(); - T maxval = std::numeric_limits::min() > 0 ? -std::numeric_limits::max() : std::numeric_limits::min(); + T maxval = std::numeric_limits::min() > 0 ? -std::numeric_limits::max() : std::numeric_limits::min(), maxval2 = maxval; uint minloc = index_max, maxloc = index_max; int index = 0; - const T * minptr = NULL, * maxptr = NULL; + const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL; const uint * minlocptr = NULL, * maxlocptr = NULL; if (minVal || minLoc) { @@ -1362,7 +1362,12 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, index += sizeof(uint) * groupnum; } if (maxLoc) + { maxlocptr = (uint *)(db.data + index); + index += sizeof(uint) * groupnum; + } + if (maxVal2) + maxptr2 = (const T *)(db.data + index); for (int i = 0; i < groupnum; i++) { @@ -1394,6 +1399,8 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, maxval = maxptr[i]; } } + if (maxptr2 && maxptr2[i] > maxval2) + maxval2 = maxptr2[i]; } bool zero_mask = (minLoc && minloc == index_max) || (maxLoc && maxloc == index_max); @@ -1402,6 +1409,8 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, *minVal = zero_mask ? 0 : (double)minval; if (maxVal) *maxVal = zero_mask ? 0 : (double)maxval; + if (maxVal2) + *maxVal2 = zero_mask ? 0 : (double)maxval2; if (minLoc) { @@ -1415,20 +1424,21 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal, } } -typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal, - int *minLoc, int *maxLoc, int gropunum, int cols); +typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal, + int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2); static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask, - int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false) + int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL) { - CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || - (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); - const ocl::Device & dev = ocl::Device::getDefault(); bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); + + CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) || + (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) ); + if (ddepth < 0) ddepth = depth; @@ -1471,7 +1481,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "", - haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", + haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "", haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); @@ -1481,7 +1491,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S), dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) + - (calc2 ? esz : 0)); + (maxVal2 ? esz : 0)); UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); if (cn > 1) @@ -1525,12 +1535,13 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* getMinMaxRes }; - getMinMaxResFunc func = functab[depth]; + getMinMaxResFunc func = functab[ddepth]; int locTemp[2]; func(db.getMat(ACCESS_READ), minVal, maxVal, needMinLoc ? minLoc ? minLoc : locTemp : minLoc, - needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, src.cols); + needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, + groupnum, src.cols, maxVal2); return true; } @@ -2560,9 +2571,10 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr } else { - if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth), - false, _src2, relative)) + if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth), + false, _src2, relative ? &sc2[0] : NULL)) return false; + cn = 1; } double s2 = 0; diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp index d39697584b..a7a09cabb7 100644 --- a/modules/core/test/ocl/test_arithm.cpp +++ b/modules/core/test/ocl/test_arithm.cpp @@ -1293,6 +1293,8 @@ OCL_TEST_P(Norm, NORM_INF_2args) { generateTestData(); + SCOPED_TRACE(relative ? "NORM_RELATIVE" : ""); + int type = NORM_INF; if (relative == 1) type |= NORM_RELATIVE; @@ -1311,6 +1313,8 @@ OCL_TEST_P(Norm, NORM_INF_2args_mask) { generateTestData(); + SCOPED_TRACE(relative ? "NORM_RELATIVE" : ""); + int type = NORM_INF; if (relative == 1) type |= NORM_RELATIVE; @@ -1329,6 +1333,8 @@ OCL_TEST_P(Norm, NORM_L1_2args) { generateTestData(); + SCOPED_TRACE(relative ? "NORM_RELATIVE" : ""); + int type = NORM_L1; if (relative == 1) type |= NORM_RELATIVE; @@ -1347,6 +1353,8 @@ OCL_TEST_P(Norm, NORM_L1_2args_mask) { generateTestData(); + SCOPED_TRACE(relative ? "NORM_RELATIVE" : ""); + int type = NORM_L1; if (relative == 1) type |= NORM_RELATIVE; @@ -1365,6 +1373,8 @@ OCL_TEST_P(Norm, NORM_L2_2args) { generateTestData(); + SCOPED_TRACE(relative ? "NORM_RELATIVE" : ""); + int type = NORM_L2; if (relative == 1) type |= NORM_RELATIVE; @@ -1383,6 +1393,8 @@ OCL_TEST_P(Norm, NORM_L2_2args_mask) { generateTestData(); + SCOPED_TRACE(relative ? "NORM_RELATIVE" : ""); + int type = NORM_L2; if (relative == 1) type |= NORM_RELATIVE; From 7f2662b310489d3336cadf46c386d271ebf09ae0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 9 Jun 2014 00:50:14 +0400 Subject: [PATCH 5/6] fixes --- .../include/opencv2/core/opencl/ocl_defs.hpp | 2 +- modules/core/src/opencl/minmaxloc.cl | 40 ++++++++++------ modules/core/src/opencl/reduce.cl | 47 ++++++++++--------- modules/core/src/stat.cpp | 19 ++++---- 4 files changed, 62 insertions(+), 46 deletions(-) diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp index 55f8849b8a..76d4f84365 100644 --- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp +++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp @@ -5,7 +5,7 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. -//#define CV_OPENCL_RUN_ASSERT +#define CV_OPENCL_RUN_ASSERT #ifdef HAVE_OPENCL diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index 11b6da949b..56de655dff 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -15,16 +15,16 @@ #ifdef DEPTH_0 #define MIN_VAL 0 -#define MAX_VAL 255 +#define MAX_VAL UCHAR_MAX #elif defined DEPTH_1 -#define MIN_VAL -128 -#define MAX_VAL 127 +#define MIN_VAL SCHAR_MIN +#define MAX_VAL SCHAR_MAX #elif defined DEPTH_2 #define MIN_VAL 0 -#define MAX_VAL 65535 +#define MAX_VAL USHRT_MAX #elif defined DEPTH_3 -#define MIN_VAL -32768 -#define MAX_VAL 32767 +#define MIN_VAL SHRT_MIN +#define MAX_VAL SHRT_MAX #elif defined DEPTH_4 #define MIN_VAL INT_MIN #define MAX_VAL INT_MAX @@ -39,6 +39,14 @@ #define noconvert #define INDEX_MAX UINT_MAX +#if kercn != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define srcTSIZE (int)sizeof(srcT1) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define srcTSIZE ((int)sizeof(srcT1)) +#endif + #ifdef NEED_MINLOC #define CALC_MINLOC(inc) minloc = id + inc #else @@ -154,22 +162,22 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif { #ifdef HAVE_SRC_CONT - src_index = mul24(id, (int)sizeof(srcT1)); + src_index = mul24(id, srcTSIZE); #else - src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1))); + src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); #endif - temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); + temp = convertToDT(loadpix(srcptr + src_index)); #ifdef OP_ABS temp = temp >= (dstT)(0) ? temp : -temp; #endif #ifdef HAVE_SRC2 #ifdef HAVE_SRC2_CONT - src2_index = mul24(id, (int)sizeof(srcT1)); + src2_index = mul24(id, srcTSIZE); #else - src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1))); + src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE)); #endif - temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index)); + temp2 = convertToDT(loadpix(src2ptr + src2_index)); temp = temp > temp2 ? temp - temp2 : (temp2 - temp); #ifdef OP_CALC2 temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; @@ -202,8 +210,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #elif kercn >= 2 CALC_P(s0, 0) CALC_P(s1, 1) -#if kercn >= 4 +#if kercn >= 3 CALC_P(s2, 2) +#if kercn >= 4 CALC_P(s3, 3) #if kercn >= 8 CALC_P(s4, 4) @@ -222,6 +231,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #endif #endif +#endif #endif } } @@ -335,9 +345,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif #ifdef NEED_MAXLOC *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; -#endif #ifdef OP_CALC2 pos = mad24(groupnum, (int)sizeof(uint), pos); +#endif +#endif +#ifdef OP_CALC2 *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0]; #endif } diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 92818e356b..9418cec0de 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -148,11 +148,9 @@ #ifdef OP_CALC2 #define DECLARE_LOCAL_MEM \ - __local dstT localmem[WGS2_ALIGNED]; \ - __local dstT localmem2[WGS2_ALIGNED] + __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED] #define DEFINE_ACCUMULATOR \ - dstT accumulator = (dstT)(0); \ - dstT accumulator2 = (dstT)(0) + dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0) #else #define DECLARE_LOCAL_MEM \ __local dstT localmem[WGS2_ALIGNED] @@ -163,10 +161,10 @@ #ifdef HAVE_SRC2 #ifdef OP_CALC2 #define PROCESS_ELEMS \ - dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp -= temp2; \ - temp = temp > (dstT)(0) ? temp : -temp; \ + temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator2, temp2); \ FUNC(accumulator, temp) #else @@ -258,6 +256,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp); \ FUNC(accumulator2, temp2) #elif kercn == 2 @@ -265,6 +264,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator2, temp2.s0); \ @@ -274,6 +274,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -287,6 +288,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -308,6 +310,7 @@ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -452,6 +455,20 @@ #endif #endif +#ifdef OP_CALC2 +#define SET_LOCAL_1 \ + localmem[lid] = accumulator; \ + localmem2[lid] = accumulator2 +#define REDUCE_LOCAL_1 \ + localmem[lid - WGS2_ALIGNED] += accumulator; \ + localmem2[lid - WGS2_ALIGNED] += accumulator2 +#define REDUCE_LOCAL_2 \ + localmem[lid] += localmem[lid2]; \ + localmem2[lid] += localmem2[lid2] +#define CALC_RESULT \ + storepix(localmem[0], dstptr + dstTSIZE * gid); \ + storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid)) +#else #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -460,6 +477,7 @@ localmem[lid] += localmem[lid2] #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) +#endif // countNonZero stuff #elif defined OP_COUNT_NON_ZERO @@ -516,20 +534,6 @@ accumulator += value.sF == zero ? zero : one #endif -#ifdef OP_CALC2 -#define SET_LOCAL_1 \ - localmem[lid] = accumulator; \ - localmem2[lid] = accumulator2; \ -#define REDUCE_LOCAL_1 \ - localmem[lid - WGS2_ALIGNED] += accumulator; \ - localmem2[lid - WGS2_ALIGNED] += accumulator2 -#define REDUCE_LOCAL_2 \ - localmem[lid] += localmem[lid2]; \ - localmem2[lid] += localmem2[lid2] -#define CALC_RESULT \ - storepix(localmem[0], dstptr + dstTSIZE * gid); \ - storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid)) -#else #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -538,7 +542,6 @@ localmem[lid] += localmem[lid2] #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) -#endif // norm (NORM_INF) with cn > 1 and mask #elif defined OP_NORM_INF_MASK diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 34c487ae31..01f50fa232 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -550,9 +550,9 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask Mat mres = db.getMat(ACCESS_READ); if (calc2) - const_cast(res2) = func(mres.colRange(dbsize, dbsize)); + const_cast(res2) = func(mres.colRange(ngroups, dbsize)); - res = func(mres.colRange(0, dbsize)); + res = func(mres.colRange(0, ngroups)); return true; } return false; @@ -1434,10 +1434,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), haveSrc2 = _src2.kind() != _InputArray::NONE; int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), - kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); + kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src)); - CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) || - (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) ); + CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) || + (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) ); if (ddepth < 0) ddepth = depth; @@ -1484,6 +1484,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "", haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); + printf("%s\n", opts.c_str()); + ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) return false; @@ -2556,9 +2558,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr { Scalar sc1, sc2; int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool relative = (normType & NORM_RELATIVE) != 0, - normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; + bool relative = (normType & NORM_RELATIVE) != 0; normType &= ~NORM_RELATIVE; + bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; if ( !(normType == NORM_INF || normsum) ) return false; @@ -2608,8 +2610,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m #ifdef HAVE_OPENCL double _result = 0; - CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() && - _src1.dims() <= 2 && _src2.dims() <= 2, + CV_OCL_RUN_(_src1.isUMat(), ocl_norm(_src1, _src2, normType, _mask, _result), _result) #endif From 634da9f3bfbb32a6c337623d34fb74a879d147f3 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 9 Jun 2014 15:32:35 +0400 Subject: [PATCH 6/6] added norm_inf support to minmaxloc kernel --- .../include/opencv2/core/opencl/ocl_defs.hpp | 2 +- modules/core/src/opencl/minmaxloc.cl | 22 ++++++++-- modules/core/src/opencl/reduce.cl | 25 +---------- modules/core/src/stat.cpp | 44 +++---------------- 4 files changed, 26 insertions(+), 67 deletions(-) diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp index 76d4f84365..55f8849b8a 100644 --- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp +++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp @@ -5,7 +5,7 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#define CV_OPENCL_RUN_ASSERT +//#define CV_OPENCL_RUN_ASSERT #ifdef HAVE_OPENCL diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index 56de655dff..eb57347a28 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -41,10 +41,15 @@ #if kercn != 3 #define loadpix(addr) *(__global const srcT *)(addr) -#define srcTSIZE (int)sizeof(srcT1) +#define srcTSIZE (int)sizeof(srcT) #else #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) -#define srcTSIZE ((int)sizeof(srcT1)) +#define srcTSIZE ((int)sizeof(srcT1) * 3) +#endif + +#ifndef HAVE_MASK +#undef srcTSIZE +#define srcTSIZE (int)sizeof(srcT1) #endif #ifdef NEED_MINLOC @@ -106,7 +111,12 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off { int lid = get_local_id(0); int gid = get_group_id(0); - int id = get_global_id(0) * kercn; + int id = get_global_id(0) +#ifndef HAVE_MASK + * kercn; +#else + ; +#endif srcptr += src_offset; #ifdef HAVE_MASK @@ -150,7 +160,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off dstT temp2; #endif - for (int grain = groupnum * WGS * kercn; id < total; id += grain) + for (int grain = groupnum * WGS +#ifndef HAVE_MASK + * kercn +#endif + ; id < total; id += grain) { #ifdef HAVE_MASK #ifdef HAVE_MASK_CONT diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 9418cec0de..888b5dff8b 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -543,32 +543,9 @@ #define CALC_RESULT \ storepix(localmem[0], dstptr + dstTSIZE * gid) -// norm (NORM_INF) with cn > 1 and mask -#elif defined OP_NORM_INF_MASK - -#define DECLARE_LOCAL_MEM \ - __local srcT localmem_max[WGS2_ALIGNED] -#define DEFINE_ACCUMULATOR \ - srcT maxval = MIN_VAL, temp -#define REDUCE_GLOBAL \ - MASK_INDEX; \ - if (mask[mask_index]) \ - { \ - temp = loadpix(srcptr + src_index); \ - maxval = max(maxval, (srcT)(temp >= (srcT)(0) ? temp : -temp)); \ - } -#define SET_LOCAL_1 \ - localmem_max[lid] = maxval -#define REDUCE_LOCAL_1 \ - localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid - WGS2_ALIGNED]) -#define REDUCE_LOCAL_2 \ - localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]) -#define CALC_RESULT \ - storepix(localmem_max[0], dstptr + dstTSIZE * gid) - #else #error "No operation" -#endif // end of norm (NORM_INF) with cn > 1 and mask +#endif #ifdef OP_DOT #undef EXTRA_PARAMS diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 01f50fa232..79da3c623f 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1437,7 +1437,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src)); CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) || - (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) ); + (cn >= 1 && !minLoc && !maxLoc) ); if (ddepth < 0) ddepth = depth; @@ -1465,7 +1465,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* if (needMinVal) needMinLoc = true; else - needMaxVal = true; + needMaxLoc = true; } char cvt[40]; @@ -1484,8 +1484,6 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "", haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); - printf("%s\n", opts.c_str()); - ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty()) return false; @@ -1496,7 +1494,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* (maxVal2 ? esz : 0)); UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); - if (cn > 1) + if (cn > 1 && !haveMask) { src = src.reshape(1); src2 = src2.reshape(1); @@ -2181,39 +2179,9 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & if (normType == NORM_INF) { - if (cn == 1 || !haveMask) - ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask, - std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U); - else - { - int dbsize = d.maxComputeUnits(); - size_t wgs = d.maxWorkGroupSize(); - - int wgs2_aligned = 1; - while (wgs2_aligned < (int)wgs) - wgs2_aligned <<= 1; - wgs2_aligned >>= 1; - - ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d" - " -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s%s%s", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), - wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - src.isContinuous() ? " -D HAVE_CONT_SRC" : "", - _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "")); - if (k.empty()) - return false; - - UMat db(1, dbsize, type), mask = _mask.getUMat(); - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - dbsize, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); - - size_t globalsize = dbsize * wgs; - if (!k.run(1, &globalsize, &wgs, true)) - return false; - - minMaxIdx(db.getMat(ACCESS_READ), NULL, &result, NULL, NULL, noArray()); - } + if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask, + std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U)) + return false; } else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) {