diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 0dd695bfa6..0962f9256c 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -68,7 +68,6 @@ namespace cv extern const char *arithm_sum; extern const char *arithm_sum_3; extern const char *arithm_minMax; - extern const char *arithm_minMax_mask; extern const char *arithm_minMaxLoc; extern const char *arithm_minMaxLoc_mask; extern const char *arithm_LUT; @@ -455,139 +454,121 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev) //////////////////////////////////// minMax ///////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen , int groupnum, string kernelName) +template +static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem &dst, int groupnum, string kernelName) { - vector > args; - int all_cols = src.step / (vlen * src.elemSize1()); - int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1()); - int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / (vlen * src.elemSize1()) - 1; + int all_cols = src.step / src.elemSize(); + int pre_cols = (src.offset % src.step) / src.elemSize(); + int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1; int invalid_cols = pre_cols + sec_cols; - int cols = all_cols - invalid_cols , elemnum = cols * src.rows;; - int offset = src.offset / (vlen * src.elemSize1()); - int repeat_s = src.offset / src.elemSize1() - offset * vlen; - int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols * src.oclchannels(); - char build_options[50]; - sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d", src.depth(), repeat_s, repeat_e); + int cols = all_cols - invalid_cols , elemnum = cols * src.rows; + int offset = src.offset / src.elemSize(); + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char * const channelMap[] = { " ", " ", "2", "4", "4" }; + + ostringstream stream; + stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()]; + stream << " -D MAX_VAL=" << (WT)numeric_limits::max(); + stream << " -D MIN_VAL=" << (WT)numeric_limits::min(); + string buildOptions = stream.str(); + + vector > args; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&offset)); args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + + int minvalid_cols = 0, moffset = 0; if (!mask.empty()) { - int mall_cols = mask.step / (vlen * mask.elemSize1()); - int mpre_cols = (mask.offset % mask.step) / (vlen * mask.elemSize1()); - int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / (vlen * mask.elemSize1()) - 1; - int minvalid_cols = mpre_cols + msec_cols; - int moffset = mask.offset / (vlen * mask.elemSize1()); + int mall_cols = mask.step / mask.elemSize(); + int mpre_cols = (mask.offset % mask.step) / mask.elemSize(); + int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / mask.elemSize() - 1; + minvalid_cols = mpre_cols + msec_cols; + moffset = mask.offset / mask.elemSize(); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); args.push_back( make_pair( sizeof(cl_int) , (void *)&minvalid_cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); + + kernelName += "_mask"; } - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); - size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, gt, lt, args, -1, -1, build_options); + + size_t globalThreads[3] = {groupnum * 256, 1, 1}; + size_t localThreads[3] = {256, 1, 1}; + + openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } - -static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen, int groupnum, string kernelName) -{ - vector > args; - size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; - char build_options[50]; - if (src.oclchannels() == 1) - { - int cols = (src.cols - 1) / vlen + 1; - int invalid_cols = src.step / (vlen * src.elemSize1()) - cols; - int offset = src.offset / src.elemSize1(); - int repeat_me = vlen - (mask.cols % vlen == 0 ? vlen : mask.cols % vlen); - int minvalid_cols = mask.step / (vlen * mask.elemSize1()) - cols; - int moffset = mask.offset / mask.elemSize1(); - int elemnum = cols * src.rows; - sprintf(build_options, "-D DEPTH_%d -D REPEAT_E%d", src.depth(), repeat_me); - args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&minvalid_cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); - openCLExecuteKernel(src.clCxt, &arithm_minMax_mask, kernelName, gt, lt, args, -1, -1, build_options); - } -} - -template void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, +template +void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf) { size_t groupnum = src.clCxt->computeUnits(); CV_Assert(groupnum != 0); - groupnum = groupnum * 2; - int vlen = 8; - int dbsize = groupnum * 2 * vlen * sizeof(T) ; + int dbsize = groupnum * 2 * src.elemSize(); ensureSizeIsEnough(1, dbsize, CV_8UC1, buf); cl_mem buf_data = reinterpret_cast(buf.data); - - if (mask.empty()) - { - arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax"); - } - else - { - arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask"); - } + arithmetic_minMax_run(src, mask, buf_data, groupnum, "arithm_op_minMax"); Mat matbuf = Mat(buf); T *p = matbuf.ptr(); if (minVal != NULL) { *minVal = std::numeric_limits::max(); - for (int i = 0; i < vlen * (int)groupnum; i++) - { + for (int i = 0, end = src.oclchannels() * (int)groupnum; i < end; i++) *minVal = *minVal < p[i] ? *minVal : p[i]; - } } if (maxVal != NULL) { *maxVal = -std::numeric_limits::max(); - for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++) - { + for (int i = src.oclchannels() * (int)groupnum, end = i << 1; i < end; i++) *maxVal = *maxVal > p[i] ? *maxVal : p[i]; - } } } -typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf); + void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask) { oclMat buf; minMax_buf(src, minVal, maxVal, mask, buf); } +typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf); + void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf) { - CV_Assert(src.oclchannels() == 1); + CV_Assert(src.channels() == 1); + CV_Assert(src.size() == mask.size() || mask.empty()); + CV_Assert(src.step % src.elemSize() == 0); + + if (minVal == NULL && maxVal == NULL) + return; + if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } + static minMaxFunc functab[8] = { - arithmetic_minMax, - arithmetic_minMax, - arithmetic_minMax, - arithmetic_minMax, - arithmetic_minMax, - arithmetic_minMax, - arithmetic_minMax, + arithmetic_minMax, + arithmetic_minMax, + arithmetic_minMax, + arithmetic_minMax, + arithmetic_minMax, + arithmetic_minMax, + arithmetic_minMax, 0 }; + minMaxFunc func; func = functab[src.depth()]; func(src, minVal, maxVal, mask, buf); diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl index 23b2933066..c5d3ec2abd 100644 --- a/modules/ocl/src/opencl/arithm_minMax.cl +++ b/modules/ocl/src/opencl/arithm_minMax.cl @@ -53,169 +53,117 @@ #endif #endif -#if defined (DEPTH_0) -#define VEC_TYPE uchar8 -#define CONVERT_TYPE convert_uchar8 -#define MIN_VAL 0 -#define MAX_VAL 255 -#endif -#if defined (DEPTH_1) -#define VEC_TYPE char8 -#define CONVERT_TYPE convert_char8 -#define MIN_VAL -128 -#define MAX_VAL 127 -#endif -#if defined (DEPTH_2) -#define VEC_TYPE ushort8 -#define CONVERT_TYPE convert_ushort8 -#define MIN_VAL 0 -#define MAX_VAL 65535 -#endif -#if defined (DEPTH_3) -#define VEC_TYPE short8 -#define CONVERT_TYPE convert_short8 -#define MIN_VAL -32768 -#define MAX_VAL 32767 -#endif -#if defined (DEPTH_4) -#define VEC_TYPE int8 -#define CONVERT_TYPE convert_int8 -#define MIN_VAL INT_MIN -#define MAX_VAL INT_MAX -#endif -#if defined (DEPTH_5) -#define VEC_TYPE float8 -#define CONVERT_TYPE convert_float8 -#define MIN_VAL (-FLT_MAX) -#define MAX_VAL FLT_MAX -#endif -#if defined (DEPTH_6) -#define VEC_TYPE double8 -#define CONVERT_TYPE convert_double8 -#define MIN_VAL (-DBL_MAX) -#define MAX_VAL DBL_MAX -#endif - -#if defined (REPEAT_S0) -#define repeat_s(a) a = a; -#endif -#if defined (REPEAT_S1) -#define repeat_s(a) a.s0 = a.s1; -#endif -#if defined (REPEAT_S2) -#define repeat_s(a) a.s0 = a.s2;a.s1 = a.s2; -#endif -#if defined (REPEAT_S3) -#define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3; -#endif -#if defined (REPEAT_S4) -#define repeat_s(a) a.s0 = a.s4;a.s1 = a.s4;a.s2 = a.s4;a.s3 = a.s4; -#endif -#if defined (REPEAT_S5) -#define repeat_s(a) a.s0 = a.s5;a.s1 = a.s5;a.s2 = a.s5;a.s3 = a.s5;a.s4 = a.s5; -#endif -#if defined (REPEAT_S6) -#define repeat_s(a) a.s0 = a.s6;a.s1 = a.s6;a.s2 = a.s6;a.s3 = a.s6;a.s4 = a.s6;a.s5 = a.s6; -#endif -#if defined (REPEAT_S7) -#define repeat_s(a) a.s0 = a.s7;a.s1 = a.s7;a.s2 = a.s7;a.s3 = a.s7;a.s4 = a.s7;a.s5 = a.s7;a.s6 = a.s7; -#endif - -#if defined (REPEAT_E0) -#define repeat_e(a) a = a; -#endif -#if defined (REPEAT_E1) -#define repeat_e(a) a.s7 = a.s6; -#endif -#if defined (REPEAT_E2) -#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5; -#endif -#if defined (REPEAT_E3) -#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4; -#endif -#if defined (REPEAT_E4) -#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3; -#endif -#if defined (REPEAT_E5) -#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2; -#endif -#if defined (REPEAT_E6) -#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1; -#endif -#if defined (REPEAT_E7) -#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; -#endif - #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable /**************************************Array minMax**************************************/ -__kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum,int groupnum, - __global VEC_TYPE *src, __global VEC_TYPE *dst) + +__kernel void arithm_op_minMax(__global const T * src, __global T * dst, + int cols, int invalid_cols, int offset, int elemnum, int groupnum) { unsigned int lid = get_local_id(0); unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); + unsigned int id = get_global_id(0); + unsigned int idx = offset + id + (id / cols) * invalid_cols; - __local VEC_TYPE localmem_max[128],localmem_min[128]; - VEC_TYPE minval,maxval,temp; - if(id < elemnum) - { - temp = src[idx]; - if(id % cols == 0 ) - { - repeat_s(temp); - } - if(id % cols == cols - 1) - { - repeat_e(temp); - } - minval = temp; - maxval = temp; - } - else - { - minval = MAX_VAL; - maxval = MIN_VAL; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) + + __local T localmem_max[128], localmem_min[128]; + T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) { idx = offset + id + (id / cols) * invalid_cols; temp = src[idx]; - if(id % cols == 0 ) - { - repeat_s(temp); - } - if(id % cols == cols - 1) - { - repeat_e(temp); - } - minval = min(minval,temp); - maxval = max(maxval,temp); + minval = min(minval, temp); + maxval = max(maxval, temp); } + if(lid > 127) { localmem_min[lid - 128] = minval; localmem_max[lid - 128] = maxval; } barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 128) { - localmem_min[lid] = min(minval,localmem_min[lid]); - localmem_max[lid] = max(maxval,localmem_max[lid]); + localmem_min[lid] = min(minval, localmem_min[lid]); + localmem_max[lid] = max(maxval, localmem_max[lid]); } barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) + + for (int lsize = 64; lsize > 0; lsize >>= 1) { - if(lid < lsize) + if (lid < lsize) { int lid2 = lsize + lid; - localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]); - localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]); + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); } barrier(CLK_LOCAL_MEM_FENCE); } - if( lid == 0) + + if (lid == 0) + { + dst[gid] = localmem_min[0]; + dst[gid + groupnum] = localmem_max[0]; + } +} + +__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst, + int cols, int invalid_cols, int offset, + int elemnum, int groupnum, + const __global uchar * mask, int minvalid_cols, int moffset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + unsigned int id = get_global_id(0); + + unsigned int idx = offset + id + (id / cols) * invalid_cols; + unsigned int midx = moffset + id + (id / cols) * minvalid_cols; + + __local T localmem_max[128], localmem_min[128]; + T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) + { + idx = offset + id + (id / cols) * invalid_cols; + midx = moffset + id + (id / cols) * minvalid_cols; + + if (mask[midx]) + { + temp = src[idx]; + minval = min(minval, temp); + maxval = max(maxval, temp); + } + } + + if(lid > 127) + { + localmem_min[lid - 128] = minval; + localmem_max[lid - 128] = maxval; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(lid < 128) + { + localmem_min[lid] = min(minval, localmem_min[lid]); + localmem_max[lid] = max(maxval, localmem_max[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = 64; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); + localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) { dst[gid] = localmem_min[0]; dst[gid + groupnum] = localmem_max[0]; diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index 9b20dbf89c..acac38fea2 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -753,7 +753,7 @@ TEST_P(MinMax, MAT) } } -TEST_P(MinMax, DISABLED_MASK) +TEST_P(MinMax, MASK) { for (int j = 0; j < LOOP_TIMES; j++) {