From b864f48274378a91d1aaa35ba3468693b75d201f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 27 Sep 2013 17:56:30 +0400 Subject: [PATCH] fixed ocl::sum, ocl::sqrSum, ocl::absSum --- modules/ocl/src/arithm.cpp | 110 ++++++----- modules/ocl/src/opencl/arithm_sum.cl | 161 +++------------- modules/ocl/src/opencl/arithm_sum_3.cl | 247 ------------------------- modules/ocl/test/test_arithm.cpp | 120 +++++++++++- 4 files changed, 215 insertions(+), 423 deletions(-) delete mode 100644 modules/ocl/src/opencl/arithm_sum_3.cl diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 24420f477d..deb5163a4a 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -66,7 +66,6 @@ namespace cv extern const char *arithm_nonzero; extern const char *arithm_sum; - extern const char *arithm_sum_3; extern const char *arithm_minMax; extern const char *arithm_minMaxLoc; extern const char *arithm_minMaxLoc_mask; @@ -317,21 +316,28 @@ void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int ////////////////////////////////// sum ////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// -//type = 0 sum,type = 1 absSum,type = 2 sqrSum -static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum, int type = 0) +enum { SUM = 0, ABS_SUM, SQR_SUM }; + +static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int groupnum, int type, int ddepth) { - vector > args; - int all_cols = src.step / (vlen * src.elemSize1()); - int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1()); - int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / (vlen * src.elemSize1()) - 1; + int ochannels = src.oclchannels(); + int all_cols = src.step / src.elemSize(); + int pre_cols = (src.offset % src.step) / src.elemSize(); + int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1; int invalid_cols = pre_cols + sec_cols; int cols = all_cols - invalid_cols , elemnum = cols * src.rows;; - int offset = src.offset / (vlen * src.elemSize1()); - int repeat_s = src.offset / src.elemSize1() - offset * vlen; - int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols * src.oclchannels(); - char build_options[512]; - CV_Assert(type == 0 || type == 1 || type == 2); - sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d -D FUNC_TYPE_%d", src.depth(), repeat_s, repeat_e, type); + int offset = src.offset / src.elemSize(); + + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + const char * const funcMap[] = { "FUNC_SUM", "FUNC_ABS_SUM", "FUNC_SQR_SUM" }; + const char * const channelMap[] = { " ", " ", "2", "4", "4" }; + string buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s -D %s", + typeMap[src.depth()], channelMap[ochannels], + typeMap[ddepth], channelMap[ochannels], + typeMap[ddepth], channelMap[ochannels], + funcMap[type]); + + vector > args; args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&offset)); @@ -339,55 +345,63 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen , args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); - size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; - if (src.oclchannels() != 3) - openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", gt, lt, args, -1, -1, build_options); - else - openCLExecuteKernel(src.clCxt, &arithm_sum_3, "arithm_op_sum_3", gt, lt, args, -1, -1, build_options); + size_t globalThreads[3] = { groupnum * 256, 1, 1 }; + size_t localThreads[3] = { 256, 1, 1 }; + + openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, localThreads, + args, -1, -1, buildOptions.c_str()); } template -Scalar arithmetic_sum(const oclMat &src, int type = 0) +Scalar arithmetic_sum(const oclMat &src, int type, int ddepth) { + CV_Assert(src.step % src.elemSize() == 0); + size_t groupnum = src.clCxt->computeUnits(); CV_Assert(groupnum != 0); - int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen; + + int dbsize = groupnum * src.oclchannels(); Context *clCxt = src.clCxt; AutoBuffer _buf(dbsize); T *p = (T*)_buf; - cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T)); - Scalar s = Scalar::all(0.0); - arithmetic_sum_buffer_run(src, dstBuffer, vlen, groupnum, type); - memset(p, 0, dbsize * sizeof(T)); - openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T)); - for (int i = 0; i < dbsize;) - { - for (int j = 0; j < src.oclchannels(); j++, i++) - s.val[j] += p[i]; - } + cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T)); + arithmetic_sum_buffer_run(src, dstBuffer, groupnum, type, ddepth); + openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T)); openCLFree(dstBuffer); + + Scalar s = Scalar::all(0.0); + for (int i = 0; i < dbsize;) + for (int j = 0; j < src.oclchannels(); j++, i++) + s.val[j] += p[i]; + return s; } -typedef Scalar (*sumFunc)(const oclMat &src, int type); +typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth); + Scalar cv::ocl::sum(const oclMat &src) { if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } - static sumFunc functab[2] = + static sumFunc functab[3] = { + arithmetic_sum, arithmetic_sum, arithmetic_sum }; - sumFunc func; - func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)]; - return func(src, 0); + bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE); + int ddepth = std::max(src.depth(), CV_32S); + if (!hasDouble && ddepth == CV_64F) + ddepth = CV_32F; + + sumFunc func = functab[ddepth - CV_32S]; + return func(src, SUM, ddepth); } Scalar cv::ocl::absSum(const oclMat &src) @@ -396,15 +410,20 @@ Scalar cv::ocl::absSum(const oclMat &src) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } - static sumFunc functab[2] = + static sumFunc functab[3] = { + arithmetic_sum, arithmetic_sum, arithmetic_sum }; - sumFunc func; - func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)]; - return func(src, 1); + bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE); + int ddepth = std::max(src.depth(), CV_32S); + if (!hasDouble && ddepth == CV_64F) + ddepth = CV_32F; + + sumFunc func = functab[ddepth - CV_32S]; + return func(src, ABS_SUM, ddepth); } Scalar cv::ocl::sqrSum(const oclMat &src) @@ -413,15 +432,20 @@ Scalar cv::ocl::sqrSum(const oclMat &src) { CV_Error(CV_GpuNotSupported, "Selected device doesn't support double"); } - static sumFunc functab[2] = + static sumFunc functab[3] = { + arithmetic_sum, arithmetic_sum, arithmetic_sum }; - sumFunc func; - func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)]; - return func(src, 2); + bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE); + int ddepth = std::max(src.depth(), CV_32S); + if (!hasDouble && ddepth == CV_64F) + ddepth = CV_32F; + + sumFunc func = functab[ddepth - CV_32S]; + return func(src, SQR_SUM, ddepth); } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/opencl/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl index 280b0a5111..4011f03bea 100644 --- a/modules/ocl/src/opencl/arithm_sum.cl +++ b/modules/ocl/src/opencl/arithm_sum.cl @@ -43,163 +43,62 @@ // //M*/ -/**************************************PUBLICFUNC*************************************/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable -#define RES_TYPE double8 -#define CONVERT_RES_TYPE convert_double8 -#else -#define RES_TYPE float8 -#define CONVERT_RES_TYPE convert_float8 +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif -#if defined (DEPTH_0) -#define VEC_TYPE uchar8 +#if defined (FUNC_SUM) +#define FUNC(a, b) b += a; #endif -#if defined (DEPTH_1) -#define VEC_TYPE char8 +#if defined (FUNC_ABS_SUM) +#define FUNC(a, b) b += a >= 0 ? a : -a; #endif -#if defined (DEPTH_2) -#define VEC_TYPE ushort8 +#if defined (FUNC_SQR_SUM) +#define FUNC(a, b) b += a * a; #endif -#if defined (DEPTH_3) -#define VEC_TYPE short8 -#endif -#if defined (DEPTH_4) -#define VEC_TYPE int8 -#endif -#if defined (DEPTH_5) -#define VEC_TYPE float8 -#endif -#if defined (DEPTH_6) -#define VEC_TYPE double8 -#endif - -#if defined (FUNC_TYPE_0) -#define FUNC(a,b) b += a; -#endif -#if defined (FUNC_TYPE_1) -#define FUNC(a,b) b = b + (a >= 0 ? a : -a); -#endif -#if defined (FUNC_TYPE_2) -#define FUNC(a,b) b = b + a * a; -#endif - -#if defined (REPEAT_S0) -#define repeat_s(a) a = a; -#endif -#if defined (REPEAT_S1) -#define repeat_s(a) a.s0 = 0; -#endif -#if defined (REPEAT_S2) -#define repeat_s(a) a.s0 = 0;a.s1 = 0; -#endif -#if defined (REPEAT_S3) -#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0; -#endif -#if defined (REPEAT_S4) -#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0; -#endif -#if defined (REPEAT_S5) -#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0; -#endif -#if defined (REPEAT_S6) -#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0; -#endif -#if defined (REPEAT_S7) -#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0; -#endif - -#if defined (REPEAT_E0) -#define repeat_e(a) a = a; -#endif -#if defined (REPEAT_E1) -#define repeat_e(a) a.s7 = 0; -#endif -#if defined (REPEAT_E2) -#define repeat_e(a) a.s7 = 0;a.s6 = 0; -#endif -#if defined (REPEAT_E3) -#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0; -#endif -#if defined (REPEAT_E4) -#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0; -#endif -#if defined (REPEAT_E5) -#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0; -#endif -#if defined (REPEAT_E6) -#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0; -#endif -#if defined (REPEAT_E7) -#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0; -#endif - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable /**************************************Array buffer SUM**************************************/ -__kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,int groupnum, - __global VEC_TYPE *src, __global RES_TYPE *dst) + +__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, + __global srcT *src, __global dstT *dst) { unsigned int lid = get_local_id(0); unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); + unsigned int id = get_global_id(0); unsigned int idx = offset + id + (id / cols) * invalid_cols; - __local RES_TYPE localmem_sum[128]; - RES_TYPE sum = 0,temp; - if(id < elemnum) - { - temp = CONVERT_RES_TYPE(src[idx]); - if(id % cols == 0 ) - { - repeat_s(temp); - } - if(id % cols == cols - 1) - { - repeat_e(temp); - } - FUNC(temp,sum); - } - else - { - sum = 0; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) + + __local dstT localmem_sum[128]; + dstT sum = (dstT)(0), temp; + + for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) { idx = offset + id + (id / cols) * invalid_cols; - temp = CONVERT_RES_TYPE(src[idx]); - if(id % cols == 0 ) - { - repeat_s(temp); - } - if(id % cols == cols - 1) - { - repeat_e(temp); - } - FUNC(temp,sum); + temp = convertToDstT(src[idx]); + FUNC(temp, sum); } - if(lid > 127) - { + + if (lid > 127) localmem_sum[lid - 128] = sum; - } barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) - { + + if (lid < 128) localmem_sum[lid] = sum + localmem_sum[lid]; - } barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) + + for (int lsize = 64; lsize > 0; lsize >>= 1) { - if(lid < lsize) + if (lid < lsize) { int lid2 = lsize + lid; localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; } barrier(CLK_LOCAL_MEM_FENCE); } - if( lid == 0) - { + + if (lid == 0) dst[gid] = localmem_sum[0]; - } } diff --git a/modules/ocl/src/opencl/arithm_sum_3.cl b/modules/ocl/src/opencl/arithm_sum_3.cl deleted file mode 100644 index 3f6ed08803..0000000000 --- a/modules/ocl/src/opencl/arithm_sum_3.cl +++ /dev/null @@ -1,247 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -/**************************************PUBLICFUNC*************************************/ -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#define RES_TYPE double4 -#define CONVERT_RES_TYPE convert_double4 -#else -#define RES_TYPE float4 -#define CONVERT_RES_TYPE convert_float4 -#endif - -#if defined (DEPTH_0) -#define VEC_TYPE uchar4 -#endif -#if defined (DEPTH_1) -#define VEC_TYPE char4 -#endif -#if defined (DEPTH_2) -#define VEC_TYPE ushort4 -#endif -#if defined (DEPTH_3) -#define VEC_TYPE short4 -#endif -#if defined (DEPTH_4) -#define VEC_TYPE int4 -#endif -#if defined (DEPTH_5) -#define VEC_TYPE float4 -#endif -#if defined (DEPTH_6) -#define VEC_TYPE double4 -#endif - -#if defined (FUNC_TYPE_0) -#define FUNC(a,b) b += a; -#endif -#if defined (FUNC_TYPE_1) -#define FUNC(a,b) b = b + (a >= 0 ? a : -a); -#endif -#if defined (FUNC_TYPE_2) -#define FUNC(a,b) b = b + a * a; -#endif - -#if defined (REPEAT_S0) -#define repeat_s(a,b,c) a=a; b =b; c=c; -#endif -#if defined (REPEAT_S1) -#define repeat_s(a,b,c) a.s0=0; b=b; c=c; -#endif -#if defined (REPEAT_S2) -#define repeat_s(a,b,c) a.s0=0; a.s1=0; b=b; c=c; -#endif -#if defined (REPEAT_S3) -#define repeat_s(a,b,c) a.s0=0; a.s1=0; a.s2=0; b=b; c=c; -#endif -#if defined (REPEAT_S4) -#define repeat_s(a,b,c) a=0;b=b; c=c; -#endif -#if defined (REPEAT_S5) -#define repeat_s(a,b,c) a=0; b.s0=0;c=c; -#endif -#if defined (REPEAT_S6) -#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; c=c; -#endif -#if defined (REPEAT_S7) -#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; b.s2=0; c=c; -#endif -#if defined (REPEAT_S8) -#define repeat_s(a,b,c) a=0; b=0; c=c; -#endif -#if defined (REPEAT_S9) -#define repeat_s(a,b,c) a=0; b=0; c.s0=0; -#endif -#if defined (REPEAT_S10) -#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0; -#endif -#if defined (REPEAT_S11) -#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0; c.s2=0; -#endif - -#if defined (REPEAT_E0) -#define repeat_e(a,b,c) a=a; b =b; c=c; -#endif -#if defined (REPEAT_E1) -#define repeat_e(a,b,c) a=a; b=b; c.s3=0; -#endif -#if defined (REPEAT_E2) -#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0; -#endif -#if defined (REPEAT_E3) -#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0; c.s1=0; -#endif -#if defined (REPEAT_E4) -#define repeat_e(a,b,c) a=a; b=b; c=0; -#endif -#if defined (REPEAT_E5) -#define repeat_e(a,b,c) a=a; b.s3=0; c=0; -#endif -#if defined (REPEAT_E6) -#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; c=0; -#endif -#if defined (REPEAT_E7) -#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; b.s1=0; c=0; -#endif -#if defined (REPEAT_E8) -#define repeat_e(a,b,c) a=a; b=0; c=0; -#endif -#if defined (REPEAT_E9) -#define repeat_e(a,b,c) a.s3=0; b=0; c=0; -#endif -#if defined (REPEAT_E10) -#define repeat_e(a,b,c) a.s3=0; a.s2=0; b=0; c=0; -#endif -#if defined (REPEAT_E11) -#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0; -#endif - -__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum, - __global VEC_TYPE *src, __global RES_TYPE *dst) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; - idx = idx * 3; - __local RES_TYPE localmem_sum1[128]; - __local RES_TYPE localmem_sum2[128]; - __local RES_TYPE localmem_sum3[128]; - RES_TYPE sum1 = 0,sum2 = 0,sum3 = 0,temp1,temp2,temp3; - if(id < elemnum) - { - temp1 = CONVERT_RES_TYPE(src[idx]); - temp2 = CONVERT_RES_TYPE(src[idx+1]); - temp3 = CONVERT_RES_TYPE(src[idx+2]); - if(id % cols == 0 ) - { - repeat_s(temp1,temp2,temp3); - } - if(id % cols == cols - 1) - { - repeat_e(temp1,temp2,temp3); - } - FUNC(temp1,sum1); - FUNC(temp2,sum2); - FUNC(temp3,sum3); - } - else - { - sum1 = 0; - sum2 = 0; - sum3 = 0; - } - for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) - { - idx = offset + id + (id / cols) * invalid_cols; - idx = idx * 3; - temp1 = CONVERT_RES_TYPE(src[idx]); - temp2 = CONVERT_RES_TYPE(src[idx+1]); - temp3 = CONVERT_RES_TYPE(src[idx+2]); - if(id % cols == 0 ) - { - repeat_s(temp1,temp2,temp3); - } - if(id % cols == cols - 1) - { - repeat_e(temp1,temp2,temp3); - } - FUNC(temp1,sum1); - FUNC(temp2,sum2); - FUNC(temp3,sum3); - } - if(lid > 127) - { - localmem_sum1[lid - 128] = sum1; - localmem_sum2[lid - 128] = sum2; - localmem_sum3[lid - 128] = sum3; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 128) - { - localmem_sum1[lid] = sum1 + localmem_sum1[lid]; - localmem_sum2[lid] = sum2 + localmem_sum2[lid]; - localmem_sum3[lid] = sum3 + localmem_sum3[lid]; - } - barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = 64; lsize > 0; lsize >>= 1) - { - if(lid < lsize) - { - int lid2 = lsize + lid; - localmem_sum1[lid] = localmem_sum1[lid] + localmem_sum1[lid2]; - localmem_sum2[lid] = localmem_sum2[lid] + localmem_sum2[lid2]; - localmem_sum3[lid] = localmem_sum3[lid] + localmem_sum3[lid2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if( lid == 0) - { - dst[gid*3] = localmem_sum1[0]; - dst[gid*3+1] = localmem_sum2[0]; - dst[gid*3+2] = localmem_sum3[0]; - } -} diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index acac38fea2..1505419404 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -1022,7 +1022,7 @@ TEST_P(MinMaxLoc, MASK) typedef ArithmTestBase Sum; -TEST_P(Sum, DISABLED_MAT) +TEST_P(Sum, MAT) { for (int j = 0; j < LOOP_TIMES; j++) { @@ -1031,7 +1031,121 @@ TEST_P(Sum, DISABLED_MAT) Scalar cpures = cv::sum(src1_roi); Scalar gpures = cv::ocl::sum(gsrc1); - //check results + // check results + EXPECT_NEAR(cpures[0], gpures[0], 0.1); + EXPECT_NEAR(cpures[1], gpures[1], 0.1); + EXPECT_NEAR(cpures[2], gpures[2], 0.1); + EXPECT_NEAR(cpures[3], gpures[3], 0.1); + } +} + +typedef ArithmTestBase SqrSum; + +template +static Scalar sqrSum(const Mat & src) +{ + Scalar sum = Scalar::all(0); + int cn = src.channels(); + WT data[4] = { 0, 0, 0, 0 }; + + int cols = src.cols * cn; + for (int y = 0; y < src.rows; ++y) + { + const T * const sdata = src.ptr(y); + for (int x = 0; x < cols; ) + for (int i = 0; i < cn; ++i, ++x) + { + WT t = static_cast(sdata[x]); + data[i] += t * t; + } + } + + for (int i = 0; i < cn; ++i) + sum[i] = static_cast(data[i]); + + return sum; +} + +typedef Scalar (*sumFunc)(const Mat &); + +TEST_P(SqrSum, MAT) +{ + for (int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + + static sumFunc funcs[] = { sqrSum, + sqrSum, + sqrSum, + sqrSum, + sqrSum, + sqrSum, + sqrSum, + 0 }; + + sumFunc func = funcs[src1_roi.depth()]; + CV_Assert(func != 0); + + Scalar cpures = func(src1_roi); + Scalar gpures = cv::ocl::sqrSum(gsrc1); + + // check results + EXPECT_NEAR(cpures[0], gpures[0], 1.0); + EXPECT_NEAR(cpures[1], gpures[1], 1.0); + EXPECT_NEAR(cpures[2], gpures[2], 1.0); + EXPECT_NEAR(cpures[3], gpures[3], 1.0); + } +} + +typedef ArithmTestBase AbsSum; + +template +static Scalar absSum(const Mat & src) +{ + Scalar sum = Scalar::all(0); + int cn = src.channels(); + WT data[4] = { 0, 0, 0, 0 }; + + int cols = src.cols * cn; + for (int y = 0; y < src.rows; ++y) + { + const T * const sdata = src.ptr(y); + for (int x = 0; x < cols; ) + for (int i = 0; i < cn; ++i, ++x) + { + WT t = static_cast(sdata[x]); + data[i] += t >= 0 ? t : -t; + } + } + + for (int i = 0; i < cn; ++i) + sum[i] = static_cast(data[i]); + + return sum; +} + +TEST_P(AbsSum, MAT) +{ + for (int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + + static sumFunc funcs[] = { absSum, + absSum, + absSum, + absSum, + absSum, + absSum, + absSum, + 0 }; + + sumFunc func = funcs[src1_roi.depth()]; + CV_Assert(func != 0); + + Scalar cpures = func(src1_roi); + Scalar gpures = cv::ocl::absSum(gsrc1); + + // check results EXPECT_NEAR(cpures[0], gpures[0], 0.1); EXPECT_NEAR(cpures[1], gpures[1], 0.1); EXPECT_NEAR(cpures[2], gpures[2], 0.1); @@ -1319,6 +1433,8 @@ INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(testing::Range(CV_8U, CV_USRTYPE1) INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // + INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); +INSTANTIATE_TEST_CASE_P(Arithm, SqrSum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); +INSTANTIATE_TEST_CASE_P(Arithm, AbsSum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // + INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // + INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +