diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 7e1ebb54f3..af24f0aca2 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1520,7 +1520,12 @@ namespace cv float pos, oclMat &newFrame, oclMat &buf); //! computes moments of the rasterized shape or a vector of points - CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage); + //! _array should be a vector a points standing for the contour + CV_EXPORTS Moments ocl_moments(InputArray contour); + //! src should be a general image uploaded to the GPU. + //! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1 + //! to use type of CV_64FC1, the GPU should support CV_64FC1 + CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary); class CV_EXPORTS StereoBM_OCL { diff --git a/modules/ocl/perf/perf_moments.cpp b/modules/ocl/perf/perf_moments.cpp index a36e1a13ed..4da7de06dc 100644 --- a/modules/ocl/perf/perf_moments.cpp +++ b/modules/ocl/perf/perf_moments.cpp @@ -26,7 +26,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other Materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -49,41 +49,42 @@ using namespace perf; using std::tr1::tuple; using std::tr1::get; +using namespace cv; +using namespace cv::ocl; +using namespace cvtest; +using namespace testing; +using namespace std; + ///////////// Moments //////////////////////// +//*! performance of image +typedef tuple MomentsParamType; +typedef TestBaseWithParam MomentsFixture; -typedef Size_MatType MomentsFixture; - -PERF_TEST_P(MomentsFixture, DISABLED_Moments, - ::testing::Combine(OCL_TYPICAL_MAT_SIZES, - OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1))) // TODO does not work properly (see below) +PERF_TEST_P(MomentsFixture, Moments, + ::testing::Combine(OCL_TYPICAL_MAT_SIZES, + OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_16UC1, CV_32FC1), ::testing::Values(false, true))) { - const Size_MatType_t params = GetParam(); + const MomentsParamType params = GetParam(); const Size srcSize = get<0>(params); const int type = get<1>(params); + const bool binaryImage = get<2>(params); - Mat src(srcSize, type), dst(7, 1, CV_64F); - const bool binaryImage = false; + Mat src(srcSize, type), dst(7, 1, CV_64F); + randu(src, 0, 255); + + oclMat src_d(src); cv::Moments mom; - - declare.in(src, WARMUP_RNG).out(dst); - if (RUN_OCL_IMPL) { - ocl::oclMat oclSrc(src); - - OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(oclSrc, binaryImage); // TODO Use oclSrc - cv::HuMoments(mom, dst); - - SANITY_CHECK(dst); + OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage); } else if (RUN_PLAIN_IMPL) { TEST_CYCLE() mom = cv::moments(src, binaryImage); - cv::HuMoments(mom, dst); - - SANITY_CHECK(dst); } else OCL_PERF_ELSE + cv::HuMoments(mom, dst); + SANITY_CHECK(dst, 1e-3); } diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 13f4197342..f11d381c98 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -10,12 +10,12 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors +// Jin Ma, jin@multicorewareinc.com // Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, @@ -26,7 +26,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other Materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -46,294 +46,342 @@ #include "precomp.hpp" #include "opencl_kernels.hpp" +#if defined _MSC_VER +#define snprintf sprintf_s +#endif namespace cv { -namespace ocl -{ -// The function calculates center of gravity and the central second order moments -static void icvCompleteMomentState( CvMoments* moments ) -{ - double cx = 0, cy = 0; - double mu20, mu11, mu02; - - assert( moments != 0 ); - moments->inv_sqrt_m00 = 0; - - if( fabs(moments->m00) > DBL_EPSILON ) + namespace ocl { - double inv_m00 = 1. / moments->m00; - cx = moments->m10 * inv_m00; - cy = moments->m01 * inv_m00; - moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) ); - } - - // mu20 = m20 - m10*cx - mu20 = moments->m20 - moments->m10 * cx; - // mu11 = m11 - m10*cy - mu11 = moments->m11 - moments->m10 * cy; - // mu02 = m02 - m01*cy - mu02 = moments->m02 - moments->m01 * cy; - - moments->mu20 = mu20; - moments->mu11 = mu11; - moments->mu02 = mu02; - - // mu30 = m30 - cx*(3*mu20 + cx*m10) - moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10); - mu11 += mu11; - // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20 - moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20; - // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02 - moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02; - // mu03 = m03 - cy*(3*mu02 + cy*m01) - moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01); -} - - -static void icvContourMoments( CvSeq* contour, CvMoments* mom ) -{ - if( contour->total ) - { - CvSeqReader reader; - int lpt = contour->total; - double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; - - cvStartReadSeq( contour, &reader, 0 ); - - size_t reader_size = lpt << 1; - cv::Mat reader_mat(1,reader_size,CV_32FC1); - - bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; - - if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) + // The function calculates center of gravity and the central second order moments + static void icvCompleteMomentState( CvMoments* moments ) { - CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); - } + double cx = 0, cy = 0; + double mu20, mu11, mu02; - if( is_float ) - { - for(size_t i = 0; i < reader_size; ++i) + assert( moments != 0 ); + moments->inv_sqrt_m00 = 0; + + if( fabs(moments->m00) > DBL_EPSILON ) { - reader_mat.at(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x; - reader_mat.at(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; - CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + double inv_m00 = 1. / moments->m00; + cx = moments->m10 * inv_m00; + cy = moments->m01 * inv_m00; + moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) ); } + + // mu20 = m20 - m10*cx + mu20 = moments->m20 - moments->m10 * cx; + // mu11 = m11 - m10*cy + mu11 = moments->m11 - moments->m10 * cy; + // mu02 = m02 - m01*cy + mu02 = moments->m02 - moments->m01 * cy; + + moments->mu20 = mu20; + moments->mu11 = mu11; + moments->mu02 = mu02; + + // mu30 = m30 - cx*(3*mu20 + cx*m10) + moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10); + mu11 += mu11; + // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20 + moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20; + // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02 + moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02; + // mu03 = m03 - cy*(3*mu02 + cy*m01) + moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01); } - else + + + static void icvContourMoments( CvSeq* contour, CvMoments* mom ) { - for(size_t i = 0; i < reader_size; ++i) + if( contour->total ) { - reader_mat.at(0, i++) = ((CvPoint*)(reader.ptr))->x; - reader_mat.at(0, i) = ((CvPoint*)(reader.ptr))->y; - CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + CvSeqReader reader; + int lpt = contour->total; + double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; + + cvStartReadSeq( contour, &reader, 0 ); + + size_t reader_size = lpt << 1; + cv::Mat reader_mat(1,reader_size,CV_32FC1); + + bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; + + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) + { + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); + } + + if( is_float ) + { + for(size_t i = 0; i < reader_size; ++i) + { + reader_mat.at(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x; + reader_mat.at(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; + CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + } + } + else + { + for(size_t i = 0; i < reader_size; ++i) + { + reader_mat.at(0, i++) = ((CvPoint*)(reader.ptr))->x; + reader_mat.at(0, i) = ((CvPoint*)(reader.ptr))->y; + CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + } + } + + cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); + cv::ocl::oclMat reader_oclmat(reader_mat); + int llength = std::min(lpt,128); + size_t localThreads[3] = { llength, 1, 1}; + size_t globalThreads[3] = { lpt, 1, 1}; + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); + cl_int dst_step = (cl_int)dst_a.step; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step )); + + char builOption[128]; + snprintf(builOption, 128, "-D CV_8UC1"); + + openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption); + + cv::Mat dst(dst_a); + a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) + { + for (int i = 0; i < contour->total; ++i) + { + a00 += dst.at(0, i); + a10 += dst.at(1, i); + a01 += dst.at(2, i); + a20 += dst.at(3, i); + a11 += dst.at(4, i); + a02 += dst.at(5, i); + a30 += dst.at(6, i); + a21 += dst.at(7, i); + a12 += dst.at(8, i); + a03 += dst.at(9, i); + } + } + else + { + a00 = cv::sum(dst.row(0))[0]; + a10 = cv::sum(dst.row(1))[0]; + a01 = cv::sum(dst.row(2))[0]; + a20 = cv::sum(dst.row(3))[0]; + a11 = cv::sum(dst.row(4))[0]; + a02 = cv::sum(dst.row(5))[0]; + a30 = cv::sum(dst.row(6))[0]; + a21 = cv::sum(dst.row(7))[0]; + a12 = cv::sum(dst.row(8))[0]; + a03 = cv::sum(dst.row(9))[0]; + } + + double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; + if( fabs(a00) > FLT_EPSILON ) + { + if( a00 > 0 ) + { + db1_2 = 0.5; + db1_6 = 0.16666666666666666666666666666667; + db1_12 = 0.083333333333333333333333333333333; + db1_24 = 0.041666666666666666666666666666667; + db1_20 = 0.05; + db1_60 = 0.016666666666666666666666666666667; + } + else + { + db1_2 = -0.5; + db1_6 = -0.16666666666666666666666666666667; + db1_12 = -0.083333333333333333333333333333333; + db1_24 = -0.041666666666666666666666666666667; + db1_20 = -0.05; + db1_60 = -0.016666666666666666666666666666667; + } + + // spatial moments + mom->m00 = a00 * db1_2; + mom->m10 = a10 * db1_6; + mom->m01 = a01 * db1_6; + mom->m20 = a20 * db1_12; + mom->m11 = a11 * db1_24; + mom->m02 = a02 * db1_12; + mom->m30 = a30 * db1_20; + mom->m21 = a21 * db1_60; + mom->m12 = a12 * db1_60; + mom->m03 = a03 * db1_20; + + icvCompleteMomentState( mom ); + } } } - cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); - cv::ocl::oclMat reader_oclmat(reader_mat); - int llength = std::min(lpt,128); - size_t localThreads[3] = { llength, 1, 1}; - size_t globalThreads[3] = { lpt, 1, 1}; - vector > args; - args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); - cl_int dst_step = (cl_int)dst_a.step; - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step )); - - openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1); - - cv::Mat dst(dst_a); - a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; - if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) + Moments ocl_moments(oclMat& src, bool binary) //for image { - for (int i = 0; i < contour->total; ++i) + CV_Assert(src.oclchannels() == 1); + if(src.type() == CV_64FC1 && !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { - a00 += dst.at(0, i); - a10 += dst.at(1, i); - a01 += dst.at(2, i); - a20 += dst.at(3, i); - a11 += dst.at(4, i); - a02 += dst.at(5, i); - a30 += dst.at(6, i); - a21 += dst.at(7, i); - a12 += dst.at(8, i); - a03 += dst.at(9, i); + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); } - } - else - { - a00 = cv::sum(dst.row(0))[0]; - a10 = cv::sum(dst.row(1))[0]; - a01 = cv::sum(dst.row(2))[0]; - a20 = cv::sum(dst.row(3))[0]; - a11 = cv::sum(dst.row(4))[0]; - a02 = cv::sum(dst.row(5))[0]; - a30 = cv::sum(dst.row(6))[0]; - a21 = cv::sum(dst.row(7))[0]; - a12 = cv::sum(dst.row(8))[0]; - a03 = cv::sum(dst.row(9))[0]; - } - double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; - if( fabs(a00) > FLT_EPSILON ) - { - if( a00 > 0 ) + if(binary) { - db1_2 = 0.5; - db1_6 = 0.16666666666666666666666666666667; - db1_12 = 0.083333333333333333333333333333333; - db1_24 = 0.041666666666666666666666666666667; - db1_20 = 0.05; - db1_60 = 0.016666666666666666666666666666667; + oclMat mask; + if(src.type() != CV_8UC1) + { + src.convertTo(mask, CV_8UC1); + } + oclMat src8u(src.size(), CV_8UC1); + src8u.setTo(Scalar(255), mask); + src = src8u; } + const int TILE_SIZE = 256; + + CvMoments mom; + memset(&mom, 0, sizeof(mom)); + + cv::Size size = src.size(); + int blockx, blocky; + blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE; + blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE; + + oclMat dst_m; + int tile_height = TILE_SIZE; + + size_t localThreads[3] = {1, tile_height, 1}; + size_t globalThreads[3] = {blockx, size.height, 1}; + + if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) + { + dst_m.create(blocky * 10, blockx, CV_64FC1); + }else + { + dst_m.create(blocky * 10, blockx, CV_32FC1); + } + + int src_step = (int)(src.step/src.elemSize()); + int dstm_step = (int)(dst_m.step/dst_m.elemSize()); + + vector > args,args_sum; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dstm_step )); + + int binary_; + if(binary) + binary_ = 1; else + binary_ = 0; + args.push_back( make_pair( sizeof(cl_int) , (void *)&binary_)); + + char builOption[128]; + if(binary || src.type() == CV_8UC1) { - db1_2 = -0.5; - db1_6 = -0.16666666666666666666666666666667; - db1_12 = -0.083333333333333333333333333333333; - db1_24 = -0.041666666666666666666666666666667; - db1_20 = -0.05; - db1_60 = -0.016666666666666666666666666666667; + snprintf(builOption, 128, "-D CV_8UC1"); + }else if(src.type() == CV_16UC1) + { + snprintf(builOption, 128, "-D CV_16UC1"); + }else if(src.type() == CV_16SC1) + { + snprintf(builOption, 128, "-D CV_16SC1"); + }else if(src.type() == CV_32FC1) + { + snprintf(builOption, 128, "-D CV_32FC1"); + }else if(src.type() == CV_64FC1) + { + snprintf(builOption, 128, "-D CV_64FC1"); + }else + { + CV_Error( CV_StsUnsupportedFormat, "" ); } - // spatial moments - mom->m00 = a00 * db1_2; - mom->m10 = a10 * db1_6; - mom->m01 = a01 * db1_6; - mom->m20 = a20 * db1_12; - mom->m11 = a11 * db1_24; - mom->m02 = a02 * db1_12; - mom->m30 = a30 * db1_20; - mom->m21 = a21 * db1_60; - mom->m12 = a12 * db1_60; - mom->m03 = a03 * db1_20; + openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption); - icvCompleteMomentState( mom ); + Mat tmp(dst_m); + tmp.convertTo(tmp, CV_64FC1); + + double tmp_m[10] = {0}; + + for(int j = 0; j < tmp.rows; j += 10) + { + for(int i = 0; i < tmp.cols; i++) + { + tmp_m[0] += tmp.at(j, i); + tmp_m[1] += tmp.at(j + 1, i); + tmp_m[2] += tmp.at(j + 2, i); + tmp_m[3] += tmp.at(j + 3, i); + tmp_m[4] += tmp.at(j + 4, i); + tmp_m[5] += tmp.at(j + 5, i); + tmp_m[6] += tmp.at(j + 6, i); + tmp_m[7] += tmp.at(j + 7, i); + tmp_m[8] += tmp.at(j + 8, i); + tmp_m[9] += tmp.at(j + 9, i); + } + } + + mom.m00 = tmp_m[0]; + mom.m10 = tmp_m[1]; + mom.m01 = tmp_m[2]; + mom.m20 = tmp_m[3]; + mom.m11 = tmp_m[4]; + mom.m02 = tmp_m[5]; + mom.m30 = tmp_m[6]; + mom.m21 = tmp_m[7]; + mom.m12 = tmp_m[8]; + mom.m03 = tmp_m[9]; + icvCompleteMomentState( &mom ); + return mom; } - } -} -static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) -{ - const int TILE_SIZE = 256; - int type, depth, cn, coi = 0; - CvMat stub, *mat = (CvMat*)array; - CvContour contourHeader; - CvSeq* contour = 0; - CvSeqBlock block; - if( CV_IS_SEQ( array )) - { - contour = (CvSeq*)array; - if( !CV_IS_SEQ_POINT_SET( contour )) - CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" ); - } - - if( !mom ) - CV_Error( CV_StsNullPtr, "" ); - - memset( mom, 0, sizeof(*mom)); - - if( !contour ) - { - - mat = cvGetMat( mat, &stub, &coi ); - type = CV_MAT_TYPE( mat->type ); - - if( type == CV_32SC2 || type == CV_32FC2 ) + Moments ocl_moments(InputArray _contour) //for contour { - contour = cvPointSeqFromMat( - CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED, - mat, &contourHeader, &block ); + CvMoments mom; + memset(&mom, 0, sizeof(mom)); + + Mat arr = _contour.getMat(); + CvMat c_array = arr; + + const void* array = &c_array; + + CvSeq* contour = 0; + if( CV_IS_SEQ( array )) + { + contour = (CvSeq*)(array); + if( !CV_IS_SEQ_POINT_SET( contour )) + CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" ); + } + + int type, coi = 0; + + CvMat stub, *mat = (CvMat*)(array); + CvContour contourHeader; + CvSeqBlock block; + + if( !contour ) + { + mat = cvGetMat( mat, &stub, &coi ); + type = CV_MAT_TYPE( mat->type ); + + if( type == CV_32SC2 || type == CV_32FC2 ) + { + contour = cvPointSeqFromMat( + CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED, + mat, &contourHeader, &block ); + } + } + + CV_Assert(contour); + + icvContourMoments(contour, &mom); + return mom; } } - if( contour ) - { - icvContourMoments( contour, mom ); - return; - } - - type = CV_MAT_TYPE( mat->type ); - depth = CV_MAT_DEPTH( type ); - cn = CV_MAT_CN( type ); - - cv::Size size = cvGetMatSize( mat ); - if( cn > 1 && coi == 0 ) - CV_Error( CV_StsBadArg, "Invalid image type" ); - - if( size.width <= 0 || size.height <= 0 ) - return; - - cv::Mat src0(mat); - cv::ocl::oclMat src(src0); - cv::Size tileSize; - int blockx,blocky; - if(size.width%TILE_SIZE == 0) - blockx = size.width/TILE_SIZE; - else - blockx = size.width/TILE_SIZE + 1; - if(size.height%TILE_SIZE == 0) - blocky = size.height/TILE_SIZE; - else - blocky = size.height/TILE_SIZE + 1; - oclMat dst_m(blocky * 10, blockx, CV_64FC1); - oclMat sum(1, 10, CV_64FC1); - int tile_width = std::min(size.width,TILE_SIZE); - int tile_height = std::min(size.height,TILE_SIZE); - size_t localThreads[3] = { tile_height, 1, 1}; - size_t globalThreads[3] = { size.height, blockx, 1}; - vector > args,args_sum; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&depth )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&cn )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&coi )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&binary )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); - - size_t localThreadss[3] = { 128, 1, 1}; - size_t globalThreadss[3] = { 128, 1, 1}; - args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); - args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); - args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height )); - args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width )); - args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data )); - args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); - openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); - - Mat dstsum(sum); - mom->m00 = dstsum.at(0, 0); - mom->m10 = dstsum.at(0, 1); - mom->m01 = dstsum.at(0, 2); - mom->m20 = dstsum.at(0, 3); - mom->m11 = dstsum.at(0, 4); - mom->m02 = dstsum.at(0, 5); - mom->m30 = dstsum.at(0, 6); - mom->m21 = dstsum.at(0, 7); - mom->m12 = dstsum.at(0, 8); - mom->m03 = dstsum.at(0, 9); - - icvCompleteMomentState( mom ); -} - -Moments ocl_moments( InputArray _array, bool binaryImage ) -{ - CvMoments om; - Mat arr = _array.getMat(); - CvMat c_array = arr; - ocl_cvMoments(&c_array, &om, binaryImage); - return om; -} - -} - -} +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index d61b8d5ae7..602ebd1c1d 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -15,6 +15,7 @@ // Third party copyrights are property of their respective owners. // // @Authors +// Jin Ma, jin@multicorewareinc.com // Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, @@ -44,22 +45,14 @@ //M*/ #if defined (DOUBLE_SUPPORT) - #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable #elif defined (cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif typedef double T; -typedef double F; -typedef double4 F4; -#define convert_F4 convert_double4 - #else -typedef float F; -typedef float4 F4; typedef long T; -#define convert_F4 convert_float4 #endif #define DST_ROW_00 0 @@ -99,7 +92,6 @@ __kernel void icvContourMoments(int contour_total, xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); } - xi2 = xi * xi; yi2 = yi * yi; dxy = xi_1 * yi - xi * yi_1; @@ -117,864 +109,338 @@ __kernel void icvContourMoments(int contour_total, *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); *( dst_a + DST_ROW_21 * dst_step + idx) = dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + - xi2 * (yi_1 + 3 * yi)); + xi2 * (yi_1 + 3 * yi)); *( dst_a + DST_ROW_12 * dst_step + idx) = dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + - yi2 * (xi_1 + 3 * xi)); + yi2 * (xi_1 + 3 * xi)); } -__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, - __global F* sum, __global F* dst_m, int dst_step) +#if defined (DOUBLE_SUPPORT) +#define WT double +#define WT4 double4 +#define convert_T4 convert_double4 +#define convert_T convert_double +#else +#define WT float +#define WT4 float4 +#define convert_T4 convert_float4 +#define convert_T convert_float +#endif + +#ifdef CV_8UC1 +#define TT uchar +#elif defined CV_16UC1 +#define TT ushort +#elif defined CV_16SC1 +#define TT short +#elif defined CV_32FC1 +#define TT float +#elif defined CV_64FC1 +#ifdef DOUBLE_SUPPORT +#define TT double +#else +#define TT float +#endif +#endif +__kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int src_step, + __global WT* dst_m, + int dst_cols, int dst_step, int binary) { - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int block_y = src_rows/tile_height; - int block_x = src_cols/tile_width; - int block_num; - - if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) - block_y ++; - if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) - block_x ++; - block_num = block_y * block_x; - __local F dst_sum[10][128]; - if(gidy<128-block_num) - for(int i=0; i<10; i++) - dst_sum[i][gidy+block_num]=0; + int dy = get_global_id(1); + int ly = get_local_id(1); + int gidx = get_group_id(0); + int gidy = get_group_id(1); + int x_rest = src_cols % 256; + int y_rest = src_rows % 256; + __local int codxy[256]; + codxy[ly] = ly; barrier(CLK_LOCAL_MEM_FENCE); - dst_step /= sizeof(F); - if(gidy0; lsize>>=1) - { - if(gidy 0) && (gidx == (get_num_groups(0) - 1))) { - int lsize2 = gidy + lsize; - for(int i=0; i<10; i++) - dst_sum[i][gidy] += dst_sum[i][lsize2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(gidy==0) - for(int i=0; i<10; i++) - sum[i] = dst_sum[i][0]; -} - -__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, int TILE_SIZE) -{ - uchar tmp_coi[16]; // get the coi data - uchar16 tmp[16]; - int VLEN_C = 16; // vector length of uchar - - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // vector length of uchar - int x = wgidx*TILE_SIZE; // vector length of uchar - int kcn = (cn==2)?2:4; - int rstep = min(src_step, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols - x); - - if ( y+lidy < src_rows ) - { - if( tileSize_width < TILE_SIZE ) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0; - - if( coi > 0 ) //channel of interest - for(int i = 0; i < tileSize_width; i += VLEN_C) + int i; + for(i = 0; i < x_rest - 4; i += 4) { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((int)py) * sy; // m03 - m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((int)py) * sy; // m03 - lm[8] = ((int)x1.s0) * sy; // m12 - lm[7] = ((int)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = bheight; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } + x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - F xm = x * mom[0], ym = y * mom[0]; + x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; - // accumulate moments computed in each tile - dst_step /= sizeof(F); + WT x0_ = 0; + WT x1_ = 0; + WT x2_ = 0; + WT x3_ = 0; - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - ushort tmp_coi[8]; // get the coi data - ushort8 tmp[32]; - int VLEN_US = 8; // vector length of ushort - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/2, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - - if ( y+lidy < src_rows ) - { - if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) - for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_US) + for(; i < x_rest; i++) { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((long)py) * sy; // m03 - m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((long)py) * sy; // m03 - lm[8] = ((long)x1.s0) * sy; // m12 - lm[7] = ((long)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidy == 0&&lidx == 0) - { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - - if(binary) + x0.s0 += x0_; + x1.s0 += x1_; + x2.s0 += x2_; + x3.s0 += x3_; + }else { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x *mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - short tmp_coi[8]; // get the coi data - short8 tmp[32]; - int VLEN_S =8; // vector length of short - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/2, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - - if ( y+lidy < src_rows ) - { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_S) + for(int i = 0; i < 256; i += 4) { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((long)py) * sy; // m03 - m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((long)py) * sy; // m03 - lm[8] = ((long)(x1.s0)) * sy; // m12 - lm[7] = ((long)(x2.s0)) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >=1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy ==0 &&lidx ==0) - { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; + x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; + x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3; } - F xm = x * mom[0], ym = y*mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + py = ly * x0.s0; + sy = ly * ly; } -} + __local WT mom[10][256]; -__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - float tmp_coi[4]; // get the coi data - float4 tmp[64] ; - int VLEN_F = 4; // vector length of float - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/4, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - int maxIdx = mul24(src_rows, src_cols); - int yOff = (y+lidy)*src_step; - int index; - - if ( y+lidy < src_rows ) + if((y_rest > 0) && (gidy == (get_num_groups(1) - 1))) { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_F) + if(ly < y_rest) + { + mom[9][ly] = py * sy; + mom[8][ly] = x1.s0 * sy; + mom[7][ly] = x2.s0 * ly; + mom[6][ly] = x3.s0; + mom[5][ly] = x0.s0 * sy; + mom[4][ly] = x1.s0 * ly; + mom[3][ly] = x2.s0; + mom[2][ly] = py; + mom[1][ly] = x1.s0; + mom[0][ly] = x0.s0; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(ly < 10) + { + for(int i = 1; i < y_rest; i++) { - for(int j=0; j<4; j++) - tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); - tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); + mom[ly][0] = mom[ly][i] + mom[ly][0]; } - else - for(int i=0; i < tileSize_width; i+=VLEN_F) - tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); - } - - float4 zero = (float4)(0); - float4 full = (float4)(255); - if( binary ) - for(int i=0; i < tileSize_width; i+=4) - tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; - F mom[10]; - __local F m[10][128]; - if(lidy < 128) - for(int i = 0; i < 10; i ++) - m[i][lidy] = 0; - barrier(CLK_LOCAL_MEM_FENCE); - F lm[10] = {0}; - F4 x0 = (F4)(0); - F4 x1 = (F4)(0); - F4 x2 = (F4)(0); - F4 x3 = (F4)(0); - for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F ) - { - F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); - F4 p = convert_F4(tmp[xt/VLEN_F]); - F4 xp = v_xt * p, xxp = xp * v_xt; - x0 += p; - x1 += xp; - x2 += xxp; - x3 += xxp * v_xt; - } - x0.s0 += x0.s1 + x0.s2 + x0.s3; - x1.s0 += x1.s1 + x1.s2 + x1.s3; - x2.s0 += x2.s1 + x2.s2 + x2.s3; - x3.s0 += x3.s1 + x3.s2 + x3.s3; - - F py = lidy * x0.s0, sy = lidy*lidy; - int bheight = min(tileSize_height, TILE_SIZE/2); - if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; } + }else + { + mom[9][ly] = py * sy; + mom[8][ly] = x1.s0 * sy; + mom[7][ly] = x2.s0 * ly; + mom[6][ly] = x3.s0; + mom[5][ly] = x0.s0 * sy; + mom[4][ly] = x1.s0 * ly; + mom[3][ly] = x2.s0; + mom[2][ly] = py; + mom[1][ly] = x1.s0; + mom[0][ly] = x0.s0; - F xm = x * mom[0], ym = y * mom[0]; + barrier(CLK_LOCAL_MEM_FENCE); - // accumulate moments computed in each tile - dst_step /= sizeof(F); + if(ly < 128) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 128]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 128]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 128]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 128]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 128]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 128]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 128]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 128]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 128]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + if(ly < 64) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 64]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 64]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 64]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 64]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 64]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 64]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 64]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 64]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 64]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + if(ly < 32) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 32]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 32]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 32]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 32]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 32]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 32]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 32]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 32]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 32]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + if(ly < 16) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 16]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 16]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 16]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 16]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 16]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 16]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 16]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 16]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 16]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + if(ly < 8) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 8]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 8]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 8]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 8]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 8]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 8]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 8]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 8]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 8]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + if(ly < 4) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 4]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 4]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 4]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 4]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 4]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 4]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 4]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 4]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 4]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + if(ly < 2) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 2]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 2]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 2]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 2]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 2]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 2]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 2]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 2]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 2]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + if(ly < 1) + { + mom[0][ly] = mom[0][ly] + mom[0][ly + 1]; + mom[1][ly] = mom[1][ly] + mom[1][ly + 1]; + mom[2][ly] = mom[2][ly] + mom[2][ly + 1]; + mom[3][ly] = mom[3][ly] + mom[3][ly + 1]; + mom[4][ly] = mom[4][ly] + mom[4][ly + 1]; + mom[5][ly] = mom[5][ly] + mom[5][ly + 1]; + mom[6][ly] = mom[6][ly] + mom[6][ly + 1]; + mom[7][ly] = mom[7][ly] + mom[7][ly + 1]; + mom[8][ly] = mom[8][ly] + mom[8][ly + 1]; + mom[9][ly] = mom[9][ly] + mom[9][ly + 1]; + } + } - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + barrier(CLK_LOCAL_MEM_FENCE); - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + if(binary) + { + WT s = 1./255; + if(ly < 10) + { + mom[ly][0] *= s; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + WT xm = (gidx * 256) * mom[0][0]; + WT ym = (gidy * 256) * mom[0][0]; - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - F tmp_coi[4]; // get the coi data - F4 tmp[64]; - int VLEN_D = 4; // length of vetor - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/8, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols - x); - - if ( y+lidy < src_rows ) - { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_D) - { - for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + if(ly == 0) + { + mom[0][1] = mom[0][0]; + mom[1][1] = mom[1][0] + xm; + mom[2][1] = mom[2][0] + ym; + mom[3][1] = mom[3][0] + gidx * 256 * (mom[1][0] * 2 + xm); + mom[4][1] = mom[4][0] + gidx * 256 * (mom[2][0] + ym) + gidy * 256 * mom[1][0]; + mom[5][1] = mom[5][0] + gidy * 256 * (mom[2][0] * 2 + ym); + mom[6][1] = mom[6][0] + gidx * 256 * (3 * mom[3][0] + 256 * gidx * (3 * mom[1][0] + xm)); + mom[7][1] = mom[7][0] + gidx * 256 * (2 * (mom[4][0] + 256 * gidy * mom[1][0]) + 256 * gidx * (mom[2][0] + ym)) + 256 * gidy * mom[3][0]; + mom[8][1] = mom[8][0] + gidy * 256 * (2 * (mom[4][0] + 256 * gidx * mom[2][0]) + 256 * gidy * (mom[1][0] + xm)) + 256 * gidx * mom[5][0]; + mom[9][1] = mom[9][0] + gidy * 256 * (3 * mom[5][0] + 256 * gidy * (3 * mom[2][0] + ym)); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(ly < 10) + { + dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1]; } } diff --git a/modules/ocl/test/test_moments.cpp b/modules/ocl/test/test_moments.cpp index 3f3a125aac..788ac9173f 100644 --- a/modules/ocl/test/test_moments.cpp +++ b/modules/ocl/test/test_moments.cpp @@ -10,18 +10,19 @@ using namespace cvtest; using namespace testing; using namespace std; -PARAM_TEST_CASE(MomentsTest, MatType, bool) +PARAM_TEST_CASE(MomentsTest, MatType, bool, bool) { int type; - cv::Mat mat1; + cv::Mat mat; bool test_contours; - + bool binaryImage; virtual void SetUp() { type = GET_PARAM(0); test_contours = GET_PARAM(1); - cv::Size size(10*MWIDTH, 10*MHEIGHT); - mat1 = randomMat(size, type, 5, 16, false); + cv::Size size(10 * MWIDTH, 10 * MHEIGHT); + mat = randomMat(size, type, 0, 256, false); + binaryImage = GET_PARAM(2); } void Compare(Moments& cpu, Moments& gpu) @@ -29,16 +30,13 @@ PARAM_TEST_CASE(MomentsTest, MatType, bool) Mat gpu_dst, cpu_dst; HuMoments(cpu, cpu_dst); HuMoments(gpu, gpu_dst); - EXPECT_MAT_NEAR(gpu_dst,cpu_dst, .5); + EXPECT_MAT_NEAR(gpu_dst,cpu_dst, 1e-3); } - }; - OCL_TEST_P(MomentsTest, Mat) { - bool binaryImage = 0; - + oclMat src_d(mat); for(int j = 0; j < LOOP_TIMES; j++) { if(test_contours) @@ -53,18 +51,16 @@ OCL_TEST_P(MomentsTest, Mat) for( size_t i = 0; i < contours.size(); i++ ) { Moments m = moments( contours[i], false ); - Moments dm = ocl::ocl_moments( contours[i], false ); + Moments dm = ocl::ocl_moments( contours[i]); Compare(m, dm); } } - cv::_InputArray _array(mat1); - cv::Moments CvMom = cv::moments(_array, binaryImage); - cv::Moments oclMom = cv::ocl::ocl_moments(_array, binaryImage); + cv::Moments CvMom = cv::moments(mat, binaryImage); + cv::Moments oclMom = cv::ocl::ocl_moments(src_d, binaryImage); Compare(CvMom, oclMom); - } } INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine( - Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_64FC1), Values(true,false))); + Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1, CV_64FC1), Values(false, true), Values(false, true))); #endif // HAVE_OPENCL