diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 95d749113a..3d2919734c 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -2,7 +2,7 @@ if(APPLE) set(OPENCL_FOUND YES) set(OPENCL_LIBRARIES "-framework OpenCL") else() - find_package(OpenCL QUIET) + #find_package(OpenCL QUIET) if(WITH_OPENCLAMDFFT) find_path(CLAMDFFT_INCLUDE_DIR NAMES clAmdFft.h) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 9110feff0d..50cb0262c5 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -57,13 +57,15 @@ namespace cv namespace ocl { using std::auto_ptr; - -#define CVCL_DEVICE_TYPE_DEFAULT (1 << 0) -#define CVCL_DEVICE_TYPE_CPU (1 << 1) -#define CVCL_DEVICE_TYPE_GPU (1 << 2) -#define CVCL_DEVICE_TYPE_ACCELERATOR (1 << 3) - //#define CVCL_DEVICE_TYPE_CUSTOM (1 << 4) -#define CVCL_DEVICE_TYPE_ALL 0xFFFFFFFF + enum + { + CVCL_DEVICE_TYPE_DEFAULT = (1 << 0), + CVCL_DEVICE_TYPE_CPU = (1 << 1), + CVCL_DEVICE_TYPE_GPU = (1 << 2), + CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3), + //CVCL_DEVICE_TYPE_CUSTOM = (1 << 4) + CVCL_DEVICE_TYPE_ALL = 0xFFFFFFFF + }; //this class contains ocl runtime information class CV_EXPORTS Info { @@ -825,7 +827,6 @@ namespace cv }; -#ifdef HAVE_CLAMDFFT ///////////////////////////////////////// clAmdFft related ///////////////////////////////////////// //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. //! Param dft_size is the size of DFT transform. @@ -839,16 +840,13 @@ namespace cv // real to complex dft output is not the same with cpu version // real to complex and complex to real does not support DFT_ROWS CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0); -#endif // HAVE_CLAMDFFT -#ifdef HAVE_CLAMDBLAS //! 
implements generalized matrix product algorithm GEMM from BLAS
        // The functionality requires clAmdBlas library
        // only support type CV_32FC1
        // flag GEMM_3_T is not supported
        CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha, const oclMat &src3, double beta, oclMat &dst, int flags = 0);
-#endif

        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
index 6636b11279..fb8013635a 100644
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -73,22 +73,86 @@ void print_info()
 #endif
 }
-
+std::string workdir;
 int main(int argc, char **argv)
 {
-    std::vector<cv::ocl::Info> oclinfo;
     TS::ptr()->init("ocl");
     InitGoogleTest(&argc, argv);
+    const char *keys =
+        "{ h | help | false | print help message }"
+        "{ w | workdir | ../../../samples/c/| set working directory }"
+        "{ t | type | gpu | set device type:cpu or gpu}"
+        "{ p | platform | 0 | set platform id }"
+        "{ d | device | 0 | set device id }";
+
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Available options besides google test options:" << endl;
+        cmd.printParams();
+    }
+    workdir = cmd.get<string>("workdir");
+    string type = cmd.get<string>("type");
+    unsigned int pid = cmd.get<unsigned int>("platform");
+    int device = cmd.get<int>("device");
     print_info();
-    int devnums = getDevice(oclinfo);
-    if(devnums < 1)
+    int flag = CVCL_DEVICE_TYPE_GPU;
+    if(type == "cpu")
+    {
-        std::cout << "no device found\n";
-        return -1;
+        flag = CVCL_DEVICE_TYPE_CPU;
+    }
-    //if you want to use undefault device, set it here
-    //setDevice(oclinfo[0]);
+    std::vector<cv::ocl::Info> oclinfo;
+    int devnums = getDevice(oclinfo);
+    if(devnums <= device || device < 0)
+    {
+        std::cout << "device invalid\n";
+        return -1;
+    }
+    if(pid >= oclinfo.size())
+    {
+        std::cout << "platform invalid\n";
+        return -1;
+    }
+    if(pid != 0 || device != 0)
+    {
+        setDevice(oclinfo[pid], device);
+    }
+    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
     setBinpath(CLBINPATH);
     return RUN_ALL_TESTS();
 }
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
index f2df3fc7c3..197e8d641b 100644
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -2672,13 +2672,13 @@ TEST_P(CountNonZero, MAT)
         Has_roi(k);
         t0 = (double)cvGetTickCount();//cpu start
-        int cpures = cv::countNonZero(mat1_roi);
+        cv::countNonZero(mat1_roi);
         t0 = (double)cvGetTickCount() - t0;//cpu end
         t1 = (double)cvGetTickCount();//gpu start1
         gmat1 = mat1_roi;
         t2 = (double)cvGetTickCount(); //kernel
-        int gpures = cv::ocl::countNonZero(gmat1);
+        cv::ocl::countNonZero(gmat1);
         t2 = (double)cvGetTickCount() - t2;//kernel
         t1 = (double)cvGetTickCount() - t1;//gpu end1
         if(j == 0)
@@ -2713,7 +2713,7 @@ TEST_P(CountNonZero, MAT)
     {
         cout << "\nwith roi:";
    };
-    int gpures = cv::ocl::countNonZero(gmat1);
+    cv::ocl::countNonZero(gmat1);
 };
 #endif
diff --git a/modules/ocl/perf/perf_canny.cpp b/modules/ocl/perf/perf_canny.cpp
index ec7181a4fd..eb895df5ec 100644
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -52,8 +52,6 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
-#define FILTER_IMAGE "../../../samples/gpu/road.png"
-
 #ifndef MWC_TEST_UTILITY
 #define MWC_TEST_UTILITY
@@ -79,7 +77,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
 ////////////////////////////////////////////////////////
 // Canny1
-
+extern std::string workdir;
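[Editor's note, not part of the patch] The hunks above replace hard-coded FILTER_IMAGE paths in the perf tests with a runtime data directory: perf/main.cpp parses --workdir/--type/--platform/--device, and each test file declares extern std::string workdir and prefixes its image names with it. A minimal sketch of the resulting pattern; the helper name and the sample command line are illustrative, only workdir itself comes from the patch:

    #include <opencv2/opencv.hpp>
    #include <string>

    extern std::string workdir;   // defined in modules/ocl/perf/main.cpp

    // Illustrative helper: resolve a test image relative to --workdir and fail fast.
    static cv::Mat loadPerfImage(const std::string &name)
    {
        cv::Mat img = cv::imread(workdir + name, cv::IMREAD_GRAYSCALE);
        CV_Assert(!img.empty());   // a wrong --workdir shows up immediately
        return img;
    }

    // Example invocation (paths are illustrative):
    //   ./opencv_perf_ocl --workdir=../../../samples/c/ --type=gpu --platform=0 --device=0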
 IMPLEMENT_PARAM_CLASS(AppertureSize, int);
 IMPLEMENT_PARAM_CLASS(L2gradient, bool);
@@ -101,7 +99,7 @@ PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
 TEST_P(Canny1, Performance)
 {
-    cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+    cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
     double low_thresh = 100.0;
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index ce46b89588..100a1c59d9 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -1184,11 +1184,11 @@ INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
 //INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
-INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
 //INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
-INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
 INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_32FC1),
diff --git a/modules/ocl/perf/perf_haar.cpp b/modules/ocl/perf/perf_haar.cpp
index 634415817e..6b04ce5bbc 100644
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -52,7 +52,7 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 using namespace cv;
-
+extern std::string workdir;
 struct getRect
 {
     Rect operator ()(const CvAvgComp &e) const
@@ -80,9 +80,6 @@ PARAM_TEST_CASE(HaarTestBase, int, int)
         if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
         {
             cout << "ERROR: Could not load classifier cascade" << endl;
-            cout << "Usage: facedetect [--cascade=]\n"
-                "   [--scale[=\n"
-                "   [filename|camera_index]\n" << endl ;
             return;
         }
         //int devnums = getDevice(oclinfo);
@@ -99,16 +96,16 @@ struct Haar : HaarTestBase {};
 TEST_F(Haar, FaceDetect)
 {
-    string imgName = "../../../samples/c/lena.jpg";
+    string imgName = workdir + "lena.jpg";
     Mat img = imread( imgName, 1 );
     if(img.empty())
     {
-        std::cout << "Couldn't read test" << index << ".jpg" << std::endl;
+        std::cout << imgName << std::endl;
         return ;
     }
-    int i = 0;
+    //int i = 0;
     double t = 0;
     vector<Rect> faces, oclfaces;
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/ocl/perf/perf_hog.cpp
index d3a90dcb70..75086ca449 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -53,8 +53,7 @@ using namespace cv::ocl;
 using namespace cvtest;
 using namespace testing;
 using namespace std;
-
-#define FILTER_IMAGE "../../../samples/gpu/road.png"
+extern std::string workdir;
 #ifndef MWC_TEST_UTILITY
 #define MWC_TEST_UTILITY
@@ -100,15 +99,15 @@ PARAM_TEST_CASE(HOG, WinSizw48, bool)
 TEST_P(HOG, Performance)
 {
-    cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+    cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
     // define HOG related arguments
     float scale = 1.05;
-    int nlevels = 13;
+    //int nlevels = 13;
     float gr_threshold = 8;
     float hit_threshold = 1.4;
-    bool hit_threshold_auto = true;
+    //bool hit_threshold_auto = true;
     int win_width = is48 ?
48 : 64; int win_stride_width = 8; diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp index 651a595e04..7d544b0e2d 100644 --- a/modules/ocl/perf/perf_imgproc.cpp +++ b/modules/ocl/perf/perf_imgproc.cpp @@ -1246,6 +1246,7 @@ TEST_P(Remap, Mat) } int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; + cout << borderstr[0] << endl; #ifndef PRINT_KERNEL_RUN_TIME double totalcputick = 0; double totalgputick = 0; diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp index f4af91db90..ba011f8dfc 100644 --- a/modules/ocl/perf/perf_matrix_operation.cpp +++ b/modules/ocl/perf/perf_matrix_operation.cpp @@ -714,7 +714,6 @@ TEST_P(DataTransfer, perf) totaluploadtick = t0 + totaluploadtick; totaldownloadtick = t1 + totaldownloadtick; } - EXPECT_MAT_SIMILAR(mat, cpu_dst, 0.0); totaltick = totaluploadtick + totaldownloadtick; cout << "average upload time is " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl; cout << "average download time is " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl; diff --git a/modules/ocl/perf/perf_split_merge.cpp b/modules/ocl/perf/perf_split_merge.cpp index 5502d7f087..67a3d24aea 100644 --- a/modules/ocl/perf/perf_split_merge.cpp +++ b/modules/ocl/perf/perf_split_merge.cpp @@ -450,7 +450,7 @@ TEST_P(Split, Accuracy) for(int j = LOOPROISTART; j < LOOPROIEND; j ++) { Has_roi(j); - cv::Mat dev_dst[4] = {dst1_roi, dst2_roi, dst3_roi, dst4_roi}; + //cv::Mat dev_dst[4] = {dst1_roi, dst2_roi, dst3_roi, dst4_roi}; cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4}; gdst1_whole = dst1; gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows)); diff --git a/modules/ocl/perf/perf_surf.cpp b/modules/ocl/perf/perf_surf.cpp index 6aa4f512a2..646d07fa7f 100644 --- a/modules/ocl/perf/perf_surf.cpp +++ b/modules/ocl/perf/perf_surf.cpp @@ -54,11 +54,11 @@ using namespace cvtest; using namespace testing; using namespace std; -#define FILTER_IMAGE "../../../samples/gpu/road.png" +extern std::string workdir; TEST(SURF, Performance) { - cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE); + cv::Mat img = readImage(workdir+"lena.jpg", cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); ocl::SURF_OCL d_surf; diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index dadf57cb62..de97cd3857 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -772,7 +772,7 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0) { size_t groupnum = src.clCxt->impl->maxComputeUnits; CV_Assert(groupnum != 0); - int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen, status; + int vlen = src.oclchannels() == 3 ? 
12 : 8, dbsize = groupnum * vlen;
     Context *clCxt = src.clCxt;
     T *p = new T[dbsize];
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
@@ -930,7 +930,7 @@ template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal,
     CV_Assert(groupnum != 0);
     groupnum = groupnum * 2;
     int vlen = 8;
-    int dbsize = groupnum * 2 * vlen * sizeof(T) , status;
+    int dbsize = groupnum * 2 * vlen * sizeof(T) ;
     Context *clCxt = src.clCxt;
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
     *minVal = std::numeric_limits<T>::max() , *maxVal = -std::numeric_limits<T>::max();
@@ -945,11 +945,11 @@ template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal,
     T *p = new T[groupnum * vlen * 2];
     memset(p, 0, dbsize);
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    for(int i = 0; i < vlen * groupnum; i++)
+    for(int i = 0; i < vlen * (int)groupnum; i++)
     {
         *minVal = *minVal < p[i] ? *minVal : p[i];
     }
-    for(int i = vlen * groupnum; i < 2 * vlen * groupnum; i++)
+    for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
     {
         *maxVal = *maxVal > p[i] ? *maxVal : p[i];
     }
@@ -1606,7 +1606,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     size_t groupnum = src.clCxt->impl->maxComputeUnits;
     CV_Assert(groupnum != 0);
     int minloc = -1 , maxloc = -1;
-    int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) , status;
+    int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
     Context *clCxt = src.clCxt;
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
     *minVal = std::numeric_limits<T>::max() , *maxVal = -std::numeric_limits<T>::max();
@@ -1621,15 +1621,15 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     T *p = new T[groupnum * vlen * 4];
     memset(p, 0, dbsize);
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    for(int i = 0; i < vlen * groupnum; i++)
+    for(int i = 0; i < vlen * (int)groupnum; i++)
     {
         *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
-        minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : p[i + 2 * vlen * groupnum];
+        minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
     }
-    for(int i = vlen * groupnum; i < 2 * vlen * groupnum; i++)
+    for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
     {
         *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
-        maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : p[i + 2 * vlen * groupnum];
+        maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
     }
     int pre_rows = src.offset / src.step;
@@ -1717,7 +1717,7 @@ int cv::ocl::countNonZero(const oclMat &src)
     }
     CV_Assert(groupnum != 0);
     groupnum = groupnum * 2;
-    int vlen = 8 , dbsize = groupnum * vlen, status;
+    int vlen = 8 , dbsize = groupnum * vlen;
     //cl_ulong start, end;
     Context *clCxt = src.clCxt;
     string kernelName = "arithm_op_nonzero";
diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp
index 0103d278e9..d2122bb6b1 100644
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -227,8 +227,8 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
 }
 template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat query, const oclMat *trains, int n, const oclMat mask,
-                         const oclMat &bestTrainIdx, const oclMat &bestImgIdx, const oclMat &bestDistance, int distType)
+void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
+                         const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
 {
 }
@@ -266,8 +266,8 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
 }
 template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
-void match(const oclMat query, const oclMat *trains, int n, const oclMat mask,
-           const oclMat &bestTrainIdx, const oclMat &bestImgIdx, const oclMat &bestDistance, int distType)
+void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
+           const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
 {
 }
@@ -796,7 +796,7 @@ void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &ma
 }
 template <int BLOCK_SIZE>
-void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
+void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int /*distType*/)
 {
     cv::ocl::Context *ctx = trainIdx.clCxt;
     size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1};
@@ -1406,7 +1406,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, const oc
 }
 void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
-        oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &maskCollection)
+        oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &/*maskCollection*/)
 {
     if (query.empty() || trainCollection.empty())
         return;
@@ -1702,7 +1702,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, const
 }
 void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
-        oclMat &nMatches, float maxDistance, const vector<oclMat> &masks)
+        oclMat &nMatches, float /*maxDistance*/, const vector<oclMat> &masks)
 {
     if (query.empty() || empty())
         return;
diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp
index f62541d19a..459932cc68 100644
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@@ -51,13 +51,17 @@ using namespace cv;
 using namespace cv::ocl;
 using namespace std;
-#if !defined (HAVE_OPENCL)
+#if !defined HAVE_OPENCL
 void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
 {
     throw_nogpu();
 }
+#elif !defined HAVE_CLAMDFFT
+void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
+{
+
CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented"); +} #else - #include namespace cv diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 1a236e4a10..4bc0894a94 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -340,12 +340,12 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c Context *clCxt = src.clCxt; string kernelName; size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] *localThreads[0], (src.rows + localThreads[1]) / localThreads[1] *localThreads[1], 1}; + size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1}; if(src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; - globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; + globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); } else @@ -489,7 +489,7 @@ namespace MorphologyFilterEngine_GPU(const Ptr &filter2D_, int iters_) : Filter2DEngine_GPU(filter2D_), iters(iters_) {} - virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) + virtual void apply(const oclMat &src, oclMat &dst) { Filter2DEngine_GPU::apply(src, dst); //if (iters > 1) @@ -778,7 +778,7 @@ namespace virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) { Size src_size = src.size(); - int src_type = src.type(); + //int src_type = src.type(); int cn = src.oclchannels(); //dst.create(src_size, src_type); diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp index bccf556ed4..2f11e2b36a 100644 --- a/modules/ocl/src/gemm.cpp +++ b/modules/ocl/src/gemm.cpp @@ -50,11 +50,18 @@ #include "clAmdBlas.h" -#if !defined (HAVE_OPENCL) -void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags) +#if !defined HAVE_OPENCL +void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha, + const oclMat &src3, double beta, oclMat &dst, int flags) { throw_nogpu(); } +#elif !defined HAVE_CLAMDBLAS +void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha, + const oclMat &src3, double beta, oclMat &dst, int flags) +{ + CV_Error(CV_StsNotImplemented, "OpenCL BLAS is not implemented"); +} #else using namespace cv; diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 19840f8b09..0e8f3b4b76 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -513,13 +513,13 @@ gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, #define sum_elem_ptr(sum,row,col) \ - ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype))) + ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype))) #define sqsum_elem_ptr(sqsum,row,col) \ - ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype))) + ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype))) #define calc_sum(rect,offset) \ - ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) + ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) CV_IMPL void @@ -813,14 +813,9 @@ gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade CvHaarFeature *feature = &_cascade->stage_classifier[i].classifier[j].haar_feature[l]; 
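[Editor's note, not part of the patch] The filtering.cpp hunk above also fixes the GPUErode/GPUDilate launch size: the old expression (n + local) / local * local schedules one extra work-group whenever n is already a multiple of the local size, while the patched (n + local - 1) / local * local is the usual round-up-to-multiple idiom. A small standalone sketch of the difference, with illustrative numbers:

    #include <cstddef>
    #include <cstdio>

    // Round n up to the next multiple of block -- the idiom the patch adopts.
    static size_t roundUpToMultiple(size_t n, size_t block)
    {
        return (n + block - 1) / block * block;
    }

    int main()
    {
        const size_t local = 16;
        const size_t cols[] = { 31, 32, 33 };
        for (int i = 0; i < 3; ++i)
        {
            size_t oldGlobal = (cols[i] + local) / local * local;   // 32, 48, 48 (extra group at 32)
            size_t newGlobal = roundUpToMultiple(cols[i], local);   // 32, 32, 48
            std::printf("cols=%u old=%u new=%u\n",
                        (unsigned)cols[i], (unsigned)oldGlobal, (unsigned)newGlobal);
        }
        return 0;
    }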
GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l]; - double sum0 = 0, area0 = 0; CvRect r[3]; - int base_w = -1, base_h = -1; - int new_base_w = 0, new_base_h = 0; - int kx, ky; - int flagx = 0, flagy = 0; - int x0 = 0, y0 = 0; + int nr; /* align blocks */ @@ -872,7 +867,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS std::vector rectList; std::vector rweights; double factor; - int coi; int datasize; int totalclassifier; @@ -885,9 +879,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int *candidate; cl_int status; - bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0; + // bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0; bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; - bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0; + // bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0; //the Intel HD Graphics is unsupported if (gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos) @@ -1015,7 +1009,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS gimgroi = gsum(roi); gimgroisq = gsqsum(roi); //scaleinfo[i].rows = gimgroi.rows; - int ystep = 1; // factor > 2 ? 1 : 2; int width = gimgroi.cols - 1 - cascade->orig_window_size.width; int height = gimgroi.rows - 1 - cascade->orig_window_size.height; scaleinfo[i].width_height = (width << 16) | height; @@ -1109,7 +1102,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS pq.s[2] = gcascade->pq2; pq.s[3] = gcascade->pq3; float correction = gcascade->inv_window_area; - int argcount = 0; + //int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]); //int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline; // openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads); @@ -1184,7 +1177,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS { CvSize winsize0 = cascade->orig_window_size; int n_factors = 0; - int flag = 0; oclMat gsum; oclMat gsqsum; cv::ocl::integral(gimg, gsum, gsqsum); @@ -1276,7 +1268,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS scaleinfo[i].imgoff = 0; scaleinfo[i].factor = factor; int startnodenum = nodenum * i; - int argcounts = 0; float factor2 = (float)factor; /* openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_mem), (void *)&nodebuffer)); @@ -1294,7 +1285,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum )); size_t globalThreads2[3] = {nodenum, 1, 1}; - size_t localThreads2[3] = {256, 1, 1}; openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1); @@ -2478,13 +2468,13 @@ else // } /* j */ // } //} - +/* CV_INLINE double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier, - double variance_norm_factor, - size_t p_offset ) +double variance_norm_factor, +size_t p_offset ) { - /* + int idx = 0; do { @@ -2501,14 +2491,15 @@ double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier, } while( idx > 0 ); return classifier->alpha[-idx]; - */ + return 0.; } +*/ CV_IMPL int -gpuRunHaarClassifierCascade( const CvHaarClassifierCascade *_cascade, - CvPoint pt, int start_stage ) +gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, +CvPoint pt, int start_stage */) { 
/* int result = -1; @@ -2620,7 +2611,7 @@ namespace cv for( y = y1; y < y2; y += ystep ) for( x = 0; x < ssz.width; x += ystep ) { - if( gpuRunHaarClassifierCascade( cascade, cvPoint(x, y), 0 ) > 0 ) + if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 ) vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor), winSize.width, winSize.height)); } @@ -2679,7 +2670,7 @@ namespace cv } } - int result = gpuRunHaarClassifierCascade( cascade, cvPoint(x, y), 0 ); + int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */); if( result > 0 ) vec->push_back(Rect(x, y, winsize.width, winsize.height)); ixstep = result != 0 ? 1 : 2; diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 62ea42e749..27ce211de1 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -296,9 +296,9 @@ namespace cv kernelName = "remapNNF1Constant"; } - int channels = dst.oclchannels(); - int depth = dst.depth(); - int type = src.type(); + //int channels = dst.oclchannels(); + //int depth = dst.depth(); + //int type = src.type(); size_t blkSizeX = 16, blkSizeY = 16; size_t glbSizeX; int cols = dst.cols; @@ -308,7 +308,7 @@ namespace cv glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; } - else if(src.type() == CV_8UC3 || src.type() == CV_8UC4 || src.type() == CV_32FC1) + else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR) { cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4; glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; @@ -322,73 +322,6 @@ namespace cv size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY; size_t globalThreads[3] = {glbSizeX, glbSizeY, 1}; size_t localThreads[3] = {blkSizeX, blkSizeY, 1}; - /* - ///////////////////////////// - //using the image buffer - ///////////////////////////// - - size_t image_row_pitch = 0; - cl_int err1, err2, err3; - cl_mem_flags flags1 = CL_MEM_READ_ONLY; - cl_image_format format; - if(src.type() == CV_8UC1) - { - format.image_channel_order = CL_R; - format.image_channel_data_type = CL_UNSIGNED_INT8; - } - else if(src.type() == CV_8UC4) - { - format.image_channel_order = CL_RGBA; - format.image_channel_data_type = CL_UNSIGNED_INT8; - } - else if(src.type() == CV_32FC1) - { - format.image_channel_order = CL_R; - format.image_channel_data_type = CL_FLOAT; - } - else if(src.type() == CV_32FC4) - { - format.image_channel_order = CL_RGBA; - format.image_channel_data_type = CL_FLOAT; - } - cl_mem srcImage = clCreateImage2D(clCxt->impl->clContext, flags1, &format, src.cols, src.rows, - image_row_pitch, NULL, &err1); - if(err1 != CL_SUCCESS) - { - printf("Error creating CL image buffer, error code %d\n", err1); - return; - } - const size_t src_origin[3] = {0, 0, 0}; - const size_t region[3] = {src.cols, src.rows, 1}; - cl_event BtoI_event, ItoB_event; - err3 = clEnqueueCopyBufferToImage(clCxt->impl->clCmdQueue, (cl_mem)src.data, srcImage, - 0, src_origin, region, 0, NULL, NULL); - if(err3 != CL_SUCCESS) - { - printf("Error copying buffer to image\n"); - printf("Error code %d \n", err3); - return; - } - // clWaitForEvents(1, &BtoI_event); - - cl_int ret; - Mat test(src.rows, src.cols, CV_8UC1); - memset(test.data, 0, src.rows*src.cols); - ret = clEnqueueReadImage(clCxt->impl->clCmdQueue, srcImage, CL_TRUE, - src_origin, region, 0, 0, test.data, NULL, NULL, &ItoB_event); - if(ret != CL_SUCCESS) - { - printf("read image error, %d ", ret); - return; - } - clWaitForEvents(1, &ItoB_event); - - cout << "src" << 
endl; - cout << src << endl; - cout<<"image:"< > args; @@ -396,7 +329,6 @@ namespace cv { args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - // args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); @@ -425,7 +357,6 @@ namespace cv { args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - // args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); @@ -1369,10 +1300,10 @@ namespace cv if( src.depth() != CV_8U || src.oclchannels() != 4 ) CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); - if(src.clCxt->impl->double_support == 0) - { - CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); - } + // if(src.clCxt->impl->double_support == 0) + // { + // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); + // } dst.create( src.size(), CV_8UC4 ); @@ -1437,10 +1368,10 @@ namespace cv if( src.depth() != CV_8U || src.oclchannels() != 4 ) CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); - if(src.clCxt->impl->double_support == 0) - { - CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); - } + // if(src.clCxt->impl->double_support == 0) + // { + // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); + // } dstr.create( src.size(), CV_8UC4 ); dstsp.create( src.size(), CV_16SC2 ); @@ -1603,7 +1534,7 @@ namespace cv int borderType ) { int cn = src.channels(); - int i, j, k, maxk, radius; + int i, j, maxk, radius; Size size = src.size(); CV_Assert( (src.channels() == 1 || src.channels() == 3) && diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index b7f1feb9a2..580433f12e 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -424,7 +424,7 @@ namespace cv void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, const void *src, size_t spitch, - size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind) + size_t width, size_t height, int src_offset) { size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0}; size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0}; @@ -451,7 +451,7 @@ namespace cv } int savetofile(const Context *clcxt, cl_program &program, const char *fileName) { - cl_int status; + //cl_int status; size_t numDevices = 1; cl_device_id *devices = clcxt->impl->devices; //figure out the sizes of each of the binaries. 
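[Editor's note, not part of the patch] savetofile() above writes a built OpenCL program binary to disk (the kernel-binary cache selected via setBinpath(CLBINPATH) in perf/main.cpp); the comment marks the step where the binary sizes are queried before the blobs are written out. A hedged, single-device sketch of what that step amounts to in raw OpenCL; the function name is illustrative:

    #include <CL/cl.h>
    #include <cstdio>
    #include <vector>

    // Query the program binary size, fetch the blob, and dump it to disk.
    static bool dumpProgramBinary(cl_program program, const char *fileName)
    {
        size_t binarySize = 0;
        if (clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
                             sizeof(binarySize), &binarySize, NULL) != CL_SUCCESS || binarySize == 0)
            return false;

        std::vector<unsigned char> binary(binarySize);
        unsigned char *binaries[] = { &binary[0] };
        if (clGetProgramInfo(program, CL_PROGRAM_BINARIES,
                             sizeof(binaries), binaries, NULL) != CL_SUCCESS)
            return false;

        FILE *fp = fopen(fileName, "wb");
        if (!fp)
            return false;
        fwrite(&binary[0], 1, binarySize, fp);
        fclose(fp);
        return true;
    }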
@@ -507,7 +507,7 @@ namespace cv FILE *fp = fopen(fileName, "wb+"); if(fp == NULL) { - char *temp; + char *temp = NULL; sprintf(temp, "Failed to load kernel file : %s\r\n", fileName); CV_Error(CV_GpuApiCallError, temp); } @@ -639,8 +639,7 @@ namespace cv return kernel; } - void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize, - size_t *globalThreads, size_t *localThreads) + void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads) { size_t kernelWorkGroupSize; openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[0], @@ -679,10 +678,10 @@ namespace cv globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; - size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; - cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads); + //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; + cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads); } - for(int i = 0; i < args.size(); i ++) + for(size_t i = 0; i < args.size(); i ++) openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); #ifndef PRINT_KERNEL_RUN_TIME @@ -897,7 +896,7 @@ namespace cv impl->maxComputeUnits = m.impl->maxComputeUnits; impl->double_support = m.impl->double_support; memcpy(impl->extra_options, m.impl->extra_options, 512); - for(int i = 0; i < m.impl->devices.size(); i++) + for(size_t i = 0; i < m.impl->devices.size(); i++) { impl->devices.push_back(m.impl->devices[i]); impl->devName.push_back(m.impl->devName[i]); diff --git a/modules/ocl/src/kernels/arithm_addWeighted.cl b/modules/ocl/src/kernels/arithm_addWeighted.cl index 0222959e75..7e9df6f253 100644 --- a/modules/ocl/src/kernels/arithm_addWeighted.cl +++ b/modules/ocl/src/kernels/arithm_addWeighted.cl @@ -61,30 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset int y = get_global_id(1); if (x < cols && y < rows) - + { x = x << 2; - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - uchar4 src1_data ,src2_data; + uchar4 src1_data ,src2_data; - src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; - src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; - src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; - src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; + src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; + src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; + src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; + src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; - src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; - src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; - src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; - src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; + src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; + src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; + src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; + src2_data.w= src2_index+3 >= 0 ? 
src2[src2_index+3] : 0; uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; @@ -118,21 +117,35 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs int y = get_global_id(1); if (x < cols && y < rows) - + { x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; @@ -164,22 +177,36 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse int y = get_global_id(1); if (x < cols && y < rows) - + { x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 )); - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; int4 tmp; @@ -209,24 +236,39 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define bitOfInt (sizeof(int)== 4 ? 2: 3) #define dst_align ((dst_offset >> bitOfInt) & 3) - int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); - int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); - + int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); + int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); - int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); + int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); + + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; float4 tmp; @@ -257,23 +299,37 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? 
src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ; @@ -305,23 +361,37 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ; double4 tmp_data; tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; diff --git a/modules/ocl/src/kernels/arithm_bitwise_and.cl b/modules/ocl/src/kernels/arithm_bitwise_and.cl index ac513c0e9c..f954452b1f 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_and.cl +++ b/modules/ocl/src/kernels/arithm_bitwise_and.cl @@ -63,15 +63,29 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? 
src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = src1_data & src2_data; @@ -99,16 +113,30 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + char4 src1_data = vload4(0, src1 + src1_index_fix); + char4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + char4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + char4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 tmp_data = src1_data & src2_data; @@ -136,16 +164,30 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 tmp_data = src1_data & src2_data; @@ -174,16 +216,30 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 tmp_data = src1_data & src2_data; diff --git a/modules/ocl/src/kernels/arithm_bitwise_not.cl b/modules/ocl/src/kernels/arithm_bitwise_not.cl index 327cc7728a..64bcc1799a 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_not.cl +++ b/modules/ocl/src/kernels/arithm_bitwise_not.cl @@ -62,17 +62,24 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = ~ src1_data; - + + /* if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + */ dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.z : dst_data.z; @@ -95,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -129,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -164,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -238,12 +245,12 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o { int src_index = mad24(y, src_step, (x << 3) + src_offset); int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - + char8 data; data = *((__global char8 *)((__global char *)src + src_index)); data = ~ data; - + *((__global char8 *)((__global char *)dst + dst_index)) = data; } } diff --git a/modules/ocl/src/kernels/arithm_bitwise_or.cl b/modules/ocl/src/kernels/arithm_bitwise_or.cl index bcc7974d93..01e3a2f998 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_or.cl +++ b/modules/ocl/src/kernels/arithm_bitwise_or.cl @@ -63,16 +63,28 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = src1_data | src2_data; @@ -99,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -136,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -174,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor.cl b/modules/ocl/src/kernels/arithm_bitwise_xor.cl index 621b9dece1..6e83ef50ec 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_xor.cl +++ b/modules/ocl/src/kernels/arithm_bitwise_xor.cl @@ -63,16 +63,30 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = src1_data ^ src2_data; @@ -99,16 +113,30 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + char4 src1_data = vload4(0, src1 + src1_index_fix); + char4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + char4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + char4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 tmp_data = src1_data ^ src2_data; @@ -136,16 +164,30 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 tmp_data = src1_data ^ src2_data; @@ -174,17 +216,35 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); + + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + short4 tmp_data = src1_data ^ src2_data; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; diff --git a/modules/ocl/src/kernels/arithm_compare_eq.cl b/modules/ocl/src/kernels/arithm_compare_eq.cl index f195c279e7..1db0b7dd14 100644 --- a/modules/ocl/src/kernels/arithm_compare_eq.cl +++ b/modules/ocl/src/kernels/arithm_compare_eq.cl @@ -63,16 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -85,7 +100,8 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src } } -__kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int src1_offset, + +__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global uchar *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -98,16 +114,30 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr { x = x << 2; - #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + #define dst_align ((dst_offset >> 1)& 3) + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -122,7 +152,6 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr } - __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global uchar *dst, int dst_step, int dst_offset, @@ -137,16 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 
0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -170,18 +215,33 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_ int y = get_global_id(1); if (x < cols && y < rows) - { + { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -206,15 +266,23 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
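/* All of the compare kernels produce their output mask the same way: an OpenCL C
   relational operator applied to vectors yields -1 (all bits set) in each lane where
   the condition holds and 0 elsewhere, and convert_uchar4() of that result is the
   255/0 mask that cv::compare reports. A minimal sketch, with an assumed helper name:

       uchar4 compare_eq4(ushort4 a, ushort4 b)
       {
           short4 m = (a == b);       // -1 per lane where equal, 0 otherwise
           return convert_uchar4(m);  // modulo conversion: -1 -> 255, 0 -> 0
       }

   The per-lane dst_start/dst_end selects that follow then stop the mask from being
   written outside the destination ROI row. */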
src2_data.wxyz:tmp.xyzw; + } + - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -240,15 +308,30 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -276,16 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -312,16 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -350,15 +463,30 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -384,15 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -417,15 +561,30 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -451,15 +610,30 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -487,15 +661,31 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); @@ -525,15 +715,32 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); @@ -563,15 +770,31 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); @@ -598,16 +821,31 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_ x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -632,15 +870,31 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); @@ -667,16 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 3)& 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; diff --git a/modules/ocl/src/kernels/arithm_compare_ne.cl b/modules/ocl/src/kernels/arithm_compare_ne.cl index 9d46676e0c..1c5063a460 100644 --- a/modules/ocl/src/kernels/arithm_compare_ne.cl +++ b/modules/ocl/src/kernels/arithm_compare_ne.cl @@ -59,15 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -97,15 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -135,15 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -169,15 +211,31 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -202,16 +260,29 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : dst_data.x; @@ -236,15 +307,30 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -258,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr } #endif - + /***********************************Compare LT*******************************/ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, @@ -273,15 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -311,15 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -349,15 +464,30 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -383,15 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -416,16 +565,31 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : dst_data.x; @@ -450,16 +614,31 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -486,15 +665,30 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); @@ -524,15 +718,30 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); @@ -562,15 +771,30 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); @@ -596,15 +820,30 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data =convert_uchar4((src1_data <= src2_data)); @@ -629,15 +868,29 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? 
src2_data.wxyz:tmp.xyzw; + } - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); @@ -663,15 +916,30 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3)& 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + - double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); diff --git a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl b/modules/ocl/src/kernels/arithm_magnitudeSqr.cl index 4d138d816f..f1d0aa5733 100644 --- a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl +++ b/modules/ocl/src/kernels/arithm_magnitudeSqr.cl @@ -60,23 +60,36 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); - - float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? 
src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 tmp_data ; @@ -112,21 +125,32 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + + float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index_fix)); + + if(src1_index==-6) + src1_data.s01234567 = src1_data.s67012345; + if(src1_index==-4) + src1_data.s01234567 = src1_data.s45670123; + if(src1_index== -2) + src1_data.s01234567 = src1_data.s23456701; + + - float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 tmp_data ; diff --git a/modules/ocl/src/kernels/imgproc_remap.cl b/modules/ocl/src/kernels/imgproc_remap.cl index e5277df137..4917749561 100644 --- a/modules/ocl/src/kernels/imgproc_remap.cl +++ b/modules/ocl/src/kernels/imgproc_remap.cl @@ -48,7 +48,7 @@ #if defined DOUBLE_SUPPORT #pragma OPENCL EXTENSION cl_khr_fp64:enable typedef double4 F4 ; -#else +#else typedef float4 F4; #endif @@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { x = x << 2; @@ -70,7 +70,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); uchar4 nval =convert_uchar4(nVal); - uchar val = nval.s0; + uchar4 val = (uchar4)(nval.s0); int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); @@ -79,35 +79,41 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset; + + uchar4 con = convert_uchar4(convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows) || convert_int4(map1_data.even) < (int4)(0) || convert_int4(map1_data.odd) < (int4)(0)); + uchar4 src_data = val; - uchar4 src_data; - + if (con.s0 == 0) src_data.s0 = *(src + srcIdx.s0); + if (con.s1 == 0) src_data.s1 = *(src + srcIdx.s1); + if (con.s2 == 0) src_data.s2 = *(src + srcIdx.s2); + if (con.s3 == 0) src_data.s3 = *(src + srcIdx.s3); + uchar4 dst_data; - dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data; - + __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; + uchar4 dVal = *d; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? 
dst_data : dVal; + int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal; *d = dst_data; } } + __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { x = x << 2; @@ -125,24 +131,28 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); int8 map1_dataZ = convert_int8_sat_rte(map1_data); int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset; + + uchar4 src_data = val; + uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0)); - uchar4 src_data; - + if (con.s0 == 0) src_data.s0 = *(src + srcIdx.s0); + if (con.s1 == 0) src_data.s1 = *(src + srcIdx.s1); + if (con.s2 == 0) src_data.s2 = *(src + srcIdx.s2); + if (con.s3 == 0) src_data.s3 = *(src + srcIdx.s3); uchar4 dst_data; - dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data; + // dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data; __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; + uchar4 dVal = *d; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - - dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal; + int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + + dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal; *d = dst_data; - } } @@ -152,7 +162,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { x = x << 2; @@ -160,7 +170,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); uchar4 nval =convert_uchar4(nVal); - uchar val = nval.s0; + uchar4 val = (uchar4)(nval.s0); int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); @@ -173,22 +183,28 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3); int8 map_dataZ = convert_int8_sat_rte(map_data); int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset; + + uchar4 src_data = val; + uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0)); - uchar4 src_data; - + if (con.s0 == 0) src_data.s0 = *(src + srcIdx.s0); + if (con.s1 == 0) src_data.s1 = *(src + srcIdx.s1); + if (con.s2 == 0) src_data.s2 = *(src + srcIdx.s2); + if (con.s3 == 0) src_data.s3 = *(src + srcIdx.s3); uchar4 dst_data; - dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? 
(uchar4)(val) : src_data; + + // dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data; __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; + uchar4 dVal = *d; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - - dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal; + int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + + dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal; *d = dst_data; } } @@ -203,42 +219,24 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig if(x < threadCols && y < dst_rows) { - x = x << 4; - int gx = x - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); - uchar4 nval =convert_uchar4_sat_rte(nVal); + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 2) + map1_offset; + short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx)); + int srcIdx = map1_data.y * src_step + (map1_data.x << 2) + src_offset; + uchar4 nval = convert_uchar4(nVal); + uchar4 src_data; + if(map1_data.x >= src_cols || map1_data.y >= src_rows || map1_data.x <0 || map1_data.y < 0 ) + src_data = nval; + else + src_data = *((__global uchar4 *)((__global uchar *)src + srcIdx)); + *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = src_data; - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 ); - short8 map1_data; - map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset; - uchar4 src_a, src_b, src_c, src_d; - src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); - src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); - src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); - src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); - - uchar16 dst_data; - uchar4 dst_a, dst_b, dst_c, dst_d; - dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a; - dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b; - dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c; - dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d; - - dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); - __global uchar16* d = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *d; - - int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? 
dst_data : dVal; - - *d = dst_data; } + } + __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) @@ -248,43 +246,19 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig if(x < threadCols && y < dst_rows) { - x = x << 4; - int gx = x - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 3) + map1_offset; + float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx)); + int2 map1_dataZ = convert_int2_sat_rte(map1_data); + int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 2) + src_offset; + uchar4 nval = convert_uchar4(nVal); + uchar4 src_data; + if(map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows || map1_dataZ.x < 0 || map1_dataZ.y < 0) + src_data = nval; + else + src_data = *((__global uchar4 *)((__global uchar *)src + srcIdx)); + *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = src_data; - uchar4 nval =convert_uchar4(nVal); - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step +(x << 1) + map1_offset - ((dst_offset&15) << 1); - float8 map1_data; - - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - int8 map1_dataZ = convert_int8_sat_rte(map1_data); - - int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset; - uchar4 src_a, src_b, src_c, src_d; - src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); - src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); - src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); - src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); - - uchar16 dst_data; - uchar4 dst_a, dst_b, dst_c, dst_d; - dst_a = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? nval : src_a; - dst_b = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? nval : src_b; - dst_c = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? nval : src_c; - dst_d = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? nval : src_d; - - dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); - __global uchar16* d = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *d; - - int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? 
dst_data : dVal; - - *d = dst_data; } @@ -298,153 +272,72 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi int y = get_global_id(1); if(x < threadCols && y < dst_rows) - { - x = x << 4; - int gx = x - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); - - uchar4 nval =convert_uchar4(nVal); - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step + x + map1_offset - (dst_offset&15); - float4 map1_data; - float4 map2_data; - - map1_data = *((__global float4 *)((__global char*)map1 + map1Start)); - map2_data = *((__global float4 *)((__global char*)map2 + map1Start)); - float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3); - int8 map1_dataZ = convert_int8_sat_rte(map_data); - - int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset; - uchar4 src_a, src_b, src_c, src_d; - src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); - src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); - src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); - src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); - - uchar16 dst_data; - uchar4 dst_a, dst_b, dst_c, dst_d; - dst_a = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? nval : src_a; - dst_b = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? nval : src_b; - dst_c = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? nval : src_c; - dst_d = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? nval : src_d; - - dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); - __global uchar16* d = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *d; - - int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? 
dst_data : dVal; - - *d = dst_data; - + { + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 2) + map1_offset; + float map1_data = *((__global float *)((__global char*)map1 + mapIdx)); + float map2_data = *((__global float *)((__global char*)map2 + mapIdx)); + int srcIdx = convert_int_sat_rte(map2_data) * src_step + (convert_int_sat_rte(map1_data) << 2) + src_offset; + uchar4 nval = convert_uchar4(nVal); + uchar4 src_data; + if(convert_int_sat_rte(map1_data) >= src_cols || convert_int_sat_rte(map2_data) >= src_rows || convert_int_sat_rte(map1_data) < 0 || convert_int_sat_rte(map2_data) < 0) + src_data = nval; + else + src_data = *((__global uchar4 *)((__global uchar *)src + srcIdx)); + *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = src_data; } - } - __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * restrict src, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { - x = x << 4; - - int gx = x - (dst_offset&15); - int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - - float4 nval =convert_float4(nVal); - float val = nval.s0; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step + x + map1_offset - (dst_offset&15); - short8 map1_data; - - map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - - int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset; - - float4 src_data; - src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); - src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); - src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); - src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); - float4 dst_data; - - dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0; - dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1; - dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2; - dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3; - - - __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); - - float4 dVal = *d; - - int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != (float4)(0)) ? 
dst_data : dVal; - - *d = dst_data; + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 2) + map1_offset; + short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx)); + int srcIdx = map1_data.y * src_step + (map1_data.x << 2) + src_offset; + float nval = convert_float(nVal.x); + float src_data; + if(map1_data.x >= src_cols || map1_data.y >= src_rows|| map1_data.x < 0 || map1_data.y < 0) + src_data = nval; + else + src_data = *((__global float *)((__global uchar *)src + srcIdx)); + *((__global float *)((__global uchar*)dst + dstIdx)) = src_data; + } + } + __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { - x = x << 4; - - int gx = x - (dst_offset&15); - int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - - float4 nval =convert_float4(nVal); - float val = nval.s0; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset&15) << 1); - float8 map1_data; - - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - int8 map1_dataZ = convert_int8_sat_rte(map1_data); - - int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset; - - float4 src_data; - src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); - src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); - src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); - src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); - float4 dst_data; - - dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0; - dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1; - dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2; - dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3; - - - __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); - - float4 dVal = *d; - - int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != (float4)(0)) ? 
dst_data : dVal; - - *d = dst_data; + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 3) + map1_offset; + float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx)); + int2 map1_dataZ = convert_int2_sat_rte(map1_data); + int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 2) + src_offset; + float nval = convert_float(nVal.x); + float src_data; + if(map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows || map1_dataZ.x < 0 || map1_dataZ.y < 0) + src_data = nval; + else + src_data = *((__global float *)((__global uchar *)src + srcIdx)); + *((__global float *)((__global uchar*)dst + dstIdx)) = src_data; + } } @@ -455,52 +348,26 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { - x = x << 4; + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 2) + map1_offset; + float map1_data = *((__global float *)((__global char*)map1 + mapIdx)); + float map2_data = *((__global float *)((__global char*)map2 + mapIdx)); + float2 map_data = (float2)(map1_data, map2_data); + int2 map1_dataZ = convert_int2_sat_rte(map_data); + int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 2) + src_offset; + float nval = convert_float(nVal.x); + float src_data; - int gx = x - (dst_offset&15); - int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - - float4 nval =convert_float4(nVal); - float val = nval.s0; - - int dstStart = y * dst_step + x + dst_offset - (dst_offset&15); - - int map1Start = y * map1_step + x + map1_offset - (dst_offset&15); - float4 map1_data; - float4 map2_data; - - map1_data = *((__global float4 *)((__global char*)map1 + map1Start)); - map2_data = *((__global float4 *)((__global char*)map2 + map1Start)); - float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3); - int8 map1_dataZ = convert_int8_sat_rte(map_data); - - int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset; - - float4 src_data; - src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); - src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); - src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); - src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); - float4 dst_data; - - dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0; - dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1; - dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2; - dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3; - - - __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); - - float4 dVal = *d; - - int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != (float4)(0)) ? 
dst_data : dVal; - - *d = dst_data; + if(map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows || map1_dataZ.x < 0 || map1_dataZ.y < 0) + src_data = nval; + else + src_data = *((__global float *)((__global uchar *)src + srcIdx)); + *((__global float *)((__global uchar*)dst + dstIdx)) = src_data; + } } @@ -517,13 +384,19 @@ __kernel void remapNNSConstant_C4_D5(__global float * dst, __global float const int dstIdx = y * dst_step + (x << 4) + dst_offset ; int mapIdx = y * map1_step + (x << 2) + map1_offset ; short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx)); - int srcIdx = map1_data.y * src_step + (map1_data.x << 4) + src_offset; float4 nval = convert_float4(nVal); - float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); - *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_data.x >= src_cols || map1_data.y >= src_rows) ? nval : src_data; + float4 src_data; + if (map1_data.x <0 || map1_data.x >= src_cols || map1_data.y <0 || map1_data.y >= src_rows) + src_data = nval; + else + src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data; + + } } + __kernel void remapNNFConstant_C4_D5(__global float * dst, __global float const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) @@ -539,8 +412,10 @@ __kernel void remapNNFConstant_C4_D5(__global float * dst, __global float const int2 map1_dataZ = convert_int2_sat_rte(map1_data); int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset; float4 nval = convert_float4(nVal); - float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); - *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows) ? nval : src_data; + float4 src_data = nval; + if(map1_dataZ.x >= 0 && map1_dataZ.x < src_cols && map1_dataZ.y >=0 && map1_dataZ.y < src_rows) + src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data; } } @@ -561,8 +436,10 @@ __kernel void remapNNF1Constant_C4_D5(__global float * dst, __global float const int2 map1_dataZ = convert_int2_sat_rte(map_data); int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset; float4 nval = convert_float4(nVal); - float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); - *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows) ? 
nval : src_data; + float4 src_data = nval; + if(map1_dataZ.x >= 0 && map1_dataZ.x < src_cols && map1_dataZ.y >= 0 && map1_dataZ.y < src_rows) + src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data; } } @@ -577,13 +454,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 2; + x = x << 2; int gx = x - (dst_offset&3); int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); uchar4 nval =convert_uchar4(nVal); - uchar val = nval.s0; - + uchar4 val = (uchar4)(nval.s0); + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); @@ -604,66 +481,49 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig int4 map1_dataDy = map1_dataD.odd; int4 map1_dataDx1 = map1_dataDx + (int4)(1); int4 map1_dataDy1 = map1_dataDy + (int4)(1); + uchar4 a = val, b = val, c = val, d =val; - int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset; - int4 src_StartD = src_StartU + src_step; - /* - //not using the vload - int4 src_StartU1 = src_StartU + (int4)(1); - int4 src_StartD1 = src_StartD + (int4)(1); + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + a.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx.s0 + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + a.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx.s1 + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + a.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx.s2 + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + a.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx.s3 + src_offset)); - uchar4 a, b, c, d; - a.x = *(src_StartU.x + src); - a.y = *(src_StartU.y + src); - a.z = *(src_StartU.z + src); - a.w = *(src_StartU.w + src); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + b.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx1.s0 + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + b.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx1.s1 + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + b.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx1.s2 + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + b.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx1.s3 + src_offset)); - b.x = *(src_StartU1.x + src); - b.y = *(src_StartU1.y + src); - b.z = *(src_StartU1.z + src); - b.w = *(src_StartU1.w + src); - - c.x = *(src_StartD.x + src); - c.y = *(src_StartD.y + src); - c.z = *(src_StartD.z + src); - c.w = *(src_StartD.w + src); - - d.x = *(src_StartD1.x + src); - d.y = *(src_StartD1.y + src); - d.z = *(src_StartD1.z + src); - d.w = *(src_StartD1.w + src); - */ - uchar2 aU, aD, bU, bD, cU, cD, dU, 
dD; - - aU = vload2(0, src + src_StartU.s0); - bU = vload2(0, src + src_StartU.s1); - cU = vload2(0, src + src_StartU.s2); - dU = vload2(0, src + src_StartU.s3); - aD = vload2(0, src + src_StartD.s0); - bD = vload2(0, src + src_StartD.s1); - cD = vload2(0, src + src_StartD.s2); - dD = vload2(0, src + src_StartD.s3); - - uchar4 a, b, c, d; - a = (uchar4)(aU.x, bU.x, cU.x, dU.x); - b = (uchar4)(aU.y, bU.y, cU.y, dU.y); - c = (uchar4)(aD.x, bD.x, cD.x, dD.x); - d = (uchar4)(aD.y, bD.y, cD.y, dD.y); - - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - a = (convert_uchar4(ac) == (uchar4)(0))? a : val; - b = (convert_uchar4(bc) == (uchar4)(0))? b : val; - c = (convert_uchar4(cc) == (uchar4)(0))? c : val; - d = (convert_uchar4(dc) == (uchar4)(0))? d : val; + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + c.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx.s0 + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + c.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx.s1 + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + c.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx.s2 + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + c.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx.s3 + src_offset)); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + d.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx1.s0 + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + d.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx1.s1 + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset)); + uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); - + __global uchar4* D = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *D; + uchar4 dVal = *D; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); dst_data = (convert_uchar4(con) != (uchar4)(0)) ? 
dst_data : dVal; @@ -680,13 +540,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 2; + x = x << 2; int gx = x - (dst_offset&3); int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); uchar4 nval =convert_uchar4(nVal); - uchar val = nval.s0; - + uchar4 val = (uchar4)(nval.s0); + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); @@ -711,65 +571,49 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi int4 map1_dataDx1 = map1_dataDx + (int4)(1); int4 map1_dataDy1 = map1_dataDy + (int4)(1); - int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset; - int4 src_StartD = src_StartU + src_step; - /* - //not using the vload - int4 src_StartU1 = src_StartU + (int4)(1); - int4 src_StartD1 = src_StartD + (int4)(1); + uchar4 a = val, b = val, c = val, d =val; + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + a.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx.s0 + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + a.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx.s1 + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + a.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx.s2 + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + a.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx.s3 + src_offset)); - uchar4 a, b, c, d; - a.x = *(src_StartU.x + src); - a.y = *(src_StartU.y + src); - a.z = *(src_StartU.z + src); - a.w = *(src_StartU.w + src); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + b.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx1.s0 + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + b.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx1.s1 + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + b.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx1.s2 + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + b.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx1.s3 + src_offset)); - b.x = *(src_StartU1.x + src); - b.y = *(src_StartU1.y + src); - b.z = *(src_StartU1.z + src); - b.w = *(src_StartU1.w + src); + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + c.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx.s0 + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + c.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx.s1 + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + c.s2 = *((__global uchar*)((__global 
uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx.s2 + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + c.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx.s3 + src_offset)); - c.x = *(src_StartD.x + src); - c.y = *(src_StartD.y + src); - c.z = *(src_StartD.z + src); - c.w = *(src_StartD.w + src); - - d.x = *(src_StartD1.x + src); - d.y = *(src_StartD1.y + src); - d.z = *(src_StartD1.z + src); - d.w = *(src_StartD1.w + src); - */ - uchar2 aU, aD, bU, bD, cU, cD, dU, dD; - - aU = vload2(0, src + src_StartU.s0); - bU = vload2(0, src + src_StartU.s1); - cU = vload2(0, src + src_StartU.s2); - dU = vload2(0, src + src_StartU.s3); - aD = vload2(0, src + src_StartD.s0); - bD = vload2(0, src + src_StartD.s1); - cD = vload2(0, src + src_StartD.s2); - dD = vload2(0, src + src_StartD.s3); - - uchar4 a, b, c, d; - a = (uchar4)(aU.x, bU.x, cU.x, dU.x); - b = (uchar4)(aU.y, bU.y, cU.y, dU.y); - c = (uchar4)(aD.x, bD.x, cD.x, dD.x); - d = (uchar4)(aD.y, bD.y, cD.y, dD.y); - - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - a = (convert_uchar4(ac) == (uchar4)(0))? a : val; - b = (convert_uchar4(bc) == (uchar4)(0))? b : val; - c = (convert_uchar4(cc) == (uchar4)(0))? c : val; - d = (convert_uchar4(dc) == (uchar4)(0))? d : val; + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + d.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx1.s0 + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + d.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx1.s1 + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset)); + uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); - + __global uchar4* D = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *D; + uchar4 dVal = *D; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); dst_data = (convert_uchar4(con) != (uchar4)(0)) ? 
dst_data : dVal; @@ -778,53 +622,6 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi } -__kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, - __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < threadCols && y < dst_rows) - { - x = x << 2; - int gx = x - (dst_offset&3); - int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); - - uchar4 nval =convert_uchar4(nVal); - uchar val = nval.s0; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); - - int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); - short8 map1_data; - - map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset; - - uchar4 src_data; - - src_data.s0 = *(src + srcIdx.s0); - src_data.s1 = *(src + srcIdx.s1); - src_data.s2 = *(src + srcIdx.s2); - src_data.s3 = *(src + srcIdx.s3); - uchar4 dst_data; - dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data; - - __global uchar4* d = (__global uchar4 *)(dst + dstStart); - - uchar4 dVal = *d; - - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; - - *d = dst_data; - - } - -} - - __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, @@ -835,84 +632,41 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 4; - int gx = x - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 3) + map1_offset; + float2 map_data = *((__global float2 *)((__global char*)map1 + mapIdx)); + int2 map_dataA = convert_int2(map_data); + float2 u = map_data - convert_float2(map_dataA); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); + uchar4 nval = convert_uchar4(nVal); + uchar4 a, b, c , d; + if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0) + a = nval; + else + a = *((__global uchar4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<2) + src_offset )); + if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0) + b = nval; + else + b = *((__global uchar4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<2) + src_offset )); - uchar4 nval =convert_uchar4(nVal); + if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0) + c = nval; + else + c = *((__global uchar4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<2) + src_offset )); - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + if(map_dataD.x < 0 || map_dataD.x >= src_cols || 
map_dataD.y >= src_rows || map_dataD.y < 0) + d = nval; + else + d = *((__global uchar4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<2) + src_offset )); + float4 dst_data = convert_float4(a)*((float4)(1.0-u.x)*((float4)(1.0-u.y))) + convert_float4(b)*((float4)(u.x))*((float4)(1.0-u.y)) + convert_float4(c)*((float4)(1.0-u.x))*((float4)(u.y)) + convert_float4(d)*((float4)(u.x))*((float4)(u.y)); + *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = convert_uchar4_sat_rte(dst_data); - int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1); - float8 map1_data; - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - int8 map1_dataD = convert_int8(map1_data); - float8 temp = map1_data - convert_float8(map1_dataD); - - float4 u = temp.even; - float4 v = temp.odd; - float4 ud = (float4)(1.0) - u; - float4 vd = (float4)(1.0) - v; - - //float8 map1_dataU = map1_dataD + 1; - - int4 map1_dataDx = map1_dataD.even; - int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + (int4)(1); - int4 map1_dataDy1 = map1_dataDy + (int4)(1); - - int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset; - int4 src_StartD = src_StartU + src_step; - - uchar8 aU, bU, cU, dU, aD, bD, cD, dD; - aU = vload8(0, src + src_StartU.s0); - bU = vload8(0, src + src_StartU.s1); - cU = vload8(0, src + src_StartU.s2); - dU = vload8(0, src + src_StartU.s3); - aD = vload8(0, src + src_StartD.s0); - bD = vload8(0, src + src_StartD.s1); - cD = vload8(0, src + src_StartD.s2); - dD = vload8(0, src + src_StartD.s3); - uchar16 a, b, c, d; - a = (uchar16)(aU.s0123, bU.s0123, cU.s0123, dU.s0123); - b = (uchar16)(aU.s4567, bU.s4567, cU.s4567, dU.s4567); - c = (uchar16)(aD.s0123, bD.s0123, cD.s0123, dD.s0123); - d = (uchar16)(aD.s4567, bD.s4567, cD.s4567, dD.s4567); - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - - int16 acc = (int16)((int4)(ac.x), (int4)(ac.y), (int4)(ac.z), (int4)(ac.w)); - int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w)); - int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w)); - int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w)); - - uchar16 val = (uchar16)(nval, nval, nval, nval); - a = (convert_uchar16(acc) == (uchar16)(0))? a : val; - b = (convert_uchar16(bcc) == (uchar16)(0))? b : val; - c = (convert_uchar16(ccc) == (uchar16)(0))? c : val; - d = (convert_uchar16(dcc) == (uchar16)(0))? 
d : val; - - float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w)); - float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w)); - float16 Ud = (float16)((float4)(ud.x), (float4)(ud.y), (float4)(ud.z), (float4)(ud.w)); - float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w)); - - uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V ); - - __global uchar16* D = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *D; - int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal; - - *D = dst_data; } + } - - __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global float * map1, __global float * map2, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) @@ -922,133 +676,45 @@ __kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsi int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 4; - int gx = x - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + int dstIdx = y * dst_step + (x << 2) + dst_offset; + int mapIdx = y * map1_step + (x << 2) + map1_offset; + float map1_data = *((__global float *)((__global char*)map1 + mapIdx)); + float map2_data = *((__global float *)((__global char*)map2 + mapIdx)); + float2 map_data = (float2)(map1_data, map2_data); + int2 map_dataA = convert_int2(map_data); + float2 u = map_data - convert_float2(map_dataA); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); + uchar4 nval = convert_uchar4(nVal); + uchar4 a, b, c , d; + if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0) + a = nval; + else + a = *((__global uchar4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<2) + src_offset )); + if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0) + b = nval; + else + b = *((__global uchar4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<2) + src_offset )); - uchar4 nval =convert_uchar4(nVal); + if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0) + c = nval; + else + c = *((__global uchar4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<2) + src_offset )); - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0) + d = nval; + else + d = *((__global uchar4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<2) + src_offset )); + float4 dst_data = convert_float4(a)*((float4)(1.0-u.x)*((float4)(1.0-u.y))) + convert_float4(b)*((float4)(u.x))*((float4)(1.0-u.y)) + convert_float4(c)*((float4)(1.0-u.x))*((float4)(u.y)) + convert_float4(d)*((float4)(u.x))*((float4)(u.y)); + *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = convert_uchar4_sat_rte(dst_data); - int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15); - float4 
map1_data; - float4 map2_data; - map1_data = *((__global float4 *)((__global char*)map1 + map1Start)); - map2_data = *((__global float4 *)((__global char*)map2 + map1Start)); - float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3); - int8 map1_dataD = convert_int8(map_data); - float8 temp = map_data - convert_float8(map1_dataD); - float4 u = temp.even; - float4 v = temp.odd; - float4 ud = (float4)(1.0) - u; - float4 vd = (float4)(1.0) - v; - - //float8 map1_dataU = map1_dataD + 1; - - int4 map1_dataDx = map1_dataD.even; - int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + (int4)(1); - int4 map1_dataDy1 = map1_dataDy + (int4)(1); - - int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset; - int4 src_StartD = src_StartU + src_step; - - uchar8 aU, bU, cU, dU, aD, bD, cD, dD; - aU = vload8(0, src + src_StartU.s0); - bU = vload8(0, src + src_StartU.s1); - cU = vload8(0, src + src_StartU.s2); - dU = vload8(0, src + src_StartU.s3); - aD = vload8(0, src + src_StartD.s0); - bD = vload8(0, src + src_StartD.s1); - cD = vload8(0, src + src_StartD.s2); - dD = vload8(0, src + src_StartD.s3); - uchar16 a, b, c, d; - a = (uchar16)(aU.s0123, bU.s0123, cU.s0123, dU.s0123); - b = (uchar16)(aU.s4567, bU.s4567, cU.s4567, dU.s4567); - c = (uchar16)(aD.s0123, bD.s0123, cD.s0123, dD.s0123); - d = (uchar16)(aD.s4567, bD.s4567, cD.s4567, dD.s4567); - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - - int16 acc = (int16)((int4)(ac.x), (int4)(ac.y), (int4)(ac.z), (int4)(ac.w)); - int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w)); - int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w)); - int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w)); - - uchar16 val = (uchar16)(nval, nval, nval, nval); - a = (convert_uchar16(acc) == (uchar16)(0))? a : val; - b = (convert_uchar16(bcc) == (uchar16)(0))? b : val; - c = (convert_uchar16(ccc) == (uchar16)(0))? c : val; - d = (convert_uchar16(dcc) == (uchar16)(0))? d : val; - - float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w)); - float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w)); - float16 Ud = (float16)((float4)(ud.x), (float4)(ud.y), (float4)(ud.z), (float4)(ud.w)); - float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w)); - - uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V ); - - __global uchar16* D = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *D; - int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)(0)) ? 
dst_data : dVal; - - *D = dst_data; } } -__kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, - __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < threadCols && y < dst_rows) - { - x = x << 4; - int gx = x - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); - uchar4 nval =convert_uchar4_sat_rte(nVal); - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 ); - short8 map1_data; - - map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<(int4)(2)) + src_offset; - uchar4 src_a, src_b, src_c, src_d; - src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); - src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); - src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); - src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); - - uchar16 dst_data; - uchar4 dst_a, dst_b, dst_c, dst_d; - dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a; - dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b; - dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c; - dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d; - - dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); - __global uchar16* d = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *d; - - int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)(0)) ? 
dst_data : dVal; - - *d = dst_data; - - } - -} __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, @@ -1059,13 +725,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 4; + x = x << 4; int gx = x - (dst_offset&15); int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); float4 nval =convert_float4(nVal); float4 val = (float4)(nval.s0); - + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1); float8 map1_data; @@ -1085,65 +751,48 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * int4 map1_dataDx1 = map1_dataDx + (int4)(1); int4 map1_dataDy1 = map1_dataDy + (int4)(1); - int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset; - int4 src_StartD = src_StartU + src_step; - /* - //not using the vload - int4 src_StartU1 = src_StartU + (int4)(1); - int4 src_StartD1 = src_StartD + (int4)(1); + float4 a = val, b = val, c = val, d = val; + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + a.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + a.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + a.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + a.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset)); - float4 a, b, c, d; - a.x = *(src_StartU.x + src); - a.y = *(src_StartU.y + src); - a.z = *(src_StartU.z + src); - a.w = *(src_StartU.w + src); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + b.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + b.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + b.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + b.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset)); - b.x = *(src_StartU1.x + src); - b.y = *(src_StartU1.y + src); - b.z = *(src_StartU1.z + src); - b.w = *(src_StartU1.w + src); - - c.x = *(src_StartD.x + src); - c.y = *(src_StartD.y + src); - c.z = *(src_StartD.z + src); - c.w = *(src_StartD.w + src); - - d.x = *(src_StartD1.x + src); - d.y = *(src_StartD1.y + src); - d.z = *(src_StartD1.z + 
src); - d.w = *(src_StartD1.w + src); - */ - float2 aU, aD, bU, bD, cU, cD, dU, dD; - - aU = vload2(0, (__global float *)((__global char*)src + src_StartU.s0)); - bU = vload2(0, (__global float *)((__global char*)src + src_StartU.s1)); - cU = vload2(0, (__global float *)((__global char*)src + src_StartU.s2)); - dU = vload2(0, (__global float *)((__global char*)src + src_StartU.s3)); - aD = vload2(0, (__global float *)((__global char*)src + src_StartD.s0)); - bD = vload2(0, (__global float *)((__global char*)src + src_StartD.s1)); - cD = vload2(0, (__global float *)((__global char*)src + src_StartD.s2)); - dD = vload2(0, (__global float *)((__global char*)src + src_StartD.s3)); - - float4 a, b, c, d; - a = (float4)(aU.x, bU.x, cU.x, dU.x); - b = (float4)(aU.y, bU.y, cU.y, dU.y); - c = (float4)(aD.x, bD.x, cD.x, dD.x); - d = (float4)(aD.y, bD.y, cD.y, dD.y); - - int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0)); - int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0)); - int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0)); - int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDy1 < (int4)(0)); - a = (convert_float4(ac) == (float4)(0))? a : val; - b = (convert_float4(bc) == (float4)(0))? b : val; - c = (convert_float4(cc) == (float4)(0))? c : val; - d = (convert_float4(dc) == (float4)(0))? d : val; + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + c.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + c.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + c.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + c.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset)); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + d.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + d.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset)); + float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; - + 
__global float4* D = (__global float4 *)((__global char*)dst + dstStart); - float4 dVal = *D; + float4 dVal = *D; int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; @@ -1160,13 +809,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 4; + x = x << 4; int gx = x - (dst_offset&15); int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); float4 nval =convert_float4(nVal); float4 val = (float4)(nval.s0); - + int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15); int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15); float4 map1_data; @@ -1189,65 +838,49 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const int4 map1_dataDx1 = map1_dataDx + (int4)(1); int4 map1_dataDy1 = map1_dataDy + (int4)(1); - int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset; - int4 src_StartD = src_StartU + src_step; - /* - //not using the vload - int4 src_StartU1 = src_StartU + (int4)(1); - int4 src_StartD1 = src_StartD + (int4)(1); + float4 a = val, b = val, c = val, d = val; + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + a.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + a.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + a.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + a.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset)); - float4 a, b, c, d; - a.x = *(src_StartU.x + src); - a.y = *(src_StartU.y + src); - a.z = *(src_StartU.z + src); - a.w = *(src_StartU.w + src); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0) + b.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0) + b.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0) + b.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0) + b.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset)); - b.x = *(src_StartU1.x + src); - b.y = *(src_StartU1.y + src); - b.z = *(src_StartU1.z + src); - b.w = *(src_StartU1.w + src); - - c.x = *(src_StartD.x + src); - c.y = *(src_StartD.y + src); - c.z = *(src_StartD.z + src); - c.w = *(src_StartD.w + src); - - d.x = *(src_StartD1.x + src); - d.y = *(src_StartD1.y + src); - d.z = *(src_StartD1.z + src); - 
d.w = *(src_StartD1.w + src); - */ - float2 aU, aD, bU, bD, cU, cD, dU, dD; - - aU = vload2(0, (__global float *)((__global char*)src + src_StartU.s0)); - bU = vload2(0, (__global float *)((__global char*)src + src_StartU.s1)); - cU = vload2(0, (__global float *)((__global char*)src + src_StartU.s2)); - dU = vload2(0, (__global float *)((__global char*)src + src_StartU.s3)); - aD = vload2(0, (__global float *)((__global char*)src + src_StartD.s0)); - bD = vload2(0, (__global float *)((__global char*)src + src_StartD.s1)); - cD = vload2(0, (__global float *)((__global char*)src + src_StartD.s2)); - dD = vload2(0, (__global float *)((__global char*)src + src_StartD.s3)); - - float4 a, b, c, d; - a = (float4)(aU.x, bU.x, cU.x, dU.x); - b = (float4)(aU.y, bU.y, cU.y, dU.y); - c = (float4)(aD.x, bD.x, cD.x, dD.x); - d = (float4)(aD.y, bD.y, cD.y, dD.y); - - int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0)); - int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0)); - int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0)); - int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDy1 < (int4)(0)); - a = (convert_float4(ac) == (float4)(0))? a : val; - b = (convert_float4(bc) == (float4)(0))? b : val; - c = (convert_float4(cc) == (float4)(0))? c : val; - d = (convert_float4(dc) == (float4)(0))? d : val; + if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + c.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset)); + if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + c.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset)); + if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + c.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset)); + if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + c.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset)); + if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0) + d.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset)); + if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0) + d.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset)); + if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0) + d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset)); + if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) + d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset)); + + float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; - + 
__global float4* D = (__global float4 *)((__global char*)dst + dstStart); - float4 dVal = *D; + float4 dVal = *D; int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; @@ -1255,57 +888,6 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const } } -__kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const * restrict src, - __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < threadCols && y < dst_rows) - { - x = x << 4; - - int gx = x - (dst_offset&15); - int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - - float4 nval =convert_float4(nVal); - float val = nval.s0; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step + x + map1_offset - (dst_offset&15); - short8 map1_data; - - map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - - int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset; - - float4 src_data; - src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); - src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); - src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); - src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); - float4 dst_data; - - dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0; - dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1; - dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2; - dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3; - - - __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); - - float4 dVal = *d; - - int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != (float4)(0)) ? 
dst_data : dVal; - - *d = dst_data; - - } - -} __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const * restrict src, @@ -1319,37 +901,35 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const { int dstIdx = y * dst_step + (x << 4) + dst_offset ; int mapIdx = y * map1_step + (x << 3) + map1_offset ; - float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx)); - - int2 map1_dataZ = convert_int2(map1_data); - - int mX = map1_dataZ.x; - int mY = map1_dataZ.y; - int mX1 = map1_dataZ.x + 1; - int mY1 = map1_dataZ.y + 1; - - float u = map1_data.x - convert_float(map1_dataZ.x); - float v = map1_data.y - convert_float(map1_dataZ.y); - float ud = 1.0 - u; - float vd = 1.0 - v; - - int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset; - float8 src_dataU = vload8(0,(__global float *)((__global char*)src + srcIdx)); - float8 src_dataD = vload8(0,(__global float *)((__global char*)src + srcIdx + src_step)); - - float4 a = src_dataU.lo; - float4 b = src_dataU.hi; - float4 c = src_dataD.lo; - float4 d = src_dataD.hi; - + float2 map_data = *((__global float2 *)((__global char*)map1 + mapIdx)); + int2 map_dataA = convert_int2(map_data); + float2 u = map_data - convert_float2(map_dataA); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); float4 nval = convert_float4(nVal); - a = (mX >= src_cols || mY >= src_rows ) ? nval : a; - b = (mX1 >= src_cols || mY >= src_rows ) ? nval : b; - c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c; - d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d; + float4 a, b, c , d; + if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0) + a = nval; + else + a = *((__global float4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<4) + src_offset )); + if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0) + b = nval; + else + b = *((__global float4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<4) + src_offset )); - float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v; - *((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; + if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0) + c = nval; + else + c = *((__global float4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<4) + src_offset )); + + if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0) + d = nval; + else + d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset )); + + float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ; } } @@ -1368,87 +948,38 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const float map1_data = *((__global float *)((__global char*)map1 + mapIdx)); float map2_data = *((__global float *)((__global char*)map2 + mapIdx)); float2 map_data = (float2)(map1_data, map2_data); - int2 map1_dataZ = convert_int2(map_data); - - int mX = map1_dataZ.x; - int mY = map1_dataZ.y; - int mX1 = map1_dataZ.x + 1; - int mY1 = map1_dataZ.y + 1; - - float u = map1_data 
- convert_float(map1_dataZ.x); - float v = map2_data - convert_float(map1_dataZ.y); - float ud = 1.0 - u; - float vd = 1.0 - v; - - int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset; - float8 src_dataU = vload8(0,(__global float *)((__global char*)src + srcIdx)); - float8 src_dataD = vload8(0,(__global float *)((__global char*)src + srcIdx + src_step)); - - float4 a = src_dataU.lo; - float4 b = src_dataU.hi; - float4 c = src_dataD.lo; - float4 d = src_dataD.hi; - + int2 map_dataA = convert_int2(map_data); + float2 u = map_data - convert_float2(map_dataA); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); float4 nval = convert_float4(nVal); - a = (mX >= src_cols || mY >= src_rows ) ? nval : a; - b = (mX1 >= src_cols || mY >= src_rows ) ? nval : b; - c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c; - d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d; + float4 a, b, c , d; + if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0) + a = nval; + else + a = *((__global float4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<4) + src_offset )); + if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0) + b = nval; + else + b = *((__global float4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<4) + src_offset )); + + if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0) + c = nval; + else + c = *((__global float4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<4) + src_offset )); + + if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0) + d = nval; + else + d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset )); + + float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ; - float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v; - *((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; } } -/* -//////////////////////////////////////////////////////////////////////// -///////////////////using image buffer/////////////////////////////////// -//////////////////////////////////////////////////////////////////////// - -__kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only image2d_t src, - __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - x = x << 2; - if(x < threadCols && y < dst_rows) - { - int gx = x - (dst_offset&3); - int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); - - uchar4 nval =convert_uchar4(nVal); - char val = nval.s0; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); - - int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); - short8 map1_data; - - map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | - CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; - - 
int4 src_data; - src_data.x = read_imageui(src, sampler, (int2)((int)map1_data.s0, (int)map1_data.s1)).x; - src_data.y = read_imageui(src, sampler, (int2)((int)map1_data.s2, (int)map1_data.s3)).x; - src_data.z = read_imageui(src, sampler, (int2)((int)map1_data.s4, (int)map1_data.s5)).x; - src_data.w = read_imageui(src, sampler, (int2)((int)map1_data.s6, (int)map1_data.s7)).x; - - int4 bcon = (convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)); - uchar4 dst_data = (convert_uchar4(bcon != 0)) ? (uchar4)(val) : convert_uchar4(src_data); - - __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; - - *d = dst_data; - } -} -*/ diff --git a/modules/ocl/src/kernels/split_mat.cl b/modules/ocl/src/kernels/split_mat.cl index cdf735b6e9..3c70859264 100644 --- a/modules/ocl/src/kernels/split_mat.cl +++ b/modules/ocl/src/kernels/split_mat.cl @@ -51,9 +51,9 @@ ////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)////// //////////////////////////////////////////////////////////////////////////////////////////////// __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - __global uchar *mat_dst2, int dst2_step, int dst2_offset, + __global uchar *mat_dst0, int dst0_step, int dst0_offset, + __global uchar *mat_dst1, int dst1_step, int dst1_offset, + __global uchar *mat_dst2, int dst2_step, int dst2_offset, __global uchar *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset + (x << 2)); + int src_idx = mad24(y, src_step, src_offset + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc; - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc; - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc; - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc; + + uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); + uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx))); + uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx))); + uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); - uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? 
src_idx - 12 : src_idx))); - uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx))); - uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx))); - uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); - - int total_bytes = src_offset + rows * src_step; - uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx))); - uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx))); - uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx))); + int total_bytes = src_offset + rows * src_step; + uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx))); + uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx))); + uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx))); uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; @@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s } __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - __global uchar *mat_dst2, int dst2_step, int dst2_offset, + __global uchar *mat_dst0, int dst0_step, int dst0_offset, + __global uchar *mat_dst1, int dst1_step, int dst1_offset, + __global uchar *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - + uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx)); @@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; int index = 3 - dst0_offset & 3; - tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); + tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); uchar4 data0, data1, data2; - + data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10); data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; data2 = (dst1_offset & 3) == 1 ? 
(uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; @@ -263,33 +263,47 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s } __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, + __global uchar *mat_dst0, int dst0_step, int dst0_offset, + __global uchar *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - + + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; uchar8 src_data_0 = vload8(0, mat_src + src_idx_0); uchar8 src_data_1 = vload8(0, mat_src + src_idx_1); + if(src_idx_0 == -6) + src_data_0.s01234567 = src_data_0.s67012345; + if(src_idx_0 == -4) + src_data_0.s01234567 = src_data_0.s45670123; + if(src_idx_0 == -2) + src_data_0.s01234567 = src_data_0.s23456701; + if(src_idx_1 == -6) + src_data_1.s01234567 = src_data_1.s67012345; + if(src_idx_1 == -4) + src_data_1.s01234567 = src_data_1.s45670123; + if(src_idx_1 == -2) + src_data_1.s01234567 = src_data_1.s23456701; uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); @@ -312,9 +326,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s } __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - __global char *mat_dst2, int dst2_step, int dst2_offset, + __global char *mat_dst0, int dst0_step, int dst0_offset, + __global char *mat_dst1, int dst1_step, int dst1_offset, + __global char *mat_dst2, int dst2_step, int dst2_offset, __global char *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -322,35 +336,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset + (x << 2)); + int src_idx = mad24(y, src_step, src_offset + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, 
dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc); - - char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); - char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); - char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); - char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); - char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); - char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); - char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); + + char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); + char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); + char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); + char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); + char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); + char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); + char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; @@ -423,33 +437,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr } __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - __global char *mat_dst2, int dst2_step, int dst2_offset, + __global char *mat_dst0, int dst0_step, int dst0_offset, + __global char *mat_dst1, int dst1_step, int dst1_offset, + __global char *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - + char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx)); @@ -486,10 +500,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; int index = 3 - dst0_offset & 3; - 
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); + tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); char4 data0, data1, data2; - + data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10); data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; @@ -522,34 +536,46 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr } __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, + __global char *mat_dst0, int dst0_step, int dst0_offset, + __global char *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 
0 : src_idx_1; char8 src_data_0 = vload8(0, mat_src + src_idx_0); char8 src_data_1 = vload8(0, mat_src + src_idx_1); - + if(src_idx_0 == -6) + src_data_0.s01234567 = src_data_0.s67012345; + if(src_idx_0 == -4) + src_data_0.s01234567 = src_data_0.s45670123; + if(src_idx_0 == -2) + src_data_0.s01234567 = src_data_0.s23456701; + if(src_idx_1 == -6) + src_data_1.s01234567 = src_data_1.s67012345; + if(src_idx_1 == -4) + src_data_1.s01234567 = src_data_1.s45670123; + if(src_idx_1 == -2) + src_data_1.s01234567 = src_data_1.s23456701; char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); @@ -571,9 +597,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr } __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - __global ushort *mat_dst2, int dst2_step, int dst2_offset, + __global ushort *mat_dst0, int dst0_step, int dst0_offset, + __global ushort *mat_dst1, int dst1_step, int dst1_offset, + __global ushort *mat_dst2, int dst2_step, int dst2_offset, __global ushort *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -581,30 +607,37 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); - int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); + int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); + int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); - - ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0)); + + int src1_index_fix = src_idx_0 < 0 ? 
0 : src_idx_0; + ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0)); + if(src_idx_0 == -6) + src_data0.s01234567 = src_data0.s67012345; + if(src_idx_0 == -4) + src_data0.s01234567 = src_data0.s45670123; + if(src_idx_0 == -2) + src_data0.s01234567 = src_data0.s23456701; ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1)); ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); @@ -639,33 +672,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int } __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - __global ushort *mat_dst2, int dst2_step, int dst2_offset, + __global ushort *mat_dst0, int dst0_step, int dst0_offset, + __global ushort *mat_dst1, int dst1_step, int dst1_offset, + __global ushort *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - + ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)); @@ -702,34 +735,48 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int } __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, + __global ushort *mat_dst0, int dst0_step, int dst0_offset, + __global ushort *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, 
dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0)); - ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1)); - + + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; + ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix)); + ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix)); + if(src_idx_0 < 0) + { + ushort4 tmp; + tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; + src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw; + } + if(src_idx_1 < 0) + { + ushort4 tmp; + tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx; + src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw; + } + ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); @@ -746,9 +793,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int } } __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, - __global short *mat_dst2, int dst2_step, int dst2_offset, + __global short *mat_dst0, int dst0_step, int dst0_offset, + __global short *mat_dst1, int dst1_step, int dst1_offset, + __global short *mat_dst2, int dst2_step, int dst2_offset, __global short *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -756,30 +803,38 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); - int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); + int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); + int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); - - short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0)); + int src1_index_fix = src_idx_0 < 0 ? 
0 : src_idx_0; + short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0)); + + if(src_idx_0 == -6) + src_data0.s01234567 = src_data0.s67012345; + if(src_idx_0 == -4) + src_data0.s01234567 = src_data0.s45670123; + if(src_idx_0 == -2) + src_data0.s01234567 = src_data0.s23456701; + short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1)); short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); @@ -813,33 +868,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s } } __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, - __global short *mat_dst2, int dst2_step, int dst2_offset, + __global short *mat_dst0, int dst0_step, int dst0_offset, + __global short *mat_dst1, int dst1_step, int dst1_offset, + __global short *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - + short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)); @@ -877,33 +932,47 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, + __global short *mat_dst0, int dst0_step, int dst0_offset, + __global short *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int 
dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0)); short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1)); + if(src_idx_0 < 0) + { + short4 tmp; + tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; + src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw; + } + if(src_idx_1< 0) + { + short4 tmp; + tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx; + src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw; + } + short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); @@ -921,9 +990,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s } } __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - __global int *mat_dst2, int dst2_step, int dst2_offset, + __global int *mat_dst0, int dst0_step, int dst0_offset, + __global int *mat_dst1, int dst1_step, int dst1_offset, + __global int *mat_dst2, int dst2_step, int dst2_offset, __global int *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -931,14 +1000,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset); - + int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x]; ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -948,18 +1017,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src } } __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - __global int *mat_dst2, int dst2_step, int dst2_offset, + __global int *mat_dst0, int dst0_step, int dst0_offset, + __global int *mat_dst1, int dst1_step, int dst1_offset, + __global int *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); @@ -975,20 +1044,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src } __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, + __global int *mat_dst0, 
int dst0_step, int dst0_offset, + __global int *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); - + int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x]; ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -997,9 +1066,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src } __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - __global float *mat_dst2, int dst2_step, int dst2_offset, + __global float *mat_dst0, int dst0_step, int dst0_offset, + __global float *mat_dst1, int dst1_step, int dst1_offset, + __global float *mat_dst2, int dst2_step, int dst2_offset, __global float *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -1007,14 +1076,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset); - + float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x]; ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1025,18 +1094,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s } __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - __global float *mat_dst2, int dst2_step, int dst2_offset, + __global float *mat_dst0, int dst0_step, int dst0_offset, + __global float *mat_dst1, int dst1_step, int dst1_offset, + __global float *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); @@ -1052,20 +1121,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s } __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, + __global float *mat_dst0, int dst0_step, int dst0_offset, + __global float *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = 
mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); - + float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x]; ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1075,9 +1144,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s #if defined (DOUBLE_SUPPORT) __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - __global double *mat_dst2, int dst2_step, int dst2_offset, + __global double *mat_dst0, int dst0_step, int dst0_offset, + __global double *mat_dst1, int dst1_step, int dst1_offset, + __global double *mat_dst2, int dst2_step, int dst2_offset, __global double *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -1085,14 +1154,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset); - + double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x]; ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1103,18 +1172,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int } __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - __global double *mat_dst2, int dst2_step, int dst2_offset, + __global double *mat_dst0, int dst0_step, int dst0_offset, + __global double *mat_dst1, int dst1_step, int dst1_offset, + __global double *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); @@ -1130,20 +1199,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int } __kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, + __global double *mat_dst0, int dst0_step, int dst0_offset, + __global double *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); - + double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x]; ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; diff --git a/modules/ocl/src/match_template.cpp 
b/modules/ocl/src/match_template.cpp index d5b017ca6b..b2c08afcf4 100644 --- a/modules/ocl/src/match_template.cpp +++ b/modules/ocl/src/match_template.cpp @@ -124,7 +124,7 @@ namespace cv result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) { - matchTemplateNaive_SQDIFF(image, templ, result, image.channels()); + matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels()); return; } else @@ -172,7 +172,7 @@ namespace cv CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U ) || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F) ); - CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.oclchannels() == 4) && result.channels() == 1); + CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1); CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); Context *clCxt = image.clCxt; @@ -209,7 +209,7 @@ namespace cv result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) { - matchTemplateNaive_CCORR(image, templ, result, image.channels()); + matchTemplateNaive_CCORR(image, templ, result, image.oclchannels()); return; } else @@ -220,8 +220,8 @@ namespace cv image.convertTo(buf.imagef, CV_32F); templ.convertTo(buf.templf, CV_32F); } - CV_Assert(image.channels() == 1); - oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels())); + CV_Assert(image.oclchannels() == 1); + oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels())); filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0)); result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1)); } @@ -265,7 +265,7 @@ namespace cv CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U ) || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F) ); - CV_Assert(image.channels() == templ.channels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.channels() == 1); + CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1); CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); Context *clCxt = image.clCxt; @@ -320,7 +320,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); // to be continued in the following section - if(image.channels() == 1) + if(image.oclchannels() == 1) { buf.image_sums.resize(1); integral(image, buf.image_sums[0]); @@ -340,7 +340,7 @@ namespace cv buf.image_sums.resize(buf.images.size()); - for(int i = 0; i < image.channels(); i ++) + for(int i = 0; i < image.oclchannels(); i ++) { integral(buf.images[i], buf.image_sums[i]); } @@ -394,7 +394,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_float), (void *)&scale) ); // to be continued in the following section - if(image.channels() == 1) + if(image.oclchannels() == 1) { buf.image_sums.resize(1); buf.image_sqsums.resize(1); diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 3317d68a7b..b2baed8381 100644 --- 
a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -126,7 +126,7 @@ namespace cv //////////////////////////////////////////////////////////////////////// // convert_C3C4 -void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep) +void convert_C3C4(const cl_mem &src, oclMat &dst) { int dstStep_in_pixel = dst.step1() / dst.oclchannels(); int pixel_end = dst.wholecols * dst.wholerows - 1; @@ -174,7 +174,7 @@ void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep) } //////////////////////////////////////////////////////////////////////// // convert_C4C3 -void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep) +void convert_C4C3(const oclMat &src, cl_mem &dst) { int srcStep_in_pixel = src.step1() / src.oclchannels(); int pixel_end = src.wholecols * src.wholerows - 1; @@ -245,7 +245,7 @@ void cv::ocl::oclMat::upload(const Mat &m) openCLVerifyCall(err); openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3); - convert_C3C4(temp, *this, pitch); + convert_C3C4(temp, *this); //int* cputemp=new int[wholeSize.height*wholeSize.width * 3]; //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, @@ -296,7 +296,7 @@ void cv::ocl::oclMat::download(cv::Mat &m) const (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err); openCLVerifyCall(err); - convert_C4C3(*this, temp, pitch / m.elemSize1()); + convert_C4C3(*this, temp); openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3); //int* cputemp=new int[wholecols*wholerows * 3]; //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; @@ -382,7 +382,7 @@ void cv::ocl::oclMat::copyTo( oclMat &m ) const CV_DbgAssert(!this->empty()); m.create(size(), type()); openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset, - data, step, cols * elemSize(), rows, offset, clMemcpyDeviceToDevice); + data, step, cols * elemSize(), rows, offset); } void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp index c6096c305d..b4dafa5b79 100644 --- a/modules/ocl/src/mcwutil.cpp +++ b/modules/ocl/src/mcwutil.cpp @@ -85,10 +85,10 @@ namespace cv globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; - size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; - cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads); + //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; + cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads); } - for(int i = 0; i < args.size(); i ++) + for(size_t i = 0; i < args.size(); i ++) openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, diff --git a/modules/ocl/src/precomp.hpp b/modules/ocl/src/precomp.hpp index 6dcb38827a..33f89dffcb 100644 --- a/modules/ocl/src/precomp.hpp +++ b/modules/ocl/src/precomp.hpp @@ -47,7 +47,7 @@ #define __OPENCV_PRECOMP_H__ #if _MSC_VER >= 1200 -#pragma warning( disable: 4251 4710 4711 4514 4996 ) +#pragma warning( disable: 4244 4251 4710 4711 4514 4996 ) #endif #ifdef HAVE_CVCONFIG_H @@ -81,7 +81,7 @@ #if defined __APPLE__ #include #else -#include +#include #endif #include 
"safe_call.hpp" @@ -100,7 +100,7 @@ namespace cv size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1); void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, const void *src, size_t spitch, - size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind); + size_t width, size_t height, int src_offset); void openCLFree(void *devPtr); cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size); void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size); @@ -108,8 +108,7 @@ namespace cv const char **source, string kernelName); cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName, const char *build_options); - void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize, - size_t *globalThreads, size_t *localThreads); + void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, vector< std::pair > &args, int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index a701d61d3f..2382a60c28 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -61,6 +61,7 @@ namespace cv extern const char *pyrlk; extern const char *operator_setTo; extern const char *operator_convertTo; + extern const char *operator_copyToM; extern const char *arithm_mul; extern const char *pyr_down; } @@ -397,6 +398,71 @@ oclMat &setTo(oclMat &src, const Scalar &scalar) return src; } +/////////////////////////////////////////////////////////////////////////// +////////////////////////////////// CopyTo ///////////////////////////////// +/////////////////////////////////////////////////////////////////////////// +void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName) +{ + CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols && + src.rows == dst.rows && src.cols == dst.cols + && mask.type() == CV_8UC1); + + vector > args; + + std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"}, + {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"}, + {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"}, + {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"} + }; + char compile_option[32]; + sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); + size_t localThreads[3] = {16, 16, 1}; + size_t globalThreads[3]; + + globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0]; + globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1]; + globalThreads[2] = 1; + + int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); + int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); + + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); + args.push_back( make_pair( sizeof(cl_int) , 
(void *)&srcstep_in_pixel )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); + + openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads, + localThreads, args, -1, -1, compile_option, CLFLUSH); +} + +void copyTo(const oclMat &src, oclMat &m ) +{ + CV_DbgAssert(!src.empty()); + m.create(src.size(), src.type()); + openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset, + src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset); +} + +void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask) +{ + if (mask.empty()) + { + copyTo(src, mat); + } + else + { + mat.create(src.size(), src.type()); + copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask"); + } +} + void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) { if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) @@ -879,20 +945,23 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI nextPyr_.resize(maxLevel + 1); prevPyr_[0] = prevImg; - nextImg.convertTo(nextPyr_[0], CV_32F); + //nextImg.convertTo(nextPyr_[0], CV_32F); + convertTo(nextImg, nextPyr_[0], CV_32F); for (int level = 1; level <= maxLevel; ++level) { - pyrDown(prevPyr_[level - 1], prevPyr_[level]); - pyrDown(nextPyr_[level - 1], nextPyr_[level]); + pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); + pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); } ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); - uPyr_[1].setTo(Scalar::all(0)); - vPyr_[1].setTo(Scalar::all(0)); + //uPyr_[1].setTo(Scalar::all(0)); + //vPyr_[1].setTo(Scalar::all(0)); + setTo(uPyr_[1], Scalar::all(0)); + setTo(vPyr_[1], Scalar::all(0)); Size winSize2i(winSize.width, winSize.height); @@ -909,8 +978,12 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI idx = idx2; } - uPyr_[idx].copyTo(u); - vPyr_[idx].copyTo(v); + //uPyr_[idx].copyTo(u); + //vPyr_[idx].copyTo(v); + copyTo(uPyr_[idx], u); + copyTo(vPyr_[idx], v); + + clFinish(prevImg.clCxt->impl->clCmdQueue); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp index 92740e7a34..856828d6a5 100644 --- a/modules/ocl/test/main.cpp +++ b/modules/ocl/test/main.cpp @@ -73,22 +73,53 @@ void print_info() #endif } - +std::string workdir; int main(int argc, char **argv) { TS::ptr()->init("ocl"); InitGoogleTest(&argc, argv); + const char *keys = + "{ h | help | false | print help message }" + "{ w | workdir | ../../../samples/c/| set working directory }" + "{ t | type | gpu | set device type:cpu or gpu}" + "{ p | platform | 0 | set platform id }" + "{ d | device | 0 | set device id }"; + + CommandLineParser cmd(argc, argv, keys); + if (cmd.get<bool>("help")) + { + cout << "Available options besides google test options:" << endl; + cmd.printParams(); + return 0; + } + workdir = cmd.get<string>("workdir"); + string type = cmd.get<string>("type"); + unsigned int pid = cmd.get<unsigned int>("platform"); + int device = cmd.get<int>("device"); print_info(); - - std::vector<cv::ocl::Info> oclinfo; - int devnums = 
getDevice(oclinfo); - if(devnums < 1) + int flag = CVCL_DEVICE_TYPE_GPU; + if(type == "cpu") { - std::cout << "no device found\n"; + flag = CVCL_DEVICE_TYPE_CPU; + } + std::vector<cv::ocl::Info> oclinfo; + int devnums = getDevice(oclinfo, flag); + if(devnums <= device || device < 0) + { + std::cout << "device invalid\n"; return -1; } - //setDevice(oclinfo[1]); + if(pid >= oclinfo.size()) + { + std::cout << "platform invalid\n"; + return -1; + } + if(pid != 0 || device != 0) + { + setDevice(oclinfo[pid], device); + } + cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl; return RUN_ALL_TESTS(); } diff --git a/modules/ocl/test/precomp.hpp b/modules/ocl/test/precomp.hpp index 5d57a30f3e..b9b228e6ee 100644 --- a/modules/ocl/test/precomp.hpp +++ b/modules/ocl/test/precomp.hpp @@ -67,7 +67,5 @@ #include "interpolation.hpp" //#include "add_test_info.h" -#define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_GPU - #endif diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index 0abf0ceb6d..bebbc69713 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -133,10 +133,9 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool) void random_roi() { - cv::RNG &rng = TS::ptr()->get_rng(); - #ifdef RANDOMROI //randomize ROI + cv::RNG &rng = TS::ptr()->get_rng(); roicols = rng.uniform(1, mat1.cols); roirows = rng.uniform(1, mat1.rows); src1x = rng.uniform(0, mat1.cols - roicols); @@ -234,7 +233,7 @@ TEST_P(Exp, Mat) char s[1024]; sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1, s); + EXPECT_MAT_NEAR(dst, cpu_dst, 2, s); } } @@ -855,7 +854,7 @@ TEST_P(MinMaxLoc, MAT) cv::Point minLoc_, maxLoc_; cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat()); - double error0, error1, minlocVal, minlocVal_, maxlocVal, maxlocVal_; + double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.; if(depth == 0) { minlocVal = mat1_roi.at(minLoc); @@ -975,7 +974,7 @@ TEST_P(MinMaxLoc, MASK) cv::Point minLoc_, maxLoc_; cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask); - double error0, error1, minlocVal, minlocVal_, maxlocVal, maxlocVal_; + double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.; if(minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue; if(depth == 0) { diff --git a/modules/ocl/test/test_canny.cpp b/modules/ocl/test/test_canny.cpp index f206cc3019..cac6b66f51 100644 --- a/modules/ocl/test/test_canny.cpp +++ b/modules/ocl/test/test_canny.cpp @@ -45,16 +45,11 @@ #include "precomp.hpp" #ifdef HAVE_OPENCL -#ifdef WIN32 -#define FILTER_IMAGE "C:/Users/Public/Pictures/Sample Pictures/Penguins.jpg" -#else -#define FILTER_IMAGE "/Users/Test/Valve_original.PNG" // user need to specify a valid image path -#endif #define SHOW_RESULT 0 //////////////////////////////////////////////////////// // Canny - +extern std::string workdir; IMPLEMENT_PARAM_CLASS(AppertureSize, int); IMPLEMENT_PARAM_CLASS(L2gradient, bool); @@ -76,7 +71,7 @@ PARAM_TEST_CASE(Canny, AppertureSize, L2gradient) TEST_P(Canny, Accuracy) { - cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE); + cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); double low_thresh = 50.0; @@ -110,4 +105,4 @@
TEST_P(Canny, Accuracy) INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine( testing::Values(AppertureSize(3), AppertureSize(5)), testing::Values(L2gradient(false), L2gradient(true)))); -#endif +#endif \ No newline at end of file diff --git a/modules/ocl/test/test_haar.cpp b/modules/ocl/test/test_haar.cpp index 1a21ff6e33..52ce6a3a3d 100644 --- a/modules/ocl/test/test_haar.cpp +++ b/modules/ocl/test/test_haar.cpp @@ -52,7 +52,7 @@ using namespace cvtest; using namespace testing; using namespace std; using namespace cv; - +extern string workdir; struct getRect { Rect operator ()(const CvAvgComp &e) const @@ -75,14 +75,11 @@ PARAM_TEST_CASE(HaarTestBase, int, int) { scale = 1.0; index = 0; - string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml"; + string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml"; if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName))) { cout << "ERROR: Could not load classifier cascade" << endl; - cout << "Usage: facedetect [--cascade=]\n" - " [--scale[=\n" - " [filename|camera_index]\n" << endl ; return; } //int devnums = getDevice(oclinfo); @@ -99,17 +96,17 @@ struct Haar : HaarTestBase {}; TEST_F(Haar, FaceDetect) { - string imgName = "../../../samples/c/lena.jpg"; + string imgName = workdir + "lena.jpg"; Mat img = imread( imgName, 1 ); if(img.empty()) { - std::cout << "Couldn't read test" << index << ".jpg" << std::endl; + std::cout << "Couldn't read " << imgName << std::endl; return ; } - int i = 0; - double t = 0; + //int i = 0; + //double t = 0; vector faces, oclfaces; const static Scalar colors[] = { CV_RGB(0, 0, 255), diff --git a/modules/ocl/test/test_hog.cpp b/modules/ocl/test/test_hog.cpp index 16176a2e95..e1fdeaf000 100644 --- a/modules/ocl/test/test_hog.cpp +++ b/modules/ocl/test/test_hog.cpp @@ -48,7 +48,7 @@ using namespace std; #ifdef HAVE_OPENCL - +extern string workdir; PARAM_TEST_CASE(HOG, cv::Size, int) { cv::Size winSize; @@ -63,7 +63,7 @@ PARAM_TEST_CASE(HOG, cv::Size, int) TEST_P(HOG, GetDescriptors) { // Load image - cv::Mat img_rgb = readImage("../../../samples/gpu/road.png"); + cv::Mat img_rgb = readImage(workdir + "lena.jpg"); ASSERT_FALSE(img_rgb.empty()); // Convert image @@ -120,7 +120,7 @@ bool match_rect(cv::Rect r1, cv::Rect r2, int threshold) TEST_P(HOG, Detect) { // Load image - cv::Mat img_rgb = readImage("../../../samples/gpu/road.png"); + cv::Mat img_rgb = readImage(workdir + "lena.jpg"); ASSERT_FALSE(img_rgb.empty()); // Convert image diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index 8e4c0eb1d6..0417c7b3a7 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -1008,7 +1008,7 @@ TEST_P(Remap, Mat) int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; // for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++) - for(int j = 0; j < 100; j++) + for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); cv::remap(src_roi, dst_roi, map1_roi, map2_roi, interpolation, bordertype[0], val); @@ -1017,7 +1017,7 @@ TEST_P(Remap, Mat) gdst.download(cpu_dst); char sss[1024]; - sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty); + sprintf(sss, 
"src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d bordertype=%s", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty, borderstr[0]); if(interpolation == 0) @@ -1371,7 +1371,9 @@ TEST_P(meanShiftFiltering, Mat) gdst.download(cpu_gdst); char sss[1024]; + char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n"; sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty); + strcat(sss, warning); EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss); } @@ -1397,7 +1399,9 @@ TEST_P(meanShiftProc, Mat) gdstCoor.download(cpu_gdstCoor); char sss[1024]; + char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n"; sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty); + strcat(sss, warning); EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss); EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0, sss); } @@ -1740,7 +1744,7 @@ INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine( )); INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(CV_32FC1, CV_16SC2, CV_32FC2), Values(-1, CV_32FC1), Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR), Values((int)cv::BORDER_CONSTANT))); @@ -1751,7 +1755,7 @@ INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine( ONE_TYPE(CV_32SC1) //no use )); -INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine( - Values(CV_32FC1, CV_32FC1), - Values(false))); // Values(false) is the reserved parameter +//INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine( +// Values(CV_32FC1, CV_32FC1), +// Values(false))); // Values(false) is the reserved parameter #endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp index 1ba33a3eea..673a5f7fde 100644 --- a/modules/ocl/test/test_match_template.cpp +++ b/modules/ocl/test/test_match_template.cpp @@ -44,7 +44,7 @@ #include "precomp.hpp" -#define PERF_TEST 0 +//#define PERF_TEST 0 #ifdef HAVE_OPENCL //////////////////////////////////////////////////////////////////////////////// // MatchTemplate @@ -157,18 +157,18 @@ TEST_P(MatchTemplate32F, Accuracy) #endif // PERF_TEST } -//INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, -// testing::Combine( -// MTEMP_SIZES, -// testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), -// testing::Values(Channels(1), Channels(3), Channels(4)), -// ALL_TEMPLATE_METHODS -// ) -// ); -// -//INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine( -// MTEMP_SIZES, -// testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), -// testing::Values(Channels(1), Channels(3), Channels(4)), -// testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, + testing::Combine( + MTEMP_SIZES, + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(Channels(1), Channels(3), Channels(4)), + ALL_TEMPLATE_METHODS + ) + ); + +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, 
testing::Combine( + MTEMP_SIZES, + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(Channels(1), Channels(3), Channels(4)), + testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); #endif diff --git a/modules/ocl/test/test_pyrlk.cpp b/modules/ocl/test/test_pyrlk.cpp index f9bccebdca..b594a34839 100644 --- a/modules/ocl/test/test_pyrlk.cpp +++ b/modules/ocl/test/test_pyrlk.cpp @@ -54,7 +54,7 @@ using namespace std; ///////////////////////////////////////////////////////////////////////////////////////////////// // BroxOpticalFlow - +extern string workdir; #define BROX_OPTICAL_FLOW_DUMP_FILE "opticalflow/brox_optical_flow.bin" #define BROX_OPTICAL_FLOW_DUMP_FILE_CC20 "opticalflow/brox_optical_flow_cc20.bin" @@ -78,10 +78,10 @@ PARAM_TEST_CASE(Sparse, bool, bool) TEST_P(Sparse, Mat) { - cv::Mat frame0 = readImage("../../../samples/gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + cv::Mat frame0 = readImage(workdir + "../gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); ASSERT_FALSE(frame0.empty()); - cv::Mat frame1 = readImage("../../../samples/gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + cv::Mat frame1 = readImage(workdir + "../gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); ASSERT_FALSE(frame1.empty()); cv::Mat gray_frame;
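Note for reviewers: the copy_to_with_mask_cus helper added to modules/ocl/src/pyrlk.cpp above launches a "copy_to_with_mask" kernel from the new operator_copyToM program, but that kernel source is not part of this patch. The sketch below only illustrates what such a kernel is assumed to look like, reconstructed from the argument list the host code pushes (with GENTYPE supplied via the -D compile option); the parameter names are placeholders, not the shipped operator_copyToM source.

// Sketch only: assumed shape of the copy_to_with_mask kernel, not the actual operator_copyToM source.
// Step/offset arguments for src/dst arrive in pixels (the host divides step and offset by elemSize()),
// so the GENTYPE pointers can be indexed directly without byte-level pointer casts.
__kernel void copy_to_with_mask(__global const GENTYPE *src, __global GENTYPE *dst,
                                __global const uchar *mask,
                                int cols, int rows,
                                int src_step_in_pixel, int src_offset_in_pixel,
                                int dst_step_in_pixel, int dst_offset_in_pixel,
                                int mask_step, int mask_offset)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if (x < cols && y < rows)
    {
        // mask is CV_8UC1, so its step/offset are in bytes, which here equals pixels
        int mask_idx = mad24(y, mask_step, x + mask_offset);
        if (mask[mask_idx])
        {
            int src_idx = mad24(y, src_step_in_pixel, x + src_offset_in_pixel);
            int dst_idx = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
            dst[dst_idx] = src[src_idx];
        }
    }
}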