From 8db1a7310272d4c9f6d621aa4dacd625d5ea05a4 Mon Sep 17 00:00:00 2001 From: dave Date: Tue, 2 Apr 2013 10:31:02 -0700 Subject: [PATCH 01/30] Added v4l2 support for getting capture property CV_CAP_PROP_POS_MSEC --- modules/highgui/src/cap_v4l.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/modules/highgui/src/cap_v4l.cpp b/modules/highgui/src/cap_v4l.cpp index 829d0ab638..a788c903f9 100644 --- a/modules/highgui/src/cap_v4l.cpp +++ b/modules/highgui/src/cap_v4l.cpp @@ -154,6 +154,11 @@ the symptoms were damaged image and 'Corrupt JPEG data: premature end of data se - USE_TEMP_BUFFER fixes the main problem (improper buffer management) and prevents bad images in the first place +11th patch: April 2, 2013, Forrest Reiling forrest.reiling@gmail.com +Added v4l2 support for getting capture property CV_CAP_PROP_POS_MSEC. +Returns the millisecond timestamp of the last frame grabbed or 0 if no frames have been grabbed +Used to successfully synchronize 2 Logitech C310 USB webcams to within 16 ms of one another + make & enjoy! 
@@ -320,6 +325,8 @@ typedef struct CvCaptureCAM_V4L struct v4l2_queryctrl queryctrl; struct v4l2_querymenu querymenu; + struct timeval timestamp; + /* V4L2 control variables */ int v4l2_brightness, v4l2_brightness_min, v4l2_brightness_max; int v4l2_contrast, v4l2_contrast_min, v4l2_contrast_max; @@ -836,6 +843,9 @@ static int _capture_V4L2 (CvCaptureCAM_V4L *capture, char *deviceName) capture->v4l2_gain_max = 0; capture->v4l2_exposure_max = 0; + capture->timestamp.tv_sec = 0; + capture->timestamp.tv_usec = 0; + /* Scan V4L2 controls */ v4l2_scan_controls(capture); @@ -1221,6 +1231,9 @@ static int read_frame_v4l2(CvCaptureCAM_V4L* capture) { if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf)) perror ("VIDIOC_QBUF"); + //set timestamp in capture struct to be timestamp of most recent frame + capture->timestamp = buf.timestamp; + return 1; } @@ -2308,6 +2321,13 @@ static double icvGetPropertyCAM_V4L (CvCaptureCAM_V4L* capture, /* initialize the control structure */ switch (property_id) { + case CV_CAP_PROP_POS_MSEC: + if (capture->FirstCapture) { + return 0; + } else { + return 1000 * capture->timestamp.tv_sec + ((double) capture->timestamp.tv_usec) / 1000; + } + break; case CV_CAP_PROP_BRIGHTNESS: capture->control.id = V4L2_CID_BRIGHTNESS; break; From fd4a6f0af05a0c4e0461b05f2bd20c85bfcbe73c Mon Sep 17 00:00:00 2001 From: yao Date: Wed, 3 Apr 2013 13:23:04 +0800 Subject: [PATCH 02/30] make the sparse method give correct results on CPU ocl Add CL_CPU to supportsFeature check simplify the logic of pyrlk --- modules/ocl/include/opencv2/ocl/ocl.hpp | 2 +- modules/ocl/src/initialization.cpp | 6 + modules/ocl/src/matrix_operations.cpp | 2 +- modules/ocl/src/opencl/pyrlk.cl | 278 ++++++++++++- modules/ocl/src/pyrlk.cpp | 520 ++---------------------- 5 files changed, 307 insertions(+), 501 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index da7ca27aeb..7b79cb5b27 100644 --- 
a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -155,7 +155,7 @@ namespace cv static Context* getContext(); static void setContext(Info &oclinfo); - enum {CL_DOUBLE, CL_UNIFIED_MEM}; + enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_CPU}; bool supportsFeature(int ftype); size_t computeUnits(); void* oclContext(); diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index b582f1ce3e..78263d86ae 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -979,6 +979,12 @@ namespace cv return impl->double_support == 1; case CL_UNIFIED_MEM: return impl->unified_memory == 1; + case CL_CPU: + cl_device_type devicetype; + clGetDeviceInfo(impl->devices[impl->devnum], + CL_DEVICE_TYPE, sizeof(cl_device_type), + &devicetype, NULL); + return devicetype == CVCL_DEVICE_TYPE_CPU; default: return false; } diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index ce96e3a9e3..87d1d375ef 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -394,7 +394,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be if( rtype < 0 ) rtype = type(); else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); + rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels()); //int scn = channels(); int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype); diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index c772be78ac..1043b8410b 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -184,6 +184,209 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2 } #define BUFFER 64 + +#ifdef CPU +void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + +#if 
BUFFER > 128 + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + smem3[tid] = val3 += smem3[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + smem2[tid] = val2 += smem2[tid + 32]; + smem3[tid] = val3 += smem3[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = val1 += smem1[tid + 16]; + smem2[tid] = val2 += smem2[tid + 16]; + smem3[tid] = val3 += smem3[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = val1 += smem1[tid + 8]; + smem2[tid] = val2 += smem2[tid + 8]; + smem3[tid] = val3 += smem3[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = val1 += smem1[tid + 4]; + smem2[tid] = val2 += smem2[tid + 4]; + smem3[tid] = val3 += smem3[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = val1 += smem1[tid + 2]; + smem2[tid] = val2 += smem2[tid + 2]; + smem3[tid] = val3 += smem3[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = val1 += smem1[tid + 1]; + smem2[BUFFER] = val2 += smem2[tid + 1]; + smem3[BUFFER] = val3 += smem3[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + smem2[tid] = (val2 += smem2[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + smem2[tid] = (val2 += smem2[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + 
if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + smem2[tid] = (val2 += smem2[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + smem2[tid] = (val2 += smem2[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + smem2[tid] = (val2 += smem2[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + smem2[tid] = (val2 += smem2[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + smem2[tid] = (val2 += smem2[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + smem2[BUFFER] = (val2 += smem2[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce1(float val1, volatile __local float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} +#else void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; @@ -325,6 +528,7 @@ void 
reduce1(float val1, __local float* smem1, int tid) vmem1[tid] = val1 += vmem1[tid + 1]; } } +#endif #define SCALE (1.0f / (1 << 20)) #define THRESHOLD 0.01f @@ -411,14 +615,20 @@ void GetError4(image2d_t J, const float x, const float y, const float4* Pch, flo *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); } - +#define GRIDSIZE 3 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else __local float smem1[BUFFER]; __local float smem2[BUFFER]; __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -431,7 +641,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 prevPt = prevPts[gid] / (1 << level); + float2 prevPt = prevPts[gid] / (float2)(1 << level); if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) { @@ -450,9 +660,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, float A12 = 0; float A22 = 0; - float I_patch[3][3]; - float dIdx_patch[3][3]; - float dIdy_patch[3][3]; + float I_patch[GRIDSIZE][GRIDSIZE]; + float dIdx_patch[GRIDSIZE][GRIDSIZE]; + float dIdy_patch[GRIDSIZE][GRIDSIZE]; yBase=yid; { @@ -512,12 +722,19 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], &A11, &A12, &A22); } + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -609,8 +826,13 @@ 
__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -685,18 +907,28 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, nextPts[gid] = prevPt; if (calcErr) - err[gid] = smem1[0] / (c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y); +#endif } - } + __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { - __local float smem1[BUFFER]; - __local float smem2[BUFFER]; - __local float smem3[BUFFER]; +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -709,7 +941,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 nextPt = prevPts[gid]/(1<<level); + float2 nextPt = prevPts[gid]/(float2)(1<<level); if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) { @@ -725,9 +957,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, // extract the patch from the first image, compute covariation matrix of derivatives - float A11 = 0; - float A12 = 0; - float A22 = 0; + float A11 = 0.0f; + float A12 = 0.0f; + float A22 = 0.0f; float4 I_patch[8]; float4 dIdx_patch[8]; @@ -797,9 +1029,15 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = 
smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -888,12 +1126,16 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, &b1, &b2); } - reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -967,7 +1209,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, nextPts[gid] = nextPt; if (calcErr) - err[gid] = smem1[0] / (3 * c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y); +#endif } } diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index c8d4b52deb..374134c1cd 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -16,7 +16,7 @@ // // @Authors // Dachuan Zhao, dachuan@multicorewareinc.com -// Yao Wang, yao@multicorewareinc.com +// Yao Wang, bitwangyaoyao@gmail.com // Nathan, liujun@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, @@ -47,6 +47,7 @@ #include "precomp.hpp" + using namespace std; using namespace cv; using namespace cv::ocl; @@ -58,11 +59,7 @@ namespace ocl ///////////////////////////OpenCL kernel strings/////////////////////////// extern const char *pyrlk; extern const char *pyrlk_no_image; -extern const char *operator_setTo; -extern const char *operator_convertTo; -extern const char *operator_copyToM; extern const char *arithm_mul; -extern const char *pyr_down; } } @@ -105,364 +102,7 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe } } -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} - -/////////////////////////////////////////////////////////////////////////// 
-//////////////////////////////// ConvertTo //////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -static void convert_run_cus(const oclMat &src, oclMat &dst, double alpha, double beta) -{ - string kernelName = "convert_to_S"; - stringstream idxStr; - idxStr << src.depth(); - kernelName += idxStr.str(); - float alpha_f = (float)alpha, beta_f = (float)beta; - CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols); - vector<pair<size_t , const void *> > args; - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = 1; - int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); - int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); - if(dst.type() == CV_8UC1) - { - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; - } - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); - args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f )); - args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f )); - openCLExecuteKernel2(dst.clCxt , &operator_convertTo, kernelName, globalThreads, - localThreads, args, dst.oclchannels(), dst.depth(), CLFLUSH); -} -void convertTo( const 
oclMat &src, oclMat &m, int rtype, double alpha = 1, double beta = 0 ); -void convertTo( const oclMat &src, oclMat &dst, int rtype, double alpha, double beta ) -{ - //cout << "cv::ocl::oclMat::convertTo()" << endl; - - bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon() - && fabs(beta) < std::numeric_limits<double>::epsilon(); - - if( rtype < 0 ) - rtype = src.type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.oclchannels()); - - int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype); - if( sdepth == ddepth && noScale ) - { - src.copyTo(dst); - return; - } - - oclMat temp; - const oclMat *psrc = &src; - if( sdepth != ddepth && psrc == &dst ) - psrc = &(temp = src); - - dst.create( src.size(), rtype ); - convert_run_cus(*psrc, dst, alpha, beta); -} - -/////////////////////////////////////////////////////////////////////////// -//////////////////////////////// setTo //////////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -//oclMat &operator = (const Scalar &s) -//{ -// //cout << "cv::ocl::oclMat::=" << endl; -// setTo(s); -// return *this; -//} -static void set_to_withoutmask_run_cus(const oclMat &dst, const Scalar &scalar, string kernelName) -{ - vector<pair<size_t , const void *> > args; - - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = 1; - int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize(); - if(dst.type() == CV_8UC1) - { - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - } - char compile_option[32]; - union sc - { - cl_uchar4 uval; - cl_char4 cval; - cl_ushort4 usval; - cl_short4 shval; - cl_int4 ival; - cl_float4 fval; - cl_double4 dval; - } val; - switch(dst.depth()) - { - case 0: - val.uval.s[0] = 
saturate_cast<uchar>(scalar.val[0]); - val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]); - val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]); - val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=uchar"); - args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=uchar4"); - args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 1: - val.cval.s[0] = saturate_cast<char>(scalar.val[0]); - val.cval.s[1] = saturate_cast<char>(scalar.val[1]); - val.cval.s[2] = saturate_cast<char>(scalar.val[2]); - val.cval.s[3] = saturate_cast<char>(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=char"); - args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=char4"); - args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 2: - val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]); - val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]); - val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]); - val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=ushort"); - args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=ushort4"); - args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 3: - val.shval.s[0] = saturate_cast<short>(scalar.val[0]); - val.shval.s[1] = saturate_cast<short>(scalar.val[1]); - val.shval.s[2] = saturate_cast<short>(scalar.val[2]); - val.shval.s[3] = saturate_cast<short>(scalar.val[3]); - 
switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=short"); - args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=short4"); - args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 4: - val.ival.s[0] = saturate_cast<int>(scalar.val[0]); - val.ival.s[1] = saturate_cast<int>(scalar.val[1]); - val.ival.s[2] = saturate_cast<int>(scalar.val[2]); - val.ival.s[3] = saturate_cast<int>(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=int"); - args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); - break; - case 2: - sprintf(compile_option, "-D GENTYPE=int2"); - cl_int2 i2val; - i2val.s[0] = val.ival.s[0]; - i2val.s[1] = val.ival.s[1]; - args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=int4"); - args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 5: - val.fval.s[0] = (float)scalar.val[0]; - val.fval.s[1] = (float)scalar.val[1]; - val.fval.s[2] = (float)scalar.val[2]; - val.fval.s[3] = (float)scalar.val[3]; - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=float"); - args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=float4"); - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 6: - val.dval.s[0] = scalar.val[0]; - val.dval.s[1] = scalar.val[1]; - val.dval.s[2] = scalar.val[2]; - val.dval.s[3] = scalar.val[3]; - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=double"); - 
args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=double4"); - args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unknown depth"); - } -#ifdef CL_VERSION_1_2 - if(dst.offset == 0 && dst.cols == dst.wholecols) - { - clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); - } - else - { - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option, CLFLUSH); - } -#else - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option, CLFLUSH); -#endif -} - -static oclMat &setTo(oclMat &src, const Scalar &scalar) -{ - CV_Assert( src.depth() >= 0 && src.depth() <= 6 ); - CV_DbgAssert( !src.empty()); - - if(src.type() == CV_8UC1) - { - set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask_C1_D0"); - } - else - { - set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask"); - } - - return src; -} - 
-/////////////////////////////////////////////////////////////////////////// -////////////////////////////////// CopyTo ///////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -// static void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName) -// { -// CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols && -// src.rows == dst.rows && src.cols == dst.cols -// && mask.type() == CV_8UC1); - -// vector<pair<size_t , const void *> > args; - -// std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"}, -// {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"}, -// {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"}, -// {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"} -// }; -// char compile_option[32]; -// sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); -// size_t localThreads[3] = {16, 16, 1}; -// size_t globalThreads[3]; -// globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0]; -// globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1]; -// globalThreads[2] = 1; -// int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); -// int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) 
, (void *)&dststep_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); - -// openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads, -// localThreads, args, -1, -1, compile_option, CLFLUSH); -// } - -static void copyTo(const oclMat &src, oclMat &m ) -{ - CV_DbgAssert(!src.empty()); - m.create(src.size(), src.type()); - openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset, - src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset); -} - -// static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask) -// { -// if (mask.empty()) -// { -// copyTo(src, mat); -// } -// else -// { -// mat.create(src.size(), src.type()); -// copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask"); -// } -// } - -static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) +static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { @@ -470,9 +110,6 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c return; } - //dst.create(src1.size(), src1.type()); - //CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - // src1.rows == src2.rows && src2.rows == dst.rows); CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows); @@ -480,24 +117,8 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c CV_Assert(src1.depth() != CV_8S); Context *clCxt = src1.clCxt; - //int channels = dst.channels(); - //int depth = dst.depth(); - - //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1} - //}; - - //size_t vector_length = vector_lengths[channels-1][depth]; - //int 
offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - //int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 16, 16, 1 }; - //size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - // divUp(dst.rows, localThreads[1]) * localThreads[1], - // 1 - // }; size_t globalThreads[3] = { src1.cols, src1.rows, 1 @@ -508,67 +129,20 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - //args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - //args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - //args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + args.push_back( make_pair( sizeof(float), (float *)&scalar )); - //if(_scalar != NULL) - //{ - float scalar1 = *((float *)_scalar); - args.push_back( make_pair( sizeof(float), (float *)&scalar1 )); - //} - - openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH); -} - -static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) -{ - arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar)); -} - -static void pyrdown_run_cus(const oclMat &src, const oclMat &dst) -{ - - CV_Assert(src.type() == dst.type()); - CV_Assert(src.depth() != CV_8S); - - Context *clCxt = src.clCxt; - - string kernelName = "pyrDown"; - - 
size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { src.cols, dst.rows, 1}; - - vector<pair<size_t , const void *> > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); - - openCLExecuteKernel2(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth(), CLFLUSH); -} - -static void pyrDown_cus(const oclMat &src, oclMat &dst) -{ - CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); - - dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); - - pyrdown_run_cus(src, dst); + openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth()); } static void lkSparse_run(oclMat &I, oclMat &J, - const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, - int level, /*dim3 block, */dim3 patch, Size winSize, int iters) + const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, + int level, dim3 patch, Size winSize, int iters) { Context *clCxt = I.clCxt; int elemCntPerRow = I.step / I.elemSize(); @@ -603,7 +177,7 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&level )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); - if (!isImageSupported) + if (!isImageSupported) args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x )); args.push_back( make_pair( sizeof(cl_int), (void 
*)&patch.y )); @@ -613,15 +187,24 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - if(isImageSupported) + if (clCxt->supportsFeature(Context::CL_CPU)) { - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU"); releaseTexture(ITex); releaseTexture(JTex); } else { - openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + if(isImageSupported) + { + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + releaseTexture(ITex); + releaseTexture(JTex); + } + else + { + openCLExecuteKernel(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + } } } @@ -631,7 +214,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next { nextPts.release(); status.release(); - //if (err) err->release(); + if (err) err->release(); return; } @@ -657,13 +240,11 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next oclMat temp1 = (useInitialFlow ? 
nextPts : prevPts).reshape(1); oclMat temp2 = nextPts.reshape(1); - //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f)); multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f); //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2); ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); - //status.setTo(Scalar::all(1)); - setTo(status, Scalar::all(1)); + status.setTo(Scalar::all(1)); bool errMat = false; if (!err) @@ -673,7 +254,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next } else ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); - //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err); // build the image pyramids. @@ -682,25 +262,14 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next if (cn == 1 || cn == 4) { - //prevImg.convertTo(prevPyr_[0], CV_32F); - //nextImg.convertTo(nextPyr_[0], CV_32F); - convertTo(prevImg, prevPyr_[0], CV_32F); - convertTo(nextImg, nextPyr_[0], CV_32F); - } - else - { - //oclMat buf_; - // cvtColor(prevImg, buf_, COLOR_BGR2BGRA); - // buf_.convertTo(prevPyr_[0], CV_32F); - - // cvtColor(nextImg, buf_, COLOR_BGR2BGRA); - // buf_.convertTo(nextPyr_[0], CV_32F); + prevImg.convertTo(prevPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); } for (int level = 1; level <= maxLevel; ++level) { - pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); - pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); } // dI/dx ~ Ix, dI/dy ~ Iy @@ -709,17 +278,15 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next { lkSparse_run(prevPyr_[level], nextPyr_[level], prevPts, nextPts, status, *err, getMinEigenVals, prevPts.cols, - level, /*block, */patch, winSize, iters); + level, patch, winSize, iters); } - clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue()); - if(errMat) delete err; } static void 
lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, - oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) + oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) { Context *clCxt = I.clCxt; bool isImageSupported = support_image2d(); @@ -754,11 +321,6 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, JTex = (cl_mem)J.data; } - //int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2}; - //const int patchWidth = 16 + 2 * halfWin.x; - //const int patchHeight = 16 + 2 * halfWin.y; - //size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); - vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); @@ -787,15 +349,14 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, if (isImageSupported) { - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); releaseTexture(ITex); releaseTexture(JTex); } else { - //printf("Warning: The image2d_t is not supported by the device. 
Using alternative method!\n"); - openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); } } @@ -813,23 +374,20 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI nextPyr_.resize(maxLevel + 1); prevPyr_[0] = prevImg; - //nextImg.convertTo(nextPyr_[0], CV_32F); - convertTo(nextImg, nextPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); for (int level = 1; level <= maxLevel; ++level) { - pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); - pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); } ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); - //uPyr_[1].setTo(Scalar::all(0)); - //vPyr_[1].setTo(Scalar::all(0)); - setTo(uPyr_[1], Scalar::all(0)); - setTo(vPyr_[1], Scalar::all(0)); + uPyr_[1].setTo(Scalar::all(0)); + vPyr_[1].setTo(Scalar::all(0)); Size winSize2i(winSize.width, winSize.height); @@ -846,10 +404,6 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI idx = idx2; } - //uPyr_[idx].copyTo(u); - //vPyr_[idx].copyTo(v); - copyTo(uPyr_[idx], u); - copyTo(vPyr_[idx], v); - - clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue()); + uPyr_[idx].copyTo(u); + vPyr_[idx].copyTo(v); } From cb63bbf0013e442c6ba81ef98628df7b99746605 Mon Sep 17 00:00:00 2001 From: yao Date: Wed, 3 Apr 2013 13:58:44 +0800 Subject: [PATCH 03/30] fix hog on some CPU device running ocl --- modules/ocl/src/hog.cpp | 61 ++++--- modules/ocl/src/opencl/objdetect_hog.cl | 209 +++++++++++++++++------- 2 files changed, 186 insertions(+), 84 deletions(-) diff 
--git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index b23f00c90d..7a13324077 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -44,7 +44,6 @@ //M*/ #include "precomp.hpp" - using namespace cv; using namespace cv::ocl; using namespace std; @@ -230,7 +229,6 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc } } - void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img) { computeGradient(img, grad, qangle); @@ -1571,6 +1569,27 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b cdescr_size = descr_size; } +static inline int divUp(int total, int grain) +{ + return (total + grain - 1) / grain; +} + +static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName, + size_t globalThreads[3], size_t localThreads[3], + vector< pair > &args) +{ + size_t wave_size = 0; + queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); + if (wave_size <= 16) + { + char build_options[64]; + sprintf(build_options, (wave_size == 16) ? 
"-D WAVE_SIZE_16" : "-D WAVE_SIZE_1"); + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + } + else + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1); +} + void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width, const cv::ocl::oclMat &grad, const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists) @@ -1582,8 +1601,10 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; - size_t globalThreads[3] = { img_block_width * 32, img_block_height * 2, 1 }; - size_t localThreads[3] = { 32, 2, 1 }; + int blocks_total = img_block_width * img_block_height; + int blocks_in_group = 4; + size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; + size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 }; int grad_quadstep = grad.step >> 2; int qangle_step = qangle.step; @@ -1593,14 +1614,15 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float); int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float); - int smem = hists_size + final_hists_size; + int smem = (hists_size + final_hists_size) * blocks_in_group; - args.push_back( make_pair( sizeof(cl_int), (void *)&width)); args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_stride_x)); args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_stride_y)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size)); args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width)); 
+ args.push_back( make_pair( sizeof(cl_int), (void *)&blocks_in_group)); + args.push_back( make_pair( sizeof(cl_int), (void *)&blocks_total)); args.push_back( make_pair( sizeof(cl_int), (void *)&grad_quadstep)); args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step)); args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data)); @@ -1609,7 +1631,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( smem, (void *)NULL)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); } void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y, @@ -1637,7 +1659,7 @@ void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int bl args.push_back( make_pair( sizeof(cl_float), (void *)&threshold)); args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); } void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y, @@ -1671,7 +1693,7 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo args.push_back( make_pair( sizeof(cl_float), (void *)&threshold)); args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); } void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, @@ -1702,7 +1724,7 @@ void 
cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, @@ -1734,12 +1756,7 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); -} - -static inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img, @@ -1768,7 +1785,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img, @@ -1798,7 +1815,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); - openCLExecuteKernel2(clCxt, 
&objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz) @@ -1815,14 +1832,16 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz float ifx = (float)src.cols / sz.width; float ify = (float)src.rows / sz.height; + int src_step = static_cast(src.step); + int dst_step = static_cast(dst.step); vector< pair > args; args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&src_step)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width)); @@ -1830,5 +1849,5 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); - openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); } diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index db11ed1410..8852facae8 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -53,76 +53,96 @@ //---------------------------------------------------------------------------- 
// Histogram computation - -__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, - const int cnbins, const int cblock_hist_size, const int img_block_width, - const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, - const float scale, __global float* block_hists, __local float* smem) +// 12 threads for a cell, 12x4 threads per block +__kernel void compute_hists_kernel( + const int cblock_stride_x, const int cblock_stride_y, + const int cnbins, const int cblock_hist_size, const int img_block_width, + const int blocks_in_group, const int blocks_total, + const int grad_quadstep, const int qangle_step, + __global const float* grad, __global const uchar* qangle, + const float scale, __global float* block_hists, __local float* smem) { - const int lidX = get_local_id(0); + const int lx = get_local_id(0); + const int lp = lx / 24; /* local group id */ + const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ + const int gidY = gid / img_block_width; + const int gidX = gid - gidY * img_block_width; + + const int lidX = lx - lp * 24; const int lidY = get_local_id(1); - const int gidX = get_group_id(0); - const int gidY = get_group_id(1); - const int cell_x = lidX / 16; + const int cell_x = lidX / 12; const int cell_y = lidY; - const int cell_thread_x = lidX & 0xF; + const int cell_thread_x = lidX - cell_x * 12; - __local float* hists = smem; - __local float* final_hist = smem + cnbins * 48; + __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * + CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); + __local float* final_hist = hists + cnbins * + (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; const int offset_y = gidY * cblock_stride_y + (cell_y << 2); - __global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1); - __global const 
uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1); + __global const float* grad_ptr = (gid < blocks_total) ? + grad + offset_y * grad_quadstep + (offset_x << 1) : grad; + __global const uchar* qangle_ptr = (gid < blocks_total) ? + qangle + offset_y * qangle_step + (offset_x << 1) : qangle; - // 12 means that 12 pixels affect on block's cell (in one row) - if (cell_thread_x < 12) + __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + + cell_thread_x; + for (int bin_id = 0; bin_id < cnbins; ++bin_id) + hist[bin_id * 48] = 0.f; + + const int dist_x = -4 + cell_thread_x - 4 * cell_x; + const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); + + const int dist_y_begin = -4 - 4 * lidY; + for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) { - __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x; - for (int bin_id = 0; bin_id < cnbins; ++bin_id) - hist[bin_id * 48] = 0.f; + float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); + uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); - const int dist_x = -4 + cell_thread_x - 4 * cell_x; + grad_ptr += grad_quadstep; + qangle_ptr += qangle_step; - const int dist_y_begin = -4 - 4 * lidY; - for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) - { - float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - grad_ptr += grad_quadstep; - qangle_ptr += qangle_step; + float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * + dist_center_x) * scale); + float interp_weight = (8.f - fabs(dist_y + 0.5f)) * + (8.f - fabs(dist_x + 0.5f)) / 64.f; - int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); - - float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale); - float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - 
fabs(dist_x + 0.5f)) / 64.f; - - hist[bin.x * 48] += gaussian * interp_weight * vote.x; - hist[bin.y * 48] += gaussian * interp_weight * vote.y; - } - - volatile __local float* hist_ = hist; - for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) - { - if (cell_thread_x < 6) hist_[0] += hist_[6]; - if (cell_thread_x < 3) hist_[0] += hist_[3]; - if (cell_thread_x == 0) - final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; - } + hist[bin.x * 48] += gaussian * interp_weight * vote.x; + hist[bin.y * 48] += gaussian * interp_weight * vote.y; } - barrier(CLK_LOCAL_MEM_FENCE); - __global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size; + volatile __local float* hist_ = hist; + for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) + { + if (cell_thread_x < 6) + hist_[0] += hist_[6]; + barrier(CLK_LOCAL_MEM_FENCE); + if (cell_thread_x < 3) + hist_[0] += hist_[3]; +#ifdef WAVE_SIZE_1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if (cell_thread_x == 0) + final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = + hist_[0] + hist_[1] + hist_[2]; + } +#ifdef WAVE_SIZE_1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif - int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x; - if (tid < cblock_hist_size) + int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; + if ((tid < cblock_hist_size) && (gid < blocks_total)) + { + __global float* block_hist = block_hists + + (gidY * img_block_width + gidX) * cblock_hist_size; block_hist[tid] = final_hist[tid]; + } } //------------------------------------------------------------- @@ -133,21 +153,59 @@ float reduce_smem(volatile __local float* smem, int size) unsigned int tid = get_local_id(0); float sum = smem[tid]; - if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); } - if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); } - if (size 
>= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 512) + { + if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (size >= 256) + { + if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if (size >= 128) + { + if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + } if (tid < 32) { if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; +#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { +#endif if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; } @@ -224,19 +282,44 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr if (tid < 64) products[tid] = product = product + products[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); + volatile __local float* smem = products; if (tid < 32) { - volatile __local float* smem = products; smem[tid] = product = product + smem[tid + 32]; +#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { - volatile __local float* smem = products; +#endif smem[tid] = product = product + smem[tid + 16]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif smem[tid] = product = product + smem[tid + 8]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif 
smem[tid] = product = product + smem[tid + 4]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif smem[tid] = product = product + smem[tid + 2]; +#ifdef WAVE_SIZE_1 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif smem[tid] = product = product + smem[tid + 1]; } @@ -248,8 +331,8 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr // Extract descriptors __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width, - const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, - __global const float* block_hists, __global float* descriptors) + const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); @@ -271,8 +354,8 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in } __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, - const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) + const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, + const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); @@ -301,8 +384,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in // Gradients computation __kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * 
qangle, - const float angle_scale, const char correct_gamma, const int cnbins) + const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); @@ -400,8 +483,8 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c } __kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) + __global const uchar * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); From d5aaea2749a1fea96c1740ec4a6cbd8c62c59a7a Mon Sep 17 00:00:00 2001 From: yao Date: Wed, 3 Apr 2013 14:24:55 +0800 Subject: [PATCH 04/30] fix some mismatch on cpu device running OCL --- modules/ocl/src/opencl/stereobm.cl | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 99177c7bd0..196a786d5b 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -226,9 +226,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? 
col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; - // int Y = get_group_id(1) * ROWSperTHREAD + radius; + // int Y = get_group_id(1) * ROWSperTHREAD + radius; - #define Y (get_group_id(1) * ROWSperTHREAD + radius) +#define Y (get_group_id(1) * ROWSperTHREAD + radius) volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; @@ -251,9 +251,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function + uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (X < cwidth - radius && Y < cheight - radius) { - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (minSSD.x < minSSDImage[0]) { disparImage[0] = (unsigned char)(d + minSSD.y); @@ -264,7 +264,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char for(int row = 1; row < end_row; row++) { int idx1 = y_tex * img_step + x_tex; - int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex; + int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex; barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE); @@ -278,10 +278,10 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char barrier(CLK_LOCAL_MEM_FENCE); + uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (X < cwidth - radius && row < cheight - radius - Y) { int idx = row * cminSSD_step; - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); if (minSSD.x < minSSDImage[idx]) { disparImage[disp_step * row] = (unsigned char)(d + minSSD.y); @@ -378,50 +378,50 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in int beg_row = group_id_y * RpT; int end_row = min(beg_row + RpT, disp_rows); - // if (x < disp_cols) - // { - int y = beg_row; +// if (x < disp_cols) +// { + 
int y = beg_row; - float sum = 0; - float sum_extra = 0; + float sum = 0; + float sum_extra = 0; - for(int i = y - winsz2; i <= y + winsz2; ++i) - { - sum += sobel(input, x - winsz2, i, input_rows, input_cols); - if (cols_extra) - sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols); - } - *cols = sum; + for(int i = y - winsz2; i <= y + winsz2; ++i) + { + sum += sobel(input, x - winsz2, i, input_rows, input_cols); if (cols_extra) + sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols); + } + *cols = sum; + if (cols_extra) + *cols_extra = sum_extra; + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; + if (sum_win < threshold) + disp[y * disp_step + x] = 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + for(int y = beg_row + 1; y < end_row; ++y) + { + sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + + sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); + *cols = sum; + + if (cols_extra) + { + sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) + + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); *cols_extra = sum_extra; + } barrier(CLK_LOCAL_MEM_FENCE); - float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255; if (sum_win < threshold) disp[y * disp_step + x] = 0; barrier(CLK_LOCAL_MEM_FENCE); - - for(int y = beg_row + 1; y < end_row; ++y) - { - sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + - sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); - *cols = sum; - - if (cols_extra) - { - sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) - + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); - *cols_extra = sum_extra; - } - - barrier(CLK_LOCAL_MEM_FENCE); - float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 
255; - if (sum_win < threshold) - disp[y * disp_step + x] = 0; - - barrier(CLK_LOCAL_MEM_FENCE); - } - // } + } + // } } From 5022bc8c2532a8a66facfcc2a636ac1f8d2c1ba0 Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 5 Apr 2013 09:17:14 +0800 Subject: [PATCH 05/30] move the "cpu device checking" from supportsFeatures() to queryDeviceInfo() --- modules/ocl/include/opencv2/ocl/ocl.hpp | 2 +- modules/ocl/include/opencv2/ocl/private/util.hpp | 5 +++-- modules/ocl/src/initialization.cpp | 15 +++++++++------ modules/ocl/src/pyrlk.cpp | 4 +++- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 7b79cb5b27..da7ca27aeb 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -155,7 +155,7 @@ namespace cv static Context* getContext(); static void setContext(Info &oclinfo); - enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_CPU}; + enum {CL_DOUBLE, CL_UNIFIED_MEM}; bool supportsFeature(int ftype); size_t computeUnits(); void* oclContext(); diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index 62e69a8a24..081d2343dc 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -127,8 +127,9 @@ namespace cv // currently only support wavefront size queries enum DEVICE_INFO { - WAVEFRONT_SIZE, //in AMD speak - WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak + WAVEFRONT_SIZE, //in AMD speak + WARP_SIZE = WAVEFRONT_SIZE, //in nvidia speak + IS_CPU_DEVICE //check if the device is CPU }; //info should have been pre-allocated void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info); diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 78263d86ae..bdcbfa517e 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -397,6 +397,15 @@ namespace cv } break; + case 
IS_CPU_DEVICE: + { + cl_device_type devicetype; + openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], + CL_DEVICE_TYPE, sizeof(cl_device_type), + &devicetype, NULL)); + *(bool*)info = (devicetype == CVCL_DEVICE_TYPE_CPU); + } + break; default: CV_Error(-1, "Invalid device info type"); break; @@ -979,12 +988,6 @@ namespace cv return impl->double_support == 1; case CL_UNIFIED_MEM: return impl->unified_memory == 1; - case CL_CPU: - cl_device_type devicetype; - clGetDeviceInfo(impl->devices[impl->devnum], - CL_DEVICE_TYPE, sizeof(cl_device_type), - &devicetype, NULL); - return devicetype == CVCL_DEVICE_TYPE_CPU; default: return false; } diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index 374134c1cd..4a6ce1c790 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -187,7 +187,9 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - if (clCxt->supportsFeature(Context::CL_CPU)) + bool is_cpu; + queryDeviceInfo(IS_CPU_DEVICE, &is_cpu); + if (is_cpu) { openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU"); releaseTexture(ITex); From 8cc5b980512c02cee20c9f6f82a4531750e75778 Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 5 Apr 2013 09:19:59 +0800 Subject: [PATCH 06/30] Fix the problem of device selection on hybrid video systems. 
--- modules/ocl/test/main.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp index 856828d6a5..dd46ff6e06 100644 --- a/modules/ocl/test/main.cpp +++ b/modules/ocl/test/main.cpp @@ -115,10 +115,9 @@ int main(int argc, char **argv) std::cout << "platform invalid\n"; return -1; } - if(pid != 0 || device != 0) - { - setDevice(oclinfo[pid], device); - } + + setDevice(oclinfo[pid], device); + cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl; return RUN_ALL_TESTS(); } From 64b5784c0b6db801725c2398d1c0a59d8eb82dfa Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Fri, 5 Apr 2013 00:43:10 -0400 Subject: [PATCH 07/30] Catch exceptions when large allocations fail --- modules/python/src2/cv2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 308eb42632..64c4ad9ff3 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -379,7 +379,7 @@ static PyObject* pyopencv_from(const Mat& m) if(!p->refcount || p->allocator != &g_numpyAllocator) { temp.allocator = &g_numpyAllocator; - m.copyTo(temp); + ERRWRAP2(m.copyTo(temp)); p = &temp; } p->addref(); From bee970ab94384664c005142083e806bcf4a870f0 Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 5 Apr 2013 21:29:29 +0800 Subject: [PATCH 08/30] remove the C3 kernels in arithm, as the oclMat will never store 3 channels data --- modules/ocl/src/opencl/arithm_add.cl | 315 -------------- modules/ocl/src/opencl/arithm_add_scalar.cl | 267 ------------ .../ocl/src/opencl/arithm_add_scalar_mask.cl | 305 -------------- .../ocl/src/opencl/arithm_bitwise_and_mask.cl | 391 ------------------ .../src/opencl/arithm_bitwise_and_scalar.cl | 333 --------------- .../ocl/src/opencl/arithm_bitwise_or_mask.cl | 390 ----------------- .../src/opencl/arithm_bitwise_or_scalar.cl | 337 --------------- 
.../opencl/arithm_bitwise_or_scalar_mask.cl | 380 ----------------- .../ocl/src/opencl/arithm_bitwise_xor_mask.cl | 391 ------------------ .../src/opencl/arithm_bitwise_xor_scalar.cl | 333 --------------- .../opencl/arithm_bitwise_xor_scalar_mask.cl | 373 ----------------- modules/ocl/src/opencl/arithm_sub.cl | 310 +------------- modules/ocl/src/opencl/arithm_sub_scalar.cl | 303 +------------- .../ocl/src/opencl/arithm_sub_scalar_mask.cl | 338 +-------------- modules/ocl/test/test_arithm.cpp | 12 +- 15 files changed, 20 insertions(+), 4758 deletions(-) diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index 647171578d..f8f32cde6b 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -675,322 +675,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, } } #endif -__kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 
src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) + convert_short4_sat(src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) + convert_short4_sat(src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2)); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char 
*)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, 
(__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0)); - float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4)); - float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 )); - double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 )); - double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? 
tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index 15ae95df25..152b5a19fb 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -382,274 +382,7 @@ __kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int sr } } #endif -__kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + 
dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - 
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? 
tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void 
arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double 
src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, int4 src2, int rows, int cols, int dst_step1) diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index 1e2ae71af6..673e323ff6 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -446,311 +446,6 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st } #endif -__kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int 
dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + 
src2_data_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = 
*((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; 
- int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float 
*)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? 
tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl index fbc42364ac..5e0428f34b 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl @@ -565,397 +565,6 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 ( } - -__kernel void arithm_bitwise_and_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, 
src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_and_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? 
tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_and_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global 
ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = 
*((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0 ; - short2 tmp_data_1 = src1_data_1 & src2_data_1 ; - short2 tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global 
char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0 ; - int tmp_data_1 = src1_data_1 & src2_data_1 ; - int tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + 
mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - 
char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 & src2_data_0; - char8 tmp_data_1 = src1_data_1 & src2_data_1; - char8 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - __kernel void arithm_bitwise_and_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl index 5058d318e0..9605476581 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl @@ -461,340 +461,7 @@ __kernel void arithm_s_bitwise_and_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_and_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); 
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_and_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = convert_char4_sat(convert_uchar4_sat(src1_data_0) & convert_uchar4_sat(src2_data_0)); - char4 tmp_data_1 = convert_char4_sat(convert_uchar4_sat(src1_data_1) & convert_uchar4_sat(src2_data_1)); - char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2)); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_and_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index 
+ 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = 
*((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0; - short2 tmp_data_1 = src1_data_1 & src2_data_1; - short2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0; - int tmp_data_1 = src1_data_1 & src2_data_1; - int tmp_data_2 = 
src1_data_2 & src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x 
= get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 & src2_data_0; - short4 tmp_data_1 = src1_data_1 & src2_data_1; - short4 tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_bitwise_and_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl index 2523eddcd9..f2cc36e1e0 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl @@ -566,396 +566,6 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 ( #endif -__kernel void arithm_bitwise_or_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, 
int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_or_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = 
src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_or_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global 
ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = 
*((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0 ; - short2 tmp_data_1 = src1_data_1 | src2_data_1 ; - short2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global 
char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0 ; - int tmp_data_1 = src1_data_1 | src2_data_1 ; - int tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); 
- - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = 
*((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 | src2_data_0; - char8 tmp_data_1 = src1_data_1 | src2_data_1; - char8 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - __kernel void arithm_bitwise_or_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl index fdcc00c4ef..7ade345f9b 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl @@ -457,344 +457,7 @@ __kernel void arithm_s_bitwise_or_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_or_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 
src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_or_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_or_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((dst_index + 0 >= dst_start)) 
? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0 ; - short2 
tmp_data_1 = src1_data_1 | src2_data_1 ; - short2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0; - int tmp_data_1 = src1_data_1 | src2_data_1; - int tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global 
char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 
*)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_bitwise_or_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl index 8baa9a2ca2..b8f07a8724 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl @@ -533,387 +533,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_or_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = 
mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_or_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask 
+ mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + 
src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0; - short2 tmp_data_1 = src1_data_1 | src2_data_1; - short2 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0; - int tmp_data_1 = src1_data_1 | src2_data_1; - int tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_s_bitwise_or_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl index 48bd3e444a..7655be3a80 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl @@ -565,397 +565,6 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 ( } #endif - -__kernel void arithm_bitwise_xor_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - 
uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_xor_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? 
tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_xor_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global 
ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = 
*((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0 ; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1 ; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global 
char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0 ; - int tmp_data_1 = src1_data_1 ^ src2_data_1 ; - int tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + 
mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - 
char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 ^ src2_data_0; - char8 tmp_data_1 = src1_data_1 ^ src2_data_1; - char8 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - __kernel void arithm_bitwise_xor_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl index 2c6dd50cd4..73b5687c11 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl @@ -461,340 +461,7 @@ __kernel void arithm_s_bitwise_xor_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_xor_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); 
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_xor_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_xor_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? 
tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0; - short2 tmp_data_1 
= src1_data_1 ^ src2_data_1; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0; - int tmp_data_1 = src1_data_1 ^ src2_data_1; - int tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + 
dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 
src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_bitwise_xor_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl index 26ca59c3a3..ad481aad80 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl @@ -523,380 +523,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = 
x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask 
+ mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + 
src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0; - int tmp_data_1 = src1_data_1 ^ src2_data_1; - int tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_sub.cl b/modules/ocl/src/opencl/arithm_sub.cl index 9cf37970b2..d461d3aaea 100644 --- a/modules/ocl/src/opencl/arithm_sub.cl +++ b/modules/ocl/src/opencl/arithm_sub.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -618,313 +622,7 @@ __kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step, } } #endif -__kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - 
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) - convert_short4_sat(src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) - convert_short4_sat(src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2)); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 
tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 
src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 - (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 - (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 - (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0)); - float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4)); - float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 - src2_data_0; - float tmp_data_1 = src1_data_1 - src2_data_1; - float tmp_data_2 = src1_data_2 - src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 )); - double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 )); - double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? 
tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/src/opencl/arithm_sub_scalar.cl b/modules/ocl/src/opencl/arithm_sub_scalar.cl index 782bcd0607..76bb29418c 100644 --- a/modules/ocl/src/opencl/arithm_sub_scalar.cl +++ b/modules/ocl/src/opencl/arithm_sub_scalar.cl @@ -42,9 +42,12 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ - #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif /**************************************sub with scalar without mask**************************************/ __kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, @@ -372,305 +375,7 @@ __kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int sr } } #endif -__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - 
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0; - int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1; - int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0); - uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1); - uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? 
tmp_2 : -tmp_2; - - ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0); - ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1); - ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char 
*)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - short2 tmp_data_0 = convert_short2_sat(tmp_0); - short2 tmp_data_1 = convert_short2_sat(tmp_1); - short2 tmp_data_2 = convert_short2_sat(tmp_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + 
dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - long tmp_0 = (long)src1_data_0 - (long)src2_data_0; - long tmp_1 = (long)src1_data_1 - (long)src2_data_1; - long tmp_2 = (long)src1_data_2 - (long)src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - int tmp_data_0 = convert_int_sat(tmp_0); - int tmp_data_1 = convert_int_sat(tmp_1); - int tmp_data_2 = convert_int_sat(tmp_2); - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_0 = src1_data_0 - src2_data_0; - float tmp_1 = src1_data_1 - src2_data_1; - float tmp_2 = src1_data_2 - src2_data_2; - - tmp_0 = isMatSubScalar ? 
tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0; - tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1; - tmp_data_2 = isMatSubScalar ? 
tmp_data_2 : -tmp_data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) diff --git a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl index 135354993c..9b758cf4c9 100644 --- a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif /**************************************sub with scalar with mask**************************************/ @@ -430,341 +434,7 @@ __kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_st } } #endif -__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 
0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0; - int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1; - int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0); - uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1); - uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? 
tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0); - ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1); - ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = 
(int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0; - int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1; - int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - short2 tmp_data_0 = convert_short2_sat(tmp_0); - short2 tmp_data_1 = convert_short2_sat(tmp_1); - short2 tmp_data_2 = convert_short2_sat(tmp_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - long tmp_0 = (long)src1_data_0 - (long)src2_data_0; - long tmp_1 = (long)src1_data_1 - (long)src2_data_1; - long tmp_2 = (long)src1_data_2 - (long)src2_data_2; - - tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0; - tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1; - tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2; - - int tmp_data_0 = convert_int_sat(tmp_0); - int tmp_data_1 = convert_int_sat(tmp_1); - int tmp_data_2 = convert_int_sat(tmp_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 - src2_data_0; - float tmp_data_1 = src1_data_1 - src2_data_1; - float tmp_data_2 = src1_data_2 - src2_data_2; - - tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0; - tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1; - tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0; - tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1; - tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index f643864a86..e46fdbddd1 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -1531,6 +1531,10 @@ INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); +INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine( + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), + Values(false))); + INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); // Values(false) is the reserved parameter @@ -1586,19 +1590,19 @@ INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32F INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine( - Values(CV_8UC1, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine( - Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine( - Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, 
CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine( - Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false))); //Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false))); From e2df8c7e62ecd077cec1aca87b63d5dfa22d4df1 Mon Sep 17 00:00:00 2001 From: Siva Prasad Varma Date: Fri, 5 Apr 2013 19:09:09 +0530 Subject: [PATCH 09/30] Fix bug #2590 replaced wrong check to correct check --- samples/cpp/freak_demo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/freak_demo.cpp b/samples/cpp/freak_demo.cpp index 60778fd4e2..5112eae122 100644 --- a/samples/cpp/freak_demo.cpp +++ b/samples/cpp/freak_demo.cpp @@ -72,7 +72,7 @@ int main( int argc, char** argv ) { } Mat imgB = imread(argv[2], CV_LOAD_IMAGE_GRAYSCALE ); - if( !imgA.data ) { + if( !imgB.data ) { std::cout << " --(!) 
Error reading image " << argv[2] << std::endl; return -1; } From 7726e273a9b160635aefa895798673509979010f Mon Sep 17 00:00:00 2001 From: yao Date: Sat, 6 Apr 2013 13:37:36 +0800 Subject: [PATCH 10/30] merge add and sub into one set of kernels --- modules/ocl/src/arithm.cpp | 64 +- modules/ocl/src/opencl/arithm_add.cl | 53 +- modules/ocl/src/opencl/arithm_add_scalar.cl | 41 +- .../ocl/src/opencl/arithm_add_scalar_mask.cl | 42 +- modules/ocl/src/opencl/arithm_sub.cl | 802 ------------------ modules/ocl/src/opencl/arithm_sub_scalar.cl | 511 ----------- .../ocl/src/opencl/arithm_sub_scalar_mask.cl | 611 ------------- 7 files changed, 115 insertions(+), 2009 deletions(-) delete mode 100644 modules/ocl/src/opencl/arithm_sub.cl delete mode 100644 modules/ocl/src/opencl/arithm_sub_scalar.cl delete mode 100644 modules/ocl/src/opencl/arithm_sub_scalar_mask.cl diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 410e460b6c..cba6ccf22f 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -92,9 +92,6 @@ namespace cv extern const char *arithm_bitwise_xor_scalar_mask; extern const char *arithm_compare_eq; extern const char *arithm_compare_ne; - extern const char *arithm_sub; - extern const char *arithm_sub_scalar; - extern const char *arithm_sub_scalar_mask; extern const char *arithm_mul; extern const char *arithm_div; extern const char *arithm_absdiff; @@ -130,7 +127,8 @@ inline int divUp(int total, int grain) /////////////////////// add subtract multiply divide ///////////////////////// ////////////////////////////////////////////////////////////////////////////// template -void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) +void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, + string kernelName, const char **kernelString, void *_scalar, int op_type = 0) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == 
CV_64F) { @@ -186,14 +184,25 @@ void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string scalar = (T)scalar1; args.push_back( make_pair( sizeof(T), (void *)&scalar )); } - - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + switch(op_type) + { + case MAT_ADD: + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_ADD"); + break; + case MAT_SUB: + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_SUB"); + break; + default: + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + } } -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString) +static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, + string kernelName, const char **kernelString, int op_type = 0) { - arithmetic_run(src1, src2, dst, kernelName, kernelString, (void *)NULL); + arithmetic_run(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type); } -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString) +static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, + string kernelName, const char **kernelString, int op_type = 0) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { @@ -248,24 +257,34 @@ static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); + switch (op_type) + { + case MAT_ADD: + openCLExecuteKernel(clCxt, kernelString, kernelName, 
globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD"); + break; + case MAT_SUB: + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB"); + break; + default: + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); + } } void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst) { - arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add); + arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_ADD); } void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add); + arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_ADD); } void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst) { - arithmetic_run(src1, src2, dst, "arithm_sub", &arithm_sub); + arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_SUB); } void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - arithmetic_run(src1, src2, dst, mask, "arithm_sub_with_mask", &arithm_sub); + arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_SUB); } typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *scalar); @@ -351,12 +370,9 @@ void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step1 )); if(isMatSubScalar != 0) - { - isMatSubScalar = isMatSubScalar > 0 ? 
1 : 0; - args.push_back( make_pair( sizeof(cl_int) , (void *)&isMatSubScalar)); - } - - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB"); + else + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD"); } static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar) @@ -452,14 +468,14 @@ void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const ocl void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - string kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub"; - const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar; + string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; + const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar; arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1); } void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask) { - string kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub"; - const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar; + string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add"; + const char **kernelString = mask.data ? 
&arithm_add_scalar_mask : &arithm_add_scalar; arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1); } void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index f8f32cde6b..7d4b0a7653 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -52,6 +52,11 @@ #endif #endif +#ifdef ARITHM_ADD + #define ARITHM_OP(A,B) ((A)+(B)) +#elif defined ARITHM_SUB + #define ARITHM_OP(A,B) ((A)-(B)) +#endif ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////ADD//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -95,7 +100,7 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); + short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); uchar4 tmp_data = convert_uchar4_sat(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -134,7 +139,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data); + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); ushort4 tmp_data = convert_ushort4_sat(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : dst_data.x; @@ -172,7 +177,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data); + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data)); short4 tmp_data = convert_short4_sat(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -200,7 +205,7 @@ __kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset, int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index)); - long tmp = (long)(data1) + (long)(data2); + long tmp = ARITHM_OP((long)(data1), (long)(data2)); *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); } @@ -221,7 +226,7 @@ __kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offse float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = data1 + data2; + float tmp = ARITHM_OP(data1, data2); *((__global float *)((__global char *)dst + dst_index)) = tmp; } @@ -245,7 +250,7 @@ __kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offs double data1 = *((__global double *)((__global char *)src1 + src1_index)); double data2 = *((__global double *)((__global char *)src2 + src2_index)); - *((__global double *)((__global char *)dst + dst_index)) = data1 + data2; + *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2); } } #endif @@ -302,7 +307,7 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i } uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = 
convert_short4_sat(src1_data) + convert_short4_sat(src2_data); + short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); uchar4 tmp_data = convert_uchar4_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; @@ -344,7 +349,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, uchar2 mask_data = vload2(0, mask + mask_index); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data); + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); ushort2 tmp_data = convert_ushort2_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; @@ -384,7 +389,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i uchar2 mask_data = vload2(0, mask + mask_index); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data); + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data)); short2 tmp_data = convert_short2_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; @@ -416,7 +421,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); int dst_data = *((__global int *)((__global char *)dst + dst_index)); - int data = convert_int_sat((long)src_data1 + (long)src_data2); + int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); data = mask_data ? 
data : dst_data; *((__global int *)((__global char *)dst + dst_index)) = data; @@ -446,7 +451,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); float dst_data = *((__global float *)((__global char *)dst + dst_index)); - float data = src_data1 + src_data2; + float data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float *)((__global char *)dst + dst_index)) = data; @@ -477,7 +482,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); double dst_data = *((__global double *)((__global char *)dst + dst_index)); - double data = src_data1 + src_data2; + double data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global double *)((__global char *)dst + dst_index)) = data; @@ -516,7 +521,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i uchar2 mask_data = vload2(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data); + short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data)); uchar4 tmp_data = convert_uchar4_sat(tmp); data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; @@ -548,7 +553,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2); + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); ushort2 data = convert_ushort2_sat(tmp); data = mask_data ? 
data : dst_data; @@ -578,7 +583,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2); + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2)); short2 data = convert_short2_sat(tmp); data = mask_data ? data : dst_data; @@ -608,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2)); + int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); data = mask_data ? data : dst_data; *((__global int2 *)((__global char *)dst + dst_index)) = data; @@ -637,7 +642,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - float2 data = src_data1 + src_data2; + float2 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float2 *)((__global char *)dst + dst_index)) = data; @@ -668,7 +673,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - double2 data = src_data1 + src_data2; + double2 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? 
data : dst_data; *((__global double2 *)((__global char *)dst + dst_index)) = data; @@ -699,7 +704,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2)); + uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2))); data = mask_data ? data : dst_data; *((__global uchar4 *)(dst + dst_index)) = data; @@ -728,7 +733,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2)); + ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); data = mask_data ? data : dst_data; *((__global ushort4 *)((__global char *)dst + dst_index)) = data; @@ -757,7 +762,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2)); + short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2))); data = mask_data ? 
data : dst_data; *((__global short4 *)((__global char *)dst + dst_index)) = data; @@ -786,7 +791,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2)); + int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2))); data = mask_data ? data : dst_data; *((__global int4 *)((__global char *)dst + dst_index)) = data; @@ -815,7 +820,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - float4 data = src_data1 + src_data2; + float4 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float4 *)((__global char *)dst + dst_index)) = data; @@ -846,7 +851,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - double4 data = src_data1 + src_data2; + double4 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? 
data : dst_data; *((__global double4 *)((__global char *)dst + dst_index)) = data; diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index 152b5a19fb..0552fc8a7b 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -49,7 +49,12 @@ #elif defined (cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif +#endif +#ifdef ARITHM_ADD + #define ARITHM_OP(A,B) ((A)+(B)) +#elif defined ARITHM_SUB + #define ARITHM_OP(A,B) ((A)-(B)) #endif /**************************************add with scalar without mask**************************************/ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, @@ -83,7 +88,7 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src } uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + src2_data; + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); uchar4 tmp_data = convert_uchar4_sat(tmp); data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; @@ -120,7 +125,7 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr int2 src2_data = (int2)(src2.x, src2.x); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + src2_data; + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); ushort2 tmp_data = convert_ushort2_sat(tmp); data.x = (dst_index + 0 >= dst_start) ? 
tmp_data.x : data.x; @@ -155,7 +160,7 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src int2 src2_data = (int2)(src2.x, src2.x); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + src2_data; + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); short2 tmp_data = convert_short2_sat(tmp); data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; @@ -181,7 +186,7 @@ __kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_ int src_data2 = src2.x; int dst_data = *((__global int *)((__global char *)dst + dst_index)); - int data = convert_int_sat((long)src_data1 + (long)src_data2); + int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); *((__global int *)((__global char *)dst + dst_index)) = data; } @@ -203,7 +208,7 @@ __kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src float src_data2 = src2.x; float dst_data = *((__global float *)((__global char *)dst + dst_index)); - float data = src_data1 + src_data2; + float data = ARITHM_OP(src_data1, src_data2); *((__global float *)((__global char *)dst + dst_index)) = data; } @@ -227,7 +232,7 @@ __kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int sr double src2_data = src2.x; double dst_data = *((__global double *)((__global char *)dst + dst_index)); - double data = src_data1 + src2_data; + double data = ARITHM_OP(src_data1, src2_data); *((__global double *)((__global char *)dst + dst_index)) = data; } @@ -260,7 +265,7 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + src2_data; + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); uchar4 tmp_data = convert_uchar4_sat(tmp); data.xy = (dst_index + 0 >= dst_start) ? 
tmp_data.xy : data.xy; @@ -286,7 +291,7 @@ __kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int sr int2 src_data2 = (int2)(src2.x, src2.y); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + src_data2; + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); ushort2 data = convert_ushort2_sat(tmp); *((__global ushort2 *)((__global char *)dst + dst_index)) = data; @@ -309,7 +314,7 @@ __kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src int2 src_data2 = (int2)(src2.x, src2.y); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + src_data2; + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); short2 data = convert_short2_sat(tmp); *((__global short2 *)((__global char *)dst + dst_index)) = data; @@ -332,7 +337,7 @@ __kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_ int2 src_data2 = (int2)(src2.x, src2.y); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2)); + int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); *((__global int2 *)((__global char *)dst + dst_index)) = data; } } @@ -353,7 +358,7 @@ __kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src float2 src_data2 = (float2)(src2.x, src2.y); float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - float2 data = src_data1 + src_data2; + float2 data = ARITHM_OP(src_data1, src_data2); *((__global float2 *)((__global char *)dst + dst_index)) = data; } } @@ -376,7 +381,7 @@ __kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int sr double2 src_data2 = (double2)(src2.x, src2.y); double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - double2 data = 
src_data1 + src_data2; + double2 data = ARITHM_OP(src_data1, src_data2); *((__global double2 *)((__global char *)dst + dst_index)) = data; } @@ -398,7 +403,7 @@ __kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2); + uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); *((__global uchar4 *)(dst + dst_index)) = data; } @@ -418,7 +423,7 @@ __kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int sr ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2); + ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } @@ -438,7 +443,7 @@ __kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2); + short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); *((__global short4 *)((__global char *)dst + dst_index)) = data; } @@ -458,7 +463,7 @@ __kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_ int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2)); + int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2))); *((__global int4 *)((__global char *)dst + dst_index)) = data; } @@ -478,7 +483,7 @@ __kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 data = src_data1 + src2; + float4 data = ARITHM_OP(src_data1, src2); 
*((__global float4 *)((__global char *)dst + dst_index)) = data; } @@ -500,7 +505,7 @@ __kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int sr double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 data = src_data1 + src2; + double4 data = ARITHM_OP(src_data1, src2); *((__global double4 *)((__global char *)dst + dst_index)) = data; } diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index 673e323ff6..fdf65923cd 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -51,6 +51,11 @@ #endif #endif +#ifdef ARITHM_ADD + #define ARITHM_OP(A,B) ((A)+(B)) +#elif defined ARITHM_SUB + #define ARITHM_OP(A,B) ((A)-(B)) +#endif /**************************************add with scalar with mask**************************************/ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, @@ -94,7 +99,7 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste } uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + src2_data; + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); uchar4 tmp_data = convert_uchar4_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; @@ -134,7 +139,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st uchar2 mask_data = vload2(0, mask + mask_index); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + src2_data; + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); ushort2 tmp_data = convert_ushort2_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? 
tmp_data.x : data.x; @@ -172,7 +177,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste uchar2 mask_data = vload2(0, mask + mask_index); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) + src2_data; + int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data); short2 tmp_data = convert_short2_sat(tmp); data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; @@ -202,7 +207,7 @@ __kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_ste int src_data2 = src2.x; int dst_data = *((__global int *)((__global char *)dst + dst_index)); - int data = convert_int_sat((long)src_data1 + (long)src_data2); + int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2)); data = mask_data ? data : dst_data; *((__global int *)((__global char *)dst + dst_index)) = data; @@ -230,7 +235,7 @@ __kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_s float src_data2 = src2.x; float dst_data = *((__global float *)((__global char *)dst + dst_index)); - float data = src_data1 + src_data2; + float data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float *)((__global char *)dst + dst_index)) = data; @@ -260,7 +265,7 @@ __kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_ double src_data2 = src2.x; double dst_data = *((__global double *)((__global char *)dst + dst_index)); - double data = src_data1 + src_data2; + double data = ARITHM_OP(src_data1, src_data2); data = mask_data ? 
data : dst_data; *((__global double *)((__global char *)dst + dst_index)) = data; @@ -296,7 +301,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste uchar2 mask_data = vload2(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) + src2_data; + int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data); uchar4 tmp_data = convert_uchar4_sat(tmp); data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; @@ -326,7 +331,7 @@ __kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_st int2 src_data2 = (int2)(src2.x, src2.y); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + src_data2; + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); ushort2 data = convert_ushort2_sat(tmp); data = mask_data ? data : dst_data; @@ -354,7 +359,7 @@ __kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_ste int2 src_data2 = (int2)(src2.x, src2.y); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - int2 tmp = convert_int2_sat(src_data1) + src_data2; + int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2); short2 data = convert_short2_sat(tmp); data = mask_data ? data : dst_data; @@ -382,7 +387,7 @@ __kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step, int2 src_data2 = (int2)(src2.x, src2.y); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2)); + int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2))); data = mask_data ? 
data : dst_data; *((__global int2 *)((__global char *)dst + dst_index)) = data; @@ -409,7 +414,7 @@ __kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_ste float2 src_data2 = (float2)(src2.x, src2.y); float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - float2 data = src_data1 + src_data2; + float2 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global float2 *)((__global char *)dst + dst_index)) = data; @@ -438,7 +443,7 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st double2 src_data2 = (double2)(src2.x, src2.y); double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - double2 data = src_data1 + src_data2; + double2 data = ARITHM_OP(src_data1, src_data2); data = mask_data ? data : dst_data; *((__global double2 *)((__global char *)dst + dst_index)) = data; @@ -451,7 +456,6 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste __global uchar *mask, int mask_step, int mask_offset, int4 src2, int rows, int cols, int dst_step1) { - int x = get_global_id(0); int y = get_global_id(1); @@ -466,7 +470,7 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2); + uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); data = mask_data ? 
data : dst_data; *((__global uchar4 *)(dst + dst_index)) = data; @@ -492,7 +496,7 @@ __kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_st ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2); + ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); data = mask_data ? data : dst_data; *((__global ushort4 *)((__global char *)dst + dst_index)) = data; @@ -518,7 +522,7 @@ __kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_ste short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2); + short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2)); data = mask_data ? data : dst_data; *((__global short4 *)((__global char *)dst + dst_index)) = data; @@ -544,7 +548,7 @@ __kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step, int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2)); + int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2))); data = mask_data ? data : dst_data; *((__global int4 *)((__global char *)dst + dst_index)) = data; @@ -570,7 +574,7 @@ __kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_ste float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - float4 data = src_data1 + src2; + float4 data = ARITHM_OP(src_data1, src2); data = mask_data ? 
data : dst_data; *((__global float4 *)((__global char *)dst + dst_index)) = data; @@ -598,7 +602,7 @@ __kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_st double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - double4 data = src_data1 + src2; + double4 data = ARITHM_OP(src_data1, src2); data = mask_data ? data : dst_data; *((__global double4 *)((__global char *)dst + dst_index)) = data; diff --git a/modules/ocl/src/opencl/arithm_sub.cl b/modules/ocl/src/opencl/arithm_sub.cl deleted file mode 100644 index d461d3aaea..0000000000 --- a/modules/ocl/src/opencl/arithm_sub.cl +++ /dev/null @@ -1,802 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. 
-// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////SUB//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************sub without mask**************************************/ -__kernel void arithm_sub_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + 
src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} -__kernel void arithm_sub_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = 
*((__global ushort4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - convert_int4_sat(src2_data); - ushort4 tmp_data = convert_ushort4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} -__kernel void arithm_sub_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - convert_int4_sat(src2_data); - short4 tmp_data = convert_short4_sat(tmp); - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - -__kernel void arithm_sub_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - long tmp = (long)(data1) - (long)(data2); - - *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp); - } -} -__kernel void arithm_sub_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float data2 = *((__global float *)((__global char *)src2 + src2_index)); - float tmp = data1 - data2; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_D6 (__global double *src1, int src1_step, int src1_offset, - __global double 
*src2, int src2_step, int src2_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double data1 = *((__global double *)((__global char *)src1 + src1_index)); - double data2 = *((__global double *)((__global char *)src2 + src2_index)); - - *((__global double *)((__global char *)dst + dst_index)) = data1 - data2; - } -} -#endif - -/**************************************sub with mask**************************************/ -__kernel void arithm_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && 
(dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - convert_int2_sat(src2_data); - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - convert_int2_sat(src2_data); - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = convert_int_sat((long)src_data1 - (long)src_data2); - data = mask_data ? 
data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = *((__global float *)((__global char *)src2 + src2_index)); - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = *((__global double *)((__global char *)src2 + src2_index)); - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data); - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2); - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2); - short2 data = convert_short2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = convert_int2_sat(convert_long2_sat(src_data1) - convert_long2_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index)); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index)); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = convert_uchar4_sat(convert_short4_sat(src_data1) - convert_short4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = convert_short4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = convert_int4_sat(convert_long4_sat(src_data1) - convert_long4_sat(src_data2)); - data = mask_data ? 
data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = src_data1 - src_data2; - data = mask_data ? 
data : dst_data; - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 5) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - - double4 data = src_data1 - src_data2; - data = mask_data ? data : dst_data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_sub_scalar.cl b/modules/ocl/src/opencl/arithm_sub_scalar.cl deleted file mode 100644 index 76bb29418c..0000000000 --- a/modules/ocl/src/opencl/arithm_sub_scalar.cl +++ /dev/null @@ -1,511 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -/**************************************sub with scalar without mask**************************************/ -__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - short2 tmp_data = convert_short2_sat(tmp); - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - - long tmp = (long)src_data1 - (long)src_data2; - tmp = isMatSubScalar ? 
tmp : -tmp; - int data = convert_int_sat(tmp); - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - - float tmp = src_data1 - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src2_data = src2.x; - - double data = src_data1 - src2_data; - data = isMatSubScalar ? 
data : -data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? 
tmp : -tmp; - ushort2 data = convert_ushort2_sat(tmp); - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - short2 data = convert_short2_sat(tmp); - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2); - tmp = isMatSubScalar ? 
tmp : -tmp; - int2 data = convert_int2_sat(tmp); - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 tmp = src_data1 - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - - *((__global float2 *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = src_data1 - src_data2; - data = isMatSubScalar ? 
data : -data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 data = convert_uchar4_sat(tmp); - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? 
tmp : -tmp; - ushort4 data = convert_ushort4_sat(tmp); - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - short4 data = convert_short4_sat(tmp); - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2); - tmp = isMatSubScalar ? 
tmp : -tmp; - int4 data = convert_int4_sat(tmp); - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - - float4 tmp = src_data1 - src2; - tmp = isMatSubScalar ? tmp : -tmp; - - *((__global float4 *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - - double4 data = src_data1 - src2; - data = isMatSubScalar ? data : -data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl deleted file mode 100644 index 9b758cf4c9..0000000000 --- a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl +++ /dev/null @@ -1,611 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jia Haipeng, jiahaipeng95@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -/**************************************sub with scalar with mask**************************************/ -__kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? 
tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - ushort2 tmp_data = convert_ushort2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - int2 src2_data = (int2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - int2 tmp = convert_int2_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - short2 tmp_data = convert_short2_sat(tmp); - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - long tmp = (long)src_data1 - (long)src_data2; - tmp = isMatSubScalar ? tmp : - tmp; - int data = convert_int_sat(tmp); - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float src_data1 = *((__global float *)((__global char *)src1 + src1_index)); - float src_data2 = src2.x; - float dst_data = *((__global float *)((__global char *)dst + dst_index)); - - float data = src_data1 - src_data2; - data = isMatSubScalar ? data : -data; - data = mask_data ? 
data : dst_data; - - *((__global float *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double src_data1 = *((__global double *)((__global char *)src1 + src1_index)); - double src_data2 = src2.x; - double dst_data = *((__global double *)((__global char *)dst + dst_index)); - - double data = src_data1 - src_data2; - data = isMatSubScalar ? data : -data; - data = mask_data ? 
data : dst_data; - - *((__global double *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - int4 tmp = convert_int4_sat(src1_data) - src2_data; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 tmp_data = convert_uchar4_sat(tmp); - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - ushort2 data = convert_ushort2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - int2 tmp = convert_int2_sat(src_data1) - src_data2; - tmp = isMatSubScalar ? tmp : -tmp; - short2 data = convert_short2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2); - tmp = isMatSubScalar ? tmp : -tmp; - int2 data = convert_int2_sat(tmp); - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index)); - float2 src_data2 = (float2)(src2.x, src2.y); - float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index)); - - float2 data = src_data1 - src_data2; - data = isMatSubScalar ? data : -data; - data = mask_data ? data : dst_data; - - *((__global float2 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index)); - double2 src_data2 = (double2)(src2.x, src2.y); - double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index)); - - double2 data = src_data1 - src_data2; - data = 
isMatSubScalar ? data : -data; - data = mask_data ? data : dst_data; - - *((__global double2 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - -__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - uchar4 data = convert_uchar4_sat(tmp); - - data = mask_data ? data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? 
tmp : -tmp; - ushort4 data = convert_ushort4_sat(tmp); - - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - int4 tmp = convert_int4_sat(src_data1) - src2; - tmp = isMatSubScalar ? tmp : -tmp; - short4 data = convert_short4_sat(tmp); - - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2); - tmp = isMatSubScalar ? tmp : -tmp; - int4 data = convert_int4_sat(tmp); - - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index)); - float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - - float4 data = src_data1 - src2; - data = isMatSubScalar ? data : -data; - - data = mask_data ? 
data : dst_data; - - *((__global float4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index)); - double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - - double4 data = src_data1 - src2; - data = isMatSubScalar ? data : -data; - data = mask_data ? data : dst_data; - - *((__global double4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif From d3083ecf6de344081eec567fd74b8430dc666822 Mon Sep 17 00:00:00 2001 From: Andrey Kamaev Date: Sat, 6 Apr 2013 20:11:40 +0400 Subject: [PATCH 11/30] Fix bitness detection for target platform --- cmake/OpenCVDetectCXXCompiler.cmake | 100 +++++++++++++++------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake index 9ee23da55b..9b841dad8a 100644 --- a/cmake/OpenCVDetectCXXCompiler.cmake +++ b/cmake/OpenCVDetectCXXCompiler.cmake @@ -27,23 +27,23 @@ endif() # the -fPIC flag should be used. 
# ---------------------------------------------------------------------------- if(UNIX) - if (__ICL) - set(CV_ICC __ICL) - elseif(__ICC) - set(CV_ICC __ICC) - elseif(__ECL) - set(CV_ICC __ECL) - elseif(__ECC) - set(CV_ICC __ECC) - elseif(__INTEL_COMPILER) - set(CV_ICC __INTEL_COMPILER) - elseif(CMAKE_C_COMPILER MATCHES "icc") - set(CV_ICC icc_matches_c_compiler) - endif() + if (__ICL) + set(CV_ICC __ICL) + elseif(__ICC) + set(CV_ICC __ICC) + elseif(__ECL) + set(CV_ICC __ECL) + elseif(__ECC) + set(CV_ICC __ECC) + elseif(__INTEL_COMPILER) + set(CV_ICC __INTEL_COMPILER) + elseif(CMAKE_C_COMPILER MATCHES "icc") + set(CV_ICC icc_matches_c_compiler) + endif() endif() if(MSVC AND CMAKE_C_COMPILER MATCHES "icc") - set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS) + set(CV_ICC __INTEL_COMPILER_FOR_WINDOWS) endif() # ---------------------------------------------------------------------------- @@ -64,45 +64,49 @@ if(CMAKE_COMPILER_IS_CLANGCXX) string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_CLANG_REGEX_VERSION "${CMAKE_OPENCV_CLANG_VERSION_FULL}") elseif(CMAKE_COMPILER_IS_GNUCXX) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion - OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -dumpversion + OUTPUT_VARIABLE CMAKE_OPENCV_GCC_VERSION_FULL + OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v - ERROR_VARIABLE CMAKE_OPENCV_GCC_INFO_FULL - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} -v + ERROR_VARIABLE CMAKE_OPENCV_GCC_INFO_FULL + OUTPUT_STRIP_TRAILING_WHITESPACE) - # Typical output in CMAKE_OPENCV_GCC_VERSION_FULL: "c+//0 (whatever) 4.2.3 (...)" - # Look for the version number - string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") - if(NOT CMAKE_GCC_REGEX_VERSION) - 
string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") - endif() - - # Split the three parts: - string(REGEX MATCHALL "[0-9]+" CMAKE_OPENCV_GCC_VERSIONS "${CMAKE_GCC_REGEX_VERSION}") - - list(GET CMAKE_OPENCV_GCC_VERSIONS 0 CMAKE_OPENCV_GCC_VERSION_MAJOR) - list(GET CMAKE_OPENCV_GCC_VERSIONS 1 CMAKE_OPENCV_GCC_VERSION_MINOR) - - set(CMAKE_OPENCV_GCC_VERSION ${CMAKE_OPENCV_GCC_VERSION_MAJOR}${CMAKE_OPENCV_GCC_VERSION_MINOR}) - math(EXPR CMAKE_OPENCV_GCC_VERSION_NUM "${CMAKE_OPENCV_GCC_VERSION_MAJOR}*100 + ${CMAKE_OPENCV_GCC_VERSION_MINOR}") - message(STATUS "Detected version of GNU GCC: ${CMAKE_OPENCV_GCC_VERSION} (${CMAKE_OPENCV_GCC_VERSION_NUM})") - - if(WIN32) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine - OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") - set(MINGW64 1) - endif() + # Typical output in CMAKE_OPENCV_GCC_VERSION_FULL: "c+//0 (whatever) 4.2.3 (...)" + # Look for the version number + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") + if(NOT CMAKE_GCC_REGEX_VERSION) + string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_GCC_REGEX_VERSION "${CMAKE_OPENCV_GCC_VERSION_FULL}") + endif() + + # Split the three parts: + string(REGEX MATCHALL "[0-9]+" CMAKE_OPENCV_GCC_VERSIONS "${CMAKE_GCC_REGEX_VERSION}") + + list(GET CMAKE_OPENCV_GCC_VERSIONS 0 CMAKE_OPENCV_GCC_VERSION_MAJOR) + list(GET CMAKE_OPENCV_GCC_VERSIONS 1 CMAKE_OPENCV_GCC_VERSION_MINOR) + + set(CMAKE_OPENCV_GCC_VERSION ${CMAKE_OPENCV_GCC_VERSION_MAJOR}${CMAKE_OPENCV_GCC_VERSION_MINOR}) + math(EXPR CMAKE_OPENCV_GCC_VERSION_NUM "${CMAKE_OPENCV_GCC_VERSION_MAJOR}*100 + ${CMAKE_OPENCV_GCC_VERSION_MINOR}") + message(STATUS "Detected version of GNU GCC: ${CMAKE_OPENCV_GCC_VERSION} (${CMAKE_OPENCV_GCC_VERSION_NUM})") + + if(WIN32) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine + 
OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + set(MINGW64 1) endif() + endif() endif() -if(MINGW64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR CMAKE_GENERATOR MATCHES "Visual Studio.*Win64") - set(X86_64 1) +if(MSVC64 OR MINGW64) + set(X86_64 1) +elseif(MSVC AND NOT CMAKE_CROSSCOMPILING) + set(X86 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(X86_64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") - set(X86 1) + set(X86 1) elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|ARM.*") - set(ARM 1) + set(ARM 1) endif() From 7bd169caa916ecde3a67f3e862dac7708c3f0e7b Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 9 Apr 2013 10:13:46 +0400 Subject: [PATCH 12/30] Native activity build with Android.mk fixed. --- samples/android/native-activity/jni/Android.mk | 2 +- samples/android/native-activity/jni/Application.mk | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/samples/android/native-activity/jni/Android.mk b/samples/android/native-activity/jni/Android.mk index 720d38b40d..fd4fd2bfbb 100644 --- a/samples/android/native-activity/jni/Android.mk +++ b/samples/android/native-activity/jni/Android.mk @@ -6,7 +6,7 @@ include ../../sdk/native/jni/OpenCV.mk LOCAL_MODULE := native_activity LOCAL_SRC_FILES := native.cpp -LOCAL_LDLIBS := -lm -llog -landroid +LOCAL_LDLIBS += -lm -llog -landroid LOCAL_STATIC_LIBRARIES := android_native_app_glue include $(BUILD_SHARED_LIBRARY) diff --git a/samples/android/native-activity/jni/Application.mk b/samples/android/native-activity/jni/Application.mk index a89e12df19..e9392cfed2 100644 --- a/samples/android/native-activity/jni/Application.mk +++ b/samples/android/native-activity/jni/Application.mk @@ -1,2 +1,4 @@ APP_ABI := armeabi-v7a +APP_STL := gnustl_static +APP_CPPFLAGS := -frtti -fexceptions APP_PLATFORM := android-9 From 
ec6f0e1baf396f2bc25d4ce50c686702cf83981d Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 10 Apr 2013 10:43:22 +0400 Subject: [PATCH 13/30] Incompatible hardware detetction added to OpenCV Manager(Feature #2941) --- .../org/opencv/engine/BinderConnector.java | 46 ++++++++---- .../org/opencv/engine/HardwareDetector.java | 16 +++-- .../opencv/engine/OpenCVEngineService.java | 47 +++++++++--- .../engine/manager/ManagerActivity.java | 72 ++++++++++++++----- 4 files changed, 137 insertions(+), 44 deletions(-) diff --git a/android/service/engine/src/org/opencv/engine/BinderConnector.java b/android/service/engine/src/org/opencv/engine/BinderConnector.java index fd23fbfe49..bde54d5b96 100644 --- a/android/service/engine/src/org/opencv/engine/BinderConnector.java +++ b/android/service/engine/src/org/opencv/engine/BinderConnector.java @@ -4,23 +4,43 @@ import android.os.IBinder; public class BinderConnector { - public BinderConnector(MarketConnector Market) - { - Init(Market); - } - public native IBinder Connect(); - public boolean Disconnect() - { - Final(); - return true; + public BinderConnector(MarketConnector Market) { + mMarket = Market; } - static + public boolean Init() { + boolean result = false; + if (mIsReady) + result = Init(mMarket); + + return result; + } + + public native IBinder Connect(); + + public boolean Disconnect() { - System.loadLibrary("OpenCVEngine"); - System.loadLibrary("OpenCVEngine_jni"); + if (mIsReady) + Final(); + + return mIsReady; } private native boolean Init(MarketConnector Market); - public native void Final(); + private native void Final(); + private static boolean mIsReady = false; + private MarketConnector mMarket; + + static { + try { + System.loadLibrary("OpenCVEngine"); + System.loadLibrary("OpenCVEngine_jni"); + mIsReady = true; + } + catch(UnsatisfiedLinkError e) { + mIsReady = false; + e.printStackTrace(); + } + } + } diff --git a/android/service/engine/src/org/opencv/engine/HardwareDetector.java 
b/android/service/engine/src/org/opencv/engine/HardwareDetector.java index 67320865af..7fc7e1ae8a 100644 --- a/android/service/engine/src/org/opencv/engine/HardwareDetector.java +++ b/android/service/engine/src/org/opencv/engine/HardwareDetector.java @@ -47,9 +47,17 @@ public class HardwareDetector public static native int DetectKnownPlatforms(); - static - { - System.loadLibrary("OpenCVEngine"); - System.loadLibrary("OpenCVEngine_jni"); + public static boolean mIsReady = false; + + static { + try { + System.loadLibrary("OpenCVEngine"); + System.loadLibrary("OpenCVEngine_jni"); + mIsReady = true; + } + catch(UnsatisfiedLinkError e) { + mIsReady = false; + e.printStackTrace(); + } } } diff --git a/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java index df31c7fe8e..b3c4ea0575 100644 --- a/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java +++ b/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java @@ -3,31 +3,62 @@ package org.opencv.engine; import android.app.Service; import android.content.Intent; import android.os.IBinder; +import android.os.RemoteException; import android.util.Log; - public class OpenCVEngineService extends Service { private static final String TAG = "OpenCVEngine/Service"; - private IBinder mEngineInterface; + private IBinder mEngineInterface = null; private MarketConnector mMarket; private BinderConnector mNativeBinder; - public void onCreate() - { + + public void onCreate() { Log.i(TAG, "Service starting"); super.onCreate(); Log.i(TAG, "Engine binder component creating"); mMarket = new MarketConnector(getBaseContext()); mNativeBinder = new BinderConnector(mMarket); - mEngineInterface = mNativeBinder.Connect(); - Log.i(TAG, "Service started successfully"); + if (mNativeBinder.Init()) { + mEngineInterface = mNativeBinder.Connect(); + Log.i(TAG, "Service started successfully"); + } else { + Log.e(TAG, "Cannot initialize 
native part of OpenCV Manager!"); + Log.e(TAG, "Using stub instead"); + + mEngineInterface = new OpenCVEngineInterface.Stub() { + + @Override + public boolean installVersion(String version) throws RemoteException { + // TODO Auto-generated method stub + return false; + } + + @Override + public String getLibraryList(String version) throws RemoteException { + // TODO Auto-generated method stub + return null; + } + + @Override + public String getLibPathByVersion(String version) throws RemoteException { + // TODO Auto-generated method stub + return null; + } + + @Override + public int getEngineVersion() throws RemoteException { + return -1; + } + }; + } } - public IBinder onBind(Intent intent) - { + public IBinder onBind(Intent intent) { Log.i(TAG, "Service onBind called for intent " + intent.toString()); return mEngineInterface; } + public boolean onUnbind(Intent intent) { Log.i(TAG, "Service onUnbind called for intent " + intent.toString()); diff --git a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java index 5213d91495..3c1aac994a 100644 --- a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java +++ b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java @@ -42,6 +42,26 @@ public class ManagerActivity extends Activity @Override public void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); + + if (!HardwareDetector.mIsReady) { + Log.e(TAG, "Cannot initialize native part of OpenCV Manager!"); + + AlertDialog dialog = new AlertDialog.Builder(this).create(); + + dialog.setTitle("OpenCV Manager Error"); + dialog.setMessage("OpenCV Manager is incompatible with this device. 
Please replace it with an appropriate package."); + dialog.setCancelable(false); + dialog.setButton("OK", new DialogInterface.OnClickListener() { + + public void onClick(DialogInterface dialog, int which) { + finish(); + } + }); + + dialog.show(); + return; + } + setContentView(R.layout.main); TextView OsVersionView = (TextView)findViewById(R.id.OsVersionValue); @@ -186,6 +206,20 @@ public class ManagerActivity extends Activity } }); + mPackageChangeReciever = new BroadcastReceiver() { + + @Override + public void onReceive(Context context, Intent intent) { + Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever"); + Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message"); + if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) + { + TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); + EngineVersionView.setText("not avaliable"); + } + } + }; + IntentFilter filter = new IntentFilter(); filter.addAction(Intent.ACTION_PACKAGE_ADDED); filter.addAction(Intent.ACTION_PACKAGE_CHANGED); @@ -199,17 +233,23 @@ public class ManagerActivity extends Activity @Override protected void onDestroy() { super.onDestroy(); - unregisterReceiver(mPackageChangeReciever); + if (mPackageChangeReciever != null) + unregisterReceiver(mPackageChangeReciever); } @Override protected void onResume() { super.onResume(); - Log.d(TAG, "Filling package list on resume"); - if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) - { - TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); - EngineVersionView.setText("not avaliable"); + if (HardwareDetector.mIsReady) { + Log.d(TAG, "Filling package list on resume"); + OpenCVEngineServiceConnection connection = new OpenCVEngineServiceConnection(); + if (!bindService(new Intent("org.opencv.engine.BIND"), connection, 
Context.BIND_AUTO_CREATE)) { + Log.e(TAG, "Cannot bind to OpenCV Manager service!"); + TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); + if (EngineVersionView != null) + EngineVersionView.setText("not avaliable"); + unbindService(connection); + } } } @@ -225,19 +265,7 @@ public class ManagerActivity extends Activity protected int ManagerApiLevel = 0; protected String ManagerVersion; - protected BroadcastReceiver mPackageChangeReciever = new BroadcastReceiver() { - - @Override - public void onReceive(Context context, Intent intent) { - Log.d("OpenCVManager/Reciever", "Bradcast message " + intent.getAction() + " reciever"); - Log.d("OpenCVManager/Reciever", "Filling package list on broadcast message"); - if (!bindService(new Intent("org.opencv.engine.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) - { - TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); - EngineVersionView.setText("not avaliable"); - } - } - }; + protected BroadcastReceiver mPackageChangeReciever = null; protected class OpenCVEngineServiceConnection implements ServiceConnection { @@ -246,6 +274,12 @@ public class ManagerActivity extends Activity public void onServiceConnected(ComponentName name, IBinder service) { OpenCVEngineInterface EngineService = OpenCVEngineInterface.Stub.asInterface(service); + if (EngineService == null) { + Log.e(TAG, "Cannot connect to OpenCV Manager Service!"); + unbindService(this); + return; + } + try { ManagerApiLevel = EngineService.getEngineVersion(); } catch (RemoteException e) { From e8721f1f6fd4a406eba715b0fb44bd8e0f03ce6a Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 10 Apr 2013 12:20:54 +0400 Subject: [PATCH 14/30] Automatically add CUDA support to a module if it has CUDA sources. Backport from master. 
--- cmake/OpenCVModule.cmake | 28 +++++++++++++++++++------ modules/core/CMakeLists.txt | 24 +++++++++------------ modules/nonfree/CMakeLists.txt | 27 ++---------------------- modules/nonfree/src/cuda/surf.cu | 6 ++++-- modules/nonfree/src/cuda/vibe.cu | 7 ++++--- modules/superres/CMakeLists.txt | 28 ++----------------------- modules/superres/src/cuda/btv_l1_gpu.cu | 6 ++++++ 7 files changed, 50 insertions(+), 76 deletions(-) diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 90b4863405..8312845fe0 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -432,11 +432,19 @@ macro(ocv_glob_module_sources) file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") - file(GLOB cl_kernels "src/opencl/*.cl") + file(GLOB lib_cuda_srcs "src/cuda/*.cu") + set(cuda_objs "") + set(lib_cuda_hdrs "") - source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) - source_group("Include" FILES ${lib_hdrs}) - source_group("Include\\detail" FILES ${lib_hdrs_detail}) + if(HAVE_CUDA AND lib_cuda_srcs) + ocv_include_directories(${CUDA_INCLUDE_DIRS}) + file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") + + ocv_cuda_compile(cuda_objs ${lib_cuda_srcs} ${lib_cuda_hdrs}) + source_group("Src\\Cuda" FILES ${lib_cuda_srcs} ${lib_cuda_hdrs}) + endif() + + file(GLOB cl_kernels "src/opencl/*.cl") if(HAVE_OPENCL AND cl_kernels) ocv_include_directories(${OPENCL_INCLUDE_DIRS}) @@ -448,7 +456,12 @@ macro(ocv_glob_module_sources) list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") endif() - ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs}) + source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) + source_group("Include" FILES ${lib_hdrs}) + source_group("Include\\detail" FILES ${lib_hdrs_detail}) + + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} + 
SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_srcs} ${lib_cuda_hdrs}) endmacro() # creates OpenCV module in current folder @@ -461,6 +474,9 @@ macro(ocv_create_module) if(NOT "${ARGN}" STREQUAL "SKIP_LINK") target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN}) + if (HAVE_CUDA) + target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) + endif() if(HAVE_OPENCL AND OPENCL_LIBRARIES) target_link_libraries(${the_module} ${OPENCL_LIBRARIES}) endif() @@ -545,8 +561,8 @@ endmacro() # ocv_define_module(module_name [INTERNAL] [REQUIRED] [] [OPTIONAL ]) macro(ocv_define_module module_name) ocv_add_module(${module_name} ${ARGN}) - ocv_glob_module_sources() ocv_module_include_directories() + ocv_glob_module_sources() ocv_create_module() ocv_add_precompiled_headers(${the_module}) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 4c5112e3f9..dc62a884fa 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -3,25 +3,21 @@ ocv_add_module(core ${ZLIB_LIBRARIES}) ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) if(HAVE_CUDA) - ocv_source_group("Src\\Cuda" GLOB "src/cuda/*.cu") - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include" ${CUDA_INCLUDE_DIRS}) + ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) - - file(GLOB lib_cuda "src/cuda/*.cu") - ocv_cuda_compile(cuda_objs ${lib_cuda}) - - set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) -else() - set(lib_cuda "") - set(cuda_objs "") - set(cuda_link_libs "") endif() -ocv_glob_module_sources(SOURCES ${lib_cuda} ${cuda_objs} "${opencv_core_BINARY_DIR}/version_string.inc") +file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") +file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" 
"include/opencv2/${name}/cuda/detail/*.h") -ocv_create_module(${cuda_link_libs}) +source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) +source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) + +ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) + +ocv_create_module() ocv_add_precompiled_headers(${the_module}) ocv_add_accuracy_tests() ocv_add_perf_tests() - diff --git a/modules/nonfree/CMakeLists.txt b/modules/nonfree/CMakeLists.txt index a846f7406b..5689a12e36 100644 --- a/modules/nonfree/CMakeLists.txt +++ b/modules/nonfree/CMakeLists.txt @@ -3,28 +3,5 @@ if(BUILD_ANDROID_PACKAGE) endif() set(the_description "Functionality with possible limitations on the use") -ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu opencv_ocl) -ocv_module_include_directories() - -if(HAVE_CUDA AND HAVE_opencv_gpu) - ocv_source_group("Src\\Cuda" GLOB "src/cuda/*.cu") - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include" ${CUDA_INCLUDE_DIRS}) - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) - - file(GLOB lib_cuda "src/cuda/*.cu") - ocv_cuda_compile(cuda_objs ${lib_cuda}) - - set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) -else() - set(lib_cuda "") - set(cuda_objs "") - set(cuda_link_libs "") -endif() - -ocv_glob_module_sources(SOURCES ${lib_cuda} ${cuda_objs}) - -ocv_create_module(${cuda_link_libs}) -ocv_add_precompiled_headers(${the_module}) - -ocv_add_accuracy_tests() -ocv_add_perf_tests() +ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +ocv_define_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu opencv_ocl) diff --git a/modules/nonfree/src/cuda/surf.cu b/modules/nonfree/src/cuda/surf.cu index 3f34cc745e..2002f534d0 100644 --- a/modules/nonfree/src/cuda/surf.cu +++ b/modules/nonfree/src/cuda/surf.cu @@ -40,7 +40,9 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" + 
+#ifdef HAVE_OPENCV_GPU #include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/limits.hpp" @@ -956,4 +958,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ +#endif /* HAVE_OPENCV_GPU */ diff --git a/modules/nonfree/src/cuda/vibe.cu b/modules/nonfree/src/cuda/vibe.cu index 6d4653f2a6..ba678abae2 100644 --- a/modules/nonfree/src/cuda/vibe.cu +++ b/modules/nonfree/src/cuda/vibe.cu @@ -40,7 +40,9 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" + +#ifdef HAVE_OPENCV_GPU #include "opencv2/gpu/device/common.hpp" @@ -266,5 +268,4 @@ namespace cv { namespace gpu { namespace device } }}} - -#endif /* CUDA_DISABLER */ +#endif /* HAVE_OPENCV_GPU */ diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt index 92ce01c2d2..6c6022c72c 100644 --- a/modules/superres/CMakeLists.txt +++ b/modules/superres/CMakeLists.txt @@ -3,29 +3,5 @@ if(ANDROID OR IOS) endif() set(the_description "Super Resolution") -ocv_add_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui) -ocv_module_include_directories() - -ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef /wd4127) - -if(HAVE_CUDA) - ocv_source_group("Src\\Cuda" GLOB "src/cuda/*.cu") - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include" ${CUDA_INCLUDE_DIRS}) - - file(GLOB lib_cuda "src/cuda/*.cu") - ocv_cuda_compile(cuda_objs ${lib_cuda}) - - set(cuda_link_libs ${CUDA_LIBRARIES}) -else() - set(lib_cuda "") - set(cuda_objs "") - set(cuda_link_libs "") -endif() - -ocv_glob_module_sources(SOURCES ${lib_cuda} ${cuda_objs}) - -ocv_create_module(${cuda_link_libs}) -ocv_add_precompiled_headers(${the_module}) - -ocv_add_accuracy_tests() -ocv_add_perf_tests() +ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef) +ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui) diff --git a/modules/superres/src/cuda/btv_l1_gpu.cu 
b/modules/superres/src/cuda/btv_l1_gpu.cu index b27671aa0d..b4d96190ae 100644 --- a/modules/superres/src/cuda/btv_l1_gpu.cu +++ b/modules/superres/src/cuda/btv_l1_gpu.cu @@ -40,6 +40,10 @@ // //M*/ +#include "opencv2/opencv_modules.hpp" + +#ifdef HAVE_OPENCV_GPU + #include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/vec_traits.hpp" @@ -232,3 +236,5 @@ namespace btv_l1_device template void calcBtvRegularization<3>(PtrStepSzb src, PtrStepSzb dst, int ksize); template void calcBtvRegularization<4>(PtrStepSzb src, PtrStepSzb dst, int ksize); } + +#endif /* HAVE_OPENCV_GPU */ From 82c61eba4917a5aa00d99e91795d6370e2e031ea Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 10 Apr 2013 13:38:59 +0400 Subject: [PATCH 15/30] Feature #2893 Create Java sample without layout.xml implemented. --- .../15-puzzle/res/layout/activity_puzzle15.xml | 11 ----------- .../15-puzzle/res/menu/activity_puzzle15.xml | 6 ------ .../samples/puzzle15/Puzzle15Activity.java | 18 ++++++++++++------ 3 files changed, 12 insertions(+), 23 deletions(-) delete mode 100644 samples/android/15-puzzle/res/layout/activity_puzzle15.xml delete mode 100644 samples/android/15-puzzle/res/menu/activity_puzzle15.xml diff --git a/samples/android/15-puzzle/res/layout/activity_puzzle15.xml b/samples/android/15-puzzle/res/layout/activity_puzzle15.xml deleted file mode 100644 index 3257ed801c..0000000000 --- a/samples/android/15-puzzle/res/layout/activity_puzzle15.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - diff --git a/samples/android/15-puzzle/res/menu/activity_puzzle15.xml b/samples/android/15-puzzle/res/menu/activity_puzzle15.xml deleted file mode 100644 index 7810d81963..0000000000 --- a/samples/android/15-puzzle/res/menu/activity_puzzle15.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - diff --git a/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java 
b/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java index 466400d873..ebd34fc7e2 100644 --- a/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java +++ b/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java @@ -6,6 +6,7 @@ import org.opencv.android.OpenCVLoader; import org.opencv.core.Mat; import org.opencv.android.CameraBridgeViewBase; import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener; +import org.opencv.android.JavaCameraView; import android.os.Bundle; import android.app.Activity; @@ -22,6 +23,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener, private CameraBridgeViewBase mOpenCvCameraView; private Puzzle15Processor mPuzzle15; + private MenuItem mItemHideNumbers; + private MenuItem mItemStartNewGame; + private int mGameWidth; private int mGameHeight; @@ -52,9 +56,9 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener, super.onCreate(savedInstanceState); getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON); - setContentView(R.layout.activity_puzzle15); - - mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.puzzle_activity_surface_view); + Log.d(TAG, "Creating and seting view"); + mOpenCvCameraView = (CameraBridgeViewBase) new JavaCameraView(this, -1); + setContentView(mOpenCvCameraView); mOpenCvCameraView.setCvCameraViewListener(this); mPuzzle15 = new Puzzle15Processor(); mPuzzle15.prepareNewGame(); @@ -83,17 +87,19 @@ public class Puzzle15Activity extends Activity implements CvCameraViewListener, @Override public boolean onCreateOptionsMenu(Menu menu) { - getMenuInflater().inflate(R.menu.activity_puzzle15, menu); + Log.i(TAG, "called onCreateOptionsMenu"); + mItemHideNumbers = menu.add("Show/hide tile numbers"); + mItemStartNewGame = menu.add("Start new game"); return true; } @Override public boolean onOptionsItemSelected(MenuItem item) { Log.i(TAG, "Menu Item selected " + 
item); - if (item.getItemId() == R.id.menu_start_new_game) { + if (item == mItemStartNewGame) { /* We need to start new game */ mPuzzle15.prepareNewGame(); - } else if (item.getItemId() == R.id.menu_toggle_tile_numbers) { + } else if (item == mItemHideNumbers) { /* We need to enable or disable drawing of the tile numbers */ mPuzzle15.toggleTileNumbers(); } From f64d5127749865b69c02e18b72d96191a2d252d4 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 10 Apr 2013 19:36:39 +0400 Subject: [PATCH 16/30] Backported globbing from master. --- modules/core/include/opencv2/core/core.hpp | 2 + modules/core/src/glob.cpp | 244 +++++++++++++++++++++ modules/core/test/test_io.cpp | 17 ++ 3 files changed, 263 insertions(+) create mode 100644 modules/core/src/glob.cpp diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 7caf7538c5..1bffbaaf10 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -225,6 +225,8 @@ CV_EXPORTS ErrorCallback redirectError( ErrorCallback errCallback, #define CV_DbgAssert(expr) #endif +CV_EXPORTS void glob(String pattern, std::vector& result, bool recursive = false); + CV_EXPORTS void setNumThreads(int nthreads); CV_EXPORTS int getNumThreads(); CV_EXPORTS int getThreadNum(); diff --git a/modules/core/src/glob.cpp b/modules/core/src/glob.cpp new file mode 100644 index 0000000000..368f304ef2 --- /dev/null +++ b/modules/core/src/glob.cpp @@ -0,0 +1,244 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2008-2013, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and / or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +#if defined WIN32 || defined _WIN32 || defined WINCE +# include +const char dir_separators[] = "/\\"; +const char native_separator = '\\'; + +namespace +{ + struct dirent + { + const char* d_name; + }; + + struct DIR + { + WIN32_FIND_DATA data; + HANDLE handle; + dirent ent; + }; + + DIR* opendir(const char* path) + { + DIR* dir = new DIR; + dir->ent.d_name = 0; + dir->handle = ::FindFirstFileA((cv::String(path) + "\\*").c_str(), &dir->data); + if(dir->handle == INVALID_HANDLE_VALUE) + { + /*closedir will do all cleanup*/ + return 0; + } + return dir; + } + + dirent* readdir(DIR* dir) + { + if (dir->ent.d_name != 0) + { + if (::FindNextFile(dir->handle, &dir->data) != TRUE) + return 0; + } + dir->ent.d_name = dir->data.cFileName; + return &dir->ent; + } + + void closedir(DIR* dir) + { + ::FindClose(dir->handle); + delete dir; + } + + +} +#else +# include +# include +const char dir_separators[] = "/"; +const char native_separator = '/'; +#endif + +static bool isDir(const cv::String& path, DIR* dir) +{ +#if defined WIN32 || defined _WIN32 || defined WINCE + DWORD attributes; + if (dir) + attributes = dir->data.dwFileAttributes; + else + attributes = ::GetFileAttributes(path.c_str()); + + return (attributes != INVALID_FILE_ATTRIBUTES) && ((attributes & FILE_ATTRIBUTE_DIRECTORY) != 0); +#else + (void)dir; + struct stat stat_buf; + if (0 != stat( path.c_str(), &stat_buf)) + return false; + int is_dir = S_ISDIR( stat_buf.st_mode); + return is_dir != 0; +#endif +} + +static bool wildcmp(const char *string, const char *wild) +{ + // Based on wildcmp written by Jack Handy - jakkhandy@hotmail.com + const char *cp = 0, *mp = 0; + + while ((*string) && (*wild != '*')) + { + if ((*wild != *string) && (*wild != '?')) + { + return false; + } + + wild++; + string++; + } + + while (*string) + { + if (*wild == '*') + { + if (!*++wild) + { + return true; + } + + mp = wild; + cp = string + 1; + } + else if ((*wild == *string) || (*wild == '?')) 
+ { + wild++; + string++; + } + else + { + wild = mp; + string = cp++; + } + } + + while (*wild == '*') + { + wild++; + } + + return *wild == 0; +} + +static void glob_rec(const cv::String& directory, const cv::String& wildchart, std::vector& result, bool recursive) +{ + DIR *dir; + struct dirent *ent; + + if ((dir = opendir (directory.c_str())) != 0) + { + /* find all the files and directories within directory */ + try + { + while ((ent = readdir (dir)) != 0) + { + const char* name = ent->d_name; + if((name[0] == 0) || (name[0] == '.' && name[1] == 0) || (name[0] == '.' && name[1] == '.' && name[2] == 0)) + continue; + + cv::String path = directory + native_separator + name; + + if (isDir(path, dir)) + { + if (recursive) + glob_rec(path, wildchart, result, recursive); + } + else + { + if (wildchart.empty() || wildcmp(name, wildchart.c_str())) + result.push_back(path); + } + } + } + catch (...) + { + closedir(dir); + throw; + } + closedir(dir); + } + else CV_Error(CV_StsObjectNotFound, cv::format("could not open directory: %s", directory.c_str())); +} + +void cv::glob(String pattern, std::vector& result, bool recursive) +{ + result.clear(); + String path, wildchart; + + if (isDir(pattern, 0)) + { + if(strchr(dir_separators, pattern[pattern.size() - 1]) != 0) + { + path = pattern.substr(0, pattern.size() - 1); + } + else + { + path = pattern; + } + } + else + { + size_t pos = pattern.find_last_of(dir_separators); + if (pos == String::npos) + { + wildchart = pattern; + path = "."; + } + else + { + path = pattern.substr(0, pos); + wildchart = pattern.substr(pos + 1); + } + } + + glob_rec(path, wildchart, result, recursive); + std::sort(result.begin(), result.end()); +} diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp index bf976cf543..3526e83768 100644 --- a/modules/core/test/test_io.cpp +++ b/modules/core/test/test_io.cpp @@ -454,6 +454,23 @@ protected: TEST(Core_InputOutput, huge) { CV_BigMatrixIOTest test; test.safe_run(); } */ 
+TEST(Core_globbing, accuracy) +{ + std::string patternLena = cvtest::TS::ptr()->get_data_path() + "lena*.*"; + std::string patternLenaPng = cvtest::TS::ptr()->get_data_path() + "lena.png"; + + std::vector lenas, pngLenas; + cv::glob(patternLena, lenas, true); + cv::glob(patternLenaPng, pngLenas, true); + + ASSERT_GT(lenas.size(), pngLenas.size()); + + for (size_t i = 0; i < pngLenas.size(); ++i) + { + ASSERT_NE(std::find(lenas.begin(), lenas.end(), pngLenas[i]), lenas.end()); + } +} + TEST(Core_InputOutput, FileStorage) { std::string file = cv::tempfile(".xml"); From 484607fb6f0e4b54ebc83cf0f79a1159f7fe8cd7 Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Wed, 10 Apr 2013 19:39:38 +0400 Subject: [PATCH 17/30] Backported RNG_MT19937 from master. --- modules/core/include/opencv2/core/core.hpp | 34 ++++++ modules/core/src/rand.cpp | 125 +++++++++++++++++++++ modules/core/test/test_rand.cpp | 26 +++++ 3 files changed, 185 insertions(+) diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 1bffbaaf10..1c8e0e2cac 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -2044,6 +2044,40 @@ public: uint64 state; }; +/*! + Random Number Generator - MT + + The class implements RNG using the Mersenne Twister algorithm +*/ +class CV_EXPORTS RNG_MT19937 +{ +public: + RNG_MT19937(); + RNG_MT19937(unsigned s); + void seed(unsigned s); + + unsigned next(); + + operator int(); + operator unsigned(); + operator float(); + operator double(); + + unsigned operator ()(unsigned N); + unsigned operator ()(); + + //! returns uniformly distributed integer random number from [a,b) range + int uniform(int a, int b); + //! returns uniformly distributed floating-point random number from [a,b) range + float uniform(float a, float b); + //! 
returns uniformly distributed double-precision floating-point random number from [a,b) range + double uniform(double a, double b); + +private: + enum PeriodParameters {N = 624, M = 397}; + unsigned state[N]; + int mti; +}; /*! Termination criteria in iterative algorithms diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index bae8eae89d..2cdbe3916d 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -883,4 +883,129 @@ CV_IMPL void cvRandShuffle( CvArr* arr, CvRNG* _rng, double iter_factor ) cv::randShuffle( dst, iter_factor, &rng ); } +// Mersenne Twister random number generator. +// Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c + +/* + A C-program for MT19937, with initialization improved 2002/1/26. + Coded by Takuji Nishimura and Makoto Matsumoto. + + Before using, initialize the state by using init_genrand(seed) + or init_by_array(init_key, key_length). + + Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + Any feedback is very welcome. + http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) +*/ + +cv::RNG_MT19937::RNG_MT19937(unsigned s) { seed(s); } + +cv::RNG_MT19937::RNG_MT19937() { seed(5489U); } + +void cv::RNG_MT19937::seed(unsigned s) +{ + state[0]= s; + for (mti = 1; mti < N; mti++) + { + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + state[mti] = (1812433253U * (state[mti - 1] ^ (state[mti - 1] >> 30)) + mti); + } +} + +unsigned cv::RNG_MT19937::next() +{ + /* mag01[x] = x * MATRIX_A for x=0,1 */ + static unsigned mag01[2] = { 0x0U, /*MATRIX_A*/ 0x9908b0dfU}; + + const unsigned UPPER_MASK = 0x80000000U; + const unsigned LOWER_MASK = 0x7fffffffU; + + /* generate N words at one time */ + if (mti >= N) + { + int kk = 0; + + for (; kk < N - M; ++kk) + { + unsigned y = (state[kk] & UPPER_MASK) | (state[kk + 1] & LOWER_MASK); + state[kk] = state[kk + M] ^ (y >> 1) ^ mag01[y & 0x1U]; + } + + for (; kk < N - 1; ++kk) + { + unsigned y = (state[kk] & UPPER_MASK) | (state[kk + 1] & LOWER_MASK); + state[kk] = state[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1U]; + } + + unsigned y = (state[N - 1] & UPPER_MASK) | (state[0] & LOWER_MASK); + state[N - 1] = state[M - 1] ^ (y >> 1) ^ mag01[y & 0x1U]; + + mti = 0; + } + + unsigned y = state[mti++]; + + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680U; + y ^= (y << 15) & 0xefc60000U; + y ^= (y >> 18); + + return y; +} 
+ +cv::RNG_MT19937::operator unsigned() { return next(); } + +cv::RNG_MT19937::operator int() { return (int)next();} + +cv::RNG_MT19937::operator float() { return next() * (1.f / 4294967296.f); } + +cv::RNG_MT19937::operator double() +{ + unsigned a = next() >> 5; + unsigned b = next() >> 6; + return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); +} + +int cv::RNG_MT19937::uniform(int a, int b) { return (int)(next() % (b - a) + a); } + +float cv::RNG_MT19937::uniform(float a, float b) { return ((float)*this)*(b - a) + a; } + +double cv::RNG_MT19937::uniform(double a, double b) { return ((double)*this)*(b - a) + a; } + +unsigned cv::RNG_MT19937::operator ()(unsigned b) { return next() % b; } + +unsigned cv::RNG_MT19937::operator ()() { return next(); } + /* End of file. */ diff --git a/modules/core/test/test_rand.cpp b/modules/core/test/test_rand.cpp index e93415b3b5..1d9b3dd0d1 100644 --- a/modules/core/test/test_rand.cpp +++ b/modules/core/test/test_rand.cpp @@ -339,3 +339,29 @@ protected: TEST(Core_Rand, range) { Core_RandRangeTest test; test.safe_run(); } + +TEST(Core_RNG_MT19937, regression) +{ + cv::RNG_MT19937 rng; + int actual[61] = {0, }; + const size_t length = (sizeof(actual) / sizeof(actual[0])); + for (int i = 0; i < 10000; ++i ) + { + actual[(unsigned)(rng.next() ^ i) % length]++; + } + + int expected[length] = { + 177, 158, 180, 177, 160, 179, 143, 162, + 177, 144, 170, 174, 165, 168, 168, 156, + 177, 157, 159, 169, 177, 182, 166, 154, + 144, 180, 168, 152, 170, 187, 160, 145, + 139, 164, 157, 179, 148, 183, 159, 160, + 196, 184, 149, 142, 162, 148, 163, 152, + 168, 173, 160, 181, 172, 181, 155, 153, + 158, 171, 138, 150, 150 }; + + for (size_t i = 0; i < length; ++i) + { + ASSERT_EQ(expected[i], actual[i]); + } +} From 4143071e222d5749787dc8c68d35f01bbf71ec3d Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Thu, 11 Apr 2013 13:43:51 +0400 Subject: [PATCH 18/30] In CvBoost, delegated update_weights's implementation to a helper method. 
This allows subclasses of CvBoost to override initial weights that update_weights uses without duplicating its entire implementation. --- modules/ml/include/opencv2/ml/ml.hpp | 2 ++ modules/ml/src/boost.cpp | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/ml/include/opencv2/ml/ml.hpp b/modules/ml/include/opencv2/ml/ml.hpp index 6612d2ea13..4a928c7305 100644 --- a/modules/ml/include/opencv2/ml/ml.hpp +++ b/modules/ml/include/opencv2/ml/ml.hpp @@ -1253,6 +1253,8 @@ public: protected: + void update_weights_impl( CvBoostTree* tree, double initial_weights[2] ); + virtual bool set_params( const CvBoostParams& params ); virtual void update_weights( CvBoostTree* tree ); virtual void trim_weights(); diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp index 8db94bd713..d8e5c0d1d2 100644 --- a/modules/ml/src/boost.cpp +++ b/modules/ml/src/boost.cpp @@ -1117,9 +1117,9 @@ bool CvBoost::train( CvMLData* _data, } void -CvBoost::update_weights( CvBoostTree* tree ) +CvBoost::update_weights_impl( CvBoostTree* tree, double initial_weights[2] ) { - CV_FUNCNAME( "CvBoost::update_weights" ); + CV_FUNCNAME( "CvBoost::update_weights_impl" ); __BEGIN__; @@ -1161,7 +1161,7 @@ CvBoost::update_weights( CvBoostTree* tree ) // so we need to convert class labels to floating-point values double w0 = 1./n; - double p[2] = { 1, 1 }; + double p[2] = { initial_weights[0], initial_weights[1] }; cvReleaseMat( &orig_response ); cvReleaseMat( &sum_response ); @@ -1414,6 +1414,11 @@ CvBoost::update_weights( CvBoostTree* tree ) __END__; } +void +CvBoost::update_weights( CvBoostTree* tree ) { + double initial_weights[2] = { 1, 1 }; + update_weights_impl( tree, initial_weights ); +} static CV_IMPLEMENT_QSORT_EX( icvSort_64f, double, CV_LT, int ) From e351538697bc010feeb5a77688ec997aa26a04cd Mon Sep 17 00:00:00 2001 From: Kirill Kornyakov Date: Thu, 11 Apr 2013 17:50:10 +0400 Subject: [PATCH 19/30] #2813 bugfix --- samples/cpp/lkdemo.cpp | 1 + 1 file 
changed, 1 insertion(+) diff --git a/samples/cpp/lkdemo.cpp b/samples/cpp/lkdemo.cpp index 9ea395c8a3..47fc50e161 100644 --- a/samples/cpp/lkdemo.cpp +++ b/samples/cpp/lkdemo.cpp @@ -134,6 +134,7 @@ int main( int argc, char** argv ) needToInit = true; break; case 'c': + points[0].clear(); points[1].clear(); break; case 'n': From 53a06913cb4541e6d9555b87f72d3f3c83969ca2 Mon Sep 17 00:00:00 2001 From: Kirill Kornyakov Date: Thu, 11 Apr 2013 17:57:33 +0400 Subject: [PATCH 20/30] Minor code cleanings in lkdemo.cpp --- samples/cpp/lkdemo.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/samples/cpp/lkdemo.cpp b/samples/cpp/lkdemo.cpp index 47fc50e161..bbb734cf3c 100644 --- a/samples/cpp/lkdemo.cpp +++ b/samples/cpp/lkdemo.cpp @@ -12,9 +12,8 @@ static void help() { // print a welcome message, and the OpenCV version cout << "\nThis is a demo of Lukas-Kanade optical flow lkdemo(),\n" - "Using OpenCV version %s\n" << CV_VERSION << "\n" - << endl; - + "Using OpenCV version " << CV_VERSION << endl; + cout << "\nIt uses camera by default, but you can provide a path to video as an argument.\n"; cout << "\nHot keys: \n" "\tESC - quit the program\n" "\tr - auto-initialize tracking\n" @@ -30,15 +29,17 @@ static void onMouse( int event, int x, int y, int /*flags*/, void* /*param*/ ) { if( event == CV_EVENT_LBUTTONDOWN ) { - point = Point2f((float)x,(float)y); + point = Point2f((float)x, (float)y); addRemovePt = true; } } int main( int argc, char** argv ) { + help(); + VideoCapture cap; - TermCriteria termcrit(CV_TERMCRIT_ITER|CV_TERMCRIT_EPS,20,0.03); + TermCriteria termcrit(CV_TERMCRIT_ITER|CV_TERMCRIT_EPS, 20, 0.03); Size subPixWinSize(10,10), winSize(31,31); const int MAX_COUNT = 500; @@ -56,8 +57,6 @@ int main( int argc, char** argv ) return 0; } - help(); - namedWindow( "LK Demo", 1 ); setMouseCallback( "LK Demo", onMouse, 0 ); @@ -140,12 +139,10 @@ int main( int argc, char** argv ) case 'n': nightMode = !nightMode; break; - default: - ; 
} std::swap(points[1], points[0]); - swap(prevGray, gray); + cv::swap(prevGray, gray); } return 0; From 57d4c86b2bd86be5811b35fc4ccaf653e7939e0e Mon Sep 17 00:00:00 2001 From: Roman Donchenko Date: Thu, 11 Apr 2013 18:34:04 +0400 Subject: [PATCH 21/30] Fixed the shebang lines on the Python scripts. Also, removed the one from modules/python/src2/cv.py and cleared its executable bit, since it's not a script. --- doc/check_docs.py | 2 +- doc/check_docs2.py | 2 +- doc/conf.py | 2 +- doc/ocv.py | 2 +- doc/patch_refman_latex.py | 2 +- doc/pattern_tools/gen_pattern.py | 2 +- doc/pattern_tools/svgfig.py | 2 +- doc/reformat.py | 2 +- modules/contrib/doc/facerec/src/create_csv.py | 2 +- modules/gpu/misc/mark_nvidia.py | 2 +- modules/java/check-tests.py | 2 +- modules/java/generator/gen_java.py | 2 +- modules/java/generator/gen_javadoc.py | 2 +- modules/java/generator/rst_parser.py | 2 +- modules/python/src2/cv.py | 2 -- modules/python/src2/gen.py | 2 +- modules/python/src2/gen2.py | 2 +- modules/python/src2/hdr_parser.py | 2 +- modules/python/test/calchist.py | 2 +- modules/python/test/camera_calibration.py | 2 +- modules/python/test/findstereocorrespondence.py | 2 +- modules/python/test/goodfeatures.py | 2 +- modules/python/test/leak1.py | 2 +- modules/python/test/leak2.py | 2 +- modules/python/test/leak3.py | 2 +- modules/python/test/leak4.py | 2 +- modules/python/test/precornerdetect.py | 2 +- modules/python/test/test.py | 2 +- modules/python/test/test2.py | 2 +- modules/python/test/ticket_6.py | 2 +- modules/python/test/tickets.py | 2 +- modules/python/test/transformations.py | 2 +- modules/ts/misc/testlog_parser.py | 2 +- samples/python2/_coverage.py | 2 +- samples/python2/_doc.py | 2 +- samples/python2/asift.py | 2 +- samples/python2/browse.py | 2 +- samples/python2/calibrate.py | 2 +- samples/python2/camshift.py | 2 +- samples/python2/coherence.py | 2 +- samples/python2/color_histogram.py | 2 +- samples/python2/common.py | 2 +- samples/python2/contours.py | 2 +- 
samples/python2/deconvolution.py | 2 +- samples/python2/demo.py | 2 +- samples/python2/dft.py | 2 +- samples/python2/digits.py | 2 +- samples/python2/digits_adjust.py | 2 +- samples/python2/digits_video.py | 2 +- samples/python2/distrans.py | 2 +- samples/python2/edge.py | 2 +- samples/python2/facedetect.py | 2 +- samples/python2/feature_homography.py | 2 +- samples/python2/find_obj.py | 2 +- samples/python2/fitline.py | 2 +- samples/python2/floodfill.py | 2 +- samples/python2/gabor_threads.py | 2 +- samples/python2/gaussian_mix.py | 2 +- samples/python2/hist.py | 2 +- samples/python2/inpaint.py | 2 +- samples/python2/kmeans.py | 2 +- samples/python2/lappyr.py | 2 +- samples/python2/letter_recog.py | 2 +- samples/python2/lk_homography.py | 2 +- samples/python2/lk_track.py | 2 +- samples/python2/morphology.py | 2 +- samples/python2/mosse.py | 2 +- samples/python2/motempl.py | 2 +- samples/python2/mouse_and_match.py | 2 +- samples/python2/mser.py | 2 +- samples/python2/opt_flow.py | 2 +- samples/python2/peopledetect.py | 2 +- samples/python2/plane_ar.py | 2 +- samples/python2/plane_tracker.py | 2 +- samples/python2/squares.py | 2 +- samples/python2/stereo_match.py | 2 +- samples/python2/texture_flow.py | 2 +- samples/python2/turing.py | 2 +- samples/python2/video.py | 2 +- samples/python2/video_dmtx.py | 2 +- samples/python2/video_threaded.py | 2 +- samples/python2/watershed.py | 2 +- 82 files changed, 81 insertions(+), 83 deletions(-) mode change 100755 => 100644 modules/python/src2/cv.py diff --git a/doc/check_docs.py b/doc/check_docs.py index 2d8799341d..8ab5fced4d 100755 --- a/doc/check_docs.py +++ b/doc/check_docs.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, glob diff --git a/doc/check_docs2.py b/doc/check_docs2.py index 8092c68011..ca99a50bfb 100755 --- a/doc/check_docs2.py +++ b/doc/check_docs2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, fnmatch, re diff --git a/doc/conf.py b/doc/conf.py index 
7b9b02ecf4..4c7a15c891 100755 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- # diff --git a/doc/ocv.py b/doc/ocv.py index 4ff8a6deb5..8fcef4040b 100755 --- a/doc/ocv.py +++ b/doc/ocv.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- """ ocv domain, a modified copy of sphinx.domains.cpp + shpinx.domains.python. diff --git a/doc/patch_refman_latex.py b/doc/patch_refman_latex.py index 352c46cb56..ff762fc8f3 100755 --- a/doc/patch_refman_latex.py +++ b/doc/patch_refman_latex.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py index 45b45af2db..3643b6d3b2 100755 --- a/doc/pattern_tools/gen_pattern.py +++ b/doc/pattern_tools/gen_pattern.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python """gen_pattern.py To run: diff --git a/doc/pattern_tools/svgfig.py b/doc/pattern_tools/svgfig.py index bf182a8b09..86afa59133 100755 --- a/doc/pattern_tools/svgfig.py +++ b/doc/pattern_tools/svgfig.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # svgfig.py copyright (C) 2008 Jim Pivarski # diff --git a/doc/reformat.py b/doc/reformat.py index 00e4aae9e0..017efebb38 100755 --- a/doc/reformat.py +++ b/doc/reformat.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re diff --git a/modules/contrib/doc/facerec/src/create_csv.py b/modules/contrib/doc/facerec/src/create_csv.py index 71d773c017..c4de778f98 100755 --- a/modules/contrib/doc/facerec/src/create_csv.py +++ b/modules/contrib/doc/facerec/src/create_csv.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys import os.path diff --git a/modules/gpu/misc/mark_nvidia.py b/modules/gpu/misc/mark_nvidia.py index e8cc3e8417..08743fb136 100755 --- a/modules/gpu/misc/mark_nvidia.py +++ b/modules/gpu/misc/mark_nvidia.py @@ -1,4 +1,4 @@ -#/usr/bin/env python 
+#!/usr/bin/env python import sys, re diff --git a/modules/java/check-tests.py b/modules/java/check-tests.py index 4cb80ff724..c4d34f61e9 100755 --- a/modules/java/check-tests.py +++ b/modules/java/check-tests.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, os, re diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index c0da34fe09..1da5cb68c6 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, re, os.path from string import Template diff --git a/modules/java/generator/gen_javadoc.py b/modules/java/generator/gen_javadoc.py index 71372d3a2c..dfa591a959 100755 --- a/modules/java/generator/gen_javadoc.py +++ b/modules/java/generator/gen_javadoc.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re, string, glob from optparse import OptionParser diff --git a/modules/java/generator/rst_parser.py b/modules/java/generator/rst_parser.py index 33dae447d5..d32323a784 100755 --- a/modules/java/generator/rst_parser.py +++ b/modules/java/generator/rst_parser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re, string, fnmatch allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "gpu", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "superres"] diff --git a/modules/python/src2/cv.py b/modules/python/src2/cv.py old mode 100755 new mode 100644 index 4238d05f7a..2d4daf08b0 --- a/modules/python/src2/cv.py +++ b/modules/python/src2/cv.py @@ -1,3 +1 @@ -#/usr/bin/env python - from cv2.cv import * diff --git a/modules/python/src2/gen.py b/modules/python/src2/gen.py index 65cafc9900..40879e569f 100755 --- a/modules/python/src2/gen.py +++ b/modules/python/src2/gen.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys from 
string import Template diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 5061b11479..69a0d369d7 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import hdr_parser, sys, re, os, cStringIO from string import Template diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index 4c81b04ff7..14da8873c3 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import os, sys, re, string diff --git a/modules/python/test/calchist.py b/modules/python/test/calchist.py index 0a52258b20..287e22f91e 100755 --- a/modules/python/test/calchist.py +++ b/modules/python/test/calchist.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # Calculating and displaying 2D Hue-Saturation histogram of a color image import sys diff --git a/modules/python/test/camera_calibration.py b/modules/python/test/camera_calibration.py index 488dd15c62..8ffc5b1cd9 100755 --- a/modules/python/test/camera_calibration.py +++ b/modules/python/test/camera_calibration.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys import math diff --git a/modules/python/test/findstereocorrespondence.py b/modules/python/test/findstereocorrespondence.py index 8f11738cce..40a9603beb 100755 --- a/modules/python/test/findstereocorrespondence.py +++ b/modules/python/test/findstereocorrespondence.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys import cv2.cv as cv diff --git a/modules/python/test/goodfeatures.py b/modules/python/test/goodfeatures.py index 62907772ab..5ccd5b46c1 100755 --- a/modules/python/test/goodfeatures.py +++ b/modules/python/test/goodfeatures.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import unittest diff --git a/modules/python/test/leak1.py b/modules/python/test/leak1.py index 
dde5608951..dbd6040a5a 100755 --- a/modules/python/test/leak1.py +++ b/modules/python/test/leak1.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import numpy as np diff --git a/modules/python/test/leak2.py b/modules/python/test/leak2.py index af1cb0556c..518226448a 100755 --- a/modules/python/test/leak2.py +++ b/modules/python/test/leak2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import numpy as np diff --git a/modules/python/test/leak3.py b/modules/python/test/leak3.py index f72afbbf08..d763c4044d 100755 --- a/modules/python/test/leak3.py +++ b/modules/python/test/leak3.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import math diff --git a/modules/python/test/leak4.py b/modules/python/test/leak4.py index dcfc5cfdc9..9e5864092b 100755 --- a/modules/python/test/leak4.py +++ b/modules/python/test/leak4.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv import math diff --git a/modules/python/test/precornerdetect.py b/modules/python/test/precornerdetect.py index 29a6ca1ecd..97aa906d4a 100755 --- a/modules/python/test/precornerdetect.py +++ b/modules/python/test/precornerdetect.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2.cv as cv diff --git a/modules/python/test/test.py b/modules/python/test/test.py index 7c511e4ef7..48138cbdac 100755 --- a/modules/python/test/test.py +++ b/modules/python/test/test.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import unittest import random diff --git a/modules/python/test/test2.py b/modules/python/test/test2.py index 703d2ed742..a96be4f6bb 100644 --- a/modules/python/test/test2.py +++ b/modules/python/test/test2.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import unittest import random diff --git a/modules/python/test/ticket_6.py b/modules/python/test/ticket_6.py index 533027f5b9..7249ff2c75 100755 --- 
a/modules/python/test/ticket_6.py +++ b/modules/python/test/ticket_6.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import urllib import cv2.cv as cv diff --git a/modules/python/test/tickets.py b/modules/python/test/tickets.py index 1e756bcd82..de51e7aa16 100755 --- a/modules/python/test/tickets.py +++ b/modules/python/test/tickets.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import unittest import random diff --git a/modules/python/test/transformations.py b/modules/python/test/transformations.py index 1f63bcef22..5dce6b0497 100755 --- a/modules/python/test/transformations.py +++ b/modules/python/test/transformations.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python # -*- coding: utf-8 -*- # transformations.py diff --git a/modules/ts/misc/testlog_parser.py b/modules/ts/misc/testlog_parser.py index f61b47bba9..7ae6aa5980 100755 --- a/modules/ts/misc/testlog_parser.py +++ b/modules/ts/misc/testlog_parser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import sys, re, os.path from xml.dom.minidom import parse diff --git a/samples/python2/_coverage.py b/samples/python2/_coverage.py index 1d0f0418be..80edffb997 100755 --- a/samples/python2/_coverage.py +++ b/samples/python2/_coverage.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Utility for measuring python opencv API coverage by samples. diff --git a/samples/python2/_doc.py b/samples/python2/_doc.py index 71c9faa7dd..fe2b6f32be 100755 --- a/samples/python2/_doc.py +++ b/samples/python2/_doc.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Scans current directory for *.py files and reports diff --git a/samples/python2/asift.py b/samples/python2/asift.py index 09894dd4c2..61fca80fbb 100755 --- a/samples/python2/asift.py +++ b/samples/python2/asift.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Affine invariant feature-based image matching sample. 
diff --git a/samples/python2/browse.py b/samples/python2/browse.py index da2c98d1c7..1ea31c0b64 100755 --- a/samples/python2/browse.py +++ b/samples/python2/browse.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' browse.py diff --git a/samples/python2/calibrate.py b/samples/python2/calibrate.py index 54aba8aaf9..11ab813f59 100755 --- a/samples/python2/calibrate.py +++ b/samples/python2/calibrate.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/camshift.py b/samples/python2/camshift.py index 43543dcfc5..4ce005b878 100755 --- a/samples/python2/camshift.py +++ b/samples/python2/camshift.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Camshift tracker diff --git a/samples/python2/coherence.py b/samples/python2/coherence.py index 082a7c6101..d2de154c08 100755 --- a/samples/python2/coherence.py +++ b/samples/python2/coherence.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Coherence-enhancing filtering example diff --git a/samples/python2/color_histogram.py b/samples/python2/color_histogram.py index c27744ed5a..ebb4b642d5 100755 --- a/samples/python2/color_histogram.py +++ b/samples/python2/color_histogram.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/common.py b/samples/python2/common.py index 4c6f800c44..f3c41018a1 100755 --- a/samples/python2/common.py +++ b/samples/python2/common.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' This module contais some common routines used by other samples. diff --git a/samples/python2/contours.py b/samples/python2/contours.py index daa13b2a11..f8cc12c0c9 100755 --- a/samples/python2/contours.py +++ b/samples/python2/contours.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' This program illustrates the use of findContours and drawContours. 
diff --git a/samples/python2/deconvolution.py b/samples/python2/deconvolution.py index 5e3becc645..e9c4f44d1d 100755 --- a/samples/python2/deconvolution.py +++ b/samples/python2/deconvolution.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Wiener deconvolution. diff --git a/samples/python2/demo.py b/samples/python2/demo.py index a9c1070154..068d1eef6d 100755 --- a/samples/python2/demo.py +++ b/samples/python2/demo.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Sample-launcher application. diff --git a/samples/python2/dft.py b/samples/python2/dft.py index 32a91e3b58..73df84dc9e 100644 --- a/samples/python2/dft.py +++ b/samples/python2/dft.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import cv2 import numpy as np diff --git a/samples/python2/digits.py b/samples/python2/digits.py index c84bc241cb..e68ec0461b 100755 --- a/samples/python2/digits.py +++ b/samples/python2/digits.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' SVM and KNearest digit recognition. diff --git a/samples/python2/digits_adjust.py b/samples/python2/digits_adjust.py index 72805d3f00..3147310104 100755 --- a/samples/python2/digits_adjust.py +++ b/samples/python2/digits_adjust.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Digit recognition adjustment. diff --git a/samples/python2/digits_video.py b/samples/python2/digits_video.py index ef62826fd1..473d54560e 100755 --- a/samples/python2/digits_video.py +++ b/samples/python2/digits_video.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/distrans.py b/samples/python2/distrans.py index 386ea8ab99..fc2e3d1772 100755 --- a/samples/python2/distrans.py +++ b/samples/python2/distrans.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Distance transform sample. 
diff --git a/samples/python2/edge.py b/samples/python2/edge.py index 9ce3457d30..4abc94255d 100755 --- a/samples/python2/edge.py +++ b/samples/python2/edge.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' This sample demonstrates Canny edge detection. diff --git a/samples/python2/facedetect.py b/samples/python2/facedetect.py index 27a78950a9..5154711d20 100755 --- a/samples/python2/facedetect.py +++ b/samples/python2/facedetect.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/feature_homography.py b/samples/python2/feature_homography.py index 4dd8ddf136..a9e0d32183 100755 --- a/samples/python2/feature_homography.py +++ b/samples/python2/feature_homography.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Feature homography diff --git a/samples/python2/find_obj.py b/samples/python2/find_obj.py index 7a875ca4f7..66c971dd23 100755 --- a/samples/python2/find_obj.py +++ b/samples/python2/find_obj.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Feature-based image matching sample. diff --git a/samples/python2/fitline.py b/samples/python2/fitline.py index c19dbdc35b..5960f2c856 100755 --- a/samples/python2/fitline.py +++ b/samples/python2/fitline.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Robust line fitting. diff --git a/samples/python2/floodfill.py b/samples/python2/floodfill.py index 177e380255..33978c1589 100755 --- a/samples/python2/floodfill.py +++ b/samples/python2/floodfill.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Floodfill sample. 
diff --git a/samples/python2/gabor_threads.py b/samples/python2/gabor_threads.py index 9582bdb04f..7b8766eb55 100755 --- a/samples/python2/gabor_threads.py +++ b/samples/python2/gabor_threads.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' gabor_threads.py diff --git a/samples/python2/gaussian_mix.py b/samples/python2/gaussian_mix.py index de714fa353..704d30cf6b 100755 --- a/samples/python2/gaussian_mix.py +++ b/samples/python2/gaussian_mix.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np from numpy import random diff --git a/samples/python2/hist.py b/samples/python2/hist.py index 47fdb56bb4..1f32b0e079 100755 --- a/samples/python2/hist.py +++ b/samples/python2/hist.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' This is a sample for histogram plotting for RGB images and grayscale images for better understanding of colour distribution diff --git a/samples/python2/inpaint.py b/samples/python2/inpaint.py index d2b8c849a7..8e91406bcd 100755 --- a/samples/python2/inpaint.py +++ b/samples/python2/inpaint.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Inpainting sample. diff --git a/samples/python2/kmeans.py b/samples/python2/kmeans.py index 4418a46792..0656fa7ad0 100755 --- a/samples/python2/kmeans.py +++ b/samples/python2/kmeans.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' K-means clusterization sample. diff --git a/samples/python2/lappyr.py b/samples/python2/lappyr.py index 232ec54b92..550136262b 100755 --- a/samples/python2/lappyr.py +++ b/samples/python2/lappyr.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' An example of Laplacian Pyramid construction and merging. 
diff --git a/samples/python2/letter_recog.py b/samples/python2/letter_recog.py index 73eeff3bb7..ef8c7229c9 100755 --- a/samples/python2/letter_recog.py +++ b/samples/python2/letter_recog.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' The sample demonstrates how to train Random Trees classifier diff --git a/samples/python2/lk_homography.py b/samples/python2/lk_homography.py index 9996764c6f..5f63897551 100755 --- a/samples/python2/lk_homography.py +++ b/samples/python2/lk_homography.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Lucas-Kanade homography tracker diff --git a/samples/python2/lk_track.py b/samples/python2/lk_track.py index f0d0439622..794cb10035 100755 --- a/samples/python2/lk_track.py +++ b/samples/python2/lk_track.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Lucas-Kanade tracker diff --git a/samples/python2/morphology.py b/samples/python2/morphology.py index 0980981380..99e63444e7 100755 --- a/samples/python2/morphology.py +++ b/samples/python2/morphology.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Morphology operations. 
diff --git a/samples/python2/mosse.py b/samples/python2/mosse.py index aecb423452..671b33fc7d 100755 --- a/samples/python2/mosse.py +++ b/samples/python2/mosse.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' MOSSE tracking sample diff --git a/samples/python2/motempl.py b/samples/python2/motempl.py index 4f78ebad57..3ded78e4bf 100755 --- a/samples/python2/motempl.py +++ b/samples/python2/motempl.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/mouse_and_match.py b/samples/python2/mouse_and_match.py index a9fa882adc..b55b002667 100755 --- a/samples/python2/mouse_and_match.py +++ b/samples/python2/mouse_and_match.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python #!/usr/bin/env python ''' diff --git a/samples/python2/mser.py b/samples/python2/mser.py index 73e0a4f8da..d640ea4b8f 100755 --- a/samples/python2/mser.py +++ b/samples/python2/mser.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' MSER detector demo diff --git a/samples/python2/opt_flow.py b/samples/python2/opt_flow.py index d0bc5c5c11..b476b0142e 100755 --- a/samples/python2/opt_flow.py +++ b/samples/python2/opt_flow.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/peopledetect.py b/samples/python2/peopledetect.py index f4bd46702f..f8d5e6fd0f 100755 --- a/samples/python2/peopledetect.py +++ b/samples/python2/peopledetect.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python import numpy as np import cv2 diff --git a/samples/python2/plane_ar.py b/samples/python2/plane_ar.py index dc2d5d584f..4b12dfe9d9 100755 --- a/samples/python2/plane_ar.py +++ b/samples/python2/plane_ar.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Planar augmented reality diff --git a/samples/python2/plane_tracker.py b/samples/python2/plane_tracker.py index 189f50b362..4b7d3959c7 100755 --- 
a/samples/python2/plane_tracker.py +++ b/samples/python2/plane_tracker.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Multitarget planar tracking diff --git a/samples/python2/squares.py b/samples/python2/squares.py index a247c35627..36676b4546 100755 --- a/samples/python2/squares.py +++ b/samples/python2/squares.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Simple "Square Detector" program. diff --git a/samples/python2/stereo_match.py b/samples/python2/stereo_match.py index 0e46c26617..0803a3d12b 100755 --- a/samples/python2/stereo_match.py +++ b/samples/python2/stereo_match.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Simple example of stereo image matching and point cloud generation. diff --git a/samples/python2/texture_flow.py b/samples/python2/texture_flow.py index 66eb58fe69..8b20faf4c9 100755 --- a/samples/python2/texture_flow.py +++ b/samples/python2/texture_flow.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Texture flow direction estimation. diff --git a/samples/python2/turing.py b/samples/python2/turing.py index 101f22c045..e847f014c1 100755 --- a/samples/python2/turing.py +++ b/samples/python2/turing.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Multiscale Turing Patterns generator diff --git a/samples/python2/video.py b/samples/python2/video.py index 2cdcb2a600..7e90ded038 100755 --- a/samples/python2/video.py +++ b/samples/python2/video.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Video capture sample. diff --git a/samples/python2/video_dmtx.py b/samples/python2/video_dmtx.py index 1ed06c4adc..bcb5785745 100755 --- a/samples/python2/video_dmtx.py +++ b/samples/python2/video_dmtx.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Data matrix detector sample. 
diff --git a/samples/python2/video_threaded.py b/samples/python2/video_threaded.py index 6fce334183..7bead033c5 100755 --- a/samples/python2/video_threaded.py +++ b/samples/python2/video_threaded.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Multithreaded video processing sample. diff --git a/samples/python2/watershed.py b/samples/python2/watershed.py index d4ab22ad59..9740e38895 100755 --- a/samples/python2/watershed.py +++ b/samples/python2/watershed.py @@ -1,4 +1,4 @@ -#/usr/bin/env python +#!/usr/bin/env python ''' Watershed segmentation From 36aad46fe14e032258ce898a87c29d3bbc2ae4ed Mon Sep 17 00:00:00 2001 From: Gurpinder Singh Sandhu Date: Thu, 11 Apr 2013 22:32:50 +0530 Subject: [PATCH 22/30] changed reinterpret_cast to static_cast This issue seem to be lingering around for quite some time https://github.com/Itseez/opencv/pull/639 http://code.opencv.org/issues/2819 --- modules/core/include/opencv2/core/operations.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp index 1170fc4074..9d8696a05d 100644 --- a/modules/core/include/opencv2/core/operations.hpp +++ b/modules/core/include/opencv2/core/operations.hpp @@ -686,7 +686,7 @@ template static inline Scalar operator * (const Matx<_Tp, 4, 4>& a, const Scalar& b) { Matx c(Matx(a), b, Matx_MatMulOp()); - return reinterpret_cast(c); + return static_cast(c); } @@ -694,7 +694,7 @@ static inline Scalar operator * (const Matx& a, const Scalar& b) { Matx c(a, b, Matx_MatMulOp()); - return reinterpret_cast(c); + return static_cast(c); } From 61e041673eb0c7218035398cb93ab54e6dbdaca9 Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 12 Apr 2013 14:44:55 +0800 Subject: [PATCH 23/30] remove the OpenCL Dir finding in CMake --- cmake/OpenCVDetectOpenCL.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake 
index eafecd93cc..970632151a 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -4,7 +4,7 @@ if(APPLE) set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory") mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) else(APPLE) - find_package(OpenCL QUIET) + #find_package(OpenCL QUIET) if (NOT OPENCL_FOUND) find_path(OPENCL_ROOT_DIR From 1e49c00f4b1d9f7f7931a27925bea54d7bf01203 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:47:44 +0800 Subject: [PATCH 24/30] Replace create with ensureSizeIsEnough thus buffer objects can be reused. --- modules/ocl/src/brute_force_matcher.cpp | 37 ++++++++++++++----------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index e61a9f6330..5656e926ba 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -547,8 +547,8 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.cols == query.cols && train.type() == query.type()); - trainIdx.create(1, query.rows, CV_32S); - distance.create(1, query.rows, CV_32F); + ensureSizeIsEnough(1, query.rows, CV_32S, trainIdx); + ensureSizeIsEnough(1, query.rows, CV_32F, distance); matchDispatcher(query, train, mask, trainIdx, distance, distType); exit: @@ -667,10 +667,11 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + const int nQuery = query.rows; - trainIdx.create(1, query.rows, CV_32S); - imgIdx.create(1, query.rows, CV_32S); - distance.create(1, query.rows, CV_32F); + ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); + ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx); + ensureSizeIsEnough(1, nQuery, CV_32F, distance); matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, 
masks, trainIdx, imgIdx, distance, distType); exit: @@ -759,16 +760,18 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); + const int nQuery = query.rows; + const int nTrain = train.rows; if (k == 2) { - trainIdx.create(1, query.rows, CV_32SC2); - distance.create(1, query.rows, CV_32FC2); + ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); + ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); } else { - trainIdx.create(query.rows, k, CV_32S); - distance.create(query.rows, k, CV_32F); - allDist.create(query.rows, train.rows, CV_32FC1); + ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); + ensureSizeIsEnough(nQuery, k, CV_32F, distance); + ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); } trainIdx.setTo(Scalar::all(-1)); @@ -873,9 +876,9 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &quer const int nQuery = query.rows; - trainIdx.create(1, nQuery, CV_32SC2); - imgIdx.create(1, nQuery, CV_32SC2); - distance.create(1, nQuery, CV_32SC2); + ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); + ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx); + ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); trainIdx.setTo(Scalar::all(-1)); @@ -1031,15 +1034,17 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); } + const int nQuery = query.rows; + const int nTrain = train.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size())); - nMatches.create(1, query.rows, CV_32SC1); + ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); if (trainIdx.empty()) { - trainIdx.create(query.rows, 
std::max((train.rows/ 100), 10), CV_32SC1); - distance.create(query.rows, std::max((train.rows/ 100), 10), CV_32FC1); + ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx); + ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance); } nMatches.setTo(Scalar::all(0)); From 113b7584e062388be1b9b8628d1e36b3208ebf2b Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:50:30 +0800 Subject: [PATCH 25/30] Optimize bfmatcher by passing macros. --- modules/ocl/src/brute_force_matcher.cpp | 39 +++++---- modules/ocl/src/opencl/brute_force_match.cl | 91 +++++++++++++++------ 2 files changed, 90 insertions(+), 40 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 5656e926ba..ee0989d643 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -74,6 +74,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat int m_size = MAX_DESC_LEN; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -82,8 +85,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); - args.push_back( make_pair( sizeof(cl_int), (void *)&m_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -93,7 +94,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const 
oclMat std::string kernelName = "BruteForceMatch_UnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -115,6 +116,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d", block_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -123,7 +127,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -133,7 +136,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_Match"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -157,6 +160,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist int m_size = MAX_DESC_LEN; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -167,8 +173,6 @@ void matchUnrolledCached(const 
oclMat &query, const oclMat &train, float maxDist args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); - args.push_back( make_pair( sizeof(cl_int), (void *)&m_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -180,7 +184,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist std::string kernelName = "BruteForceMatch_RadiusUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -197,6 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c int block_size = BLOCK_SIZE; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d", block_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -207,7 +214,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -219,7 +225,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c std::string 
kernelName = "BruteForceMatch_RadiusMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -294,6 +300,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl int m_size = MAX_DESC_LEN; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -302,8 +311,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); - args.push_back( make_pair( sizeof(cl_int), (void *)&m_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -313,7 +320,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl std::string kernelName = "BruteForceMatch_knnUnrollMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -328,6 +335,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; + static const int OPT_SIZE = 40; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D block_size=%d", block_size); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void 
*)&query.data )); @@ -336,7 +346,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( smemSize, (void *)NULL)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); @@ -346,7 +355,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, std::string kernelName = "BruteForceMatch_knnMatch"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index e76fb1d21e..7821920dc2 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -1,5 +1,58 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// @Authors +// Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#define MAX_FLOAT 1e7f +#define MAX_FLOAT 3.40282e+038f + +#ifndef block_size +#define block_size 16 +#endif +#ifndef max_desc_len +#define max_desc_len 64 +#endif int bit1Count(float x) { @@ -15,7 +68,6 @@ int bit1Count(float x) float reduce_block(__local float *s_query, __local float *s_train, - int block_size, int lidx, int lidy, int distType @@ -51,8 +103,6 @@ float reduce_block(__local float *s_query, float reduce_multi_block(__local float *s_query, __local float *s_train, - int max_desc_len, - int block_size, int block_index, int lidx, int lidy, @@ -98,8 +148,6 @@ __kernel void BruteForceMatch_UnrollMatch_D5( __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -108,6 +156,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( int distType ) { + const int lidx = get_local_id(0); const int lidy = get_local_id(1); const int groupidx = get_group_id(0); @@ -117,6 +166,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( int queryIdx = groupidx * block_size + lidy; // load the query into local memory. + #pragma unroll for (int i = 0 ; i < max_desc_len / block_size; i ++) { int loadx = lidx + i * block_size; @@ -128,9 +178,10 @@ __kernel void BruteForceMatch_UnrollMatch_D5( // loopUnrolledCached to find the best trainIdx and best distance. volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++) { float result = 0; + #pragma unroll for (int i = 0 ; i < max_desc_len / block_size ; i++) { //load a block_size * block_size block into local train. @@ -140,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. 
barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -168,6 +219,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); //reduce -- now all reduce implement in each threads. + #pragma unroll for (int k = 0 ; k < block_size; k++) { if (myBestDistance > s_distance[k]) @@ -191,7 +243,6 @@ __kernel void BruteForceMatch_Match_D5( __global int *bestTrainIdx, __global float *bestDistance, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -232,7 +283,7 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -287,8 +338,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( __global float *bestDistance, __global int *nMatches, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -322,7 +371,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -350,7 +399,6 @@ __kernel void BruteForceMatch_RadiusMatch_D5( __global float *bestDistance, __global int *nMatches, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -384,7 +432,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. 
barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -410,8 +458,6 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -455,7 +501,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -559,7 +605,6 @@ __kernel void BruteForceMatch_knnMatch_D5( __global int2 *bestTrainIdx, __global float2 *bestDistance, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -600,7 +645,7 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy, distType); barrier(CLK_LOCAL_MEM_FENCE); } @@ -703,8 +748,6 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5( //__global float *mask, __global float *allDist, __local float *sharebuffer, - int block_size, - int max_desc_len, int query_rows, int query_cols, int train_rows, @@ -721,7 +764,6 @@ kernel void BruteForceMatch_calcDistance_D5( //__global float *mask, __global float *allDist, __local float *sharebuffer, - int block_size, int query_rows, int query_cols, int train_rows, @@ -736,8 +778,7 @@ kernel void BruteForceMatch_findBestMatch_D5( __global float *allDist, __global int *bestTrainIdx, __global float *bestDistance, - int k, - int block_size + int k ) { /* Todo */ From 
6eefd276cf0b9f403bd963686772c5e1620bbb89 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:51:36 +0800 Subject: [PATCH 26/30] Further optimize bfmatcher by passing macros. --- modules/ocl/src/brute_force_matcher.cpp | 47 +++++---- modules/ocl/src/opencl/brute_force_match.cl | 106 +++++++------------- 2 files changed, 60 insertions(+), 93 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index ee0989d643..c3143048f3 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -16,6 +16,7 @@ // // @Authors // Nathan, liujun@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -61,6 +62,8 @@ namespace cv } } +static const int OPT_SIZE = 100; + template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ > void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &trainIdx, const oclMat &distance, int distType) @@ -74,9 +77,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat int m_size = MAX_DESC_LEN; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); + sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -90,7 +93,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string 
kernelName = "BruteForceMatch_UnrollMatch"; @@ -116,9 +118,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d", block_size); + sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -132,7 +134,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_Match"; @@ -160,9 +161,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist int m_size = MAX_DESC_LEN; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); + sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -180,7 +181,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_RadiusUnrollMatch"; @@ -201,9 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c int block_size = BLOCK_SIZE; vector< pair > args; - static const int OPT_SIZE = 40; char opt 
[OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d", block_size); + sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -221,7 +221,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&trainIdx.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_RadiusMatch"; @@ -300,9 +299,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl int m_size = MAX_DESC_LEN; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size); + sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -316,7 +315,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_knnUnrollMatch"; @@ -335,9 +333,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, int block_size = BLOCK_SIZE; vector< pair > args; - static const int OPT_SIZE = 40; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D block_size=%d", block_size); + sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ 
-351,7 +349,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_knnMatch"; @@ -370,6 +367,8 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat int m_size = MAX_DESC_LEN; vector< pair > args; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D distType=%d", distType); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -384,11 +383,10 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_calcDistanceUnrolled"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -402,6 +400,8 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask int block_size = BLOCK_SIZE; vector< pair > args; + char opt [OPT_SIZE] = ""; + sprintf(opt, "-D distType=%d", distType); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -415,11 +415,10 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols )); args.push_back( make_pair( sizeof(cl_int), (void 
*)&query.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&distType )); std::string kernelName = "BruteForceMatch_calcDistance"; - openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth()); + openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt); } } @@ -676,12 +675,14 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); + const int nQuery = query.rows; ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx); ensureSizeIsEnough(1, nQuery, CV_32F, distance); + matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); exit: return; @@ -771,6 +772,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co const int nQuery = query.rows; const int nTrain = train.rows; + if (k == 2) { ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); @@ -1045,6 +1047,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const int nQuery = query.rows; const int nTrain = train.rows; + CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(train.type() == query.type() && train.cols == query.cols); CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size())); diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index 7821920dc2..4e069efce5 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -66,37 +66,30 @@ int bit1Count(float x) return (float)c; } +#ifndef distType +#define distType 0 +#endif + +#if (distType == 0) +#define DIST(x, y) fabs((x) - (y)) +#elif (distType == 1) +#define DIST(x, y) (((x) - (y)) * ((x) - (y))) +#elif (distType == 2) +#define DIST(x, 
y) bit1Count((uint)(x) ^ (uint)(y)) +#endif + + float reduce_block(__local float *s_query, __local float *s_train, int lidx, - int lidy, - int distType + int lidy ) { - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ float result = 0; - switch(distType) + #pragma unroll + for (int j = 0 ; j < block_size ; j++) { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); - } - break; + result += DIST(s_query[lidy * block_size + j], s_train[j * block_size + lidx]); } return result; } @@ -105,35 +98,14 @@ float reduce_multi_block(__local float *s_query, __local float *s_train, int block_index, int lidx, - int lidy, - int distType + int lidy ) { - /* there are threee types in the reducer. 
the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ float result = 0; - switch(distType) + #pragma unroll + for (int j = 0 ; j < block_size ; j++) { - case 0: - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]); - } - break; - case 1: - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + block_index * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - break; - case 2: - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - } - break; + result += DIST(s_query[lidy * max_desc_len + block_index * block_size + j], s_train[j * block_size + lidx]); } return result; } @@ -152,8 +124,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { @@ -191,7 +162,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. 
barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -247,8 +218,7 @@ __kernel void BruteForceMatch_Match_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); @@ -283,7 +253,7 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -344,8 +314,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( int train_cols, int bestTrainIdx_cols, int step, - int ostep, - int distType + int ostep ) { const int lidx = get_local_id(0); @@ -371,7 +340,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -405,8 +374,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5( int train_cols, int bestTrainIdx_cols, int step, - int ostep, - int distType + int ostep ) { const int lidx = get_local_id(0); @@ -432,7 +400,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. 
barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -462,8 +430,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); @@ -501,7 +468,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType); + result += reduce_multi_block(s_query, s_train, i, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -609,8 +576,7 @@ __kernel void BruteForceMatch_knnMatch_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType + int step ) { const int lidx = get_local_id(0); @@ -645,7 +611,7 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); - result += reduce_block(s_query, s_train, lidx, lidy, distType); + result += reduce_block(s_query, s_train, lidx, lidy); barrier(CLK_LOCAL_MEM_FENCE); } @@ -752,8 +718,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType) + int step) { /* Todo */ } @@ -768,8 +733,7 @@ kernel void BruteForceMatch_calcDistance_D5( int query_cols, int train_rows, int train_cols, - int step, - int distType) + int step) { /* Todo */ } From c9d8eb7a84012001a4a57526286e038258b83efe Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:52:21 +0800 Subject: [PATCH 27/30] Fix build error on linux. 
--- modules/ocl/src/brute_force_matcher.cpp | 29 +++++++++++-------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index c3143048f3..1e51bff14b 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -542,14 +542,13 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int int callType = query.depth(); - char cvFuncName[] = "singleMatch"; if (callType != 5) - CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 || callType != 2 || callType != 4))) { - CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -559,7 +558,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const ensureSizeIsEnough(1, query.rows, CV_32F, distance); matchDispatcher(query, train, mask, trainIdx, distance, distType); -exit: + return; } @@ -664,14 +663,13 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int int callType = query.depth(); - char cvFuncName[] = "matchCollection"; if (callType != 5) - CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); if ((distType 
== 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 || callType != 2 || callType != 4))) { - CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -684,7 +682,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); -exit: + return; } @@ -757,14 +755,13 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int int callType = query.depth(); - char cvFuncName[] = "knnMatchSingle"; if (callType != 5) - CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 || callType != 2 || callType != 4))) { - CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); } CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -788,7 +785,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co trainIdx.setTo(Scalar::all(-1)); kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType); -exit: + return; } @@ -1035,14 +1032,14 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int int 
callType = query.depth(); - char cvFuncName[] = "radiusMatchSingle"; + if (callType != 5) - CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n"); if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0 || callType != 2 || callType != 4))) { - CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); + CV_Error(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n"); } const int nQuery = query.rows; @@ -1062,7 +1059,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, nMatches.setTo(Scalar::all(0)); matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType); -exit: + return; } From 1bea9ee26cd9d9ed82f3ac26f9a298af800aa1e4 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:54:06 +0800 Subject: [PATCH 28/30] Rename test case category and code clean up. 
--- modules/ocl/test/test_brute_force_matcher.cpp | 72 +++++++------------ 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/modules/ocl/test/test_brute_force_matcher.cpp b/modules/ocl/test/test_brute_force_matcher.cpp index 424781fe0a..d658c32d16 100644 --- a/modules/ocl/test/test_brute_force_matcher.cpp +++ b/modules/ocl/test/test_brute_force_matcher.cpp @@ -43,16 +43,14 @@ #ifdef HAVE_OPENCL namespace { - ///////////////////////////////////////////////////////////////////////////////////////////////// // BruteForceMatcher - - CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist, cv::ocl::BruteForceMatcher_OCL_base::L2Dist, cv::ocl::BruteForceMatcher_OCL_base::HammingDist) + CV_ENUM(DistType, cv::ocl::BruteForceMatcher_OCL_base::L1Dist,\ + cv::ocl::BruteForceMatcher_OCL_base::L2Dist,\ + cv::ocl::BruteForceMatcher_OCL_base::HammingDist) IMPLEMENT_PARAM_CLASS(DescriptorSize, int) - - PARAM_TEST_CASE(BruteForceMatcher/*, NormCode*/, DistType, DescriptorSize) + PARAM_TEST_CASE(BruteForceMatcher, DistType, DescriptorSize) { - //std::vector oclinfo; cv::ocl::BruteForceMatcher_OCL_base::DistType distType; int normCode; int dim; @@ -64,13 +62,9 @@ namespace virtual void SetUp() { - //normCode = GET_PARAM(0); distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0); dim = GET_PARAM(1); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - queryDescCount = 300; // must be even number because we split train data in some cases in two countFactor = 4; // do not change it @@ -172,49 +166,33 @@ namespace cv::ocl::BruteForceMatcher_OCL_base matcher(distType); - // assume support atomic. 
- //if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS)) - //{ - // try - // { - // std::vector< std::vector > matches; - // matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius); - // } - // catch (const cv::Exception& e) - // { - // ASSERT_EQ(CV_StsNotImplemented, e.code); - // } - //} - //else + std::vector< std::vector > matches; + matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius); + + ASSERT_EQ(static_cast(queryDescCount), matches.size()); + + int badCount = 0; + for (size_t i = 0; i < matches.size(); i++) { - std::vector< std::vector > matches; - matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius); - - ASSERT_EQ(static_cast(queryDescCount), matches.size()); - - int badCount = 0; - for (size_t i = 0; i < matches.size(); i++) + if ((int)matches[i].size() != 1) { - if ((int)matches[i].size() != 1) - { - badCount++; - } - else - { - cv::DMatch match = matches[i][0]; - if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0)) - badCount++; - } + badCount++; + } + else + { + cv::DMatch match = matches[i][0]; + if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0)) + badCount++; } - - ASSERT_EQ(0, badCount); } + + ASSERT_EQ(0, badCount); } - INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine( - //ALL_DEVICES, - testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)), - testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)))); + INSTANTIATE_TEST_CASE_P(OCL_Features2D, BruteForceMatcher, + testing::Combine( + testing::Values(DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist), DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)), + testing::Values(DescriptorSize(57), DescriptorSize(64), 
DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)))); } // namespace #endif From 2338a895f5a0c531feb96c9b8c6c21cfe6a3ccf3 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 12 Apr 2013 16:56:49 +0800 Subject: [PATCH 29/30] Capitalize macro namings. --- modules/ocl/src/brute_force_matcher.cpp | 16 +- modules/ocl/src/opencl/brute_force_match.cl | 184 ++++++++++---------- 2 files changed, 100 insertions(+), 100 deletions(-) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 1e51bff14b..e8f28b778c 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -78,7 +78,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", distType, block_size, m_size); if(globalSize[0] != 0) { @@ -119,7 +119,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d", distType, block_size); if(globalSize[0] != 0) { @@ -162,7 +162,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", distType, block_size, m_size); if(globalSize[0] != 0) { @@ -202,7 +202,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + sprintf(opt, 
"-D DIST_TYPE=%d -D BLOCK_SIZE=%d", distType, block_size); if(globalSize[0] != 0) { @@ -300,7 +300,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size); + sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", distType, block_size, m_size); if(globalSize[0] != 0) { @@ -334,7 +334,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size); + sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d", distType, block_size); if(globalSize[0] != 0) { @@ -368,7 +368,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d", distType); + sprintf(opt, "-D DIST_TYPE=%d", distType); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); @@ -401,7 +401,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask vector< pair > args; char opt [OPT_SIZE] = ""; - sprintf(opt, "-D distType=%d", distType); + sprintf(opt, "-D DIST_TYPE=%d", distType); if(globalSize[0] != 0) { args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl index 4e069efce5..7446c779b0 100644 --- a/modules/ocl/src/opencl/brute_force_match.cl +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -47,11 +47,11 @@ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable #define MAX_FLOAT 3.40282e+038f -#ifndef block_size -#define block_size 16 +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 16 #endif -#ifndef max_desc_len -#define max_desc_len 64 +#ifndef MAX_DESC_LEN +#define MAX_DESC_LEN 64 #endif int 
bit1Count(float x) @@ -66,15 +66,15 @@ int bit1Count(float x) return (float)c; } -#ifndef distType -#define distType 0 +#ifndef DIST_TYPE +#define DIST_TYPE 0 #endif -#if (distType == 0) +#if (DIST_TYPE == 0) #define DIST(x, y) fabs((x) - (y)) -#elif (distType == 1) +#elif (DIST_TYPE == 1) #define DIST(x, y) (((x) - (y)) * ((x) - (y))) -#elif (distType == 2) +#elif (DIST_TYPE == 2) #define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y)) #endif @@ -87,9 +87,9 @@ float reduce_block(__local float *s_query, { float result = 0; #pragma unroll - for (int j = 0 ; j < block_size ; j++) + for (int j = 0 ; j < BLOCK_SIZE ; j++) { - result += DIST(s_query[lidy * block_size + j], s_train[j * block_size + lidx]); + result += DIST(s_query[lidy * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]); } return result; } @@ -103,15 +103,15 @@ float reduce_multi_block(__local float *s_query, { float result = 0; #pragma unroll - for (int j = 0 ; j < block_size ; j++) + for (int j = 0 ; j < BLOCK_SIZE ; j++) { - result += DIST(s_query[lidy * max_desc_len + block_index * block_size + j], s_train[j * block_size + lidx]); + result += DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]); } return result; } -/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size -local size: dim0 is block_size, dim1 is block_size. +/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE +local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE. 
*/ __kernel void BruteForceMatch_UnrollMatch_D5( __global float *query, @@ -133,15 +133,15 @@ __kernel void BruteForceMatch_UnrollMatch_D5( const int groupidx = get_group_id(0); __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * max_desc_len; + __local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN; - int queryIdx = groupidx * block_size + lidy; + int queryIdx = groupidx * BLOCK_SIZE + lidy; // load the query into local memory. #pragma unroll - for (int i = 0 ; i < max_desc_len / block_size; i ++) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++) { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + int loadx = lidx + i * BLOCK_SIZE; + s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; } float myBestDistance = MAX_FLOAT; @@ -149,15 +149,15 @@ __kernel void BruteForceMatch_UnrollMatch_D5( // loopUnrolledCached to find the best trainIdx and best distance. volatile int imgIdx = 0; - for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++) + for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++) { float result = 0; #pragma unroll - for (int i = 0 ; i < max_desc_len / block_size ; i++) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++) { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? 
train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); @@ -167,7 +167,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); } - int trainIdx = t * block_size + lidx; + int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) { @@ -179,11 +179,11 @@ __kernel void BruteForceMatch_UnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float*)(sharebuffer); - __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); //find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance; s_trainIdx[lidx] = myBestTrainIdx; @@ -191,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5( //reduce -- now all reduce implement in each threads. 
#pragma unroll - for (int k = 0 ; k < block_size; k++) + for (int k = 0 ; k < BLOCK_SIZE; k++) { if (myBestDistance > s_distance[k]) { @@ -225,30 +225,30 @@ __kernel void BruteForceMatch_Match_D5( const int lidy = get_local_id(1); const int groupidx = get_group_id(0); - const int queryIdx = groupidx * block_size + lidy; + const int queryIdx = groupidx * BLOCK_SIZE + lidy; float myBestDistance = MAX_FLOAT; int myBestTrainIdx = -1; __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; + __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; // loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { //Dist dist; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++) { - const int loadx = lidx + i * block_size; + const int loadx = lidx + i * BLOCK_SIZE; //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; + s_query[lidy * BLOCK_SIZE + lidx] = 0; + s_train[lidx * BLOCK_SIZE + lidy] = 0; if (loadx < query_cols) { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -258,7 +258,7 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); } - const int trainIdx = t * block_size + lidx; + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows && result < 
myBestDistance /*&& mask(queryIdx, trainIdx)*/) { @@ -271,18 +271,18 @@ __kernel void BruteForceMatch_Match_D5( barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance; s_trainIdx[lidx] = myBestTrainIdx; barrier(CLK_LOCAL_MEM_FENCE); //reduce -- now all reduce implement in each threads. - for (int k = 0 ; k < block_size; k++) + for (int k = 0 ; k < BLOCK_SIZE; k++) { if (myBestDistance > s_distance[k]) { @@ -322,20 +322,20 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5( const int groupidx = get_group_id(0); const int groupidy = get_group_id(1); - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; + const int queryIdx = groupidy * BLOCK_SIZE + lidy; + const int trainIdx = groupidx * BLOCK_SIZE + lidx; __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; + __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; ++i) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i) { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? 
train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); @@ -382,20 +382,20 @@ __kernel void BruteForceMatch_RadiusMatch_D5( const int groupidx = get_group_id(0); const int groupidy = get_group_id(1); - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; + const int queryIdx = groupidy * BLOCK_SIZE + lidy; + const int trainIdx = groupidx * BLOCK_SIZE + lidx; __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; + __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; float result = 0; - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) + for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i) { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. + const int loadx = lidx + i * BLOCK_SIZE; - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? 
train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); @@ -437,15 +437,15 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( const int lidy = get_local_id(1); const int groupidx = get_group_id(0); - const int queryIdx = groupidx * block_size + lidy; + const int queryIdx = groupidx * BLOCK_SIZE + lidy; local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * max_desc_len; + local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN; // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++) { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + int loadx = lidx + i * BLOCK_SIZE; + s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; } float myBestDistance1 = MAX_FLOAT; @@ -455,15 +455,15 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( //loopUnrolledCached volatile int imgIdx = 0; - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { float result = 0; - for (int i = 0 ; i < max_desc_len / block_size ; i++) + for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++) { - const int loadX = lidx + i * block_size; - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + const int loadX = lidx + i * BLOCK_SIZE; + //load a BLOCK_SIZE * BLOCK_SIZE block into local train. 
+ const int loadx = lidx + i * BLOCK_SIZE; + s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //synchronize to make sure each elem for reduceIteration in share memory is written already. barrier(CLK_LOCAL_MEM_FENCE); @@ -473,7 +473,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); } - const int trainIdx = t * block_size + lidx; + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows) { @@ -495,11 +495,11 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); local float *s_distance = (local float *)sharebuffer; - local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); + local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); // find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = myBestDistance1; s_trainIdx[lidx] = myBestTrainIdx1; @@ -512,7 +512,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; if (val < bestDistance1) @@ -540,7 +540,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; @@ -583,9 +583,9 @@ __kernel void BruteForceMatch_knnMatch_D5( const int lidy = get_local_id(1); const int groupidx = get_group_id(0); - const int queryIdx = groupidx * block_size + lidy; + const int queryIdx = groupidx * BLOCK_SIZE + lidy; local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * block_size; + local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE; float myBestDistance1 = MAX_FLOAT; float myBestDistance2 = MAX_FLOAT; 
@@ -593,20 +593,20 @@ __kernel void BruteForceMatch_knnMatch_D5( int myBestTrainIdx2 = -1; //loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++) { float result = 0.0f; - for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++) + for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++) { - const int loadx = lidx + i * block_size; + const int loadx = lidx + i * BLOCK_SIZE; //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; + s_query[lidy * BLOCK_SIZE + lidx] = 0; + s_train[lidx * BLOCK_SIZE + lidy] = 0; if (loadx < query_cols) { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -616,7 +616,7 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); } - const int trainIdx = t * block_size + lidx; + const int trainIdx = t * BLOCK_SIZE + lidx; if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) { @@ -638,11 +638,11 @@ __kernel void BruteForceMatch_knnMatch_D5( barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE); //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; + s_distance += lidy * BLOCK_SIZE; + s_trainIdx += lidy * BLOCK_SIZE; s_distance[lidx] = 
myBestDistance1; s_trainIdx[lidx] = myBestTrainIdx1; @@ -655,7 +655,7 @@ __kernel void BruteForceMatch_knnMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; if (val < bestDistance1) @@ -683,7 +683,7 @@ __kernel void BruteForceMatch_knnMatch_D5( if (lidx == 0) { - for (int i = 0 ; i < block_size ; i++) + for (int i = 0 ; i < BLOCK_SIZE ; i++) { float val = s_distance[i]; From 719e8674ad78820ef8980eb0781438e36aed9a94 Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 12 Apr 2013 17:38:59 +0800 Subject: [PATCH 30/30] fix the compile errors on Mac --- modules/nonfree/src/surf.ocl.cpp | 2 +- modules/ocl/src/filtering.cpp | 3 +- modules/ocl/src/opencl/arithm_flip.cl | 44 +++-- modules/ocl/src/opencl/filter_sep_row.cl | 181 +++++++++--------- modules/ocl/src/opencl/filtering_laplacian.cl | 90 ++++----- modules/ocl/src/opencl/imgproc_integral.cl | 36 ++-- modules/ocl/src/opencl/imgproc_warpAffine.cl | 111 +++++------ .../ocl/src/opencl/imgproc_warpPerspective.cl | 115 +++++------ modules/ocl/src/opencl/match_template.cl | 11 +- 9 files changed, 311 insertions(+), 282 deletions(-) diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 78864c6f96..acc188edf8 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -77,7 +77,7 @@ namespace cv size_t wave_size = 0; queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); - std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast(wave_size)); + std::sprintf(pSURF_OPTIONS, "-D WAVE_SIZE=%d", static_cast(wave_size)); OPTION_INIT = true; } openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS); diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 2f4a494cda..cc07209b15 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -277,8 +277,7 @@ static void GPUErode(const oclMat &src, oclMat 
&dst, oclMat &mat_kernel, char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s", anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], - rectKernel?"-D RECTKERNEL":"", - s); + s, rectKernel?"-D RECTKERNEL":""); vector< pair > args; args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl index f4925244a5..d0e6782cbb 100644 --- a/modules/ocl/src/opencl/arithm_flip.cl +++ b/modules/ocl/src/opencl/arithm_flip.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); @@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); @@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (((dst_offset >> 1) & 3) << 1) + +#ifdef 
dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset >> 1) & 3) << 1) int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); @@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (((dst_offset >> 1) & 3) << 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset >> 1) & 3) << 1) int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); @@ -314,16 +330,14 @@ __kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src if (x < thread_cols && y < rows) { int src_index_0 = mad24(y, src_step, (x) + src_offset); - int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); - - int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset); - uchar data0 = *(src + src_index_0); - uchar data1 = *(src + src_index_1); - - *(dst + dst_index_0) = data1; *(dst + dst_index_1) = data0; + + int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset); + int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); + uchar data1 = *(src + src_index_1); + *(dst + dst_index_0) = data1; } } __kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset, diff --git a/modules/ocl/src/opencl/filter_sep_row.cl b/modules/ocl/src/opencl/filter_sep_row.cl index bfe6cd4dd6..5524041fc3 100644 --- a/modules/ocl/src/opencl/filter_sep_row.cl +++ b/modules/ocl/src/opencl/filter_sep_row.cl @@ -96,18 +96,18 @@ The info above maybe obsolete. 
***********************************************************************************/ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 - (__global const uchar * restrict src, - __global float * dst, - const int dst_cols, - const int dst_rows, - const int src_whole_cols, - const int src_whole_rows, - const int src_step_in_pixel, - const int src_offset_x, - const int src_offset_y, - const int dst_step_in_pixel, - const int radiusy, - __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) +(__global const uchar * restrict src, + __global float * dst, + const int dst_cols, + const int dst_rows, + const int src_whole_cols, + const int src_whole_rows, + const int src_step_in_pixel, + const int src_offset_x, + const int src_offset_y, + const int dst_step_in_pixel, + const int radiusy, + __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) { int x = get_global_id(0)<<2; int y = get_global_id(1); @@ -122,17 +122,17 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); //read pixels from src - for(i = 0;i 0)) ? current_addr : 0; temp[i] = *(__global uchar4*)&src[current_addr]; } //judge if read out of boundary - for(i = 0;isrc_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); int4 index[READ_TIMES_ROW]; int4 addr; @@ -148,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ if(not_all_in_range) { //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i 0)) ? 
current_addr : 0; temp[i] = src[current_addr]; } //judge if read out of boundary - for(i = 0;i> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); - } + data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); + sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data)); + } } } @@ -207,7 +207,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w; *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum); } - } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////32FC1//////////////////////////////////////////////////////// @@ -225,7 +225,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x int groupX_size = get_local_size(0); int groupX_id = get_group_id(0); - #define dst_align (dst_offset_x & 3) +#define dst_align (dst_offset_x & 3) int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; @@ -236,7 +236,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -254,7 +254,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x data = con ? 
data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -272,7 +272,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2))); local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #endif +#endif } } } @@ -295,17 +295,17 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x for(int i = 0; i < ANCHOR; i++) { - #pragma unroll 3 - for(int j = 0; j < ANCHOR; j++) - { +#pragma unroll 3 + for(int j = 0; j < ANCHOR; j++) + { if(dst_rows_index < dst_rows_end) { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); - sum = sum + (mat_kernel[i * ANCHOR + j] * data); - } + data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols); + sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data); + } } } @@ -318,7 +318,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x *((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum; } - } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -337,7 +337,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ int groupX_size = get_local_size(0); int groupX_id = get_group_id(0); - #define dst_align (dst_offset_x & 3) +#define dst_align (dst_offset_x & 3) int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; int rows_start_index = src_offset_y + 
(gY << ROWS_PER_GROUP_BITS) - ANY; @@ -349,7 +349,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -367,7 +367,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -386,7 +386,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2))); local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #endif +#endif } } } @@ -410,17 +410,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_ for(int i = 0; i < ANCHOR; i++) { - #pragma unroll 3 - for(int j = 0; j < ANCHOR; j++) - { +#pragma unroll 3 + for(int j = 0; j < ANCHOR; j++) + { if(dst_rows_index < dst_rows_end) { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; + int local_row = (lX >> THREADS_PER_ROW_BIT) + i; + int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); - sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); - } + data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols)); + sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data)); + } } } @@ -468,7 +468,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ { if((rows_start_index - src_offset_y) + i < rows + ANY) { - #ifdef 
BORDER_CONSTANT +#ifdef BORDER_CONSTANT int selected_row = rows_start_index + i; int selected_cols = cols_start_index_group + lX; @@ -486,7 +486,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ data = con ? data : 0; local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data; } - #else +#else int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); @@ -504,7 +504,7 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4))); local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data; } - #endif +#endif } } } @@ -519,10 +519,10 @@ __kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_ for(int i = 0; i < ANCHOR; i++) { - for(int j = 0; j < ANCHOR; j++) - { - int local_cols = lX + j; - sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]; + for(int j = 0; j < ANCHOR; j++) + { + int local_cols = lX + j; + sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]); } } diff --git a/modules/ocl/src/opencl/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl index 80f460b86e..c546957687 100644 --- a/modules/ocl/src/opencl/imgproc_integral.cl +++ b/modules/ocl/src/opencl/imgproc_integral.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif #define LSIZE 256 #define LSIZE_1 255 @@ -71,13 +75,13 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float gid = gid << 1; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); - src_t[1] = (i + lid < rows ? 
convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); + src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); + src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); barrier(CLK_LOCAL_MEM_FENCE); int bf_loc = lid + GET_CONFLICT_OFFSET(lid); @@ -127,7 +131,8 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float } barrier(CLK_LOCAL_MEM_FENCE); int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; lm_sqsum[0][bf_loc] += sqsum_t[0]; @@ -169,15 +174,15 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo src_step = src_step >> 4; for(int i = 0; i < rows; i =i + LSIZE_1) { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; - sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; - sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0; + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0; + sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0; + sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; sum_t[0] = (i == 0 ? 
0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); barrier(CLK_LOCAL_MEM_FENCE); int bf_loc = lid + GET_CONFLICT_OFFSET(lid); @@ -228,14 +233,14 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo barrier(CLK_LOCAL_MEM_FENCE); if(gid == 0 && (i + lid) <= rows) { - sum[sum_offset + i + lid] = 0; - sqsum[sqsum_offset + i + lid] = 0; + sum[sum_offset + i + lid] = 0; + sqsum[sqsum_offset + i + lid] = 0; } if(i + lid == 0) { int loc0 = gid * 2 * sum_step; int loc1 = gid * 2 * sqsum_step; - for(int k = 1;k <= 8;k++) + for(int k = 1; k <= 8; k++) { if(gid * 8 + k > cols) break; sum[sum_offset + loc0 + k * sum_step / 4] = 0; @@ -244,7 +249,8 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo } int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; lm_sqsum[0][bf_loc] += sqsum_t[0]; diff --git a/modules/ocl/src/opencl/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl index 8aee1838c4..6eee8d3fa7 100644 --- a/modules/ocl/src/opencl/imgproc_warpAffine.cl +++ b/modules/ocl/src/opencl/imgproc_warpAffine.cl @@ -47,8 +47,12 @@ //warpAffine kernel //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. 
-#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; typedef double4 F4; #define convert_F4 convert_double4 @@ -58,7 +62,6 @@ typedef float4 F4; #define convert_F4 convert_float4 #endif - #define INTER_BITS 5 #define INTER_TAB_SIZE (1 << INTER_BITS) #define INTER_SCALE 1.f/INTER_TAB_SIZE @@ -81,8 +84,8 @@ inline void interpolateCubic( float x, float* coeffs ) /**********************************************8UC1********************************************* ***********************************************************************************************/ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -123,14 +126,14 @@ __kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global u sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? 
sval : dval; *d = dval; } } __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -180,7 +183,7 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob spos1 = src_offset + sy * srcStep + sx + 1; spos2 = src_offset + (sy+1) * srcStep + sx; spos3 = src_offset + (sy+1) * srcStep + sx + 1; - + v0.s0 = scon0.s0 ? src[spos0.s0] : 0; v1.s0 = scon1.s0 ? src[spos1.s0] : 0; v2.s0 = scon2.s0 ? src[spos2.s0] : 0; @@ -200,22 +203,22 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob v1.s3 = scon1.s3 ? src[spos1.s3] : 0; v2.s3 = scon2.s3 ? src[spos2.s3] : 0; v3.s3 = scon3.s3 ? 
src[spos3.s3] : 0; - + short4 itab0, itab1, itab2, itab3; float4 taby, tabx; taby = INTER_SCALE * convert_float4(ay); tabx = INTER_SCALE * convert_float4(ax); - itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); - itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE )); + itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE )); + itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); + itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE )); int4 val; uchar4 tval; val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) - + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); + + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); @@ -228,8 +231,8 @@ __kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __glob } __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -255,10 +258,10 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i 
< src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; + } short itab[16]; float tab1y[4], tab1x[4]; @@ -288,7 +291,7 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); } @@ -309,8 +312,8 @@ __kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, ***********************************************************************************************/ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -333,8 +336,8 @@ __kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global } __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -386,8 +389,8 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo } __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * 
dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -418,10 +421,10 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob int i,j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } int itab[16]; float tab1y[4], tab1x[4]; float axx, ayy; @@ -447,14 +450,14 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob int diff = isum - INTER_REMAP_COEF_SCALE; int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) + for( k1 = 2; k1 < 4; k1++ ) for( k2 = 2; k2 < 4; k2++ ) { if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? 
(itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); @@ -477,8 +480,8 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob ***********************************************************************************************/ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -501,8 +504,8 @@ __kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int } __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -548,12 +551,12 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[(dst_offset>>2)+dy*dstStep+dx] = sum; - } + } } __kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -617,8 +620,8 @@ __kernel void warpAffineCubic_C1_D5(__global float * 
src, __global float * dst, ***********************************************************************************************/ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -636,13 +639,13 @@ __kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, i short sy0 = (short)(Y0 >> AB_BITS); if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : 0; + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : (float4)0; } } __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -670,10 +673,10 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds float4 v0, v1, v2, v3; - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? 
src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; float tab[4]; float taby[2], tabx[2]; @@ -691,12 +694,12 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) dst[dst_offset+dy*dstStep+dx] = sum; - } + } } __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -726,7 +729,7 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -754,5 +757,5 @@ __kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst dst[dst_offset+dy*dstStep+dx] = sum; } - } + } } diff --git a/modules/ocl/src/opencl/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl index a37ffa1bee..edbe42c4a7 100644 --- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl +++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl @@ -47,8 +47,12 @@ //wrapPerspective kernel //support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. -#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; typedef double4 F4; #define convert_F4 convert_double4 @@ -81,8 +85,8 @@ inline void interpolateCubic( float x, float* coeffs ) /**********************************************8UC1********************************************* ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -112,14 +116,14 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo sval.s1 = scon.s1 ? src[spos.s1] : 0; sval.s2 = scon.s2 ? src[spos.s2] : 0; sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; + dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? 
sval : dval; *d = dval; } } __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -142,7 +146,7 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ int i; #pragma unroll 4 for(i=0; i<4; i++) - v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; + v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0; short itab[4]; float tab1y[2], tab1x[2]; @@ -170,8 +174,8 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _ } __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -190,15 +194,15 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * short ay = (short)(Y & (INTER_TAB_SIZE-1)); short ax = (short)(X & (INTER_TAB_SIZE-1)); - uchar v[16]; + uchar v[16]; int i, j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? 
src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0; + } short itab[16]; float tab1y[4], tab1x[4]; @@ -227,7 +231,7 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); } @@ -249,8 +253,8 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * ***********************************************************************************************/ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -273,8 +277,8 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl } __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -299,10 +303,10 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, int4 v0, v1, v2, v3; - v0 = (sx >= 0 
&& sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0; int itab0, itab1, itab2, itab3; float taby, tabx; @@ -323,8 +327,8 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, } __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -352,10 +356,10 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ int i,j; #pragma unroll 4 for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? 
(src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } int itab[16]; float tab1y[4], tab1x[4]; float axx, ayy; @@ -381,14 +385,14 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ int diff = isum - INTER_REMAP_COEF_SCALE; int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) + for( k1 = 2; k1 < 4; k1++ ) for( k2 = 2; k2 < 4; k2++ ) { if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) mk1 = k1, mk2 = k2; else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; + Mk1 = k1, Mk2 = k2; } diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); @@ -411,8 +415,8 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -434,8 +438,8 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst } __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -458,10 +462,10 @@ __kernel void 
warpPerspectiveLinear_C1_D5(__global float * src, __global float * float v0, v1, v2, v3; - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0; float tab[4]; float taby[2], tabx[2]; @@ -483,8 +487,8 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * } __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -510,7 +514,7 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -546,8 +550,8 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * ***********************************************************************************************/ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -564,13 +568,13 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d short sy = (short)Y; if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : 0; + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : (float)0; } } __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) + int dst_cols, int dst_rows, int srcStep, int dstStep, + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -597,10 +601,10 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 float4 v0, v1, v2, v3; - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? 
src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; float tab[4]; float taby[2], tabx[2]; @@ -622,8 +626,8 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 } __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) + int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -652,7 +656,7 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 int i; for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? 
src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; float tab[16]; float tab1y[4], tab1x[4]; @@ -680,5 +684,6 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 dst[dst_offset+dy*dstStep+dx] = sum; } - } + } } + diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 857f891c38..0dd3e69c40 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -447,10 +447,10 @@ void matchTemplate_Naive_CCORR_C1_D0 __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); for(j = 0; j < tpl_cols; j ++) { - sum = mad24(img_ptr[j], tpl_ptr[j], sum); + sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum); } } - res[res_idx] = sum; + res[res_idx] = (float)sum; } } @@ -548,7 +548,7 @@ void matchTemplate_Naive_CCORR_C4_D0 sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); } } - res[res_idx] = sum.x + sum.y + sum.z + sum.w; + res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w); } } @@ -633,9 +633,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { - float sum = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); res[res_idx] -= sum * tpl_sum; } }