diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index e3e959c950..5ac5f22c58 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -482,9 +482,9 @@ enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) { CV_Assert(flipCode >= - 1 && flipCode <= 1); - int type = _src.type(), cn = CV_MAT_CN(type), flipType; + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), flipType; - if (cn > 4 || cn == 3) + if (cn > 4) return false; const char * kernelName; @@ -506,7 +506,8 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) } ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, - format( "-D type=%s", ocl::memopTypeToStr(type))); + format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type), + ocl::memopTypeToStr(depth), cn)); if (k.empty()) return false; diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index db1ce760f3..82d177eda4 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2679,17 +2679,17 @@ namespace cv { static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s ) { - int type = _m.type(), cn = CV_MAT_CN(type); - if (cn == 3) - return false; + int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn); ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc, - format("-D T=%s", ocl::memopTypeToStr(type))); + format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type), + ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype))); if (k.empty()) return false; UMat m = _m.getUMat(); - k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, type, s))); + k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s))); size_t globalsize[2] = { m.cols, m.rows }; return k.run(2, globalsize, NULL, false); diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl index 0c874dbe6f..bacfe7adfb 100644 --- a/modules/core/src/opencl/flip.cl +++ b/modules/core/src/opencl/flip.cl @@ -39,10 +39,18 @@ // //M*/ -#define sizeoftype ((int)sizeof(type)) +#if cn != 3 +#define loadpix(addr) *(__global const T *)(addr) +#define storepix(val, addr) *(__global T *)(addr) = val +#define TSIZE (int)sizeof(T) +#else +#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr)) +#define TSIZE ((int)sizeof(T1)*3) +#endif -__kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); @@ -50,19 +58,16 @@ __kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int sr if (x < cols && y < thread_rows) { - __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset))); - __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x, sizeoftype, srcoffset))); + T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); + T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset))); - __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset))); - __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x, sizeoftype, dstoffset))); - - dst0[0] = src1[0]; - dst1[0] = src0[0]; + storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); + storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset))); } } -__kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); @@ -71,19 +76,16 @@ __kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, i if (x < cols && y < thread_rows) { int x1 = cols - x - 1; - __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset))); - __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x1, sizeoftype, srcoffset))); + T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); + T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset))); - __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x1, sizeoftype, dstoffset))); - __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset))); - - dst0[0] = src0[0]; - dst1[0] = src1[0]; + storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset))); + storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); } } -__kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); @@ -92,13 +94,10 @@ __kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int sr if (x < thread_cols && y < rows) { int x1 = cols - x - 1; - __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset))); - __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x1, sizeoftype, srcoffset))); + T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); + T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset))); - __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x1, sizeoftype, dstoffset))); - __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset))); - - dst1[0] = src1[0]; - dst0[0] = src0[0]; + storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset))); + storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); } } diff --git a/modules/core/src/opencl/set_identity.cl b/modules/core/src/opencl/set_identity.cl index d63ce793db..0e8f1424fb 100644 --- a/modules/core/src/opencl/set_identity.cl +++ b/modules/core/src/opencl/set_identity.cl @@ -43,17 +43,28 @@ // //M*/ +#if cn != 3 +#define loadpix(addr) *(__global const T *)(addr) +#define storepix(val, addr) *(__global T *)(addr) = val +#define TSIZE (int)sizeof(T) +#define scalar scalar_ +#else +#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr)) +#define TSIZE ((int)sizeof(T1)*3) +#define scalar (T)(scalar_.x, scalar_.y, scalar_.z) +#endif + __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols, - T scalar) + ST scalar_) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset)); - __global T * src = (__global T *)(srcptr + src_index); + int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); - src[0] = x == y ? scalar : (T)(0); + storepix(x == y ? scalar : (T)(0), srcptr + src_index); } }