mirror of
https://github.com/opencv/opencv.git
synced 2024-11-29 05:29:54 +08:00
Merge pull request #2326 from vpisarev:ocl_ch3
This commit is contained in:
commit
0fc248fb70
@ -934,16 +934,23 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
|
||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
|
||||
if( oclop < 0 || ((haveMask || haveScalar) && (cn > 4 || cn == 3)) ||
|
||||
if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
|
||||
(!doubleSupport && srcdepth == CV_64F))
|
||||
return false;
|
||||
|
||||
char opts[1024];
|
||||
int kercn = haveMask || haveScalar ? cn : 1;
|
||||
sprintf(opts, "-D %s%s -D %s -D dstT=%s%s",
|
||||
int scalarcn = kercn == 3 ? 4 : kercn;
|
||||
|
||||
sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d",
|
||||
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop],
|
||||
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
|
||||
ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
|
||||
ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
|
||||
bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
|
||||
ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
|
||||
kercn);
|
||||
|
||||
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
|
||||
if( k.empty() )
|
||||
@ -960,7 +967,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
|
||||
if( haveScalar )
|
||||
{
|
||||
size_t esz = CV_ELEM_SIZE(srctype);
|
||||
size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
|
||||
double buf[4] = {0,0,0,0};
|
||||
|
||||
if( oclop != OCL_OP_NOT )
|
||||
@ -1294,7 +1301,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
|
||||
bool haveMask = !_mask.empty();
|
||||
|
||||
if( ((haveMask || haveScalar) && (cn > 4 || cn == 3)) )
|
||||
if( ((haveMask || haveScalar) && cn > 4) )
|
||||
return false;
|
||||
|
||||
int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
|
||||
@ -1307,21 +1314,26 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
return false;
|
||||
|
||||
int kercn = haveMask || haveScalar ? cn : 1;
|
||||
int scalarcn = kercn == 3 ? 4 : kercn;
|
||||
|
||||
char cvtstr[4][32], opts[1024];
|
||||
sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s "
|
||||
"-D dstT=%s -D workT=%s -D scaleT=%s -D convertToWT1=%s "
|
||||
"-D convertToWT2=%s -D convertToDT=%s%s",
|
||||
sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
|
||||
"-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D convertToWT1=%s "
|
||||
"-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
|
||||
(haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
|
||||
oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
|
||||
ocl::typeToStr(CV_MAKETYPE(depth1, 1)),
|
||||
ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
|
||||
ocl::typeToStr(CV_MAKETYPE(depth2, 1)),
|
||||
ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
|
||||
ocl::typeToStr(CV_MAKETYPE(ddepth, 1)),
|
||||
ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
|
||||
ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
|
||||
ocl::typeToStr(CV_MAKETYPE(wdepth, 1)),
|
||||
ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
|
||||
ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
|
||||
ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn);
|
||||
|
||||
size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
|
||||
const uchar* usrdata_p = (const uchar*)usrdata;
|
||||
@ -1352,7 +1364,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
|
||||
|
||||
if( haveScalar )
|
||||
{
|
||||
size_t esz = CV_ELEM_SIZE(wtype);
|
||||
size_t esz = CV_ELEM_SIZE(wtype)*scalarcn;
|
||||
double buf[4]={0,0,0,0};
|
||||
Mat src2sc = _src2.getMat();
|
||||
|
||||
@ -2621,7 +2633,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
|
||||
|
||||
const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
|
||||
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
|
||||
format("-D BINARY_OP -D srcT1=%s -D workT=srcT1"
|
||||
format("-D BINARY_OP -D srcT1=%s -D workT=srcT1 -D cn=1"
|
||||
" -D OP_CMP -D CMP_OPERATOR=%s%s",
|
||||
ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
|
||||
operationMap[op],
|
||||
|
@ -70,21 +70,47 @@
|
||||
#define CV_PI M_PI_F
|
||||
#endif
|
||||
|
||||
#define dstelem *(__global dstT*)(dstptr + dst_index)
|
||||
#define dstelem2 *(__global dstT*)(dstptr2 + dst_index2)
|
||||
#ifndef cn
|
||||
#define cn 1
|
||||
#endif
|
||||
|
||||
#if cn == 1
|
||||
#undef srcT1_C1
|
||||
#undef srcT2_C1
|
||||
#undef dstT_C1
|
||||
#define srcT1_C1 srcT1
|
||||
#define srcT2_C1 srcT2
|
||||
#define dstT_C1 dstT
|
||||
#endif
|
||||
|
||||
#if cn != 3
|
||||
#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
|
||||
#define storedst2(val) *(__global dstT*)(dstptr2 + dst_index2) = val
|
||||
#else
|
||||
#define storedst(val) vstore3(val, 0, (__global dstT_C1*)(dstptr + dst_index))
|
||||
#define storedst2(val) vstore3(val, 0, (__global dstT_C1*)(dstptr2 + dst_index2))
|
||||
#endif
|
||||
|
||||
#define noconvert
|
||||
|
||||
#ifndef workT
|
||||
|
||||
#ifndef srcT1
|
||||
#define srcT1 dstT
|
||||
#define srcT1_C1 dstT_C1
|
||||
#endif
|
||||
#ifndef srcT2
|
||||
#define srcT2 dstT
|
||||
#define srcT2_C1 dstT_C1
|
||||
#endif
|
||||
#define workT dstT
|
||||
#define srcelem1 *(__global srcT1*)(srcptr1 + src1_index)
|
||||
#define srcelem2 *(__global srcT2*)(srcptr2 + src2_index)
|
||||
#if cn != 3
|
||||
#define srcelem1 *(__global srcT1*)(srcptr1 + src1_index)
|
||||
#define srcelem2 *(__global srcT2*)(srcptr2 + src2_index)
|
||||
#else
|
||||
#define srcelem1 vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index))
|
||||
#define srcelem2 vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index))
|
||||
#endif
|
||||
#ifndef convertToDT
|
||||
#define convertToDT noconvert
|
||||
#endif
|
||||
@ -94,153 +120,168 @@
|
||||
#ifndef convertToWT2
|
||||
#define convertToWT2 convertToWT1
|
||||
#endif
|
||||
#define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index))
|
||||
#define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index))
|
||||
#if cn != 3
|
||||
#define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index))
|
||||
#define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index))
|
||||
#else
|
||||
#define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index)))
|
||||
#define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index)))
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef workST
|
||||
#define workST workT
|
||||
#endif
|
||||
|
||||
#define EXTRA_PARAMS
|
||||
#define EXTRA_INDEX
|
||||
|
||||
#if defined OP_ADD
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
|
||||
|
||||
#elif defined OP_SUB
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2))
|
||||
|
||||
#elif defined OP_RSUB
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))
|
||||
|
||||
#elif defined OP_ABSDIFF
|
||||
#define PROCESS_ELEM \
|
||||
workT v = srcelem1 - srcelem2; \
|
||||
dstelem = convertToDT(v >= (workT)(0) ? v : -v);
|
||||
storedst(convertToDT(v >= (workT)(0) ? v : -v))
|
||||
|
||||
#elif defined OP_AND
|
||||
#define PROCESS_ELEM dstelem = srcelem1 & srcelem2
|
||||
#define PROCESS_ELEM storedst(srcelem1 & srcelem2)
|
||||
|
||||
#elif defined OP_OR
|
||||
#define PROCESS_ELEM dstelem = srcelem1 | srcelem2
|
||||
#define PROCESS_ELEM storedst(srcelem1 | srcelem2)
|
||||
|
||||
#elif defined OP_XOR
|
||||
#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2
|
||||
#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2)
|
||||
|
||||
#elif defined OP_NOT
|
||||
#define PROCESS_ELEM dstelem = ~srcelem1
|
||||
#define PROCESS_ELEM storedst(~srcelem1)
|
||||
|
||||
#elif defined OP_MIN
|
||||
#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)
|
||||
#define PROCESS_ELEM storedst(min(srcelem1, srcelem2))
|
||||
|
||||
#elif defined OP_MAX
|
||||
#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)
|
||||
#define PROCESS_ELEM storedst(max(srcelem1, srcelem2))
|
||||
|
||||
#elif defined OP_MUL
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2))
|
||||
|
||||
#elif defined OP_MUL_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
#ifdef UNARY_OP
|
||||
#define EXTRA_PARAMS , workT srcelem2, scaleT scale
|
||||
#define EXTRA_PARAMS , workST srcelem2_, scaleT scale
|
||||
#undef srcelem2
|
||||
#define srcelem2 srcelem2_
|
||||
#else
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#endif
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * scale * srcelem2)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))
|
||||
|
||||
#elif defined OP_DIV
|
||||
#define PROCESS_ELEM \
|
||||
workT e2 = srcelem2, zero = (workT)(0); \
|
||||
dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)
|
||||
storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))
|
||||
|
||||
#elif defined OP_DIV_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
#ifdef UNARY_OP
|
||||
#define EXTRA_PARAMS , workT srcelem2, scaleT scale
|
||||
#define EXTRA_PARAMS , workST srcelem2_, scaleT scale
|
||||
#undef srcelem2
|
||||
#define srcelem2 srcelem2_
|
||||
#else
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#endif
|
||||
#define PROCESS_ELEM \
|
||||
workT e2 = srcelem2, zero = (workT)(0); \
|
||||
dstelem = convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2))
|
||||
storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))
|
||||
|
||||
#elif defined OP_RDIV_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
#ifdef UNARY_OP
|
||||
#define EXTRA_PARAMS , workT srcelem2, scaleT scale
|
||||
#define EXTRA_PARAMS , workST srcelem2_, scaleT scale
|
||||
#undef srcelem2
|
||||
#define srcelem2 srcelem2_
|
||||
#else
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#endif
|
||||
#define PROCESS_ELEM \
|
||||
workT e1 = srcelem1, zero = (workT)(0); \
|
||||
dstelem = convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1))
|
||||
storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))
|
||||
|
||||
#elif defined OP_RECIP_SCALE
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , scaleT scale
|
||||
#define PROCESS_ELEM \
|
||||
workT e1 = srcelem1, zero = (workT)(0); \
|
||||
dstelem = convertToDT(e1 != zero ? scale / e1 : zero)
|
||||
storedst(convertToDT(e1 != zero ? scale / e1 : zero))
|
||||
|
||||
#elif defined OP_ADDW
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1*alpha + srcelem2*beta + gamma))
|
||||
|
||||
#elif defined OP_MAG
|
||||
#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)
|
||||
#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))
|
||||
|
||||
#elif defined OP_ABS_NOSAT
|
||||
#define PROCESS_ELEM \
|
||||
dstT v = convertToDT(srcelem1); \
|
||||
dstelem = v >= 0 ? v : -v
|
||||
storedst(v >= 0 ? v : -v)
|
||||
|
||||
#elif defined OP_PHASE_RADIANS
|
||||
#define PROCESS_ELEM \
|
||||
workT tmp = atan2(srcelem2, srcelem1); \
|
||||
if(tmp < 0) tmp += 6.283185307179586232f; \
|
||||
dstelem = tmp
|
||||
storedst(tmp)
|
||||
|
||||
#elif defined OP_PHASE_DEGREES
|
||||
#define PROCESS_ELEM \
|
||||
workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465f; \
|
||||
if(tmp < 0) tmp += 360; \
|
||||
dstelem = tmp
|
||||
storedst(tmp)
|
||||
|
||||
#elif defined OP_EXP
|
||||
#define PROCESS_ELEM dstelem = exp(srcelem1)
|
||||
#define PROCESS_ELEM storedst(exp(srcelem1))
|
||||
|
||||
#elif defined OP_POW
|
||||
#define PROCESS_ELEM dstelem = pow(srcelem1, srcelem2)
|
||||
#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))
|
||||
|
||||
#elif defined OP_POWN
|
||||
#undef workT
|
||||
#define workT int
|
||||
#define PROCESS_ELEM dstelem = pown(srcelem1, srcelem2)
|
||||
#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))
|
||||
|
||||
#elif defined OP_SQRT
|
||||
#define PROCESS_ELEM dstelem = sqrt(srcelem1)
|
||||
#define PROCESS_ELEM storedst(sqrt(srcelem1))
|
||||
|
||||
#elif defined OP_LOG
|
||||
#define PROCESS_ELEM \
|
||||
dstT v = (dstT)(srcelem1);\
|
||||
dstelem = v > (dstT)(0) ? log(v) : log(-v)
|
||||
dstT v = (dstT)(srcelem1);\
|
||||
storedst(v > (dstT)(0) ? log(v) : log(-v))
|
||||
|
||||
#elif defined OP_CMP
|
||||
#define dstT uchar
|
||||
#define srcT2 srcT1
|
||||
#define convertToWT1
|
||||
#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)
|
||||
#define PROCESS_ELEM storedst(convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0))
|
||||
|
||||
#elif defined OP_CONVERT_SCALE_ABS
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , workT alpha, workT beta
|
||||
#define PROCESS_ELEM \
|
||||
workT value = srcelem1 * alpha + beta; \
|
||||
dstelem = convertToDT(value >= 0 ? value : -value)
|
||||
storedst(convertToDT(value >= 0 ? value : -value))
|
||||
|
||||
#elif defined OP_SCALE_ADD
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , workT alpha
|
||||
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * alpha + srcelem2)
|
||||
#define PROCESS_ELEM storedst(convertToDT(srcelem1 * alpha + srcelem2))
|
||||
|
||||
#elif defined OP_CTP_AD || defined OP_CTP_AR
|
||||
#ifdef OP_CTP_AD
|
||||
@ -257,8 +298,8 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v)
|
||||
dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \
|
||||
dstT cartToPolar = y2 <= x2 ? x * y / (x2 + 0.28f * y2 + CV_EPSILON) + tmp : (tmp1 - x * y / (y2 + 0.28f * x2 + CV_EPSILON)); \
|
||||
TO_DEGREE \
|
||||
dstelem = magnitude; \
|
||||
dstelem2 = cartToPolar
|
||||
storedst(magnitude); \
|
||||
storedst2(cartToPolar)
|
||||
|
||||
#elif defined OP_PTC_AD || defined OP_PTC_AR
|
||||
#ifdef OP_PTC_AD
|
||||
@ -272,15 +313,15 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v)
|
||||
#define PROCESS_ELEM \
|
||||
dstT x = srcelem1, y = srcelem2; \
|
||||
FROM_DEGREE; \
|
||||
dstelem = cos(alpha) * x; \
|
||||
dstelem2 = sin(alpha) * x
|
||||
storedst(cos(alpha) * x); \
|
||||
storedst2(sin(alpha) * x)
|
||||
|
||||
#elif defined OP_PATCH_NANS
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , int val
|
||||
#define PROCESS_ELEM \
|
||||
if (( srcelem1 & 0x7fffffff) > 0x7f800000 ) \
|
||||
dstelem = val
|
||||
storedst(val)
|
||||
|
||||
#else
|
||||
#error "unknown op type"
|
||||
@ -290,18 +331,26 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v)
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
|
||||
#undef EXTRA_INDEX
|
||||
#define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT) + dstoffset2)
|
||||
#define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT_C1)*cn + dstoffset2)
|
||||
#endif
|
||||
|
||||
#if defined UNARY_OP || defined MASK_UNARY_OP
|
||||
#undef srcelem2
|
||||
|
||||
#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \
|
||||
defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \
|
||||
defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \
|
||||
defined OP_MUL || defined OP_DIV || defined OP_POWN
|
||||
#undef EXTRA_PARAMS
|
||||
#define EXTRA_PARAMS , workT srcelem2
|
||||
#define EXTRA_PARAMS , workST srcelem2_
|
||||
#undef srcelem2
|
||||
#define srcelem2 srcelem2_
|
||||
#endif
|
||||
|
||||
#if cn == 3
|
||||
#undef srcelem2
|
||||
#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined BINARY_OP
|
||||
@ -316,11 +365,11 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
|
||||
#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
|
||||
int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
|
||||
int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2);
|
||||
#endif
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
|
||||
EXTRA_INDEX;
|
||||
|
||||
PROCESS_ELEM;
|
||||
@ -343,9 +392,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
|
||||
int mask_index = mad24(y, maskstep, x + maskoffset);
|
||||
if( mask[mask_index] )
|
||||
{
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
|
||||
int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
|
||||
int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
|
||||
|
||||
PROCESS_ELEM;
|
||||
}
|
||||
@ -363,9 +412,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
|
||||
EXTRA_INDEX;
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
|
||||
|
||||
PROCESS_ELEM;
|
||||
}
|
||||
@ -386,8 +434,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
|
||||
int mask_index = mad24(y, maskstep, x + maskoffset);
|
||||
if( mask[mask_index] )
|
||||
{
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
|
||||
int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
|
||||
|
||||
PROCESS_ELEM;
|
||||
}
|
||||
|
@ -87,9 +87,21 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of
|
||||
|
||||
#else
|
||||
|
||||
#ifndef dstST
|
||||
#define dstST dstT
|
||||
#endif
|
||||
|
||||
#if cn != 3
|
||||
#define value value_
|
||||
#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
|
||||
#else
|
||||
#define value (dstT)(value_.x, value_.y, value_.z)
|
||||
#define storedst(val) vstore3(val, 0, (__global dstT1*)(dstptr + dst_index))
|
||||
#endif
|
||||
|
||||
__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
|
||||
__global uchar* dstptr, int dststep, int dstoffset,
|
||||
int rows, int cols, dstT value )
|
||||
int rows, int cols, dstST value_ )
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
@ -99,22 +111,22 @@ __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
|
||||
int mask_index = mad24(y, maskstep, x + maskoffset);
|
||||
if( mask[mask_index] )
|
||||
{
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
|
||||
*(__global dstT*)(dstptr + dst_index) = value;
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
|
||||
storedst(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
|
||||
int rows, int cols, dstT value )
|
||||
int rows, int cols, dstST value_ )
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
|
||||
*(__global dstT*)(dstptr + dst_index) = value;
|
||||
int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
|
||||
storedst(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -744,20 +744,23 @@ UMat& UMat::setTo(InputArray _value, InputArray _mask)
|
||||
{
|
||||
bool haveMask = !_mask.empty();
|
||||
int tp = type(), cn = CV_MAT_CN(tp);
|
||||
if( dims <= 2 && cn <= 4 && cn != 3 && ocl::useOpenCL() )
|
||||
if( dims <= 2 && cn <= 4 && CV_MAT_DEPTH(tp) < CV_64F && ocl::useOpenCL() )
|
||||
{
|
||||
Mat value = _value.getMat();
|
||||
CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
|
||||
double buf[4];
|
||||
double buf[4]={0,0,0,0};
|
||||
convertAndUnrollScalar(value, tp, (uchar*)buf, 1);
|
||||
|
||||
int scalarcn = cn == 3 ? 4 : cn;
|
||||
char opts[1024];
|
||||
sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(tp));
|
||||
sprintf(opts, "-D dstT=%s -D dstST=%s -D dstT1=%s -D cn=%d", ocl::memopTypeToStr(tp),
|
||||
ocl::memopTypeToStr(CV_MAKETYPE(tp,scalarcn)),
|
||||
ocl::memopTypeToStr(CV_MAT_DEPTH(tp)), cn);
|
||||
|
||||
ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
|
||||
if( !setK.empty() )
|
||||
{
|
||||
ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(tp));
|
||||
ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn);
|
||||
UMat mask;
|
||||
|
||||
if( haveMask )
|
||||
|
@ -1957,7 +1957,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
double inv_fx = 1. / fx, inv_fy = 1. / fy;
|
||||
float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
|
||||
|
||||
if( cn == 3 || !(cn <= 4 &&
|
||||
if( !(cn <= 4 &&
|
||||
(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
|
||||
(interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
|
||||
return false;
|
||||
@ -1975,15 +1975,18 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
int wtype = CV_MAKETYPE(wdepth, cn);
|
||||
char buf[2][32];
|
||||
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
|
||||
format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
|
||||
depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
|
||||
format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
|
||||
"-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
|
||||
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
|
||||
ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
|
||||
ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
|
||||
ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
|
||||
cn));
|
||||
}
|
||||
else if (interpolation == INTER_NEAREST)
|
||||
{
|
||||
k.create("resizeNN", ocl::imgproc::resize_oclsrc,
|
||||
format("-D INTER_NEAREST -D PIXTYPE=%s -D cn", ocl::memopTypeToStr(type), cn));
|
||||
format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
|
||||
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
|
||||
}
|
||||
else if (interpolation == INTER_AREA)
|
||||
{
|
||||
@ -1995,9 +1998,9 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
int wtype = CV_MAKE_TYPE(wdepth, cn);
|
||||
|
||||
char cvt[2][40];
|
||||
String buildOption = format("-D INTER_AREA -D T=%s -D WTV=%s -D convertToWTV=%s",
|
||||
ocl::typeToStr(type), ocl::typeToStr(wtype),
|
||||
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]));
|
||||
String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
|
||||
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
|
||||
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
|
||||
|
||||
UMat alphaOcl, tabofsOcl, mapOcl;
|
||||
UMat dmap, smap;
|
||||
@ -2005,7 +2008,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
if (is_area_fast)
|
||||
{
|
||||
int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
|
||||
buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
|
||||
buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
|
||||
" -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
|
||||
ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
|
||||
ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
|
||||
@ -2028,7 +2031,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
|
||||
}
|
||||
else
|
||||
{
|
||||
buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
|
||||
buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
|
||||
k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
|
||||
if (k.empty())
|
||||
return false;
|
||||
@ -3887,7 +3890,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
|
||||
{
|
||||
CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
|
||||
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = depth;
|
||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||
double doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||
|
||||
int interpolation = flags & INTER_MAX;
|
||||
@ -3896,7 +3899,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
|
||||
|
||||
if ( !(borderType == cv::BORDER_CONSTANT &&
|
||||
(interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
|
||||
(!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3)
|
||||
(!doubleSupport && depth == CV_64F) || cn > 4)
|
||||
return false;
|
||||
|
||||
const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
|
||||
@ -3904,28 +3907,40 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
|
||||
ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
|
||||
const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
|
||||
|
||||
int scalarcn = cn == 3 ? 4 : cn;
|
||||
int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
|
||||
int sctype = CV_MAKETYPE(wdepth, scalarcn);
|
||||
|
||||
ocl::Kernel k;
|
||||
String opts;
|
||||
if (interpolation == INTER_NEAREST)
|
||||
{
|
||||
k.create(kernelName, program,
|
||||
format("-D INTER_NEAREST -D T=%s%s", ocl::typeToStr(type),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
||||
opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d", ocl::typeToStr(type),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||
ocl::typeToStr(CV_MAT_DEPTH(type)),
|
||||
ocl::typeToStr(sctype),
|
||||
cn);
|
||||
}
|
||||
else
|
||||
{
|
||||
char cvt[2][50];
|
||||
wdepth = std::max(CV_32S, depth);
|
||||
k.create(kernelName, program,
|
||||
format("-D INTER_%s -D T=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s",
|
||||
interpolationMap[interpolation], ocl::typeToStr(type),
|
||||
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
|
||||
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
|
||||
ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
||||
opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s -D cn=%d",
|
||||
interpolationMap[interpolation], ocl::typeToStr(type),
|
||||
ocl::typeToStr(CV_MAT_DEPTH(type)),
|
||||
ocl::typeToStr(sctype),
|
||||
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
|
||||
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
|
||||
ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
|
||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn);
|
||||
}
|
||||
|
||||
k.create(kernelName, program, opts);
|
||||
if (k.empty())
|
||||
return false;
|
||||
|
||||
double borderBuf[] = {0, 0, 0, 0};
|
||||
scalarToRawData(borderValue, borderBuf, sctype);
|
||||
|
||||
UMat src = _src.getUMat(), M0;
|
||||
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
|
||||
UMat dst = _dst.getUMat();
|
||||
@ -3956,7 +3971,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
|
||||
matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
|
||||
|
||||
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
|
||||
ocl::KernelArg::Constant(Mat(1, 1, CV_MAKE_TYPE(wdepth, cn), borderValue)));
|
||||
ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
|
||||
|
||||
size_t globalThreads[2] = { dst.cols, dst.rows };
|
||||
return k.run(2, globalThreads, NULL, false);
|
||||
|
@ -52,9 +52,19 @@
|
||||
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
|
||||
#define INC(x,l) min(x+1,l-1)
|
||||
|
||||
#define PIXSIZE ((int)sizeof(PIXTYPE))
|
||||
|
||||
#define noconvert(x) (x)
|
||||
|
||||
#if cn != 3
|
||||
#define loadpix(addr) *(__global const PIXTYPE*)(addr)
|
||||
#define storepix(val, addr) *(__global PIXTYPE*)(addr) = val
|
||||
#define PIXSIZE ((int)sizeof(PIXTYPE))
|
||||
#else
|
||||
#define loadpix(addr) vload3(0, (__global const PIXTYPE1*)(addr))
|
||||
#define storepix(val, addr) vstore3(val, 0, (__global PIXTYPE1*)(addr))
|
||||
#define PIXSIZE ((int)sizeof(PIXTYPE1)*3)
|
||||
#endif
|
||||
|
||||
#if defined INTER_LINEAR
|
||||
|
||||
__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
|
||||
@ -89,10 +99,10 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
|
||||
int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
|
||||
int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
|
||||
|
||||
WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
|
||||
WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
|
||||
mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);
|
||||
@ -102,10 +112,10 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
|
||||
#else
|
||||
float u1 = 1.f - u;
|
||||
float v1 = 1.f - v;
|
||||
WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
|
||||
WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
|
||||
|
||||
PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
|
||||
|
||||
@ -113,8 +123,7 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
|
||||
|
||||
if(dx < dstcols && dy < dstrows)
|
||||
{
|
||||
__global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
|
||||
dst[0] = uval;
|
||||
storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
|
||||
}
|
||||
}
|
||||
|
||||
@ -136,17 +145,13 @@ __kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
|
||||
int sx = min(convert_int_rtz(s1), srccols-1);
|
||||
int sy = min(convert_int_rtz(s2), srcrows-1);
|
||||
|
||||
__global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
|
||||
__global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE));
|
||||
|
||||
dst[0] = src[0];
|
||||
storepix(loadpix(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)),
|
||||
dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
|
||||
}
|
||||
}
|
||||
|
||||
#elif defined INTER_AREA
|
||||
|
||||
#define TSIZE ((int)(sizeof(T)))
|
||||
|
||||
#ifdef INTER_AREA_FAST
|
||||
|
||||
__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
@ -174,10 +179,10 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_
|
||||
int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
|
||||
#pragma unroll
|
||||
for (int x = 0; x < XSCALE; ++x)
|
||||
sum += convertToWTV(((__global const T*)(src + src_index))[sxmap_tab[sx + x]]);
|
||||
sum += convertToWTV(loadpix(src + src_index + sxmap_tab[sx + x]*PIXSIZE));
|
||||
}
|
||||
|
||||
((__global T*)(dst + dst_index))[dx] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE));
|
||||
storepix(convertToPIXTYPE(convertToWT2V(sum) * (WT2V)(SCALE)), dst + dst_index + dx*PIXSIZE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -219,12 +224,12 @@ __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offse
|
||||
for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
|
||||
{
|
||||
WTV alpha = (WTV)(xalpha_tab[xk]);
|
||||
buf += convertToWTV(((__global const T*)(src + src_index))[sx]) * alpha;
|
||||
buf += convertToWTV(loadpix(src + src_index + sx*PIXSIZE)) * alpha;
|
||||
}
|
||||
sum += buf * beta;
|
||||
}
|
||||
|
||||
((__global T*)(dst + dst_index))[dx] = convertToT(sum);
|
||||
storepix(convertToPIXTYPE(sum), dst + dst_index + dx*PIXSIZE);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -64,11 +64,31 @@
|
||||
|
||||
#define noconvert
|
||||
|
||||
#ifndef ST
|
||||
#define ST T
|
||||
#endif
|
||||
|
||||
#if cn != 3
|
||||
#define loadpix(addr) *(__global const T*)(addr)
|
||||
#define storepix(val, addr) *(__global T*)(addr) = val
|
||||
#define scalar scalar_
|
||||
#define pixsize (int)sizeof(T)
|
||||
#else
|
||||
#define loadpix(addr) vload3(0, (__global const T1*)(addr))
|
||||
#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
|
||||
#ifdef INTER_NEAREST
|
||||
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
|
||||
#else
|
||||
#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
|
||||
#endif
|
||||
#define pixsize ((int)sizeof(T1)*3)
|
||||
#endif
|
||||
|
||||
#ifdef INTER_NEAREST
|
||||
|
||||
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, T scalar)
|
||||
__constant CT * M, ST scalar_)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -85,17 +105,15 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
short sx = convert_short_sat(X0 >> AB_BITS);
|
||||
short sy = convert_short_sat(Y0 >> AB_BITS);
|
||||
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
|
||||
|
||||
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
|
||||
{
|
||||
int src_index = mad24(sy, src_step, src_offset + sx * (int)sizeof(T));
|
||||
__global const T * src = (__global const T *)(srcptr + src_index);
|
||||
dst[0] = src[0];
|
||||
int src_index = mad24(sy, src_step, src_offset + sx * pixsize);
|
||||
storepix(loadpix(srcptr + src_index), dstptr + dst_index);
|
||||
}
|
||||
else
|
||||
dst[0] = scalar;
|
||||
storepix(scalar, dstptr + dst_index);
|
||||
}
|
||||
}
|
||||
|
||||
@ -103,7 +121,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
|
||||
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, WT scalar)
|
||||
__constant CT * M, ST scalar_)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -126,19 +144,18 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
short ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
|
||||
|
||||
WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
|
||||
WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
|
||||
WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
|
||||
WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
|
||||
|
||||
float taby = 1.f/INTER_TAB_SIZE*ay;
|
||||
float tabx = 1.f/INTER_TAB_SIZE*ax;
|
||||
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
|
||||
|
||||
#if depth <= 4
|
||||
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
|
||||
@ -147,11 +164,11 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
|
||||
|
||||
WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
|
||||
dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
|
||||
storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
|
||||
#else
|
||||
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
|
||||
WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
|
||||
dst[0] = convertToT(val);
|
||||
storepix(convertToT(val), dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -170,7 +187,7 @@ inline void interpolateCubic( float x, float* coeffs )
|
||||
|
||||
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, WT scalar)
|
||||
__constant CT * M, ST scalar_)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -198,7 +215,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
#pragma unroll
|
||||
for (int x = 0; x < 4; x++)
|
||||
v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
|
||||
|
||||
float tab1y[4], tab1x[4];
|
||||
|
||||
@ -207,8 +224,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
interpolateCubic(ayy, tab1y);
|
||||
interpolateCubic(axx, tab1x);
|
||||
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
|
||||
|
||||
WT sum = (WT)(0);
|
||||
#if depth <= 4
|
||||
@ -221,12 +237,12 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; i++)
|
||||
sum += v[i] * itab[i];
|
||||
dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
|
||||
storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; i++)
|
||||
sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
|
||||
dst[0] = convertToT( sum );
|
||||
storepix(convertToT( sum ), dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -64,11 +64,31 @@
|
||||
|
||||
#define noconvert
|
||||
|
||||
#ifndef ST
|
||||
#define ST T
|
||||
#endif
|
||||
|
||||
#if cn != 3
|
||||
#define loadpix(addr) *(__global const T*)(addr)
|
||||
#define storepix(val, addr) *(__global T*)(addr) = val
|
||||
#define scalar scalar_
|
||||
#define pixsize (int)sizeof(T)
|
||||
#else
|
||||
#define loadpix(addr) vload3(0, (__global const T1*)(addr))
|
||||
#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
|
||||
#ifdef INTER_NEAREST
|
||||
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
|
||||
#else
|
||||
#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
|
||||
#endif
|
||||
#define pixsize ((int)sizeof(T1)*3)
|
||||
#endif
|
||||
|
||||
#ifdef INTER_NEAREST
|
||||
|
||||
__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, T scalar)
|
||||
__constant CT * M, ST scalar_)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -82,17 +102,15 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
short sx = convert_short_sat_rte(X0*W);
|
||||
short sy = convert_short_sat_rte(Y0*W);
|
||||
|
||||
int dst_index = mad24(dy, dst_step, dx * (int)sizeof(T) + dst_offset);
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset);
|
||||
|
||||
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
|
||||
{
|
||||
int src_index = mad24(sy, src_step, sx * (int)sizeof(T) + src_offset);
|
||||
__global const T * src = (__global const T *)(srcptr + src_index);
|
||||
dst[0] = src[0];
|
||||
int src_index = mad24(sy, src_step, sx * pixsize + src_offset);
|
||||
storepix(loadpix(srcptr + src_index), dstptr + dst_index);
|
||||
}
|
||||
else
|
||||
dst[0] = scalar;
|
||||
storepix(scalar, dstptr + dst_index);
|
||||
}
|
||||
}
|
||||
|
||||
@ -100,7 +118,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
|
||||
__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, WT scalar)
|
||||
__constant CT * M, ST scalar_)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -119,19 +137,18 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
short ax = (short)(X & (INTER_TAB_SIZE - 1));
|
||||
|
||||
WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
|
||||
WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
|
||||
WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
|
||||
WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
|
||||
|
||||
float taby = 1.f/INTER_TAB_SIZE*ay;
|
||||
float tabx = 1.f/INTER_TAB_SIZE*ax;
|
||||
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
|
||||
|
||||
#if depth <= 4
|
||||
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
|
||||
@ -140,11 +157,11 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
|
||||
|
||||
WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
|
||||
dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
|
||||
storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
|
||||
#else
|
||||
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
|
||||
WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
|
||||
dst[0] = convertToT(val);
|
||||
storepix(convertToT(val), dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -163,7 +180,7 @@ inline void interpolateCubic( float x, float* coeffs )
|
||||
|
||||
__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
__constant CT * M, WT scalar)
|
||||
__constant CT * M, ST scalar_)
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
@ -187,7 +204,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
#pragma unroll
|
||||
for (int x = 0; x < 4; x++)
|
||||
v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
|
||||
convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar;
|
||||
convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
|
||||
|
||||
float tab1y[4], tab1x[4];
|
||||
|
||||
@ -196,8 +213,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
interpolateCubic(ayy, tab1y);
|
||||
interpolateCubic(axx, tab1x);
|
||||
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
|
||||
|
||||
WT sum = (WT)(0);
|
||||
#if depth <= 4
|
||||
@ -210,12 +226,12 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; i++)
|
||||
sum += v[i] * itab[i];
|
||||
dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
|
||||
storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; i++)
|
||||
sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
|
||||
dst[0] = convertToT( sum );
|
||||
storepix(convertToT( sum ), dstptr + dst_index);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user