Merge pull request #2470 from ilya-lavrenov:tapi_reduction

This commit is contained in:
Andrey Pavlenko 2014-03-13 18:42:00 +04:00 committed by OpenCV Buildbot
commit c5bf1c108c
3 changed files with 47 additions and 30 deletions

View File

@ -344,7 +344,7 @@ OCL_PERF_TEST_P(FlipFixture, Flip,
typedef Size_MatType MinMaxLocFixture;
OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc,
::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
{
const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params);
@ -380,7 +380,7 @@ typedef Size_MatType SumFixture;
OCL_PERF_TEST_P(SumFixture, Sum,
::testing::Combine(OCL_TEST_SIZES,
OCL_TEST_TYPES))
OCL_TEST_TYPES_134))
{
const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params);
@ -652,7 +652,8 @@ OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity,
typedef Size_MatType MeanStdDevFixture;
OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev,
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
OCL_TEST_TYPES_134))
{
const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params);
@ -688,7 +689,8 @@ typedef std::tr1::tuple<Size, MatType, NormType> NormParams;
typedef TestBaseWithParam<NormParams> NormFixture;
OCL_PERF_TEST_P(NormFixture, Norm,
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all()))
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
OCL_TEST_TYPES_134, NormType::all()))
{
const NormParams params = GetParam();
const Size srcSize = get<0>(params);
@ -711,7 +713,8 @@ OCL_PERF_TEST_P(NormFixture, Norm,
typedef Size_MatType UMatDotFixture;
OCL_PERF_TEST_P(UMatDotFixture, UMatDot,
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
OCL_TEST_TYPES_134))
{
const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params);
@ -820,7 +823,8 @@ typedef tuple<Size, MatType, NormalizeModes> NormalizeParams;
typedef TestBaseWithParam<NormalizeParams> NormalizeFixture;
OCL_PERF_TEST_P(NormalizeFixture, Normalize,
::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, NormalizeModes::all()))
::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134,
NormalizeModes::all()))
{
const NormalizeParams params = GetParam();
const Size srcSize = get<0>(params);

View File

@ -52,6 +52,18 @@
#define noconvert
#if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define srcTSIZE (int)sizeof(srcT)
#define dstTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define srcTSIZE ((int)sizeof(srcT1)*3)
#define dstTSIZE ((int)sizeof(dstT1)*3)
#endif
#ifdef HAVE_MASK
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
#else
@ -88,19 +100,20 @@
#ifdef HAVE_MASK
#define REDUCE_GLOBAL \
dstT temp = convertToDT(src[0]); \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
if (mask[mask_index]) \
FUNC(accumulator, temp)
{ \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp); \
}
#elif defined OP_DOT
#define REDUCE_GLOBAL \
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \
__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \
dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \
dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
FUNC(accumulator, temp, temp2)
#else
#define REDUCE_GLOBAL \
dstT temp = convertToDT(src[0]); \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp)
#endif
@ -111,8 +124,7 @@
#define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2]
#define CALC_RESULT \
__global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
dst[0] = localmem[0]
storepix(localmem[0], dstptr + dstTSIZE * gid)
// countNonZero stuff
#elif defined OP_COUNT_NON_ZERO
@ -123,7 +135,7 @@
dstT accumulator = (dstT)(0); \
srcT zero = (srcT)(0), one = (srcT)(1)
#define REDUCE_GLOBAL \
accumulator += src[0] == zero ? zero : one
accumulator += loadpix(srcptr + src_index) == zero ? zero : one
#define SET_LOCAL_1 \
localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \
@ -131,8 +143,7 @@
#define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2]
#define CALC_RESULT \
__global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
dst[0] = localmem[0]
storepix(localmem[0], dstptr + dstTSIZE * gid)
// minMaxLoc stuff
#elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
@ -167,6 +178,8 @@
#define MAX_VAL DBL_MAX
#endif
#define dstT srcT
#define DECLARE_LOCAL_MEM \
__local srcT localmem_min[WGS2_ALIGNED]; \
__local srcT localmem_max[WGS2_ALIGNED]; \
@ -181,7 +194,7 @@
srcT temp; \
int temploc
#define REDUCE_GLOBAL \
temp = src[0]; \
temp = loadpix(srcptr + src_index); \
temploc = id; \
srcT temp_minval = minval, temp_maxval = maxval; \
minval = min(minval, temp); \
@ -217,10 +230,8 @@
localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \
localmem_maxloc[lid2] : localmem_maxloc[lid]
#define CALC_RESULT \
__global srcT * dstminval = (__global srcT *)(dstptr + (int)sizeof(srcT) * gid); \
__global srcT * dstmaxval = (__global srcT *)(dstptr2 + (int)sizeof(srcT) * gid); \
dstminval[0] = localmem_min[0]; \
dstmaxval[0] = localmem_max[0]; \
storepix(localmem_min[0], dstptr + dstTSIZE * gid); \
storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \
dstlocptr[gid] = localmem_minloc[0]; \
dstlocptr2[gid] = localmem_maxloc[0]
@ -236,7 +247,7 @@
int temploc
#undef REDUCE_GLOBAL
#define REDUCE_GLOBAL \
temp = src[0]; \
temp = loadpix(srcptr + src_index); \
temploc = id; \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \
__global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
@ -278,8 +289,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
for (int grain = groupnum * WGS; id < total; id += grain)
{
int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset));
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset));
REDUCE_GLOBAL;
}

View File

@ -475,7 +475,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 )
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits();
@ -494,8 +494,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt),
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
" -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth),
ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
ocl::convertTypeStr(depth, ddepth, cn, cvt),
opMap[sum_op], (int)wgs, wgs2_aligned,
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveMask ? " -D HAVE_MASK" : ""));