From fd5a8b3e9769c1b097e6246ac46f726a1204f1bd Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 4 Jun 2014 18:22:55 +0400
Subject: [PATCH 1/6] minmaxloc

---
 modules/core/src/opencl/minmaxloc.cl | 280 +++++++++++++++++++++++++++
 modules/core/src/opencl/reduce.cl    |  97 +---------
 modules/core/src/stat.cpp            | 175 +++++++++++------
 3 files changed, 399 insertions(+), 153 deletions(-)
 create mode 100644 modules/core/src/opencl/minmaxloc.cl

diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
new file mode 100644
index 0000000000..558679efda
--- /dev/null
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -0,0 +1,280 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+#ifdef DEPTH_0
+#define MIN_VAL 0
+#define MAX_VAL 255
+#elif defined DEPTH_1
+#define MIN_VAL -128
+#define MAX_VAL 127
+#elif defined DEPTH_2
+#define MIN_VAL 0
+#define MAX_VAL 65535
+#elif defined DEPTH_3
+#define MIN_VAL -32768
+#define MAX_VAL 32767
+#elif defined DEPTH_4
+#define MIN_VAL INT_MIN
+#define MAX_VAL INT_MAX
+#elif defined DEPTH_5
+#define MIN_VAL (-FLT_MAX)
+#define MAX_VAL FLT_MAX
+#elif defined DEPTH_6
+#define MIN_VAL (-DBL_MAX)
+#define MAX_VAL DBL_MAX
+#endif
+
+#define INDEX_MAX UINT_MAX
+
+#ifdef NEED_MINLOC
+#define CALC_MINLOC(inc) minloc = id + inc
+#else
+#define CALC_MINLOC(inc)
+#endif
+
+#ifdef NEED_MAXLOC
+#define CALC_MAXLOC(inc) maxloc = id + inc
+#else
+#define CALC_MAXLOC(inc)
+#endif
+
+#ifdef NEED_MINVAL
+#define CALC_MIN(p, inc) \
+    if (minval > temp.p) \
+    { \
+        minval = temp.p; \
+        CALC_MINLOC(inc); \
+    }
+#else
+#define CALC_MIN(p, inc)
+#endif
+
+#ifdef NEED_MAXVAL
+#define CALC_MAX(p, inc) \
+    if (maxval < temp.p) \
+    { \
+        maxval = temp.p; \
+        CALC_MAXLOC(inc); \
+    }
+#else
+#define CALC_MAX(p, inc)
+#endif
+
+#define CALC_P(p, inc) \
+    CALC_MIN(p, inc) \
+    CALC_MAX(p, inc)
+
+__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
+                        int total, int groupnum, __global uchar * dstptr
+#ifdef HAVE_MASK
+                        , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+                        )
+{
+    int lid = get_local_id(0);
+    int gid = get_group_id(0);
+    int  id = get_global_id(0) * kercn;
+
+    srcptr += src_offset;
+#ifdef HAVE_MASK
+    mask += mask_offset;
+#endif
+
+#ifdef NEED_MINVAL
+    __local srcT1 localmem_min[WGS2_ALIGNED];
+#ifdef NEED_MINLOC
+    __local uint localmem_minloc[WGS2_ALIGNED];
+#endif
+#endif
+#ifdef NEED_MAXVAL
+    __local srcT1 localmem_max[WGS2_ALIGNED];
+#ifdef NEED_MAXLOC
+    __local uint localmem_maxloc[WGS2_ALIGNED];
+#endif
+#endif
+
+    srcT1 minval = MAX_VAL, maxval = MIN_VAL;
+    srcT temp;
+    uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
+    int src_index;
+#ifdef HAVE_MASK
+    int mask_index;
+#endif
+
+    for (int grain = groupnum * WGS * kercn; id < total; id += grain)
+    {
+#ifdef HAVE_SRC_CONT
+        src_index = mul24(id, (int)sizeof(srcT1));
+#else
+        src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
+
+#ifdef HAVE_MASK
+#ifdef HAVE_MASK_CONT
+        mask_index = id;
+#else
+        mask_index = mad24(id / cols, mask_step, id % cols);
+#endif
+        if (mask[mask_index])
+#endif
+        {
+            temp = *(__global const srcT *)(srcptr + src_index);
+#if kercn == 1
+#ifdef NEED_MINVAL
+            if (minval > temp)
+            {
+                minval = temp;
+#ifdef NEED_MINLOC
+                minloc = id;
+#endif
+            }
+#endif
+#ifdef NEED_MAXVAL
+            if (maxval < temp)
+            {
+                maxval = temp;
+#ifdef NEED_MAXLOC
+                maxloc = id;
+#endif
+            }
+#endif
+#elif kercn >= 2
+            CALC_P(s0, 0)
+            CALC_P(s1, 1)
+#if kercn >= 4
+            CALC_P(s2, 2)
+            CALC_P(s3, 3)
+#if kercn >= 8
+            CALC_P(s4, 4)
+            CALC_P(s5, 5)
+            CALC_P(s6, 6)
+            CALC_P(s7, 7)
+#if kercn == 16
+            CALC_P(s8, 8)
+            CALC_P(s9, 9)
+            CALC_P(sA, 10)
+            CALC_P(sB, 11)
+            CALC_P(sC, 12)
+            CALC_P(sD, 13)
+            CALC_P(sE, 14)
+            CALC_P(sF, 15)
+#endif
+#endif
+#endif
+#endif
+        }
+    }
+
+    if (lid < WGS2_ALIGNED)
+    {
+#ifdef NEED_MINVAL
+        localmem_min[lid] = minval;
+#endif
+#ifdef NEED_MAXVAL
+        localmem_max[lid] = maxval;
+#endif
+#ifdef NEED_MINLOC
+        localmem_minloc[lid] = minloc;
+#endif
+#ifdef NEED_MAXLOC
+        localmem_maxloc[lid] = maxloc;
+#endif
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
+    {
+        int lid3 = lid - WGS2_ALIGNED;
+#ifdef NEED_MINVAL
+        if (localmem_min[lid3] >= minval)
+        {
+#ifdef NEED_MINLOC
+            if (localmem_min[lid3] == minval)
+                localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);
+            else
+                localmem_minloc[lid3] = minloc,
+#endif
+                localmem_min[lid3] = minval;
+        }
+#endif
+#ifdef NEED_MAXVAL
+        if (localmem_max[lid3] <= maxval)
+        {
+#ifdef NEED_MAXLOC
+            if (localmem_max[lid3] == maxval)
+                localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);
+            else
+                localmem_maxloc[lid3] = maxloc,
+#endif
+                localmem_max[lid3] = maxval;
+        }
+#endif
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
+    {
+        if (lid < lsize)
+        {
+            int lid2 = lsize + lid;
+
+#ifdef NEED_MINVAL
+            if (localmem_min[lid] >= localmem_min[lid2])
+            {
+#ifdef NEED_MINLOC
+                if (localmem_min[lid] == localmem_min[lid2])
+                    localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);
+                else
+                    localmem_minloc[lid] = localmem_minloc[lid2],
+#endif
+                    localmem_min[lid] = localmem_min[lid2];
+            }
+#endif
+#ifdef NEED_MAXVAL
+            if (localmem_max[lid] <= localmem_max[lid2])
+            {
+#ifdef NEED_MAXLOC
+                if (localmem_max[lid] == localmem_max[lid2])
+                    localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);
+                else
+                    localmem_maxloc[lid] = localmem_maxloc[lid2],
+#endif
+                    localmem_max[lid] = localmem_max[lid2];
+            }
+#endif
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (lid == 0)
+    {
+        int pos = 0;
+#ifdef NEED_MINVAL
+        *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_min[0];
+        pos = mad24(groupnum, (int)sizeof(srcT1), pos);
+#endif
+#ifdef NEED_MAXVAL
+        *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_max[0];
+        pos = mad24(groupnum, (int)sizeof(srcT1), pos);
+#endif
+#ifdef NEED_MINLOC
+        *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_minloc[0];
+        pos = mad24(groupnum, (int)sizeof(uint), pos);
+#endif
+#ifdef NEED_MAXLOC
+        *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
+#endif
+    }
+}
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 851d36eb4d..038f132970 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -75,6 +75,8 @@
 #define MAX_VAL DBL_MAX
 #endif
 
+#define INDEX_MAX UINT_MAX
+
 #define dstT srcT
 #define dstT1 srcT1
 
@@ -357,102 +359,11 @@
 #define CALC_RESULT \
     storepix(localmem_max[0], dstptr + dstTSIZE * gid)
 
-// minMaxLoc stuff
-#elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
-
-#define DECLARE_LOCAL_MEM \
-    __local srcT localmem_min[WGS2_ALIGNED]; \
-    __local srcT localmem_max[WGS2_ALIGNED]; \
-    __local int localmem_minloc[WGS2_ALIGNED]; \
-    __local int localmem_maxloc[WGS2_ALIGNED]
-#define DEFINE_ACCUMULATOR \
-    srcT minval = MAX_VAL; \
-    srcT maxval = MIN_VAL; \
-    int negative = -1; \
-    int minloc = negative; \
-    int maxloc = negative; \
-    srcT temp; \
-    int temploc
-#define REDUCE_GLOBAL \
-    temp = loadpix(srcptr + src_index); \
-    temploc = id; \
-    srcT temp_minval = minval, temp_maxval = maxval; \
-    minval = min(minval, temp); \
-    maxval = max(maxval, temp); \
-    minloc = (minval == temp_minval) ? (temp_minval == MAX_VAL) ? temploc : minloc : temploc; \
-    maxloc = (maxval == temp_maxval) ? (temp_maxval == MIN_VAL) ? temploc : maxloc : temploc
-#define SET_LOCAL_1 \
-    localmem_min[lid] = minval; \
-    localmem_max[lid] = maxval; \
-    localmem_minloc[lid] = minloc; \
-    localmem_maxloc[lid] = maxloc
-#define REDUCE_LOCAL_1 \
-    srcT oldmin = localmem_min[lid-WGS2_ALIGNED]; \
-    srcT oldmax = localmem_max[lid-WGS2_ALIGNED]; \
-    localmem_min[lid - WGS2_ALIGNED] = min(minval, localmem_min[lid-WGS2_ALIGNED]); \
-    localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid-WGS2_ALIGNED]); \
-    srcT minv = localmem_min[lid - WGS2_ALIGNED], maxv = localmem_max[lid - WGS2_ALIGNED]; \
-    localmem_minloc[lid - WGS2_ALIGNED] = (minv == minval) ? (minv == oldmin) ? \
-        min(minloc, localmem_minloc[lid-WGS2_ALIGNED]) : minloc : localmem_minloc[lid-WGS2_ALIGNED]; \
-    localmem_maxloc[lid - WGS2_ALIGNED] = (maxv == maxval) ? (maxv == oldmax) ? \
-        min(maxloc, localmem_maxloc[lid-WGS2_ALIGNED]) : maxloc : localmem_maxloc[lid-WGS2_ALIGNED]
-#define REDUCE_LOCAL_2 \
-    srcT oldmin = localmem_min[lid]; \
-    srcT oldmax = localmem_max[lid]; \
-    localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]); \
-    localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]); \
-    srcT min1 = localmem_min[lid], min2 = localmem_min[lid2]; \
-    localmem_minloc[lid] = (localmem_minloc[lid] == negative) ? localmem_minloc[lid2] : (localmem_minloc[lid2] == negative) ? \
-        localmem_minloc[lid] : (min1 == min2) ? (min1 == oldmin) ? min(localmem_minloc[lid2],localmem_minloc[lid]) : \
-        localmem_minloc[lid2] : localmem_minloc[lid]; \
-    srcT max1 = localmem_max[lid], max2 = localmem_max[lid2]; \
-    localmem_maxloc[lid] = (localmem_maxloc[lid] == negative) ? localmem_maxloc[lid2] : (localmem_maxloc[lid2] == negative) ? \
-        localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \
-        localmem_maxloc[lid2] : localmem_maxloc[lid]
-#define CALC_RESULT \
-    storepix(localmem_min[0], dstptr + dstTSIZE * gid); \
-    storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \
-    dstlocptr[gid] = localmem_minloc[0]; \
-    dstlocptr2[gid] = localmem_maxloc[0]
-
-#if defined OP_MIN_MAX_LOC_MASK
-#undef DEFINE_ACCUMULATOR
-#define DEFINE_ACCUMULATOR \
-    srcT minval = MAX_VAL; \
-    srcT maxval = MIN_VAL; \
-    int negative = -1; \
-    int minloc = negative; \
-    int maxloc = negative; \
-    srcT temp, temp_mask, zeroVal = (srcT)(0); \
-    int temploc
-#undef REDUCE_GLOBAL
-#define REDUCE_GLOBAL \
-    temp = loadpix(srcptr + src_index); \
-    temploc = id; \
-    MASK_INDEX; \
-    __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
-    temp_mask = mask[0]; \
-    srcT temp_minval = minval, temp_maxval = maxval; \
-    minval = (temp_mask == zeroVal) ? minval : min(minval, temp); \
-    maxval = (temp_mask == zeroVal) ? maxval : max(maxval, temp); \
-    minloc = (temp_mask == zeroVal) ? minloc : (minval == temp_minval) ? (temp_minval == MAX_VAL) ? temploc : minloc : temploc; \
-    maxloc = (temp_mask == zeroVal) ? maxloc : (maxval == temp_maxval) ? (temp_maxval == MIN_VAL) ? temploc : maxloc : temploc
-#endif
-
 #else
 #error "No operation"
-#endif // end of minMaxLoc stuff
+#endif // end of norm (NORM_INF) with cn > 1 and mask
 
-#ifdef OP_MIN_MAX_LOC
-#undef EXTRA_PARAMS
-#define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2
-
-#elif defined OP_MIN_MAX_LOC_MASK
-#undef EXTRA_PARAMS
-#define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2, \
-    __global const uchar * maskptr, int mask_step, int mask_offset
-
-#elif defined OP_DOT
+#ifdef OP_DOT
 #undef EXTRA_PARAMS
 #define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset
 #endif
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 0a16c064c6..9d78c0f107 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1311,104 +1311,157 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx)
 #ifdef HAVE_OPENCL
 
 template <typename T>
-void getMinMaxRes(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double* minVal,
-                  double* maxVal, int* minLoc, int* maxLoc, const int groupnum, const int cn, const int cols)
+void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
+                  int* minLoc, int* maxLoc,
+                  int groupnum, int cn, int cols)
 {
-    T min = std::numeric_limits<T>::max();
-    T max = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
-    int minloc = INT_MAX, maxloc = INT_MAX;
-    for (int i = 0; i < groupnum; i++)
+    uint index_max = std::numeric_limits<uint>::max();
+    T minval = std::numeric_limits<T>::max();
+    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
+    uint minloc = index_max, maxloc = index_max;
+
+    int index = 0;
+    const T * minptr = NULL, * maxptr = NULL;
+    const uint * minlocptr = NULL, * maxlocptr = NULL;
+    if (minVal || minLoc)
     {
-        T current_min = minv.at<T>(0,i);
-        T current_max = maxv.at<T>(0,i);
-        T oldmin = min, oldmax = max;
-        min = std::min(min, current_min);
-        max = std::max(max, current_max);
-        if (cn == 1)
-        {
-            int current_minloc = minl.at<int>(0,i);
-            int current_maxloc = maxl.at<int>(0,i);
-            if(current_minloc < 0 || current_maxloc < 0) continue;
-            minloc = (oldmin == current_min) ? std::min(minloc, current_minloc) : (oldmin < current_min) ? minloc : current_minloc;
-            maxloc = (oldmax == current_max) ? std::min(maxloc, current_maxloc) : (oldmax > current_max) ? maxloc : current_maxloc;
-        }
+        minptr = (const T *)db.data;
+        index += sizeof(T) * groupnum;
+    }
+    if (maxVal || maxLoc)
+    {
+        maxptr = (const T *)(db.data + index);
+        index += sizeof(T) * groupnum;
     }
-    bool zero_mask = (maxloc == INT_MAX) || (minloc == INT_MAX);
-    if (minVal)
-        *minVal = zero_mask ? 0 : (double)min;
-    if (maxVal)
-        *maxVal = zero_mask ? 0 : (double)max;
     if (minLoc)
     {
-        minLoc[0] = zero_mask ? -1 : minloc/cols;
-        minLoc[1] = zero_mask ? -1 : minloc%cols;
+        minlocptr = (uint *)(db.data + index);
+        index += sizeof(uint) * groupnum;
+    }
+    if (maxLoc)
+        maxlocptr = (uint *)(db.data + index);
+
+    for (int i = 0; i < groupnum; i++)
+    {
+        if (minptr && minptr[i] <= minval)
+        {
+            if (minptr[i] == minval)
+            {
+                if (minlocptr)
+                    minloc = std::min(minlocptr[i], minloc);
+            }
+            else
+            {
+                if (minlocptr)
+                    minloc = minlocptr[i];
+                minval = minptr[i];
+            }
+        }
+        if (maxptr && maxptr[i] >= maxval)
+        {
+            if (maxptr[i] == maxval)
+            {
+                if (maxlocptr)
+                    maxloc = std::min(maxlocptr[i], maxloc);
+            }
+            else
+            {
+                if (maxlocptr)
+                    maxloc = maxlocptr[i];
+                maxval = maxptr[i];
+            }
+        }
+    }
+    bool zero_mask = (minLoc && minloc == index_max) ||
+            (maxLoc && maxloc == index_max);
+
+    if (minVal)
+        *minVal = zero_mask ? 0 : (double)minval;
+    if (maxVal)
+        *maxVal = zero_mask ? 0 : (double)maxval;
+
+    if (minLoc)
+    {
+        minLoc[0] = zero_mask ? -1 : minloc / cols;
+        minLoc[1] = zero_mask ? -1 : minloc % cols;
     }
     if (maxLoc)
     {
-        maxLoc[0] = zero_mask ? -1 : maxloc/cols;
-        maxLoc[1] = zero_mask ? -1 : maxloc%cols;
+        maxLoc[0] = zero_mask ? -1 : maxloc / cols;
+        maxLoc[1] = zero_mask ? -1 : maxloc % cols;
     }
 }
 
-typedef void (*getMinMaxResFunc)(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double *minVal,
-                                 double *maxVal, int *minLoc, int *maxLoc, const int gropunum, const int cn, const int cols);
+typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal,
+                                 int *minLoc, int *maxLoc,
+                                 int gropunum, int cn, int cols);
 
 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask)
 {
     CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
         (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
 
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = 1;
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    const ocl::Device & dev = ocl::Device::getDefault();
+    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty();
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
 
     if (depth == CV_64F && !doubleSupport)
         return false;
 
-    int groupnum = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+    int groupnum = dev.maxComputeUnits();
+    size_t wgs = dev.maxWorkGroupSize();
 
     int wgs2_aligned = 1;
     while (wgs2_aligned < (int)wgs)
         wgs2_aligned <<= 1;
     wgs2_aligned >>= 1;
 
-    String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d"
-                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
-                         depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs,
-                         wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
-                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
+    bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
+            needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;
 
-    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
+    // in case of mask we must know whether mask is filled with zeros or not
+    // so let's calculate min or max location, if it's undefined, so mask is zeros
+    if (!(needMaxLoc || needMinLoc) && haveMask)
+        if (needMinVal)
+            needMinLoc = true;
+        else
+            needMaxVal = true;
+
+    String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
+                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s",
+                         depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
+                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
+                         needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
+                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "");
+
+    ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())
         return false;
 
-    UMat src = _src.getUMat(), minval(1, groupnum, src.type()),
-        maxval(1, groupnum, src.type()), minloc( 1, groupnum, CV_32SC1),
-        maxloc( 1, groupnum, CV_32SC1), mask;
-    if (!_mask.empty())
-        mask = _mask.getUMat();
+    int esz = CV_ELEM_SIZE(depth), esz32s = CV_ELEM_SIZE1(CV_32S),
+            dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
+                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0));
+    UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
 
-    if (src.channels() > 1)
+    if (cn > 1)
         src = src.reshape(1);
 
-    if (mask.empty())
+    if (!haveMask)
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-            groupnum, ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval),
-            ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc));
+               groupnum, ocl::KernelArg::PtrWriteOnly(db));
     else
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), groupnum,
-            ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval),
-            ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc), ocl::KernelArg::ReadOnlyNoSize(mask));
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+               groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
 
     size_t globalsize = groupnum * wgs;
     if (!k.run(1, &globalsize, &wgs, false))
         return false;
 
-    Mat minv = minval.getMat(ACCESS_READ), maxv = maxval.getMat(ACCESS_READ),
-        minl = minloc.getMat(ACCESS_READ), maxl = maxloc.getMat(ACCESS_READ);
-
-    static getMinMaxResFunc functab[7] =
+    static const getMinMaxResFunc functab[7] =
     {
         getMinMaxRes<uchar>,
         getMinMaxRes<char>,
@@ -1419,10 +1472,12 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
         getMinMaxRes<double>
     };
 
-    getMinMaxResFunc func;
+    getMinMaxResFunc func = functab[depth];
 
-    func = functab[depth];
-    func(minv, maxv, minl, maxl, minVal, maxVal, minLoc, maxLoc, groupnum, src.channels(), src.cols);
+    int locTemp[2];
+    func(db.getMat(ACCESS_READ), minVal, maxVal,
+         needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
+         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, cn, src.cols);
 
     return true;
 }

From 1a7a262f7457171e2084f3b8c1af93c21e89c64b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Fri, 6 Jun 2014 17:15:19 +0400
Subject: [PATCH 2/6] optimized cv::norm with NORM_INF

---
 modules/core/src/opencl/minmaxloc.cl | 19 +++++-----
 modules/core/src/opencl/reduce.cl    |  4 +-
 modules/core/src/stat.cpp            | 57 ++++++++++------------------
 3 files changed, 30 insertions(+), 50 deletions(-)

diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index 558679efda..2e48387c77 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -36,6 +36,7 @@
 #define MAX_VAL DBL_MAX
 #endif
 
+#define noconvert
 #define INDEX_MAX UINT_MAX
 
 #ifdef NEED_MINLOC
@@ -93,20 +94,20 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 
 #ifdef NEED_MINVAL
-    __local srcT1 localmem_min[WGS2_ALIGNED];
+    __local dstT1 localmem_min[WGS2_ALIGNED];
 #ifdef NEED_MINLOC
     __local uint localmem_minloc[WGS2_ALIGNED];
 #endif
 #endif
 #ifdef NEED_MAXVAL
-    __local srcT1 localmem_max[WGS2_ALIGNED];
+    __local dstT1 localmem_max[WGS2_ALIGNED];
 #ifdef NEED_MAXLOC
     __local uint localmem_maxloc[WGS2_ALIGNED];
 #endif
 #endif
 
-    srcT1 minval = MAX_VAL, maxval = MIN_VAL;
-    srcT temp;
+    dstT1 minval = MAX_VAL, maxval = MIN_VAL;
+    dstT temp;
     uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
     int src_index;
 #ifdef HAVE_MASK
@@ -130,7 +131,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
         if (mask[mask_index])
 #endif
         {
-            temp = *(__global const srcT *)(srcptr + src_index);
+            temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
 #if kercn == 1
 #ifdef NEED_MINVAL
             if (minval > temp)
@@ -262,12 +263,12 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
     {
         int pos = 0;
 #ifdef NEED_MINVAL
-        *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_min[0];
-        pos = mad24(groupnum, (int)sizeof(srcT1), pos);
+        *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_min[0];
+        pos = mad24(groupnum, (int)sizeof(dstT1), pos);
 #endif
 #ifdef NEED_MAXVAL
-        *(__global srcT1 *)(dstptr + mad24(gid, (int)sizeof(srcT1), pos)) = localmem_max[0];
-        pos = mad24(groupnum, (int)sizeof(srcT1), pos);
+        *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max[0];
+        pos = mad24(groupnum, (int)sizeof(dstT1), pos);
 #endif
 #ifdef NEED_MINLOC
         *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_minloc[0];
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 038f132970..21a5518883 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -50,7 +50,7 @@
 #endif
 #endif
 
-#if defined OP_NORM_INF_MASK || defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
+#if defined OP_NORM_INF_MASK
 
 #ifdef DEPTH_0
 #define MIN_VAL 0
@@ -75,8 +75,6 @@
 #define MAX_VAL DBL_MAX
 #endif
 
-#define INDEX_MAX UINT_MAX
-
 #define dstT srcT
 #define dstT1 srcT1
 
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 9d78c0f107..8996c48015 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1313,7 +1313,7 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx)
 template <typename T>
 void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
                   int* minLoc, int* maxLoc,
-                  int groupnum, int cn, int cols)
+                  int groupnum, int cols)
 {
     uint index_max = std::numeric_limits<uint>::max();
     T minval = std::numeric_limits<T>::max();
@@ -1393,10 +1393,10 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
 }
 
 typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal,
-                                 int *minLoc, int *maxLoc,
-                                 int gropunum, int cn, int cols);
+                                 int *minLoc, int *maxLoc, int gropunum, int cols);
 
-static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask)
+static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
+                           int ddepth = -1, bool absValues = false)
 {
     CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
         (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
@@ -1405,8 +1405,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty();
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
             kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
+    if (ddepth < 0)
+        ddepth = depth;
 
-    if (depth == CV_64F && !doubleSupport)
+    if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
         return false;
 
     int groupnum = dev.maxComputeUnits();
@@ -1423,26 +1425,32 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     // in case of mask we must know whether mask is filled with zeros or not
     // so let's calculate min or max location, if it's undefined, so mask is zeros
     if (!(needMaxLoc || needMinLoc) && haveMask)
+    {
         if (needMinVal)
             needMinLoc = true;
         else
             needMaxVal = true;
+    }
 
+    char cvt[40];
     String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
-                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s",
+                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
+                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s",
                          depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                          _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                          needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
-                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "");
+                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
+                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
+                         ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "");
 
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())
         return false;
 
-    int esz = CV_ELEM_SIZE(depth), esz32s = CV_ELEM_SIZE1(CV_32S),
+    int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
             dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
                                  (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0));
     UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
@@ -1477,7 +1485,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     int locTemp[2];
     func(db.getMat(ACCESS_READ), minVal, maxVal,
          needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
-         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, cn, src.cols);
+         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, src.cols);
 
     return true;
 }
@@ -2116,35 +2124,8 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
     if (normType == NORM_INF)
     {
         if (cn == 1 || !haveMask)
-        {
-            UMat abssrc;
-
-            if (depth != CV_8U && depth != CV_16U)
-            {
-                int wdepth = std::max(CV_32S, depth), rowsPerWI = d.isIntel() ? 4 : 1;
-                char cvt[50];
-
-                ocl::Kernel kabs("KF", ocl::core::arithm_oclsrc,
-                                 format("-D UNARY_OP -D OP_ABS_NOSAT -D dstT=%s -D srcT1=%s"
-                                        " -D convertToDT=%s -D rowsPerWI=%d%s",
-                                        ocl::typeToStr(wdepth), ocl::typeToStr(depth),
-                                        ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
-                                        doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
-                if (kabs.empty())
-                    return false;
-
-                abssrc.create(src.size(), CV_MAKE_TYPE(wdepth, cn));
-                kabs.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(abssrc, cn));
-
-                size_t globalsize[2] = { src.cols * cn, (src.rows + rowsPerWI - 1) / rowsPerWI };
-                if (!kabs.run(2, globalsize, NULL, false))
-                    return false;
-            }
-            else
-                abssrc = src;
-
-            cv::minMaxIdx(haveMask ? abssrc : abssrc.reshape(1), NULL, &result, NULL, NULL, _mask);
-        }
+            ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
+                          std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U);
         else
         {
             int dbsize = d.maxComputeUnits();

From 2040995801b0c685c75c14fcd11cfaa974fc5b9c Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Sat, 7 Jun 2014 15:51:41 +0400
Subject: [PATCH 3/6] optimized cv::norm with 2 args

---
 modules/core/src/opencl/minmaxloc.cl |  81 ++++++++--
 modules/core/src/opencl/reduce.cl    | 219 ++++++++++++++++++++++++++-
 modules/core/src/stat.cpp            | 148 ++++++++++++------
 3 files changed, 387 insertions(+), 61 deletions(-)

diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index 2e48387c77..e3d87b04ce 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -73,14 +73,26 @@
 #define CALC_MAX(p, inc)
 #endif
 
+#ifdef OP_CALC2
+#define CALC_MAX2(p) \
+    if (maxval2 < temp.p) \
+        maxval2 = temp.p
+#else
+#define CALC_MAX2(p)
+#endif
+
 #define CALC_P(p, inc) \
     CALC_MIN(p, inc) \
-    CALC_MAX(p, inc)
+    CALC_MAX(p, inc) \
+    CALC_MAX2(p)
 
 __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
                         int total, int groupnum, __global uchar * dstptr
 #ifdef HAVE_MASK
                         , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+#ifdef HAVE_SRC2
+                        , __global const uchar * src2ptr, int src2_step, int src2_offset
 #endif
                         )
 {
@@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #ifdef HAVE_MASK
     mask += mask_offset;
 #endif
+#ifdef HAVE_SRC2
+    src2ptr += src2_offset;
+#endif
 
 #ifdef NEED_MINVAL
     __local dstT1 localmem_min[WGS2_ALIGNED];
+    dstT1 minval = MAX_VAL;
 #ifdef NEED_MINLOC
     __local uint localmem_minloc[WGS2_ALIGNED];
+    uint minloc = INDEX_MAX;
 #endif
 #endif
 #ifdef NEED_MAXVAL
+    dstT1 maxval = MIN_VAL;
     __local dstT1 localmem_max[WGS2_ALIGNED];
 #ifdef NEED_MAXLOC
     __local uint localmem_maxloc[WGS2_ALIGNED];
+    uint maxloc = INDEX_MAX;
 #endif
+#endif
+#ifdef OP_CALC2
+    __local dstT1 localmem_max2[WGS2_ALIGNED];
+    dstT1 maxval2 = MIN_VAL;
 #endif
 
-    dstT1 minval = MAX_VAL, maxval = MIN_VAL;
-    dstT temp;
-    uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
     int src_index;
 #ifdef HAVE_MASK
     int mask_index;
 #endif
+#ifdef HAVE_SRC2
+    int src2_index;
+#endif
+
+    dstT temp;
+#ifdef HAVE_SRC2
+    dstT temp2;
+#endif
 
     for (int grain = groupnum * WGS * kercn; id < total; id += grain)
     {
-#ifdef HAVE_SRC_CONT
-        src_index = mul24(id, (int)sizeof(srcT1));
-#else
-        src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
-#endif
-
 #ifdef HAVE_MASK
 #ifdef HAVE_MASK_CONT
         mask_index = id;
@@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
         if (mask[mask_index])
 #endif
         {
+#ifdef HAVE_SRC_CONT
+            src_index = mul24(id, (int)sizeof(srcT1));
+#else
+            src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
             temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
+#ifdef OP_ABS
+            temp = temp >= (dstT)(0) ? temp : -temp;
+#endif
+
+#ifdef HAVE_SRC2
+#ifdef HAVE_SRC2_CONT
+            src2_index = mul24(id, (int)sizeof(srcT1));
+#else
+            src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
+#endif
+            temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
+            temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
+#endif
+
 #if kercn == 1
 #ifdef NEED_MINVAL
             if (minval > temp)
@@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
                 maxloc = id;
 #endif
             }
+#ifdef OP_CALC2
+            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
+            if (maxval2 < temp2)
+                maxval2 = temp2;
+#endif
 #endif
 #elif kercn >= 2
             CALC_P(s0, 0)
@@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
         localmem_maxloc[lid] = maxloc;
+#endif
+#ifdef OP_CALC2
+        localmem_max2[lid] = maxval2;
 #endif
     }
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
                 localmem_max[lid3] = maxval;
         }
+#endif
+#ifdef OP_CALC2
+        if (localmem_max2[lid3] < maxval2)
+            localmem_max2[lid3] = maxval2;
 #endif
     }
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
                     localmem_max[lid] = localmem_max[lid2];
             }
+#endif
+#ifdef OP_CALC2
+            if (localmem_max2[lid] < localmem_max2[lid2])
+                localmem_max2[lid] = localmem_max2[lid2];
 #endif
         }
         barrier(CLK_LOCAL_MEM_FENCE);
@@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
         *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
+#endif
+#ifdef OP_CALC2
+        pos = mad24(groupnum, (int)sizeof(uint), pos);
+        *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
 #endif
     }
 }
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 21a5518883..d5350791e3 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -109,13 +109,22 @@
 #endif
 
 #ifdef HAVE_MASK
+#ifdef HAVE_SRC2
+#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
+#else
 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+#else
+#ifdef HAVE_SRC2
+#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset
 #else
 #define EXTRA_PARAMS
 #endif
+#endif
 
 // accumulative reduction stuff
 #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
+
 #ifdef OP_DOT
 #if ddepth <= 4
 #define FUNC(a, b, c) a = mad24(b, c, a)
@@ -137,18 +146,48 @@
 #endif
 #endif
 
+#ifdef OP_CALC2
+#define DECLARE_LOCAL_MEM \
+    __local dstT localmem[WGS2_ALIGNED]; \
+    __local dstT localmem2[WGS2_ALIGNED]
+#define DEFINE_ACCUMULATOR \
+    dstT accumulator = (dstT)(0); \
+    dstT accumulator2 = (dstT)(0)
+#else
 #define DECLARE_LOCAL_MEM \
     __local dstT localmem[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
     dstT accumulator = (dstT)(0)
+#endif
+
+#ifdef HAVE_SRC2
+#ifdef OP_CALC2
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
+    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp -= temp2; \
+    temp = temp > (dstT)(0) ? temp : -temp; \
+    FUNC(accumulator2, temp2); \
+    FUNC(accumulator, temp)
+#else
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp)
+#endif
+#else
+#define PROCESS_ELEMS \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+    FUNC(accumulator, temp)
+#endif
 
 #ifdef HAVE_MASK
 #define REDUCE_GLOBAL \
     MASK_INDEX; \
     if (mask[mask_index]) \
     { \
-        dstT temp = convertToDT(loadpix(srcptr + src_index)); \
-        FUNC(accumulator, temp); \
+        PROCESS_ELEMS; \
     }
 #elif defined OP_DOT
 
@@ -211,7 +250,158 @@
     FUNC(accumulator, temp.sF, temp2.sF)
 #endif
 
-#else
+#else // sum or norm with 2 args
+#ifdef HAVE_SRC2
+#ifdef OP_CALC2 // norm relative
+#if kercn == 1
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp); \
+    FUNC(accumulator2, temp2)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3); \
+    FUNC(accumulator2, temp2.s4); \
+    FUNC(accumulator2, temp2.s5); \
+    FUNC(accumulator2, temp2.s6); \
+    FUNC(accumulator2, temp2.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator, temp.s8); \
+    FUNC(accumulator, temp.s9); \
+    FUNC(accumulator, temp.sA); \
+    FUNC(accumulator, temp.sB); \
+    FUNC(accumulator, temp.sC); \
+    FUNC(accumulator, temp.sD); \
+    FUNC(accumulator, temp.sE); \
+    FUNC(accumulator, temp.sF); \
+    FUNC(accumulator2, temp2.s0); \
+    FUNC(accumulator2, temp2.s1); \
+    FUNC(accumulator2, temp2.s2); \
+    FUNC(accumulator2, temp2.s3); \
+    FUNC(accumulator2, temp2.s4); \
+    FUNC(accumulator2, temp2.s5); \
+    FUNC(accumulator2, temp2.s6); \
+    FUNC(accumulator2, temp2.s7); \
+    FUNC(accumulator2, temp2.s8); \
+    FUNC(accumulator2, temp2.s9); \
+    FUNC(accumulator2, temp2.sA); \
+    FUNC(accumulator2, temp2.sB); \
+    FUNC(accumulator2, temp2.sC); \
+    FUNC(accumulator2, temp2.sD); \
+    FUNC(accumulator2, temp2.sE); \
+    FUNC(accumulator2, temp2.sF)
+#endif
+#else // norm with 2 args
+#if kercn == 1
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp)
+#elif kercn == 2
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1)
+#elif kercn == 4
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3)
+#elif kercn == 8
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7)
+#elif kercn == 16
+#define REDUCE_GLOBAL \
+    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
+    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    FUNC(accumulator, temp.s0); \
+    FUNC(accumulator, temp.s1); \
+    FUNC(accumulator, temp.s2); \
+    FUNC(accumulator, temp.s3); \
+    FUNC(accumulator, temp.s4); \
+    FUNC(accumulator, temp.s5); \
+    FUNC(accumulator, temp.s6); \
+    FUNC(accumulator, temp.s7); \
+    FUNC(accumulator, temp.s8); \
+    FUNC(accumulator, temp.s9); \
+    FUNC(accumulator, temp.sA); \
+    FUNC(accumulator, temp.sB); \
+    FUNC(accumulator, temp.sC); \
+    FUNC(accumulator, temp.sD); \
+    FUNC(accumulator, temp.sE); \
+    FUNC(accumulator, temp.sF)
+#endif
+#endif
+
+#else // sum
 #if kercn == 1
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
@@ -260,6 +450,7 @@
     FUNC(accumulator, temp.sF)
 #endif
 #endif
+#endif
 
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
@@ -325,6 +516,20 @@
     accumulator += value.sF == zero ? zero : one
 #endif
 
+#ifdef OP_CALC2
+#define SET_LOCAL_1 \
+    localmem[lid] = accumulator; \
+    localmem2[lid] = accumulator2; \
+#define REDUCE_LOCAL_1 \
+    localmem[lid - WGS2_ALIGNED] += accumulator; \
+    localmem2[lid - WGS2_ALIGNED] += accumulator2
+#define REDUCE_LOCAL_2 \
+    localmem[lid] += localmem[lid2]; \
+    localmem2[lid] += localmem2[lid2]
+#define CALC_RESULT \
+    storepix(localmem[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
+#else
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -333,6 +538,7 @@
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
     storepix(localmem[0], dstptr + dstTSIZE * gid)
+#endif
 
 // norm (NORM_INF) with cn > 1 and mask
 #elif defined OP_NORM_INF_MASK
@@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
         int src_index = mul24(id, srcTSIZE);
 #else
         int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
+#endif
+#ifdef HAVE_SRC2
+#ifdef HAVE_SRC2_CONT
+        int src2_index = mul24(id, srcTSIZE);
+#else
+        int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
+#endif
 #endif
         REDUCE_GLOBAL;
     }
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 8996c48015..b405d6f7b0 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -469,21 +469,25 @@ template <typename T> Scalar ocl_part_sum(Mat m)
 
 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
 
-static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() )
+static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
+                     InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
 {
     CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
 
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
-            haveMask = _mask.kind() != _InputArray::NONE;
+    const ocl::Device & dev = ocl::Device::getDefault();
+    bool doubleSupport = dev.doubleFPConfig() > 0,
+        haveMask = _mask.kind() != _InputArray::NONE,
+        haveSrc2 = _src2.kind() != _InputArray::NONE;
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
             kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
             mcn = std::max(cn, kercn);
+    CV_Assert(!haveSrc2 || _src2.type() == type);
 
     if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
         return false;
 
-    int dbsize = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
+    size_t wgs = dev.maxWorkGroupSize();
 
     int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
             dtype = CV_MAKE_TYPE(ddepth, cn);
@@ -497,7 +501,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
     char cvt[40];
     String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
-                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d",
+                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s",
                          ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                          ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                          ocl::typeToStr(ddepth), ddepth, cn,
@@ -506,30 +510,49 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                          haveMask ? " -D HAVE_MASK" : "",
                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
-                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
+                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
+                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
 
     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
     if (k.empty())
         return false;
 
-    UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat();
+    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
+        db(1, dbsize, dtype), mask = _mask.getUMat();
 
     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
             dbarg = ocl::KernelArg::PtrWriteOnly(db),
-            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);
 
     if (haveMask)
-        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg);
+    {
+        if (haveSrc2)
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
+        else
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
+    }
     else
-        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg);
+    {
+        if (haveSrc2)
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
+        else
+            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
+    }
 
-    size_t globalsize = dbsize * wgs;
+    size_t globalsize = ngroups * wgs;
     if (k.run(1, &globalsize, &wgs, false))
     {
         typedef Scalar (*part_sum)(Mat m);
         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                 func = funcs[ddepth - CV_32S];
-        res = func(db.getMat(ACCESS_READ));
+
+        Mat mres = db.getMat(ACCESS_READ);
+        if (calc2)
+            const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
+
+        res = func(mres.colRange(0, dbsize));
         return true;
     }
     return false;
@@ -1396,18 +1419,21 @@ typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal,
                                  int *minLoc, int *maxLoc, int gropunum, int cols);
 
 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
-                           int ddepth = -1, bool absValues = false)
+                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false)
 {
     CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
         (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
 
     const ocl::Device & dev = ocl::Device::getDefault();
-    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty();
+    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
+        haveSrc2 = _src2.kind() != _InputArray::NONE;
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
             kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
     if (ddepth < 0)
         ddepth = depth;
 
+    CV_Assert(!haveSrc2 || _src2.type() == type);
+
     if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
         return false;
 
@@ -1435,7 +1461,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     char cvt[40];
     String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                          " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
-                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s",
+                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s",
                          depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@@ -1444,7 +1470,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
                          needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                          ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
-                         ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "");
+                         ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "",
+                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
 
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())
@@ -1452,18 +1480,35 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
 
     int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
             dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
-                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0));
-    UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
+                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
+                                 (calc2 ? esz : 0));
+    UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
 
     if (cn > 1)
+    {
         src = src.reshape(1);
+        src2 = src2.reshape(1);
+    }
 
-    if (!haveMask)
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-               groupnum, ocl::KernelArg::PtrWriteOnly(db));
+    if (haveSrc2)
+    {
+        if (!haveMask)
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
+        else
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
+                   ocl::KernelArg::ReadOnlyNoSize(src2));
+    }
     else
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-               groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
+    {
+        if (!haveMask)
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db));
+        else
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
+    }
 
     size_t globalsize = groupnum * wgs;
     if (!k.run(1, &globalsize, &wgs, false))
@@ -2498,38 +2543,45 @@ namespace cv {
 
 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
 {
-    const ocl::Device & d = ocl::Device::getDefault();
-    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1;
-    bool doubleSupport = d.doubleFPConfig() > 0;
-    bool relative = (normType & NORM_RELATIVE) != 0;
+    Scalar sc1, sc2;
+    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    bool relative = (normType & NORM_RELATIVE) != 0,
+        normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
     normType &= ~NORM_RELATIVE;
 
-    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
-         (!doubleSupport && depth == CV_64F))
+    if ( !(normType == NORM_INF || normsum) )
         return false;
 
-    int wdepth = std::max(CV_32S, depth);
-    char cvt[50];
-    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1"
-                         " -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s",
-                         ocl::typeToStr(wdepth), ocl::typeToStr(depth),
-                         ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI,
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
-    if (k.empty())
-        return false;
+    if (normsum)
+    {
+        if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
+                     OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
+            return false;
+    }
+    else
+    {
+        if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth),
+                           false, _src2, relative))
+            return false;
+    }
 
-    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), diff(src1.size(), CV_MAKE_TYPE(wdepth, cn));
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
-           ocl::KernelArg::WriteOnly(diff, cn));
+    double s2 = 0;
+    for (int i = 0; i < cn; ++i)
+    {
+        result += sc1[i];
+        if (relative)
+            s2 += sc2[i];
+    }
 
-    size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI };
-    if (!k.run(2, globalsize, NULL, false))
-        return false;
+    if (normType == NORM_L2)
+    {
+        result = std::sqrt(result);
+        if (relative)
+            s2 = std::sqrt(s2);
+    }
 
-    result = cv::norm(diff, normType, _mask);
     if (relative)
-        result /= cv::norm(src2, normType, _mask) + DBL_EPSILON;
+        result /= (s2 + DBL_EPSILON);
 
     return true;
 }

From 5403bdd2286cd8a4fb2b2b688fd02f6d86361a4b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Sat, 7 Jun 2014 20:53:20 +0400
Subject: [PATCH 4/6] optimized cv::norm with NORM_RELATIVE

---
 modules/core/src/opencl/minmaxloc.cl  |  6 ++--
 modules/core/src/opencl/reduce.cl     |  3 ++
 modules/core/src/stat.cpp             | 44 +++++++++++++++++----------
 modules/core/test/ocl/test_arithm.cpp | 12 ++++++++
 4 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index e3d87b04ce..11b6da949b 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -76,7 +76,7 @@
 #ifdef OP_CALC2
 #define CALC_MAX2(p) \
     if (maxval2 < temp.p) \
-        maxval2 = temp.p
+        maxval2 = temp.p;
 #else
 #define CALC_MAX2(p)
 #endif
@@ -171,6 +171,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
             temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
             temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
+#ifdef OP_CALC2
+            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
+#endif
 #endif
 
 #if kercn == 1
@@ -192,7 +195,6 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
             }
 #ifdef OP_CALC2
-            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
             if (maxval2 < temp2)
                 maxval2 = temp2;
 #endif
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index d5350791e3..92818e356b 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -580,6 +580,9 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
     int  id = get_global_id(0) * kercn;
 
     srcptr += src_offset;
+#ifdef HAVE_SRC2
+    src2ptr += src2_offset;
+#endif
 
     DECLARE_LOCAL_MEM;
     DEFINE_ACCUMULATOR;
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index b405d6f7b0..34c487ae31 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1334,17 +1334,17 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx)
 #ifdef HAVE_OPENCL
 
 template <typename T>
-void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
+void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
                   int* minLoc, int* maxLoc,
-                  int groupnum, int cols)
+                  int groupnum, int cols, double * maxVal2)
 {
     uint index_max = std::numeric_limits<uint>::max();
     T minval = std::numeric_limits<T>::max();
-    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
+    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
     uint minloc = index_max, maxloc = index_max;
 
     int index = 0;
-    const T * minptr = NULL, * maxptr = NULL;
+    const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
     const uint * minlocptr = NULL, * maxlocptr = NULL;
     if (minVal || minLoc)
     {
@@ -1362,7 +1362,12 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
         index += sizeof(uint) * groupnum;
     }
     if (maxLoc)
+    {
         maxlocptr = (uint *)(db.data + index);
+        index += sizeof(uint) * groupnum;
+    }
+    if (maxVal2)
+        maxptr2 = (const T *)(db.data + index);
 
     for (int i = 0; i < groupnum; i++)
     {
@@ -1394,6 +1399,8 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
                 maxval = maxptr[i];
             }
         }
+        if (maxptr2 && maxptr2[i] > maxval2)
+            maxval2 = maxptr2[i];
     }
     bool zero_mask = (minLoc && minloc == index_max) ||
             (maxLoc && maxloc == index_max);
@@ -1402,6 +1409,8 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
         *minVal = zero_mask ? 0 : (double)minval;
     if (maxVal)
         *maxVal = zero_mask ? 0 : (double)maxval;
+    if (maxVal2)
+        *maxVal2 = zero_mask ? 0 : (double)maxval2;
 
     if (minLoc)
     {
@@ -1415,20 +1424,21 @@ void getMinMaxRes(const Mat & db, double* minVal, double* maxVal,
     }
 }
 
-typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal,
-                                 int *minLoc, int *maxLoc, int gropunum, int cols);
+typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
+                                 int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);
 
 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
-                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false)
+                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL)
 {
-    CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
-        (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
-
     const ocl::Device & dev = ocl::Device::getDefault();
     bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
         haveSrc2 = _src2.kind() != _InputArray::NONE;
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
             kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
+
+    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
+              (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) );
+
     if (ddepth < 0)
         ddepth = depth;
 
@@ -1471,7 +1481,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                          ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
                          ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "",
-                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
+                         haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
 
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
@@ -1481,7 +1491,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
             dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
                                  (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
-                                 (calc2 ? esz : 0));
+                                 (maxVal2 ? esz : 0));
     UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
 
     if (cn > 1)
@@ -1525,12 +1535,13 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
         getMinMaxRes<double>
     };
 
-    getMinMaxResFunc func = functab[depth];
+    getMinMaxResFunc func = functab[ddepth];
 
     int locTemp[2];
     func(db.getMat(ACCESS_READ), minVal, maxVal,
          needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
-         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, groupnum, src.cols);
+         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
+         groupnum, src.cols, maxVal2);
 
     return true;
 }
@@ -2560,9 +2571,10 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
     }
     else
     {
-        if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth),
-                           false, _src2, relative))
+        if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
+                           false, _src2, relative ? &sc2[0] : NULL))
             return false;
+        cn = 1;
     }
 
     double s2 = 0;
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index d39697584b..a7a09cabb7 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -1293,6 +1293,8 @@ OCL_TEST_P(Norm, NORM_INF_2args)
         {
             generateTestData();
 
+            SCOPED_TRACE(relative ? "NORM_RELATIVE" : "");
+
             int type = NORM_INF;
             if (relative == 1)
                 type |= NORM_RELATIVE;
@@ -1311,6 +1313,8 @@ OCL_TEST_P(Norm, NORM_INF_2args_mask)
         {
             generateTestData();
 
+            SCOPED_TRACE(relative ? "NORM_RELATIVE" : "");
+
             int type = NORM_INF;
             if (relative == 1)
                 type |= NORM_RELATIVE;
@@ -1329,6 +1333,8 @@ OCL_TEST_P(Norm, NORM_L1_2args)
         {
             generateTestData();
 
+            SCOPED_TRACE(relative ? "NORM_RELATIVE" : "");
+
             int type = NORM_L1;
             if (relative == 1)
                 type |= NORM_RELATIVE;
@@ -1347,6 +1353,8 @@ OCL_TEST_P(Norm, NORM_L1_2args_mask)
         {
             generateTestData();
 
+            SCOPED_TRACE(relative ? "NORM_RELATIVE" : "");
+
             int type = NORM_L1;
             if (relative == 1)
                 type |= NORM_RELATIVE;
@@ -1365,6 +1373,8 @@ OCL_TEST_P(Norm, NORM_L2_2args)
         {
             generateTestData();
 
+            SCOPED_TRACE(relative ? "NORM_RELATIVE" : "");
+
             int type = NORM_L2;
             if (relative == 1)
                 type |= NORM_RELATIVE;
@@ -1383,6 +1393,8 @@ OCL_TEST_P(Norm, NORM_L2_2args_mask)
         {
             generateTestData();
 
+            SCOPED_TRACE(relative ? "NORM_RELATIVE" : "");
+
             int type = NORM_L2;
             if (relative == 1)
                 type |= NORM_RELATIVE;

From 7f2662b310489d3336cadf46c386d271ebf09ae0 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Mon, 9 Jun 2014 00:50:14 +0400
Subject: [PATCH 5/6] fixes

---
 .../include/opencv2/core/opencl/ocl_defs.hpp  |  2 +-
 modules/core/src/opencl/minmaxloc.cl          | 40 ++++++++++------
 modules/core/src/opencl/reduce.cl             | 47 ++++++++++---------
 modules/core/src/stat.cpp                     | 19 ++++----
 4 files changed, 62 insertions(+), 46 deletions(-)

diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
index 55f8849b8a..76d4f84365 100644
--- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -5,7 +5,7 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 
-//#define CV_OPENCL_RUN_ASSERT
+#define CV_OPENCL_RUN_ASSERT
 
 #ifdef HAVE_OPENCL
 
diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index 11b6da949b..56de655dff 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -15,16 +15,16 @@
 
 #ifdef DEPTH_0
 #define MIN_VAL 0
-#define MAX_VAL 255
+#define MAX_VAL UCHAR_MAX
 #elif defined DEPTH_1
-#define MIN_VAL -128
-#define MAX_VAL 127
+#define MIN_VAL SCHAR_MIN
+#define MAX_VAL SCHAR_MAX
 #elif defined DEPTH_2
 #define MIN_VAL 0
-#define MAX_VAL 65535
+#define MAX_VAL USHRT_MAX
 #elif defined DEPTH_3
-#define MIN_VAL -32768
-#define MAX_VAL 32767
+#define MIN_VAL SHRT_MIN
+#define MAX_VAL SHRT_MAX
 #elif defined DEPTH_4
 #define MIN_VAL INT_MIN
 #define MAX_VAL INT_MAX
@@ -39,6 +39,14 @@
 #define noconvert
 #define INDEX_MAX UINT_MAX
 
+#if kercn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define srcTSIZE (int)sizeof(srcT1)
+#else
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define srcTSIZE ((int)sizeof(srcT1))
+#endif
+
 #ifdef NEED_MINLOC
 #define CALC_MINLOC(inc) minloc = id + inc
 #else
@@ -154,22 +162,22 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
         {
 #ifdef HAVE_SRC_CONT
-            src_index = mul24(id, (int)sizeof(srcT1));
+            src_index = mul24(id, srcTSIZE);
 #else
-            src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
+            src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
 #endif
-            temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
+            temp = convertToDT(loadpix(srcptr + src_index));
 #ifdef OP_ABS
             temp = temp >= (dstT)(0) ? temp : -temp;
 #endif
 
 #ifdef HAVE_SRC2
 #ifdef HAVE_SRC2_CONT
-            src2_index = mul24(id, (int)sizeof(srcT1));
+            src2_index = mul24(id, srcTSIZE);
 #else
-            src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
+            src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
 #endif
-            temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
+            temp2 = convertToDT(loadpix(src2ptr + src2_index));
             temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
 #ifdef OP_CALC2
             temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
@@ -202,8 +210,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #elif kercn >= 2
             CALC_P(s0, 0)
             CALC_P(s1, 1)
-#if kercn >= 4
+#if kercn >= 3
             CALC_P(s2, 2)
+#if kercn >= 4
             CALC_P(s3, 3)
 #if kercn >= 8
             CALC_P(s4, 4)
@@ -222,6 +231,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #endif
 #endif
+#endif
 #endif
         }
     }
@@ -335,9 +345,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
 #ifdef NEED_MAXLOC
         *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
-#endif
 #ifdef OP_CALC2
         pos = mad24(groupnum, (int)sizeof(uint), pos);
+#endif
+#endif
+#ifdef OP_CALC2
         *(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
 #endif
     }
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 92818e356b..9418cec0de 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -148,11 +148,9 @@
 
 #ifdef OP_CALC2
 #define DECLARE_LOCAL_MEM \
-    __local dstT localmem[WGS2_ALIGNED]; \
-    __local dstT localmem2[WGS2_ALIGNED]
+    __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
-    dstT accumulator = (dstT)(0); \
-    dstT accumulator2 = (dstT)(0)
+    dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)
 #else
 #define DECLARE_LOCAL_MEM \
     __local dstT localmem[WGS2_ALIGNED]
@@ -163,10 +161,10 @@
 #ifdef HAVE_SRC2
 #ifdef OP_CALC2
 #define PROCESS_ELEMS \
-    dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
     dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp -= temp2; \
-    temp = temp > (dstT)(0) ? temp : -temp; \
+    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator2, temp2); \
     FUNC(accumulator, temp)
 #else
@@ -258,6 +256,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp); \
     FUNC(accumulator2, temp2)
 #elif kercn == 2
@@ -265,6 +264,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator2, temp2.s0); \
@@ -274,6 +274,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -287,6 +288,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -308,6 +310,7 @@
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -452,6 +455,20 @@
 #endif
 #endif
 
+#ifdef OP_CALC2
+#define SET_LOCAL_1 \
+    localmem[lid] = accumulator; \
+    localmem2[lid] = accumulator2
+#define REDUCE_LOCAL_1 \
+    localmem[lid - WGS2_ALIGNED] += accumulator; \
+    localmem2[lid - WGS2_ALIGNED] += accumulator2
+#define REDUCE_LOCAL_2 \
+    localmem[lid] += localmem[lid2]; \
+    localmem2[lid] += localmem2[lid2]
+#define CALC_RESULT \
+    storepix(localmem[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
+#else
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -460,6 +477,7 @@
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
     storepix(localmem[0], dstptr + dstTSIZE * gid)
+#endif
 
 // countNonZero stuff
 #elif defined OP_COUNT_NON_ZERO
@@ -516,20 +534,6 @@
     accumulator += value.sF == zero ? zero : one
 #endif
 
-#ifdef OP_CALC2
-#define SET_LOCAL_1 \
-    localmem[lid] = accumulator; \
-    localmem2[lid] = accumulator2; \
-#define REDUCE_LOCAL_1 \
-    localmem[lid - WGS2_ALIGNED] += accumulator; \
-    localmem2[lid - WGS2_ALIGNED] += accumulator2
-#define REDUCE_LOCAL_2 \
-    localmem[lid] += localmem[lid2]; \
-    localmem2[lid] += localmem2[lid2]
-#define CALC_RESULT \
-    storepix(localmem[0], dstptr + dstTSIZE * gid); \
-    storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
-#else
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -538,7 +542,6 @@
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
     storepix(localmem[0], dstptr + dstTSIZE * gid)
-#endif
 
 // norm (NORM_INF) with cn > 1 and mask
 #elif defined OP_NORM_INF_MASK
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 34c487ae31..01f50fa232 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -550,9 +550,9 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
 
         Mat mres = db.getMat(ACCESS_READ);
         if (calc2)
-            const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
+            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));
 
-        res = func(mres.colRange(0, dbsize));
+        res = func(mres.colRange(0, ngroups));
         return true;
     }
     return false;
@@ -1434,10 +1434,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
         haveSrc2 = _src2.kind() != _InputArray::NONE;
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
+            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src));
 
-    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
-              (cn >= 1 && _mask.empty() && !minLoc && !maxLoc) );
+    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
+              (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) );
 
     if (ddepth < 0)
         ddepth = depth;
@@ -1484,6 +1484,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
 
+    printf("%s\n", opts.c_str());
+
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())
         return false;
@@ -2556,9 +2558,9 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
 {
     Scalar sc1, sc2;
     int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool relative = (normType & NORM_RELATIVE) != 0,
-        normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
+    bool relative = (normType & NORM_RELATIVE) != 0;
     normType &= ~NORM_RELATIVE;
+    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
 
     if ( !(normType == NORM_INF || normsum) )
         return false;
@@ -2608,8 +2610,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
 
 #ifdef HAVE_OPENCL
     double _result = 0;
-    CV_OCL_RUN_(_src1.isUMat() && _src2.isUMat() &&
-                _src1.dims() <= 2 && _src2.dims() <= 2,
+    CV_OCL_RUN_(_src1.isUMat(),
                 ocl_norm(_src1, _src2, normType, _mask, _result),
                 _result)
 #endif

From 634da9f3bfbb32a6c337623d34fb74a879d147f3 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Mon, 9 Jun 2014 15:32:35 +0400
Subject: [PATCH 6/6] added norm_inf support to minmaxloc kernel

---
 .../include/opencv2/core/opencl/ocl_defs.hpp  |  2 +-
 modules/core/src/opencl/minmaxloc.cl          | 22 ++++++++--
 modules/core/src/opencl/reduce.cl             | 25 +----------
 modules/core/src/stat.cpp                     | 44 +++----------------
 4 files changed, 26 insertions(+), 67 deletions(-)

diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
index 76d4f84365..55f8849b8a 100644
--- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -5,7 +5,7 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 
-#define CV_OPENCL_RUN_ASSERT
+//#define CV_OPENCL_RUN_ASSERT
 
 #ifdef HAVE_OPENCL
 
diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index 56de655dff..eb57347a28 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -41,10 +41,15 @@
 
 #if kercn != 3
 #define loadpix(addr) *(__global const srcT *)(addr)
-#define srcTSIZE (int)sizeof(srcT1)
+#define srcTSIZE (int)sizeof(srcT)
 #else
 #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
-#define srcTSIZE ((int)sizeof(srcT1))
+#define srcTSIZE ((int)sizeof(srcT1) * 3)
+#endif
+
+#ifndef HAVE_MASK
+#undef srcTSIZE
+#define srcTSIZE (int)sizeof(srcT1)
 #endif
 
 #ifdef NEED_MINLOC
@@ -106,7 +111,12 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 {
     int lid = get_local_id(0);
     int gid = get_group_id(0);
-    int  id = get_global_id(0) * kercn;
+    int  id = get_global_id(0)
+#ifndef HAVE_MASK
+    * kercn;
+#else
+    ;
+#endif
 
     srcptr += src_offset;
 #ifdef HAVE_MASK
@@ -150,7 +160,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
     dstT temp2;
 #endif
 
-    for (int grain = groupnum * WGS * kercn; id < total; id += grain)
+    for (int grain = groupnum * WGS
+#ifndef HAVE_MASK
+        * kercn
+#endif
+        ; id < total; id += grain)
     {
 #ifdef HAVE_MASK
 #ifdef HAVE_MASK_CONT
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 9418cec0de..888b5dff8b 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -543,32 +543,9 @@
 #define CALC_RESULT \
     storepix(localmem[0], dstptr + dstTSIZE * gid)
 
-// norm (NORM_INF) with cn > 1 and mask
-#elif defined OP_NORM_INF_MASK
-
-#define DECLARE_LOCAL_MEM \
-    __local srcT localmem_max[WGS2_ALIGNED]
-#define DEFINE_ACCUMULATOR \
-    srcT maxval = MIN_VAL, temp
-#define REDUCE_GLOBAL \
-    MASK_INDEX; \
-    if (mask[mask_index]) \
-    { \
-        temp = loadpix(srcptr + src_index); \
-        maxval = max(maxval, (srcT)(temp >= (srcT)(0) ? temp : -temp)); \
-    }
-#define SET_LOCAL_1 \
-    localmem_max[lid] = maxval
-#define REDUCE_LOCAL_1 \
-    localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid - WGS2_ALIGNED])
-#define REDUCE_LOCAL_2 \
-    localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2])
-#define CALC_RESULT \
-    storepix(localmem_max[0], dstptr + dstTSIZE * gid)
-
 #else
 #error "No operation"
-#endif // end of norm (NORM_INF) with cn > 1 and mask
+#endif
 
 #ifdef OP_DOT
 #undef EXTRA_PARAMS
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 01f50fa232..79da3c623f 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1437,7 +1437,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
             kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src));
 
     CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
-              (cn >= 1 && (!haveMask || haveSrc2) && !minLoc && !maxLoc) );
+              (cn >= 1 && !minLoc && !maxLoc) );
 
     if (ddepth < 0)
         ddepth = depth;
@@ -1465,7 +1465,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
         if (needMinVal)
             needMinLoc = true;
         else
-            needMaxVal = true;
+            needMaxLoc = true;
     }
 
     char cvt[40];
@@ -1484,8 +1484,6 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                          haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
 
-    printf("%s\n", opts.c_str());
-
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())
         return false;
@@ -1496,7 +1494,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                                  (maxVal2 ? esz : 0));
     UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
 
-    if (cn > 1)
+    if (cn > 1 && !haveMask)
     {
         src = src.reshape(1);
         src2 = src2.reshape(1);
@@ -2181,39 +2179,9 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
 
     if (normType == NORM_INF)
     {
-        if (cn == 1 || !haveMask)
-            ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
-                          std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U);
-        else
-        {
-            int dbsize = d.maxComputeUnits();
-            size_t wgs = d.maxWorkGroupSize();
-
-            int wgs2_aligned = 1;
-            while (wgs2_aligned < (int)wgs)
-                wgs2_aligned <<= 1;
-            wgs2_aligned >>= 1;
-
-            ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                          format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d"
-                                 " -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s%s%s",
-                                 depth, ocl::typeToStr(type), ocl::typeToStr(depth),
-                                 wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                                 src.isContinuous() ? " -D HAVE_CONT_SRC" : "",
-                                 _mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""));
-            if (k.empty())
-                return false;
-
-            UMat db(1, dbsize, type), mask = _mask.getUMat();
-            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-                   dbsize, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
-
-            size_t globalsize = dbsize * wgs;
-            if (!k.run(1, &globalsize, &wgs, true))
-                return false;
-
-            minMaxIdx(db.getMat(ACCESS_READ), NULL, &result, NULL, NULL, noArray());
-        }
+        if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
+                           std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
+            return false;
     }
     else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
     {