From 576ab3df9aa21b7da314b3bba0b05fbb58741901 Mon Sep 17 00:00:00 2001
From: catree <catree.catreus@outlook.com>
Date: Wed, 27 Feb 2019 19:36:23 +0100
Subject: [PATCH 01/18] Add division operators for Matx.

---
 modules/core/include/opencv2/core/matx.hpp | 42 ++++++++++++++++++++++
 1 file changed, 42 insertions(+)
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index bf3f046276..d384f8e1b2 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -1275,6 +1275,48 @@ Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
     return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
 }
 
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, int alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, int alpha)
+{
+    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)
+{
+    return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
+}
+
 template<typename _Tp, int m, int n> static inline
 Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
 {

From c87b99e82bd27bd78cfc9be29804b7059e05c196 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Fri, 21 Feb 2020 09:57:37 +0300
Subject: [PATCH 02/18] Added test for new MatX division.

---
 modules/core/include/opencv2/core/matx.hpp | 18 +-------
 modules/core/test/test_operations.cpp      | 52 ++++++++++++++++++++++
 2 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index d384f8e1b2..a68e34edd6 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -1275,19 +1275,11 @@ Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
     return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
 }
 
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, int alpha)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
-    return a;
-}
-
 template<typename _Tp, int m, int n> static inline
 Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)
 {
     for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+        a.val[i] = a.val[i] / alpha;
     return a;
 }
 
@@ -1295,16 +1287,10 @@ template<typename _Tp, int m, int n> static inline
 Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)
 {
     for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+        a.val[i] = a.val[i] / alpha;
     return a;
 }
 
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, int alpha)
-{
-    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
-}
-
 template<typename _Tp, int m, int n> static inline
 Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)
 {
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index aea6f229ac..c380568d9f 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -69,6 +69,8 @@ protected:
     bool TestVec();
     bool TestMatxMultiplication();
     bool TestMatxElementwiseDivison();
+    bool TestDivisionByValue();
+    bool TestInplaceDivisionByValue();
     bool TestMatMatxCastSum();
     bool TestSubMatAccess();
     bool TestExp();
@@ -976,6 +978,50 @@ bool CV_OperationsTest::TestMatxElementwiseDivison()
     return true;
 }
 
+bool CV_OperationsTest::TestDivisionByValue()
+{
+    try
+    {
+        Matx22f mat(2, 4, 6, 8);
+        float alpha = 2.f;
+
+        Matx22f res = mat / alpha;
+
+        if(res(0, 0) != 1.0) throw test_excep();
+        if(res(0, 1) != 2.0) throw test_excep();
+        if(res(1, 0) != 3.0) throw test_excep();
+        if(res(1, 1) != 4.0) throw test_excep();
+    }
+    catch(const test_excep&)
+    {
+        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        return false;
+    }
+    return true;
+}
+
+
+bool CV_OperationsTest::TestInplaceDivisionByValue()
+{
+    try
+    {
+        Matx22f mat(2, 4, 6, 8);
+        float alpha = 2.f;
+
+        mat /= alpha;
+
+        if(mat(0, 0) != 1.0) throw test_excep();
+        if(mat(0, 1) != 2.0) throw test_excep();
+        if(mat(1, 0) != 3.0) throw test_excep();
+        if(mat(1, 1) != 4.0) throw test_excep();
+    }
+    catch(const test_excep&)
+    {
+        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        return false;
+    }
+    return true;
+}
 
 bool CV_OperationsTest::TestVec()
 {
@@ -1204,6 +1250,12 @@ void CV_OperationsTest::run( int /* start_from */)
     if (!TestMatxElementwiseDivison())
         return;
 
+    if (!TestDivisionByValue())
+        return;
+
+    if (!TestInplaceDivisionByValue())
+        return;
+
     if (!TestMatMatxCastSum())
         return;
 

From 8b2c499be623e6a4b07802ab63abeb17445cab88 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@gmail.com>
Date: Fri, 21 Feb 2020 15:17:34 +0300
Subject: [PATCH 03/18] intrin: fixed int64->double conversion for AVX-512

---
 modules/core/include/opencv2/core/hal/intrin_avx512.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
index 3fa9027c04..e189582daa 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -1553,13 +1553,13 @@ inline v_float64x8 v_cvt_f64(const v_int64x8& v)
     return v_float64x8(_mm512_cvtepi64_pd(v.val));
 #else
     // constants encoded as floating-point
-    __m512i magic_i_lo   = _mm512_set1_epi64x(0x4330000000000000); // 2^52
-    __m512i magic_i_hi32 = _mm512_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
-    __m512i magic_i_all  = _mm512_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
+    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
+    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
     __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);
 
     // Blend the 32 lowest significant bits of v with magic_int_lo
-    __m512i v_lo         = _mm512_blend_epi32(magic_i_lo, v.val, 0x55);
+    __m512i v_lo         = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
     // Extract the 32 most significant bits of v
     __m512i v_hi         = _mm512_srli_epi64(v.val, 32);
     // Flip the msb of v_hi and blend with 0x45300000

From 07b475062fa2ca671d61639a3bae1e5c86718001 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Fri, 21 Feb 2020 16:13:41 +0300
Subject: [PATCH 04/18] Merge pull request #16608 from
 vpisarev:fix_mac_ocl_tests

* fixed several problems when running tests on Mac:
* OCL_pyrUp
* OCL_flip
* some basic UMat tests
* histogram badarg test (out of range access)

* retained the storepix fix in ocl_flip only for 16U/16S datatype, where the OpenCL compiler on Mac generates incorrect code

* moved deletion of ACCESS_FAST flag to non-SVM branch (where SVM is shared virtual memory (in OpenCL 2.x), not support vector machine)

* on Macs only, force OpenCL to use read/write for GPU<=>CPU memory transfers on machines with discrete video. On Windows/Linux the drivers are seemingly smart enough to implement map/unmap properly (and maybe more efficiently than explicit read/write)
---
 modules/core/src/copy.cpp                |  4 ++--
 modules/core/src/ocl.cpp                 | 13 +++++++++++--
 modules/core/src/opencl/flip.cl          | 24 ++++++++++++++++++------
 modules/imgproc/src/pyramids.cpp         | 23 +++++++++--------------
 modules/imgproc/test/test_histograms.cpp |  2 +-
 5 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 1f981ee871..48440ef265 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -916,9 +916,9 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
     kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
 
     ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
+        format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
                 kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
-                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), cn, pxPerWIy, kercn));
+                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn));
     if (k.empty())
         return false;
 
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 7780364f1c..dbebf02f7f 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4705,6 +4705,8 @@ public:
             int createFlags = 0, flags0 = 0;
             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
 
+            bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
+
             cl_context ctx_handle = (cl_context)ctx.ptr();
             int allocatorFlags = 0;
             int tempUMatFlags = 0;
@@ -4764,8 +4766,15 @@ public:
             else
 #endif
             {
+                if( copyOnMap )
+                    accessFlags &= ~ACCESS_FAST;
+
                 tempUMatFlags = UMatData::TEMP_UMAT;
-                if (CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
+                if (
+                #ifdef __APPLE__
+                    !copyOnMap &&
+                #endif
+                    CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
                     // There are OpenCL runtime issues for less aligned data
                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
@@ -4793,7 +4802,7 @@ public:
             u->handle = handle;
             u->prevAllocator = u->currAllocator;
             u->currAllocator = this;
-            u->flags |= tempUMatFlags;
+            u->flags |= tempUMatFlags | flags0;
             u->allocatorFlags_ = allocatorFlags;
         }
         if(accessFlags & ACCESS_WRITE)
diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl
index bd670a5b72..afd14e4e1f 100644
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -42,10 +42,25 @@
 #if kercn != 3
 #define loadpix(addr) *(__global const T *)(addr)
 #define storepix(val, addr)  *(__global T *)(addr) = val
+#define storepix_2(val0, val1, addr0, addr1) \
+    *(__global T *)(addr0) = val0; *(__global T *)(addr1) = val1
 #define TSIZE (int)sizeof(T)
 #else
 #define loadpix(addr) vload3(0, (__global const T1 *)(addr))
 #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#if DEPTH == 2 || DEPTH == 3
+#define storepix_2(val0, val1, addr0, addr1) \
+    ((__global T1 *)(addr0))[0] = val0.x; \
+    ((__global T1 *)(addr1))[0] = val1.x; \
+    ((__global T1 *)(addr0))[1] = val0.y; \
+    ((__global T1 *)(addr1))[1] = val1.y; \
+    ((__global T1 *)(addr0))[2] = val0.z; \
+    ((__global T1 *)(addr1))[2] = val1.z
+#else
+#define storepix_2(val0, val1, addr0, addr1) \
+    storepix(val0, addr0); \
+    storepix(val1, addr1)
+#endif
 #define TSIZE ((int)sizeof(T1)*3)
 #endif
 
@@ -69,8 +84,7 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int
             T src0 = loadpix(srcptr + src_index0);
             T src1 = loadpix(srcptr + src_index1);
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 -= src_step;
@@ -115,8 +129,7 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step,
 #endif
 #endif
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 -= src_step;
@@ -161,8 +174,7 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int
 #endif
 #endif
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 += src_step;
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index ec4427f219..ab6c8fdb6f 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -1078,7 +1078,7 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     UMat dst = _dst.getUMat();
 
     int float_depth = depth == CV_64F ? CV_64F : CV_32F;
-    const int local_size = 16;
+    const int local_size = channels == 1 ? 16 : 8;
     char cvt[2][50];
     String buildOptions = format(
             "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
@@ -1092,22 +1092,17 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
     size_t localThreads[2] = { (size_t)local_size, (size_t)local_size };
     ocl::Kernel k;
-    if (ocl::Device::getDefault().isIntel() && channels == 1)
+    if (type == CV_8UC1 && src.cols % 2 == 0)
     {
-        if (type == CV_8UC1 && src.cols % 2 == 0)
-        {
-            buildOptions.clear();
-            k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
-            globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
-        }
-        else
-        {
-            k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
-            globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
-        }
+        buildOptions.clear();
+        k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
+        globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
     }
     else
-        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);
+    {
+        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
+        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
+    }
 
     if (k.empty())
         return false;
diff --git a/modules/imgproc/test/test_histograms.cpp b/modules/imgproc/test/test_histograms.cpp
index fdf31fe771..afe6e53603 100644
--- a/modules/imgproc/test/test_histograms.cpp
+++ b/modules/imgproc/test/test_histograms.cpp
@@ -1966,7 +1966,7 @@ TEST(Imgproc_Hist_Calc, badarg)
     Mat img = cv::Mat::zeros(10, 10, CV_8UC1);
     Mat imgInt = cv::Mat::zeros(10, 10, CV_32SC1);
     Mat hist;
-    const int hist_size[] = { 100 };
+    const int hist_size[] = { 100, 100 };
     // base run
     EXPECT_NO_THROW(cv::calcHist(&img, 1, channels, noArray(), hist, 1, hist_size, ranges, true));
     // bad parameters

From b29512032ac6673548990e27d255b71c569be080 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Fri, 21 Feb 2020 17:48:28 +0300
Subject: [PATCH 05/18] 3rdparty(openexr): fix compilation with MSVS2019

---
 3rdparty/openexr/Imath/ImathQuat.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/3rdparty/openexr/Imath/ImathQuat.h b/3rdparty/openexr/Imath/ImathQuat.h
index e95e356d59..e01d10b7c3 100644
--- a/3rdparty/openexr/Imath/ImathQuat.h
+++ b/3rdparty/openexr/Imath/ImathQuat.h
@@ -60,6 +60,7 @@
 #include "ImathNamespace.h"
 
 #include <iostream>
+#include <algorithm>
 
 IMATH_INTERNAL_NAMESPACE_HEADER_ENTER
 

From 8f3867756c0f17d2ffeed2c29cc173fbb15b81d1 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Fri, 21 Feb 2020 18:18:24 +0300
Subject: [PATCH 06/18] Merge pull request #16594 from
 vpisarev:hull_ordering_fix

fixed the ordering of contour convex hull points

* partially fixed the issue #4539

* fixed warnings and test failures

* fixed integer overflow (issue #14521)

* added comment to force buildbot to re-run

* extended the test for the issue 4539. Check the expected behaviour on the original contour as well

* added comment; fixed typo, renamed another variable for a little better clarity

* added yet another part to the test for issue #4539, where we run convexHull and convexityDefects on the original contour, without any manipulations. The rest of the test stays the same
---
 .../imgproc/misc/java/test/ImgprocTest.java   |   2 +-
 modules/imgproc/src/convhull.cpp              |  77 +++++++--
 modules/imgproc/test/test_convhull.cpp        | 152 ++++++++++++++++++
 3 files changed, 219 insertions(+), 12 deletions(-)

diff --git a/modules/imgproc/misc/java/test/ImgprocTest.java b/modules/imgproc/misc/java/test/ImgprocTest.java
index 52da455993..cc664c8002 100644
--- a/modules/imgproc/misc/java/test/ImgprocTest.java
+++ b/modules/imgproc/misc/java/test/ImgprocTest.java
@@ -427,7 +427,7 @@ public class ImgprocTest extends OpenCVTestCase {
         Imgproc.convexHull(points, hull);
 
         MatOfInt expHull = new MatOfInt(
-                1, 2, 3, 0
+                0, 1, 2, 3
         );
         assertMatEqual(expHull, hull, EPS);
     }
diff --git a/modules/imgproc/src/convhull.cpp b/modules/imgproc/src/convhull.cpp
index e288f6a626..b964ca3f62 100644
--- a/modules/imgproc/src/convhull.cpp
+++ b/modules/imgproc/src/convhull.cpp
@@ -45,7 +45,7 @@
 namespace cv
 {
 
-template<typename _Tp>
+template<typename _Tp, typename _DotTp>
 static int Sklansky_( Point_<_Tp>** array, int start, int end, int* stack, int nsign, int sign2 )
 {
     int incr = end > start ? 1 : -1;
@@ -79,7 +79,7 @@ static int Sklansky_( Point_<_Tp>** array, int start, int end, int* stack, int n
             _Tp ax = array[pcur]->x - array[pprev]->x;
             _Tp bx = array[pnext]->x - array[pcur]->x;
             _Tp ay = cury - array[pprev]->y;
-            _Tp convexity = ay*bx - ax*by; // if >0 then convex angle
+            _DotTp convexity = (_DotTp)ay*bx - (_DotTp)ax*by; // if >0 then convex angle
 
             if( CV_SIGN( convexity ) == sign2 && (ax != 0 || ay != 0) )
             {
@@ -122,7 +122,13 @@ template<typename _Tp>
 struct CHullCmpPoints
 {
     bool operator()(const Point_<_Tp>* p1, const Point_<_Tp>* p2) const
-    { return p1->x < p2->x || (p1->x == p2->x && p1->y < p2->y); }
+    {
+        if( p1->x != p2->x )
+            return p1->x < p2->x;
+        if( p1->y != p2->y )
+            return p1->y < p2->y;
+        return p1 < p2;
+    }
 };
 
 
@@ -194,12 +200,12 @@ void convexHull( InputArray _points, OutputArray _hull, bool clockwise, bool ret
         // upper half
         int *tl_stack = stack;
         int tl_count = !is_float ?
-            Sklansky_( pointer, 0, maxy_ind, tl_stack, -1, 1) :
-            Sklansky_( pointerf, 0, maxy_ind, tl_stack, -1, 1);
+            Sklansky_<int, int64>( pointer, 0, maxy_ind, tl_stack, -1, 1) :
+            Sklansky_<float, double>( pointerf, 0, maxy_ind, tl_stack, -1, 1);
         int *tr_stack = stack + tl_count;
         int tr_count = !is_float ?
-            Sklansky_( pointer, total-1, maxy_ind, tr_stack, -1, -1) :
-            Sklansky_( pointerf, total-1, maxy_ind, tr_stack, -1, -1);
+            Sklansky_<int, int64>( pointer, total-1, maxy_ind, tr_stack, -1, -1) :
+            Sklansky_<float, double>( pointerf, total-1, maxy_ind, tr_stack, -1, -1);
 
         // gather upper part of convex hull to output
         if( !clockwise )
@@ -217,12 +223,12 @@ void convexHull( InputArray _points, OutputArray _hull, bool clockwise, bool ret
         // lower half
         int *bl_stack = stack;
         int bl_count = !is_float ?
-            Sklansky_( pointer, 0, miny_ind, bl_stack, 1, -1) :
-            Sklansky_( pointerf, 0, miny_ind, bl_stack, 1, -1);
+            Sklansky_<int, int64>( pointer, 0, miny_ind, bl_stack, 1, -1) :
+            Sklansky_<float, double>( pointerf, 0, miny_ind, bl_stack, 1, -1);
         int *br_stack = stack + bl_count;
         int br_count = !is_float ?
-            Sklansky_( pointer, total-1, miny_ind, br_stack, 1, 1) :
-            Sklansky_( pointerf, total-1, miny_ind, br_stack, 1, 1);
+            Sklansky_<int, int64>( pointer, total-1, miny_ind, br_stack, 1, 1) :
+            Sklansky_<float, double>( pointerf, total-1, miny_ind, br_stack, 1, 1);
 
         if( clockwise )
         {
@@ -250,6 +256,45 @@ void convexHull( InputArray _points, OutputArray _hull, bool clockwise, bool ret
             hullbuf[nout++] = int(pointer[bl_stack[i]] - data0);
         for( i = br_count-1; i > 0; i-- )
             hullbuf[nout++] = int(pointer[br_stack[i]] - data0);
+
+        // try to make the convex hull indices form
+        // an ascending or descending sequence by the cyclic
+        // shift of the output sequence.
+        if( nout >= 3 )
+        {
+            int min_idx = 0, max_idx = 0, lt = 0;
+            for( i = 1; i < nout; i++ )
+            {
+                int idx = hullbuf[i];
+                lt += hullbuf[i-1] < idx;
+                if( lt > 1 && lt <= i-2 )
+                    break;
+                if( idx < hullbuf[min_idx] )
+                    min_idx = i;
+                if( idx > hullbuf[max_idx] )
+                    max_idx = i;
+            }
+            int mmdist = std::abs(max_idx - min_idx);
+            if( (mmdist == 1 || mmdist == nout-1) && (lt <= 1 || lt >= nout-2) )
+            {
+                int ascending = (max_idx + 1) % nout == min_idx;
+                int i0 = ascending ? min_idx : max_idx, j = i0;
+                if( i0 > 0 )
+                {
+                    for( i = 0; i < nout; i++ )
+                    {
+                        int curr_idx = stack[i] = hullbuf[j];
+                        int next_j = j+1 < nout ? j+1 : 0;
+                        int next_idx = hullbuf[next_j];
+                        if( i < nout-1 && (ascending != (curr_idx < next_idx)) )
+                            break;
+                        j = next_j;
+                    }
+                    if( i == nout )
+                        memcpy(hullbuf, stack, nout*sizeof(hullbuf[0]));
+                }
+            }
+        }
     }
 
     if( !returnPoints )
@@ -299,12 +344,22 @@ void convexityDefects( InputArray _points, InputArray _hull, OutputArray _defect
     int hcurr = hptr[rev_orientation ? 0 : hpoints-1];
     CV_Assert( 0 <= hcurr && hcurr < npoints );
 
+    int increasing_idx = -1;
+
     for( i = 0; i < hpoints; i++ )
     {
         int hnext = hptr[rev_orientation ? hpoints - i - 1 : i];
         CV_Assert( 0 <= hnext && hnext < npoints );
 
         Point pt0 = ptr[hcurr], pt1 = ptr[hnext];
+        if( increasing_idx < 0 )
+            increasing_idx = !(hcurr < hnext);
+        else if( increasing_idx != (hcurr < hnext))
+        {
+            CV_Error(Error::StsBadArg,
+            "The convex hull indices are not monotonous, which can be in the case when the input contour contains self-intersections");
+        }
+
         double dx0 = pt1.x - pt0.x;
         double dy0 = pt1.y - pt0.y;
         double scale = dx0 == 0 && dy0 == 0 ? 0. : 1./std::sqrt(dx0*dx0 + dy0*dy0);
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index fc29b7fbb5..5e353286fe 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -2154,5 +2154,157 @@ TEST(Imgproc_FitLine, regression_4903)
     EXPECT_GE(fabs(lineParam[1]), fabs(lineParam[0]) * 4) << lineParam;
 }
 
+#if 0
+#define DRAW(x) x
+#else
+#define DRAW(x)
+#endif
+
+// the Python test by @hannarud is converted to C++; see the issue #4539
+TEST(Imgproc_ConvexityDefects, ordering_4539)
+{
+    int contour[][2] =
+    {
+        {26,  9}, {25, 10}, {24, 10}, {23, 10}, {22, 10}, {21, 10}, {20, 11}, {19, 11}, {18, 11}, {17, 12},
+        {17, 13}, {18, 14}, {18, 15}, {18, 16}, {18, 17}, {19, 18}, {19, 19}, {20, 20}, {21, 21}, {21, 22},
+        {22, 23}, {22, 24}, {23, 25}, {23, 26}, {24, 27}, {25, 28}, {26, 29}, {27, 30}, {27, 31}, {28, 32},
+        {29, 32}, {30, 33}, {31, 34}, {30, 35}, {29, 35}, {30, 35}, {31, 34}, {32, 34}, {33, 34}, {34, 33},
+        {35, 32}, {35, 31}, {35, 30}, {36, 29}, {37, 28}, {37, 27}, {38, 26}, {39, 25}, {40, 24}, {40, 23},
+        {41, 22}, {42, 21}, {42, 20}, {42, 19}, {43, 18}, {43, 17}, {44, 16}, {45, 15}, {45, 14}, {46, 13},
+        {46, 12}, {45, 11}, {44, 11}, {43, 11}, {42, 10}, {41, 10}, {40,  9}, {39,  9}, {38,  9}, {37,  9},
+        {36,  9}, {35,  9}, {34,  9}, {33,  9}, {32,  9}, {31,  9}, {30,  9}, {29,  9}, {28,  9}, {27,  9}
+    };
+    int npoints = (int)(sizeof(contour)/sizeof(contour[0][0])/2);
+    Mat contour_(1, npoints, CV_32SC2, contour);
+    vector<Point> hull;
+    vector<int> hull_ind;
+    vector<Vec4i> defects;
+
+    // first, check the original contour as-is, without intermediate fillPoly/drawContours.
+    convexHull(contour_, hull_ind, false, false);
+    EXPECT_THROW( convexityDefects(contour_, hull_ind, defects), cv::Exception );
+
+    int scale = 20;
+    contour_ *= (double)scale;
+
+    Mat canvas_gray(Size(60*scale, 45*scale), CV_8U, Scalar::all(0));
+    const Point* ptptr = contour_.ptr<Point>();
+    fillPoly(canvas_gray, &ptptr, &npoints, 1, Scalar(255, 255, 255));
+
+    vector<vector<Point> > contours;
+    findContours(canvas_gray, contours, noArray(), RETR_LIST, CHAIN_APPROX_SIMPLE);
+    convexHull(contours[0], hull_ind, false, false);
+
+    // the original contour contains self-intersections,
+    // therefore convexHull does not return a monotonous sequence of points
+    // and therefore convexityDefects throws an exception
+    EXPECT_THROW( convexityDefects(contours[0], hull_ind, defects), cv::Exception );
+
+#if 1
+    // one way to eliminate the contour self-intersection in this particular case is to apply dilate(),
+    // so that the self-repeating points are not self-repeating anymore
+    dilate(canvas_gray, canvas_gray, Mat());
+#else
+    // another popular technique to eliminate such thin "hair" is to use morphological "close" operation,
+    // which is erode() + dilate()
+    erode(canvas_gray, canvas_gray, Mat());
+    dilate(canvas_gray, canvas_gray, Mat());
+#endif
+
+    // after the "fix", the newly retrieved contour should not have self-intersections,
+    // and everything should work well
+    findContours(canvas_gray, contours, noArray(), RETR_LIST, CHAIN_APPROX_SIMPLE);
+    convexHull(contours[0], hull, false, true);
+    convexHull(contours[0], hull_ind, false, false);
+
+    DRAW(Mat canvas(Size(60*scale, 45*scale), CV_8UC3, Scalar::all(0));
+        drawContours(canvas, contours, -1, Scalar(255, 255, 255), -1));
+
+    size_t nhull = hull.size();
+    ASSERT_EQ( nhull, hull_ind.size() );
+
+    if( nhull > 2 )
+    {
+        bool initial_lt = hull_ind[0] < hull_ind[1];
+        for( size_t i = 0; i < nhull; i++ )
+        {
+            int ind = hull_ind[i];
+            Point pt = contours[0][ind];
+
+            ASSERT_EQ(pt, hull[i]);
+            if( i > 0 )
+            {
+                // check that the convex hull indices are monotone
+                if( initial_lt )
+                {
+                    ASSERT_LT(hull_ind[i-1], hull_ind[i]);
+                }
+                else
+                {
+                    ASSERT_GT(hull_ind[i-1], hull_ind[i]);
+                }
+            }
+            DRAW(circle(canvas, pt, 7, Scalar(180, 0, 180), -1, LINE_AA);
+                putText(canvas, format("%d (%d)", (int)i, ind), pt+Point(15, 0), FONT_HERSHEY_SIMPLEX, 0.4, Scalar(200, 0, 200), 1, LINE_AA));
+            //printf("%d. ind=%d, pt=(%d, %d)\n", (int)i, ind, pt.x, pt.y);
+        }
+    }
+
+    convexityDefects(contours[0], hull_ind, defects);
+
+    for(size_t i = 0; i < defects.size(); i++ )
+    {
+        Vec4i d = defects[i];
+        //printf("defect %d. start=%d, end=%d, farthest=%d, depth=%d\n", (int)i, d[0], d[1], d[2], d[3]);
+        EXPECT_LT(d[0], d[1]);
+        EXPECT_LE(d[0], d[2]);
+        EXPECT_LE(d[2], d[1]);
+
+        DRAW(Point start = contours[0][d[0]];
+             Point end = contours[0][d[1]];
+             Point far = contours[0][d[2]];
+             line(canvas, start, end, Scalar(255, 255, 128), 3, LINE_AA);
+             line(canvas, start, far, Scalar(255, 150, 255), 3, LINE_AA);
+             line(canvas, end, far, Scalar(255, 150, 255), 3, LINE_AA);
+             circle(canvas, start, 7, Scalar(0, 0, 255), -1, LINE_AA);
+             circle(canvas, end, 7, Scalar(0, 0, 255), -1, LINE_AA);
+             circle(canvas, far, 7, Scalar(255, 0, 0), -1, LINE_AA));
+    }
+
+    DRAW(imshow("defects", canvas);
+         waitKey());
+}
+
+#undef DRAW
+
+TEST(Imgproc_ConvexHull, overflow)
+{
+    std::vector<Point> points;
+    std::vector<Point2f> pointsf;
+
+    points.push_back(Point(14763, 2890));
+    points.push_back(Point(14388, 72088));
+    points.push_back(Point(62810, 72274));
+    points.push_back(Point(63166, 3945));
+    points.push_back(Point(56782, 3945));
+    points.push_back(Point(56763, 3077));
+    points.push_back(Point(34666, 2965));
+    points.push_back(Point(34547, 2953));
+    points.push_back(Point(34508, 2866));
+    points.push_back(Point(34429, 2965));
+
+    size_t i, n = points.size();
+    for( i = 0; i < n; i++ )
+        pointsf.push_back(Point2f(points[i]));
+
+    std::vector<int> hull;
+    std::vector<int> hullf;
+
+    convexHull(points, hull, false, false);
+    convexHull(pointsf, hullf, false, false);
+
+    ASSERT_EQ(hull, hullf);
+}
+
 }} // namespace
 /* End of file. */

From 01048e56030d768a471c33f7b307451d8941e232 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 21 Feb 2020 22:39:54 +0300
Subject: [PATCH 07/18] Merge pull request #16616 from
 alalek:dnn_fix_input_shape

* dnn: fix processing of input shapes

- importer: avoid using .setInput(); use .setInputShape() instead
- setInput: shape limitation check (partial)

* dnn(test): test .setInput() in readNet()
---
 modules/core/include/opencv2/core/check.hpp   |  1 +
 modules/core/src/check.cpp                    |  4 +
 modules/core/src/matrix_wrap.cpp              |  1 +
 modules/dnn/include/opencv2/dnn/dnn.hpp       |  4 +
 .../dnn/include/opencv2/dnn/shape_utils.hpp   | 10 +++
 modules/dnn/src/caffe/caffe_importer.cpp      |  5 +-
 modules/dnn/src/dnn.cpp                       | 86 ++++++++++++++++---
 modules/dnn/test/test_misc.cpp                | 59 +++++++++++++
 8 files changed, 152 insertions(+), 18 deletions(-)

diff --git a/modules/core/include/opencv2/core/check.hpp b/modules/core/include/opencv2/core/check.hpp
index 604447e8d7..0e0c7cbf31 100644
--- a/modules/core/include/opencv2/core/check.hpp
+++ b/modules/core/include/opencv2/core/check.hpp
@@ -79,6 +79,7 @@ CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext
 CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const std::string& v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
 CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);
diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp
index 59c2bbe5de..4988b87c34 100644
--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -171,6 +171,10 @@ void check_failed_auto(const Size_<int> v, const CheckContext& ctx)
 {
     check_failed_auto_< Size_<int> >(v, ctx);
 }
+void check_failed_auto(const std::string& v, const CheckContext& ctx)  // std::string overload
+{
+    check_failed_auto_< std::string >(v, ctx);  // forwards to the check_failed_auto_<T> template
+}
 
 
 }} // namespace
diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp
index e16e2f3f83..4c5efd6ba5 100644
--- a/modules/core/src/matrix_wrap.cpp
+++ b/modules/core/src/matrix_wrap.cpp
@@ -569,6 +569,7 @@ int _InputArray::sizend(int* arrsz, int i) const
     }
     else
     {
+        CV_CheckLE(dims(i), 2, "Not supported");  // TODO Support EXPR with 3+ dims
         Size sz2d = size(i);
         d = 2;
         if(arrsz)
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 1b763bac70..113893813c 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -484,6 +484,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
          */
         CV_WRAP void setInputsNames(const std::vector<String> &inputBlobNames);
 
+        /** @brief Specify shape of network input.
+         *  @p inputName must be a name given to setInputsNames(); an already set (non-empty) shape cannot be redefined. */
+        CV_WRAP void setInputShape(const String &inputName, const MatShape& shape);
+
         /** @brief Runs forward pass to compute output of layer with name @p outputName.
          *  @param outputName name for layer which output is needed to get
          *  @return blob for first output of specified layer.
diff --git a/modules/dnn/include/opencv2/dnn/shape_utils.hpp b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
index b0ed3afc54..c975fcff04 100644
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@@ -138,6 +138,16 @@ static inline MatShape shape(const UMat& mat)
     return shape(mat.size.p, mat.dims);
 }
 
+#if 0  // issues with MatExpr wrapped into InputArray
+static inline
+MatShape shape(InputArray input)
+{
+    int sz[CV_MAX_DIM];
+    int ndims = input.sizend(sz);
+    return shape(sz, ndims);
+}
+#endif
+
 namespace {inline bool is_neg(int i) { return i < 0; }}
 
 static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
diff --git a/modules/dnn/src/caffe/caffe_importer.cpp b/modules/dnn/src/caffe/caffe_importer.cpp
index 7d3d15b1b8..16860a9256 100644
--- a/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/modules/dnn/src/caffe/caffe_importer.cpp
@@ -484,10 +484,7 @@ public:
         {
             CV_CheckEQ(inp_shapes.size(), netInputs.size(), "");
             for (int inp_id = 0; inp_id < inp_shapes.size(); inp_id++)
-            {
-                if (!inp_shapes[inp_id].empty())
-                    dstNet.setInput(Mat(inp_shapes[inp_id], CV_32F), netInputs[inp_id]);
-            }
+                dstNet.setInputShape(netInputs[inp_id], inp_shapes[inp_id]);
         }
 
         addedBlobs.clear();
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 9e2d255bb0..b0c52b101a 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -722,6 +722,18 @@ struct DataLayer : public Layer
     void setNames(const std::vector<String> &names)
     {
         outNames.assign(names.begin(), names.end());
+        shapes.clear(); shapes.resize(outNames.size());
+    }
+
+    void setInputShape(const String& tgtName, const MatShape& shape)  // stores `shape` for the input named tgtName
+    {
+        std::vector<String>::const_iterator it = std::find(outNames.begin(), outNames.end(), tgtName);
+        CV_Check(tgtName, it != outNames.end(), "Unknown input");  // tgtName must be one of outNames
+        int idx = (int)(it - outNames.begin());  // position of tgtName within outNames
+
+        CV_Assert(idx < (int)shapes.size());
+        CV_Check(tgtName, shapes[idx].empty(), "Input shape redefinition is not allowed");  // only an empty slot may be filled
+        shapes[idx] = shape;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -784,6 +796,7 @@ struct DataLayer : public Layer
 #endif  // HAVE_INF_ENGINE
 
     std::vector<String> outNames;
+    std::vector<MatShape> shapes;
     // Preprocessing parameters for each network's input.
     std::vector<double> scaleFactors;
     std::vector<Scalar> means;
@@ -2842,8 +2855,25 @@ struct Net::Impl
             }
             else
             {
-                inOutShapes[0].out.clear();
-                return;
+                const std::vector<MatShape>& inputShapes = netInputLayer->shapes;
+                bool none = true;
+                for (size_t i = 0; i < inputShapes.size(); i++)
+                {
+                    if (!inputShapes[i].empty())
+                    {
+                        none = false;
+                        break;
+                    }
+                }
+                if (none)
+                {
+                    inOutShapes[0].out.clear();
+                    return;
+                }
+                else
+                {
+                    inOutShapes[0].in = inputShapes;
+                }
             }
         }
 
@@ -3069,7 +3099,7 @@ Net Net::Impl::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork& ieNe
     // set empty input to determine input shapes
     for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
     {
-        cvNet.setInput(Mat(inp_shapes[inp_id], CV_32F), inputsNames[inp_id]);
+        cvNet.setInputShape(inputsNames[inp_id], inp_shapes[inp_id]);
     }
 
     Ptr<BackendNode> backendNode;
@@ -3494,6 +3524,13 @@ void Net::setInputsNames(const std::vector<String> &inputBlobNames)
     impl->netInputLayer->setNames(inputBlobNames);
 }
 
+void Net::setInputShape(const String &inputName, const MatShape& shape)
+{
+    CV_TRACE_FUNCTION();
+    // Delegate to the network input layer, which checks the name and stores the shape.
+    impl->netInputLayer->setInputShape(inputName, shape);
+}
+
 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
 {
     CV_TRACE_FUNCTION();
@@ -3506,6 +3543,33 @@ void Net::setInput(InputArray blob, const String& name, double scalefactor, cons
     if (!pin.valid())
         CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
 
+    Mat blob_ = blob.getMat();  // can't use InputArray directly due to MatExpr issues
+    MatShape blobShape = shape(blob_);
+
+    if (pin.lid == 0)
+    {
+        CV_Assert(!impl->netInputLayer.empty());
+        const DataLayer& netInputLayer = *impl->netInputLayer.get();
+        if (!netInputLayer.shapes.empty())
+        {
+            CV_CheckLT(pin.oid, (int)netInputLayer.shapes.size(), "");
+            const MatShape& inputShapeLimitation = netInputLayer.shapes[pin.oid];
+            if (!inputShapeLimitation.empty())
+            {
+                CV_CheckEQ(inputShapeLimitation.size(), blobShape.size(), "");
+#if 0  // TODO: DNNTestNetwork.MobileNet_SSD_Caffe_Different_Width_Height/0
+                const size_t dims = inputShapeLimitation.size();
+                for (size_t dim = 0; dim < dims; dim++)
+                {
+                    if (dims >= 3 && dim == 0 && inputShapeLimitation[0] == 1)
+                        continue;  // don't limit batch
+                    CV_CheckEQ(inputShapeLimitation[dim], blobShape[dim], "");
+                }
+#endif
+            }
+        }
+    }
+
     LayerData &ld = impl->layers[pin.lid];
     const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
     ld.outputBlobs.resize(numInputs);
@@ -3515,17 +3579,11 @@ void Net::setInput(InputArray blob, const String& name, double scalefactor, cons
     impl->netInputLayer->means.resize(numInputs);
 
     MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
-    Mat blob_ = blob.getMat();
-    bool oldShape = prevShape == shape(blob_);
-    if (oldShape)
-    {
-        blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
-    }
-    else
-    {
-        ld.outputBlobs[pin.oid] = blob_.clone();
-        impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
-    }
+    bool oldShape = prevShape == blobShape;
+
+    blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
+    if (!oldShape)
+        ld.outputBlobs[pin.oid] = impl->netInputLayer->inputsData[pin.oid];
 
     if (!ld.outputBlobsWrappers[pin.oid].empty())
     {
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index ca60b9111a..eccf171539 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -78,6 +78,65 @@ TEST(readNet, Regression)
     EXPECT_FALSE(net.empty());
 }
 
+TEST(readNet, do_not_call_setInput)  // https://github.com/opencv/opencv/issues/16618
+{
+    // 1. load network
+    const string proto = findDataFile("dnn/squeezenet_v1.1.prototxt");
+    const string model = findDataFile("dnn/squeezenet_v1.1.caffemodel", false);
+    Net net = readNetFromCaffe(proto, model);
+
+    // 2. mistake: no inputs are specified through .setInput()
+
+    // 3. try inference: forward() is expected to throw cv::Exception
+    Mat res;
+    EXPECT_THROW(
+    {
+        res = net.forward();  // no inputs after loading => should fail
+    }, cv::Exception);
+    EXPECT_TRUE(res.empty()) << res.size;  // the failed call must not have produced any output
+}
+
+#ifdef HAVE_INF_ENGINE
+static
+void test_readNet_IE_do_not_call_setInput(Backend backendId)
+{
+    const Target targetId = DNN_TARGET_CPU;
+    // 1. load network (Inference Engine model files; backend type selected from backendId)
+    const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");
+
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
+    else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
+    else
+        FAIL() << "Unknown backendId";
+
+    Net net = readNet(model, proto);
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+
+    // 2. mistake: no inputs are specified through .setInput()
+
+    // 3. try inference: forward() is expected to throw cv::Exception
+    Mat res;
+    EXPECT_THROW(
+    {
+        res = net.forward();  // no inputs after loading => should fail
+    }, cv::Exception);
+    EXPECT_TRUE(res.empty()) << res.size;  // the failed call must not have produced any output
+}
+// Run the same check once for each of the two backend ids handled above.
+TEST(readNet, do_not_call_setInput_IE_NN_BUILDER_2019)
+{
+    test_readNet_IE_do_not_call_setInput(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019);
+}
+TEST(readNet, do_not_call_setInput_IE_NGRAPH)
+{
+    test_readNet_IE_do_not_call_setInput(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
+}
+#endif  // HAVE_INF_ENGINE
+
 typedef testing::TestWithParam<tuple<Backend, Target> > dump;
 TEST_P(dump, Regression)
 {

From 8b5efc6f4ccaf3f8e12abd1dd32cc49825925ad3 Mon Sep 17 00:00:00 2001
From: Gourav Roy <34737471+themechanicalcoder@users.noreply.github.com>
Date: Sat, 22 Feb 2020 17:12:26 +0530
Subject: [PATCH 08/18] Merge pull request #16586 from
 themechanicalcoder:video-psnr

* add  python version of video-input-psnr-ssim

* remove ret

* documentation changes

* added link for python file

* command line argument
---
 .../video_input_psnr_ssim.markdown            |  89 +++--------
 .../video-input-psnr-ssim.cpp                 |   5 +
 .../videoio/video-input-psnr-ssim.py          | 148 ++++++++++++++++++
 3 files changed, 171 insertions(+), 71 deletions(-)
 create mode 100644 samples/python/tutorial_code/videoio/video-input-psnr-ssim.py

diff --git a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown b/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
index 80205213a2..96c6637c5f 100644
--- a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
+++ b/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
@@ -25,7 +25,13 @@ version of it ](https://github.com/opencv/opencv/tree/3.4/samples/data/Megamind_
 You may also find the source code and these video file in the
 `samples/data` folder of the OpenCV source library.
 
+@add_toggle_cpp
 @include cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
+@end_toggle
+
+@add_toggle_python
+@include samples/python/tutorial_code/videoio/video-input-psnr-ssim.py
+@end_toggle
 
 How to read a video stream (online-camera or offline-file)?
 -----------------------------------------------------------
@@ -139,28 +145,15 @@ an invalid divide by zero operation in the PSNR formula. In this case the PSNR i
 we'll need to handle this case separately. The transition to a logarithmic scale is made because the
 pixel values have a very wide dynamic range. All this translated to OpenCV and a C++ function looks
 like:
-@code{.cpp}
-double getPSNR(const Mat& I1, const Mat& I2)
-{
- Mat s1;
- absdiff(I1, I2, s1);       // |I1 - I2|
- s1.convertTo(s1, CV_32F);  // cannot make a square on 8 bits
- s1 = s1.mul(s1);           // |I1 - I2|^2
 
- Scalar s = sum(s1);        // sum elements per channel
+@add_toggle_cpp
+@snippet cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp get-psnr
+@end_toggle
 
- double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels
+@add_toggle_python
+@snippet samples/python/tutorial_code/videoio/video-input-psnr-ssim.py get-psnr
+@end_toggle
 
- if( sse <= 1e-10) // for small values return zero
-     return 0;
- else
- {
-     double  mse =sse /(double)(I1.channels() * I1.total());
-     double psnr = 10.0*log10((255*255)/mse);
-     return psnr;
- }
-}
-@endcode
 Typically result values are anywhere between 30 and 50 for video compression, where higher is
 better. If the images significantly differ you'll get much lower ones like 15 and so. This
 similarity check is easy and fast to calculate, however in practice it may turn out somewhat
@@ -176,60 +169,14 @@ implementation below.
     Simoncelli, "Image quality assessment: From error visibility to structural similarity," IEEE
     Transactions on Image Processing, vol. 13, no. 4, pp. 600-612, Apr. 2004." article.
 
-@code{.cpp}
-Scalar getMSSIM( const Mat& i1, const Mat& i2)
-{
- const double C1 = 6.5025, C2 = 58.5225;
- /***************************** INITS **********************************/
- int d     = CV_32F;
+@add_toggle_cpp
+@snippet cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp get-mssim
+@end_toggle
 
- Mat I1, I2;
- i1.convertTo(I1, d);           // cannot calculate on one byte large values
- i2.convertTo(I2, d);
+@add_toggle_python
+@snippet samples/python/tutorial_code/videoio/video-input-psnr-ssim.py get-mssim
+@end_toggle
 
- Mat I2_2   = I2.mul(I2);        // I2^2
- Mat I1_2   = I1.mul(I1);        // I1^2
- Mat I1_I2  = I1.mul(I2);        // I1 * I2
-
- /***********************PRELIMINARY COMPUTING ******************************/
-
- Mat mu1, mu2;   //
- GaussianBlur(I1, mu1, Size(11, 11), 1.5);
- GaussianBlur(I2, mu2, Size(11, 11), 1.5);
-
- Mat mu1_2   =   mu1.mul(mu1);
- Mat mu2_2   =   mu2.mul(mu2);
- Mat mu1_mu2 =   mu1.mul(mu2);
-
- Mat sigma1_2, sigma2_2, sigma12;
-
- GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
- sigma1_2 -= mu1_2;
-
- GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
- sigma2_2 -= mu2_2;
-
- GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
- sigma12 -= mu1_mu2;
-
- ///////////////////////////////// FORMULA ////////////////////////////////
- Mat t1, t2, t3;
-
- t1 = 2 * mu1_mu2 + C1;
- t2 = 2 * sigma12 + C2;
- t3 = t1.mul(t2);              // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
-
- t1 = mu1_2 + mu2_2 + C1;
- t2 = sigma1_2 + sigma2_2 + C2;
- t1 = t1.mul(t2);               // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
-
- Mat ssim_map;
- divide(t3, t1, ssim_map);      // ssim_map =  t3./t1;
-
- Scalar mssim = mean( ssim_map ); // mssim = average of ssim map
- return mssim;
-}
-@endcode
 This will return a similarity index for each channel of the image. This value is between zero and
 one, where one corresponds to perfect fit. Unfortunately, the many Gaussian blurring is quite
 costly, so while the PSNR may work in a real time like environment (24 frame per second) this will
diff --git a/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp b/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
index be0b1a8a21..8d567b2f5e 100644
--- a/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
+++ b/samples/cpp/tutorial_code/videoio/video-input-psnr-ssim/video-input-psnr-ssim.cpp
@@ -132,6 +132,7 @@ int main(int argc, char *argv[])
     return 0;
 }
 
+// ![get-psnr]
 double getPSNR(const Mat& I1, const Mat& I2)
 {
     Mat s1;
@@ -152,6 +153,9 @@ double getPSNR(const Mat& I1, const Mat& I2)
         return psnr;
     }
 }
+// ![get-psnr]
+
+// ![get-mssim]
 
 Scalar getMSSIM( const Mat& i1, const Mat& i2)
 {
@@ -205,3 +209,4 @@ Scalar getMSSIM( const Mat& i1, const Mat& i2)
     Scalar mssim = mean(ssim_map);   // mssim = average of ssim map
     return mssim;
 }
+// ![get-mssim]
diff --git a/samples/python/tutorial_code/videoio/video-input-psnr-ssim.py b/samples/python/tutorial_code/videoio/video-input-psnr-ssim.py
new file mode 100644
index 0000000000..84610d4768
--- /dev/null
+++ b/samples/python/tutorial_code/videoio/video-input-psnr-ssim.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Python 2/3 compatibility
+from __future__ import print_function
+
+import numpy as np
+import cv2 as cv
+import argparse
+import sys
+
+# [get-psnr]
+def getPSNR(I1, I2):
+    """Return the PSNR (in dB) between two equally sized frames, or 0 when they are (nearly) identical."""
+    s1 = cv.absdiff(I1, I2) # |I1 - I2|
+    s1 = np.float32(s1)     # cannot make a square on 8 bits
+    s1 = s1 * s1            # |I1 - I2|^2
+    sse = s1.sum()          # sum over all elements and channels
+    if sse <= 1e-10:        # for small values return zero
+        return 0
+    # I1.size == rows * cols * channels, and it also works for single-channel (2-D) frames
+    mse = 1.0 * sse / I1.size
+    psnr = 10.0 * np.log10((255 * 255) / mse)
+    return psnr
+# [get-psnr]
+
+# [get-mssim]
+def getMSSISM(i1, i2):
+    """Return cv.mean() of the SSIM map computed between frames i1 and i2."""
+    C1 = 6.5025   # = (0.01 * 255) ** 2
+    C2 = 58.5225  # = (0.03 * 255) ** 2
+    # INITS
+    I1 = np.float32(i1) # convert to float32; one-byte values cannot hold the products computed below
+    I2 = np.float32(i2)
+
+    I2_2 = I2 * I2 # I2^2
+    I1_2 = I1 * I1 # I1^2
+    I1_I2 = I1 * I2 # I1 * I2
+    # END INITS
+
+    # PRELIMINARY COMPUTING: Gaussian-blurred inputs and blurred products minus products of the blurred inputs
+    mu1 = cv.GaussianBlur(I1, (11, 11), 1.5)
+    mu2 = cv.GaussianBlur(I2, (11, 11), 1.5)
+
+    mu1_2 = mu1 * mu1
+    mu2_2 = mu2 * mu2
+    mu1_mu2 = mu1 * mu2
+
+    sigma1_2 = cv.GaussianBlur(I1_2, (11, 11), 1.5)
+    sigma1_2 -= mu1_2
+
+    sigma2_2 = cv.GaussianBlur(I2_2, (11, 11), 1.5)
+    sigma2_2 -= mu2_2
+
+    sigma12 = cv.GaussianBlur(I1_I2, (11, 11), 1.5)
+    sigma12 -= mu1_mu2
+
+    t1 = 2 * mu1_mu2 + C1
+    t2 = 2 * sigma12 + C2
+    t3 = t1 * t2                    # t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
+
+    t1 = mu1_2 + mu2_2 + C1
+    t2 = sigma1_2 + sigma2_2 + C2
+    t1 = t1 * t2                    # t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
+
+    ssim_map = cv.divide(t3, t1)    # ssim_map =  t3./t1;
+
+    mssim = cv.mean(ssim_map)       # mssim = average of ssim map
+    return mssim
+# [get-mssim]
+
+
+def main():
+    """Compare two videos frame by frame: print the PSNR of each frame pair and, below the trigger value, the MSSIM."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--delay", type=int, default=30, help=" Time delay")
+    parser.add_argument("-v", "--psnrtriggervalue", type=int, default=30, help="PSNR Trigger Value")
+    parser.add_argument("-r", "--ref", type=str, default="Megamind.avi", help="Path to reference video")
+    parser.add_argument("-t", "--undertest", type=str, default="Megamind_bugy.avi",
+                        help="Path to the video to be tested")
+    args = parser.parse_args()
+
+    sourceReference = args.ref
+    sourceCompareWith = args.undertest
+    delay = args.delay
+    psnrTriggerValue = args.psnrtriggervalue
+    framenum = -1 # Frame counter
+
+    captRefrnc = cv.VideoCapture(sourceReference)
+    captUndTst = cv.VideoCapture(sourceCompareWith)
+
+    if not captRefrnc.isOpened():
+        print("Could not open the reference " + sourceReference)
+        sys.exit(-1)
+    if not captUndTst.isOpened():
+        print("Could not open case test " + sourceCompareWith)
+        sys.exit(-1)
+
+    refS = (int(captRefrnc.get(cv.CAP_PROP_FRAME_WIDTH)), int(captRefrnc.get(cv.CAP_PROP_FRAME_HEIGHT)))
+    uTSi = (int(captUndTst.get(cv.CAP_PROP_FRAME_WIDTH)), int(captUndTst.get(cv.CAP_PROP_FRAME_HEIGHT)))
+
+    if refS != uTSi:
+        print("Inputs have different size!!! Closing.")
+        sys.exit(-1)
+
+    WIN_UT = "Under Test"
+    WIN_RF = "Reference"
+
+    cv.namedWindow(WIN_RF, cv.WINDOW_AUTOSIZE)
+    cv.namedWindow(WIN_UT, cv.WINDOW_AUTOSIZE)
+    cv.moveWindow(WIN_RF, 400, 0) #750,  2 (bernat =0)
+    cv.moveWindow(WIN_UT, refS[0], 0) #1500, 2
+
+    print("Reference frame resolution: Width={} Height={} of nr#: {}".format(refS[0], refS[1],
+                                                                             captRefrnc.get(cv.CAP_PROP_FRAME_COUNT)))
+    print("PSNR trigger value {}".format(psnrTriggerValue))
+
+    while True: # Show the image captured in the window and repeat
+        _, frameReference = captRefrnc.read()
+        _, frameUnderTest = captUndTst.read()
+
+        if frameReference is None or frameUnderTest is None:  # stop as soon as either capture yields no frame
+            print(" < < <  Game over!  > > > ")
+            break
+
+        framenum += 1
+        psnrv = getPSNR(frameReference, frameUnderTest)
+        print("Frame: {}# {}dB".format(framenum, round(psnrv, 3)), end=" ")
+
+        if (psnrv < psnrTriggerValue and psnrv):  # skip MSSIM when getPSNR returned 0 (near-identical frames)
+            mssimv = getMSSISM(frameReference, frameUnderTest)
+            print("MSSISM: R {}% G {}% B {}%".format(round(mssimv[2] * 100, 2), round(mssimv[1] * 100, 2),
+                                                     round(mssimv[0] * 100, 2)), end=" ")
+
+        print()
+
+        cv.imshow(WIN_RF, frameReference)
+        cv.imshow(WIN_UT, frameUnderTest)
+
+        k = cv.waitKey(delay)
+        if k == 27:  # 27 is the ASCII code of the Esc key
+            break
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()

From 1540ae340f59417d3555b38593398aca444c3441 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Thu, 20 Feb 2020 20:23:19 +0000
Subject: [PATCH 09/18] dnn(test): configure filtering for 32-bit systems

---
 modules/dnn/test/test_caffe_importer.cpp   | 12 ++++++++++++
 modules/dnn/test/test_darknet_importer.cpp |  9 ++++++++-
 modules/dnn/test/test_onnx_importer.cpp    | 15 +++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index e07bdefcf2..4d4f2d0d10 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -159,7 +159,11 @@ typedef testing::TestWithParam<tuple<bool, Target> > Reproducibility_AlexNet;
 TEST_P(Reproducibility_AlexNet, Accuracy)
 {
     Target targetId = get<1>(GetParam());
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
     ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU);
 
     bool readFromMemory = get<0>(GetParam());
@@ -637,7 +641,11 @@ INSTANTIATE_TEST_CASE_P(Test_Caffe, opencv_face_detector,
 TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 {
     applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,  // utilizes ~1Gb, but huge blobs may not be allocated on 32-bit systems due to memory fragmentation
+#else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+#endif
         CV_TEST_TAG_LONG,
         CV_TEST_TAG_DEBUG_VERYLONG
     );
@@ -662,7 +670,11 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
 TEST_P(Test_Caffe_nets, FasterRCNN_zf)
 {
     applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
+#endif
         CV_TEST_TAG_DEBUG_LONG
     );
     if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index 3939344932..068f85eb48 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -300,7 +300,14 @@ public:
 
 TEST_P(Test_Darknet_nets, YoloVoc)
 {
-    applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_MEMORY_1GB);
+    applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
+        CV_TEST_TAG_MEMORY_1GB,
+#endif
+        CV_TEST_TAG_LONG
+    );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16)
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 35f00ef503..84933193e7 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -436,7 +436,12 @@ public:
 
 TEST_P(Test_ONNX_nets, Alexnet)
 {
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
     const String model =  _tf("models/alexnet.onnx", false);
 
     Net net = readNetFromONNX(model);
@@ -495,7 +500,12 @@ TEST_P(Test_ONNX_nets, Googlenet)
 
 TEST_P(Test_ONNX_nets, CaffeNet)
 {
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
         && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
@@ -506,7 +516,12 @@ TEST_P(Test_ONNX_nets, CaffeNet)
 
 TEST_P(Test_ONNX_nets, RCNN_ILSVRC13)
 {
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
         && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)

From 2b96a485e7602cc69d034302f52d44163b1444f0 Mon Sep 17 00:00:00 2001
From: "ashishiva3@gmail.com" <ashishiva3@gmail.com>
Date: Sun, 23 Feb 2020 11:46:12 +0530
Subject: [PATCH 10/18] Darknet_io: Parsing for cost layer added

---
 modules/dnn/src/darknet/darknet_io.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp
index 5e1b125a0c..b93d740109 100644
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -556,7 +556,7 @@ namespace cv {
                             const size_t layer_type_size = line.find("]") - 1;
                             CV_Assert(layer_type_size < line.size());
                             std::string layer_type = line.substr(1, layer_type_size);
-                            net->layers_cfg[layers_counter]["type"] = layer_type;
+                            net->layers_cfg[layers_counter]["layer_type"] = layer_type;
                         }
                         break;
                     default:
@@ -599,7 +599,7 @@ namespace cv {
                 for (it_type i = net->layers_cfg.begin(); i != net->layers_cfg.end(); ++i) {
                     ++layers_counter;
                     std::map<std::string, std::string> &layer_params = i->second;
-                    std::string layer_type = layer_params["type"];
+                    std::string layer_type = layer_params["layer_type"];
 
                     if (layer_type == "convolutional")
                     {
@@ -682,7 +682,7 @@ namespace cv {
                         else
                             setParams.setConcat(layers_vec.size(), layers_vec.data());
                     }
-                    else if (layer_type == "dropout")
+                    else if (layer_type == "dropout" || layer_type == "cost")
                     {
                         setParams.setIdentity(layers_counter-1);
                     }
@@ -806,7 +806,7 @@ namespace cv {
                     ++darknet_layers_counter;
                     ++cv_layers_counter;
                     std::map<std::string, std::string> &layer_params = i->second;
-                    std::string layer_type = layer_params["type"];
+                    std::string layer_type = layer_params["layer_type"];
 
                     if (layer_type == "convolutional" || layer_type == "connected")
                     {

From d54d01ca4681dc18999b721afadba109e04a64e8 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 23 Feb 2020 17:05:05 +0000
Subject: [PATCH 11/18] core(MatExpr): fix .type() bug

---
 modules/core/src/matrix_expressions.cpp |  2 +-
 modules/core/test/test_mat.cpp          | 11 +++++++++++
 modules/dnn/test/test_misc.cpp          |  5 ++++-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/modules/core/src/matrix_expressions.cpp b/modules/core/src/matrix_expressions.cpp
index ea431336f0..5ac1fafbd6 100644
--- a/modules/core/src/matrix_expressions.cpp
+++ b/modules/core/src/matrix_expressions.cpp
@@ -1257,7 +1257,7 @@ int MatExpr::type() const
     if( isInitializer(*this) )
         return a.type();
     if( isCmp(*this) )
-        return CV_8U;
+        return CV_MAKETYPE(CV_8U, a.channels());
     return op ? op->type(*this) : -1;
 }
 
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 7aa79c4d8c..3fa8442d69 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -2017,6 +2017,17 @@ TEST(Core_MatExpr, issue_13926)
     EXPECT_GE(1e-6, cvtest::norm(M2*M1, M2*M2, NORM_INF)) << Mat(M2*M1) << std::endl << Mat(M2*M2);
 }
 
+TEST(Core_MatExpr, issue_16655)
+{
+    Mat a(Size(5, 5), CV_32FC3, Scalar::all(1));
+    Mat b(Size(5, 5), CV_32FC3, Scalar::all(2));
+    MatExpr ab_expr = a != b;  // comparison of two 3-channel matrices, kept as a MatExpr
+    Mat ab_mat = ab_expr;      // the same expression converted to a Mat
+    EXPECT_EQ(CV_8UC3, ab_expr.type())  // the expression must report 3 channels, not plain CV_8U
+        << "MatExpr: CV_8UC3 != " << typeToString(ab_expr.type());
+    EXPECT_EQ(CV_8UC3, ab_mat.type())   // and so must the Mat created from it
+        << "Mat: CV_8UC3 != " << typeToString(ab_mat.type());
+}
 
 #ifdef HAVE_EIGEN
 TEST(Core_Eigen, eigen2cv_check_Mat_type)
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index ca60b9111a..eaab49ea79 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -56,7 +56,10 @@ TEST(imagesFromBlob, Regression)
 
     for (int i = 0; i < nbOfImages; i++)
     {
-        ASSERT_EQ(cv::countNonZero(inputImgs[i] != outputImgs[i]), 0);
+        EXPECT_EQ(0, cvtest::norm(inputImgs[i], outputImgs[i], NORM_INF))
+            << "i=" << i
+            << " inputImgs[i]=" << inputImgs[i].size
+            << " outputImgs[i]=" << outputImgs[i].size;
     }
 }
 

From c2f5f5a202bf91d4f2c58ce96e160cd4d54e3e88 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 24 Feb 2020 18:18:33 +0000
Subject: [PATCH 12/18] dnn(test): configure filtering for 32-bit systems (part
 2)

---
 modules/dnn/test/test_onnx_importer.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 84933193e7..7de5ee22cc 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -624,7 +624,11 @@ TEST_P(Test_ONNX_nets, MobileNet_v2)
 TEST_P(Test_ONNX_nets, LResNet100E_IR)
 {
     applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
+#endif
         CV_TEST_TAG_DEBUG_LONG
     );
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)

From f9bd0257227eeb288f812419e6a586290e91b04a Mon Sep 17 00:00:00 2001
From: Ivan Galanin <61196467+iigalanin@users.noreply.github.com>
Date: Tue, 25 Feb 2020 22:04:11 +0300
Subject: [PATCH 13/18] Merge pull request #16639 from iigalanin:patch-1

* Update to new ICV packages

MacOS ia32 ICV package is no longer built.

* cmake(ippicv): add Apple 32-bit check

* Updated commit hash
---
 3rdparty/ippicv/ippicv.cmake | 27 +++++++++++----------------
 cmake/OpenCVFindIPP.cmake    |  4 ++++
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/3rdparty/ippicv/ippicv.cmake b/3rdparty/ippicv/ippicv.cmake
index ae8748c283..257af6fcc6 100644
--- a/3rdparty/ippicv/ippicv.cmake
+++ b/3rdparty/ippicv/ippicv.cmake
@@ -2,37 +2,32 @@ function(download_ippicv root_var)
   set(${root_var} "" PARENT_SCOPE)
 
   # Commit SHA in the opencv_3rdparty repo
-  set(IPPICV_COMMIT "32e315a5b106a7b89dbed51c28f8120a48b368b4")
+  set(IPPICV_COMMIT "a56b6ac6f030c312b2dce17430eef13aed9af274")
   # Define actual ICV versions
   if(APPLE)
     set(OPENCV_ICV_PLATFORM "macosx")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac")
-    if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2019_mac_intel64_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "fe6b2bb75ae0e3f19ad3ae1a31dfa4a2")
-    else()
-      set(OPENCV_ICV_NAME "ippicv_2019_mac_ia32_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "b5dfa78c87eb75c64470cbe5ec876f4f")
-    endif()
+    set(OPENCV_ICV_NAME "ippicv_2020_mac_intel64_20191018_general.tgz")
+    set(OPENCV_ICV_HASH "1c3d675c2a2395d094d523024896e01b")
   elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86"))
     set(OPENCV_ICV_PLATFORM "linux")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2019_lnx_intel64_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "c0bd78adb4156bbf552c1dfe90599607")
+      set(OPENCV_ICV_NAME "ippicv_2020_lnx_intel64_20191018_general.tgz")
+      set(OPENCV_ICV_HASH "7421de0095c7a39162ae13a6098782f9")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2019_lnx_ia32_general_20180723.tgz")
-      set(OPENCV_ICV_HASH "4f38432c30bfd6423164b7a24bbc98a0")
+      set(OPENCV_ICV_NAME "ippicv_2020_lnx_ia32_20191018_general.tgz")
+      set(OPENCV_ICV_HASH "ad189a940fb60eb71f291321322fe3e8")
     endif()
   elseif(WIN32 AND NOT ARM)
     set(OPENCV_ICV_PLATFORM "windows")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2019_win_intel64_20180723_general.zip")
-      set(OPENCV_ICV_HASH "1d222685246896fe089f88b8858e4b2f")
+      set(OPENCV_ICV_NAME "ippicv_2020_win_intel64_20191018_general.zip")
+      set(OPENCV_ICV_HASH "879741a7946b814455eee6c6ffde2984")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2019_win_ia32_20180723_general.zip")
-      set(OPENCV_ICV_HASH "0157251a2eb9cd63a3ebc7eed0f3e59e")
+      set(OPENCV_ICV_NAME "ippicv_2020_win_ia32_20191018_general.zip")
+      set(OPENCV_ICV_HASH "cd39bdf0c2e1cac9a61101dad7a2413e")
     endif()
   else()
     return()
diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake
index f938e21a57..79555f60d9 100644
--- a/cmake/OpenCVFindIPP.cmake
+++ b/cmake/OpenCVFindIPP.cmake
@@ -236,6 +236,10 @@ if(DEFINED ENV{OPENCV_IPP_PATH} AND NOT DEFINED IPPROOT)
 endif()
 
 if(NOT DEFINED IPPROOT)
+  if(APPLE AND NOT IPP_X64)
+    message(STATUS "IPPICV: 32-bit binaries are not supported on Apple platform (MacOSX)")
+    return()
+  endif()
   include("${OpenCV_SOURCE_DIR}/3rdparty/ippicv/ippicv.cmake")
   download_ippicv(ICV_PACKAGE_ROOT)
   if(NOT ICV_PACKAGE_ROOT)

From 64588dff463d8522d9f44ee3a0dd43051976ca30 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 21 Feb 2020 20:52:01 +0000
Subject: [PATCH 14/18] valgrind: update suppression rules

---
 platforms/scripts/valgrind.supp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/platforms/scripts/valgrind.supp b/platforms/scripts/valgrind.supp
index aa9d24d45c..600c61fad0 100644
--- a/platforms/scripts/valgrind.supp
+++ b/platforms/scripts/valgrind.supp
@@ -43,7 +43,7 @@
 {
    OpenCV-getCoreTlsData
    Memcheck:Leak
-   fun:_Znwm
+   ...
    fun:_ZN2cv14getCoreTlsDataEv
 }
 

From bf96d8239dfb4b0a40260447ade886762094428f Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@gmail.com>
Date: Thu, 13 Feb 2020 14:25:47 +0300
Subject: [PATCH 15/18] Use BufferArea in more places

---
 modules/calib3d/src/rho.cpp                   | 123 +--
 modules/calib3d/src/stereosgbm.cpp            | 848 +++++++++---------
 .../core/utils/buffer_area.private.hpp        |  27 +
 modules/core/src/buffer_area.cpp              |  49 +-
 modules/core/test/test_utils.cpp              |  15 +
 modules/features2d/src/fast.cpp               |  27 +-
 6 files changed, 570 insertions(+), 519 deletions(-)

diff --git a/modules/calib3d/src/rho.cpp b/modules/calib3d/src/rho.cpp
index 3cfa6b19e8..341b6b9063 100644
--- a/modules/calib3d/src/rho.cpp
+++ b/modules/calib3d/src/rho.cpp
@@ -55,7 +55,7 @@
 #include <math.h>
 #include <vector>
 #include "rho.h"
-
+#include "opencv2/core/utils/buffer_area.private.hpp"
 
 
 
@@ -65,7 +65,6 @@ namespace cv{/* For C support, replace with extern "C" { */
 
 
 /* Constants */
-const int    MEM_ALIGN              = 32;
 const size_t HSIZE                  = (3*3*sizeof(float));
 const double MIN_DELTA_CHNG         = 0.1;
 // const double CHI_STAT               = 2.706;
@@ -312,16 +311,14 @@ struct RHO_HEST_REFC : RHO_HEST{
 
     /* Levenberg-Marquardt Refinement */
     struct{
-        float  (* JtJ)[8];         /* JtJ matrix */
-        float  (* tmp1)[8];        /* Temporary 1 */
+        float*    JtJ;             /* JtJ matrix */
+        float*    tmp1;            /* Temporary 1 */
         float*    Jte;             /* Jte vector */
     } lm;
 
     /* Memory Management */
-    struct{
-        cv::Mat perObj;
-        cv::Mat perRun;
-    } mem;
+    utils::BufferArea runArea;
+    utils::BufferArea objArea;
 
     /* Initialized? */
     int initialized;
@@ -659,16 +656,9 @@ inline int    RHO_HEST_REFC::initialize(void){
 
     fastSeed((uint64_t)~0);
 
+    initialized = 1;
 
-    int areAllAllocsSuccessful = !mem.perObj.empty();
-
-    if(!areAllAllocsSuccessful){
-        finalize();
-    }else{
-        initialized = 1;
-    }
-
-    return areAllAllocsSuccessful;
+    return true;
 }
 
 /**
@@ -835,45 +825,14 @@ unsigned RHO_HEST_REFC::rhoHest(const float*   src,     /* Source points */
  */
 
 inline void   RHO_HEST_REFC::allocatePerObj(void){
-    /* We have known sizes */
-    size_t ctrl_smpl_sz   = SMPL_SIZE*sizeof(*ctrl.smpl);
-    size_t curr_pkdPts_sz = SMPL_SIZE*2*2*sizeof(*curr.pkdPts);
-    size_t curr_H_sz      = HSIZE;
-    size_t best_H_sz      = HSIZE;
-    size_t lm_JtJ_sz      = 8*8*sizeof(float);
-    size_t lm_tmp1_sz     = 8*8*sizeof(float);
-    size_t lm_Jte_sz      = 1*8*sizeof(float);
-
-    /* We compute offsets */
-    size_t total = 0;
-#define MK_OFFSET(v)                                     \
-    size_t v ## _of = total;                             \
-    total = alignSize(v ## _of  +  v ## _sz, MEM_ALIGN)
-
-    MK_OFFSET(ctrl_smpl);
-    MK_OFFSET(curr_pkdPts);
-    MK_OFFSET(curr_H);
-    MK_OFFSET(best_H);
-    MK_OFFSET(lm_JtJ);
-    MK_OFFSET(lm_tmp1);
-    MK_OFFSET(lm_Jte);
-
-#undef MK_OFFSET
-
-    /* Allocate dynamic memory managed by cv::Mat */
-    mem.perObj.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
-
-    /* Extract aligned pointer */
-    unsigned char* ptr = alignPtr(mem.perObj.data, MEM_ALIGN);
-
-    /* Assign pointers */
-    ctrl.smpl   = (unsigned*)  (ptr + ctrl_smpl_of);
-    curr.pkdPts = (float*)     (ptr + curr_pkdPts_of);
-    curr.H      = (float*)     (ptr + curr_H_of);
-    best.H      = (float*)     (ptr + best_H_of);
-    lm.JtJ      = (float(*)[8])(ptr + lm_JtJ_of);
-    lm.tmp1     = (float(*)[8])(ptr + lm_tmp1_of);
-    lm.Jte      = (float*)     (ptr + lm_Jte_of);
+    objArea.allocate(ctrl.smpl, SMPL_SIZE);           /* Register every per-object buffer in objArea; counts are in elements */
+    objArea.allocate(curr.pkdPts, SMPL_SIZE*2*2);     /* Same element count the removed code multiplied by sizeof(*curr.pkdPts) */
+    objArea.allocate(curr.H, HSIZE/sizeof(*curr.H));  /* HSIZE is a byte count (3*3*sizeof(float)); convert it to elements */
+    objArea.allocate(best.H, HSIZE/sizeof(*best.H));  /* Likewise: 3x3 homography, not HSIZE floats */
+    objArea.allocate(lm.JtJ, 8*8);                    /* 8x8 matrix stored flat; callers cast to float(*)[8] */
+    objArea.allocate(lm.tmp1, 8*8);                   /* 8x8 temporary stored flat */
+    objArea.allocate(lm.Jte, 1*8);                    /* 8-element vector */
+    objArea.commit();                                 /* NOTE(review): presumably the pointers above become valid here - confirm in BufferArea */
 }
 
 
@@ -885,30 +844,9 @@ inline void   RHO_HEST_REFC::allocatePerObj(void){
  */
 
 inline void   RHO_HEST_REFC::allocatePerRun(void){
-    /* We have known sizes */
-    size_t best_inl_sz = arg.N;
-    size_t curr_inl_sz = arg.N;
-
-    /* We compute offsets */
-    size_t total = 0;
-#define MK_OFFSET(v)                                     \
-    size_t v ## _of = total;                             \
-    total = alignSize(v ## _of  +  v ## _sz, MEM_ALIGN)
-
-    MK_OFFSET(best_inl);
-    MK_OFFSET(curr_inl);
-
-#undef MK_OFFSET
-
-    /* Allocate dynamic memory managed by cv::Mat */
-    mem.perRun.create(1, (int)(total + MEM_ALIGN), CV_8UC1);
-
-    /* Extract aligned pointer */
-    unsigned char* ptr = alignPtr(mem.perRun.data, MEM_ALIGN);
-
-    /* Assign pointers */
-    best.inl  = (char*)(ptr + best_inl_of);
-    curr.inl  = (char*)(ptr + curr_inl_of);
+    runArea.allocate(best.inl, arg.N); /* Per-run buffers: arg.N chars, the same size as the removed best_inl_sz */
+    runArea.allocate(curr.inl, arg.N); /* arg.N chars, the same size as the removed curr_inl_sz */
+    runArea.commit();                  /* NOTE(review): presumably the pointers above become valid here - confirm in BufferArea */
 }
 
 
@@ -919,10 +857,7 @@ inline void   RHO_HEST_REFC::allocatePerRun(void){
  */
 
 inline void   RHO_HEST_REFC::deallocatePerRun(void){
-    best.inl  = NULL;
-    curr.inl  = NULL;
-
-    mem.perRun.release();
+    runArea.release(); /* NOTE(review): the old code also reset best.inl/curr.inl to NULL - confirm release() does the same */
 }
 
 
@@ -933,15 +868,7 @@ inline void   RHO_HEST_REFC::deallocatePerRun(void){
  */
 
 inline void   RHO_HEST_REFC::deallocatePerObj(void){
-    ctrl.smpl   = NULL;
-    curr.pkdPts = NULL;
-    curr.H      = NULL;
-    best.H      = NULL;
-    lm.JtJ      = NULL;
-    lm.tmp1     = NULL;
-    lm.Jte      = NULL;
-
-    mem.perObj.release();
+    objArea.release(); /* NOTE(review): the old code also reset the seven per-object pointers to NULL - confirm release() does the same */
 }
 
 
@@ -2144,7 +2071,7 @@ inline void   RHO_HEST_REFC::refine(void){
      */
     /* Find initial conditions */
     sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
-                          lm.JtJ, lm.Jte,  &S);
+                          (float(*)[8])lm.JtJ, lm.Jte,  &S);
 
     /*Levenberg-Marquardt Loop.*/
     for(i=0;i<MAXLEVMARQITERS;i++){
@@ -2169,11 +2096,11 @@ inline void   RHO_HEST_REFC::refine(void){
          * transpose) then multiply Jte in order to find dH.
          */
 
-        while(!sacChol8x8Damped(lm.JtJ, L, lm.tmp1)){
+        while(!sacChol8x8Damped((float(*)[8])lm.JtJ, L, (float(*)[8])lm.tmp1)){
             L *= 2.0f;
         }
-        sacTRInv8x8   (lm.tmp1, lm.tmp1);
-        sacTRISolve8x8(lm.tmp1, lm.Jte,  dH);
+        sacTRInv8x8   ((float(*)[8])lm.tmp1, (float(*)[8])lm.tmp1);
+        sacTRISolve8x8((float(*)[8])lm.tmp1, lm.Jte,  dH);
         sacSub8x1     (newH,       best.H,  dH);
         sacCalcJacobianErrors(newH, arg.src, arg.dst, best.inl, arg.N,
                               NULL, NULL, &newS);
@@ -2204,7 +2131,7 @@ inline void   RHO_HEST_REFC::refine(void){
             S = newS;
             memcpy(best.H, newH, sizeof(newH));
             sacCalcJacobianErrors(best.H, arg.src, arg.dst, best.inl, arg.N,
-                                  lm.JtJ, lm.Jte,  &S);
+                                  (float(*)[8])lm.JtJ, lm.Jte,  &S);
         }
     }
 }
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index 3b721ccf66..7d5d23c18d 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -53,6 +53,7 @@
 #include "precomp.hpp"
 #include <limits.h>
 #include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/utils/buffer_area.private.hpp"
 
 namespace cv
 {
@@ -99,6 +100,16 @@ struct StereoSGBMParams
         mode = _mode;
     }
 
+    inline bool isFullDP() const  // true when mode is MODE_HH or MODE_HH4
+    {
+        return mode == StereoSGBM::MODE_HH || mode == StereoSGBM::MODE_HH4;
+    }
+    inline Size calcSADWindowSize() const  // square window; a non-positive SADWindowSize falls back to 5
+    {
+        const int dim = SADWindowSize > 0 ? SADWindowSize : 5;
+        return Size(dim, dim);
+    }
+
     int minDisparity;
     int numDisparities;
     int SADWindowSize;
@@ -148,6 +159,7 @@ static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_va
 #endif
 
 static const int DEFAULT_RIGHT_BORDER = -1;
+
 /*
  For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
  and for each disparity minD<=d<maxD the function
@@ -161,7 +173,7 @@ static const int DEFAULT_RIGHT_BORDER = -1;
 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                             int minD, int maxD, CostType* cost,
                             PixType* buffer, const PixType* tab,
-                            int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
+                            int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
 {
     int x, c, width = img1.cols, cn = img1.channels();
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
@@ -178,8 +190,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
 
-    tab += tabOfs;
-
     for( c = 0; c < cn*2; c++ )
     {
         prow1[width*c] = prow1[width*c + width-1] =
@@ -297,6 +307,166 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 }
 
 
+
+class BufferSGBM  // scratch buffers for the SGBM disparity computation, all registered in one BufferArea
+{
+private:
+    size_t width1;
+    size_t Da;
+    size_t Dlra;
+    size_t costWidth;   // width1 * Da: number of cost entries in one row
+    size_t costHeight;  // height when fullDP, otherwise 1
+    size_t hsumRows;    // SAD window height + 2
+    bool fullDP;
+    uchar dirs;         // 1 for MODE_HH4, NR otherwise
+    uchar dirs2;        // 1 for MODE_HH4, NR2 otherwise
+    static const size_t TAB_OFS = 256*4;  // index of the zero-difference entry inside clipTab
+
+public:
+    CostType* Cbuf;       // pixel difference cost (C)
+    CostType* Sbuf;       // summary cost (S)
+    CostType* hsumBuf;    // hsumRows rows of costWidth entries, see getHSumBuf()
+    CostType* pixDiff;    // one row of costWidth entries
+    CostType* disp2cost;  // width entries
+    DispType* disp2ptr;   // width entries
+    PixType* tempBuf;     // width * (4 * cn + 2) entries
+    std::vector<CostType*> Lr;     // two rows, exchanged by swapLr()
+    std::vector<CostType*> minLr;  // two rows, exchanged by swapLr()
+    PixType * clipTab;    // start of the table; use getClipTab() for the pointer shifted by TAB_OFS
+
+private:
+    utils::BufferArea area;
+
+public:
+    BufferSGBM(size_t width1_,
+               size_t Da_,
+               size_t Dlra_,
+               size_t cn,
+               size_t width,
+               size_t height,
+               const StereoSGBMParams &params)
+        : width1(width1_),
+        Da(Da_),
+        Dlra(Dlra_),
+        Cbuf(NULL),
+        Sbuf(NULL),
+        hsumBuf(NULL),
+        pixDiff(NULL),
+        disp2cost(NULL),
+        disp2ptr(NULL),
+        tempBuf(NULL),
+        Lr(2, (CostType*)NULL),
+        minLr(2, (CostType*)NULL),
+        clipTab(NULL)
+    {
+        const size_t TAB_SIZE = 256 + TAB_OFS*2;
+        fullDP = params.isFullDP();
+        costWidth = width1 * Da;
+        costHeight = fullDP ? height : 1;
+        hsumRows = params.calcSADWindowSize().height + 2;
+        dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR;
+        dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2;
+        // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
+        // we keep pixel difference cost (C) and the summary cost over NR directions (S).
+        // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
+        area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // pixel difference cost (C)
+        area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions (S)
+        area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH);
+        area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH);
+        area.allocate(disp2cost,    width, CV_SIMD_WIDTH);
+        area.allocate(disp2ptr,     width, CV_SIMD_WIDTH);
+        area.allocate(tempBuf,      width * (4 * cn + 2), CV_SIMD_WIDTH);
+        // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
+        // for 8-way dynamic programming we need the current row and
+        // the previous row, i.e. 2 rows in total
+        for (size_t i = 0; i < 2; ++i)
+        {
+            // 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ]
+            area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH);
+            // 1D: [ NR ][ w1 * NR2 ][ NR ]
+            area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH);
+        }
+        area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
+        area.commit();
+
+        // init clipTab: entry i holds (i - TAB_OFS) clamped to [-ftzero, ftzero], shifted by +ftzero
+        const int ftzero = std::max(params.preFilterCap, 15) | 1;
+        for(int i = 0; i < (int)TAB_SIZE; i++ )
+            clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero);
+    }
+    inline const PixType * getClipTab() const  // table pointer shifted so that index 0 is the zero-difference entry
+    {
+        return clipTab + TAB_OFS;
+    }
+    inline void initCBuf(CostType val) const  // fill all costWidth * costHeight entries of Cbuf with val
+    {
+        for (size_t i = 0; i < costWidth * costHeight; ++i)
+            Cbuf[i] = val;
+    }
+    inline void clearLr(const Range & range = Range::all()) const  // zero both Lr/minLr rows, fully or only for the given range
+    {
+            for (uchar i = 0; i < 2; ++i)
+            {
+                if (range == Range::all())
+                {
+                    memset(Lr[i],    0, calcLrCount() * Dlra * sizeof(CostType));
+                    memset(minLr[i], 0, calcLrCount()        * sizeof(CostType));
+                }
+                else
+                {
+                    memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra);
+                    memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType));
+                }
+            }
+    }
+    inline size_t calcLrCount() const  // slots in one Lr/minLr row: dirs2 per column plus dirs border slots on each side
+    {
+        return width1 * dirs2 + 2 * dirs;
+    }
+    inline void swapLr()  // exchange the two Lr rows and the two minLr rows
+    {
+        std::swap(Lr[0], Lr[1]);
+        std::swap(minLr[0], minLr[1]);
+    }
+    inline CostType * getHSumBuf(int row) const  // hsumBuf is addressed modulo hsumRows
+    {
+        return hsumBuf + (row % hsumRows) * costWidth;
+    }
+    inline CostType * getCBuf(int row) const  // without full DP every row maps to the start of Cbuf
+    {
+        CV_Assert(row >= 0);
+        return Cbuf + (!fullDP ? 0 : (row * costWidth));
+    }
+    inline CostType * getSBuf(int row) const  // without full DP every row maps to the start of Sbuf
+    {
+        CV_Assert(row >= 0);
+        return Sbuf + (!fullDP ? 0 : (row * costWidth));
+    }
+    inline void clearSBuf(int row, const Range & range = Range::all()) const  // zero one S row, fully or only for the given range
+    {
+        if (range == Range::all())
+            memset(getSBuf(row), 0, costWidth * sizeof(CostType));
+        else
+            memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType));
+    }
+
+    // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+    // and will occasionally use negative indices with the arrays
+    // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+    inline CostType * getLr(uchar id, int idx, uchar shift = 0) const  // slot (idx * dirs2 + shift) of row id, past the leading border
+    {
+        CV_Assert(id < 2);
+        const size_t fixed_offset = dirs * Dlra;
+        return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra;
+    }
+    inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const  // same indexing as getLr(), one entry per slot
+    {
+        CV_Assert(id < 2);
+        const size_t fixed_offset = dirs;
+        return minLr[id] + fixed_offset + (idx * dirs2 + shift);
+    }
+};
+
 /*
  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
@@ -318,34 +488,25 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
  */
 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
-                                 Mat& disp1, const StereoSGBMParams& params,
-                                 Mat& buffer )
+                                 Mat& disp1, const StereoSGBMParams& params )
 {
     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     const int DISP_SCALE = (1 << DISP_SHIFT);
     const CostType MAX_COST = SHRT_MAX;
 
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
     int k, width = disp1.cols, height = disp1.rows;
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = maxD - minD, width1 = maxX1 - minX1;
+    const int D = params.numDisparities;
+    int width1 = maxX1 - minX1;
     int Da = (int)alignSize(D, v_int16::nlanes);
     int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
-    bool fullDP = params.mode == StereoSGBM::MODE_HH;
-    int npasses = fullDP ? 2 : 1;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType clipTab[TAB_SIZE];
-
-    for( k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
+    int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
+    int npasses = params.isFullDP() ? 2 : 1;
 
     if( minX1 >= maxX1 )
     {
@@ -353,39 +514,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
         return;
     }
 
-    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
-    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
-    size_t costBufSize = width1*Da;
-    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
-    size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra;
-    int hsumBufNRows = SH2*2 + 2;
-    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for 8-way dynamic programming we need the current row and
-    // the previous row, i.e. 2 rows in total
-    size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
-    ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[]
-    width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2
-    width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost
-
-    if( buffer.empty() || !buffer.isContinuous() ||
-        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.reserveBuffer(totalBufSize);
-
-    // summary cost over different (nDirs) directions
-    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
-    CostType* Sbuf = Cbuf + CSBufSize;
-    CostType* hsumBuf = Sbuf + CSBufSize;
-    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
-
-    CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes);
-    DispType* disp2ptr = (DispType*)(disp2cost + width);
-    PixType* tempBuf = (PixType*)(disp2ptr + width);
-
-    // add P2 to every C(x,y). it saves a few operations in the inner loops
-    for(k = 0; k < (int)CSBufSize; k++ )
-        Cbuf[k] = (CostType)P2;
+    BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
+    mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
 
     for( int pass = 1; pass <= npasses; pass++ )
     {
@@ -402,27 +532,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             x1 = width1-1; x2 = -1; dx = -1;
         }
 
-        CostType *Lr[2]={0}, *minLr[2]={0};
-
-        for( k = 0; k < 2; k++ )
-        {
-            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-            // and will occasionally use negative indices with the arrays
-            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-            // however, then the alignment will be imperfect, i.e. bad for SSE,
-            // thus we shift the pointers by SIMD vector size
-            Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra;
-            memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) );
-            minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2;
-            memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) );
-        }
+        uchar lrID = 0;
+        mem.clearLr();
 
         for( int y = y1; y != y2; y += dy )
         {
             int x, d;
             DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
-            CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
+            CostType* const C = mem.getCBuf(y);
+            CostType* const S = mem.getSBuf(y);
 
             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
             {
@@ -430,35 +548,35 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                 for( k = dy1; k <= dy2; k++ )
                 {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+                    CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
 
                     if( k < height )
                     {
-                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
+                        calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
 
                         memset(hsumAdd, 0, Da*sizeof(CostType));
 #if CV_SIMD
                         v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
                         for( d = 0; d < Da; d += v_int16::nlanes )
                         {
-                            v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale;
+                            v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
                             for( x = Da; x <= SW2*Da; x += Da )
-                                v_hsumAdd += vx_load_aligned(pixDiff + x + d);
+                                v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
                             v_store_aligned(hsumAdd + d, v_hsumAdd);
                         }
 #else
                         for (d = 0; d < D; d++)
                         {
-                            hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1));
+                            hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1));
                             for( x = Da; x <= SW2*Da; x += Da )
-                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]);
+                                hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]);
                         }
 #endif
 
                         if( y > 0 )
                         {
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
+                            const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                            const CostType* Cprev =  mem.getCBuf(y - 1);
 
 #if CV_SIMD
                             for (d = 0; d < Da; d += v_int16::nlanes)
@@ -470,8 +588,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                             for( x = Da; x < width1*Da; x += Da )
                             {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
+                                const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
+                                const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
 #if CV_SIMD
                                 for( d = 0; d < Da; d += v_int16::nlanes )
                                 {
@@ -501,8 +619,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 #endif
                             for( x = Da; x < width1*Da; x += Da )
                             {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
+                                const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
+                                const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
 
 #if CV_SIMD
                                 for (d = 0; d < Da; d += v_int16::nlanes)
@@ -526,8 +644,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     {
                         if( y > 0 )
                         {
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
+                            const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                            const CostType* Cprev = mem.getCBuf(y - 1);
 #if CV_SIMD
                             for (x = 0; x < width1*Da; x += v_int16::nlanes)
                                 v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
@@ -551,7 +669,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 }
 
                 // also, clear the S buffer
-                memset(S, 0, width1*Da * sizeof(CostType));
+                mem.clearSBuf(y);
             }
 
             /*
@@ -575,24 +693,26 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
             for( x = x1; x != x2; x += dx )
             {
-                int xm = x*NR2, xd = xm*Dlra;
+                int delta0 = P2 + *mem.getMinLr(lrID, x - dx);
+                int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1);
+                int delta2 = P2 + *mem.getMinLr(1 - lrID, x,     2);
+                int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3);
 
-                int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
-                int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
+                CostType* Lr_p0 = mem.getLr(lrID, x - dx);
+                CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1);
+                CostType* Lr_p2 = mem.getLr(1 - lrID, x,     2);
+                CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3);
 
-                CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra;
-                CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra;
-                CostType* Lr_p2 = Lr[1] + xd + Dlra*2;
-                CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3;
+                Lr_p0[-1] = Lr_p0[D] = MAX_COST;
+                Lr_p1[-1] = Lr_p1[D] = MAX_COST;
+                Lr_p2[-1] = Lr_p2[D] = MAX_COST;
+                Lr_p3[-1] = Lr_p3[D] = MAX_COST;
 
-                Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
-                Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
-
-                CostType* Lr_p = Lr[0] + xd;
+                CostType* Lr_p = mem.getLr(lrID, x);
                 const CostType* Cp = C + x*Da;
                 CostType* Sp = S + x*Da;
 
-                CostType* minL = minLr[0] + xm;
+                CostType* minL = mem.getMinLr(lrID, x);
                 d = 0;
 #if CV_SIMD
                 v_int16 _P1 = vx_setall_s16((short)P1);
@@ -703,14 +823,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
                 {
                     v_store(disp1ptr + x, v_inv_dist);
-                    v_store(disp2ptr + x, v_inv_dist);
-                    v_store(disp2cost + x, v_max_cost);
+                    v_store(mem.disp2ptr + x, v_inv_dist);
+                    v_store(mem.disp2cost + x, v_max_cost);
                 }
 #endif
                 for( ; x < width; x++ )
                 {
-                    disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
-                    disp2cost[x] = MAX_COST;
+                    disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
+                    mem.disp2cost[x] = MAX_COST;
                 }
 
                 for( x = width1 - 1; x >= 0; x-- )
@@ -721,16 +841,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                     if( npasses == 1 )
                     {
-                        int xm = x*NR2, xd = xm*Dlra;
-
-                        CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra;
+                        CostType* Lr_p0 = mem.getLr(lrID, x + 1);
                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
-                        CostType* Lr_p = Lr[0] + xd;
+                        CostType* Lr_p = mem.getLr(lrID, x);
 
                         const CostType* Cp = C + x*Da;
 
                         d = 0;
-                        int delta0 = minLr[0][xm + NR2] + P2;
+                        int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
                         int minL0 = MAX_COST;
 #if CV_SIMD
                         v_int16 _P1 = vx_setall_s16((short)P1);
@@ -768,7 +886,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                 bestDisp = (short)d;
                             }
                         }
-                        minLr[0][xm] = (CostType)minL0;
+                        *mem.getMinLr(lrID, x) = (CostType)minL0;
                     }
                     else
                     {
@@ -803,10 +921,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         continue;
                     d = bestDisp;
                     int _x2 = x + minX1 - d - minD;
-                    if( disp2cost[_x2] > minS )
+                    if( mem.disp2cost[_x2] > minS )
                     {
-                        disp2cost[_x2] = (CostType)minS;
-                        disp2ptr[_x2] = (DispType)(d + minD);
+                        mem.disp2cost[_x2] = (CostType)minS;
+                        mem.disp2ptr[_x2] = (DispType)(d + minD);
                     }
 
                     if( 0 < d && d < D-1 )
@@ -833,15 +951,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     int _d = d1 >> DISP_SHIFT;
                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                     int _x = x - _d, x_ = x - d_;
-                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
-                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
+                    if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff &&
+                       0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff )
                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                 }
             }
 
-            // now shift the cyclic buffers
-            std::swap( Lr[0], Lr[1] );
-            std::swap( minLr[0], minLr[1] );
+            lrID = 1 - lrID; // now shift the cyclic buffers
         }
     }
 }
@@ -849,13 +965,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 ////////////////////////////////////////////////////////////////////////////////////////////
 struct CalcVerticalSums: public ParallelLoopBody
 {
-    CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
-                     CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)
+    CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_)
+        : img1(_img1), img2(_img2), mem(mem_) // only references are stored; nothing is copied
     {
         minD = params.minDisparity;
         maxD = minD + params.numDisparities;
-        SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
-        ftzero = std::max(params.preFilterCap, 15) | 1;
+        SW2 = SH2 = params.calcSADWindowSize().height/2; // half of the window height, assigned to both SW2 and SH2
         P1 = params.P1 > 0 ? params.P1 : 2;
         P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
         height = img1.rows;
@@ -865,32 +980,27 @@ struct CalcVerticalSums: public ParallelLoopBody
         Da = (int)alignSize(D, v_int16::nlanes);
         Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
         width1 = maxX1 - minX1;
-        costBufSize = width1*Da;
-        CSBufSize = costBufSize*height;
-        minLrSize = width1;
-        LrSize = minLrSize*Dlra;
-        hsumBufNRows = SH2*2 + 2;
-        Cbuf = alignedBuf;
-        Sbuf = Cbuf + CSBufSize;
-        hsumBuf = Sbuf + CSBufSize;
+        D = params.numDisparities; // NOTE(review): Da is already derived from D a few lines above, so this pair looks redundant - confirm
+        Da = (int)alignSize(D, v_int16::nlanes);
     }
 
     void operator()(const Range& range) const CV_OVERRIDE
     {
-        static const CostType MAX_COST = SHRT_MAX;
-        static const int TAB_OFS = 256*4;
-        static const int npasses = 2;
-        int x1 = range.start, x2 = range.end, k;
-        size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da;
-        size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size
-                             width*(4*img1.channels()+2)*sizeof(PixType);   //tempBuf
-        Mat auxBuff;
-        auxBuff.create(1, (int)auxBufsSize, CV_8U);
-        CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH);
-        PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);
+        const CostType MAX_COST = SHRT_MAX; // sentinel written to the d=-1 and d=D guard cells below
+        const int npasses = 2;
+        const int x1 = range.start, x2 = range.end; // this call handles columns [x1, x2)
+        int k;
+
+        CostType* pixDiff = 0;
+        PixType* tempBuf = 0;
+        utils::BufferArea aux_area; // local scratch area backing pixDiff and tempBuf
+        aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH); // stripe width plus SW2 columns of margin per side, Da costs per column
+        aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH); // NOTE(review): the pixDiff request above is an element count, so the sizeof factor only matters if PixType is wider than one byte - confirm
+        aux_area.commit(); // presumably the point where memory is obtained and both pointers become valid - confirm
 
         // Simplification of index calculation
-        pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*Da;
+        if (x1 > SW2)
+            pixDiff -= (x1 - SW2) * Da; // after this, offset (x1 - SW2)*Da addresses the first allocated element
 
         for( int pass = 1; pass <= npasses; pass++ )
         {
@@ -905,26 +1015,14 @@ struct CalcVerticalSums: public ParallelLoopBody
                 y1 = height-1; y2 = -1; dy = -1;
             }
 
-            CostType *Lr[2]={0}, *minLr[2]={0};
-
-            for( k = 0; k < 2; k++ )
-            {
-                // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-                // and will occasionally use negative indices with the arrays
-                // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-                // however, then the alignment will be imperfect, i.e. bad for SSE,
-                // thus we shift the pointers by SIMD vector size
-                Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k;
-                memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) );
-                minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k;
-                memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
-            }
+            uchar lrID = 0; // index of the Lr/minLr buffer written for the current row; 1 - lrID is the one read; flipped after every row
+            mem.clearLr(range); // presumably zeroes Lr and minLr for the columns of this range - confirm
 
             for( int y = y1; y != y2; y += dy )
             {
                 int x, d;
-                CostType* C = Cbuf + y*costBufSize;
-                CostType* S = Sbuf + y*costBufSize;
+                CostType* C = mem.getCBuf(y); // row y of the C buffer
+                CostType* S = mem.getSBuf(y); // row y of the S buffer
 
                 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
                 {
@@ -932,11 +1030,11 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                     for( k = dy1; k <= dy2; k++ )
                     {
-                        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+                        CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1)); // row index clamped to the last image row
 
                         if( k < height )
                         {
-                            calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);
+                            calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2); // clip table now comes from the shared buffer object
 
                             memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType));
                             for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
@@ -953,8 +1051,8 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                             if( y > 0 )
                             {
-                                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                                const CostType* Cprev = C - costBufSize;
+                                const CostType* hsumSub =  mem.getHSumBuf(std::max(y - SH2 - 1, 0)); // hsum row subtracted below, index clamped at 0
+                                const CostType* Cprev = mem.getCBuf(y - 1); // C row of the previous image row
 #if CV_SIMD
                                 for( d = 0; d < Da; d += v_int16::nlanes )
                                     v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
@@ -1020,8 +1118,8 @@ struct CalcVerticalSums: public ParallelLoopBody
                         {
 /*                            if (y > 0)
                             {
-                                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                                const CostType* Cprev = C - costBufSize;
+                                const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
+                                const CostType* Cprev = mem.getCBuf(y - 1);
 
 #if CV_SIMD
                                 for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
@@ -1044,9 +1142,7 @@ struct CalcVerticalSums: public ParallelLoopBody
                             }
                         }
                     }
-
-                    // also, clear the S buffer
-                    memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType));
+                    mem.clearSBuf(y, range); // also, clear the S buffer (presumably only the columns of this range - confirm)
                 }
 
 //              [formula 13 in the paper]
@@ -1061,19 +1157,16 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                 for( x = x1; x != x2; x++ )
                 {
-                    int xd = x*Dlra;
-
-                    int delta = minLr[1][x] + P2;
-
-                    CostType* Lr_ppr = Lr[1] + xd;
+                    int delta = P2 + *mem.getMinLr(1 - lrID, x); // P2 plus the minLr entry of the other (previously written) buffer at column x
+                    CostType* Lr_ppr = mem.getLr(1 - lrID, x); // path costs of the other buffer at column x
 
                     Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
 
-                    CostType* Lr_p = Lr[0] + xd;
+                    CostType* Lr_p = mem.getLr(lrID, x); // path costs being written for the current row
                     const CostType* Cp = C + x*Da;
                     CostType* Sp = S + x*Da;
 
-                    CostType& minL = minLr[0][x];
+                    CostType& minL = *(mem.getMinLr(lrID, x)); // reference into the current minLr buffer
                     d = 0;
 #if CV_SIMD
                     v_int16 _P1 = vx_setall_s16((short)P1);
@@ -1105,19 +1198,13 @@ struct CalcVerticalSums: public ParallelLoopBody
                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                     }
                 }
-
-                // now shift the cyclic buffers
-                std::swap( Lr[0], Lr[1] );
-                std::swap( minLr[0], minLr[1] );
+                lrID = 1 - lrID; // now shift the cyclic buffers
             }
         }
     }
     const Mat& img1;
     const Mat& img2;
-    CostType* Cbuf;
-    CostType* Sbuf;
-    CostType* hsumBuf;
-    PixType* clipTab;
+    const BufferSGBM & mem; // buffer object supplied by the caller; held by reference
     int minD;
     int maxD;
     int D, Da, Dlra;
@@ -1128,18 +1215,12 @@ struct CalcVerticalSums: public ParallelLoopBody
     int height;
     int P1;
     int P2;
-    size_t costBufSize;
-    size_t CSBufSize;
-    size_t minLrSize;
-    size_t LrSize;
-    size_t hsumBufNRows;
-    int ftzero;
 };
 
 struct CalcHorizontalSums: public ParallelLoopBody
 {
-    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,
-                     CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)
+    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_)
+        : img1(_img1), img2(_img2), disp1(_disp1), mem(mem_) // only references are stored; nothing is copied
     {
         minD = params.minDisparity;
         maxD = minD + params.numDisparities;
@@ -1157,23 +1238,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
         Da = (int)alignSize(D, v_int16::nlanes);
         Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
         width1 = maxX1 - minX1;
-        costBufSize = width1*Da;
-        CSBufSize = costBufSize*height;
-        LrSize = 2 * Dlra;
-        Cbuf = alignedBuf;
-        Sbuf = Cbuf + CSBufSize;
     }
 
     void operator()(const Range& range) const CV_OVERRIDE
     {
         int y1 = range.start, y2 = range.end;
-        size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType));
 
-        Mat auxBuff;
-        auxBuff.create(1, (int)auxBufsSize, CV_8U);
-        CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes;
-        CostType* disp2cost = Lr + LrSize;
-        DispType* disp2ptr = (DispType*)(disp2cost + width);
+        const size_t LrSize = 2 * (1 + Dlra + 1); // two rows of Dlra costs, each with one extra slot on either side
+
+        CostType * Lr = 0;
+        CostType * disp2cost = 0;
+        DispType * disp2ptr = 0;
+        utils::BufferArea aux_area; // local scratch area backing the three pointers above
+        aux_area.allocate(Lr, LrSize); // no alignment argument is passed for Lr
+        aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH);
+        aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
+        aux_area.commit(); // presumably the point where memory is obtained and the pointers become valid - confirm
 
         CostType minLr;
 
@@ -1181,8 +1261,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
         {
             int x, d;
             DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + y*costBufSize;
-            CostType* S = Sbuf + y*costBufSize;
+            CostType* C = mem.getCBuf(y); // row y of the C buffer
+            CostType* S = mem.getSBuf(y); // row y of the S buffer
 
             x = 0;
 #if CV_SIMD
@@ -1202,8 +1282,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
             }
 
             // clear buffers
-            memset( Lr, 0, LrSize*sizeof(CostType) );
-            Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST;
+            aux_area.zeroFill(Lr); // presumably zeroes the whole Lr block - confirm
+            Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST; // guard cells d=-1 and d=D of the rows starting at Lr + 1 and Lr + 3 + Dlra
 
             minLr = 0;
 //          [formula 13 in the paper]
@@ -1219,10 +1299,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
             for( x = 0; x != width1; x++)
             {
                 int delta = minLr + P2;
-
-                CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra);
-
-                CostType* Lr_p = Lr + ((x&1)? Dlra :0);
+                CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra); // row that was written at the previously processed column
+                CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1); // row written for this column; the two rows alternate with the parity of x
                 const CostType* Cp = C + x*Da;
                 CostType* Sp = S + x*Da;
 
@@ -1236,8 +1314,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
-                    v_store_aligned(Lr_p + d, L);
+                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; // all three Lr_ppr reads use the unaligned load
+                    v_store(Lr_p + d, L); // unaligned store into the Lr row; Sp below keeps aligned access
                     _minL = v_min(_minL, L);
                     v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
                 }
@@ -1255,18 +1333,16 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 }
             }
 
-            memset( Lr, 0, LrSize*sizeof(CostType) );
-            Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST;
+            aux_area.zeroFill(Lr); // reset Lr before the pass in the opposite direction
+            Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST; // guard cells d=-1 and d=D of both rows, as above
 
             minLr = 0;
 
             for( x = width1-1; x != -1; x--)
             {
                 int delta = minLr + P2;
-
-                CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra);
-
-                CostType* Lr_p = Lr + ((x&1)? Dlra :0);
+                CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra); // row that was written at the previously processed column
+                CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1); // row written for this column
                 const CostType* Cp = C + x*Da;
                 CostType* Sp = S + x*Da;
                 CostType minS = MAX_COST;
@@ -1283,8 +1359,8 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
-                    v_store_aligned(Lr_p + d, L);
+                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; // all three Lr_ppr reads use the unaligned load
+                    v_store(Lr_p + d, L); // unaligned store into the Lr row
                     _minL = v_min(_minL, L);
                     L += vx_load_aligned(Sp + d);
                     v_store_aligned(Sp + d, L);
@@ -1366,8 +1442,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
     const Mat& img1;
     const Mat& img2;
     Mat& disp1;
-    CostType* Cbuf;
-    CostType* Sbuf;
+    const BufferSGBM & mem; // buffer object supplied by the caller; held by reference
     int minD;
     int maxD;
     int D, Da, Dlra;
@@ -1378,9 +1453,6 @@ struct CalcHorizontalSums: public ParallelLoopBody
     int P2;
     int minX1;
     int maxX1;
-    size_t costBufSize;
-    size_t CSBufSize;
-    size_t LrSize;
     int INVALID_DISP;
     int INVALID_DISP_SCALED;
     int uniquenessRatio;
@@ -1401,28 +1473,21 @@ struct CalcHorizontalSums: public ParallelLoopBody
  is written as is, without interpolation.
  */
 static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
-                                 Mat& disp1, const StereoSGBMParams& params,
-                                 Mat& buffer )
+                                 Mat& disp1, const StereoSGBMParams& params)
 {
     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
     const int DISP_SCALE = (1 << DISP_SHIFT);
     int minD = params.minDisparity, maxD = minD + params.numDisparities;
     Size SADWindowSize;
     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
-    int k, width = disp1.cols, height = disp1.rows;
+    int width = disp1.cols, height = disp1.rows; // image size is taken from the output disparity map
     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
-    int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
-    int SH2 = SADWindowSize.height/2;
+    int width1 = maxX1 - minX1; // width of the [minX1, maxX1) column range
+    int Da = (int)alignSize(params.numDisparities, v_int16::nlanes); // disparity count aligned to the SIMD lane count
+    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
     int INVALID_DISP = minD - 1;
     int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType clipTab[TAB_SIZE];
-
-    for( k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 
     if( minX1 >= maxX1 )
     {
@@ -1430,54 +1495,79 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
         return;
     }
 
-    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over 4 directions (S).
-    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
-
-    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for dynamic programming we need the current row and
-    // the previous row, i.e. 2 rows in total
-    size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*height;
-    size_t minLrSize = width1 , LrSize = minLrSize*Dlra;
-    int hsumBufNRows = SH2*2 + 2;
-    size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S
-                          costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf
-                          ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[]
-
-    if( buffer.empty() || !buffer.isContinuous() ||
-        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-    {
-        buffer.reserveBuffer(totalBufSize);
-    }
-
-    // summary cost over different (nDirs) directions
-    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
-
-    // add P2 to every C(x,y). it saves a few operations in the inner loops
-    for(k = 0; k < (int)CSBufSize; k++ )
-        Cbuf[k] = (CostType)P2;
-
-    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
-    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);
+    BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params); // working buffers used by both parallel passes below
+    mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
 
+    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8); // range is split over columns
+    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8); // range is split over rows; writes disp1
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
-                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
-                       PixType*& tmpBuf, CostType*& horPassCostVolume,
-                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
-                       CostType*& disp2CostBuf, short*& disp2Buf);
+class BufferSGBM3Way
+{
+private:
+    size_t hsumCols; // elements in one hsum row (width1 * Da)
+    size_t hsumRows; // number of hsum rows (SH2*2 + 2)
+public:
+    CostType *curCostVolumeLine; // matching costs for the current line; pre-filled with P2
+    CostType *hsumBuf; // hsumRows rows of hsumCols costs, addressed through getHSumBuf()
+    CostType *pixDiff; // auxiliary buffer for the raw matching cost computation
+    PixType *tmpBuf; // auxiliary buffer for the raw matching cost computation
+    CostType *horPassCostVolume; // horizontal cost aggregation, (width1 + 2) * Da entries
+    CostType *vertPassCostVolume; // vertical cost aggregation, (width1 + 2) * Da entries
+    CostType *vertPassMin; // minimum costs from the previous line, width1 + 2 entries
+    CostType *rightPassBuf; // small buffer for the right-to-left pass, Da entries
+    CostType *disp2CostBuf; // width entries
+    short *disp2Buf; // width entries
+private:
+    utils::BufferArea area; // backing storage for every pointer above
+public:
+    BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) :
+        curCostVolumeLine(0),
+        hsumBuf(0),
+        pixDiff(0),
+        tmpBuf(0),
+        horPassCostVolume(0),
+        vertPassCostVolume(0),
+        vertPassMin(0),
+        rightPassBuf(0),
+        disp2CostBuf(0),
+        disp2Buf(0)
+    {
+        hsumCols = width1 * Da; // NOTE(review): multiplied as int before the conversion to size_t
+        hsumRows = SH2*2 + 2;
+        area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH); // every request below asks for CV_SIMD_WIDTH alignment
+        area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH);
+        area.allocate(pixDiff,hsumCols, CV_SIMD_WIDTH);
+        area.allocate(tmpBuf, width * (4 * num_ch + 2), CV_SIMD_WIDTH);
+        area.allocate(horPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
+        area.allocate(vertPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
+        area.allocate(vertPassMin, width1 + 2, CV_SIMD_WIDTH);
+        area.allocate(rightPassBuf, Da, CV_SIMD_WIDTH);
+        area.allocate(disp2CostBuf, width, CV_SIMD_WIDTH);
+        area.allocate(disp2Buf, width, CV_SIMD_WIDTH);
+        area.commit(); // presumably the point where memory is obtained and the pointers become valid - confirm
+        area.zeroFill(); // presumably zeroes every block of the area - confirm
+        for(size_t i = 0; i < hsumCols; i++)
+            curCostVolumeLine[i] = (CostType)P2; // every cost of the current line starts at P2
+    }
+    inline void clearRightPassBuf()
+    {
+        area.zeroFill(rightPassBuf); // presumably zeroes only the rightPassBuf block - confirm
+    }
+    CostType *getHSumBuf(int x) const
+    {
+        return hsumBuf + (x % hsumRows) * hsumCols; // rows are reused modulo hsumRows; x is converted to size_t here, so it must be non-negative
+    }
+};
 
 struct SGBM3WayMainLoop : public ParallelLoopBody
 {
-    Mat* buffers;
     const Mat *img1, *img2;
     Mat* dst_disp;
 
-    int nstripes, stripe_sz;
+    int stripe_sz; // stripe size, stored as passed to the constructor
     int stripe_overlap;
 
     int width,height;
@@ -1488,25 +1578,54 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
     int P1, P2;
     int uniquenessRatio, disp12MaxDiff;
 
-    int costBufSize, hsumBufNRows;
-    int TAB_OFS, ftzero;
+    int TAB_OFS; // offset subtracted from the index when clipTab is filled
 
+    utils::BufferArea aux_area; // backing storage for clipTab
     PixType* clipTab;
 #if CV_SIMD
     short idx_row[v_int16::nlanes];
 #endif
-    SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
-    void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
+    SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
     void operator () (const Range& range) const CV_OVERRIDE;
     template<bool x_nlanes> void impl(const Range& range) const;
+
+private:
+    void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const; // buffers are passed as one object
+
+    template<bool x_nlanes>
+    void accumulateCostsLeftTop(const BufferSGBM3Way &mem,
+                                int x,
+                                CostType &leftMinCost) const; // leftMinCost is passed by non-const reference
+
+    template<bool x_nlanes>
+    void accumulateCostsRight(const BufferSGBM3Way &mem,
+                              int x,
+                              CostType &rightMinCost,
+                              short &optimal_disp,
+                              CostType &min_cost) const; // the last three arguments are passed by non-const reference
 };
 
-SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
-buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
+SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
+                                   const Mat& _img2,
+                                   Mat* _dst_disp,
+                                   const StereoSGBMParams& params,
+                                   int _stripe_sz,
+                                   int _stripe_overlap)
+    : img1(&_img1), // addresses of the caller images are kept; the images are not copied
+    img2(&_img2),
+    dst_disp(_dst_disp),
+    stripe_sz(_stripe_sz), // stored as passed in
+    stripe_overlap(_stripe_overlap),
+    clipTab(0) // requested from aux_area in the body
 {
-    nstripes = _nstripes;
-    stripe_overlap = _stripe_overlap;
-    stripe_sz = (int)ceil(img1->rows/(double)nstripes);
+    // precompute a lookup table for the raw matching cost computation:
+    TAB_OFS = 256*4;
+    const int TAB_SIZE = 256 + TAB_OFS*2; // 256 entries plus TAB_OFS entries on either side
+    aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH); // TAB_SIZE entries, CV_SIMD_WIDTH alignment requested
+    aux_area.commit(); // presumably the point where clipTab becomes valid - confirm
+    const int ftzero = std::max(params.preFilterCap, 15) | 1; // at least 15, lowest bit forced to 1
+    for(int k = 0; k < TAB_SIZE; k++ )
+        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); // clamp k - TAB_OFS to [-ftzero, ftzero], then add ftzero
 
     width = img1->cols; height = img1->rows;
     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
@@ -1519,100 +1638,27 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli
     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 
-    costBufSize = width1*Da;
-    hsumBufNRows = SH2*2 + 2;
-    TAB_OFS = 256*4;
-    ftzero = std::max(params.preFilterCap, 15) | 1;
 #if CV_SIMD
     for(short i = 0; i < v_int16::nlanes; ++i)
         idx_row[i] = i;
 #endif
 }
 
-void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2,
-                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
-                       PixType*& tmpBuf, CostType*& horPassCostVolume,
-                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
-                       CostType*& disp2CostBuf, short*& disp2Buf)
-{
-    // allocating all the required memory:
-    int costVolumeLineSize = width1*Da;
-    int width1_ext = width1+2;
-    int costVolumeLineSize_ext = width1_ext*Da;
-    int hsumBufNRows = SH2*2 + 2;
-
-    // main buffer to store matching costs for the current line:
-    int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
-
-    // auxiliary buffers for the raw matching cost computation:
-    int hsumBufSize  = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
-    int pixDiffSize  = costVolumeLineSize*sizeof(CostType);
-    int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType);
-
-    // auxiliary buffers for the matching cost aggregation:
-    int horPassCostVolumeSize  = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
-    int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
-    int rightPassBufSize = Da * sizeof(CostType);                     // additional small buffer for the right-to-left pass
-    int vertPassMinSize        = width1_ext*sizeof(CostType);             // buffer for storing minimum costs from the previous line
-
-    // buffers for the pseudo-LRC check:
-    int disp2CostBufSize = width*sizeof(CostType);
-    int disp2BufSize     = width*sizeof(short);
-
-    // sum up the sizes of all the buffers:
-    size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize +
-                          hsumBufSize +
-                          pixDiffSize +
-                          horPassCostVolumeSize +
-                          vertPassCostVolumeSize +
-                          rightPassBufSize +
-                          vertPassMinSize +
-                          disp2CostBufSize +
-                          disp2BufSize +
-                          tmpBufSize;
-
-    if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.reserveBuffer(totalBufSize);
-
-    // set up all the pointers:
-    curCostVolumeLine  = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH);
-    hsumBuf            = curCostVolumeLine + costVolumeLineSize;
-    pixDiff            = hsumBuf + costVolumeLineSize*hsumBufNRows;
-    horPassCostVolume  = pixDiff + costVolumeLineSize;
-    vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
-    rightPassBuf       = vertPassCostVolume + costVolumeLineSize_ext;
-    vertPassMin        = rightPassBuf + Da;
-
-    disp2CostBuf       = vertPassMin + width1_ext;
-    disp2Buf           = disp2CostBuf + width;
-    tmpBuf = (PixType*)(disp2Buf + width);
-
-    // initialize memory:
-    memset(buffer.ptr(),0,totalBufSize);
-    int i = 0;
-#if CV_SIMD
-    v_int16 _P2 = vx_setall_s16((CostType)P2);
-    for (; i<=costVolumeLineSize-v_int16::nlanes; i+=v_int16::nlanes)
-        v_store_aligned(curCostVolumeLine + i, _P2);
-#endif
-    for(;i<costVolumeLineSize;i++)
-        curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
-}
-
 // performing block matching and building raw cost-volume for the current row
-void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
-                                          CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
-                                          int y, int src_start_idx) const
+void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const
 {
+    CostType* C = mem.curCostVolumeLine;
+    CostType* pixDiff = mem.pixDiff;
+    PixType* tmpBuf = mem.tmpBuf;
     int x, d;
     int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
 
     for(int k = dy1; k <= dy2; k++ )
     {
-        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+        CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
         if( k < height )
         {
-            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
+            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
 
 #if CV_SIMD
             v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
@@ -1634,7 +1680,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
 #endif
             if( y > src_start_idx )
             {
-                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
+                const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
 
 #if CV_SIMD
                 for (d = 0; d < Da; d += v_int16::nlanes)
@@ -1702,7 +1748,7 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
         {
             if( y > src_start_idx )
             {
-                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
+                const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
 #if CV_SIMD
                 for( x = 0; x < width1*Da; x += v_int16::nlanes)
                     v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
@@ -1728,12 +1774,15 @@ void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
 // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
 // in-place cost accumulation from top to bottom (result is stored in topBuf)
 template<bool x_nlanes>
-inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
-                                   CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
+void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const
 {
+    CostType *leftBuf = mem.horPassCostVolume + x;
+    CostType *leftBuf_prev = mem.horPassCostVolume + x - Da;
+    CostType *topBuf = mem.vertPassCostVolume + x;
+    CostType *costs = mem.curCostVolumeLine - Da + x;
+    CostType& topMinCost = mem.vertPassMin[x/Da];
     int i = 0;
 #if CV_SIMD
-    int Da = (int)alignSize(D, v_int16::nlanes);
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
 
     v_int16 leftMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@@ -1847,12 +1896,16 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co
 // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
 // optimal disparity value with minimum accumulated cost
 template<bool x_nlanes>
-inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
-                                 CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost)
+void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
+                                            CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const
 {
+    CostType* costs = mem.curCostVolumeLine - Da + x;
+    CostType* rightBuf = mem.rightPassBuf;
+    CostType* topBuf = mem.vertPassCostVolume + x;
+    CostType* leftBuf = mem.horPassCostVolume + x;
+
     int i = 0;
 #if CV_SIMD
-    int Da = (int)alignSize(D, v_int16::nlanes);
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
 
     v_int16 rightMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@@ -1955,6 +2008,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
     if (D == Da) impl<true>(range);
     else impl<false>(range);
 }
+
 template<bool x_nlanes>
 void SGBM3WayMainLoop::impl(const Range& range) const
 {
@@ -1979,33 +2033,24 @@ void SGBM3WayMainLoop::impl(const Range& range) const
     else
         dst_offset=0;
 
-    Mat cur_buffer = buffers [range.start];
     Mat cur_disp   = dst_disp[range.start];
     cur_disp = Scalar(INVALID_DISP_SCALED);
 
-    // prepare buffers:
-    CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
-    PixType* tmpBuf;
-    CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
-    short* disp2Buf;
-    getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2,
-                      curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
-                      vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
-
+    BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2);
+    CostType *horPassCostVolume = mem.horPassCostVolume;
     // start real processing:
     for(int y=src_start_idx;y<src_end_idx;y++)
     {
-        getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
+        getRawMatchingCost(mem, y, src_start_idx);
 
         short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
 
         // initialize the auxiliary buffers for the pseudo left-right consistency check:
         for(int x=0;x<width;x++)
         {
-            disp2Buf[x] = (short)INVALID_DISP_SCALED;
-            disp2CostBuf[x] = SHRT_MAX;
+            mem.disp2Buf[x] = (short)INVALID_DISP_SCALED;
+            mem.disp2CostBuf[x] = SHRT_MAX;
         }
-        CostType* C = curCostVolumeLine - Da;
         CostType prev_min, min_cost;
         int d;
         short best_d;
@@ -2014,14 +2059,14 @@ void SGBM3WayMainLoop::impl(const Range& range) const
         // forward pass
         prev_min=0;
         for (int x=Da;x<(1+width1)*Da;x+=Da)
-            accumulateCostsLeftTop<x_nlanes>(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2);
+            accumulateCostsLeftTop<x_nlanes>(mem, x, prev_min);
 
         //backward pass
-        memset(rightPassBuf,0,Da*sizeof(CostType));
+        mem.clearRightPassBuf();
         prev_min=0;
         for (int x=width1*Da;x>=Da;x-=Da)
         {
-            accumulateCostsRight<x_nlanes>(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
+            accumulateCostsRight<x_nlanes>(mem, x, prev_min, best_d, min_cost);
 
             if(uniquenessRatio>0)
             {
@@ -2074,10 +2119,10 @@ void SGBM3WayMainLoop::impl(const Range& range) const
             d = best_d;
 
             int _x2 = x/Da - 1 + minX1 - d - minD;
-            if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
+            if( _x2>=0 && _x2<width && mem.disp2CostBuf[_x2] > min_cost )
             {
-                disp2CostBuf[_x2] = min_cost;
-                disp2Buf[_x2] = (short)(d + minD);
+                mem.disp2CostBuf[_x2] = min_cost;
+                mem.disp2Buf[_x2] = (short)(d + minD);
             }
 
             if( 0 < d && d < D-1 )
@@ -2104,32 +2149,27 @@ void SGBM3WayMainLoop::impl(const Range& range) const
             int _d = d1 >> StereoMatcher::DISP_SHIFT;
             int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
             int _x = x - _d, x_ = x - d_;
-            if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
-                0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
+            if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff &&
+                0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff )
                 disp_row[x] = (short)INVALID_DISP_SCALED;
         }
     }
 }
 
-static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
-                                      Mat& disp1, const StereoSGBMParams& params,
-                                      Mat* buffers, int nstripes )
+template <uchar nstripes>
+static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params)
 {
-    // precompute a lookup table for the raw matching cost computation:
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType* clipTab = new PixType[TAB_SIZE];
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
-    for(int k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
-
     // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
     int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
     int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
-    Mat* dst_disp = new Mat[nstripes];
+    Mat dst_disp[nstripes];
     for(int i=0;i<nstripes;i++)
         dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
 
-    parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
+    parallel_for_(
+        Range(0,nstripes),
+        SGBM3WayMainLoop(img1,img2,dst_disp,params,stripe_sz,stripe_overlap)
+    );
 
     //assemble disp1 from dst_disp:
     short* dst_row;
@@ -2140,9 +2180,6 @@ static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
         src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
         memcpy(dst_row,src_row,disp1.cols*sizeof(short));
     }
-
-    delete[] clipTab;
-    delete[] dst_disp;
 }
 
 class StereoSGBMImpl CV_FINAL : public StereoSGBM
@@ -2176,11 +2213,13 @@ public:
         Mat disp = disparr.getMat();
 
         if(params.mode==MODE_SGBM_3WAY)
-            computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
+            // the number of stripes is fixed, disregarding the number of threads/processors
+            // to make the results fully reproducible
+            computeDisparity3WaySGBM<4>( left, right, disp, params );
         else if(params.mode==MODE_HH4)
-            computeDisparitySGBM_HH4( left, right, disp, params, buffer );
+            computeDisparitySGBM_HH4( left, right, disp, params );
         else
-            computeDisparitySGBM( left, right, disp, params, buffer );
+            computeDisparitySGBM( left, right, disp, params );
 
         medianBlur(disp, disp, 3);
 
@@ -2259,11 +2298,6 @@ public:
     StereoSGBMParams params;
     Mat buffer;
 
-    // the number of stripes is fixed, disregarding the number of threads/processors
-    // to make the results fully reproducible:
-    static const int num_stripes = 4;
-    Mat buffers[num_stripes];
-
     static const char* name_;
 };
 
diff --git a/modules/core/include/opencv2/core/utils/buffer_area.private.hpp b/modules/core/include/opencv2/core/utils/buffer_area.private.hpp
index 141ad2c502..ab19da6416 100644
--- a/modules/core/include/opencv2/core/utils/buffer_area.private.hpp
+++ b/modules/core/include/opencv2/core/utils/buffer_area.private.hpp
@@ -74,6 +74,25 @@ public:
         allocate_((void**)(&ptr), static_cast<ushort>(sizeof(T)), count, alignment);
     }
 
+    /** @brief Fill one of the buffers with zeroes
+
+    @param ptr pointer to memory block previously added using BufferArea::allocate
+
+    BufferArea::commit must be called before using this method
+    */
+    template <typename T>
+    void zeroFill(T*&ptr)
+    {
+        CV_Assert(ptr);
+        zeroFill_((void**)&ptr);
+    }
+
+    /** @brief Fill all buffers with zeroes
+
+    BufferArea::commit must be called before using this method
+    */
+    void zeroFill();
+
     /** @brief Allocate memory and initialize all bound pointers
 
     Each pointer bound to the area with the BufferArea::allocate will be initialized and will be set
@@ -83,10 +102,18 @@ public:
     */
     void commit();
 
+    /** @brief Release all memory and unbind all pointers
+
+    All memory will be freed, and all pointers will be reset to NULL and untied from the area, so that
+    `allocate` and `commit` can be called again.
+    */
+    void release();
+
 private:
     BufferArea(const BufferArea &); // = delete
     BufferArea &operator=(const BufferArea &); // = delete
     void allocate_(void **ptr, ushort type_size, size_t count, ushort alignment);
+    void zeroFill_(void **ptr);
 
 private:
     class Block;
diff --git a/modules/core/src/buffer_area.cpp b/modules/core/src/buffer_area.cpp
index 2a41c72f45..b6bb321bba 100644
--- a/modules/core/src/buffer_area.cpp
+++ b/modules/core/src/buffer_area.cpp
@@ -66,6 +66,16 @@ public:
         *ptr = buf;
         return static_cast<void*>(static_cast<uchar*>(*ptr) + type_size * count);
     }
+    bool operator==(void **other) const
+    {
+        CV_Assert(ptr && other);
+        return *ptr == *other;
+    }
+    void zeroFill() const
+    {
+        CV_Assert(ptr && *ptr);
+        memset(static_cast<uchar*>(*ptr), 0, count * type_size);
+    }
 private:
     void **ptr;
     void * raw_mem;
@@ -85,10 +95,7 @@ BufferArea::BufferArea(bool safe_) :
 
 BufferArea::~BufferArea()
 {
-    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
-        i->cleanup();
-    if (oneBuf)
-        fastFree(oneBuf);
+    release();
 }
 
 void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort alignment)
@@ -100,6 +107,26 @@ void BufferArea::allocate_(void **ptr, ushort type_size, size_t count, ushort al
         totalSize += blocks.back().getByteCount();
 }
 
+void BufferArea::zeroFill_(void **ptr)
+{
+    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
+    {
+        if (*i == ptr)
+        {
+            i->zeroFill();
+            break;
+        }
+    }
+}
+
+void BufferArea::zeroFill()
+{
+    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
+    {
+        i->zeroFill();
+    }
+}
+
 void BufferArea::commit()
 {
     if (!safe)
@@ -116,6 +143,20 @@ void BufferArea::commit()
     }
 }
 
+void BufferArea::release()
+{
+    for(std::vector<Block>::const_iterator i = blocks.begin(); i != blocks.end(); ++i)
+    {
+        i->cleanup();
+    }
+    blocks.clear();
+    if (oneBuf)
+    {
+        fastFree(oneBuf);
+        oneBuf = 0;
+    }
+}
+
 //==================================================================================================
 
 }} // cv::utils::
diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index 87891488ec..1a23e01fb9 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -337,6 +337,21 @@ TEST_P(BufferArea, basic)
         ASSERT_TRUE(dbl_ptr != NULL);
         EXPECT_EQ((size_t)0, (size_t)int_ptr % sizeof(int));
         EXPECT_EQ((size_t)0, (size_t)dbl_ptr % sizeof(double));
+        for (size_t i = 0; i < SZ; ++i)
+        {
+            int_ptr[i] = (int)i + 1;
+            uchar_ptr[i] = (uchar)i + 1;
+            dbl_ptr[i] = (double)i + 1;
+        }
+        area.zeroFill(int_ptr);
+        area.zeroFill(uchar_ptr);
+        area.zeroFill(dbl_ptr);
+        for (size_t i = 0; i < SZ; ++i)
+        {
+            EXPECT_EQ((int)0, int_ptr[i]);
+            EXPECT_EQ((uchar)0, uchar_ptr[i]);
+            EXPECT_EQ((double)0, dbl_ptr[i]);
+        }
     }
     EXPECT_TRUE(int_ptr == NULL);
     EXPECT_TRUE(uchar_ptr == NULL);
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index cb9ff257e3..7e292143e7 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -47,6 +47,7 @@ The references are:
 #include "opencl_kernels_features2d.hpp"
 #include "hal_replacement.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/utils/buffer_area.private.hpp"
 
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
@@ -80,20 +81,26 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
     for( i = -255; i <= 255; i++ )
         threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);
 
-    AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128);
-    uchar* buf[3];
-    buf[0] = _buf.data(); buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols;
-    int* cpbuf[3];
-    cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1;
-    cpbuf[1] = cpbuf[0] + img.cols + 1;
-    cpbuf[2] = cpbuf[1] + img.cols + 1;
-    memset(buf[0], 0, img.cols*3);
+    uchar* buf[3] = { 0 };
+    int* cpbuf[3] = { 0 };
+    utils::BufferArea area;
+    for (unsigned idx = 0; idx < 3; ++idx)
+    {
+        area.allocate(buf[idx], img.cols);
+        area.allocate(cpbuf[idx], img.cols + 1);
+    }
+    area.commit();
+
+    for (unsigned idx = 0; idx < 3; ++idx)
+    {
+        memset(buf[idx], 0, img.cols);
+    }
 
     for(i = 3; i < img.rows-2; i++)
     {
         const uchar* ptr = img.ptr<uchar>(i) + 3;
         uchar* curr = buf[(i - 3)%3];
-        int* cornerpos = cpbuf[(i - 3)%3];
+        int* cornerpos = cpbuf[(i - 3)%3] + 1; // cornerpos[-1] is used to store a value
         memset(curr, 0, img.cols);
         int ncorners = 0;
 
@@ -266,7 +273,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
 
         const uchar* prev = buf[(i - 4 + 3)%3];
         const uchar* pprev = buf[(i - 5 + 3)%3];
-        cornerpos = cpbuf[(i - 4 + 3)%3];
+        cornerpos = cpbuf[(i - 4 + 3)%3] + 1; // cornerpos[-1] is used to store a value
         ncorners = cornerpos[-1];
 
         for( k = 0; k < ncorners; k++ )

From 09df7810d1dbaad2411a42a9af4584c8b4109371 Mon Sep 17 00:00:00 2001
From: Ganesh Kathiresan <ganesh3597@gmail.com>
Date: Wed, 26 Feb 2020 20:18:50 +0530
Subject: [PATCH 16/18] Merge pull request #16457 from
 ganesh-k13:bugfix/getCPUCount-fix

* Fixed getCPUCount

Minor new line changes

Android fix | efficient linux checks

Android fix 2

Fixed cpu logic for non linux platforms

Android fix 3

Android fix 4

* No v1 case handle | Refactor long lines

* Refined Cgroups logic | Combine Android and Linux

* Fixed directives

* Added support for --cpus | Fixed minor bug in Android | Change file read logic

* Added macro checks for apple errors

* Fixed macro to include android

* Addressed review comments

* Fixed android macro

* Refined return values

* Fixed apple warning

* Addressed review comments

* Fixed whitespace

* Android Fix try 1

* Android Fix try 2

* Android Fix try 3

* Removed unwanted endif

* Android Fix try 4

* Android Fix try 5

* Macro Restructure

* core: updates to CPUs detection (minor)
---
 modules/core/src/parallel.cpp | 142 +++++++++++++++++++++++++++++-----
 1 file changed, 122 insertions(+), 20 deletions(-)

diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 9dd8648a72..0ac4c4aca3 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -58,13 +58,20 @@
     #include <unistd.h>
     #include <stdio.h>
     #include <sys/types.h>
+    #include <fstream>
     #if defined __ANDROID__
         #include <sys/sysconf.h>
+        #include <sys/syscall.h>
+        #include <sched.h>
     #elif defined __APPLE__
         #include <sys/sysctl.h>
     #endif
 #endif
 
+#if defined CV_CXX11
+    #include <thread>
+#endif
+
 #ifdef _OPENMP
     #define HAVE_OPENMP
 #endif
@@ -739,19 +746,40 @@ int cv::getThreadNum(void)
 #endif
 }
 
-#ifdef __ANDROID__
-static inline int getNumberOfCPUsImpl()
+
+#if defined __linux__ || defined __GLIBC__ || defined __EMSCRIPTEN__ || defined __HAIKU__ || defined __ANDROID__
+  #define CV_CPU_GROUPS_1
+#endif
+
+#if defined __linux__ || defined __ANDROID__
+  #define CV_HAVE_CGROUPS 1
+#endif
+
+#if defined CV_CPU_GROUPS_1
+static inline
+std::string getFileContents(const char *filename)
 {
-   FILE* cpuPossible = fopen("/sys/devices/system/cpu/possible", "r");
-   if(!cpuPossible)
-       return 1;
+    std::ifstream ifs(filename);
+    if (!ifs.is_open())
+        return std::string();
 
-   char buf[2000]; //big enough for 1000 CPUs in worst possible configuration
-   char* pbuf = fgets(buf, sizeof(buf), cpuPossible);
-   fclose(cpuPossible);
-   if(!pbuf)
-      return 1;
+    std::string content( (std::istreambuf_iterator<char>(ifs) ),
+                         (std::istreambuf_iterator<char>()    ) );
 
+    if (ifs.fail())
+        return std::string();
+
+    return content;
+}
+
+static inline
+int getNumberOfCPUsImpl(const char *filename)
+{
+   std::string file_contents = getFileContents(filename);
+   if(file_contents.empty())
+       return 0;
+
+   char *pbuf = const_cast<char*>(file_contents.c_str());
    //parse string of form "0-1,3,5-7,10,13-15"
    int cpusAvailable = 0;
 
@@ -775,12 +803,63 @@ static inline int getNumberOfCPUsImpl()
       }
 
    }
-   return cpusAvailable ? cpusAvailable : 1;
+   return cpusAvailable;
 }
 #endif
 
+#if defined CV_HAVE_CGROUPS
+static inline
+unsigned getNumberOfCPUsCFS()
+{
+    int cfs_quota = 0;
+    {
+        std::ifstream ss_period("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", std::ios::in | std::ios::binary);
+        ss_period >> cfs_quota;
+
+        if (ss_period.fail() || cfs_quota < 1) /* cfs_quota must not be 0 or negative */
+            return 0;
+    }
+
+    int cfs_period = 0;
+    {
+        std::ifstream ss_quota("/sys/fs/cgroup/cpu/cpu.cfs_period_us", std::ios::in | std::ios::binary);
+        ss_quota >> cfs_period;
+
+        if (ss_quota.fail() || cfs_period < 1)
+            return 0;
+    }
+
+    return (unsigned)max(1, cfs_quota/cfs_period);
+}
+#endif
+
+template <typename T> static inline
+T minNonZero(const T& val_1, const T& val_2)
+{
+    if ((val_1 != 0) && (val_2 != 0))
+        return std::min(val_1, val_2);
+    return (val_1 != 0) ? val_1 : val_2;
+}
+
 int cv::getNumberOfCPUs(void)
 {
+    /*
+     * Logic here is to try different methods of getting the CPU count and return
+     * the smallest non-zero value, as it has a high probability of being right and safe.
+     * Return 1 if every method yields 0 or is unavailable.
+    */
+#if defined CV_CXX11
+    /*
+     * Start with the standard C++11 query. We do not return it directly: in a
+     * Docker or K8s environment it reports the host machine's configuration,
+     * not the container's or pod's, and as per the docs this value must be
+     * "considered only a hint".
+    */
+    unsigned ncpus = std::thread::hardware_concurrency(); /* If the value is not well defined or not computable, returns 0 */
+#else
+    unsigned ncpus = 0; /* 0 means we have to find out some other way */
+#endif
+
 #if defined _WIN32
     SYSTEM_INFO sysinfo;
 #if (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_X64) || defined(WINRT)) && _WIN32_WINNT >= 0x501
@@ -788,13 +867,37 @@ int cv::getNumberOfCPUs(void)
 #else
     GetSystemInfo( &sysinfo );
 #endif
+    unsigned ncpus_sysinfo = sysinfo.dwNumberOfProcessors < 0 ? 1 : sysinfo.dwNumberOfProcessors; /* Just a fail safe */
+    ncpus = minNonZero(ncpus, ncpus_sysinfo);
+
+#elif defined CV_CPU_GROUPS_1
+
+#if defined CV_HAVE_CGROUPS
+    static unsigned ncpus_impl_cpuset = (unsigned)getNumberOfCPUsImpl("/sys/fs/cgroup/cpuset/cpuset.cpus");
+    ncpus = minNonZero(ncpus, ncpus_impl_cpuset);
+
+    static unsigned ncpus_impl_cfs = getNumberOfCPUsCFS();
+    ncpus = minNonZero(ncpus, ncpus_impl_cfs);
+#endif
+
+    static unsigned ncpus_impl_devices = (unsigned)getNumberOfCPUsImpl("/sys/devices/system/cpu/online");
+    ncpus = minNonZero(ncpus, ncpus_impl_devices);
+
+#if defined _GNU_SOURCE \
+    && !defined(__ANDROID__)  // TODO: add check for modern Android NDK
+
+    cpu_set_t cpu_set;
+    if (0 == sched_getaffinity(0, sizeof(cpu_set), &cpu_set))
+    {
+        unsigned cpu_count_cpu_set = CPU_COUNT(&cpu_set);
+        ncpus = minNonZero(ncpus, cpu_count_cpu_set);
+    }
+
+#endif
+
+    static unsigned cpu_count_sysconf = (unsigned)sysconf( _SC_NPROCESSORS_ONLN );
+    ncpus = minNonZero(ncpus, cpu_count_sysconf);
 
-    return (int)sysinfo.dwNumberOfProcessors;
-#elif defined __ANDROID__
-    static int ncpus = getNumberOfCPUsImpl();
-    return ncpus;
-#elif defined __linux__ || defined __GLIBC__ || defined __HAIKU__ || defined __EMSCRIPTEN__
-    return (int)sysconf( _SC_NPROCESSORS_ONLN );
 #elif defined __APPLE__
     int numCPU=0;
     int mib[4];
@@ -816,10 +919,9 @@ int cv::getNumberOfCPUs(void)
             numCPU = 1;
     }
 
-    return (int)numCPU;
-#else
-    return 1;
+    ncpus = minNonZero(ncpus, (unsigned)numCPU);
 #endif
+    return ncpus != 0 ? ncpus : 1;
 }
 
 const char* cv::currentParallelFramework() {

From d8dea7896b41cc9afdfa3738e7904ea34d2fc698 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Wed, 26 Feb 2020 17:51:18 +0300
Subject: [PATCH 17/18] Merge pull request #16628 from
 dkurt:dnn_ngraph_custom_layers

* Custom layers with nGraph

* nGraph: multiple outputs from nodes
---
 modules/dnn/src/dnn.cpp                  |  87 +++++++++++++----
 modules/dnn/src/ie_ngraph.cpp            | 118 ++++++++++++++++++++++-
 modules/dnn/src/ie_ngraph.hpp            |   5 +
 modules/dnn/src/layers/const_layer.cpp   |  17 +++-
 modules/dnn/test/test_backends.cpp       |   2 +-
 modules/dnn/test/test_tf_importer.cpp    |  12 ++-
 modules/dnn/test/test_torch_importer.cpp |   7 +-
 7 files changed, 221 insertions(+), 27 deletions(-)

diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index b0c52b101a..e6baa53b4f 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1897,7 +1897,9 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                    dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
+                    std::string outputName = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
+                    outputName = ld.outputBlobsWrappers.size() > 1 ? (outputName + "." + std::to_string(i)) : outputName;
+                    dataPtr->setName(outputName);
                 }
             }
             else
@@ -1905,7 +1907,8 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                    dataPtr->setName(ld.name);
+                    std::string outputName = ld.outputBlobsWrappers.size() > 1 ? (ld.name + "." + std::to_string(i)) : ld.name;
+                    dataPtr->setName(outputName);
                 }
             }
         }
@@ -1946,6 +1949,9 @@ struct Net::Impl
             return;
         }
 
+        bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU ||
+                                   BackendRegistry::checkIETarget(DNN_TARGET_CPU);
+
         // Build Inference Engine networks from sets of layers that support this
         // backend. Split a whole model on several Inference Engine networks if
         // some of layers are not implemented.
@@ -1960,20 +1966,47 @@ struct Net::Impl
             Ptr<Layer> layer = ld.layerInstance;
             if (!fused && !layer->supportBackend(preferableBackend))
             {
-                addNgraphOutputs(ld);
-                net = Ptr<InfEngineNgraphNet>();
-                layer->preferableTarget = DNN_TARGET_CPU;
+                bool customizable = ld.id != 0 && supportsCPUFallback;
 
-                for (int i = 0; i < ld.inputBlobsId.size(); ++i)
+                // TODO: there is a bug in the Myriad plugin with shape inference for custom layers.
+                if (preferableTarget == DNN_TARGET_MYRIAD)
                 {
-                    LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
-                    Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
-                    if (!inpNode.empty()) {
-                        Ptr<InfEngineNgraphNode> ieNode = inpNode.dynamicCast<InfEngineNgraphNode>();
-                        ieNode->net->setUnconnectedNodes(ieNode);
+                    for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
+                    {
+                        customizable = ld.inputBlobs[i]->size[0] == 1;
                     }
                 }
-                continue;
+
+                // TODO: fix these workarounds
+                if (preferableTarget == DNN_TARGET_MYRIAD ||
+                    preferableTarget == DNN_TARGET_OPENCL ||
+                    preferableTarget == DNN_TARGET_OPENCL_FP16)
+                    customizable &= ld.type != "Concat";
+
+                if (preferableTarget == DNN_TARGET_OPENCL ||
+                    preferableTarget == DNN_TARGET_OPENCL_FP16)
+                    customizable &= ld.type != "Power";
+
+                if (preferableTarget == DNN_TARGET_OPENCL)
+                    customizable &= ld.type != "Eltwise";
+
+                if (!customizable)
+                {
+                    addNgraphOutputs(ld);
+                    net = Ptr<InfEngineNgraphNet>();
+                    layer->preferableTarget = DNN_TARGET_CPU;
+
+                    for (int i = 0; i < ld.inputBlobsId.size(); ++i)
+                    {
+                        LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
+                        Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
+                        if (!inpNode.empty()) {
+                            Ptr<InfEngineNgraphNode> ieNode = inpNode.dynamicCast<InfEngineNgraphNode>();
+                            ieNode->net->setUnconnectedNodes(ieNode);
+                        }
+                    }
+                    continue;
+                }
             }
             ld.skip = true;  // Initially skip all Inference Engine supported layers.
 
@@ -2047,12 +2080,32 @@ struct Net::Impl
 
             if (!fused)
             {
-                CV_Assert(!inputNodes.empty());
-                node = layer->initNgraph(ld.inputBlobsWrappers, inputNodes);
-                for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
+                CV_Assert(ld.inputBlobsId.size() == inputNodes.size());
+                for (int i = 0; i < ld.inputBlobsId.size(); ++i)
                 {
-                    InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                    node.dynamicCast<InfEngineNgraphNode>()->setName(dataPtr->getName());
+                    int lid = ld.inputBlobsId[i].lid;
+                    int oid = ld.inputBlobsId[i].oid;
+                    if (oid == 0 || lid == 0)
+                        continue;
+
+                    auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
+                    CV_Assert(oid < ieInpNode->node->get_output_size());
+                    inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid, false)));
+                }
+
+                if (layer->supportBackend(preferableBackend))
+                {
+                    node = layer->initNgraph(ld.inputBlobsWrappers, inputNodes);
+                    for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
+                    {
+                        InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
+                        node.dynamicCast<InfEngineNgraphNode>()->setName(dataPtr->getName());
+                    }
+                }
+                else
+                {
+                    node = Ptr<BackendNode>(new InfEngineNgraphNode(inputNodes,
+                        ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
                 }
             }
             else if (node.empty())
diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index d7df547412..e8cfd1265e 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -26,6 +26,35 @@ namespace cv { namespace dnn {
 // OpenCV lets users use an empty input name and to prevent unexpected naming,
 // we can use some predefined name.
 static std::string kDefaultInpLayerName = "empty_inp_layer_name";
+static constexpr const char* kOpenCVLayersType = "OpenCVLayer";
+
+static std::string shapesToStr(const std::vector<Mat>& mats)  // Serializes the shapes of `mats` as space-separated text: "<count> <dims> <size0> ... ".
+{
+    std::ostringstream shapes;
+    shapes << mats.size() << " ";  // the number of matrices comes first
+    for (const Mat& m : mats)
+    {
+        shapes << m.dims << " ";  // per matrix: dimension count, then each extent
+        for (int i = 0; i < m.dims; ++i)
+            shapes << m.size[i] << " ";
+    }
+    return shapes.str();  // same token order that strToShapes() reads back
+}
+
+static void strToShapes(const std::string& str, std::vector<std::vector<size_t> >& shapes)  // Parses `str` into `shapes`: "<count>" then, per shape, "<dims> <size0> ...".
+{
+    std::istringstream ss(str);
+    int num, dims;  // NOTE(review): extraction results are never checked; input is assumed well-formed (e.g. produced by shapesToStr)
+    ss >> num;  // leading token: number of shapes
+    shapes.resize(num);
+    for (int i = 0; i < num; ++i)
+    {
+        ss >> dims;  // dimension count of shape i, followed by that many extents
+        shapes[i].resize(dims);
+        for (int j = 0; j < dims; ++j)
+            ss >> shapes[i][j];
+    }
+}
 
 static std::vector<Ptr<NgraphBackendWrapper> >
 ngraphWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
@@ -40,12 +69,82 @@ ngraphWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
     return wrappers;
 }
 
+class NgraphCustomOp: public ngraph::op::Op {  // ngraph op whose output shapes are taken from its serialized "outputs" parameter
+public:
+    const ngraph::NodeTypeInfo& get_type_info() const override
+    {
+        static constexpr ngraph::NodeTypeInfo type_info{kOpenCVLayersType, 0};  // type info is built from kOpenCVLayersType ("OpenCVLayer")
+        return type_info;
+    }
+
+    NgraphCustomOp() {};
+    NgraphCustomOp(const ngraph::NodeVector& inputs,
+                   const std::map<std::string, InferenceEngine::Parameter>& params = {}):
+        Op(inputs), params(params)
+    {
+        constructor_validate_and_infer_types();
+    }
+
+    void validate_and_infer_types() override
+    {
+        std::vector<std::vector<size_t> > shapes;
+        strToShapes(params["outputs"], shapes);  // "outputs" is filled with shapesToStr() text by the InfEngineNgraphNode constructor
+        set_output_size(shapes.size());
+        for (size_t i = 0; i < shapes.size(); ++i)
+        {
+            ngraph::Shape output_shape(shapes[i]);
+            set_output_type(i, get_input_element_type(0), output_shape);  // every output takes the element type of input 0
+        }
+    }
+
+    std::shared_ptr<ngraph::Node> copy_with_new_args(const ngraph::NodeVector& new_args) const override
+    {
+        return std::make_shared<NgraphCustomOp>(new_args, params);  // the copy is built with the same params
+    }
+
+    bool visit_attributes(ngraph::AttributeVisitor& visitor) override
+    {
+        for (auto& attr : params)
+        {
+            if (attr.second.is<std::string>())  // only string-valued params are handed to the visitor
+                visitor.on_attribute(attr.first, attr.second.as<std::string>());
+        }
+        return true;
+    }
+
+private:
+    std::map<std::string, InferenceEngine::Parameter> params;  // keyed by parameter name; "outputs" is read in validate_and_infer_types()
+};
+
 InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node)
     : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {}
 
 InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>& _node)
     : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {}
 
+InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes,  // Wraps an OpenCV layer as an NgraphCustomOp fed by `nodes`.
+                                         Ptr<Layer>& cvLayer_, std::vector<Mat*>& inputs,  // `inputs` is not used in this body
+                                         std::vector<Mat>& outputs, std::vector<Mat>& internals)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), cvLayer(cvLayer_)
+{
+    std::ostringstream oss;
+    oss << (size_t)cvLayer.get();  // the layer pointer, written as an integer, becomes the "impl" param
+
+    std::map<std::string, InferenceEngine::Parameter> params = {
+        {"impl", oss.str()},
+        {"outputs", shapesToStr(outputs)},
+        {"internals", shapesToStr(internals)}
+    };
+
+    ngraph::NodeVector inp_nodes;
+    for (const auto& node : nodes)  // NOTE: this loop variable shadows the member `node` assigned right after the loop
+        inp_nodes.emplace_back(node.dynamicCast<InfEngineNgraphNode>()->node);
+    node = std::make_shared<NgraphCustomOp>(inp_nodes, params);
+
+    CV_Assert(!cvLayer->name.empty());
+    setName(cvLayer->name);  // the ngraph node's friendly name is set to the OpenCV layer name
+}
+
 void InfEngineNgraphNode::setName(const std::string& name) {
     node->set_friendly_name(name);
 }
@@ -342,7 +441,24 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
         if (device_name == "MYRIAD") {
             config.emplace("VPU_DETECT_NETWORK_BATCH", CONFIG_VALUE(NO));
         }
-        netExec = ie.LoadNetwork(net, device_name, config);
+
+        bool isHetero = false;
+        if (device_name != "CPU")
+        {
+            isHetero = device_name == "FPGA";
+            for (auto& layer : net)
+            {
+                if (layer->type == kOpenCVLayersType)
+                {
+                    isHetero = true;
+                    break;
+                }
+            }
+        }
+        if (isHetero)
+            netExec = ie.LoadNetwork(net, "HETERO:" + device_name + ",CPU", config);
+        else
+            netExec = ie.LoadNetwork(net, device_name, config);
     }
     catch (const std::exception& ex)
     {
diff --git a/modules/dnn/src/ie_ngraph.hpp b/modules/dnn/src/ie_ngraph.hpp
index c24839dc67..3058178cbe 100644
--- a/modules/dnn/src/ie_ngraph.hpp
+++ b/modules/dnn/src/ie_ngraph.hpp
@@ -90,6 +90,10 @@ private:
 class InfEngineNgraphNode : public BackendNode
 {
 public:
+    InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes, Ptr<Layer>& layer,  // Wraps OpenCV `layer` as a custom ngraph node fed by `nodes`.
+                        std::vector<Mat*>& inputs, std::vector<Mat>& outputs,  // shapes of `outputs` and `internals` are serialized into node params
+                        std::vector<Mat>& internals);
+
     InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node);
     InfEngineNgraphNode(std::shared_ptr<ngraph::Node>& _node);
 
@@ -98,6 +102,7 @@ public:
     // Inference Engine network object that allows to obtain the outputs of this layer.
     std::shared_ptr<ngraph::Node> node;
     Ptr<InfEngineNgraphNet> net;
+    Ptr<dnn::Layer> cvLayer;
 };
 
 class NgraphBackendWrapper : public BackendWrapper
diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp
index 2c6b51efde..5de45252a2 100644
--- a/modules/dnn/src/layers/const_layer.cpp
+++ b/modules/dnn/src/layers/const_layer.cpp
@@ -8,6 +8,7 @@
 #include "../precomp.hpp"
 #include "../op_inf_engine.hpp"
 #include "layers_common.hpp"
+#include "../ie_ngraph.hpp"
 
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
@@ -26,7 +27,9 @@ public:
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -73,6 +76,18 @@ public:
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
 #endif  // HAVE_INF_ENGINE
+
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,  // Builds an ngraph Constant from blobs[0]; `inputs` and `nodes` are unused.
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
+                                                           getShape<size_t>(blobs[0]),
+                                                           blobs[0].data);  // NOTE(review): element type is fixed to f32; presumably blobs[0] is CV_32F - confirm
+        return Ptr<BackendNode>(new InfEngineNgraphNode(node));
+    }
+#endif  // HAVE_DNN_NGRAPH
 };
 
 Ptr<Layer> ConstLayer::create(const LayerParams& params)
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index 522398746a..f1cb2663eb 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -234,7 +234,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height)
 
     Mat sample = imread(findDataFile("dnn/street.png"));
     Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false);
-    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 0.0;
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.013 : 0.0;
     float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0;
     processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt",
                inp, "detection_out", "", l1, lInf);
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index ecbf776184..e8064f1c90 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -158,12 +158,12 @@ TEST_P(Test_TensorFlow_layers, padding)
     runTensorFlowNet("spatial_padding");
     runTensorFlowNet("mirror_pad");
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019020000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+    if (target == DNN_TARGET_MYRIAD)
     {
-        if (target == DNN_TARGET_MYRIAD)
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-        if (target == DNN_TARGET_OPENCL_FP16)
-            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
     }
 #endif
     runTensorFlowNet("keras_pad_concat");
@@ -784,6 +784,8 @@ TEST_P(Test_TensorFlow_layers, split)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     runTensorFlowNet("split");
 }
 
@@ -922,7 +924,7 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
 
     if (target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     applyTestTag(CV_TEST_TAG_MEMORY_1GB, CV_TEST_TAG_DEBUG_VERYLONG);
     Mat img = imread(findDataFile("dnn/street.png"));
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 55ce803db8..4b89afc331 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -360,9 +360,12 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
         throw SkipTestException("");
     }
 #endif
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU)
     {
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        throw SkipTestException("");
     }
 
     Net net;

From af9ded89d0dbb654c14fcf32bb4ce5b1f9417530 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Wed, 26 Feb 2020 18:43:31 +0300
Subject: [PATCH 18/18] core: fix build getNumberOfCPUs for JavaScript

---
 modules/core/src/parallel.cpp | 67 ++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 0ac4c4aca3..e7393ed584 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -747,7 +747,7 @@ int cv::getThreadNum(void)
 }
 
 
-#if defined __linux__ || defined __GLIBC__ || defined __EMSCRIPTEN__ || defined __HAIKU__ || defined __ANDROID__
+#if defined __linux__ || defined __GLIBC__ || defined __HAIKU__ || defined __ANDROID__
   #define CV_CPU_GROUPS_1
 #endif
 
@@ -861,6 +861,7 @@ int cv::getNumberOfCPUs(void)
 #endif
 
 #if defined _WIN32
+
     SYSTEM_INFO sysinfo;
 #if (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_X64) || defined(WINRT)) && _WIN32_WINNT >= 0x501
     GetNativeSystemInfo( &sysinfo );
@@ -870,35 +871,8 @@ int cv::getNumberOfCPUs(void)
     unsigned ncpus_sysinfo = sysinfo.dwNumberOfProcessors < 0 ? 1 : sysinfo.dwNumberOfProcessors; /* Just a fail safe */
     ncpus = minNonZero(ncpus, ncpus_sysinfo);
 
-#elif defined CV_CPU_GROUPS_1
-
-#if defined CV_HAVE_CGROUPS
-    static unsigned ncpus_impl_cpuset = (unsigned)getNumberOfCPUsImpl("/sys/fs/cgroup/cpuset/cpuset.cpus");
-    ncpus = minNonZero(ncpus, ncpus_impl_cpuset);
-
-    static unsigned ncpus_impl_cfs = getNumberOfCPUsCFS();
-    ncpus = minNonZero(ncpus, ncpus_impl_cfs);
-#endif
-
-    static unsigned ncpus_impl_devices = (unsigned)getNumberOfCPUsImpl("/sys/devices/system/cpu/online");
-    ncpus = minNonZero(ncpus, ncpus_impl_devices);
-
-#if defined _GNU_SOURCE \
-    && !defined(__ANDROID__)  // TODO: add check for modern Android NDK
-
-    cpu_set_t cpu_set;
-    if (0 == sched_getaffinity(0, sizeof(cpu_set), &cpu_set))
-    {
-        unsigned cpu_count_cpu_set = CPU_COUNT(&cpu_set);
-        ncpus = minNonZero(ncpus, cpu_count_cpu_set);
-    }
-
-#endif
-
-    static unsigned cpu_count_sysconf = (unsigned)sysconf( _SC_NPROCESSORS_ONLN );
-    ncpus = minNonZero(ncpus, cpu_count_sysconf);
-
 #elif defined __APPLE__
+
     int numCPU=0;
     int mib[4];
     size_t len = sizeof(numCPU);
@@ -920,7 +894,42 @@ int cv::getNumberOfCPUs(void)
     }
 
     ncpus = minNonZero(ncpus, (unsigned)numCPU);
+
+#elif defined CV_CPU_GROUPS_1
+
+#if defined CV_HAVE_CGROUPS
+    static unsigned ncpus_impl_cpuset = (unsigned)getNumberOfCPUsImpl("/sys/fs/cgroup/cpuset/cpuset.cpus");
+    ncpus = minNonZero(ncpus, ncpus_impl_cpuset);
+
+    static unsigned ncpus_impl_cfs = getNumberOfCPUsCFS();
+    ncpus = minNonZero(ncpus, ncpus_impl_cfs);
 #endif
+
+    static unsigned ncpus_impl_devices = (unsigned)getNumberOfCPUsImpl("/sys/devices/system/cpu/online");
+    ncpus = minNonZero(ncpus, ncpus_impl_devices);
+
+#endif
+
+#if defined _GNU_SOURCE \
+    && !defined(__EMSCRIPTEN__) \
+    && !defined(__ANDROID__)  // TODO: add check for modern Android NDK
+
+    cpu_set_t cpu_set;
+    if (0 == sched_getaffinity(0, sizeof(cpu_set), &cpu_set))
+    {
+        unsigned cpu_count_cpu_set = CPU_COUNT(&cpu_set);
+        ncpus = minNonZero(ncpus, cpu_count_cpu_set);
+    }
+
+#endif
+
+#if !defined(_WIN32) && !defined(__APPLE__)
+
+    static unsigned cpu_count_sysconf = (unsigned)sysconf( _SC_NPROCESSORS_ONLN );
+    ncpus = minNonZero(ncpus, cpu_count_sysconf);
+
+#endif
+
     return ncpus != 0 ? ncpus : 1;
 }