From e8283f94ed122772baad0e517df59bde0048ab9e Mon Sep 17 00:00:00 2001
From: tribta <joaocartuchoo@gmail.com>
Date: Wed, 19 Jun 2019 19:52:09 +0100
Subject: [PATCH 01/14] Fix Python code, for the tutorial_laplace_operator, to
 get the same result between the Cpp, Java and Python

---
 samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py b/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py
index a90af4da1d..e3b13ca2e6 100644
--- a/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py
+++ b/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py
@@ -40,7 +40,7 @@ def main(argv):
 
     # [laplacian]
     # Apply Laplace function
-    dst = cv.Laplacian(src_gray, ddepth, kernel_size)
+    dst = cv.Laplacian(src_gray, ddepth, ksize=kernel_size)
     # [laplacian]
 
     # [convert]

From 75f4c1abf2c7984e9905bff85645086d2845669b Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Mon, 24 Jun 2019 21:55:32 +0300
Subject: [PATCH 02/14] Enable some tests for Inference Engine backend

---
 modules/dnn/test/test_backends.cpp       |  3 +-
 modules/dnn/test/test_caffe_importer.cpp | 24 +++++++++-------
 modules/dnn/test/test_layers.cpp         | 32 +---------------------
 modules/dnn/test/test_onnx_importer.cpp  |  2 +-
 modules/dnn/test/test_tf_importer.cpp    | 35 ++++++++----------------
 5 files changed, 30 insertions(+), 66 deletions(-)

diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index c1dcbb66b1..831f754bc7 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -383,7 +383,8 @@ TEST_P(DNNTestNetwork, DenseNet_121)
         l1 = 0.1; lInf = 0.6;
     }
     processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "", l1, lInf);
-    expectNoFallbacksFromIE(net);
+    if (target != DNN_TARGET_MYRIAD || getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        expectNoFallbacksFromIE(net);
 }
 
 TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index c2d4673876..a5cae50621 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -286,19 +286,22 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
     zerosOut = zerosOut.reshape(1, zerosOut.total() / 7);
 
     const int numDetections = zerosOut.rows;
-    ASSERT_NE(numDetections, 0);
-    for (int i = 0; i < numDetections; ++i)
+    // TODO: fix it (the zero-input detections check below is skipped on Myriad X for now)
+    if (targetId != DNN_TARGET_MYRIAD ||
+        getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
     {
-        float confidence = zerosOut.ptr<float>(i)[2];
-        ASSERT_EQ(confidence, 0);
+        ASSERT_NE(numDetections, 0);
+        for (int i = 0; i < numDetections; ++i)
+        {
+            float confidence = zerosOut.ptr<float>(i)[2];
+            ASSERT_EQ(confidence, 0);
+        }
     }
 
-    // There is something wrong with Reshape layer in Myriad plugin and
-    // regression with DLIE/OCL_FP16 target.
+    // There is something wrong with Reshape layer in Myriad plugin (the OpenCL FP16 target still returns early here as well).
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
     {
-        if ((targetId == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2) ||
-            targetId == DNN_TARGET_OPENCL_FP16)
+        if (targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_OPENCL_FP16)
             return;
     }
 
@@ -465,7 +468,7 @@ TEST_P(Test_Caffe_nets, Colorization)
     double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5.3 : 3e-3;
     if (target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
     {
-        l1 = 0.6; lInf = 15;
+        l1 = 0.5; lInf = 11;
     }
     normAssert(out, ref, "", l1, lInf);
     expectNoFallbacksFromIE(net);
@@ -500,7 +503,8 @@ TEST_P(Test_Caffe_nets, DenseNet_121)
         l1 = 0.11; lInf = 0.5;
     }
     normAssert(out, ref, "", l1, lInf);
-    expectNoFallbacksFromIE(net);
+    if (target != DNN_TARGET_MYRIAD || getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        expectNoFallbacksFromIE(net);
 }
 
 TEST(Test_Caffe, multiple_inputs)
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 18f47c0a2e..02d33b4c36 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -141,8 +141,6 @@ TEST_P(Test_Caffe_layers, Convolution)
 
 TEST_P(Test_Caffe_layers, DeConvolution)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_CPU)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE);  // TODO IE_CPU
     testLayerUsingCaffeModels("layer_deconvolution", true, false);
 }
 
@@ -246,15 +244,8 @@ TEST_P(Test_Caffe_layers, Concat)
 
 TEST_P(Test_Caffe_layers, Fused_Concat)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE)  // Test is disabled for DLIE due negative_slope parameter
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE, CV_TEST_TAG_DNN_SKIP_IE_2019R1, CV_TEST_TAG_DNN_SKIP_IE_2019R1_1);
-#endif
-
-#if defined(INF_ENGINE_RELEASE)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
         applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
-#endif
 
     checkBackend();
 
@@ -319,26 +310,6 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc)
     testLayerUsingCaffeModels("layer_prelu_fc", true, false, l1, lInf);
 }
 
-//template<typename XMat>
-//static void test_Layer_Concat()
-//{
-//    Matx21f a(1.f, 1.f), b(2.f, 2.f), c(3.f, 3.f);
-//    std::vector<Blob> res(1), src = { Blob(XMat(a)), Blob(XMat(b)), Blob(XMat(c)) };
-//    Blob ref(XMat(Matx23f(1.f, 2.f, 3.f, 1.f, 2.f, 3.f)));
-//
-//    runLayer(ConcatLayer::create(1), src, res);
-//    normAssert(ref, res[0]);
-//}
-//TEST(Layer_Concat, Accuracy)
-//{
-//    test_Layer_Concat<Mat>());
-//}
-//OCL_TEST(Layer_Concat, Accuracy)
-//{
-//    OCL_ON(test_Layer_Concat<Mat>());
-//    );
-//}
-
 TEST_P(Test_Caffe_layers, Reshape_Split_Slice)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE)
@@ -774,9 +745,8 @@ TEST_P(Test_Caffe_layers, Average_pooling_kernel_area)
 // Test PriorBoxLayer in case of no aspect ratios (just squared proposals).
 TEST_P(Test_Caffe_layers, PriorBox_squares)
 {
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
-
     LayerParams lp;
     lp.name = "testPriorBox";
     lp.type = "PriorBox";
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 05fa79dcf1..c99b8cf431 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -225,7 +225,7 @@ TEST_P(Test_ONNX_layers, Multiplication)
 
 TEST_P(Test_ONNX_layers, Constant)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
             && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_2018R5);
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 7b311fa294..dd5d871d71 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -151,12 +151,6 @@ TEST_P(Test_TensorFlow_layers, padding)
 
 TEST_P(Test_TensorFlow_layers, padding_same)
 {
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#endif
     // Reference output values are in range [0.0006, 2.798]
     runTensorFlowNet("padding_same");
 }
@@ -432,14 +426,6 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
 {
     checkBackend();
-
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#endif
-
     std::string proto = findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt");
     std::string model = findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", false);
 
@@ -456,7 +442,17 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
     Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
     float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1.5e-5;
     float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 1e-3;
-    normAssertDetections(ref, out, "", 0.3, scoreDiff, iouDiff);
+    float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
+
+#if defined(INF_ENGINE_RELEASE)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
+            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+    {   // braces required: all three relaxed tolerances apply to Myriad X only
+        scoreDiff = 0.061; iouDiff = 0.12;
+        detectionConfThresh = 0.36;
+    }
+#endif
+    normAssertDetections(ref, out, "", detectionConfThresh, scoreDiff, iouDiff);
     expectNoFallbacksFromIE(net);
 }
 
@@ -648,15 +644,8 @@ TEST_P(Test_TensorFlow_layers, fp16_weights)
 
 TEST_P(Test_TensorFlow_layers, fp16_padding_same)
 {
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#endif
-
     // Reference output values are in range [-3.504, -0.002]
-    runTensorFlowNet("fp16_padding_same", false, 6e-4, 4e-3);
+    runTensorFlowNet("fp16_padding_same", false, 7e-4, 4e-3);
 }
 
 TEST_P(Test_TensorFlow_layers, defun)

From 0a45b8c478e1d92348e028aa2c640cffe691fdfb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Br=C3=BCns?= <stefan.bruens@rwth-aachen.de>
Date: Fri, 28 Jun 2019 15:36:18 +0200
Subject: [PATCH 03/14] Handle absolute OPENCV_INCLUDE_INSTALL_PATH correctly

In case OPENCV_INCLUDE_INSTALL_PATH is absolute (i.e. starts with a "/"),
the path ends up with a double "/".

While this is mostly equivalent to a single slash, it may have a nasty
side effect when:
- OpenCV_INSTALL_PATH is empty
- OPENCV_INCLUDE_INSTALL_PATH is "/usr/include"
- the calling build script uses "-isystem" to specify the path to the
  headers of dependencies (to avoid warnings)

Specifying "-isystem /usr/include" breaks the path ordering, and GCC can
no longer find its "stdlib.h", thus CMake filters such statements.
Unfortunately it fails to do so when using "//usr/include".
---
 cmake/OpenCVGenConfig.cmake | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cmake/OpenCVGenConfig.cmake b/cmake/OpenCVGenConfig.cmake
index b2ca82bad0..2c7c42b719 100644
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@@ -68,7 +68,11 @@ configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.
 #  Part 2/3: ${BIN_DIR}/unix-install/OpenCVConfig.cmake -> For use *with* "make install"
 # -------------------------------------------------------------------------------------------
 file(RELATIVE_PATH OpenCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}/${OPENCV_CONFIG_INSTALL_PATH}/" ${CMAKE_INSTALL_PREFIX})
-set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}\" \"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}/opencv\"")
+if (IS_ABSOLUTE "${OPENCV_INCLUDE_INSTALL_PATH}")  # quoted: an empty value or a path with spaces/semicolons must stay a single argument
+  set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"${OPENCV_INCLUDE_INSTALL_PATH}\" \"${OPENCV_INCLUDE_INSTALL_PATH}/opencv\"")
+else()
+  set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}\" \"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}/opencv\"")
+endif()
 
 if(USE_IPPICV)
   file(RELATIVE_PATH IPPICV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}" "${IPPICV_INSTALL_PATH}")

From e3aa96ccf2490c900867e4e3b355087582466441 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 28 Jun 2019 20:33:17 +0000
Subject: [PATCH 04/14] cmake: normalize include directories in
 OpenCVConfig.cmake

---
 cmake/templates/OpenCVConfig.cmake.in | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in
index c255fc2677..ef299a4515 100644
--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@@ -106,7 +106,21 @@ set(OpenCV_SHARED @BUILD_SHARED_LIBS@)
 set(OpenCV_USE_MANGLED_PATHS @OpenCV_USE_MANGLED_PATHS_CONFIGCMAKE@)
 
 set(OpenCV_LIB_COMPONENTS @OPENCV_MODULES_CONFIGCMAKE@)
-set(OpenCV_INCLUDE_DIRS @OpenCV_INCLUDE_DIRS_CONFIGCMAKE@)
+set(__OpenCV_INCLUDE_DIRS @OpenCV_INCLUDE_DIRS_CONFIGCMAKE@)
+
+set(OpenCV_INCLUDE_DIRS "")
+foreach(d ${__OpenCV_INCLUDE_DIRS})
+  get_filename_component(__d "${d}" REALPATH)
+  if(NOT EXISTS "${__d}")
+    if(NOT OpenCV_FIND_QUIETLY)
+      message(WARNING "OpenCV: Include directory doesn't exist: '${d}'. OpenCV installation may be broken. Skip...")
+    endif()
+  else()
+    list(APPEND OpenCV_INCLUDE_DIRS "${__d}")
+  endif()
+endforeach()
+unset(__d)
+
 
 if(NOT TARGET opencv_core)
   include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${OpenCV_MODULES_SUFFIX}.cmake)

From bc79f477dd4cbff875557ffa4a879b4a0dddf76f Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 28 Jun 2019 22:39:38 +0000
Subject: [PATCH 05/14] cmake: support rpath-link linker option

- builds with CMAKE_SKIP_RPATH=ON should properly find project .so files from <build>/lib directory
- $ORIGIN doesn't work properly in cross-compilation mode
---
 cmake/OpenCVCompilerOptions.cmake | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 23b08b2f36..0b9d6692a4 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -396,3 +396,23 @@ macro(ocv_add_modules_compiler_options)
     add_definitions(-DOPENCV_ENABLE_MEMORY_SANITIZER=1)
   endif()
 endmacro()
+
+# adjust -Wl,-rpath-link
+if(CMAKE_SKIP_RPATH)
+  if((NOT CMAKE_CROSSCOMPILING OR OPENCV_ENABLE_LINKER_RPATH_LINK_ORIGIN) AND NOT OPENCV_SKIP_LINKER_RPATH_LINK_ORIGIN)
+    if(DEFINED CMAKE_SHARED_LIBRARY_RPATH_ORIGIN_TOKEN)
+      list(APPEND CMAKE_PLATFORM_RUNTIME_PATH "${CMAKE_SHARED_LIBRARY_RPATH_ORIGIN_TOKEN}")
+    else()
+      list(APPEND CMAKE_PLATFORM_RUNTIME_PATH "\$ORIGIN")
+    endif()
+  elseif(NOT OPENCV_SKIP_LINKER_RPATH_LINK_BINARY_LIB)
+    list(APPEND CMAKE_PLATFORM_RUNTIME_PATH "${LIBRARY_OUTPUT_PATH}")
+  endif()
+endif()
+if(OPENCV_EXTRA_RPATH_LINK_PATH)
+  string(REPLACE ":" ";" OPENCV_EXTRA_RPATH_LINK_PATH_ "${OPENCV_EXTRA_RPATH_LINK_PATH}")
+  list(APPEND CMAKE_PLATFORM_RUNTIME_PATH ${OPENCV_EXTRA_RPATH_LINK_PATH_})
+  if(NOT CMAKE_EXECUTABLE_RPATH_LINK_CXX_FLAG)
+    message(WARNING "OPENCV_EXTRA_RPATH_LINK_PATH may not work properly because CMAKE_EXECUTABLE_RPATH_LINK_CXX_FLAG is not defined (not supported)")
+  endif()
+endif()

From 43e457c8831aead88cbb0450e099488976b497e7 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Fri, 28 Jun 2019 19:32:59 +0300
Subject: [PATCH 06/14] 3rdparty: TBB version 2018u1 => 2019u8

- make it configurable via OPENCV_TBB_RELEASE + OPENCV_TBB_RELEASE_MD5
- remove legacy support
---
 3rdparty/tbb/CMakeLists.txt       | 40 +++++++++++++-----------------
 3rdparty/tbb/android_additional.h | 41 -------------------------------
 3rdparty/tbb/arm_linux_stub.cpp   | 11 ---------
 3 files changed, 17 insertions(+), 75 deletions(-)
 delete mode 100644 3rdparty/tbb/android_additional.h
 delete mode 100644 3rdparty/tbb/arm_linux_stub.cpp

diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index a05d0d3c1a..a3c0a812ce 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -5,19 +5,14 @@ if (WIN32 AND NOT ARM)
   message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!")
 endif()
 
-set(tbb_filename "2018_U1.tar.gz")
-set(tbb_subdir "tbb-2018_U1")
-set(tbb_md5 "b2f2fa09adf44a22f4024049907f774b")
-
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702
-    -Wshadow
-    -Wunused-parameter
-    -Wmissing-prototypes  # MacOSX
-)
+ocv_update(OPENCV_TBB_RELEASE "2019_U8")
+ocv_update(OPENCV_TBB_RELEASE_MD5 "7c371d0f62726154d2c568a85697a0ad")
+ocv_update(OPENCV_TBB_FILENAME "${OPENCV_TBB_RELEASE}.tar.gz")
+ocv_update(OPENCV_TBB_SUBDIR "tbb-${OPENCV_TBB_RELEASE}")
 
 set(tbb_src_dir "${OpenCV_BINARY_DIR}/3rdparty/tbb")
-ocv_download(FILENAME ${tbb_filename}
-             HASH ${tbb_md5}
+ocv_download(FILENAME ${OPENCV_TBB_FILENAME}
+             HASH ${OPENCV_TBB_RELEASE_MD5}
              URL
                "${OPENCV_TBB_URL}"
                "$ENV{OPENCV_TBB_URL}"
@@ -29,7 +24,7 @@ ocv_download(FILENAME ${tbb_filename}
 if(NOT res)
   return()
 endif()
-set(tbb_src_dir "${tbb_src_dir}/${tbb_subdir}")
+set(tbb_src_dir "${tbb_src_dir}/${OPENCV_TBB_SUBDIR}")
 
 ocv_include_directories("${tbb_src_dir}/include"
                         "${tbb_src_dir}/src/"
@@ -82,19 +77,20 @@ endif()
 
 if(ANDROID_COMPILER_IS_CLANG)
   add_definitions(-D__TBB_GCC_BUILTIN_ATOMICS_PRESENT=1)
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-prototypes)
 endif()
 
+ocv_warnings_disable(CMAKE_CXX_FLAGS
+    /wd4702
+    -Wshadow
+    -Wunused-parameter
+    -Wclass-memaccess                  # TBB 2018 under GCC 8+
+    -Wimplicit-fallthrough             # TBB 2018 under GCC 7+
+    -Wmissing-prototypes               # MacOSX, Android/Clang
+    -Wundef -Wmissing-declarations     # TBB 2019
+)
+
 set(TBB_SOURCE_FILES ${lib_srcs} ${lib_hdrs})
 
-if (ARM AND NOT WIN32)
-  if (NOT ANDROID)
-    set(TBB_SOURCE_FILES ${TBB_SOURCE_FILES} "${CMAKE_CURRENT_SOURCE_DIR}/arm_linux_stub.cpp")
-  endif()
-  set(TBB_SOURCE_FILES ${TBB_SOURCE_FILES} "${CMAKE_CURRENT_SOURCE_DIR}/android_additional.h")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include \"${CMAKE_CURRENT_SOURCE_DIR}/android_additional.h\"")
-endif()
-
 set(tbb_version_file "version_string.ver")
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${tbb_version_file}.cmakein" "${CMAKE_CURRENT_BINARY_DIR}/${tbb_version_file}" @ONLY)
 list(APPEND TBB_SOURCE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${tbb_version_file}")
@@ -122,8 +118,6 @@ else()
   target_link_libraries(tbb c m dl)
 endif()
 
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations)
-
 # filter out flags that are not handled well by the TBB code
 foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
   string(REPLACE "-Werror=non-virtual-dtor" "" ${var} "${${var}}")
diff --git a/3rdparty/tbb/android_additional.h b/3rdparty/tbb/android_additional.h
deleted file mode 100644
index 2faa495032..0000000000
--- a/3rdparty/tbb/android_additional.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#include <cstdio>
-
-static inline int getPossibleCPUs()
-{
-   FILE* cpuPossible = fopen("/sys/devices/system/cpu/possible", "r");
-   if(!cpuPossible)
-       return 1;
-
-   char buf[2000]; //big enough for 1000 CPUs in worst possible configuration
-   char* pbuf = fgets(buf, sizeof(buf), cpuPossible);
-   fclose(cpuPossible);
-   if(!pbuf)
-      return 1;
-
-   //parse string of form "0-1,3,5-7,10,13-15"
-   int cpusAvailable = 0;
-
-   while(*pbuf)
-   {
-      const char* pos = pbuf;
-      bool range = false;
-      while(*pbuf && *pbuf != ',')
-      {
-          if(*pbuf == '-') range = true;
-          ++pbuf;
-      }
-      if(*pbuf) *pbuf++ = 0;
-      if(!range)
-        ++cpusAvailable;
-      else
-      {
-          int rstart = 0, rend = 0;
-          sscanf(pos, "%d-%d", &rstart, &rend);
-          cpusAvailable += rend - rstart + 1;
-      }
-
-   }
-   return cpusAvailable ? cpusAvailable : 1;
-}
-
-#define __TBB_HardwareConcurrency() getPossibleCPUs()
diff --git a/3rdparty/tbb/arm_linux_stub.cpp b/3rdparty/tbb/arm_linux_stub.cpp
deleted file mode 100644
index 93cc336be4..0000000000
--- a/3rdparty/tbb/arm_linux_stub.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "tbb/tbb_misc.h"
-
-namespace tbb {
-namespace internal {
-
-void affinity_helper::protect_affinity_mask(bool) {}
-affinity_helper::~affinity_helper() {}
-void destroy_process_mask() {}
-
-}
-}

From e9a2e665b25efc3fd2c8d800c85cf74eab7997dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Br=C3=BCns?= <stefan.bruens@rwth-aachen.de>
Date: Sat, 29 Jun 2019 20:28:24 +0200
Subject: [PATCH 07/14] Explicitly default operator= for Vec<T, n>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to the explicitly declared copy constructor Vec<T, n>::Vec(Vec<T, n>&),
GCC 9 warns if there is no assignment operator, as having one typically
requires the other (rule-of-three, constructor/destructor/assignment).

As the values are just a plain array the default assignment operator does
the right thing. Tell the compiler explicitly to default it.

Signed-off-by: Stefan Brüns <stefan.bruens@rwth-aachen.de>
---
 modules/core/include/opencv2/core/matx.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index d8e17e7015..f017b0910f 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -391,6 +391,10 @@ public:
     const _Tp& operator ()(int i) const;
     _Tp& operator ()(int i);
 
+#ifdef CV_CXX11
+    Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default;
+#endif
+
     Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp);
     Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp);
     template<typename _T2> Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp);

From 3e4a195b6101efcccebfb3ad975e5ef480819bf9 Mon Sep 17 00:00:00 2001
From: StefanBruens <stefan.bruens@rwth-aachen.de>
Date: Sun, 30 Jun 2019 18:04:25 +0200
Subject: [PATCH 08/14] Merge pull request #14936 from
 StefanBruens:crosscorr_cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crosscorr cleanup (#14936)

* Simplify code for convolution destination type/size

For the 2d filter code, destination size equals source size, and the
crossCorr function even (re-)creates the output matrix with the given size.

The number of channels also has to match. The destination type() is the
one used to create the output matrix, so we can use its type() here.

This is a preparatory patch.

Signed-off-by: Stefan Brüns <stefan.bruens@rwth-aachen.de>

* Remove redundant destination size and type parameters from crossCorr

All calling sites of crossCorr already use (...,
mat, mat.size(), mat.type(), ...), so the parameters are redundant.

Signed-off-by: Stefan Brüns <stefan.bruens@rwth-aachen.de>
---
 modules/imgproc/src/filter.dispatch.cpp |  8 ++------
 modules/imgproc/src/filterengine.hpp    |  1 -
 modules/imgproc/src/templmatch.cpp      | 19 ++++++++-----------
 3 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp
index 24e1a74e88..c21efe181c 100644
--- a/modules/imgproc/src/filter.dispatch.cpp
+++ b/modules/imgproc/src/filter.dispatch.cpp
@@ -1160,9 +1160,7 @@ static bool dftFilter2D(int stype, int dtype, int kernel_type,
             corrDepth = ddepth == CV_64F ? CV_64F : CV_32F;
             temp.create(Size(width, height), CV_MAKETYPE(corrDepth, dst_channels));
         }
-        crossCorr(src, kernel, temp, src.size(),
-                  CV_MAKETYPE(corrDepth, src_channels),
-                  anchor, 0, borderType);
+        crossCorr(src, kernel, temp, anchor, 0, borderType);
         add(temp, delta, temp);
         if (temp.data != dst_data) {
             temp.convertTo(dst, dst.type());
@@ -1172,9 +1170,7 @@ static bool dftFilter2D(int stype, int dtype, int kernel_type,
             temp = Mat(Size(width, height), dtype, dst_data, dst_step);
         else
             temp.create(Size(width, height), dtype);
-        crossCorr(src, kernel, temp, src.size(),
-                  CV_MAKETYPE(ddepth, src_channels),
-                  anchor, delta, borderType);
+        crossCorr(src, kernel, temp, anchor, delta, borderType);
         if (temp.data != dst_data)
             temp.copyTo(dst);
     }
diff --git a/modules/imgproc/src/filterengine.hpp b/modules/imgproc/src/filterengine.hpp
index 019c1d5d2d..9ec0b6e8b1 100644
--- a/modules/imgproc/src/filterengine.hpp
+++ b/modules/imgproc/src/filterengine.hpp
@@ -366,7 +366,6 @@ static inline Point normalizeAnchor( Point anchor, Size ksize )
 
 void preprocess2DKernel( const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs );
 void crossCorr( const Mat& src, const Mat& templ, Mat& dst,
-               Size corrsize, int ctype,
                Point anchor=Point(0,0), double delta=0,
                int borderType=BORDER_REFLECT_101 );
 
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index b5a08f087a..539c1e64d8 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -564,7 +564,6 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _
 #include "opencv2/core/hal/hal.hpp"
 
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
-                Size corrsize, int ctype,
                 Point anchor, double delta, int borderType )
 {
     const double blockScale = 4.5;
@@ -574,7 +573,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     Mat templ = _templ;
     int depth = img.depth(), cn = img.channels();
     int tdepth = templ.depth(), tcn = templ.channels();
-    int cdepth = CV_MAT_DEPTH(ctype), ccn = CV_MAT_CN(ctype);
+    int cdepth = corr.depth(), ccn = corr.channels();
 
     CV_Assert( img.dims <= 2 && templ.dims <= 2 && corr.dims <= 2 );
 
@@ -585,13 +584,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     }
 
     CV_Assert( depth == tdepth || tdepth == CV_32F);
-    CV_Assert( corrsize.height <= img.rows + templ.rows - 1 &&
-               corrsize.width <= img.cols + templ.cols - 1 );
+    CV_Assert( corr.rows <= img.rows + templ.rows - 1 &&
+               corr.cols <= img.cols + templ.cols - 1 );
 
     CV_Assert( ccn == 1 || delta == 0 );
 
-    corr.create(corrsize, ctype);
-
     int maxDepth = depth > CV_8S ? CV_64F : std::max(std::max(CV_32F, tdepth), cdepth);
     Size blocksize, dftsize;
 
@@ -815,8 +812,8 @@ static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _
         Mat mask2_templ = templ.mul(mask2);
 
         Mat corr(corrSize, CV_32F);
-        crossCorr( img, mask2_templ, corr, corr.size(), corr.type(), Point(0,0), 0, 0 );
-        crossCorr( img2, mask, result, result.size(), result.type(), Point(0,0), 0, 0 );
+        crossCorr( img, mask2_templ, corr, Point(0,0), 0, 0 );
+        crossCorr( img2, mask, result, Point(0,0), 0, 0 );
 
         result -= corr * 2;
         result += templSum2;
@@ -830,8 +827,8 @@ static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _
         }
 
         Mat corr(corrSize, CV_32F);
-        crossCorr( img2, mask2, corr, corr.size(), corr.type(), Point(0,0), 0, 0 );
-        crossCorr( img, mask_templ, result, result.size(), result.type(), Point(0,0), 0, 0 );
+        crossCorr( img2, mask2, corr, Point(0,0), 0, 0 );
+        crossCorr( img, mask_templ, result, Point(0,0), 0, 0 );
 
         sqrt(corr, corr);
         result = result.mul(1/corr);
@@ -1130,7 +1127,7 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result,
 
     CV_IPP_RUN_FAST(ipp_matchTemplate(img, templ, result, method))
 
-    crossCorr( img, templ, result, result.size(), result.type(), Point(0,0), 0, 0);
+    crossCorr( img, templ, result, Point(0,0), 0, 0);
 
     common_matchTemplate(img, templ, result, method, cn);
 }

From 44836c7f7806121218350cf91f7285e405008ad9 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Mon, 1 Jul 2019 18:17:03 +0300
Subject: [PATCH 09/14] core: evaluate CV_Error() parameters during static
 scans

---
 modules/core/include/opencv2/core/base.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 0179d18aec..546140e9f1 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -326,8 +326,8 @@ CV_INLINE CV_NORETURN void errorNoReturn(int _code, const String& _err, const ch
 
 // In practice, some macro are not processed correctly (noreturn is not detected).
 // We need to use simplified definition for them.
-#define CV_Error(...) do { abort(); } while (0)
-#define CV_Error_( code, args ) do { cv::format args; abort(); } while (0)
+#define CV_Error(code, msg) do { (void)(code); (void)(msg); abort(); } while (0)
+#define CV_Error_(code, args) do { (void)(code); (void)(cv::format args); abort(); } while (0)
 #define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0)
 #define CV_ErrorNoReturn CV_Error
 #define CV_ErrorNoReturn_ CV_Error_

From 9befb7a1d7b05cb528bfd24406074a854d168f59 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <terfendail@users.noreply.github.com>
Date: Mon, 1 Jul 2019 19:53:51 +0300
Subject: [PATCH 10/14] Merge pull request #14916 from
 terfendail:wsignmask_deprecated

* Avoid using v_signmask universal intrinsic and mark it as deprecated

* Renamed v_find_negative to v_scan_forward
---
 modules/calib3d/src/stereobm.cpp              |   4 +-
 modules/calib3d/src/stereosgbm.cpp            |   4 +-
 .../core/include/opencv2/core/hal/intrin.hpp  |  54 ++--
 .../include/opencv2/core/hal/intrin_avx.hpp   |  11 +
 .../opencv2/core/hal/intrin_avx512.hpp        |  20 +-
 .../include/opencv2/core/hal/intrin_cpp.hpp   |  18 ++
 .../include/opencv2/core/hal/intrin_neon.hpp  |  17 +-
 .../include/opencv2/core/hal/intrin_sse.hpp   |  11 +
 .../include/opencv2/core/hal/intrin_vsx.hpp   |  11 +
 modules/features2d/src/fast.cpp               |  35 +-
 modules/imgproc/src/canny.cpp                 | 298 ++++++------------
 modules/imgproc/src/contours.cpp              |  46 +--
 modules/imgproc/src/hough.cpp                 | 112 +++----
 13 files changed, 305 insertions(+), 336 deletions(-)

diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index 0257fd572b..64a7071ca2 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -534,12 +534,12 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                     v_expand(sad8, sad4_l, sad4_h);
                     mask4 = thresh4 > sad4_l;
                     mask4 = mask4 & ((d1 > d4) | (d4 > d2));
-                    if( v_signmask(mask4) )
+                    if( v_check_any(mask4) )
                         break;
                     d4 += dd_4;
                     mask4 = thresh4 > sad4_h;
                     mask4 = mask4 & ((d1 > d4) | (d4 > d2));
-                    if( v_signmask(mask4) )
+                    if( v_check_any(mask4) )
                         break;
                     d4 += dd_4;
                 }
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index afc57c4cb2..88b28ff598 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -2013,14 +2013,14 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
 
                         mask = cost1 < thresh_reg;
                         mask = mask & ( (cur_d<d1) | (cur_d>d2) );
-                        if( v_signmask(mask) )
+                        if( v_check_any(mask) )
                             break;
 
                         cur_d = cur_d+eight_reg;
 
                         mask = cost2 < thresh_reg;
                         mask = mask & ( (cur_d<d1) | (cur_d>d2) );
-                        if( v_signmask(mask) )
+                        if( v_check_any(mask) )
                             break;
 
                         cur_d = cur_d+eight_reg;
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index adce1b3fb1..a96cfbdfb6 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -55,6 +55,34 @@
 #define OPENCV_HAL_NOP(a) (a)
 #define OPENCV_HAL_1ST(a, b) (a)
 
+namespace {
+inline unsigned int trailingZeros32(unsigned int value) {
+#if defined(_MSC_VER)
+#if (_MSC_VER < 1700) || defined(_M_ARM)
+    unsigned long index = 0;
+    _BitScanForward(&index, value);
+    return (unsigned int)index;
+#elif defined(__clang__)
+    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
+    return value ? __builtin_ctz(value) : 32;
+#else
+    return _tzcnt_u32(value);
+#endif
+#elif defined(__GNUC__) || defined(__GNUG__)
+    return __builtin_ctz(value);
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    return _bit_scan_forward(value);
+#elif defined(__clang__)
+    return llvm.cttz.i32(value, true);
+#else
+    static const int MultiplyDeBruijnBitPosition[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
+    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
+#endif
+}
+}
+
 // unlike HAL API, which is in cv::hal,
 // we put intrinsics into cv namespace to make its
 // access from within opencv code more accessible
@@ -419,32 +447,6 @@ namespace CV__SIMD_NAMESPACE {
 using namespace CV__SIMD_NAMESPACE;
 #endif
 
-inline unsigned int trailingZeros32(unsigned int value) {
-#if defined(_MSC_VER)
-#if (_MSC_VER < 1700) || defined(_M_ARM)
-    unsigned long index = 0;
-    _BitScanForward(&index, value);
-    return (unsigned int)index;
-#elif defined(__clang__)
-    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
-    return value ? __builtin_ctz(value) : 32;
-#else
-    return _tzcnt_u32(value);
-#endif
-#elif defined(__GNUC__) || defined(__GNUG__)
-    return __builtin_ctz(value);
-#elif defined(__ICC) || defined(__INTEL_COMPILER)
-    return _bit_scan_forward(value);
-#elif defined(__clang__)
-    return llvm.cttz.i32(value, true);
-#else
-    static const int MultiplyDeBruijnBitPosition[32] = {
-        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
-        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
-    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
-#endif
-}
-
 #ifndef CV_DOXYGEN
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 15ec47f7ef..24e2a52893 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1244,6 +1244,17 @@ inline int v_signmask(const v_float32x8& a)
 inline int v_signmask(const v_float64x4& a)
 { return _mm256_movemask_pd(a.val); }
 
+inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
 /** Checks **/
 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask)  \
     inline bool v_check_all(const _Tpvec& a)                \
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
index 190d435001..d4edf0cdd1 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -2719,7 +2719,7 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8
 ////////// Mask and checks /////////
 
 /** Mask **/
-inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
 inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
 inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
 inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
@@ -2733,7 +2733,7 @@ inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as
 
 /** Checks **/
 inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
-inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
 inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
 inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
 inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
@@ -2754,6 +2754,22 @@ inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret
 inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
 inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
 
+inline int v_scan_forward(const v_int8x64& a)
+{
+    int64 mask = _mm512_movepi8_mask(a.val);
+    int mask32 = (int)mask;
+    return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
+}
+inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
+inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+
 inline void v512_cleanup() { _mm256_zeroall(); }
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index f069609a0d..fc8fe165d2 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1072,6 +1072,7 @@ template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTrait
 }
 
 /** @brief Get negative values mask
+@deprecated v_signmask depends heavily on the lane count and therefore isn't universal enough
 
 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
 Example:
@@ -1088,6 +1089,23 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
     return mask;
 }
 
+/** @brief Get first negative lane index
+
+Returned value is the index of the first negative lane (the result is undefined if no lane is negative)
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {0, 0, -1, -1}
+int idx = v_scan_forward(r); // idx = 2
+@endcode
+*/
+template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
+{
+    for (int i = 0; i < n; i++)
+        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
+            return i;
+    return 0;
+}
+
 /** @brief Check if all packed values are less than zero
 
 Unsigned values will be casted to signed: `uchar 254 => char -2`.
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 46d347d234..5617bc24e6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1096,17 +1096,32 @@ inline int v_signmask(const v_int32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
 inline int v_signmask(const v_float32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
-#if CV_SIMD128_64F
 inline int v_signmask(const v_uint64x2& a)
 {
     int64x1_t m0 = vdup_n_s64(0);
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
 }
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
 #endif
 
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+#if CV_SIMD128_64F
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
 inline bool v_check_all(const v_##_Tpvec& a) \
 { \
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index a01c99fa99..e172d45a9f 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1617,6 +1617,17 @@ OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND,
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
 
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
 #if CV_SSE4_1
 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 4d98809a34..a4d2c29d34 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -891,6 +891,17 @@ inline int v_signmask(const v_uint64x2& a)
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_s64(a)); }
 
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+
 template<typename _Tpvec>
 inline bool v_check_all(const _Tpvec& a)
 { return vec_all_lt(a.val, _Tpvec().val); }
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index 76d23cbab9..fe010c0e39 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -132,10 +132,9 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                             m1 = m1 | ((x3 < v1) & (x0 < v1));
                             m0 = m0 | m1;
 
-                            int mask = v_signmask(m0);
-                            if( mask == 0 )
+                            if( !v_check_any(m0) )
                                 continue;
-                            if( (mask & 255) == 0 )
+                            if( !v_check_any(v_combine_low(m0, m0)) )
                             {
                                 j -= 8;
                                 ptr -= 8;
@@ -159,16 +158,36 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                                 max1 = v_max(max1, v_reinterpret_as_u8(c1));
                             }
 
-                            max0 = v_max(max0, max1);
-                            int m = v_signmask(K16 < max0);
+                            max0 = K16 < v_max(max0, max1);
+                            int m = -v_reduce_sum(v_reinterpret_as_s8(max0));
+                            uchar mflag[16];
+                            v_store(mflag, max0);
 
-                            for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
+                            for( k = 0; m > 0 && k < 16; k++ )
                             {
-                                if(m & 1)
+                                if(mflag[k])
                                 {
+                                    --m;
                                     cornerpos[ncorners++] = j+k;
                                     if(nonmax_suppression)
-                                        curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                                    {
+                                        short d[25];
+                                        for (int _k = 0; _k < 25; _k++)
+                                            d[_k] = (short)(ptr[k] - ptr[k + pixel[_k]]);
+
+                                        v_int16x8 a0, b0, a1, b1;
+                                        a0 = b0 = a1 = b1 = v_load(d + 8);
+                                        for(int shift = 0; shift < 8; ++shift)
+                                        {
+                                            v_int16x8 v_nms = v_load(d + shift);
+                                            a0 = v_min(a0, v_nms);
+                                            b0 = v_max(b0, v_nms);
+                                            v_nms = v_load(d + 9 + shift);
+                                            a1 = v_min(a1, v_nms);
+                                            b1 = v_max(b1, v_nms);
+                                        }
+                                        curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1);
+                                    }
                                 }
                             }
                         }
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index 38fd30acc0..e8205f0eb4 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -47,10 +47,6 @@
 
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
-#if CV_SIMD128
-#define CV_MALLOC_SIMD128 16
-#endif
-
 namespace cv
 {
 
@@ -296,18 +292,11 @@ static bool ocl_Canny(InputArray _src, const UMat& dx_, const UMat& dy_, OutputA
 
 #define CANNY_PUSH(map, stack) *map = 2, stack.push_back(map)
 
-#define CANNY_CHECK_SIMD(m, high, map, stack) \
-    if (m > high) \
-        CANNY_PUSH(map, stack); \
-    else \
-        *map = 0
-
 #define CANNY_CHECK(m, high, map, stack) \
     if (m > high) \
         CANNY_PUSH(map, stack); \
     else \
-        *map = 0; \
-    continue
+        *map = 0
 
 class parallelCanny : public ParallelLoopBody
 {
@@ -317,9 +306,14 @@ public:
         src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel),
         low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient)
     {
-#if CV_SIMD128
+#if CV_SIMD
+        for(int i = 0; i < v_int8::nlanes; ++i)
+        {
+            smask[i] = 0;
+            smask[i + v_int8::nlanes] = (schar)-1;
+        }
         if (true)
-            _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1);
+            _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
         else
 #endif
             _map.create(src.rows + 2, src.cols + 2,  CV_8UC1);
@@ -336,9 +330,14 @@ public:
         src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel),
         low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient)
     {
-#if CV_SIMD128
+#if CV_SIMD
+        for(int i = 0; i < v_int8::nlanes; ++i)
+        {
+            smask[i] = 0;
+            smask[i + v_int8::nlanes] = (schar)-1;
+        }
         if (true)
-            _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1);
+            _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
         else
 #endif
             _map.create(src.rows + 2, src.cols + 2,  CV_8UC1);
@@ -397,11 +396,11 @@ public:
         }
 
         // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
-#if CV_SIMD128
-        AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
-        _mag_p = alignPtr(buffer.data() + 1, CV_MALLOC_SIMD128);
-        _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
-        _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
+#if CV_SIMD
+        AutoBuffer<int> buffer(3 * (mapstep * cn + CV_SIMD_WIDTH));
+        _mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH);
+        _mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH);
+        _mag_n = alignPtr(_mag_a + mapstep * cn, CV_SIMD_WIDTH);
 #else
         AutoBuffer<int> buffer(3 * (mapstep * cn));
         _mag_p = buffer.data() + 1;
@@ -437,21 +436,19 @@ public:
                 if (L2gradient)
                 {
                     int j = 0, width = src.cols * cn;
-#if CV_SIMD128
+#if CV_SIMD
+                    for ( ; j <= width - v_int16::nlanes; j += v_int16::nlanes)
                     {
-                       for ( ; j <= width - 8; j += 8)
-                        {
-                            v_int16x8 v_dx = v_load((const short*)(_dx + j));
-                            v_int16x8 v_dy = v_load((const short*)(_dy + j));
+                        v_int16 v_dx = vx_load((const short*)(_dx + j));
+                        v_int16 v_dy = vx_load((const short*)(_dy + j));
 
-                            v_int32x4 v_dxp_low, v_dxp_high;
-                            v_int32x4 v_dyp_low, v_dyp_high;
-                            v_expand(v_dx, v_dxp_low, v_dxp_high);
-                            v_expand(v_dy, v_dyp_low, v_dyp_high);
+                        v_int32 v_dxp_low, v_dxp_high;
+                        v_int32 v_dyp_low, v_dyp_high;
+                        v_expand(v_dx, v_dxp_low, v_dxp_high);
+                        v_expand(v_dy, v_dyp_low, v_dyp_high);
 
-                            v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
-                            v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
-                        }
+                        v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
+                        v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
                     }
 #endif
                     for ( ; j < width; ++j)
@@ -460,23 +457,21 @@ public:
                 else
                 {
                     int j = 0, width = src.cols * cn;
-#if CV_SIMD128
+#if CV_SIMD
+                    for(; j <= width - v_int16::nlanes; j += v_int16::nlanes)
                     {
-                        for(; j <= width - 8; j += 8)
-                        {
-                            v_int16x8 v_dx = v_load((const short *)(_dx + j));
-                            v_int16x8 v_dy = v_load((const short *)(_dy + j));
+                        v_int16 v_dx = vx_load((const short *)(_dx + j));
+                        v_int16 v_dy = vx_load((const short *)(_dy + j));
 
-                            v_dx = v_reinterpret_as_s16(v_abs(v_dx));
-                            v_dy = v_reinterpret_as_s16(v_abs(v_dy));
+                        v_dx = v_reinterpret_as_s16(v_abs(v_dx));
+                        v_dy = v_reinterpret_as_s16(v_abs(v_dy));
 
-                            v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
-                            v_expand(v_dx, v_dx_ml, v_dx_mh);
-                            v_expand(v_dy, v_dy_ml, v_dy_mh);
+                        v_int32 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
+                        v_expand(v_dx, v_dx_ml, v_dx_mh);
+                        v_expand(v_dy, v_dy_ml, v_dy_mh);
 
-                            v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
-                            v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
-                        }
+                        v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
+                        v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh);
                     }
 #endif
                     for ( ; j < width; ++j)
@@ -520,9 +515,9 @@ public:
 
             // From here actual src row is (i - 1)
             // Set left and right border to 1
-#if CV_SIMD128
+#if CV_SIMD
             if (true)
-                _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
+                _pmap = map.ptr<uchar>(i) + CV_SIMD_WIDTH;
             else
 #endif
                 _pmap = map.ptr<uchar>(i) + 1;
@@ -542,167 +537,60 @@ public:
 
             const int TG22 = 13573;
             int j = 0;
-#if CV_SIMD128
+#if CV_SIMD
             {
-                const v_int32x4 v_low = v_setall_s32(low);
-                const v_int8x16 v_one = v_setall_s8(1);
+                const v_int32 v_low = vx_setall_s32(low);
+                const v_int8 v_one = vx_setall_s8(1);
 
-                for (; j <= src.cols - 32; j += 32)
+                for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes)
                 {
-                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
-                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
-                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
-                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));
-
-                    v_int32x4 v_cmp1 = v_m1 > v_low;
-                    v_int32x4 v_cmp2 = v_m2 > v_low;
-                    v_int32x4 v_cmp3 = v_m3 > v_low;
-                    v_int32x4 v_cmp4 = v_m4 > v_low;
-
-                    v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
-                    v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
-                    v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
-                    v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));
-
                     v_store_aligned((signed char*)(_pmap + j), v_one);
-                    v_store_aligned((signed char*)(_pmap + j + 16), v_one);
-
-                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
-                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
-
-                    v_cmp1 = v_m1 > v_low;
-                    v_cmp2 = v_m2 > v_low;
-                    v_cmp3 = v_m3 > v_low;
-                    v_cmp4 = v_m4 > v_low;
-
-                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
-
-                    v_cmp80 = v_pack(v_cmp1, v_cmp2);
-                    v_cmp81 = v_pack(v_cmp3, v_cmp4);
-
-                    unsigned int mask = v_signmask(v_cmp);
-
-                    v_cmp = v_pack(v_cmp80, v_cmp81);
-                    mask |= v_signmask(v_cmp) << 16;
-
-                    if (mask)
+                    v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j                    )) > v_low,
+                                                 vx_load_aligned((const int*)(_mag_a + j +   v_int32::nlanes)) > v_low),
+                                          v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low,
+                                                 vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low));
+                    while (v_check_any(v_cmp))
                     {
-                        int k = j;
+                        int l = v_scan_forward(v_cmp);
+                        v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l);
+                        int k = j + l;
 
-                        do
+                        int m = _mag_a[k];
+                        short xs = _dx[k];
+                        short ys = _dy[k];
+                        int x = (int)std::abs(xs);
+                        int y = (int)std::abs(ys) << 15;
+
+                        int tg22x = x * TG22;
+
+                        if (y < tg22x)
                         {
-                            int l = trailingZeros32(mask);
-                            k += l;
-                            mask >>= l;
-
-                            int m = _mag_a[k];
-                            short xs = _dx[k];
-                            short ys = _dy[k];
-                            int x = (int)std::abs(xs);
-                            int y = (int)std::abs(ys) << 15;
-
-                            int tg22x = x * TG22;
-
-                            if (y < tg22x)
+                            if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                             {
-                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
+                                CANNY_CHECK(m, high, (_pmap+k), stack);
+                            }
+                        }
+                        else
+                        {
+                            int tg67x = tg22x + (x << 16);
+                            if (y > tg67x)
+                            {
+                                if (m > _mag_p[k] && m >= _mag_n[k])
                                 {
-                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
+                                    CANNY_CHECK(m, high, (_pmap+k), stack);
                                 }
                             }
                             else
                             {
-                                int tg67x = tg22x + (x << 16);
-                                if (y > tg67x)
+                                int s = (xs ^ ys) < 0 ? -1 : 1;
+                                if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                 {
-                                    if (m > _mag_p[k] && m >= _mag_n[k])
-                                    {
-                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
-                                    }
-                                }
-                                else
-                                {
-                                    int s = (xs ^ ys) < 0 ? -1 : 1;
-                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
-                                    {
-                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
-                                    }
+                                    CANNY_CHECK(m, high, (_pmap+k), stack);
                                 }
                             }
-                            ++k;
-                        } while((mask >>= 1));
+                        }
                     }
                 }
-
-                if (j <= src.cols - 16)
-                {
-                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
-                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
-                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
-                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));
-
-                    v_store_aligned((signed char*)(_pmap + j), v_one);
-
-                    v_int32x4 v_cmp1 = v_m1 > v_low;
-                    v_int32x4 v_cmp2 = v_m2 > v_low;
-                    v_int32x4 v_cmp3 = v_m3 > v_low;
-                    v_int32x4 v_cmp4 = v_m4 > v_low;
-
-                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
-                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
-
-                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
-                    unsigned int mask = v_signmask(v_cmp);
-
-                    if (mask)
-                    {
-                        int k = j;
-
-                        do
-                        {
-                            int l = trailingZeros32(mask);
-                            k += l;
-                            mask >>= l;
-
-                            int m = _mag_a[k];
-                            short xs = _dx[k];
-                            short ys = _dy[k];
-                            int x = (int)std::abs(xs);
-                            int y = (int)std::abs(ys) << 15;
-
-                            int tg22x = x * TG22;
-
-                            if (y < tg22x)
-                            {
-                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
-                                {
-                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
-                                }
-                            }
-                            else
-                            {
-                                int tg67x = tg22x + (x << 16);
-                                if (y > tg67x)
-                                {
-                                    if (m > _mag_p[k] && m >= _mag_n[k])
-                                    {
-                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
-                                    }
-                                }
-                                else
-                                {
-                                    int s = (xs ^ ys) < 0 ? -1 : 1;
-                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
-                                    {
-                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
-                                    }
-                                }
-                            }
-                            ++k;
-                        } while((mask >>= 1));
-                    }
-                    j += 16;
-                }
             }
 #endif
             for (; j < src.cols; j++)
@@ -723,6 +611,7 @@ public:
                         if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
                         {
                             CANNY_CHECK(m, high, (_pmap+j), stack);
+                            continue;
                         }
                     }
                     else
@@ -733,6 +622,7 @@ public:
                             if (m > _mag_p[j] && m >= _mag_n[j])
                             {
                                 CANNY_CHECK(m, high, (_pmap+j), stack);
+                                continue;
                             }
                         }
                         else
@@ -741,6 +631,7 @@ public:
                             if(m > _mag_p[j - s] && m > _mag_n[j + s])
                             {
                                 CANNY_CHECK(m, high, (_pmap+j), stack);
+                                continue;
                             }
                         }
                     }
@@ -802,6 +693,9 @@ private:
     ptrdiff_t mapstep;
     int cn;
     mutable Mutex mutex;
+#if CV_SIMD
+    schar smask[2*v_int8::nlanes];
+#endif
 };
 
 class finalPass : public ParallelLoopBody
@@ -824,31 +718,31 @@ public:
             int j = 0;
             uchar *pdst = dst.ptr<uchar>(i);
             const uchar *pmap = map.ptr<uchar>(i + 1);
-#if CV_SIMD128
+#if CV_SIMD
             if (true)
-                pmap += CV_MALLOC_SIMD128;
+                pmap += CV_SIMD_WIDTH;
             else
 #endif
                 pmap += 1;
-#if CV_SIMD128
+#if CV_SIMD
             {
-                const v_uint8x16 v_zero = v_setzero_u8();
-                const v_uint8x16 v_ff = ~v_zero;
-                const v_uint8x16 v_two(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+                const v_uint8 v_zero = vx_setzero_u8();
+                const v_uint8 v_ff = ~v_zero; // all bits set in every lane
+                const v_uint8 v_two = vx_setall_u8(2); // map value that the loops below turn into v_ff in dst; anything else becomes v_zero
 
-                for (; j <= dst.cols - 16; j += 16)
+                for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes)
                 {
-                    v_uint8x16 v_pmap = v_load_aligned((const unsigned char*)(pmap + j));
+                    v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j));
                     v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
                     v_store((pdst + j), v_pmap);
                 }
 
-                if (j <= dst.cols - 8)
+                if (j <= dst.cols - v_uint8::nlanes/2)
                 {
-                    v_uint8x16 v_pmap = v_load_low((const unsigned char*)(pmap + j));
+                    v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j));
                     v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
                     v_store_low((pdst + j), v_pmap);
-                    j += 8;
+                    j += v_uint8::nlanes/2;
                 }
             }
 #endif
diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp
index c086050b0e..e433cdb514 100644
--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@@ -1061,19 +1061,13 @@ cvFindNextContour( CvContourScanner scanner )
                 }
                 else
                 {
-#if CV_SIMD_WIDTH > 16
-                    v_uint8 vx_prev = vx_setall_u8((uchar)prev);
-                    while (x <= width - v_uint8::nlanes &&
-                           v_check_all(vx_load((uchar*)(img + x)) == vx_prev))
-                        x += v_uint8::nlanes;
-#endif
-                    v_uint8x16 v_prev = v_setall_u8((uchar)prev);
-                    for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes)
+                    v_uint8 v_prev = vx_setall_u8((uchar)prev); // prev replicated into every lane
+                    for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
                     {
-                        unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev);
-                        if (mask)
+                        v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev); // lanes whose pixel differs from prev
+                        if (v_check_any(vmask))
                         {
-                            p = img[(x += cv::trailingZeros32(mask))];
+                            p = img[(x += v_scan_forward(vmask))]; // move x to the first differing pixel; takes over the role of trailingZeros32 on the sign mask
                             goto _next_contour;
                         }
                     }
@@ -1334,19 +1328,13 @@ CvLinkedRunPoint;
 inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
 {
 #if CV_SIMD
-#if CV_SIMD_WIDTH > 16
-    v_uint8 vx_zero = vx_setzero_u8();
-    while (j <= img_size.width - v_uint8::nlanes &&
-           v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero))
-        j += v_uint8::nlanes;
-#endif
-    v_uint8x16 v_zero = v_setzero_u8();
-    for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
+    v_uint8 v_zero = vx_setzero_u8();
+    for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) // one full vector of pixels per step
     {
-        unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero);
-        if (mask)
+        v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero); // lanes holding a non-zero pixel
+        if (v_check_any(vmask))
         {
-            j += cv::trailingZeros32(mask);
+            j += v_scan_forward(vmask); // j now indexes the first non-zero pixel of this vector
             return j;
         }
     }
@@ -1365,19 +1353,13 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
     }
     else
     {
-#if CV_SIMD_WIDTH > 16
-        v_uint8 vx_zero = vx_setzero_u8();
-        while (j <= img_size.width - v_uint8::nlanes &&
-               v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero))
-            j += v_uint8::nlanes;
-#endif
-        v_uint8x16 v_zero = v_setzero_u8();
+        v_uint8 v_zero = vx_setzero_u8();
         for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
         {
-            unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero);
-            if (mask)
+            v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero); // lanes holding a zero pixel
+            if (v_check_any(vmask))
             {
-                j += cv::trailingZeros32(mask);
+                j += v_scan_forward(vmask); // j now indexes the first zero pixel of this vector
                 return j;
             }
         }
diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp
index 5862319738..6b18b17b56 100644
--- a/modules/imgproc/src/hough.cpp
+++ b/modules/imgproc/src/hough.cpp
@@ -1139,32 +1139,23 @@ public:
 
             for(; x < numCols; ++x )
             {
-#if CV_SIMD128
+#if CV_SIMD
                 {
-                    v_uint8x16 v_zero = v_setzero_u8();
+                    v_uint8 v_zero = vx_setzero_u8();
 
-                    for(; x <= numCols - 32; x += 32) {
-                        v_uint8x16 v_edge1 = v_load(edgeData + x);
-                        v_uint8x16 v_edge2 = v_load(edgeData + x + 16);
+                    for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) { // two vectors of edge pixels per iteration
+                        v_uint8 v_edge1 = (vx_load(edgeData + x                  ) != v_zero); // non-zero pixels in the first vector
+                        v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero); // non-zero pixels in the second vector
 
-                        v_uint8x16 v_cmp1 = (v_edge1 == v_zero);
-                        v_uint8x16 v_cmp2 = (v_edge2 == v_zero);
-
-                        unsigned int mask1 = v_signmask(v_cmp1);
-                        unsigned int mask2 = v_signmask(v_cmp2);
-
-                        mask1 ^= 0x0000ffff;
-                        mask2 ^= 0x0000ffff;
-
-                        if(mask1)
+                        if(v_check_any(v_edge1))
                         {
-                            x += trailingZeros32(mask1);
+                            x += v_scan_forward(v_edge1);
                             goto _next_step;
                         }
 
-                        if(mask2)
+                        if(v_check_any(v_edge2))
                         {
-                            x += trailingZeros32(mask2 << 16);
+                            x += v_uint8::nlanes + v_scan_forward(v_edge2); // hit lies in the second vector, so skip one whole vector before adding the lane index
                             goto _next_step;
                         }
                     }
@@ -1175,7 +1166,7 @@ public:
 
                 if(x == numCols)
                     continue;
-#if CV_SIMD128
+#if CV_SIMD
 _next_step:
 #endif
                 float vx, vy;
@@ -1506,36 +1497,35 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
     int nzCount = 0;
     const Point* nz_ = &nz[0];
     int j = 0;
-#if CV_SIMD128
+#if CV_SIMD
     {
-        const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2);
-        const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2);
+        const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
+        const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
 
-        v_float32x4 v_curCenterX = v_setall_f32(curCenter.x);
-        v_float32x4 v_curCenterY = v_setall_f32(curCenter.y);
+        v_float32 v_curCenterX = vx_setall_f32(curCenter.x);
+        v_float32 v_curCenterY = vx_setall_f32(curCenter.y);
 
-        float CV_DECL_ALIGNED(16) rbuf[4];
-        for(; j <= nzSz - 4; j += 4)
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; // scratch: squared radii of one vector, filled by v_store_aligned below
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; // scratch: per-lane result of the radius range test
+        for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes)
         {
-            v_float32x4 v_nzX, v_nzY;
+            v_float32 v_nzX, v_nzY;
             v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype
 
-            v_float32x4 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
-            v_float32x4 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));
+            v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
+            v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));
 
-            v_float32x4 v_dx = v_x - v_curCenterX;
-            v_float32x4 v_dy = v_y - v_curCenterY;
+            v_float32 v_dx = v_x - v_curCenterX;
+            v_float32 v_dy = v_y - v_curCenterY;
 
-            v_float32x4 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
-            v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
-            unsigned int mask = v_signmask(vmask);
-            if (mask)
+            v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
+            v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
+            if (v_check_any(vmask))
             {
+                v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); // spill the lane mask so each lane can be tested without assuming 4 lanes
                 v_store_aligned(rbuf, v_r2);
-                if (mask & 1) ddata[nzCount++] = rbuf[0];
-                if (mask & 2) ddata[nzCount++] = rbuf[1];
-                if (mask & 4) ddata[nzCount++] = rbuf[2];
-                if (mask & 8) ddata[nzCount++] = rbuf[3];
+                for (int i = 0; i < v_int32::nlanes; ++i) // lane order is kept, same as the unrolled mask & 1/2/4/8 tests
+                    if (rmask[i]) ddata[nzCount++] = rbuf[i]; // keep r2 only for lanes that passed the range test
             }
         }
     }
@@ -1566,12 +1556,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
     const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols));
     const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows));
 
-#if CV_SIMD128
-    const int numSIMDPoints = 4;
-
-    const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2);
-    const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2);
-    const v_float32x4 v_curCenterX_0123 = v_setall_f32(curCenter.x) - v_float32x4(0.0f, 1.0f, 2.0f, 3.0f);
+#if CV_SIMD
+    float v_seq[v_float32::nlanes]; // 0, 1, ..., nlanes-1: width-independent stand-in for the literal (0, 1, 2, 3)
+    for (int i = 0; i < v_float32::nlanes; ++i)
+        v_seq[i] = (float)i;
+    const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
+    const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
+    const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq); // lane i holds curCenter.x - i, so x minus this vector is (x + i) - curCenter.x
 #endif
 
     for (int y = yOuter.start; y < yOuter.end; y++)
@@ -1581,29 +1572,28 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
         float dy2 = dy * dy;
 
         int x = xOuter.start;
-#if CV_SIMD128
+#if CV_SIMD
         {
-            const v_float32x4 v_dy2 = v_setall_f32(dy2);
-            const v_uint32x4 v_zero_u32 = v_setall_u32(0);
-            float CV_DECL_ALIGNED(16) rbuf[4];
-            for (; x <= xOuter.end - 4; x += numSIMDPoints)
+            const v_float32 v_dy2 = vx_setall_f32(dy2);
+            const v_uint32 v_zero_u32 = vx_setall_u32(0);
+            float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
+            for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes)
             {
-                v_uint32x4 v_mask = v_load_expand_q(ptr + x);
+                v_uint32 v_mask = vx_load_expand_q(ptr + x);
                 v_mask = v_mask != v_zero_u32;
 
-                v_float32x4 v_x = v_cvt_f32(v_setall_s32(x));
-                v_float32x4 v_dx = v_x - v_curCenterX_0123;
+                v_float32 v_x = v_cvt_f32(vx_setall_s32(x));
+                v_float32 v_dx = v_x - v_curCenterX_0123;
 
-                v_float32x4 v_r2 = (v_dx * v_dx) + v_dy2;
-                v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
-                unsigned int mask = v_signmask(vmask);
-                if (mask)
+                v_float32 v_r2 = (v_dx * v_dx) + v_dy2;
+                v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
+                if (v_check_any(vmask))
                 {
+                    v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); // spill the lane mask so each lane can be tested without assuming 4 lanes
                     v_store_aligned(rbuf, v_r2);
-                    if (mask & 1) ddata[nzCount++] = rbuf[0];
-                    if (mask & 2) ddata[nzCount++] = rbuf[1];
-                    if (mask & 4) ddata[nzCount++] = rbuf[2];
-                    if (mask & 8) ddata[nzCount++] = rbuf[3];
+                    for (int i = 0; i < v_int32::nlanes; ++i) // lane order is kept, same as the unrolled mask & 1/2/4/8 tests
+                        if (rmask[i]) ddata[nzCount++] = rbuf[i]; // keep r2 only for lanes that passed both the range test and the position mask
                 }
             }
         }

From edf2cbd5f7d310af9bb2e746caa8402a955b4880 Mon Sep 17 00:00:00 2001
From: armenpoghosov <39712046+armenpoghosov@users.noreply.github.com>
Date: Mon, 1 Jul 2019 20:57:28 +0200
Subject: [PATCH 11/14] Merge pull request #14828 from
 armenpoghosov:parmen_RANSACPointSetRegistrator_getSubset_disaster_cleanup

Parmen ransac point set registrator get subset disaster cleanup (#14828)
---
 modules/calib3d/src/ptsetreg.cpp | 73 +++++++++++++++++---------------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/modules/calib3d/src/ptsetreg.cpp b/modules/calib3d/src/ptsetreg.cpp
index ae9ac459bc..88e1815c35 100644
--- a/modules/calib3d/src/ptsetreg.cpp
+++ b/modules/calib3d/src/ptsetreg.cpp
@@ -99,55 +99,60 @@ public:
         return nz;
     }
 
-    bool getSubset( const Mat& m1, const Mat& m2,
-                    Mat& ms1, Mat& ms2, RNG& rng,
-                    int maxAttempts=1000 ) const
+    bool getSubset( const Mat& m1, const Mat& m2, Mat& ms1, Mat& ms2, RNG& rng, int maxAttempts=1000 ) const // copies modelPoints distinct random point pairs into ms1/ms2; true once cb->checkSubset accepts a draw, false after maxAttempts draws
     {
         cv::AutoBuffer<int> _idx(modelPoints);
         int* idx = _idx.data();
-        int i = 0, j, k, iters = 0;
-        int d1 = m1.channels() > 1 ? m1.channels() : m1.cols;
-        int d2 = m2.channels() > 1 ? m2.channels() : m2.cols;
-        int esz1 = (int)m1.elemSize1()*d1, esz2 = (int)m2.elemSize1()*d2;
-        int count = m1.checkVector(d1), count2 = m2.checkVector(d2);
-        const int *m1ptr = m1.ptr<int>(), *m2ptr = m2.ptr<int>();
+
+        const int d1 = m1.channels() > 1 ? m1.channels() : m1.cols; // values per point: channel count, or column count for single-channel input
+        const int d2 = m2.channels() > 1 ? m2.channels() : m2.cols;
+
+        int esz1 = (int)m1.elemSize1() * d1; // size of one m1 point; converted from elemSize1 units to ints below
+        int esz2 = (int)m2.elemSize1() * d2;
+        CV_Assert((esz1 % sizeof(int)) == 0 && (esz2 % sizeof(int)) == 0); // points are copied int by int further down
+        esz1 /= sizeof(int); // from here on esz1/esz2 count ints per point
+        esz2 /= sizeof(int);
+
+        const int count = m1.checkVector(d1);
+        const int count2 = m2.checkVector(d2);
+        CV_Assert(count >= modelPoints && count == count2); // enough points for modelPoints distinct picks, and both inputs the same length
+
+        const int *m1ptr = m1.ptr<int>();
+        const int *m2ptr = m2.ptr<int>();
 
         ms1.create(modelPoints, 1, CV_MAKETYPE(m1.depth(), d1));
         ms2.create(modelPoints, 1, CV_MAKETYPE(m2.depth(), d2));
 
-        int *ms1ptr = ms1.ptr<int>(), *ms2ptr = ms2.ptr<int>();
+        int *ms1ptr = ms1.ptr<int>(); // taken after create() so they point at the freshly sized outputs
+        int *ms2ptr = ms2.ptr<int>();
 
-        CV_Assert( count >= modelPoints && count == count2 );
-        CV_Assert( (esz1 % sizeof(int)) == 0 && (esz2 % sizeof(int)) == 0 );
-        esz1 /= sizeof(int);
-        esz2 /= sizeof(int);
-
-        for(; iters < maxAttempts; iters++)
+        for( int iters = 0; iters < maxAttempts; ++iters ) // each iteration draws one complete candidate subset
         {
-            for( i = 0; i < modelPoints && iters < maxAttempts; )
+            int i; // declared outside the loop because it is passed to checkSubset afterwards
+
+            for( i = 0; i < modelPoints; ++i )
             {
-                int idx_i = 0;
-                for(;;)
-                {
-                    idx_i = idx[i] = rng.uniform(0, count);
-                    for( j = 0; j < i; j++ )
-                        if( idx_i == idx[j] )
-                            break;
-                    if( j == i )
-                        break;
-                }
-                for( k = 0; k < esz1; k++ )
+                int idx_i;
+
+                for ( idx_i = rng.uniform(0, count); // redraw until idx_i differs from every index already picked in this attempt
+                    std::find(idx, idx + i, idx_i) != idx + i; // an unused index always exists because count >= modelPoints was asserted above
+                    idx_i = rng.uniform(0, count) )
+                {}
+
+                idx[i] = idx_i;
+
+                for( int k = 0; k < esz1; ++k ) // copy point idx_i of m1 into slot i of ms1
                     ms1ptr[i*esz1 + k] = m1ptr[idx_i*esz1 + k];
-                for( k = 0; k < esz2; k++ )
+
+                for( int k = 0; k < esz2; ++k ) // copy the matching point of m2 into slot i of ms2
                     ms2ptr[i*esz2 + k] = m2ptr[idx_i*esz2 + k];
-                i++;
             }
-            if( i == modelPoints && !cb->checkSubset(ms1, ms2, i) )
-                continue;
-            break;
+
+            if( cb->checkSubset(ms1, ms2, i) ) // i == modelPoints here, the inner loop always runs to completion
+                return true;
         }
 
-        return i == modelPoints && iters < maxAttempts;
+        return false; // no draw was accepted within maxAttempts
     }
 
     bool run(InputArray _m1, InputArray _m2, OutputArray _model, OutputArray _mask) const CV_OVERRIDE

From db6a6ccaba59bb96fb6aafe41e6a589d461a2b92 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Tue, 2 Jul 2019 21:57:15 +0900
Subject: [PATCH 12/14] re-enable CPU_BASELINE=FP16 on Armv7 platform

---
 cmake/OpenCVCompilerOptimizations.cmake | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index 11c13b2886..9e4691760c 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -329,7 +329,7 @@ elseif(ARM OR AARCH64)
       ocv_update(CPU_VFPV3_FLAGS_ON "-mfpu=vfpv3")
       ocv_update(CPU_NEON_FLAGS_ON "-mfpu=neon")
       ocv_update(CPU_NEON_FLAGS_CONFLICT "-mfpu=[^ ]*")
-      ocv_update(CPU_FP16_FLAGS_ON "-mfpu=neon-fp16")
+      ocv_update(CPU_FP16_FLAGS_ON "-mfpu=neon-fp16 -mfp16-format=ieee") # the fp16 format flag is now tied to the FP16 option rather than added for every ARM build
       ocv_update(CPU_FP16_FLAGS_CONFLICT "-mfpu=[^ ]*")
     endif()
     ocv_update(CPU_FP16_IMPLIES "NEON")
@@ -617,9 +617,6 @@ macro(ocv_compiler_optimization_options)
   if(ENABLE_POWERPC)
     add_extra_compiler_option("-mcpu=G3 -mtune=G5")
   endif()
-  if(ARM)
-    add_extra_compiler_option("-mfp16-format=ieee")
-  endif(ARM)
 endmacro()
 
 macro(ocv_compiler_optimization_options_finalize)

From a37201abee79fa8c23da92b0d65a62e3dab65add Mon Sep 17 00:00:00 2001
From: arnaudbrejeon <arnaud@tangibleplay.com>
Date: Tue, 2 Jul 2019 09:56:31 -0700
Subject: [PATCH 13/14] Fix crash, add assert and test

---
 modules/imgproc/src/connectedcomponents.cpp       |  4 +++-
 modules/imgproc/test/test_connectedcomponents.cpp | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/connectedcomponents.cpp b/modules/imgproc/src/connectedcomponents.cpp
index 10095842b2..9241c6c09e 100644
--- a/modules/imgproc/src/connectedcomponents.cpp
+++ b/modules/imgproc/src/connectedcomponents.cpp
@@ -2542,7 +2542,8 @@ namespace cv{
 
             //Array used to store info and labeled pixel by each thread.
             //Different threads affect different memory location of chunksSizeAndLabels
-            int *chunksSizeAndLabels = (int *)cv::fastMalloc(h * sizeof(int));
+            const int chunksSizeAndLabelsSize = h + 1; // one slot more than h: the flatten loop below reads index i + 1 with i < h
+            int *chunksSizeAndLabels = (int *)cv::fastMalloc(chunksSizeAndLabelsSize * sizeof(int));
 
             //Tree of labels
             LabelT *P = (LabelT *)cv::fastMalloc(Plength * sizeof(LabelT));
@@ -2561,6 +2562,7 @@ namespace cv{
 
             LabelT nLabels = 1;
             for (int i = 0; i < h; i = chunksSizeAndLabels[i]){
+                CV_Assert(i + 1 < chunksSizeAndLabelsSize); // guards the chunksSizeAndLabels[i + 1] read on the next line
                 flattenL(P, LabelT((i + 1) / 2) * LabelT((w + 1) / 2) + 1, chunksSizeAndLabels[i + 1], nLabels);
             }
 
diff --git a/modules/imgproc/test/test_connectedcomponents.cpp b/modules/imgproc/test/test_connectedcomponents.cpp
index abd6fd43b4..3817f6d172 100644
--- a/modules/imgproc/test/test_connectedcomponents.cpp
+++ b/modules/imgproc/test/test_connectedcomponents.cpp
@@ -136,4 +136,18 @@ void CV_ConnectedComponentsTest::run( int /* start_from */)
 
 TEST(Imgproc_ConnectedComponents, regression) { CV_ConnectedComponentsTest test; test.safe_run(); }
 
+TEST(Imgproc_ConnectedComponents, grana_buffer_overflow) // regression case: CCL_GRANA labeling of an all-zero mask
+{
+    cv::Mat darkMask;
+    darkMask.create(31, 87, CV_8U); // 31 rows x 87 columns, 8-bit single channel; NOTE(review): the odd row count looks chosen to hit the i + 1 == h read fixed in connectedcomponents.cpp - confirm
+    darkMask = 0; // every pixel set to zero
+
+    cv::Mat labels;
+    cv::Mat stats;
+    cv::Mat centroids;
+
+    int nbComponents = cv::connectedComponentsWithStats(darkMask, labels, stats, centroids, 8, CV_32S, cv::CCL_GRANA); // connectivity 8, CV_32S label image
+    EXPECT_EQ(1, nbComponents); // presumably the single reported label is the background - confirm against the API docs
+}
+
 }} // namespace

From 9a2488628288a1b87d821de5b7303e040c102b1f Mon Sep 17 00:00:00 2001
From: Rafa Gomez-Jordana <rgjordana@magicleap.com>
Date: Mon, 1 Jul 2019 17:48:48 +0200
Subject: [PATCH 14/14] Fix blob detector insertion sort

---
 modules/features2d/src/blobdetector.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp
index 403a8974cc..c973b09764 100644
--- a/modules/features2d/src/blobdetector.cpp
+++ b/modules/features2d/src/blobdetector.cpp
@@ -338,7 +338,7 @@ void SimpleBlobDetectorImpl::detect(InputArray image, std::vector<cv::KeyPoint>&
                     centers[j].push_back(curCenters[i]);
 
                     size_t k = centers[j].size() - 1;
-                    while( k > 0 && centers[j][k].radius < centers[j][k-1].radius )
+                    while( k > 0 && curCenters[i].radius < centers[j][k-1].radius ) // compare against the center being inserted: slot k is overwritten by the shift below, so centers[j][k] no longer holds it
                     {
                         centers[j][k] = centers[j][k-1];
                         k--;