Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2025-06-08 01:53:19 +08:00 · 2018-08-07 20:09:27 +03:00 · 2018-08-07 20:09:27 +03:00 · 4eb2966559
commit 4eb2966559
parent 053259fd92 9f2edc1135
80 changed files with 1272 additions and 822 deletions
--- a/apps/interactive-calibration/rotationConverters.cpp
+++ b/apps/interactive-calibration/rotationConverters.cpp
@ -16,7 +16,7 @@ void calib::Euler(const cv::Mat& src, cv::Mat& dst, int argType)
 {
    if((src.rows == 3) && (src.cols == 3))
    {
-        //convert rotaion matrix to 3 angles (pitch, yaw, roll)
+        //convert rotation matrix to 3 angles (pitch, yaw, roll)
        dst = cv::Mat(3, 1, CV_64F);
        double pitch, yaw, roll;

@ -55,7 +55,7 @@ void calib::Euler(const cv::Mat& src, cv::Mat& dst, int argType)
    else if( (src.cols == 1 && src.rows == 3) ||
             (src.cols == 3 && src.rows == 1 ) )
    {
-        //convert vector which contains 3 angles (pitch, yaw, roll) to rotaion matrix
+        //convert vector which contains 3 angles (pitch, yaw, roll) to rotation matrix
        double pitch, yaw, roll;
        if(src.cols == 1 && src.rows == 3)
        {
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@ -141,7 +141,7 @@
 #   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
 #
 #   CUDA_BUILD_CLEAN_TARGET()
-#   -- Creates a convience target that deletes all the dependency files
+#   -- Creates a convenience target that deletes all the dependency files
 #      generated.  You should make clean after running this target to ensure the
 #      dependency files get regenerated.
 #
@ -473,7 +473,7 @@ else()
 endif()

 # Propagate the host flags to the host compiler via -Xcompiler
-option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
+option(CUDA_PROPAGATE_HOST_FLAGS "Propagate C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)

 # Enable CUDA_SEPARABLE_COMPILATION
 option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled.  Requires CUDA 5.0+" OFF)
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -761,24 +761,24 @@ macro(ocv_compiler_optimization_fill_cpu_config)
  endif()
 endmacro()

-macro(ocv_add_dispatched_file filename)
+macro(__ocv_add_dispatched_file filename target_src_var src_directory dst_directory precomp_hpp optimizations_var)
  if(NOT OPENCV_INITIAL_PASS)
    set(__codestr "
-#include \"${CMAKE_CURRENT_LIST_DIR}/src/precomp.hpp\"
-#include \"${CMAKE_CURRENT_LIST_DIR}/src/${filename}.simd.hpp\"
+#include \"${src_directory}/${precomp_hpp}\"
+#include \"${src_directory}/${filename}.simd.hpp\"
 ")

-    set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${CMAKE_CURRENT_LIST_DIR}/src/${filename}.simd.hpp\"")
+    set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${src_directory}/${filename}.simd.hpp\"")
    set(__dispatch_modes "BASELINE")

-    set(__optimizations "${ARGN}")
+    set(__optimizations "${${optimizations_var}}")
    if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
      set(__optimizations "")
    endif()

    foreach(OPT ${__optimizations})
      string(TOLOWER "${OPT}" OPT_LOWER)
-      set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${OPT_LOWER}.cpp")
+      set(__file "${CMAKE_CURRENT_BINARY_DIR}/${dst_directory}${filename}.${OPT_LOWER}.cpp")
      if(EXISTS "${__file}")
        file(READ "${__file}" __content)
      else()
@ -791,7 +791,11 @@ macro(ocv_add_dispatched_file filename)
      endif()

      if(";${CPU_DISPATCH};" MATCHES "${OPT}" OR __CPU_DISPATCH_INCLUDE_ALL)
-        list(APPEND OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED "${__file}")
+        if(EXISTS "${src_directory}/${filename}.${OPT_LOWER}.cpp")
+          message(STATUS "Using overrided ${OPT} source: ${src_directory}/${filename}.${OPT_LOWER}.cpp")
+        else()
+          list(APPEND ${target_src_var} "${__file}")
+        endif()
      endif()

      set(__declarations_str "${__declarations_str}
@ -803,9 +807,11 @@ macro(ocv_add_dispatched_file filename)

    set(__declarations_str "${__declarations_str}
 #define CV_CPU_DISPATCH_MODES_ALL ${__dispatch_modes}
+
+#undef CV_CPU_SIMD_FILENAME
 ")

-    set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.simd_declarations.hpp")
+    set(__file "${CMAKE_CURRENT_BINARY_DIR}/${dst_directory}${filename}.simd_declarations.hpp")
    if(EXISTS "${__file}")
      file(READ "${__file}" __content)
    endif()
@ -817,6 +823,17 @@ macro(ocv_add_dispatched_file filename)
  endif()
 endmacro()

+macro(ocv_add_dispatched_file filename)
+  set(__optimizations "${ARGN}")
+  if(" ${ARGV1}" STREQUAL " TEST")
+    list(REMOVE_AT __optimizations 0)
+    __ocv_add_dispatched_file("${filename}" "OPENCV_MODULE_${the_module}_TEST_SOURCES_DISPATCHED" "${CMAKE_CURRENT_LIST_DIR}/test" "test/" "test_precomp.hpp" __optimizations)
+  else()
+    __ocv_add_dispatched_file("${filename}" "OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED" "${CMAKE_CURRENT_LIST_DIR}/src" "" "precomp.hpp" __optimizations)
+  endif()
+endmacro()
+
+
 # Workaround to support code which always require all code paths
 macro(ocv_add_dispatched_file_force_all)
  set(__CPU_DISPATCH_INCLUDE_ALL 1)
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -3,7 +3,7 @@ if(WIN32 AND NOT MSVC)
  return()
 endif()

-if(NOT APPLE AND CV_CLANG)
+if(NOT UNIX AND CV_CLANG)
  message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
  return()
 endif()
@ -188,6 +188,13 @@ if(CUDA_FOUND)
    foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
      set(${var}_backup_in_cuda_compile_ "${${var}}")

+      if (CV_CLANG)
+        # we remove -Winconsistent-missing-override and -Qunused-arguments
+        # just in case we are compiling CUDA with gcc but OpenCV with clang
+        string(REPLACE "-Winconsistent-missing-override" "" ${var} "${${var}}")
+        string(REPLACE "-Qunused-arguments" "" ${var} "${${var}}")
+      endif()
+
      # we remove /EHa as it generates warnings under windows
      string(REPLACE "/EHa" "" ${var} "${${var}}")

--- a/cmake/OpenCVDownload.cmake
+++ b/cmake/OpenCVDownload.cmake
@ -20,16 +20,19 @@ if(DEFINED ENV{OPENCV_DOWNLOAD_PATH})
 endif()
 set(OPENCV_DOWNLOAD_PATH "${OpenCV_SOURCE_DIR}/.cache" CACHE PATH "${HELP_OPENCV_DOWNLOAD_PATH}")
 set(OPENCV_DOWNLOAD_LOG "${OpenCV_BINARY_DIR}/CMakeDownloadLog.txt")
+set(OPENCV_DOWNLOAD_WITH_CURL "${OpenCV_BINARY_DIR}/download_with_curl.sh")
+set(OPENCV_DOWNLOAD_WITH_WGET "${OpenCV_BINARY_DIR}/download_with_wget.sh")

-# Init download cache directory and log file
+# Init download cache directory, log file and helper scripts
 if(NOT EXISTS "${OPENCV_DOWNLOAD_PATH}")
  file(MAKE_DIRECTORY ${OPENCV_DOWNLOAD_PATH})
 endif()
 if(NOT EXISTS "${OPENCV_DOWNLOAD_PATH}/.gitignore")
  file(WRITE "${OPENCV_DOWNLOAD_PATH}/.gitignore" "*\n")
 endif()
-file(WRITE "${OPENCV_DOWNLOAD_LOG}" "use_cache \"${OPENCV_DOWNLOAD_PATH}\"\n")
-
+file(WRITE "${OPENCV_DOWNLOAD_LOG}" "#use_cache \"${OPENCV_DOWNLOAD_PATH}\"\n")
+file(REMOVE "${OPENCV_DOWNLOAD_WITH_CURL}")
+file(REMOVE "${OPENCV_DOWNLOAD_WITH_WGET}")

 function(ocv_download)
  cmake_parse_arguments(DL "UNPACK;RELATIVE_URL" "FILENAME;HASH;DESTINATION_DIR;ID;STATUS" "URL" ${ARGN})
@ -103,7 +106,7 @@ function(ocv_download)
  endif()

  # Log all calls to file
-  ocv_download_log("do_${mode} \"${DL_FILENAME}\" \"${DL_HASH}\" \"${DL_URL}\" \"${DL_DESTINATION_DIR}\"")
+  ocv_download_log("#do_${mode} \"${DL_FILENAME}\" \"${DL_HASH}\" \"${DL_URL}\" \"${DL_DESTINATION_DIR}\"")
  # ... and to console
  set(__msg_prefix "")
  if(DL_ID)
@ -191,6 +194,9 @@ function(ocv_download)
 For details please refer to the download log file:
 ${OPENCV_DOWNLOAD_LOG}
 ")
+      # write helper scripts for failed downloads
+      file(APPEND "${OPENCV_DOWNLOAD_WITH_CURL}" "curl --output \"${CACHE_CANDIDATE}\" \"${DL_URL}\"\n")
+      file(APPEND "${OPENCV_DOWNLOAD_WITH_WGET}" "wget -O \"${CACHE_CANDIDATE}\" \"${DL_URL}\"\n")
      return()
    endif()

--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -1202,6 +1202,9 @@ function(ocv_add_accuracy_tests)
        set(OPENCV_TEST_${the_module}_SOURCES ${test_srcs} ${test_hdrs})
      endif()

+      if(OPENCV_MODULE_${the_module}_TEST_SOURCES_DISPATCHED)
+        list(APPEND OPENCV_TEST_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_TEST_SOURCES_DISPATCHED})
+      endif()
      ocv_compiler_optimization_process_sources(OPENCV_TEST_${the_module}_SOURCES OPENCV_TEST_${the_module}_DEPS ${the_target})

      if(NOT BUILD_opencv_world)
@ -1211,6 +1214,9 @@ function(ocv_add_accuracy_tests)
      source_group("Src" FILES "${${the_target}_pch}")
      ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
      ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
+      if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/test")
+        ocv_target_include_directories(${the_target} "${CMAKE_CURRENT_BINARY_DIR}/test")
+      endif()
      ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_TEST_${the_module}_DEPS})
      add_dependencies(opencv_tests ${the_target})

--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@ -362,7 +362,7 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input)
          endif()
        endforeach()

-        #also inlude ${oldProps} to have the same compile options
+        #also include ${oldProps} to have the same compile options
        GET_TARGET_PROPERTY(oldProps ${_targetName} COMPILE_FLAGS)
        if (oldProps MATCHES NOTFOUND)
            SET(oldProps "")
--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@ -260,7 +260,7 @@ endif()
 set(OpenCV_LIBRARIES ${OpenCV_LIBS})

 #
-# Some macroses for samples
+# Some macros for samples
 #
 macro(ocv_check_dependencies)
  set(OCV_DEPENDENCIES_FOUND TRUE)
--- a/doc/js_tutorials/js_imgproc/js_grabcut/js_grabcut.markdown
+++ b/doc/js_tutorials/js_imgproc/js_grabcut/js_grabcut.markdown
@ -29,7 +29,7 @@ What happens in background ?
    objects). Everything inside rectangle is unknown. Similarly any user input specifying
    foreground and background are considered as hard-labelling which means they won't change in
    the process.
-   Computer does an initial labelling depeding on the data we gave. It labels the foreground and
+-   Computer does an initial labelling depending on the data we gave. It labels the foreground and
    background pixels (or it hard-labels)
 -   Now a Gaussian Mixture Model(GMM) is used to model the foreground and background.
 -   Depending on the data we gave, GMM learns and create new pixel distribution. That is, the
--- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
+++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
@ -129,7 +129,7 @@ function onOpenCvReady() {
 </html>
@endcode

-@note You have to call delete method of cv.Mat to free memory allocated in Emscripten's heap. Please refer to [Memeory management of Emscripten](https://kripken.github.io/emscripten-site/docs/porting/connecting_cpp_and_javascript/embind.html#memory-management) for details.
+@note You have to call delete method of cv.Mat to free memory allocated in Emscripten's heap. Please refer to [Memory management of Emscripten](https://kripken.github.io/emscripten-site/docs/porting/connecting_cpp_and_javascript/embind.html#memory-management) for details.

 Try it
 ------
--- a/doc/py_tutorials/py_imgproc/py_grabcut/py_grabcut.markdown
+++ b/doc/py_tutorials/py_imgproc/py_grabcut/py_grabcut.markdown
@ -37,7 +37,7 @@ So what happens in background ?
    objects). Everything inside rectangle is unknown. Similarly any user input specifying
    foreground and background are considered as hard-labelling which means they won't change in
    the process.
-   Computer does an initial labelling depeding on the data we gave. It labels the foreground and
+-   Computer does an initial labelling depending on the data we gave. It labels the foreground and
    background pixels (or it hard-labels)
 -   Now a Gaussian Mixture Model(GMM) is used to model the foreground and background.
 -   Depending on the data we gave, GMM learns and create new pixel distribution. That is, the
--- a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
+++ b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
@ -16,7 +16,7 @@ In this tutorial is explained how to build a real time application to estimate t
 order to track a textured object with six degrees of freedom given a 2D image and its 3D textured
 model.

-The application will have the followings parts:
+The application will have the following parts:

 -   Read 3D textured object model and object mesh.
 -   Take input from Camera or Video.
@ -426,16 +426,16 @@ Here is explained in detail the code for the real time application:
    @endcode
    OpenCV provides four PnP methods: ITERATIVE, EPNP, P3P and DLS. Depending on the application type,
    the estimation method will be different. In the case that we want to make a real time application,
-    the more suitable methods are EPNP and P3P due to that are faster than ITERATIVE and DLS at
+    the more suitable methods are EPNP and P3P since they are faster than ITERATIVE and DLS at
    finding an optimal solution. However, EPNP and P3P are not especially robust in front of planar
-    surfaces and sometimes the pose estimation seems to have a mirror effect. Therefore, in this this
-    tutorial is used ITERATIVE method due to the object to be detected has planar surfaces.
+    surfaces and sometimes the pose estimation seems to have a mirror effect. Therefore, in this
+    tutorial the ITERATIVE method is used because the object to be detected has planar surfaces.

-    The OpenCV RANSAC implementation wants you to provide three parameters: the maximum number of
-    iterations until stop the algorithm, the maximum allowed distance between the observed and
-    computed point projections to consider it an inlier and the confidence to obtain a good result.
+    The OpenCV RANSAC implementation wants you to provide three parameters: 1) the maximum number of
+    iterations until the algorithm stops, 2) the maximum allowed distance between the observed and
+    computed point projections to consider it an inlier and 3) the confidence to obtain a good result.
    You can tune these parameters in order to improve your algorithm performance. Increasing the
-    number of iterations you will have a more accurate solution, but will take more time to find a
+    number of iterations will give a more accurate solution, but will take more time to find a
    solution. Increasing the reprojection error will reduce the computation time, but your solution
    will be unaccurate. Decreasing the confidence your algorithm will be faster, but the obtained
    solution will be unaccurate.
--- a/doc/tutorials/introduction/windows_install/windows_install.markdown
+++ b/doc/tutorials/introduction/windows_install/windows_install.markdown
@ -46,7 +46,7 @@ cd /c/lib
 myRepo=$(pwd)
 CMAKE_CONFIG_GENERATOR="Visual Studio 14 2015 Win64"
 if [  ! -d "$myRepo/opencv"  ]; then
-    echo "clonning opencv"
+    echo "cloning opencv"
    git clone https://github.com/opencv/opencv.git
    mkdir Build
    mkdir Build/opencv
@ -58,7 +58,7 @@ else
    cd ..
 fi
 if [  ! -d "$myRepo/opencv_contrib"  ]; then
-    echo "clonning opencv_contrib"
+    echo "cloning opencv_contrib"
    git clone https://github.com/opencv/opencv_contrib.git
    mkdir Build
    mkdir Build/opencv_contrib
--- a/modules/calib3d/test/test_chesscorners.cpp
+++ b/modules/calib3d/test/test_chesscorners.cpp
@ -198,7 +198,7 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )

    if( !fs.isOpened() || board_list.empty() || !board_list.isSeq() || board_list.size() % 2 != 0 )
    {
-        ts->printf( cvtest::TS::LOG, "%s can not be readed or is not valid\n", (folder + filename).c_str() );
+        ts->printf( cvtest::TS::LOG, "%s can not be read or is not valid\n", (folder + filename).c_str() );
        ts->printf( cvtest::TS::LOG, "fs.isOpened=%d, board_list.empty=%d, board_list.isSeq=%d,board_list.size()%2=%d\n",
            fs.isOpened(), (int)board_list.empty(), board_list.isSeq(), board_list.size()%2);
        ts->set_failed_test_info( cvtest::TS::FAIL_MISSING_TEST_DATA );
--- a/modules/calib3d/test/test_chesscorners_timing.cpp
+++ b/modules/calib3d/test/test_chesscorners_timing.cpp
@ -85,7 +85,7 @@ void CV_ChessboardDetectorTimingTest::run( int start_from )
    if( !fs || !board_list || !CV_NODE_IS_SEQ(board_list->tag) ||
        board_list->data.seq->total % 4 != 0 )
    {
-        ts->printf( cvtest::TS::LOG, "chessboard_timing_list.dat can not be readed or is not valid" );
+        ts->printf( cvtest::TS::LOG, "chessboard_timing_list.dat can not be read or is not valid" );
        code = cvtest::TS::FAIL_MISSING_TEST_DATA;
        goto _exit_;
    }
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@ -3,6 +3,10 @@ set(the_description "The Core Functionality")
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)

+# dispatching for accuracy tests
+ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2)
+
 ocv_add_module(core
               OPTIONAL opencv_cudev
               WRAP java python js)
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -204,20 +204,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD512_64F 0
 #endif

-#if CV_SIMD512
-    #define CV_SIMD 1
-    #define CV_SIMD_64F CV_SIMD512_64F
-    #define CV_SIMD_WIDTH 64
-#elif CV_SIMD256
-    #define CV_SIMD 1
-    #define CV_SIMD_64F CV_SIMD256_64F
-    #define CV_SIMD_WIDTH 32
-#else
-    #define CV_SIMD CV_SIMD128
-    #define CV_SIMD_64F CV_SIMD128_64F
-    #define CV_SIMD_WIDTH 16
-#endif
-
 //==================================================================================================

 #define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@ -309,7 +295,21 @@ template<typename _Tp> struct V_RegTraits
 #endif
 #endif

-#if CV_SIMD256
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+#define CV__SIMD_NAMESPACE simd512
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD512_64F
+    #define CV_SIMD_WIDTH 64
+    // TODO typedef v_uint8 / v_int32 / etc types here
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+#define CV__SIMD_NAMESPACE simd256
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD256_64F
+    #define CV_SIMD_WIDTH 32
    typedef v_uint8x32   v_uint8;
    typedef v_int8x32    v_int8;
    typedef v_uint16x16  v_uint16;
@ -329,7 +329,14 @@ template<typename _Tp> struct V_RegTraits
    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
    inline void vx_cleanup() { v256_cleanup(); }
-#elif CV_SIMD128 || CV_SIMD128_CPP
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+#define CV__SIMD_NAMESPACE simd128
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD CV_SIMD128
+    #define CV_SIMD_64F CV_SIMD128_64F
+    #define CV_SIMD_WIDTH 16
    typedef v_uint8x16  v_uint8;
    typedef v_int8x16   v_int8;
    typedef v_uint16x8  v_uint16;
@ -351,6 +358,8 @@ template<typename _Tp> struct V_RegTraits
    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
    #endif
    inline void vx_cleanup() { v_cleanup(); }
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
 #endif

 inline unsigned int trailingZeros32(unsigned int value) {
@ -380,6 +389,14 @@ inline unsigned int trailingZeros32(unsigned int value) {
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #endif

+#ifndef CV_SIMD_64F
+#define CV_SIMD_64F 0
+#endif
+
+#ifndef CV_SIMD
+#define CV_SIMD 0
+#endif
+
 } // cv::

 //! @endcond
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -494,7 +494,12 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
 {
    __m128i delta32 = _mm_set1_epi32(32768);
-    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
+
+    // preliminary saturate negative values to zero
+    __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
+    __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
+
+    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
 }

--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@ -1764,7 +1764,7 @@ typedef struct CvString
 }
 CvString;

-/** All the keys (names) of elements in the readed file storage
+/** All the keys (names) of elements in the read file storage
   are stored in the hash to speed up the lookup operations: */
 typedef struct CvStringHashNode
 {
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -453,9 +453,9 @@ struct Cvt_SIMD<int, uchar>
            {
                v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
                v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3);
-                v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2);
-                v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4);
-                v_store(dst + x, v_pack(v_dst1, v_dst2));
+                v_int16x8 v_dst1 = v_pack(v_src1, v_src2);
+                v_int16x8 v_dst2 = v_pack(v_src3, v_src4);
+                v_store(dst + x, v_pack_u(v_dst1, v_dst2));
            }
        }
        return x;
--- a/modules/core/src/datastructs.cpp
+++ b/modules/core/src/datastructs.cpp
@ -2779,7 +2779,7 @@ cvGraphAddEdgeByPtr( CvGraph* graph,

    if( start_vtx == end_vtx )
        CV_Error( start_vtx ? CV_StsBadArg : CV_StsNullPtr,
-        "vertex pointers coinside (or set to NULL)" );
+        "vertex pointers coincide (or set to NULL)" );

    edge = (CvGraphEdge*)cvSetNew( (CvSet*)(graph->edges) );
    assert( edge->flags >= 0 );
--- a/modules/core/src/merge.cpp
+++ b/modules/core/src/merge.cpp
@ -36,13 +36,14 @@ vecmerge_( const T** src, T* dst, int len, int cn )
    const T* src0 = src[0];
    const T* src1 = src[1];

+    const int dstElemSize = cn * sizeof(T);
    int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
    hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
    if( r != 0 )
    {
        mode = hal::STORE_UNALIGNED;
-        if( r % cn == 0 && len > VECSZ )
-            i0 = VECSZ - (r / cn);
+        if (r % dstElemSize == 0 && len > VECSZ*2)
+            i0 = VECSZ - (r / dstElemSize);
    }

    if( cn == 2 )
--- a/modules/core/src/persistence_c.cpp
+++ b/modules/core/src/persistence_c.cpp
@ -1063,7 +1063,7 @@ cvReadRawDataSlice( const CvFileStorage* fs, CvSeqReader* reader,
        CV_Error( CV_StsNullPtr, "Null pointer to reader or destination array" );

    if( !reader->seq && len != 1 )
-        CV_Error( CV_StsBadSize, "The readed sequence is a scalar, thus len must be 1" );
+        CV_Error( CV_StsBadSize, "The read sequence is a scalar, thus len must be 1" );

    fmt_pair_count = icvDecodeFormat( dt, fmt_pairs, CV_FS_MAX_FMT_PAIRS );
    size_t step = ::icvCalcStructSize(dt, 0);
--- a/modules/core/src/split.cpp
+++ b/modules/core/src/split.cpp
@ -27,8 +27,8 @@ vecsplit_( const T* src, T** dst, int len, int cn )
    if( (r0|r1|r2|r3) != 0 )
    {
        mode = hal::STORE_UNALIGNED;
-        if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
-            i0 = VECSZ - (r0 / cn);
+        if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % sizeof(T) == 0 && len > VECSZ*2 )
+            i0 = VECSZ - (r0 / sizeof(T));
    }

    if( cn == 2 )
--- a/modules/core/src/utils/filesystem.cpp
+++ b/modules/core/src/utils/filesystem.cpp
@ -469,7 +469,32 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
        {
            if (utils::fs::isDirectory(default_cache_path))
            {
-                default_cache_path = utils::fs::join(default_cache_path, utils::fs::join("opencv", CV_VERSION));
+                cv::String default_cache_path_base = utils::fs::join(default_cache_path, "opencv");
+                default_cache_path = utils::fs::join(default_cache_path_base, "4.0" CV_VERSION_STATUS);
+                if (utils::getConfigurationParameterBool("OPENCV_CACHE_SHOW_CLEANUP_MESSAGE", true)
+                    && !utils::fs::isDirectory(default_cache_path))
+                {
+                    std::vector<cv::String> existedCacheDirs;
+                    try
+                    {
+                        utils::fs::glob_relative(default_cache_path_base, "*", existedCacheDirs, false, true);
+                    }
+                    catch (...)
+                    {
+                        // ignore
+                    }
+                    if (!existedCacheDirs.empty())
+                    {
+                        CV_LOG_WARNING(NULL, "Creating new OpenCV cache directory: " << default_cache_path);
+                        CV_LOG_WARNING(NULL, "There are several neighbour directories, probably created by old OpenCV versions.");
+                        CV_LOG_WARNING(NULL, "Feel free to cleanup these unused directories:");
+                        for (size_t i = 0; i < existedCacheDirs.size(); i++)
+                        {
+                            CV_LOG_WARNING(NULL, "  - " << existedCacheDirs[i]);
+                        }
+                        CV_LOG_WARNING(NULL, "Note: This message is showed only once.");
+                    }
+                }
                if (sub_directory_name && sub_directory_name[0] != '\0')
                    default_cache_path = utils::fs::join(default_cache_path, cv::String(sub_directory_name) + native_separator);
                if (!utils::fs::createDirectories(default_cache_path))
--- a/modules/core/test/test_intrin.avx2.cpp
+++ b/modules/core/test/test_intrin.avx2.cpp
@ -1,5 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "test_intrin.simd.hpp"
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@ -2,101 +2,100 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
-#include "test_intrin.simd.hpp"

-#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp"
-#define CV_CPU_DISPATCH_MODE FP16
-#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
+#include "test_intrin128.simd.hpp"
+#include "test_intrin128.simd_declarations.hpp"
+
+#undef CV_CPU_DISPATCH_MODES_ALL
+
+#include "opencv2/core/cv_cpu_dispatch.h"
+#include "test_intrin256.simd.hpp"
+#include "test_intrin256.simd_declarations.hpp"

-#define CV_CPU_DISPATCH_MODE AVX2
-#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"

 namespace opencv_test { namespace hal {
-using namespace CV_CPU_OPTIMIZATION_NAMESPACE;

-TEST(hal_intrin, uint8x16)
-{ test_hal_intrin_uint8(); }
+#define CV_CPU_CALL_BASELINE_(fn, args)  CV_CPU_CALL_BASELINE(fn, args)

-TEST(hal_intrin, int8x16)
-{ test_hal_intrin_int8(); }
+#define DISPATCH_SIMD128(fn, cpu_opt) do { \
+    CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
+    throw SkipTestException("SIMD128 (" #cpu_opt ") is not available or disabled"); \
+} while(0)

-TEST(hal_intrin, uint16x8)
-{ test_hal_intrin_uint16(); }
+#define DISPATCH_SIMD256(fn, cpu_opt) do { \
+    CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
+    throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \
+} while(0)

-TEST(hal_intrin, int16x8)
-{ test_hal_intrin_int16(); }
+#define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \
+TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \
+TEST(hal_intrin ## simd_size, uint16x8_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint16, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int16x8_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int16, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int32x4_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int32, cpu_opt); } \
+TEST(hal_intrin ## simd_size, uint32x4_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint32, cpu_opt); } \
+TEST(hal_intrin ## simd_size, uint64x2_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint64, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int64x2_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int64, cpu_opt); } \
+TEST(hal_intrin ## simd_size, float32x4_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_float32, cpu_opt); } \
+TEST(hal_intrin ## simd_size, float64x2_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_float64, cpu_opt); } \

-TEST(hal_intrin, int32x4)
-{ test_hal_intrin_int32(); }
+namespace intrin128 {

-TEST(hal_intrin, uint32x4)
-{ test_hal_intrin_uint32(); }
+DEFINE_SIMD_TESTS(128, BASELINE)

-TEST(hal_intrin, uint64x2)
-{ test_hal_intrin_uint64(); }
+#if defined CV_CPU_DISPATCH_COMPILE_SSE2 || defined CV_CPU_BASELINE_COMPILE_SSE2
+DEFINE_SIMD_TESTS(128, SSE2)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSE3 || defined CV_CPU_BASELINE_COMPILE_SSE3
+DEFINE_SIMD_TESTS(128, SSE3)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSSE3 || defined CV_CPU_BASELINE_COMPILE_SSSE3
+DEFINE_SIMD_TESTS(128, SSSE3)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSE4_1 || defined CV_CPU_BASELINE_COMPILE_SSE4_1
+DEFINE_SIMD_TESTS(128, SSE4_1)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSE4_2 || defined CV_CPU_BASELINE_COMPILE_SSE4_2
+DEFINE_SIMD_TESTS(128, SSE4_2)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_AVX || defined CV_CPU_BASELINE_COMPILE_AVX
+DEFINE_SIMD_TESTS(128, AVX)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
+DEFINE_SIMD_TESTS(128, AVX2)
+#endif

-TEST(hal_intrin, int64x2)
-{ test_hal_intrin_int64(); }
-
-TEST(hal_intrin, float32x4)
-{ test_hal_intrin_float32(); }
-
-TEST(hal_intrin, float64x2)
-{ test_hal_intrin_float64(); }
-
-TEST(hal_intrin, float16x8)
+TEST(hal_intrin128, float16x8_FP16)
 {
    CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
    throw SkipTestException("Unsupported hardware: FP16 is not available");
 }

-#define DISPATCH_SIMD_MODES AVX2
-#define DISPATCH_SIMD_NAME "SIMD256"
-#define DISPATCH_SIMD(fun)                              \
-    do {                                                \
-        CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES);  \
-        throw SkipTestException(                        \
-            "Unsupported hardware: "                    \
-            DISPATCH_SIMD_NAME                          \
-            " is not available"                         \
-        );                                              \
-    } while(0)
+} // namespace intrin128

-TEST(hal_intrin256, uint8x32)
-{ DISPATCH_SIMD(test_hal_intrin_uint8); }

-TEST(hal_intrin256, int8x32)
-{ DISPATCH_SIMD(test_hal_intrin_int8); }
+namespace intrin256 {

-TEST(hal_intrin256, uint16x16)
-{ DISPATCH_SIMD(test_hal_intrin_uint16); }

-TEST(hal_intrin256, int16x16)
-{ DISPATCH_SIMD(test_hal_intrin_int16); }
+// Not available due to missing C++ backend for SIMD256
+//DEFINE_SIMD_TESTS(256, BASELINE)

-TEST(hal_intrin256, uint32x8)
-{ DISPATCH_SIMD(test_hal_intrin_uint32); }
+//#if defined CV_CPU_DISPATCH_COMPILE_AVX
+//DEFINE_SIMD_TESTS(256, AVX)
+//#endif

-TEST(hal_intrin256, int32x8)
-{ DISPATCH_SIMD(test_hal_intrin_int32); }
+#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
+DEFINE_SIMD_TESTS(256, AVX2)
+#endif

-TEST(hal_intrin256, uint64x4)
-{ DISPATCH_SIMD(test_hal_intrin_uint64); }
-
-TEST(hal_intrin256, int64x4)
-{ DISPATCH_SIMD(test_hal_intrin_int64); }
-
-TEST(hal_intrin256, float32x8)
-{ DISPATCH_SIMD(test_hal_intrin_float32); }
-
-TEST(hal_intrin256, float64x4)
-{ DISPATCH_SIMD(test_hal_intrin_float64); }
-
-TEST(hal_intrin256, float16x16)
+TEST(hal_intrin256, float16x16_FP16)
 {
-    if (!CV_CPU_HAS_SUPPORT_FP16)
-        throw SkipTestException("Unsupported hardware: FP16 is not available");
-    DISPATCH_SIMD(test_hal_intrin_float16);
+    //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
+    CV_CPU_CALL_AVX2_(test_hal_intrin_float16, ());
+    throw SkipTestException("Unsupported hardware: FP16 is not available");
 }

+
+} // namespace intrin256
+
 }} // namespace
--- a/modules/core/test/test_intrin.fp16.cpp
+++ b/modules/core/test/test_intrin.fp16.cpp
@ -1,19 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "test_intrin_utils.hpp"
-
-namespace opencv_test { namespace hal {
-CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-
-void test_hal_intrin_float16()
-{
-    TheTest<v_float16>()
-        .test_loadstore_fp16()
-        .test_float_cvt_fp16()
-        ;
-}
-
-CV_CPU_OPTIMIZATION_NAMESPACE_END
-}} // namespace
--- a/modules/core/test/test_intrin.simd.hpp
+++ b/modules/core/test/test_intrin.simd.hpp
@ -1,296 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "test_intrin_utils.hpp"
-
-namespace opencv_test { namespace hal {
-CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-
-void test_hal_intrin_uint8();
-void test_hal_intrin_int8();
-void test_hal_intrin_uint16();
-void test_hal_intrin_int16();
-void test_hal_intrin_uint32();
-void test_hal_intrin_int32();
-void test_hal_intrin_uint64();
-void test_hal_intrin_int64();
-void test_hal_intrin_float32();
-void test_hal_intrin_float64();
-
-#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-
-//=============  8-bit integer =====================================================================
-
-void test_hal_intrin_uint8()
-{
-    TheTest<v_uint8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-
-#if CV_SIMD256
-    TheTest<v_uint8>()
-        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
-        .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
-        .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
-        .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
-        ;
-#endif
-}
-
-void test_hal_intrin_int8()
-{
-    TheTest<v_int8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-
-//============= 16-bit integer =====================================================================
-
-void test_hal_intrin_uint16()
-{
-    TheTest<v_uint16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-void test_hal_intrin_int16()
-{
-    TheTest<v_int16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_dot_prod()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-//============= 32-bit integer =====================================================================
-
-void test_hal_intrin_uint32()
-{
-    TheTest<v_uint32>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_transpose()
-        ;
-}
-
-void test_hal_intrin_int32()
-{
-    TheTest<v_int32>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_abs()
-        .test_cmp()
-        .test_popcount()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_float_cvt32()
-        .test_float_cvt64()
-        .test_transpose()
-        ;
-}
-
-//============= 64-bit integer =====================================================================
-
-void test_hal_intrin_uint64()
-{
-    TheTest<v_uint64>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-void test_hal_intrin_int64()
-{
-    TheTest<v_int64>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-//============= Floating point =====================================================================
-void test_hal_intrin_float32()
-{
-    TheTest<v_float32>()
-        .test_loadstore()
-        .test_interleave()
-        .test_interleave_2channel()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt64()
-        .test_matmul()
-        .test_transpose()
-        .test_reduce_sum4()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        ;
-
-#if CV_SIMD256
-    TheTest<v_float32>()
-        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
-        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
-        ;
-#endif
-}
-
-void test_hal_intrin_float64()
-{
-#if CV_SIMD_64F
-    TheTest<v_float64>()
-        .test_loadstore()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt32()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-
-#if CV_SIMD256
-    TheTest<v_float64>()
-        .test_extract<2>().test_extract<3>()
-        .test_rotate<2>().test_rotate<3>()
-        ;
-#endif //CV_SIMD256
-
-#endif
-}
-
-#if CV_FP16 && CV_SIMD_WIDTH > 16
-void test_hal_intrin_float16()
-{
-    TheTest<v_float16>()
-        .test_loadstore_fp16()
-        .test_float_cvt_fp16()
-        ;
-}
-#endif
-
-#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-
-CV_CPU_OPTIMIZATION_NAMESPACE_END
-
-}} //namespace
--- a/modules/core/test/test_intrin128.simd.hpp
+++ b/modules/core/test/test_intrin128.simd.hpp
@ -0,0 +1,22 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#define CV__SIMD_FORCE_WIDTH 128
+#include "opencv2/core/hal/intrin.hpp"
+#undef CV__SIMD_FORCE_WIDTH
+
+#if CV_SIMD_WIDTH != 16
+#error "Invalid build configuration"
+#endif
+
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+namespace opencv_test { namespace hal { namespace intrin128 {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+#include "test_intrin_utils.hpp"
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}}} //namespace
--- a/modules/core/test/test_intrin256.simd.hpp
+++ b/modules/core/test/test_intrin256.simd.hpp
@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \
+    !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD256
+
+#define CV__SIMD_FORCE_WIDTH 256
+#include "opencv2/core/hal/intrin.hpp"
+#undef CV__SIMD_FORCE_WIDTH
+
+#if CV_SIMD_WIDTH != 32
+#error "Invalid build configuration"
+#endif
+
+#endif // !CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && !CV_DISABLE_OPTIMIZATION && CV_ENABLE_INTRINSICS
+
+namespace opencv_test { namespace hal { namespace intrin256 {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+#include "test_intrin_utils.hpp"
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}}} //namespace
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1,10 +1,22 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
-#include "opencv2/core/hal/intrin.hpp"

-namespace opencv_test { namespace hal {
-CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// This file is not standalone.
+// It is included with these active namespaces:
+//namespace opencv_test { namespace hal { namespace intrinXXX {
+//CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+void test_hal_intrin_uint8();
+void test_hal_intrin_int8();
+void test_hal_intrin_uint16();
+void test_hal_intrin_int16();
+void test_hal_intrin_uint32();
+void test_hal_intrin_int32();
+void test_hal_intrin_uint64();
+void test_hal_intrin_int64();
+void test_hal_intrin_float32();
+void test_hal_intrin_float64();

 void test_hal_intrin_float16();

@ -258,6 +270,7 @@ template<typename R> struct TheTest
        v_store(out.u.d, r_low);
        for (int i = 0; i < R::nlanes/2; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
        }

@ -266,6 +279,7 @@ template<typename R> struct TheTest
        v_store(out.u.d, r_low_align8byte);
        for (int i = 0; i < R::nlanes/2; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
        }

@ -296,6 +310,7 @@ template<typename R> struct TheTest
        resV.fill((LaneType)8);
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((LaneType)0, resZ[i]);
            EXPECT_EQ((LaneType)8, resV[i]);
        }
@ -342,6 +357,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(data1, Data<R>(a));
            EXPECT_EQ(data2, Data<R>(b));
            EXPECT_EQ(data3, Data<R>(c));
@ -374,6 +390,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(data1, Data<R>(a));
            EXPECT_EQ(data2, Data<R>(b));
        }
@ -397,6 +414,7 @@ template<typename R> struct TheTest
        const int n = Rx2::nlanes;
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i], resB[i]);
            EXPECT_EQ(dataA[i], resC[i]);
            EXPECT_EQ(dataA[i + n], resD[i]);
@ -412,7 +430,10 @@ template<typename R> struct TheTest
        Data<Rx4> out = vx_load_expand_q(data.d);
        const int n = Rx4::nlanes;
        for (int i = 0; i < n; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(data[i], out[i]);
+        }

        return *this;
    }
@ -426,6 +447,7 @@ template<typename R> struct TheTest
        Data<R> resC = a + b, resD = a - b;
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
        }
@ -443,6 +465,7 @@ template<typename R> struct TheTest
                resD = v_sub_wrap(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
            EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
        }
@ -458,6 +481,7 @@ template<typename R> struct TheTest
        Data<R> resC = a * b;
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
        }

@ -473,6 +497,7 @@ template<typename R> struct TheTest
        Data<R> resC = a / b;
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
        }

@ -492,6 +517,7 @@ template<typename R> struct TheTest
        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
            EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
        }
@ -511,6 +537,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < Ru::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]);
        }

@ -529,6 +556,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(static_cast<LaneType>(dataA[i] << s), resB[i]);
            EXPECT_EQ(static_cast<LaneType>(dataA[i] << s), resC[i]);
            EXPECT_EQ(static_cast<LaneType>(dataA[i] >> s), resD[i]);
@ -553,6 +581,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
            EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
            EXPECT_EQ(dataA[i] >  dataB[i], resE[i] != 0);
@ -583,6 +612,7 @@ template<typename R> struct TheTest
        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
        }
@ -597,6 +627,7 @@ template<typename R> struct TheTest
        Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
            EXPECT_EQ(dataA[i] | dataB[i], resD[i]);
            EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]);
@ -615,6 +646,7 @@ template<typename R> struct TheTest
        Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_COMPARE_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
            EXPECT_COMPARE_EQ(1/(float)std::sqrt(dataA[i]), (float)resC[i]);
            EXPECT_COMPARE_EQ((float)abs(dataA[i]), (float)resE[i]);
@ -632,6 +664,7 @@ template<typename R> struct TheTest
        Data<R> resC = v_min(a, b), resD = v_max(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
            EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]);
        }
@ -672,6 +705,7 @@ template<typename R> struct TheTest
        const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
        for (int i = 0; i < Ru::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            u_type uA = dataA[i] ^ mask;
            u_type uB = dataB[i] ^ mask;
            EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]);
@ -691,6 +725,7 @@ template<typename R> struct TheTest
        Data<R> resC = v_absdiff(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
        }
        return *this;
@ -744,6 +779,7 @@ template<typename R> struct TheTest
        Data<R> resF = f;
        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            int_type m2 = dataB.as_int(i);
            EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i));
        }
@ -776,6 +812,7 @@ template<typename R> struct TheTest
        const w_type add = (w_type)1 << (s - 1);
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(pack_saturate_cast<LaneType>(dataA[i]), resC[i]);
            EXPECT_EQ(pack_saturate_cast<LaneType>(dataB[i]), resC[i + n]);
            EXPECT_EQ(pack_saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
@ -816,6 +853,7 @@ template<typename R> struct TheTest
        const w_type add = (w_type)1 << (s - 1);
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(pack_saturate_cast<LaneType>(dataA[i]), resC[i]);
            EXPECT_EQ(pack_saturate_cast<LaneType>(dataB[i]), resC[i + n]);
            EXPECT_EQ(pack_saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
@ -845,6 +883,7 @@ template<typename R> struct TheTest
        const int n = R::nlanes/2;
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i], resC[i*2]);
            EXPECT_EQ(dataB[i], resC[i*2+1]);
            EXPECT_EQ(dataA[i+n], resD[i*2]);
@ -876,6 +915,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            if (i + s >= R::nlanes)
                EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
            else
@ -901,6 +941,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            if (i + s >= R::nlanes)
            {
                EXPECT_EQ((LaneType)0, resC[i]);
@ -940,6 +981,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(cvRound(data1[i]), resB[i]);
            EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
            EXPECT_EQ(cvFloor(data1[i]), resD[i]);
@ -964,6 +1006,7 @@ template<typename R> struct TheTest
        int n = std::min<int>(Rt::nlanes, R::nlanes);
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
        }
        return *this;
@ -983,10 +1026,12 @@ template<typename R> struct TheTest
        int n = std::min<int>(Rt::nlanes, R::nlanes);
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
        }
        for (int i = 0; i < n; ++i)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((typename Rt::lane_type)dataA[i+n], resC[i]);
        }
 #endif
@ -1006,6 +1051,7 @@ template<typename R> struct TheTest
        {
            for (int j = i; j < i + 4; ++j)
            {
+                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
                LaneType val = dataV[i]     * dataA[j]
                             + dataV[i + 1] * dataB[j]
                             + dataV[i + 2] * dataC[j]
@ -1019,6 +1065,7 @@ template<typename R> struct TheTest
        {
            for (int j = i; j < i + 4; ++j)
            {
+                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
                LaneType val = dataV[i]     * dataA[j]
                             + dataV[i + 1] * dataB[j]
                             + dataV[i + 2] * dataC[j]
@ -1045,6 +1092,7 @@ template<typename R> struct TheTest
        {
            for (int j = 0; j < 4; ++j)
            {
+                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
                EXPECT_EQ(dataA[i + j], res[j][i]);
                EXPECT_EQ(dataB[i + j], res[j][i + 1]);
                EXPECT_EQ(dataC[i + j], res[j][i + 2]);
@ -1066,6 +1114,7 @@ template<typename R> struct TheTest

        for (int i = 0; i < R::nlanes; i += 4)
        {
+            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]);
            EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]);
            EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]);
@ -1121,7 +1170,304 @@ template<typename R> struct TheTest

 };

+
+#if 1
+#define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*(int)sizeof(v_uint8), CV__TRACE_FUNCTION);
 #endif

-CV_CPU_OPTIMIZATION_NAMESPACE_END
-}} // namespace
+//=============  8-bit integer =====================================================================
+
+void test_hal_intrin_uint8()
+{
+    DUMP_ENTRY(v_uint8);
+    TheTest<v_uint8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+
+#if CV_SIMD_WIDTH == 32
+    TheTest<v_uint8>()
+        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
+        .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
+        .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
+        .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
+        ;
+#endif
+}
+
+void test_hal_intrin_int8()
+{
+    DUMP_ENTRY(v_int8);
+    TheTest<v_int8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+}
+
+//============= 16-bit integer =====================================================================
+
+void test_hal_intrin_uint16()
+{
+    DUMP_ENTRY(v_uint16);
+    TheTest<v_uint16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+
+void test_hal_intrin_int16()
+{
+    DUMP_ENTRY(v_int16);
+    TheTest<v_int16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_dot_prod()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+
+//============= 32-bit integer =====================================================================
+
+void test_hal_intrin_uint32()
+{
+    DUMP_ENTRY(v_uint32);
+    TheTest<v_uint32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_transpose()
+        ;
+}
+
+void test_hal_intrin_int32()
+{
+    DUMP_ENTRY(v_int32);
+    TheTest<v_int32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_abs()
+        .test_cmp()
+        .test_popcount()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_float_cvt32()
+        .test_float_cvt64()
+        .test_transpose()
+        ;
+}
+
+//============= 64-bit integer =====================================================================
+
+void test_hal_intrin_uint64()
+{
+    DUMP_ENTRY(v_uint64);
+    TheTest<v_uint64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+
+void test_hal_intrin_int64()
+{
+    DUMP_ENTRY(v_int64);
+    TheTest<v_int64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+
+//============= Floating point =====================================================================
+void test_hal_intrin_float32()
+{
+    DUMP_ENTRY(v_float32);
+    TheTest<v_float32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_interleave_2channel()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt64()
+        .test_matmul()
+        .test_transpose()
+        .test_reduce_sum4()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        ;
+
+#if CV_SIMD_WIDTH == 32
+    TheTest<v_float32>()
+        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
+        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
+        ;
+#endif
+}
+
+void test_hal_intrin_float64()
+{
+    DUMP_ENTRY(v_float64);
+#if CV_SIMD_64F
+    TheTest<v_float64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt32()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+
+#if CV_SIMD_WIDTH == 32
+    TheTest<v_float64>()
+        .test_extract<2>().test_extract<3>()
+        .test_rotate<2>().test_rotate<3>()
+        ;
+#endif // CV_SIMD_WIDTH == 32
+
+#endif
+}
+
+#if CV_FP16
+void test_hal_intrin_float16()
+{
+    DUMP_ENTRY(v_float16);
+#if CV_SIMD_WIDTH > 16
+    TheTest<v_float16>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+#endif
+}
+#endif
+
+/*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16
+void test_hal_intrin_float16()
+{
+    TheTest<v_float16>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+}
+#endif*/
+
+#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+//CV_CPU_OPTIMIZATION_NAMESPACE_END
+//}}} // namespace
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@ -1814,4 +1814,62 @@ BIGDATA_TEST(Mat, push_back_regression_4158)  // memory usage: ~10.6 Gb
    }
 }

+
+TEST(Core_Merge, hang_12171)
+{
+    Mat src1(4, 24, CV_8UC1, Scalar::all(1));
+    Mat src2(4, 24, CV_8UC1, Scalar::all(2));
+    Rect src_roi(0, 0, 23, 4);
+    Mat src_channels[2] = { src1(src_roi), src2(src_roi) };
+    Mat dst(4, 24, CV_8UC2, Scalar::all(5));
+    Rect dst_roi(1, 0, 23, 4);
+    cv::merge(src_channels, 2, dst(dst_roi));
+    EXPECT_EQ(5, dst.ptr<uchar>()[0]);
+    EXPECT_EQ(5, dst.ptr<uchar>()[1]);
+    EXPECT_EQ(1, dst.ptr<uchar>()[2]);
+    EXPECT_EQ(2, dst.ptr<uchar>()[3]);
+    EXPECT_EQ(5, dst.ptr<uchar>(1)[0]);
+    EXPECT_EQ(5, dst.ptr<uchar>(1)[1]);
+    EXPECT_EQ(1, dst.ptr<uchar>(1)[2]);
+    EXPECT_EQ(2, dst.ptr<uchar>(1)[3]);
+}
+
+TEST(Core_Split, hang_12171)
+{
+    Mat src(4, 24, CV_8UC2, Scalar(1,2,3,4));
+    Rect src_roi(0, 0, 23, 4);
+    Mat dst1(4, 24, CV_8UC1, Scalar::all(5));
+    Mat dst2(4, 24, CV_8UC1, Scalar::all(10));
+    Rect dst_roi(0, 0, 23, 4);
+    Mat dst[2] = { dst1(dst_roi), dst2(dst_roi) };
+    cv::split(src(src_roi), dst);
+    EXPECT_EQ(1, dst1.ptr<uchar>()[0]);
+    EXPECT_EQ(1, dst1.ptr<uchar>()[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>()[0]);
+    EXPECT_EQ(2, dst2.ptr<uchar>()[1]);
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[0]);
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[0]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[1]);
+}
+
+TEST(Core_Split, crash_12171)
+{
+    Mat src(4, 40, CV_8UC2, Scalar(1,2,3,4));
+    Rect src_roi(0, 0, 39, 4);
+    Mat dst1(4, 40, CV_8UC1, Scalar::all(5));
+    Mat dst2(4, 40, CV_8UC1, Scalar::all(10));
+    Rect dst_roi(0, 0, 39, 4);
+    Mat dst[2] = { dst1(dst_roi), dst2(dst_roi) };
+    cv::split(src(src_roi), dst);
+    EXPECT_EQ(1, dst1.ptr<uchar>()[0]);
+    EXPECT_EQ(1, dst1.ptr<uchar>()[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>()[0]);
+    EXPECT_EQ(2, dst2.ptr<uchar>()[1]);
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[0]);
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[0]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[1]);
+}
+
 }} // namespace
--- a/modules/core/test/test_precomp.hpp
+++ b/modules/core/test/test_precomp.hpp
@ -11,6 +11,5 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/hal/hal.hpp"
-#include "opencv2/core/hal/intrin.hpp"

 #endif
--- a/modules/cudafilters/src/cuda/median_filter.cu
+++ b/modules/cudafilters/src/cuda/median_filter.cu
@ -246,7 +246,7 @@ namespace cv { namespace cuda { namespace device
        }
        __syncthreads();

-        // Fot all remaining rows in the median filter, add the values to the the histogram
+        // For all remaining rows in the median filter, add the values to the histogram
        for (int j=threadIdx.x; j<cols; j+=blockDim.x){
            for(int i=initStartRow; i<initStopRow; i++){
                    int pos=::min(i,rows-1);
--- a/modules/cudaimgproc/src/mssegmentation.cpp
+++ b/modules/cudaimgproc/src/mssegmentation.cpp
@ -342,7 +342,7 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp,
        }
    }

-    // Sort all graph's edges connecting different components (in asceding order)
+    // Sort all graph's edges connecting different components (in ascending order)
    std::sort(edges.begin(), edges.end());

    // Exclude small components (starting from the nearest couple)
--- a/modules/cudawarping/test/test_warp_affine.cpp
+++ b/modules/cudawarping/test/test_warp_affine.cpp
@ -48,7 +48,7 @@ namespace opencv_test { namespace {

 namespace
 {
-    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    cv::Mat createTransformMatrix(cv::Size srcSize, double angle)
    {
        cv::Mat M(2, 3, CV_64FC1);

@ -80,7 +80,7 @@ PARAM_TEST_CASE(BuildWarpAffineMaps, cv::cuda::DeviceInfo, cv::Size, Inverse)

 CUDA_TEST_P(BuildWarpAffineMaps, Accuracy)
 {
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 4);
    cv::Mat src = randomMat(randomSize(200, 400), CV_8UC1);

    cv::cuda::GpuMat xmap, ymap;
@ -207,7 +207,7 @@ PARAM_TEST_CASE(WarpAffine, cv::cuda::DeviceInfo, cv::Size, MatType, Inverse, In
 CUDA_TEST_P(WarpAffine, Accuracy)
 {
    cv::Mat src = randomMat(size, type);
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 3);
    int flags = interpolation;
    if (inverse)
        flags |= cv::WARP_INVERSE_MAP;
@ -257,7 +257,7 @@ CUDA_TEST_P(WarpAffineNPP, Accuracy)
    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
    ASSERT_FALSE(src.empty());

-    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
+    cv::Mat M = createTransformMatrix(src.size(), CV_PI / 4);
    int flags = interpolation;
    if (inverse)
        flags |= cv::WARP_INVERSE_MAP;
--- a/modules/cudawarping/test/test_warp_perspective.cpp
+++ b/modules/cudawarping/test/test_warp_perspective.cpp
@ -48,7 +48,7 @@ namespace opencv_test { namespace {

 namespace
 {
-    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    cv::Mat createTransformMatrix(cv::Size srcSize, double angle)
    {
        cv::Mat M(3, 3, CV_64FC1);

@ -81,7 +81,7 @@ PARAM_TEST_CASE(BuildWarpPerspectiveMaps, cv::cuda::DeviceInfo, cv::Size, Invers

 CUDA_TEST_P(BuildWarpPerspectiveMaps, Accuracy)
 {
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 4);

    cv::cuda::GpuMat xmap, ymap;
    cv::cuda::buildWarpPerspectiveMaps(M, inverse, size, xmap, ymap);
@ -210,7 +210,7 @@ PARAM_TEST_CASE(WarpPerspective, cv::cuda::DeviceInfo, cv::Size, MatType, Invers
 CUDA_TEST_P(WarpPerspective, Accuracy)
 {
    cv::Mat src = randomMat(size, type);
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 3);
    int flags = interpolation;
    if (inverse)
        flags |= cv::WARP_INVERSE_MAP;
@ -260,7 +260,7 @@ CUDA_TEST_P(WarpPerspectiveNPP, Accuracy)
    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
    ASSERT_FALSE(src.empty());

-    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
+    cv::Mat M = createTransformMatrix(src.size(), CV_PI / 4);
    int flags = interpolation;
    if (inverse)
        flags |= cv::WARP_INVERSE_MAP;
--- a/modules/cudev/test/test_warp.cu
+++ b/modules/cudev/test/test_warp.cu
@ -199,7 +199,7 @@ TEST(Resize, Downscale)

 // warpAffine & warpPerspective

-Mat createAffineTransfomMatrix(Size srcSize, float angle, bool perspective)
+Mat createAffineTransformMatrix(Size srcSize, float angle, bool perspective)
 {
    cv::Mat M(perspective ? 3 : 2, 3, CV_32FC1);

@ -220,7 +220,7 @@ TEST(WarpAffine, Rotation)
    const Size size = randomSize(100, 400);

    Mat src = randomMat(size, CV_32FC1, 0, 1);
-    Mat M = createAffineTransfomMatrix(size, static_cast<float>(CV_PI / 4), false);
+    Mat M = createAffineTransformMatrix(size, static_cast<float>(CV_PI / 4), false);

    GpuMat_<float> d_src(src);
    GpuMat_<float> d_M;
@ -240,7 +240,7 @@ TEST(WarpPerspective, Rotation)
    const Size size = randomSize(100, 400);

    Mat src = randomMat(size, CV_32FC1, 0, 1);
-    Mat M = createAffineTransfomMatrix(size, static_cast<float>(CV_PI / 4), true);
+    Mat M = createAffineTransformMatrix(size, static_cast<float>(CV_PI / 4), true);

    GpuMat_<float> d_src(src);
    GpuMat_<float> d_M;
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@ -489,7 +489,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
        static Ptr<EltwiseLayer> create(const LayerParams &params);
    };

-    class CV_EXPORTS BatchNormLayer : public Layer
+    class CV_EXPORTS BatchNormLayer : public ActivationLayer
    {
    public:
        bool hasWeights, hasBias;
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@ -258,6 +258,17 @@ PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
    processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
 }

+PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
+{
+    if (backend == DNN_BACKEND_HALIDE ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
+        throw SkipTestException("");
+    processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
+               "dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", "",
+               Mat(cv::Size(800, 600), CV_32FC3));
+}
+
 const tuple<DNNBackend, DNNTarget> testCases[] = {
 #ifdef HAVE_HALIDE
    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -1408,7 +1408,7 @@ struct Net::Impl
            bool fused = ld.skip;

            Ptr<Layer> layer = ld.layerInstance;
-            if (!layer->supportBackend(preferableBackend))
+            if (!fused && !layer->supportBackend(preferableBackend))
            {
                addInfEngineNetOutputs(ld);
                net = Ptr<InfEngineBackendNet>();
@ -1471,6 +1471,8 @@ struct Net::Impl
            {
                node = layer->initInfEngine(ld.inputBlobsWrappers);
            }
+            else if (node.empty())
+                continue;

            CV_Assert(!node.empty());
            ld.backendNodes[preferableBackend] = node;
@ -1715,40 +1717,41 @@ struct Net::Impl
                if (preferableBackend != DNN_BACKEND_OPENCV)
                    continue;  // Go to the next layer.

-                // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
-                if ( !IS_DNN_OPENCL_TARGET(preferableTarget) ||
-                     (IS_DNN_OPENCL_TARGET(preferableTarget) &&
-                         nextData &&
-                        ((nextData->type == "ReLU") ||
-                         (nextData->type == "ChannelsPReLU") ||
-                         (nextData->type == "ReLU6") ||
-                         (nextData->type == "TanH") ||
-                         (nextData->type == "Power"))) )
+                while (nextData)
                {
+                    // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations
+                    if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
+                        nextData->type != "ReLU" &&
+                        nextData->type != "ChannelsPReLU" &&
+                        nextData->type != "ReLU6" &&
+                        nextData->type != "TanH" &&
+                        nextData->type != "Power")
+                        break;

-                    Ptr<ActivationLayer> nextActivLayer;
+                    Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+                    if (nextActivLayer.empty())
+                        break;

-                    if( nextData )
-                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
-                    if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
-                            && currLayer->setActivation(nextActivLayer) )
+                    if (currLayer->setActivation(nextActivLayer))
                    {
-                        LayerData *activData = nextData;
                        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
-                        activData->skip = true;
+                        nextData->skip = true;
                        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                        ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
-
-                        if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
+                        if (nextData->consumers.size() == 1)
                        {
-                            if ( !activData->consumers.empty() )
-                            {
-                                nextData = &layers[activData->consumers[0].lid];
-                                lpNext = LayerPin(activData->consumers[0].lid, 0);
-                            }
+                            int nextLayerId = nextData->consumers[0].lid;
+                            nextData = &layers[nextLayerId];
+                            lpNext = LayerPin(nextLayerId, 0);
+                        }
+                        else
+                        {
+                            nextData = 0;
+                            break;
                        }
                    }
+                    else
+                        break;
                }

                // fuse convolution layer followed by eltwise + relu
@ -2050,10 +2053,10 @@ struct Net::Impl
        TickMeter tm;
        tm.start();

-        if (preferableBackend == DNN_BACKEND_OPENCV ||
-            !layer->supportBackend(preferableBackend))
+        if( !ld.skip )
        {
-            if( !ld.skip )
+            std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
+            if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
            {
                if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                {
@ -2196,24 +2199,25 @@ struct Net::Impl
                }
            }
            else
-                tm.reset();
-        }
-        else if (!ld.skip)
-        {
-            Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
-            if (preferableBackend == DNN_BACKEND_HALIDE)
            {
-                forwardHalide(ld.outputBlobsWrappers, node);
-            }
-            else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
-            {
-                forwardInfEngine(node);
-            }
-            else
-            {
-                CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
+                Ptr<BackendNode> node = it->second;
+                CV_Assert(!node.empty());
+                if (preferableBackend == DNN_BACKEND_HALIDE)
+                {
+                    forwardHalide(ld.outputBlobsWrappers, node);
+                }
+                else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
+                {
+                    forwardInfEngine(node);
+                }
+                else
+                {
+                    CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
+                }
            }
        }
+        else
+            tm.reset();

        tm.stop();
        layersTimings[ld.id] = tm.getTimeTicks();
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@ -268,6 +268,36 @@ public:
        }
    }

+    void forwardSlice(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
+    {   // Per-channel affine transform: dst[i] = w * src[i] + b for every channel cn in [cn0, cn1).
+        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
+        {
+            int i = 0;
+            float w = weights_.at<float>(cn);
+            float b = bias_.at<float>(cn);
+#if CV_SIMD128
+            v_float32x4 wV = v_setall_f32(w), bV = v_setall_f32(b);  // broadcast scale/shift to all lanes
+            for( ; i <= len - 16; i += 16 )  // 16 floats (four 4-lane vectors) per iteration
+            {
+                v_float32x4 x0 = v_load(srcptr + i);
+                v_float32x4 x1 = v_load(srcptr + i + 4);
+                v_float32x4 x2 = v_load(srcptr + i + 8);
+                v_float32x4 x3 = v_load(srcptr + i + 12);
+                x0 = v_muladd(x0, wV, bV);  // must use the broadcast vectors, not the scalar w/b
+                x1 = v_muladd(x1, wV, bV);
+                x2 = v_muladd(x2, wV, bV);
+                x3 = v_muladd(x3, wV, bV);
+                v_store(dstptr + i, x0);
+                v_store(dstptr + i + 4, x1);
+                v_store(dstptr + i + 8, x2);
+                v_store(dstptr + i + 12, x3);
+            }
+#endif
+            for( ; i < len; i++ )  // scalar path: remaining elements (or all of them without SIMD)
+                dstptr[i] = w * srcptr[i] + b;
+        }
+    }
+
    virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
    {
        switch (node->backendId)
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -296,6 +296,9 @@ public:

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
+        if (!activ.empty() && !layer.empty())
+            return false;
+
        activ = layer;
        if (activ.empty())
            reluslope.clear();
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@ -196,7 +196,7 @@ public:
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized;
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized && !_clip;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@ -452,8 +452,13 @@ public:

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
-        activ = layer;
-        return !activ.empty();
+        if (activ.empty() || layer.empty())
+        {
+            activ = layer;
+            return !activ.empty();
+        }
+        else
+            return false;
    }

    Ptr<ActivationLayer> activ;
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -135,8 +135,13 @@ public:

    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
-        activ = layer;
-        return !activ.empty();
+        if (activ.empty() || layer.empty())
+        {
+            activ = layer;
+            return !activ.empty();
+        }
+        else
+            return false;
    }

    class FullyConnected : public ParallelLoopBody
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@ -42,6 +42,7 @@

 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "../op_inf_engine.hpp"
 #include <opencv2/dnn/shape_utils.hpp>

 #ifdef HAVE_OPENCL
@ -66,27 +67,25 @@ public:
        fuse_batch_norm = false;
        fuse_relu = false;
        relu_slope = 0.f;
+        zeroDev = false;
    }

    Mat scale, shift;
    bool fuse_batch_norm;

-    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
-    {
-        if (!fuse_batch_norm)
-        {
-            top->getScaleShift(scale, shift);
-            fuse_batch_norm = !scale.empty() || !shift.empty();
-            return fuse_batch_norm;
-        }
-        return false;
-    }
-
    Ptr<ReLULayer> activ_relu;
    float relu_slope;
    bool fuse_relu;
+    bool zeroDev;  // TODO: Isn't taken into account by Intel's Inference Engine backend.
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
+        if (!layer.empty() && !fuse_relu && !fuse_batch_norm)
+        {
+            layer->getScaleShift(scale, shift);
+            fuse_batch_norm = !scale.empty() || !shift.empty();
+            return fuse_batch_norm;
+        }
+
        if (!layer.empty() && preferableTarget == DNN_TARGET_OPENCL)
        {
            activ_relu = layer.dynamicCast<ReLULayer>();
@ -97,6 +96,23 @@ public:
        return fuse_relu;
    }

+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE
+    {   // Precomputes zeroDev from the shape of the first input; outputs is not used here.
+        int splitDim = (acrossChannels) ? 1 : 2;  // number of leading dimensions folded into rows
+        int i, newRows = 1;
+        for( i = 0; i < splitDim; i++ )
+            newRows *= inputs[0]->size[i];  // product of the first splitDim dimension sizes
+        zeroDev = inputs[0]->total() == newRows;  // true when the element count equals the row count
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+            return !zeroDev && (preferableTarget == DNN_TARGET_CPU || eps <= 1e-7f);
+        else
+            return backendId == DNN_BACKEND_OPENCV;
+    }
+
 #ifdef HAVE_OPENCL
    bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
    {
@ -324,6 +340,22 @@ public:
        }
    }

+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    {
+#ifdef HAVE_INF_ENGINE
+        InferenceEngine::LayerParams lp;
+        lp.name = name;
+        lp.type = "MVN";
+        lp.precision = InferenceEngine::Precision::FP32;
+        std::shared_ptr<InferenceEngine::MVNLayer> ieLayer(new InferenceEngine::MVNLayer(lp));
+        ieLayer->params["across_channels"] = acrossChannels ? "1" : "0";
+        ieLayer->params["normalize_variance"] = normVariance ? "1" : "0";
+        ieLayer->params["eps"] = format("%f", eps);
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif  // HAVE_INF_ENGINE
+        return Ptr<BackendNode>();
+    }
+
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@ -48,9 +48,8 @@ public:

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
-        return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_HALIDE && haveHalide() ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && axis == 1;
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -111,7 +111,7 @@ public:
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1;
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1 && sliceRanges[0].size() == 4;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@ -307,15 +307,17 @@ public:
        return Ptr<BackendNode>();
    }

-    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
+        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
+
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "SoftMax";
        lp.precision = InferenceEngine::Precision::FP32;
        std::shared_ptr<InferenceEngine::SoftMaxLayer> ieLayer(new InferenceEngine::SoftMaxLayer(lp));
-        ieLayer->axis = axisRaw;
+        ieLayer->axis = clamp(axisRaw, input->dims.size());
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 #endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -248,39 +248,38 @@ convolve_simd(

  int curr_y = or * STRIDE_Y;
  int curr_x = oc * STRIDE_X + lid;
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-  int saved_y = curr_y;
-#endif
+
  int in_addr = input_batch_offset
                +  (curr_y - INPUT_PAD_H) * INPUT_WIDTH          // y tile offset
                +   curr_x - INPUT_PAD_W;                        // x tile offset

+  const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;
+
  Dtype in_buf[INVEC_SIZE];

  for(int kd = 0; kd < INPUT_DEPTH; kd++)
  {
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+    const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);
    int in_offset = in_addr;
    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
-    for (int reg = 0; reg < INVEC_SIZE; reg++)
+    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
    {
-        in_buf[reg] = inputs[in_offset];
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-        if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
-              curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
-        {
-          in_buf[reg] = 0;
-        }
-#endif
-        curr_y += 1;
-        in_offset += INPUT_WIDTH;
+      Dtype input = inputs[clamp(in_offset, 0, in_limit)];
+      int cy = curr_y + reg;
+      in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;
    }
+#else
+    int in_offset = in_addr;
+    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
+    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
+    {
+      in_buf[reg] = inputs[min(in_offset, in_limit)];
+    }
+#endif

    in_addr += INPUT_PITCH;

-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-    curr_y = saved_y;
-#endif
-
    Dtype weight_buf[WEIGHT_PREF];
    int w_idx=0;

--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -716,6 +716,8 @@ void TFImporter::populateNet(Net dstNet)

    // find all Const layers for params
    std::map<String, int> value_id;
+    // A map with constant blobs which are shared between multiple layers.
+    std::map<String, Mat> sharedWeights;
    addConstNodes(netBin, value_id, layers_to_ignore);
    addConstNodes(netTxt, value_id, layers_to_ignore);

@ -805,51 +807,64 @@ void TFImporter::populateNet(Net dstNet)
                }
            }

-            const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id);
-            kernelFromTensor(kernelTensor, layerParams.blobs[0]);
-            releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
-            int* kshape = layerParams.blobs[0].size.p;
-            const int outCh = kshape[0];
-            const int inCh = kshape[1];
-            const int height = kshape[2];
-            const int width = kshape[3];
-            if (type == "DepthwiseConv2dNative")
+            int kernelTensorInpId = -1;
+            const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernelTensorInpId);
+            const String kernelTensorName = layer.input(kernelTensorInpId);
+            std::map<String, Mat>::iterator sharedWeightsIt = sharedWeights.find(kernelTensorName);
+            if (sharedWeightsIt == sharedWeights.end())
            {
-                CV_Assert(!locPredTransposed);
-                const int chMultiplier = kshape[0];
+                kernelFromTensor(kernelTensor, layerParams.blobs[0]);
+                releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));

-                Mat copy = layerParams.blobs[0].clone();
-                float* src = (float*)copy.data;
-                float* dst = (float*)layerParams.blobs[0].data;
-                for (int i = 0; i < chMultiplier; ++i)
-                    for (int j = 0; j < inCh; ++j)
-                        for (int s = 0; s < height * width; ++s)
-                            {
-                                int src_i = (i * inCh + j) * height * width + s;
-                                int dst_i = (j * chMultiplier + i) * height* width + s;
-                                dst[dst_i] = src[src_i];
-                            }
-                // TODO Use reshape instead
-                kshape[0] = inCh * chMultiplier;
-                kshape[1] = 1;
-                size_t* kstep = layerParams.blobs[0].step.p;
-                kstep[0] = kstep[1]; // fix steps too
-            }
-            layerParams.set("kernel_h", height);
-            layerParams.set("kernel_w", width);
-            layerParams.set("num_output", outCh);
-
-            // Shuffle output channels from yxYX to xyXY.
-            if (locPredTransposed)
-            {
-                const int slice = height * width * inCh;
-                for (int i = 0; i < outCh; i += 2)
+                int* kshape = layerParams.blobs[0].size.p;
+                const int outCh = kshape[0];
+                const int inCh = kshape[1];
+                const int height = kshape[2];
+                const int width = kshape[3];
+                if (type == "DepthwiseConv2dNative")
                {
-                    cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
-                    cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
-                    std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+                    CV_Assert(!locPredTransposed);
+                    const int chMultiplier = kshape[0];
+
+                    Mat copy = layerParams.blobs[0].clone();
+                    float* src = (float*)copy.data;
+                    float* dst = (float*)layerParams.blobs[0].data;
+                    for (int i = 0; i < chMultiplier; ++i)
+                        for (int j = 0; j < inCh; ++j)
+                            for (int s = 0; s < height * width; ++s)
+                                {
+                                    int src_i = (i * inCh + j) * height * width + s;
+                                    int dst_i = (j * chMultiplier + i) * height* width + s;
+                                    dst[dst_i] = src[src_i];
+                                }
+                    // TODO Use reshape instead
+                    kshape[0] = inCh * chMultiplier;
+                    kshape[1] = 1;
+                    size_t* kstep = layerParams.blobs[0].step.p;
+                    kstep[0] = kstep[1]; // fix steps too
                }
+
+                // Shuffle output channels from yxYX to xyXY.
+                if (locPredTransposed)
+                {
+                    const int slice = height * width * inCh;
+                    for (int i = 0; i < outCh; i += 2)
+                    {
+                        cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
+                        cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
+                        std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+                    }
+                }
+                sharedWeights[kernelTensorName] = layerParams.blobs[0];
            }
+            else
+            {
+                layerParams.blobs[0] = sharedWeightsIt->second;
+            }
+
+            layerParams.set("kernel_h", layerParams.blobs[0].size[2]);
+            layerParams.set("kernel_w", layerParams.blobs[0].size[3]);
+            layerParams.set("num_output", layerParams.blobs[0].size[0]);

            setStrides(layerParams, layer);
            setPadding(layerParams, layer);
@ -954,6 +969,13 @@ void TFImporter::populateNet(Net dstNet)
        {
            CV_Assert(layer.input_size() == 2);

+            // For the object detection networks, TensorFlow Object Detection API
+            // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
+            // order. We can manage it at DetectionOutput layer parsing predictions
+            // or shuffle last Faster-RCNN's matmul weights.
+            bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
+                                     getLayerAttr(layer, "loc_pred_transposed").b();
+
            layerParams.set("bias_term", false);
            layerParams.blobs.resize(1);

@ -970,6 +992,17 @@ void TFImporter::populateNet(Net dstNet)
                blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
                ExcludeLayer(net, weights_layer_index, 0, false);
                layers_to_ignore.insert(next_layers[0].first);
+
+                if (locPredTransposed)
+                {
+                    const int numWeights = layerParams.blobs[1].total();
+                    float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
+                    CV_Assert(numWeights % 4 == 0);
+                    for (int i = 0; i < numWeights; i += 2)
+                    {
+                        std::swap(biasData[i], biasData[i + 1]);
+                    }
+                }
            }

            int kernel_blob_index = -1;
@ -983,6 +1016,16 @@ void TFImporter::populateNet(Net dstNet)
            }

            layerParams.set("num_output", layerParams.blobs[0].size[0]);
+            if (locPredTransposed)
+            {
+                CV_Assert(layerParams.blobs[0].dims == 2);
+                for (int i = 0; i < layerParams.blobs[0].size[0]; i += 2)
+                {
+                    cv::Mat src = layerParams.blobs[0].row(i);
+                    cv::Mat dst = layerParams.blobs[0].row(i + 1);
+                    std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+                }
+            }

            int id = dstNet.addLayer(name, "InnerProduct", layerParams);
            layer_id[name] = id;
@ -1010,6 +1053,7 @@ void TFImporter::populateNet(Net dstNet)
                layer_id[permName] = permId;
                connect(layer_id, dstNet, inpId, permId, 0);
                inpId = Pin(permName);
+                inpLayout = DATA_LAYOUT_NCHW;
            }
            else if (newShape.total() == 4 && inpLayout == DATA_LAYOUT_NHWC)
            {
@ -1024,7 +1068,7 @@ void TFImporter::populateNet(Net dstNet)

            // one input only
            connect(layer_id, dstNet, inpId, id, 0);
-            data_layouts[name] = newShape.total() == 2 ? DATA_LAYOUT_PLANAR : DATA_LAYOUT_UNKNOWN;
+            data_layouts[name] = newShape.total() == 2 ? DATA_LAYOUT_PLANAR : inpLayout;
        }
        else if (type == "Flatten" || type == "Squeeze")
        {
@ -1696,41 +1740,6 @@ void TFImporter::populateNet(Net dstNet)
            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
            data_layouts[name] = DATA_LAYOUT_UNKNOWN;
        }
-        else if (type == "DetectionOutput")
-        {
-            // op: "DetectionOutput"
-            // input_0: "locations"
-            // input_1: "classifications"
-            // input_2: "prior_boxes"
-            if (hasLayerAttr(layer, "num_classes"))
-                layerParams.set("num_classes", getLayerAttr(layer, "num_classes").i());
-            if (hasLayerAttr(layer, "share_location"))
-                layerParams.set("share_location", getLayerAttr(layer, "share_location").b());
-            if (hasLayerAttr(layer, "background_label_id"))
-                layerParams.set("background_label_id", getLayerAttr(layer, "background_label_id").i());
-            if (hasLayerAttr(layer, "nms_threshold"))
-                layerParams.set("nms_threshold", getLayerAttr(layer, "nms_threshold").f());
-            if (hasLayerAttr(layer, "top_k"))
-                layerParams.set("top_k", getLayerAttr(layer, "top_k").i());
-            if (hasLayerAttr(layer, "code_type"))
-                layerParams.set("code_type", getLayerAttr(layer, "code_type").s());
-            if (hasLayerAttr(layer, "keep_top_k"))
-                layerParams.set("keep_top_k", getLayerAttr(layer, "keep_top_k").i());
-            if (hasLayerAttr(layer, "confidence_threshold"))
-                layerParams.set("confidence_threshold", getLayerAttr(layer, "confidence_threshold").f());
-            if (hasLayerAttr(layer, "loc_pred_transposed"))
-                layerParams.set("loc_pred_transposed", getLayerAttr(layer, "loc_pred_transposed").b());
-            if (hasLayerAttr(layer, "clip"))
-                layerParams.set("clip", getLayerAttr(layer, "clip").b());
-            if (hasLayerAttr(layer, "variance_encoded_in_target"))
-                layerParams.set("variance_encoded_in_target", getLayerAttr(layer, "variance_encoded_in_target").b());
-
-            int id = dstNet.addLayer(name, "DetectionOutput", layerParams);
-            layer_id[name] = id;
-            for (int i = 0; i < 3; ++i)
-                connect(layer_id, dstNet, parsePin(layer.input(i)), id, i);
-            data_layouts[name] = DATA_LAYOUT_UNKNOWN;
-        }
        else if (type == "Softmax")
        {
            if (hasLayerAttr(layer, "axis"))
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -165,12 +165,6 @@ TEST_P(Test_TensorFlow_layers, batch_norm)
    runTensorFlowNet("unfused_batch_norm");
    runTensorFlowNet("fused_batch_norm_no_gamma");
    runTensorFlowNet("unfused_batch_norm_no_gamma");
-}
-
-TEST_P(Test_TensorFlow_layers, mvn_batch_norm)
-{
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE)
-        throw SkipTestException("");
    runTensorFlowNet("mvn_batch_norm");
    runTensorFlowNet("mvn_batch_norm_1x1");
 }
@ -323,7 +317,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
 {
    checkBackend();
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE ||
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
        throw SkipTestException("");

@ -343,6 +337,26 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
    normAssertDetections(ref, out, "", 0.3);
 }

+TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)  // Runs the ssd_mobilenet_v1_ppn_coco TensorFlow model on one image and compares its detections with a stored reference.
+{
+    checkBackend();
+    std::string proto = findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pbtxt", false);
+    std::string model = findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pb", false);
+
+    Net net = readNetFromTensorflow(model, proto);
+    Mat img = imread(findDataFile("dnn/dog416.png", false));
+    Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_ppn_coco.detection_out.npy", false));  // reference detections to compare against
+    Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false);  // 300x300 input; NOTE(review): presumably maps pixels to [-1, 1] with swapRB=true, crop=false - confirm against blobFromImage docs
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    net.setInput(blob);
+    Mat out = net.forward();
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.006 : default_l1;  // FP16 and Myriad targets get their own score tolerance
+    normAssertDetections(ref, out, "", 0.4, scoreDiff, default_lInf);
+}
+
 TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
 {
    checkBackend();
--- a/modules/features2d/doc/read_file_nondiff32.pl
+++ b/modules/features2d/doc/read_file_nondiff32.pl
@ -131,7 +131,7 @@ my $success_structured;
                  }
                  close $in2 or die "Can't close $filein: $!";
                }
-                #find next else and interprete it
+                #find next else and interpret it
                open(my $in3,  "<",  $filein)  or die "Can't open $filein: $!";
        $i3=1;
        $ifcount3=0;
--- a/modules/features2d/doc/read_file_score32.pl
+++ b/modules/features2d/doc/read_file_score32.pl
@ -119,7 +119,7 @@ my $is_a_corner;
                  }
                  close $in2 or die "Can't close $filein: $!";
                }
-                #find next else and interprete it
+                #find next else and interpret it
                open(my $in3,  "<",  $filein)  or die "Can't open $filein: $!";
        $i3=1;
        $ifcount3=0;
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -1861,7 +1861,7 @@ gradient term \f$G\f$ and the second gradient term \f$b\f$ gives:
 The algorithm sets the center of the neighborhood window at this new center \f$q\f$ and then iterates
 until the center stays within a set threshold.

-@param image Input image.
+@param image Input single-channel, 8-bit or float image.
@param corners Initial coordinates of the input corners and refined coordinates provided for
 output.
@param winSize Half of the side length of the search window. For example, if winSize=Size(5,5) ,
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@ -2048,7 +2048,7 @@ public:
            svmType == NU_SVC ? "NU_SVC" :
            svmType == ONE_CLASS ? "ONE_CLASS" :
            svmType == EPS_SVR ? "EPS_SVR" :
-            svmType == NU_SVR ? "NU_SVR" : format("Uknown_%d", svmType);
+            svmType == NU_SVR ? "NU_SVR" : format("Unknown_%d", svmType);
        String kernel_type_str =
            kernelType == LINEAR ? "LINEAR" :
            kernelType == POLY ? "POLY" :
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@ -255,8 +255,8 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
    Mat_<float> _lut(1, 256);
    const float* const lut = &_lut(0,0);
 #if CV_SSE2
-    const int indeces[] = { 0, 1, 2, 3 };
-    __m128i idx = _mm_loadu_si128((const __m128i*)indeces);
+    const int indices[] = { 0, 1, 2, 3 };
+    __m128i idx = _mm_loadu_si128((const __m128i*)indices);
    __m128i ifour = _mm_set1_epi32(4);

    float* const _data = &_lut(0, 0);
@ -273,8 +273,8 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
            idx = _mm_add_epi32(idx, ifour);
        }
 #elif CV_NEON
-    const int indeces[] = { 0, 1, 2, 3 };
-    uint32x4_t idx = *(uint32x4_t*)indeces;
+    const int indices[] = { 0, 1, 2, 3 };
+    uint32x4_t idx = *(uint32x4_t*)indices;
    uint32x4_t ifour = vdupq_n_u32(4);

    float* const _data = &_lut(0, 0);
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@ -7,7 +7,6 @@

 #include "precomp.hpp"
 #include "opencv2/objdetect.hpp"
-// #include "opencv2/calib3d.hpp"

 #include <limits>
 #include <cmath>
@ -21,7 +20,6 @@ class QRDecode
 {
 public:
    void init(Mat src, double eps_vertical_ = 0.2, double eps_horizontal_ = 0.1);
-    void binarization();
    bool localization();
    bool transformation();
    Mat getBinBarcode() { return bin_barcode; }
@ -35,9 +33,7 @@ protected:
    Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2);
    vector<Point2f> getQuadrilateral(vector<Point2f> angle_list);
    bool testBypassRoute(vector<Point2f> hull, int start, int finish);
-    double getTriangleArea(Point2f a, Point2f b, Point2f c);
-    double getPolygonArea(vector<Point2f> points);
-    double getCosVectors(Point2f a, Point2f b, Point2f c);
+    inline double getCosVectors(Point2f a, Point2f b, Point2f c);

    Mat barcode, bin_barcode, straight_barcode;
    vector<Point2f> localization_points, transformation_points;
@ -63,13 +59,7 @@ void QRDecode::init(Mat src, double eps_vertical_, double eps_horizontal_)
    }
    eps_vertical   = eps_vertical_;
    eps_horizontal = eps_horizontal_;
-}
-
-void QRDecode::binarization()
-{
-    Mat filter_barcode;
-    GaussianBlur(barcode, filter_barcode, Size(3, 3), 0);
-    threshold(filter_barcode, bin_barcode, 0, 255, THRESH_BINARY + THRESH_OTSU);
+    adaptiveThreshold(barcode, bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 71, 2);
 }

 vector<Vec3d> QRDecode::searchVerticalLines()
@ -139,7 +129,7 @@ vector<Point2f> QRDecode::separateHorizontalLines(vector<Vec3d> list_lines)

    for (size_t pnt = 0; pnt < list_lines.size(); pnt++)
    {
-        int x = static_cast<int>(list_lines[pnt][0] + list_lines[pnt][2] / 2);
+        int x = static_cast<int>(list_lines[pnt][0] + list_lines[pnt][2] * 0.5);
        int y = static_cast<int>(list_lines[pnt][1]);

        // --------------- Search horizontal up-lines --------------- //
@ -203,7 +193,7 @@ vector<Point2f> QRDecode::separateHorizontalLines(vector<Vec3d> list_lines)
    {
        point2f_result.push_back(
              Point2f(static_cast<float>(result[i][1]),
-                      static_cast<float>(result[i][0] + result[i][2] / 2)));
+                      static_cast<float>(result[i][0] + result[i][2] * 0.5)));
    }
    return point2f_result;
 }
@ -345,16 +335,23 @@ bool QRDecode::computeTransformationPoints()
            }
        }
    }
+
    if (down_left_edge_point == Point2f(0, 0) ||
-        up_right_edge_point  == Point2f(0, 0)) { return false; }
+        up_right_edge_point  == Point2f(0, 0) ||
+        new_non_zero_elem[0].size() == 0) { return false; }

    double max_area = -1;
    up_left_edge_point = new_non_zero_elem[0][0];
+
    for (size_t i = 0; i < new_non_zero_elem[0].size(); i++)
    {
-        double temp_area = getTriangleArea(new_non_zero_elem[0][i],
-                                           down_left_edge_point,
-                                           up_right_edge_point);
+        vector<Point2f> list_edge_points;
+        list_edge_points.push_back(new_non_zero_elem[0][i]);
+        list_edge_points.push_back(down_left_edge_point);
+        list_edge_points.push_back(up_right_edge_point);
+
+        double temp_area = fabs(contourArea(list_edge_points));
+
        if (max_area < temp_area)
        {
            up_left_edge_point = new_non_zero_elem[0][i];
@ -375,6 +372,7 @@ bool QRDecode::computeTransformationPoints()
        }
    }

+
    for (size_t i = 0; i < new_non_zero_elem[2].size(); i++)
    {
        double temp_norm_delta = norm(up_left_edge_point - new_non_zero_elem[2][i])
@ -485,7 +483,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
        hull[i] = Point2f(x, y);
    }

-    const double experimental_area = getPolygonArea(hull);
+    const double experimental_area = fabs(contourArea(hull));

    vector<Point2f> result_hull_point(angle_size);
    double min_norm;
@ -539,7 +537,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
        double temp_norm = getCosVectors(hull[index_hull], intrsc_line_hull, angle_closest_pnt);
        if (min_norm > temp_norm &&
            norm(hull[index_hull] - hull[next_index_hull]) >
-            norm(angle_list[1] - angle_list[2]) / 10)
+            norm(angle_list[1] - angle_list[2]) * 0.1)
        {
            min_norm = temp_norm;
            result_side_begin[0] = hull[index_hull];
@ -577,7 +575,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
        double temp_norm = getCosVectors(hull[index_hull], intrsc_line_hull, angle_closest_pnt);
        if (min_norm > temp_norm &&
            norm(hull[index_hull] - hull[next_index_hull]) >
-            norm(angle_list[0] - angle_list[1]) / 20)
+            norm(angle_list[0] - angle_list[1]) * 0.05)
        {
            min_norm = temp_norm;
            result_side_begin[1] = hull[index_hull];
@ -611,7 +609,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
        if (next_index_hull == hull_size) { next_index_hull = 0; }
        if (next_index_hull == -1) { next_index_hull = hull_size - 1; }

-        if (norm(hull[index_hull] - hull[next_index_hull]) < standart_norm / 10.0)
+        if (norm(hull[index_hull] - hull[next_index_hull]) < standart_norm * 0.1)
        { index_hull = next_index_hull; continue; }

        extra_index_hull = finish_line[1];
@ -623,7 +621,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
            if (extra_next_index_hull == hull_size) { extra_next_index_hull = 0; }
            if (extra_next_index_hull == -1) { extra_next_index_hull = hull_size - 1; }

-            if (norm(hull[extra_index_hull] - hull[extra_next_index_hull]) < standart_norm / 10.0)
+            if (norm(hull[extra_index_hull] - hull[extra_next_index_hull]) < standart_norm * 0.1)
            { extra_index_hull = extra_next_index_hull; continue; }

            test_result_angle_list[0]
@ -639,7 +637,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
            = intersectionLines(hull[index_hull], hull[next_index_hull],
                                result_side_begin[0], result_side_end[0]);

-            test_diff_area = fabs(getPolygonArea(test_result_angle_list) - experimental_area);
+            test_diff_area = fabs(fabs(contourArea(test_result_angle_list)) - experimental_area);
            if (min_diff_area > test_diff_area)
            {
                min_diff_area = test_diff_area;
@ -656,53 +654,22 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
        index_hull = next_index_hull;
    }
    while(index_hull != unstable_pnt);
+
+    if (norm(result_angle_list[0] - angle_list[1]) > 2) { result_angle_list[0] = angle_list[1]; }
+    if (norm(result_angle_list[1] - angle_list[0]) > 2) { result_angle_list[1] = angle_list[0]; }
+    if (norm(result_angle_list[3] - angle_list[2]) > 2) { result_angle_list[3] = angle_list[2]; }
+
    return result_angle_list;
 }

-//          b
-//         / |
-//        /  |
-//       /   |
-//      /  S |
-//     /     |
-//   a ----- c
-
-double QRDecode::getTriangleArea(Point2f a, Point2f b, Point2f c)
-{
-    double norm_sides[] = { norm(a - b), norm(b - c), norm(c - a) };
-    double half_perimeter = (norm_sides[0] + norm_sides[1] + norm_sides[2]) / 2.0;
-    double triangle_area = sqrt(half_perimeter *
-                               (half_perimeter - norm_sides[0]) *
-                               (half_perimeter - norm_sides[1]) *
-                               (half_perimeter - norm_sides[2]));
-    return triangle_area;
-}
-
-double QRDecode::getPolygonArea(vector<Point2f> points)
-{
-    CV_Assert(points.size() >= 3);
-    if (points.size() == 3)
-    { return getTriangleArea(points[0], points[1], points[2]); }
-    else
-    {
-        double result_area = 0.0;
-        for (size_t i = 1; i < points.size() - 1; i++)
-        {
-            result_area += getTriangleArea(points[0], points[i], points[i + 1]);
-        }
-        return result_area;
-    }
-}
-
 //      / | b
 //     /  |
 //    /   |
 //  a/    | c

-double QRDecode::getCosVectors(Point2f a, Point2f b, Point2f c)
+inline double QRDecode::getCosVectors(Point2f a, Point2f b, Point2f c)
 {
-    return ((a - b).x * (c - b).x + (a - b).y * (c - b).y)
-            / (norm(a - b) * norm(c - b));
+    return ((a - b).x * (c - b).x + (a - b).y * (c - b).y) / (norm(a - b) * norm(c - b));
 }

 bool QRDecode::transformation()
@ -764,7 +731,6 @@ bool QRCodeDetector::detect(InputArray in, OutputArray points) const
    CV_Assert(inarr.type() == CV_8UC1);
    QRDecode qrdec;
    qrdec.init(inarr, p->epsX, p->epsY);
-    qrdec.binarization();
    if (!qrdec.localization()) { return false; }
    if (!qrdec.transformation()) { return false; }
    vector<Point2f> pnts2f = qrdec.getTransformationPoints();
--- a/modules/photo/src/contrast_preserve.hpp
+++ b/modules/photo/src/contrast_preserve.hpp
@ -159,12 +159,12 @@ void Decolor::gradvector(const Mat &img, vector <double> &grad) const

    for(int i=0;i<height;i++)
        for(int j=0;j<width;j++)
-            grad[i*height + j] = d_trans.at<float>(i, j);
+            grad[i*width + j] = d_trans.at<float>(i, j);

    const int offset = width * height;
    for(int i=0;i<height;i++)
        for(int j=0;j<width;j++)
-            grad[offset + i * height + j] = d1_trans.at<float>(i, j);
+            grad[offset + i * width + j] = d1_trans.at<float>(i, j);
 }

 void Decolor::colorGrad(const Mat &img, vector <double> &Cg) const
@ -204,14 +204,19 @@ void Decolor::add_to_vector_poly(vector < vector <double> > &polyGrad, const vec
    idx1++;
 }

-void Decolor::weak_order(const Mat &img, vector <double> &alf) const
+void Decolor::weak_order(const Mat &im, vector <double> &alf) const
 {
-    const int h = img.size().height;
-    const int w = img.size().width;
+    Mat img;
+    const int h = im.size().height;
+    const int w = im.size().width;
    if((h + w) > 800)
    {
        const double sizefactor = double(800)/(h+w);
-        resize(img, img, Size(cvRound(h*sizefactor), cvRound(w*sizefactor)));
+        resize(im, img, Size(cvRound(w*sizefactor), cvRound(h*sizefactor)));
+    }
+    else
+    {
+        img = im;
    }

    Mat curIm = Mat(img.size(),CV_32FC1);
@ -246,16 +251,20 @@ void Decolor::weak_order(const Mat &img, vector <double> &alf) const
        alf[i] -= tmp1[i] * tmp2[i] * tmp3[i];
 }

-void Decolor::grad_system(const Mat &img, vector < vector < double > > &polyGrad,
+void Decolor::grad_system(const Mat &im, vector < vector < double > > &polyGrad,
        vector < double > &Cg, vector <Vec3i>& comb) const
 {
-    int h = img.size().height;
-    int w = img.size().width;
-
+    Mat img;
+    int h = im.size().height;
+    int w = im.size().width;
    if((h + w) > 800)
    {
        const double sizefactor = double(800)/(h+w);
-        resize(img, img, Size(cvRound(h*sizefactor), cvRound(w*sizefactor)));
+        resize(im, img, Size(cvRound(w*sizefactor), cvRound(h*sizefactor)));
+    }
+    else
+    {
+        img = im;
    }

    h = img.size().height;
--- a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
@ -137,6 +137,21 @@ private:
    Ptr<Feature2D> surf;
 };

+
+/** @brief SIFT features finder.
+
+@sa detail::FeaturesFinder, SIFT
+*/
+class CV_EXPORTS SiftFeaturesFinder : public FeaturesFinder
+{
+public:
+    SiftFeaturesFinder();  // raises Error::StsNotImplemented when SIFT cannot be created
+
+private:
+    void find(InputArray image, ImageFeatures &features) CV_OVERRIDE;  // detects keypoints and computes descriptors on a grayscale version of the image
+    Ptr<Feature2D> sift;  // feature extractor assigned by the constructor
+};
+
 /** @brief ORB features finder. :

@sa detail::FeaturesFinder, ORB
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@ -51,6 +51,7 @@ using namespace cv::cuda;
 #ifdef HAVE_OPENCV_XFEATURES2D
 #include "opencv2/xfeatures2d.hpp"
 using xfeatures2d::SURF;
+using xfeatures2d::SIFT;
 #endif

 #ifdef HAVE_OPENCV_CUDAIMGPROC
@ -475,6 +476,35 @@ void SurfFeaturesFinder::find(InputArray image, ImageFeatures &features)
    }
 }

+SiftFeaturesFinder::SiftFeaturesFinder()  // Creates the SIFT instance stored in `sift`; reports an error if SIFT is unavailable.
+{
+#ifdef HAVE_OPENCV_XFEATURES2D
+    Ptr<SIFT> sift_ = SIFT::create();
+    if( !sift_ )  // creation returned an empty pointer
+        CV_Error( Error::StsNotImplemented, "OpenCV was built without SIFT support" );
+    sift = sift_;
+#else
+    CV_Error( Error::StsNotImplemented, "OpenCV was built without SIFT support" );  // HAVE_OPENCV_XFEATURES2D is not defined in this build
+#endif
+}
+
+void SiftFeaturesFinder::find(InputArray image, ImageFeatures &features)  // Fills features.keypoints and features.descriptors for one image.
+{
+    UMat gray_image;
+    CV_Assert((image.type() == CV_8UC3) || (image.type() == CV_8UC1));  // only 8-bit 3-channel or 1-channel input is accepted
+    if(image.type() == CV_8UC3)
+    {
+        cvtColor(image, gray_image, COLOR_BGR2GRAY);  // 3-channel input is converted as BGR
+    }
+    else
+    {
+        gray_image = image.getUMat();  // already single-channel, used as is
+    }
+    UMat descriptors;
+    sift->detectAndCompute(gray_image, Mat(), features.keypoints, descriptors);  // empty Mat() given as the second (mask) argument
+    features.descriptors = descriptors.reshape(1, (int)features.keypoints.size());  // reshape(cn = 1, rows = number of keypoints)
+}
+
 OrbFeaturesFinder::OrbFeaturesFinder(Size _grid_size, int n_features, float scaleFactor, int nlevels)
 {
    grid_size = _grid_size;
--- a/modules/ts/include/opencv2/ts/ts_gtest.h
+++ b/modules/ts/include/opencv2/ts/ts_gtest.h
@ -9013,7 +9013,7 @@ class NativeArray {

 // Implements Boolean test assertions such as EXPECT_TRUE. expression can be
 // either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
+// representation of expression as it was passed into the EXPECT_TRUE.
 #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
  if (const ::testing::AssertionResult gtest_ar_ = \
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@ -613,10 +613,12 @@ int GStreamerCapture::getCaptureDomain() { return CAP_GSTREAMER; }
 */
 bool GStreamerCapture::open(int id)
 {
+    gst_initializer::init();
+
    if (!is_gst_element_exists("v4l2src"))
        return false;
    std::ostringstream desc;
-    desc << "v4l2src device-name=/dev/video" << id
+    desc << "v4l2src device=/dev/video" << id
             << " ! " << COLOR_ELEM
             << " ! appsink";
    return open(desc.str());
--- a/modules/videoio/src/cap_mjpeg_decoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_decoder.cpp
@ -146,6 +146,9 @@ bool MotionJpegCapture::grabFrame()
        }
        else
        {
+            if (m_frame_iterator == m_mjpeg_frames.end())
+                return false;
+
            ++m_frame_iterator;
        }
    }
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@ -431,6 +431,7 @@ static int autosetup_capture_mode_v4l2(CvCaptureCAM_V4L* capture) {
            V4L2_PIX_FMT_BGR24,
            V4L2_PIX_FMT_RGB24,
            V4L2_PIX_FMT_YVU420,
+            V4L2_PIX_FMT_YUV420,
            V4L2_PIX_FMT_YUV411P,
            V4L2_PIX_FMT_YUYV,
            V4L2_PIX_FMT_UYVY,
@ -532,6 +533,7 @@ static int v4l2_set_fps(CvCaptureCAM_V4L* capture) {
 static int v4l2_num_channels(__u32 palette) {
    switch(palette) {
    case V4L2_PIX_FMT_YVU420:
+    case V4L2_PIX_FMT_YUV420:
    case V4L2_PIX_FMT_MJPEG:
    case V4L2_PIX_FMT_JPEG:
    case V4L2_PIX_FMT_Y16:
@ -562,6 +564,7 @@ static void v4l2_create_frame(CvCaptureCAM_V4L *capture) {
            size = CvSize(capture->buffers[capture->bufferIndex].length, 1);
            break;
        case V4L2_PIX_FMT_YVU420:
+        case V4L2_PIX_FMT_YUV420:
            size.height = size.height * 3 / 2; // "1.5" channels
            break;
        case V4L2_PIX_FMT_Y16:
@ -1021,10 +1024,10 @@ move_411_block(int yTL, int yTR, int yBL, int yBR, int u, int v,

 /* Converts from planar YUV420P to RGB24. */
 static inline void
-yuv420p_to_rgb24(int width, int height, uchar* src, uchar* dst)
+yuv420p_to_rgb24(int width, int height, uchar* src, uchar* dst, bool isYUV)
 {
    cvtColor(Mat(height * 3 / 2, width, CV_8U, src), Mat(height, width, CV_8UC3, dst),
-            COLOR_YUV2BGR_YV12);
+            isYUV ? COLOR_YUV2BGR_IYUV : COLOR_YUV2BGR_YV12);
 }

 // Consider a YUV411P image of 8x2 pixels.
@ -1490,10 +1493,12 @@ static IplImage* icvRetrieveFrameCAM_V4L( CvCaptureCAM_V4L* capture, int) {
        break;

    case V4L2_PIX_FMT_YVU420:
+    case V4L2_PIX_FMT_YUV420:
        yuv420p_to_rgb24(capture->form.fmt.pix.width,
                capture->form.fmt.pix.height,
                (unsigned char*)(capture->buffers[capture->bufferIndex].start),
-                (unsigned char*)capture->frame.imageData);
+                (unsigned char*)capture->frame.imageData,
+                capture->palette == V4L2_PIX_FMT_YUV420);
        break;

    case V4L2_PIX_FMT_YUV411P:
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@ -70,9 +70,7 @@ endif()

 ocv_install_example_src("." CMakeLists.txt)
 if(INSTALL_C_EXAMPLES)
-  install(DIRECTORY data
-          DESTINATION "${OPENCV_SAMPLES_SRC_INSTALL_PATH}/data"
-        COMPONENT samples_data)
+  install(DIRECTORY data DESTINATION "${OPENCV_SAMPLES_SRC_INSTALL_PATH}" COMPONENT samples_data)
 endif()

 else()
--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@ -82,7 +82,7 @@ static void printUsage()
        "\nMotion Estimation Flags:\n"
        "  --work_megapix <float>\n"
        "      Resolution for image registration step. The default is 0.6 Mpx.\n"
-        "  --features (surf|orb)\n"
+        "  --features (surf|orb|sift)\n"
        "      Type of features used for images matching. The default is surf.\n"
        "  --matcher (homography|affine)\n"
        "      Matcher used for pairwise image matching.\n"
@ -430,6 +430,9 @@ int main(int argc, char* argv[])
    {
        finder = makePtr<OrbFeaturesFinder>();
    }
+    else if (features_type == "sift") {
+        finder = makePtr<SiftFeaturesFinder>();
+    }
    else
    {
        cout << "Unknown 2D features type: '" << features_type << "'.\n";
--- a/samples/cpp/train_HOG.cpp
+++ b/samples/cpp/train_HOG.cpp
@ -204,7 +204,7 @@ int main( int argc, char** argv )
    const char* keys =
    {
        "{help h|     | show help message}"
-        "{pd    |     | path of directory contains possitive images}"
+        "{pd    |     | path of directory contains positive images}"
        "{nd    |     | path of directory contains negative images}"
        "{td    |     | path of directory contains test images}"
        "{tv    |     | test video file name}"
--- a/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
@ -1,6 +1,6 @@
 /**
 * @file introduction_to_pca.cpp
- * @brief This program demonstrates how to use OpenCV PCA to extract the orienation of an object
+ * @brief This program demonstrates how to use OpenCV PCA to extract the orientation of an object
 * @author OpenCV team
 */

--- a/samples/cpp/warpPerspective_demo.cpp
+++ b/samples/cpp/warpPerspective_demo.cpp
@ -26,7 +26,7 @@ static void help(char** argv)
         "\tESC, q - quit the program\n"
         "\tr - change order of points to rotate transformation\n"
         "\tc - delete selected points\n"
-         "\ti - change order of points to invers transformation \n"
+         "\ti - change order of points to inverse transformation \n"
         "\nUse your mouse to select a point and move it to see transformation changes" << endl;
 }

--- a/samples/dnn/CMakeLists.txt
+++ b/samples/dnn/CMakeLists.txt
@ -13,32 +13,6 @@ if(NOT BUILD_EXAMPLES OR NOT OCV_DEPENDENCIES_FOUND)
  return()
 endif()

-function(download_net name commit hash)
-  set(DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR "${CMAKE_CURRENT_LIST_DIR}/face_detector")
-  if(COMMAND ocv_download)
-    ocv_download(FILENAME ${name}
-               HASH ${hash}
-               URL
-                 "$ENV{OPENCV_DNN_MODELS_URL}"
-                 "${OPENCV_DNN_MODELS_URL}"
-                 "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${commit}/"
-               DESTINATION_DIR ${DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR}
-               ID DNN_FACE_DETECTOR
-               RELATIVE_URL
-               STATUS res)
-  endif()
-endfunction()
-
-# Model branch name: dnn_samples_face_detector_20180205_fp16
-download_net("res10_300x300_ssd_iter_140000_fp16.caffemodel"
-             "19512576c112aa2c7b6328cb0e8d589a4a90a26d"
-             "f737f886e33835410c69e3ccfe0720a1")
-
-# Model branch name: dnn_samples_face_detector_20180220_uint8
-download_net("opencv_face_detector_uint8.pb"
-             "7b425df276ba2161b8edaab0f0756f4a735d61b9"
-             "56acf81f55d9b9e96c3347bc65409b9e")
-
 project(dnn_samples)
 ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
 file(GLOB_RECURSE dnn_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
--- a/samples/dnn/custom_layers.hpp
+++ b/samples/dnn/custom_layers.hpp
@ -198,7 +198,7 @@ private:
 //! [ResizeBilinearLayer]

 //
-// The folowing code is used only to generate tutorials documentation.
+// The following code is used only to generate tutorials documentation.
 //

 //! [A custom layer interface]
--- a/samples/dnn/face_detector/download_weights.py
+++ b/samples/dnn/face_detector/download_weights.py
@ -0,0 +1,91 @@
+#!/usr/bin/env python
+"""Download the files listed in a Metalink document and verify their SHA-1 hashes."""
+
+from __future__ import print_function
+import hashlib
+import time
+import sys
+import xml.etree.ElementTree as ET
+if sys.version_info[0] < 3:
+    from urllib2 import urlopen
+else:
+    from urllib.request import urlopen
+
+class HashMismatchException(Exception):
+    """Raised when the SHA-1 digest of a file differs from the expected one."""
+    def __init__(self, expected, actual):
+        Exception.__init__(self)
+        self.expected = expected
+        self.actual = actual
+    def __str__(self):
+        return 'Hash mismatch: {} vs {}'.format(self.expected, self.actual)
+
+class MetalinkDownloader(object):
+    """Downloads and verifies the files described by a Metalink document."""
+    BUFSIZE = 10*1024*1024  # maximum number of bytes requested per read
+    NS = {'ml': 'urn:ietf:params:xml:ns:metalink'}
+    tick = 0  # time.time() of the last progress output
+
+    def download(self, metalink_file):
+        """Ensure every file listed in metalink_file exists with the expected hash.
+
+        A file is downloaded only when verifying the local copy fails.
+        Returns True if all files verified successfully, False otherwise.
+        """
+        status = True
+        for file_elem in ET.parse(metalink_file).getroot().findall('ml:file', self.NS):
+            url = file_elem.find('ml:url', self.NS).text
+            fname = file_elem.attrib['name']
+            hash_sum = file_elem.find('ml:hash', self.NS).text
+            print('*** {}'.format(fname))
+            try:
+                self.verify(hash_sum, fname)
+            except Exception as ex:
+                # The local copy is missing or invalid: report why and fetch it.
+                print('  {}'.format(ex))
+                try:
+                    print('  {}'.format(url))
+                    response = urlopen(url)
+                    try:
+                        with open(fname, 'wb') as file_stream:
+                            self.buffered_read(response, file_stream.write)
+                    finally:
+                        # Close the connection even when the transfer fails.
+                        response.close()
+                    self.verify(hash_sum, fname)
+                except Exception as ex:
+                    print('  {}'.format(ex))
+                    print('  FAILURE')
+                    status = False
+                    continue
+            print('  SUCCESS')
+        return status
+
+    def print_progress(self, msg, timeout = 0):
+        """Print msg (no newline) if more than timeout seconds passed since the last print."""
+        if time.time() - self.tick > timeout:
+            print(msg, end='')
+            sys.stdout.flush()
+            self.tick = time.time()
+
+    def buffered_read(self, in_stream, processing):
+        """Read in_stream until it is exhausted, passing each chunk to processing."""
+        self.print_progress('  >')
+        while True:
+            buf = in_stream.read(self.BUFSIZE)
+            if not buf:
+                break
+            processing(buf)
+            self.print_progress('>', 5)
+        print(' done')
+
+    def verify(self, hash_sum, fname):
+        """Raise HashMismatchException if the SHA-1 hex digest of fname is not hash_sum."""
+        sha = hashlib.sha1()
+        with open(fname, 'rb') as file_stream:
+            self.buffered_read(file_stream, sha.update)
+        if hash_sum != sha.hexdigest():
+            raise HashMismatchException(hash_sum, sha.hexdigest())
+
+if __name__ == '__main__':
+    sys.exit(0 if MetalinkDownloader().download('weights.meta4') else 1)
--- a/samples/dnn/face_detector/weights.meta4
+++ b/samples/dnn/face_detector/weights.meta4
@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<metalink xmlns="urn:ietf:params:xml:ns:metalink">
+    <file name="res10_300x300_ssd_iter_140000_fp16.caffemodel">
+        <identity>OpenCV face detector FP16 weights</identity>
+        <hash type="sha-1">31fc22bfdd907567a04bb45b7cfad29966caddc1</hash>
+        <url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel</url>
+    </file>
+    <file name="opencv_face_detector_uint8.pb">
+        <identity>OpenCV face detector UINT8 weights</identity>
+        <hash type="sha-1">4f2fdf6f231d759d7bbdb94353c5a68690f3d2ae</hash>
+        <url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180220_uint8/opencv_face_detector_uint8.pb</url>
+    </file>
+</metalink>
--- a/samples/dnn/tf_text_graph_common.py
+++ b/samples/dnn/tf_text_graph_common.py
@ -0,0 +1,26 @@
+import tensorflow as tf
+from tensorflow.core.framework.node_def_pb2 import NodeDef
+from google.protobuf import text_format
+
+def tensorMsg(values):
+    """Build the text-format 'tensor { ... }' message for a flat list of values.
+
+    The values must be all float (emitted as DT_FLOAT / float_val) or all int
+    (emitted as DT_INT32 / int_val); anything else raises an Exception.
+    """
+    candidates = ((float, 'DT_FLOAT', 'float_val'), (int, 'DT_INT32', 'int_val'))
+    for pyType, dtype, field in candidates:
+        if all(isinstance(v, pyType) for v in values):
+            break
+    else:
+        raise Exception('Wrong values types')
+
+    header = 'tensor { dtype: ' + dtype + ' tensor_shape { dim { size: %d } }' % len(values)
+    entries = ''.join('%s: %s ' % (field, str(v)) for v in values)
+    return header + entries + '}'
+
+def addConstNode(name, values, graph_def):
+    """Append a 'Const' node called name, whose value is built from values, to graph_def."""
+    constNode = NodeDef(name=name, op='Const')
+    text_format.Merge(tensorMsg(values), constNode.attr["value"])
+    graph_def.node.extend([constNode])
--- a/samples/dnn/tf_text_graph_faster_rcnn.py
+++ b/samples/dnn/tf_text_graph_faster_rcnn.py
@ -6,6 +6,8 @@ from tensorflow.core.framework.node_def_pb2 import NodeDef
 from tensorflow.tools.graph_transforms import TransformGraph
 from google.protobuf import text_format

+from tf_text_graph_common import tensorMsg, addConstNode
+
 parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
                                             'SSD model from TensorFlow Object Detection API. '
                                             'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
@ -93,21 +95,6 @@ while True:
    if node.op == 'CropAndResize':
        break

-def tensorMsg(values):
-    if all([isinstance(v, float) for v in values]):
-        dtype = 'DT_FLOAT'
-        field = 'float_val'
-    elif all([isinstance(v, int) for v in values]):
-        dtype = 'DT_INT32'
-        field = 'int_val'
-    else:
-        raise Exception('Wrong values types')
-
-    msg = 'tensor { dtype: ' + dtype + ' tensor_shape { dim { size: %d } }' % len(values)
-    for value in values:
-        msg += '%s: %s ' % (field, str(value))
-    return msg + '}'
-
 def addSlice(inp, out, begins, sizes):
    beginsNode = NodeDef()
    beginsNode.name = out + '/begins'
@ -151,17 +138,25 @@ def addSoftMax(inp, out):
    softmax.input.append(inp)
    graph_def.node.extend([softmax])

+def addFlatten(inp, out):
+    """Append a 'Flatten' node named out that takes node inp as its only input."""
+    # graph_def is not a parameter; it is the graph defined at script level.
+    flattenNode = NodeDef(name=out, op='Flatten')
+    flattenNode.input.append(inp)
+    graph_def.node.extend([flattenNode])
+
 addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
           'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2])

 addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
           'FirstStageBoxPredictor/ClassPredictor/softmax')  # Compare with Reshape_4

-flatten = NodeDef()
-flatten.name = 'FirstStageBoxPredictor/BoxEncodingPredictor/flatten'  # Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
-flatten.op = 'Flatten'
-flatten.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd')
-graph_def.node.extend([flatten])
+addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
+           'FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
+
+# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
+addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
+           'FirstStageBoxPredictor/BoxEncodingPredictor/flatten')

 proposals = NodeDef()
 proposals.name = 'proposals'  # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
@ -194,7 +189,7 @@ detectionOut.name = 'detection_out'
 detectionOut.op = 'DetectionOutput'

 detectionOut.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
-detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax')
+detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
 detectionOut.input.append('proposals')

 text_format.Merge('i: 2', detectionOut.attr['num_classes'])
@ -204,11 +199,21 @@ text_format.Merge('f: 0.7', detectionOut.attr['nms_threshold'])
 text_format.Merge('i: 6000', detectionOut.attr['top_k'])
 text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
 text_format.Merge('i: 100', detectionOut.attr['keep_top_k'])
-text_format.Merge('b: true', detectionOut.attr['clip'])
-text_format.Merge('b: true', detectionOut.attr['loc_pred_transposed'])
+text_format.Merge('b: false', detectionOut.attr['clip'])

 graph_def.node.extend([detectionOut])

+addConstNode('clip_by_value/lower', [0.0], graph_def)
+addConstNode('clip_by_value/upper', [1.0], graph_def)
+
+clipByValueNode = NodeDef()
+clipByValueNode.name = 'detection_out/clip_by_value'
+clipByValueNode.op = 'ClipByValue'
+clipByValueNode.input.append('detection_out')
+clipByValueNode.input.append('clip_by_value/lower')
+clipByValueNode.input.append('clip_by_value/upper')
+graph_def.node.extend([clipByValueNode])
+
 # Save as text.
 for node in reversed(topNodes):
    graph_def.node.extend([node])
@ -225,17 +230,13 @@ addReshape('SecondStageBoxPredictor/Reshape_1/slice',
 # Replace Flatten subgraph onto a single node.
 for i in reversed(range(len(graph_def.node))):
    if graph_def.node[i].op == 'CropAndResize':
-        graph_def.node[i].input.insert(1, 'detection_out')
+        graph_def.node[i].input.insert(1, 'detection_out/clip_by_value')

    if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
-        shapeNode = NodeDef()
-        shapeNode.name = 'SecondStageBoxPredictor/Reshape/shape2'
-        shapeNode.op = 'Const'
-        text_format.Merge(tensorMsg([1, -1, 4]), shapeNode.attr["value"])
-        graph_def.node.extend([shapeNode])
+        addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)

        graph_def.node[i].input.pop()
-        graph_def.node[i].input.append(shapeNode.name)
+        graph_def.node[i].input.append('SecondStageBoxPredictor/Reshape/shape2')

    if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
                                  'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
@ -246,12 +247,15 @@ for node in graph_def.node:
    if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
        node.op = 'Flatten'
        node.input.pop()
-        break
+
+    if node.name in ['FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
+                     'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
+        text_format.Merge('b: true', node.attr["loc_pred_transposed"])

 ################################################################################
 ### Postprocessing
 ################################################################################
-addSlice('detection_out', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4])
+addSlice('detection_out/clip_by_value', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4])

 variance = NodeDef()
 variance.name = 'proposals/variance'
@ -268,12 +272,13 @@ text_format.Merge('i: 2', varianceEncoder.attr["axis"])
 graph_def.node.extend([varianceEncoder])

 addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1])
+addFlatten('variance_encoded', 'variance_encoded/flatten')

 detectionOut = NodeDef()
 detectionOut.name = 'detection_out_final'
 detectionOut.op = 'DetectionOutput'

-detectionOut.input.append('variance_encoded')
+detectionOut.input.append('variance_encoded/flatten')
 detectionOut.input.append('SecondStageBoxPredictor/Reshape_1/Reshape')
 detectionOut.input.append('detection_out/slice/reshape')

@ -283,7 +288,6 @@ text_format.Merge('i: %d' % (args.num_classes + 1), detectionOut.attr['backgroun
 text_format.Merge('f: 0.6', detectionOut.attr['nms_threshold'])
 text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
 text_format.Merge('i: 100', detectionOut.attr['keep_top_k'])
-text_format.Merge('b: true', detectionOut.attr['loc_pred_transposed'])
 text_format.Merge('b: true', detectionOut.attr['clip'])
 text_format.Merge('b: true', detectionOut.attr['variance_encoded_in_target'])
 graph_def.node.extend([detectionOut])
--- a/samples/dnn/tf_text_graph_ssd.py
+++ b/samples/dnn/tf_text_graph_ssd.py
@ -15,6 +15,7 @@ from math import sqrt
 from tensorflow.core.framework.node_def_pb2 import NodeDef
 from tensorflow.tools.graph_transforms import TransformGraph
 from google.protobuf import text_format
+from tf_text_graph_common import tensorMsg, addConstNode

 parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
                                             'SSD model from TensorFlow Object Detection API. '
@ -29,6 +30,11 @@ parser.add_argument('--aspect_ratios', default=[1.0, 2.0, 0.5, 3.0, 0.333], type
                    help='Hyper-parameter of ssd_anchor_generator from config file.')
 parser.add_argument('--image_width', default=300, type=int, help='Training images width.')
 parser.add_argument('--image_height', default=300, type=int, help='Training images height.')
+parser.add_argument('--not_reduce_boxes_in_lowest_layer', default=False, action='store_true',
+                    help='A boolean to indicate whether the fixed 3 boxes per '
+                         'location is used in the lowest anchors generation layer.')
+parser.add_argument('--box_predictor', default='convolutional', type=str,
+                    choices=['convolutional', 'weight_shared_convolutional'])
 args = parser.parse_args()

 # Nodes that should be kept.
@ -160,28 +166,6 @@ graph_def.node[1].input.append(weights)
 # Create SSD postprocessing head ###############################################

 # Concatenate predictions of classes, predictions of bounding boxes and proposals.
-def tensorMsg(values):
-    if all([isinstance(v, float) for v in values]):
-        dtype = 'DT_FLOAT'
-        field = 'float_val'
-    elif all([isinstance(v, int) for v in values]):
-        dtype = 'DT_INT32'
-        field = 'int_val'
-    else:
-        raise Exception('Wrong values types')
-
-    msg = 'tensor { dtype: ' + dtype + ' tensor_shape { dim { size: %d } }' % len(values)
-    for value in values:
-        msg += '%s: %s ' % (field, str(value))
-    return msg + '}'
-
-def addConstNode(name, values):
-    node = NodeDef()
-    node.name = name
-    node.op = 'Const'
-    text_format.Merge(tensorMsg(values), node.attr["value"])
-    graph_def.node.extend([node])
-
 def addConcatNode(name, inputs, axisNodeName):
    concat = NodeDef()
    concat.name = name
@ -194,12 +178,18 @@ def addConcatNode(name, inputs, axisNodeName):
 addConstNode('concat/axis_flatten', [-1])
 addConstNode('PriorBox/concat/axis', [-2])

-for label in ['ClassPredictor', 'BoxEncodingPredictor']:
+for label in ['ClassPredictor', 'BoxEncodingPredictor' if args.box_predictor == 'convolutional' else 'BoxPredictor']:
    concatInputs = []
    for i in range(args.num_layers):
        # Flatten predictions
        flatten = NodeDef()
-        inpName = 'BoxPredictor_%d/%s/BiasAdd' % (i, label)
+        if args.box_predictor == 'convolutional':
+            inpName = 'BoxPredictor_%d/%s/BiasAdd' % (i, label)
+        else:
+            if i == 0:
+                inpName = 'WeightSharedConvolutionalBoxPredictor/%s/BiasAdd' % label
+            else:
+                inpName = 'WeightSharedConvolutionalBoxPredictor_%d/%s/BiasAdd' % (i, label)
        flatten.input.append(inpName)
        flatten.name = inpName + '/Flatten'
        flatten.op = 'Flatten'
@ -210,7 +200,9 @@ for label in ['ClassPredictor', 'BoxEncodingPredictor']:

 idx = 0
 for node in graph_def.node:
-    if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx):
+    if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx) or \
+       node.name == ('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/Conv2D' % idx) or \
+       node.name == 'WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D':
        text_format.Merge('b: true', node.attr["loc_pred_transposed"])
        idx += 1
 assert(idx == args.num_layers)
@ -224,13 +216,19 @@ for i in range(args.num_layers):
    priorBox = NodeDef()
    priorBox.name = 'PriorBox_%d' % i
    priorBox.op = 'PriorBox'
-    priorBox.input.append('BoxPredictor_%d/BoxEncodingPredictor/BiasAdd' % i)
+    if args.box_predictor == 'convolutional':
+        priorBox.input.append('BoxPredictor_%d/BoxEncodingPredictor/BiasAdd' % i)
+    else:
+        if i == 0:
+            priorBox.input.append('WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D')
+        else:
+            priorBox.input.append('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/BiasAdd' % i)
    priorBox.input.append(graph_def.node[0].name)  # image_tensor

    text_format.Merge('b: false', priorBox.attr["flip"])
    text_format.Merge('b: false', priorBox.attr["clip"])

-    if i == 0:
+    if i == 0 and not args.not_reduce_boxes_in_lowest_layer:
        widths = [0.1, args.min_scale * sqrt(2.0), args.min_scale * sqrt(0.5)]
        heights = [0.1, args.min_scale / sqrt(2.0), args.min_scale / sqrt(0.5)]
    else:
@ -261,7 +259,10 @@ detectionOut = NodeDef()
 detectionOut.name = 'detection_out'
 detectionOut.op = 'DetectionOutput'

-detectionOut.input.append('BoxEncodingPredictor/concat')
+if args.box_predictor == 'convolutional':
+    detectionOut.input.append('BoxEncodingPredictor/concat')
+else:
+    detectionOut.input.append('BoxPredictor/concat')
 detectionOut.input.append(sigmoid.name)
 detectionOut.input.append('PriorBox/concat')

--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Common/StandardStyles.xaml
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Common/StandardStyles.xaml
@ -1091,7 +1091,7 @@ Style x:Key="SkipBackAppBarButtonStyle" TargetType="ButtonBase" BasedOn="{Static
    </Style>
    <Style x:Key="PermissionsAppBarButtonStyle" TargetType="ButtonBase" BasedOn="{StaticResource AppBarButtonStyle}">
        <Setter Property="AutomationProperties.AutomationId" Value="PermissionsAppBarButton"/>
-        <Setter Property="AutomationProperties.Name" Value="Permisions"/>
+        <Setter Property="AutomationProperties.Name" Value="Permissions"/>
        <Setter Property="Content" Value="&#xE192;"/>
    </Style>
    <Style x:Key="HighlightAppBarButtonStyle" TargetType="ButtonBase" BasedOn="{StaticResource AppBarButtonStyle}">