Merge branch 'master' of https://github.com/opencv/opencv into interpMultichannelImg

Added assertions to remap and warpAffine functions. As @mshabunin said, remap and warpAffine functions do not support more than 4 channels in Bicubic and Lanczos4 interpolation modes. Assertions were added. Appropriate test was changed. resolves #8272
2025-07-24 14:06:27 +08:00 · 2017-03-24 23:32:44 +03:00 · 2017-03-24 23:32:44 +03:00 · 84a0a91d16
commit 84a0a91d16
parent c4ae5c0ee5 49e16a3c9f
107 changed files with 2570 additions and 868 deletions
--- a/3rdparty/openvx/hal/openvx_hal.cpp
+++ b/3rdparty/openvx/hal/openvx_hal.cpp
@ -11,6 +11,7 @@
 #include <cfloat>
 #include <climits>
 #include <cmath>
+#include <cstring>

 //==================================================================================================
 // utility
@ -600,7 +601,7 @@ int ovx_hal_sepFilterInit(cvhalFilter2D **filter_context, int src_type, int dst_
 {
    if (!filter_context || !kernelx_data || !kernely_data || delta != 0 ||
        src_type != CV_8UC1 || (dst_type != CV_8UC1 && dst_type != CV_16SC1) ||
-        kernelx_length % 2 == 0 || kernely_length % 2 == 0 || anchor_x != kernelx_length / 2 || anchor_y != kernely_length / 2)
+        kernelx_length != 3 || kernely_length != 3 || anchor_x != 1 || anchor_y != 1)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    ivx::border_t border;
@ -1076,7 +1077,7 @@ int ovx_hal_integral(int depth, int sdepth, int, const uchar * a, size_t astep,
            ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U32,
                ivx::Image::createAddressing(w, h, 4, (vx_int32)bstep), (unsigned int *)(b + bstep + sizeof(unsigned int)));
        ivx::IVX_CHECK_STATUS(vxuIntegralImage(ctx, ia, ib));
-        memset(b, 0, (w + 1) * sizeof(unsigned int));
+        std::memset(b, 0, (w + 1) * sizeof(unsigned int));
        b += bstep;
        for (int i = 0; i < h; i++, b += bstep)
        {
--- a/3rdparty/openvx/include/ivx.hpp
+++ b/3rdparty/openvx/include/ivx.hpp
@ -32,6 +32,12 @@ static const vx_enum VX_INTERPOLATION_NEAREST_NEIGHBOR = VX_INTERPOLATION_TYPE_N
 static const vx_enum VX_BORDER_CONSTANT = VX_BORDER_MODE_CONSTANT;
 static const vx_enum VX_BORDER_REPLICATE = VX_BORDER_MODE_REPLICATE;

+#else
+
+    #ifdef IVX_RENAMED_REFS
+        static const vx_enum VX_REF_ATTRIBUTE_TYPE = VX_REFERENCE_TYPE;
+    #endif
+
 #endif

 #ifndef IVX_USE_CXX98
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -91,6 +91,14 @@ if(POLICY CMP0042)
  cmake_policy(SET CMP0042 NEW)
 endif()

+if(POLICY CMP0051)
+  cmake_policy(SET CMP0051 NEW)
+endif()
+
+if(POLICY CMP0056)
+  cmake_policy(SET CMP0056 NEW)
+endif()
+
 include(cmake/OpenCVUtils.cmake)

 # must go before the project command
@ -280,16 +288,6 @@ OCV_OPTION(ENABLE_COVERAGE            "Enable coverage collection with  GCov"
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"                      ON   IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_POPCNT              "Enable POPCNT instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_AVX2                "Enable AVX2 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_FMA3                "Enable FMA3 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 "${NEON}" IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
@ -299,6 +297,9 @@ OCV_OPTION(ENABLE_IMPL_COLLECTION     "Collect implementation data on function c
 OCV_OPTION(ENABLE_INSTRUMENTATION     "Instrument functions to collect calls trace and performance" OFF )
 OCV_OPTION(ENABLE_GNU_STL_DEBUG       "Enable GNU STL Debug mode (defines _GLIBCXX_DEBUG)"       OFF IF ((NOT CMAKE_VERSION VERSION_LESS "2.8.11") AND CMAKE_COMPILER_IS_GNUCXX) )
 OCV_OPTION(GENERATE_ABI_DESCRIPTOR    "Generate XML file for abi_compliance_checker tool" OFF IF UNIX)
+OCV_OPTION(CV_ENABLE_INTRINSICS       "Use intrinsic-based optimized code" ON )
+OCV_OPTION(CV_DISABLE_OPTIMIZATION    "Disable explicit optimized code (dispatched code/intrinsics/loop unrolling/etc)" OFF )
+

 OCV_OPTION(DOWNLOAD_EXTERNAL_TEST_DATA "Download external test data (Python executable and OPENCV_TEST_DATA_PATH environment variable may be required)" OFF )

@ -499,6 +500,9 @@ if(CMAKE_GENERATOR MATCHES "Makefiles|Ninja" AND "${CMAKE_BUILD_TYPE}" STREQUAL
  set(CMAKE_BUILD_TYPE Release)
 endif()

+# --- Python Support ---
+include(cmake/OpenCVDetectPython.cmake)
+
 include(cmake/OpenCVCompilerOptions.cmake)


@ -578,9 +582,6 @@ else()
  unset(DOXYGEN_FOUND CACHE)
 endif()

-# --- Python Support ---
-include(cmake/OpenCVDetectPython.cmake)
-
 # --- Java Support ---
 include(cmake/OpenCVDetectApacheAnt.cmake)
 if(ANDROID)
@ -869,6 +870,33 @@ if(NOT CMAKE_GENERATOR MATCHES "Xcode|Visual Studio")
  status("    Configuration:"  ${CMAKE_BUILD_TYPE})
 endif()

+
+# ========================= CPU code generation mode =========================
+status("")
+status("  CPU/HW features:")
+status("    Baseline:"  "${CPU_BASELINE_FINAL}")
+if(NOT CPU_BASELINE STREQUAL CPU_BASELINE_FINAL)
+  status("      requested:"  "${CPU_BASELINE}")
+endif()
+if(CPU_BASELINE_REQUIRE)
+  status("      required:"  "${CPU_BASELINE_REQUIRE}")
+endif()
+if(CPU_BASELINE_DISABLE)
+  status("      disabled:"  "${CPU_BASELINE_DISABLE}")
+endif()
+if(CPU_DISPATCH_FINAL OR CPU_DISPATCH)
+  status("    Dispatched code generation:"  "${CPU_DISPATCH_FINAL}")
+  if(NOT CPU_DISPATCH STREQUAL CPU_DISPATCH_FINAL)
+    status("      requested:"  "${CPU_DISPATCH}")
+  endif()
+  if(CPU_DISPATCH_REQUIRE)
+    status("      required:"  "${CPU_DISPATCH_REQUIRE}")
+  endif()
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    status("      ${OPT} (${CPU_${OPT}_USAGE_COUNT} files):"  "+ ${CPU_DISPATCH_${OPT}_INCLUDED}")
+  endforeach()
+endif()
+
 # ========================== C/C++ options ==========================
 if(CMAKE_CXX_COMPILER_VERSION)
  set(OPENCV_COMPILER_STR "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1} (ver ${CMAKE_CXX_COMPILER_VERSION})")
--- a/apps/interactive-calibration/CMakeLists.txt
+++ b/apps/interactive-calibration/CMakeLists.txt
@ -1,4 +1,7 @@
-set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_imgproc opencv_features2d opencv_aruco opencv_highgui opencv_calib3d opencv_videoio)
+set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_imgproc opencv_features2d opencv_highgui opencv_calib3d opencv_videoio)
+if(${BUILD_opencv_aruco})
+    list(APPEND OPENCV_INTERACTIVECALIBRATION_DEPS opencv_aruco)
+endif()
 ocv_check_dependencies(${OPENCV_INTERACTIVECALIBRATION_DEPS})

 if(NOT OCV_DEPENDENCIES_FOUND)
--- a/apps/interactive-calibration/frameProcessor.cpp
+++ b/apps/interactive-calibration/frameProcessor.cpp
@ -7,7 +7,6 @@

 #include <opencv2/calib3d.hpp>
 #include <opencv2/imgproc.hpp>
-#include <opencv2/aruco/charuco.hpp>
 #include <opencv2/highgui.hpp>

 #include <vector>
@ -75,6 +74,7 @@ bool CalibProcessor::detectAndParseChessboard(const cv::Mat &frame)

 bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
 {
+#ifdef HAVE_OPENCV_ARUCO
    cv::Ptr<cv::aruco::Board> board = mCharucoBoard.staticCast<cv::aruco::Board>();

    std::vector<std::vector<cv::Point2f> > corners, rejected;
@ -95,14 +95,16 @@ bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
        }
        centerX /= currentCharucoCorners.size[0];
        centerY /= currentCharucoCorners.size[0];
-        //cv::circle(frame, cv::Point2f(centerX, centerY), 10, cv::Scalar(0, 255, 0), 10);
+
        mTemplateLocations.insert(mTemplateLocations.begin(), cv::Point2f(centerX, centerY));
        cv::aruco::drawDetectedCornersCharuco(frame, currentCharucoCorners, currentCharucoIds);
        mCurrentCharucoCorners = currentCharucoCorners;
        mCurrentCharucoIds = currentCharucoIds;
        return true;
    }
-
+#else
+    (void)frame;
+#endif
    return false;
 }

@ -231,6 +233,7 @@ bool CalibProcessor::checkLastFrame()
        }
    }
    else {
+#ifdef HAVE_OPENCV_ARUCO
        cv::Mat r, t, angles;
        std::vector<cv::Point3f> allObjPoints;
        allObjPoints.reserve(mCurrentCharucoIds.total());
@ -248,6 +251,7 @@ bool CalibProcessor::checkLastFrame()
            mCalibData->allCharucoCorners.pop_back();
            mCalibData->allCharucoIds.pop_back();
        }
+#endif
    }
    return isFrameBad;
 }
@ -266,10 +270,12 @@ CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters
    switch(mBoardType)
    {
    case chAruco:
+#ifdef HAVE_OPENCV_ARUCO
        mArucoDictionary = cv::aruco::getPredefinedDictionary(
                    cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
        mCharucoBoard = cv::aruco::CharucoBoard::create(mBoardSize.width, mBoardSize.height, capParams.charucoSquareLenght,
                                                        capParams.charucoMarkerSize, mArucoDictionary);
+#endif
        break;
    case AcirclesGrid:
        mBlobDetectorPtr = cv::SimpleBlobDetector::create();
--- a/apps/interactive-calibration/frameProcessor.hpp
+++ b/apps/interactive-calibration/frameProcessor.hpp
@ -6,8 +6,10 @@
 #define FRAME_PROCESSOR_HPP

 #include <opencv2/core.hpp>
-#include <opencv2/aruco/charuco.hpp>
 #include <opencv2/calib3d.hpp>
+#ifdef HAVE_OPENCV_ARUCO
+#include <opencv2/aruco/charuco.hpp>
+#endif

 #include "calibCommon.hpp"
 #include "calibController.hpp"
@ -37,8 +39,10 @@ protected:
    cv::Mat mCurrentCharucoIds;

    cv::Ptr<cv::SimpleBlobDetector> mBlobDetectorPtr;
+#ifdef HAVE_OPENCV_ARUCO
    cv::Ptr<cv::aruco::Dictionary> mArucoDictionary;
    cv::Ptr<cv::aruco::CharucoBoard> mCharucoBoard;
+#endif

    int mNeededFramesNum;
    unsigned mDelayBetweenCaptures;
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@ -4,10 +4,13 @@

 #include <opencv2/core.hpp>
 #include <opencv2/calib3d.hpp>
-#include <opencv2/aruco/charuco.hpp>
 #include <opencv2/cvconfig.h>
 #include <opencv2/highgui.hpp>

+#ifdef HAVE_OPENCV_ARUCO
+#include <opencv2/aruco/charuco.hpp>
+#endif
+
 #include <string>
 #include <vector>
 #include <stdexcept>
@ -50,31 +53,27 @@ bool calib::showOverlayMessage(const std::string& message)
 #endif
 }

-static void deleteButton(int state, void* data)
+static void deleteButton(int, void* data)
 {
-    state++; //to avoid gcc warnings
    (static_cast<cv::Ptr<calibDataController>*>(data))->get()->deleteLastFrame();
    calib::showOverlayMessage("Last frame deleted");
 }

-static void deleteAllButton(int state, void* data)
+static void deleteAllButton(int, void* data)
 {
-    state++;
    (static_cast<cv::Ptr<calibDataController>*>(data))->get()->deleteAllData();
    calib::showOverlayMessage("All frames deleted");
 }

-static void saveCurrentParamsButton(int state, void* data)
+static void saveCurrentParamsButton(int, void* data)
 {
-    state++;
    if((static_cast<cv::Ptr<calibDataController>*>(data))->get()->saveCurrentCameraParameters())
        calib::showOverlayMessage("Calibration parameters saved");
 }

 #ifdef HAVE_QT
-static void switchVisualizationModeButton(int state, void* data)
+static void switchVisualizationModeButton(int, void* data)
 {
-    state++;
    ShowProcessor* processor = static_cast<ShowProcessor*>(((cv::Ptr<FrameProcessor>*)data)->get());
    processor->switchVisualizationMode();
 }
@ -103,6 +102,11 @@ int main(int argc, char** argv)

    captureParameters capParams = paramsController.getCaptureParameters();
    internalParameters intParams = paramsController.getInternalParameters();
+#ifndef HAVE_OPENCV_ARUCO
+    if(capParams.board == chAruco)
+        CV_Error(cv::Error::StsNotImplemented, "Aruco module is disabled in current build configuration."
+                                               " Consider usage of another calibration pattern\n");
+#endif

    cv::TermCriteria solverTermCrit = cv::TermCriteria(cv::TermCriteria::COUNT+cv::TermCriteria::EPS,
                                                       intParams.solverMaxIters, intParams.solverEps);
@ -172,6 +176,7 @@ int main(int argc, char** argv)
                                                    calibrationFlags, solverTermCrit);
                }
                else {
+#ifdef HAVE_OPENCV_ARUCO
                    cv::Ptr<cv::aruco::Dictionary> dictionary =
                            cv::aruco::getPredefinedDictionary(cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
                    cv::Ptr<cv::aruco::CharucoBoard> charucoboard =
@ -183,6 +188,7 @@ int main(int argc, char** argv)
                                                           globalData->cameraMatrix, globalData->distCoeffs,
                                                           cv::noArray(), cv::noArray(), globalData->stdDeviations, cv::noArray(),
                                                           globalData->perViewErrors, calibrationFlags, solverTermCrit);
+#endif
                }
                dataController->updateUndistortMap();
                dataController->printParametersToConsole(std::cout);
--- a/cmake/FindOpenVX.cmake
+++ b/cmake/FindOpenVX.cmake
@ -25,6 +25,20 @@ endif()

 if(OPENVX_INCLUDE_DIR AND OPENVX_LIBRARIES)
  set(HAVE_OPENVX TRUE)
+
+  try_compile(OPENVX_RENAMED_REF
+      "${OpenCV_BINARY_DIR}"
+      "${OpenCV_SOURCE_DIR}/cmake/checks/openvx_refenum_test.cpp"
+      CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${OPENVX_INCLUDE_DIR}"
+      LINK_LIBRARIES ${OPENVX_LIBRARIES}
+      OUTPUT_VARIABLE OUTPUT
+  )
+  if(OPENVX_RENAMED_REF)
+      add_definitions(-DIVX_RENAMED_REFS=1)
+      message(STATUS "OpenVX: Checking reference attribute name convention... New")
+  else()
+      message(STATUS "OpenVX: Checking reference attribute name convention... Old")
+  endif()
 endif()

 if(NOT HAVE_OPENVX)
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -0,0 +1,651 @@
+# x86/x86-64 arch:
+# SSE / SSE2 (always available on 64-bit CPUs)
+# SSE3 / SSSE3
+# SSE4_1 / SSE4_2 / POPCNT
+# AVX / AVX2 / AVX512
+# FMA3
+
+# CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag)
+# CPU_{opt}_IMPLIES=<list>
+# CPU_{opt}_FORCE=<list> - subset of "implies" list
+# CPU_{opt}_FLAGS_ON=""
+# CPU_{opt}_FEATURE_ALIAS - mapping to CV_CPU_* HWFeature enum
+
+# Input variables:
+# CPU_BASELINE=<list> - preferred list of baseline optimizations
+# CPU_DISPATCH=<list> - preferred list of dispatched optimizations
+
+# Advanced input variables:
+# CPU_BASELINE_REQUIRE=<list> - list of required baseline optimizations
+# CPU_DISPATCH_REQUIRE=<list> - list of required dispatched optimizations
+# CPU_BASELINE_DISABLE=<list> - list of disabled baseline optimizations
+
+# Output variables:
+# CPU_BASELINE_FINAL=<list> - final list of enabled compiler optimizations
+# CPU_DISPATCH_FINAL=<list> - final list of dispatched optimizations
+#
+# CPU_DISPATCH_FLAGS_${opt} - flags for source files compiled separately (_opt_avx2.cpp)
+
+set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3") # without AVX512
+list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
+
+ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
+
+
+set(HELP_CPU_BASELINE "Specify list of enabled baseline CPU optimizations")
+set(HELP_CPU_BASELINE_REQUIRE "Specify list of required baseline CPU optimizations")
+set(HELP_CPU_BASELINE_DISABLE "Specify list of forbidden baseline CPU optimizations")
+set(HELP_CPU_DISPATCH "Specify list of dispatched CPU optimizations")
+set(HELP_CPU_DISPATCH_REQUIRE "Specify list of required dispatched CPU optimizations")
+
+# Allow comma-separated values in the user-facing CPU_* list options:
+# every ',' is replaced with ';' and the result is written back to the cache (FORCE).
+# Options that are not defined are left untouched.
+foreach(var CPU_BASELINE CPU_BASELINE_REQUIRE CPU_BASELINE_DISABLE CPU_DISPATCH CPU_DISPATCH_REQUIRE)
+  if(DEFINED ${var})
+    string(REPLACE "," ";" _list "${${var}}")
+    set(${var} "${_list}" CACHE STRING "${HELP_${var}}" FORCE)
+  endif()
+endforeach()
+
+# process legacy flags
+# Map an obsolete ENABLE_<X> switch onto the CPU_BASELINE_REQUIRE / CPU_BASELINE_DISABLE lists.
+#   legacy_flag - name of the obsolete option variable (e.g. ENABLE_SSE)
+#   OPT         - optimization name to put into the lists (e.g. SSE)
+#   legacy_warn - when true, a deprecation notice is printed if the legacy option is defined
+# Does nothing when the legacy option is not defined.
+macro(ocv_optimization_process_obsolete_option legacy_flag OPT legacy_warn)
+  if(DEFINED ${legacy_flag})
+    if(${legacy_warn})
+      message(STATUS "WARNING: Option ${legacy_flag}='${${legacy_flag}}' is deprecated and should not be used anymore")
+      message(STATUS "         Behaviour of this option is not backward compatible")
+      message(STATUS "         Refer to 'CPU_BASELINE'/'CPU_DISPATCH' CMake options documentation")
+    endif()
+    if(${legacy_flag})
+      # Legacy option is ON: require OPT in the baseline (unless it is already listed)
+      if(NOT ";${CPU_BASELINE_REQUIRE};" MATCHES ";${OPT};")
+        set(CPU_BASELINE_REQUIRE "${CPU_BASELINE_REQUIRE};${OPT}" CACHE STRING "${HELP_CPU_BASELINE_REQUIRE}" FORCE)
+      endif()
+    else()
+      # Legacy option is OFF: forbid OPT in the baseline (unless it is already listed)
+      if(NOT ";${CPU_BASELINE_DISABLE};" MATCHES ";${OPT};")
+        set(CPU_BASELINE_DISABLE "${CPU_BASELINE_DISABLE};${OPT}" CACHE STRING "${HELP_CPU_BASELINE_DISABLE}" FORCE)
+      endif()
+    endif()
+  endif()
+endmacro()
+ocv_optimization_process_obsolete_option(ENABLE_SSE SSE ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE2 SSE2 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE3 SSE3 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSSE3 SSSE3 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE41 SSE4_1 ON)
+ocv_optimization_process_obsolete_option(ENABLE_SSE42 SSE4_2 ON)
+ocv_optimization_process_obsolete_option(ENABLE_POPCNT POPCNT ON)
+ocv_optimization_process_obsolete_option(ENABLE_AVX AVX ON)
+ocv_optimization_process_obsolete_option(ENABLE_AVX2 AVX2 ON)
+ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
+
+ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
+ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)
+
+
+# Check whether an optimization is present in a list, directly or through "implies" relations.
+#   resultvar - output variable: set to 1 if found, 0 otherwise
+#   check_opt - optimization name to look for
+#   ARGN      - list of optimization names to search
+# The search proceeds level by level: names from the current queue are compared with check_opt,
+# and the CPU_<name>_IMPLIES lists of names not seen before are queued for the next pass.
+macro(ocv_is_optimization_in_list resultvar check_opt)
+  set(__checked "")
+  set(__queue ${ARGN})
+  set(${resultvar} 0)
+  while(__queue AND NOT ${resultvar})
+    list(REMOVE_DUPLICATES __queue)
+    set(__queue_current ${__queue})
+    set(__queue "")
+    foreach(OPT ${__queue_current})
+      # "x" prefix keeps both operands from being treated as variable names by if()
+      if("x${OPT}" STREQUAL "x${check_opt}")
+        set(${resultvar} 1)
+        break()
+      elseif(NOT ";${__checked};" MATCHES ";${OPT};")
+        # Not visited yet: schedule everything this optimization implies
+        list(APPEND __queue ${CPU_${OPT}_IMPLIES})
+      endif()
+      list(APPEND __checked ${OPT})
+    endforeach()
+  endwhile()
+endmacro()
+
+# Check whether an optimization is present in a list, directly or through "force" relations.
+#   resultvar - output variable: set to 1 if found, 0 otherwise
+#   check_opt - optimization name to look for
+#   ARGN      - list of optimization names to search
+# Same traversal as ocv_is_optimization_in_list(), but follows the CPU_<name>_FORCE lists
+# instead of CPU_<name>_IMPLIES.
+macro(ocv_is_optimization_in_force_list resultvar check_opt)
+  set(__checked "")
+  set(__queue ${ARGN})
+  set(${resultvar} 0)
+  while(__queue AND NOT ${resultvar})
+    list(REMOVE_DUPLICATES __queue)
+    set(__queue_current ${__queue})
+    set(__queue "")
+    foreach(OPT ${__queue_current})
+      # Compare with an "x" prefix (as ocv_is_optimization_in_list() does), so that an optimization
+      # name which is also a variable name is never implicitly dereferenced by if()
+      if("x${OPT}" STREQUAL "x${check_opt}")
+        set(${resultvar} 1)
+        break()
+      elseif(NOT ";${__checked};" MATCHES ";${OPT};")
+        # Not visited yet: schedule everything this optimization forces
+        list(APPEND __queue ${CPU_${OPT}_FORCE})
+      endif()
+      list(APPEND __checked ${OPT})
+    endforeach()
+  endwhile()
+endmacro()
+
+# Append the compiler flags of optimization OPT (CPU_<OPT>_FLAGS_ON) to the flags string 'var'.
+# If CPU_<OPT>_FLAGS_CONFLICT is set, it is used as a regular expression: matching flags are
+# removed from 'var' first, then leading spaces are trimmed.
+# NOTE(review): the conflict pattern is prefixed with a space but not grouped, so for a pattern
+# with alternatives ("a|b|c") the space seems to apply to the first alternative only - confirm
+# whether the leftover spaces matter.
+macro(ocv_append_optimization_flag var OPT)
+  if(CPU_${OPT}_FLAGS_CONFLICT)
+    string(REGEX REPLACE " ${CPU_${OPT}_FLAGS_CONFLICT}" "" ${var} " ${${var}}")
+    string(REGEX REPLACE "^ +" "" ${var} "${${var}}")
+  endif()
+  set(${var} "${${var}} ${CPU_${OPT}_FLAGS_ON}")
+endmacro()
+
+# Support GCC -march=native or Intel Compiler -xHost flags
+if(";${CPU_BASELINE};" MATCHES ";NATIVE;" OR ";${CPU_BASELINE};" MATCHES ";HOST;")
+  set(CPU_BASELINE_DETECT ON)
+  set(_add_native_flag ON)
+elseif(";${CPU_BASELINE};" MATCHES ";DETECT;")
+  set(CPU_BASELINE_DETECT ON)
+elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
+  if(DEFINED CPU_BASELINE)
+    message(STATUS "CPU: Detected '-march=native' or '-xHost' compiler flag. Force CPU_BASELINE=DETECT.")
+  endif()
+  set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
+  set(CPU_BASELINE_DETECT ON)
+endif()
+
+if(X86 OR X86_64)
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX512")
+
+  ocv_update(CPU_SSE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
+  ocv_update(CPU_SSE2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
+  ocv_update(CPU_SSE3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse3.cpp")
+  ocv_update(CPU_SSSE3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_ssse3.cpp")
+  ocv_update(CPU_SSE4_1_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse41.cpp")
+  ocv_update(CPU_SSE4_2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse42.cpp")
+  ocv_update(CPU_POPCNT_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_popcnt.cpp")
+  ocv_update(CPU_AVX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
+  ocv_update(CPU_AVX2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
+  ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+  ocv_update(CPU_AVX512_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512.cpp")
+
+  if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE)
+    ocv_update(CPU_AVX512_IMPLIES "AVX2")
+    ocv_update(CPU_AVX512_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_AVX2_IMPLIES "AVX;FMA3;FP16")
+    ocv_update(CPU_FMA3_IMPLIES "AVX2")
+    ocv_update(CPU_FMA3_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_FP16_IMPLIES "AVX")
+    ocv_update(CPU_FP16_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_AVX_IMPLIES "SSE4_2")
+    ocv_update(CPU_SSE4_2_IMPLIES "SSE4_1;POPCNT")
+    ocv_update(CPU_POPCNT_IMPLIES "SSE4_1")
+    ocv_update(CPU_POPCNT_FORCE "") # Don't force other optimizations
+    ocv_update(CPU_SSE4_1_IMPLIES "SSE3;SSSE3")
+    ocv_update(CPU_SSSE3_IMPLIES "SSE3")
+    ocv_update(CPU_SSE3_IMPLIES "SSE2")
+    ocv_update(CPU_SSE2_IMPLIES "SSE")
+  endif()
+
+  if(CV_ICC)
+    macro(ocv_intel_compiler_optimization_option name unix_flags msvc_flags)
+      ocv_update(CPU_${name}_FLAGS_NAME "${name}")
+      if(MSVC)
+        set(enable_flags "${msvc_flags}")
+        set(flags_conflict "/arch:[^ ]+")
+      else()
+        set(enable_flags "${unix_flags}")
+        set(flags_conflict "-msse[^ ]*|-mssse3|-mavx[^ ]*|-march[^ ]+")
+      endif()
+      ocv_update(CPU_${name}_FLAGS_ON "${enable_flags}")
+      if(flags_conflict)
+        ocv_update(CPU_${name}_FLAGS_CONFLICT "${flags_conflict}")
+      endif()
+    endmacro()
+    ocv_intel_compiler_optimization_option(AVX2 "-march=core-avx2" "/arch:CORE-AVX2")
+    ocv_intel_compiler_optimization_option(FP16 "-mavx" "/arch:AVX")
+    ocv_intel_compiler_optimization_option(AVX "-mavx" "/arch:AVX")
+    ocv_intel_compiler_optimization_option(FMA3 "" "")
+    ocv_intel_compiler_optimization_option(POPCNT "" "")
+    ocv_intel_compiler_optimization_option(SSE4_2 "-msse4.2" "/arch:SSE4.2")
+    ocv_intel_compiler_optimization_option(SSE4_1 "-msse4.1" "/arch:SSE4.1")
+    ocv_intel_compiler_optimization_option(SSE3 "-msse3" "/arch:SSE3")
+    ocv_intel_compiler_optimization_option(SSSE3 "-mssse3" "/arch:SSSE3")
+    ocv_intel_compiler_optimization_option(SSE2 "-msse2" "/arch:SSE2")
+    if(NOT X86_64) # x64 compiler doesn't support /arch:sse
+      ocv_intel_compiler_optimization_option(SSE "-msse" "/arch:SSE")
+    endif()
+    #ocv_intel_compiler_optimization_option(AVX512   "-march=core-avx512")
+  elseif(CMAKE_COMPILER_IS_GNUCXX)
+    ocv_update(CPU_AVX2_FLAGS_ON "-mavx2")
+    ocv_update(CPU_FP16_FLAGS_ON "-mf16c")
+    ocv_update(CPU_AVX_FLAGS_ON "-mavx")
+    ocv_update(CPU_FMA3_FLAGS_ON "-mfma")
+    ocv_update(CPU_POPCNT_FLAGS_ON "-mpopcnt")
+    ocv_update(CPU_SSE4_2_FLAGS_ON "-msse4.2")
+    ocv_update(CPU_SSE4_1_FLAGS_ON "-msse4.1")
+    ocv_update(CPU_SSE3_FLAGS_ON "-msse3")
+    ocv_update(CPU_SSSE3_FLAGS_ON "-mssse3")
+    ocv_update(CPU_SSE2_FLAGS_ON "-msse2")
+    ocv_update(CPU_SSE_FLAGS_ON "-msse")
+    if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0")
+      ocv_update(CPU_AVX512_FLAGS_ON "-mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi")
+    endif()
+  elseif(MSVC)
+    ocv_update(CPU_AVX2_FLAGS_ON "/arch:AVX2")
+    ocv_update(CPU_AVX_FLAGS_ON "/arch:AVX")
+    if(NOT MSVC64)
+      # 64-bit MSVC compiler uses SSE/SSE2 by default
+      ocv_update(CPU_SSE_FLAGS_ON "/arch:SSE")
+      ocv_update(CPU_SSE_SUPPORTED ON)
+      ocv_update(CPU_SSE2_FLAGS_ON "/arch:SSE2")
+      ocv_update(CPU_SSE2_SUPPORTED ON)
+    else()
+      ocv_update(CPU_SSE_SUPPORTED ON)
+      ocv_update(CPU_SSE2_SUPPORTED ON)
+    endif()
+    # Other instruction sets are supported by default since MSVC 2008 at least
+  else()
+    message(WARNING "TODO: Unsupported compiler")
+  endif()
+
+  if(NOT DEFINED CPU_DISPATCH)
+    set(CPU_DISPATCH "SSE4_1;AVX;FP16;AVX2" CACHE STRING "${HELP_CPU_DISPATCH}")
+  endif()
+
+  if(NOT DEFINED CPU_BASELINE)
+    if(X86_64)
+      set(CPU_BASELINE "SSSE3" CACHE STRING "${HELP_CPU_BASELINE}")
+    else()
+      set(CPU_BASELINE "SSE2" CACHE STRING "${HELP_CPU_BASELINE}")
+    endif()
+  endif()
+
+elseif(ARM OR AARCH64)
+  ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+  if(NOT AARCH64)
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
+    ocv_update(CPU_NEON_FLAGS_ON "-mfpu=neon")
+    ocv_update(CPU_VFPV3_FLAGS_ON "-mfpu=vfpv3")
+    ocv_update(CPU_FP16_FLAGS_ON "-mfpu=neon-fp16")
+    set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
+  else()
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16")
+    ocv_update(CPU_NEON_FLAGS_ON "")
+    set(CPU_BASELINE "NEON" CACHE STRING "${HELP_CPU_BASELINE}")
+  endif()
+endif()
+
+# Helper values for cmake-gui
+set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
+set(CPU_DISPATCH "" CACHE STRING "${HELP_CPU_DISPATCH}")
+set_property(CACHE CPU_BASELINE PROPERTY STRINGS "" ${CPU_KNOWN_OPTIMIZATIONS})
+set_property(CACHE CPU_DISPATCH PROPERTY STRINGS "" ${CPU_KNOWN_OPTIMIZATIONS})
+
+set(CPU_BASELINE_FLAGS "")
+
+set(CPU_BASELINE_FINAL "")
+set(CPU_DISPATCH_FINAL "")
+
+# Determine whether the compiler can build code for optimization OPT and set CPU_<OPT>_SUPPORTED.
+# Nothing is done when CPU_<OPT>_SUPPORTED is already defined.
+# An optimization with neither a non-empty CPU_<OPT>_FLAGS_ON nor a CPU_<OPT>_TEST_FILE is marked
+# as supported without any check.
+# In detection mode (CPU_BASELINE_DETECT) an optimization whose test file builds with the baseline
+# flags alone is also appended to CPU_BASELINE_FINAL.
+macro(ocv_check_compiler_optimization OPT)
+  if(NOT DEFINED CPU_${OPT}_SUPPORTED)
+    if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE)
+      # _varname holds the NAME of the variable that receives the check result
+      set(_varname "")
+      if(CPU_${OPT}_TEST_FILE)
+        set(__available 0)
+        if(CPU_BASELINE_DETECT)
+          # Try the test file with the current baseline flags only (no OPT-specific flag)
+          set(_varname "HAVE_CPU_${OPT}_SUPPORT")
+          ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
+          if(${_varname})
+            list(APPEND CPU_BASELINE_FINAL ${OPT})
+            set(__available 1)
+          endif()
+        endif()
+        if(NOT __available)
+          # Try the test file with the OPT-specific flag added
+          if(NOT "x${CPU_${OPT}_FLAGS_NAME}" STREQUAL "x")
+            # An explicit result name is provided through CPU_<OPT>_FLAGS_NAME
+            set(_varname "HAVE_CPU_${CPU_${OPT}_FLAGS_NAME}")
+            set(_compile_flags "${CPU_BASELINE_FLAGS}")
+            ocv_append_optimization_flag(_compile_flags ${OPT})
+            ocv_check_compiler_flag(CXX "${_compile_flags}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
+          elseif(NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x")
+            # presumably ocv_check_flag_support() stores the result variable name in _varname - confirm
+            ocv_check_flag_support(CXX "${CPU_${OPT}_FLAGS_ON}" _varname "" "${CPU_${OPT}_TEST_FILE}")
+          else()
+            set(_varname "HAVE_CPU_${OPT}_SUPPORT")
+            set(_compile_flags "${CPU_BASELINE_FLAGS}")
+            ocv_append_optimization_flag(_compile_flags ${OPT})
+            ocv_check_compiler_flag(CXX "${_compile_flags}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
+          endif()
+        endif()
+      else()
+        # No test file: only check that the flag itself is accepted
+        ocv_check_flag_support(CXX "${CPU_${OPT}_FLAGS_ON}" _varname "")
+      endif()
+      # Dereference twice: first the name kept in _varname, then the check result itself
+      if(_varname AND ${_varname})
+        set(CPU_${OPT}_SUPPORTED ON)
+      elseif(NOT CPU_${OPT}_SUPPORTED)
+        message(STATUS "${OPT} is not supported by C++ compiler")
+      endif()
+    else()
+      set(CPU_${OPT}_SUPPORTED ON)
+    endif()
+  endif()
+endmacro()
+
+# Per-optimization defaults: reset the usage counter (internal cache entry) and,
+# when no explicit CPU_<OPT>_FORCE list was provided, make it equal to CPU_<OPT>_IMPLIES.
+foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
+  set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "" FORCE)
+  if(NOT DEFINED CPU_${OPT}_FORCE)
+    set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}")
+  endif()
+endforeach()
+
+# CPU_BASELINE contains NATIVE or HOST: add the "optimize for the build machine" compiler flag
+# to the baseline flags. "-march=native" is tried first, then "-xHost" ("/QxHost" with MSVC).
+if(_add_native_flag)
+  set(_varname "HAVE_CPU_NATIVE_SUPPORT")
+  ocv_check_compiler_flag(CXX "-march=native" "${_varname}" "")
+  # Test the check result stored in the variable named by _varname; _varname itself always
+  # holds a non-empty name, so testing it directly would always succeed
+  if(${_varname})
+    set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} -march=native")
+  else()
+    set(_varname "HAVE_CPU_HOST_SUPPORT")
+    if(MSVC)
+      set(_flag "/QxHost")
+    else()
+      set(_flag "-xHost")
+    endif()
+    ocv_check_compiler_flag(CXX "${_flag}" "${_varname}" "")
+    if(${_varname})
+      # Append the flag that was just checked (_flag)
+      set(CPU_BASELINE_FLAGS "${CPU_BASELINE_FLAGS} ${_flag}")
+    endif()
+  endif()
+endif()
+
+# Classify every known optimization as baseline, dispatched or unused and fill
+# CPU_BASELINE_FINAL / CPU_DISPATCH_FINAL / CPU_BASELINE_FLAGS accordingly.
+foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
+  # OPT is disabled if any entry of CPU_BASELINE_DISABLE is OPT itself or is reachable from OPT
+  # through the "implies" relations
+  set(__is_disabled 0)
+  foreach(OPT2 ${CPU_BASELINE_DISABLE})
+    ocv_is_optimization_in_list(__is_disabled ${OPT2} ${OPT})
+    if(__is_disabled)
+      break()
+    endif()
+  endforeach()
+  if(__is_disabled)
+    set(__is_from_baseline 0)
+  else()
+    # Required baseline entries are consulted first, then the preferred baseline list
+    ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE_REQUIRE})
+    if(NOT __is_from_baseline)
+      ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE})
+    endif()
+  endif()
+  ocv_is_optimization_in_list(__is_from_dispatch ${OPT} ${CPU_DISPATCH_REQUIRE})
+  if(NOT __is_from_dispatch)
+    ocv_is_optimization_in_list(__is_from_dispatch ${OPT} ${CPU_DISPATCH})
+  endif()
+  # Compiler support is probed only for optimizations that are requested somewhere
+  # (or for all of them in detection mode)
+  if(__is_from_dispatch OR __is_from_baseline OR CPU_BASELINE_DETECT)
+    ocv_check_compiler_optimization(${OPT})
+  endif()
+  # In detection mode the check above may have added OPT to CPU_BASELINE_FINAL
+  if(CPU_BASELINE_DETECT AND NOT __is_from_baseline AND NOT __is_disabled)
+    ocv_is_optimization_in_list(__is_from_baseline ${OPT} ${CPU_BASELINE_FINAL})
+  endif()
+  if(CPU_${OPT}_SUPPORTED)
+    # Dispatch only what is listed explicitly in CPU_DISPATCH and is not already in the baseline
+    if(";${CPU_DISPATCH};" MATCHES ";${OPT};" AND NOT __is_from_baseline)
+      list(APPEND CPU_DISPATCH_FINAL ${OPT})
+    elseif(__is_from_baseline AND NOT CPU_BASELINE_DETECT)
+      list(APPEND CPU_BASELINE_FINAL ${OPT})
+      ocv_append_optimization_flag(CPU_BASELINE_FLAGS ${OPT})
+    endif()
+  endif()
+endforeach()
+
+# Report requested optimizations that did not make it into the final lists.
+# Missing "required" entries are errors (SEND_ERROR), missing "preferred" entries are only reported.
+
+# Every required baseline optimization must be in the final baseline list
+foreach(OPT ${CPU_BASELINE_REQUIRE})
+  if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})")
+  endif()
+endforeach()
+
+# Preferred baseline optimizations; the DETECT/HOST/NATIVE keywords are not optimization names
+foreach(OPT ${CPU_BASELINE})
+  if(OPT STREQUAL "DETECT" OR OPT STREQUAL "HOST" OR OPT STREQUAL "NATIVE")
+    # nothing
+  elseif(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    message(STATUS "Optimization ${OPT} is not available, skipped")
+  endif()
+endforeach()
+
+# A required dispatched optimization must be dispatched; ending up in the baseline is an error too
+foreach(OPT ${CPU_DISPATCH_REQUIRE})
+  if(";${CPU_DISPATCH_FINAL};" MATCHES ";${OPT};")
+    # OK
+  elseif(";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    message(SEND_ERROR "Dispatched optimization ${OPT} is in baseline list (CPU_DISPATCH_REQUIRE=${CPU_DISPATCH_REQUIRE})")
+  else()
+    message(SEND_ERROR "Required dispatch optimization is not supported: ${OPT} (CPU_DISPATCH_REQUIRE=${CPU_DISPATCH_REQUIRE})")
+  endif()
+endforeach()
+
+# A preferred dispatched optimization is fine if it is dispatched or already part of the baseline
+foreach(OPT ${CPU_DISPATCH})
+  if(";${CPU_DISPATCH_FINAL};" MATCHES ";${OPT};")
+    # OK
+  elseif(";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
+    # OK
+  else()
+    message(STATUS "Dispatch optimization ${OPT} is not available, skipped")
+  endif()
+endforeach()
+
+#message(STATUS "CPU_BASELINE_FINAL=${CPU_BASELINE_FINAL}")
+#message(STATUS "CPU_DISPATCH_FINAL=${CPU_DISPATCH_FINAL}")
+
+#if(CPU_DISPATCH_FINAL AND NOT PYTHON_DEFAULT_EXECUTABLE)
+#  message(FATAL_ERROR "Python is required for CPU dispatched optimization support")
+#endif()
+
+# Applies the selected CPU optimizations to the build flags:
+#  - verifies that the combined baseline flags compile, then appends them to the
+#    extra C/C++ flags via add_extra_compiler_option_force;
+#  - for every mode in CPU_DISPATCH_FINAL computes and verifies its flag set and
+#    publishes CPU_DISPATCH_FLAGS_<mode>, CPU_DISPATCH_DEFINITIONS_<mode>,
+#    CPU_DISPATCH_<mode>_INCLUDED and CPU_DISPATCH_<mode>_FORCED;
+#  - adds the PowerPC / ARM specific options.
+# This is a macro, so every variable it sets lands in the caller's scope.
+macro(ocv_compiler_optimization_options)
+  set(__flags "${OPENCV_EXTRA_CXX_FLAGS} ${CPU_BASELINE_FLAGS}")
+  # When the flag string differs from the one recorded in the cache, record the new
+  # string and clear the previous probe result (presumably ocv_clear_vars unsets it -
+  # TODO confirm) so that the check below is performed again.
+  if(NOT __flags STREQUAL CACHED_CPU_BASELINE_FLAGS)
+    set(CACHED_CPU_BASELINE_FLAGS "${__flags}" CACHE INTERNAL "" FORCE)
+    ocv_clear_vars(HAVE_CPU_BASELINE_FLAGS)
+  endif()
+  ocv_check_compiler_flag(CXX "${__flags}" HAVE_CPU_BASELINE_FLAGS)
+  if(NOT HAVE_CPU_BASELINE_FLAGS)
+    message(FATAL_ERROR "Compiler doesn't support baseline optimization flags: ${CPU_BASELINE_FLAGS}")
+  endif()
+  # The whole flag set was validated above, so it is appended as-is.
+  add_extra_compiler_option_force("${CPU_BASELINE_FLAGS}")
+
+  # Build the per-mode flag / definition / membership lists for each dispatch mode.
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    set(__dispatch_flags "")
+    set(__dispatch_definitions "")
+    set(__dispatch_opts "")
+    set(__dispatch_opts_force "")
+    foreach(OPT2 ${CPU_KNOWN_OPTIMIZATIONS})
+      # The empty "if" branch stands in for continue(), which is left commented out.
+      if(NOT CPU_${OPT2}_SUPPORTED)
+        #continue()
+      else()
+      # Optimizations already in the baseline contribute nothing to a dispatch mode.
+      ocv_is_optimization_in_list(__is_from_baseline ${OPT2} ${CPU_BASELINE_FINAL})
+      if(NOT __is_from_baseline)
+        # OPT2 is active in mode OPT when the list check succeeds; it then adds its
+        # compiler flag and a CV_CPU_COMPILE_<OPT2>=1 definition to the mode.
+        ocv_is_optimization_in_list(__is_active ${OPT2} ${OPT})
+        if(__is_active)
+          ocv_append_optimization_flag(__dispatch_flags ${OPT2})
+          list(APPEND __dispatch_definitions "CV_CPU_COMPILE_${OPT2}=1")
+          list(APPEND __dispatch_opts "${OPT2}")
+        endif()
+        # Separately collect the optimizations reported by the force-list check
+        # (helper defined outside this section).
+        ocv_is_optimization_in_force_list(__is_force ${OPT2} ${OPT})
+        if(__is_force)
+          list(APPEND __dispatch_opts_force "${OPT2}")
+        endif()
+      endif()
+      endif()
+    endforeach()
+    # Same cache-invalidation scheme as for the baseline flags, per dispatch mode.
+    set(__flags "${OPENCV_EXTRA_CXX_FLAGS} ${__dispatch_flags}")
+    if(NOT __flags STREQUAL CACHED_CPU_DISPATCH_${OPT}_FLAGS)
+      set(CACHED_CPU_DISPATCH_${OPT}_FLAGS "${__flags}" CACHE INTERNAL "" FORCE)
+      ocv_clear_vars(HAVE_CPU_DISPATCH_FLAGS_${OPT})
+    endif()
+    ocv_check_compiler_flag(CXX "${__flags}" HAVE_CPU_DISPATCH_FLAGS_${OPT})
+    if(NOT HAVE_CPU_DISPATCH_FLAGS_${OPT})
+      message(FATAL_ERROR "Compiler doesn't support optimization flags for ${OPT} dispatch mode: ${__dispatch_flags}")
+    endif()
+    # Publish the results for this mode.
+    set(CPU_DISPATCH_FLAGS_${OPT} "${__dispatch_flags}")
+    set(CPU_DISPATCH_DEFINITIONS_${OPT} "${__dispatch_definitions}")
+    set(CPU_DISPATCH_${OPT}_INCLUDED "${__dispatch_opts}")
+    set(CPU_DISPATCH_${OPT}_FORCED "${__dispatch_opts_force}")
+  endforeach()
+
+  # Architecture-specific options; add_extra_compiler_option probes each flag
+  # before adding it.
+  if(ENABLE_POWERPC)
+    add_extra_compiler_option("-mcpu=G3 -mtune=G5")
+  endif()
+  if(ARM)
+    add_extra_compiler_option("-mfp16-format=ieee")
+  endif(ARM)
+  if(ENABLE_NEON)
+    add_extra_compiler_option("-mfpu=neon")
+  endif()
+  # VFPv3 is requested only when NEON is not enabled.
+  if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
+    add_extra_compiler_option("-mfpu=vfpv3")
+  endif()
+endmacro()
+
+# Final floating-point tuning, applied after the optimization flags are known.
+macro(ocv_compiler_optimization_options_finalize)
+  # 32-bit x86 GCC builds (except on Apple): select the FPU math mode depending on
+  # whether SSE2 or AVX code generation was enabled in the extra C++ flags.
+  if(CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64) AND NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)
+    if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
+      add_extra_compiler_option(-mfpmath=387)
+    else()
+      add_extra_compiler_option(-mfpmath=sse) # !! important - be on the same wave with x64 compilers
+    endif()
+  endif()
+
+  if(MSVC)
+    # Generate Intrinsic Functions
+    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
+
+    # 32-bit x86 builds with SSE in the baseline additionally get /fp:fast.
+    if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND (X86 OR X86_64))
+      if(";${CPU_BASELINE_FINAL};" MATCHES ";SSE;")
+        set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /fp:fast") # !! important - be on the same wave with x64 compilers
+      endif()
+    endif()
+  endif()
+endmacro()
+
+# Filters a module's source list and attaches per-mode compile settings to the
+# optimization-specific files.
+#   SOURCES_VAR_NAME - name of the variable holding the source list; it is
+#                      overwritten with the filtered result
+#   LIBS_VAR_NAME    - name of a list variable that receives __result_libs (always
+#                      empty here: the only append to it is commented out)
+#   TARGET_BASE_NAME - prefix of the per-mode OBJECT libraries created for Visual
+#                      Studio generators
+# A file whose lower-cased name matches "[.]opt_.*[.]cpp$" is optimization-specific;
+# every other file is passed through unchanged.
+macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME TARGET_BASE_NAME)
+  set(__result "")
+  set(__result_libs "")
+  # One bucket of source files per dispatch mode.
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    set(__result_${OPT} "")
+  endforeach()
+  foreach(fname ${${SOURCES_VAR_NAME}})
+    string(TOLOWER "${fname}" fname_LOWER)
+    if(fname_LOWER MATCHES "[.]opt_.*[.]cpp$")
+      if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
+        # Optimized sources are dropped entirely when optimizations are off.
+        message(STATUS "Excluding from source files list: ${fname}")
+        #continue()
+      else()
+        set(__opt_found 0)
+        # A file named "..._<opt>.cpp" for a baseline optimization is compiled with
+        # the common flags, like a regular source.
+        foreach(OPT ${CPU_BASELINE_FINAL})
+          string(TOLOWER "${OPT}" OPT_LOWER)
+          if(fname_LOWER MATCHES "_${OPT_LOWER}[.]cpp$")
+#message("${fname} BASELINE-${OPT}")
+            set(__opt_found 1)
+            list(APPEND __result "${fname}")
+            break()
+          endif()
+        endforeach()
+        # Otherwise the file goes to the first dispatch mode whose FORCED list
+        # contains the optimization named by the file suffix.
+        # NOTE(review): __opt_found may still be 1 from the baseline pass, in which
+        # case only the first dispatch mode is examined before the loop breaks.
+        foreach(OPT ${CPU_DISPATCH_FINAL})
+          foreach(OPT2 ${CPU_DISPATCH_${OPT}_FORCED})
+            string(TOLOWER "${OPT2}" OPT2_LOWER)
+            if(fname_LOWER MATCHES "_${OPT2_LOWER}[.]cpp$")
+              list(APPEND __result_${OPT} "${fname}")
+              # Count how many files use this mode; the counter is kept in the cache.
+              math(EXPR CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}+1")
+              set(CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}" CACHE INTERNAL "" FORCE)
+#message("${fname} ${OPT}")
+#message("    ${CPU_DISPATCH_${OPT}_INCLUDED}")
+#message("    ${CPU_DISPATCH_DEFINITIONS_${OPT}}")
+#message("    ${CPU_DISPATCH_FLAGS_${OPT}}")
+              set(__opt_found 1)
+              break()
+            endif()
+          endforeach()
+          # break() only leaves the inner loop, so the flag is re-checked to leave
+          # the outer loop as well (the set() below is redundant).
+          if(__opt_found)
+            set(__opt_found 1)
+            break()
+          endif()
+        endforeach()
+        if(NOT __opt_found)
+          # No enabled baseline or dispatch optimization matches this file.
+          message(STATUS "Excluding from source files list: ${fname}")
+        endif()
+      endif()
+    else()
+      list(APPEND __result "${fname}")
+    endif()
+  endforeach()
+
+  # Attach the mode-specific definitions and flags to the collected files.
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    if(__result_${OPT})
+#message("${OPT}: ${__result_${OPT}}")
+      if(CMAKE_GENERATOR MATCHES "^Visual")
+        # extra flags are added before common flags, so switching between optimizations doesn't work correctly
+        # Also CMAKE_CXX_FLAGS doesn't work (it is directory-based, so add_subdirectory is required)
+        # Hence a separate OBJECT library per mode, whose objects are added to the
+        # main source list through a generator expression.
+        add_library(${TARGET_BASE_NAME}_${OPT} OBJECT ${__result_${OPT}})
+        ocv_append_dependant_targets(${TARGET_BASE_NAME} ${TARGET_BASE_NAME}_${OPT})
+        set_target_properties(${TARGET_BASE_NAME}_${OPT} PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
+        set_target_properties(${TARGET_BASE_NAME}_${OPT} PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}")
+        #list(APPEND __result_libs ${TARGET_BASE_NAME}_${OPT})
+        list(APPEND __result "$<TARGET_OBJECTS:${TARGET_BASE_NAME}_${OPT}>")
+      else()
+        # Other generators: set the properties on each source file directly.
+        foreach(fname ${__result_${OPT}})
+          set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
+          set_source_files_properties("${fname}" PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}")
+        endforeach()
+        list(APPEND __result ${__result_${OPT}})
+      endif()
+    endif()
+  endforeach()
+  # Hand the results back through the caller-provided variable names.
+  set(${SOURCES_VAR_NAME} "${__result}")
+  list(APPEND ${LIBS_VAR_NAME} ${__result_libs})
+endmacro()
+
+# Generates the text fragments that describe the CPU feature configuration.
+# Outputs (set in the caller's scope, since this is a macro):
+#   OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE - CV_CPU_COMPILE_<opt> and
+#       CV_CPU_BASELINE_COMPILE_<opt> defines for every baseline optimization, plus
+#       the CV_CPU_BASELINE_FEATURES list
+#   OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE - CV_CPU_DISPATCH_COMPILE_<mode> defines
+#   OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE  - CV_CPU_HAS_SUPPORT_<opt> and
+#       CV_CPU_CALL_<opt> helper macros for every entry of CPU_ALL_OPTIMIZATIONS
+# Side effect: rewrites modules/core/include/opencv2/core/cv_cpu_helper.h inside the
+# OpenCV source tree whenever its content differs from the generated control text.
+macro(ocv_compiler_optimization_fill_cpu_config)
+  set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "")
+  foreach(OPT ${CPU_BASELINE_FINAL})
+    set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}
+#define CV_CPU_COMPILE_${OPT} 1
+#define CV_CPU_BASELINE_COMPILE_${OPT} 1
+")
+  endforeach()
+
+  # CV_CPU_BASELINE_FEATURES is emitted as a backslash-continued list: "0" followed
+  # by one ", CV_CPU_<opt>" line per baseline optimization.
+  set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}
+#define CV_CPU_BASELINE_FEATURES 0 \\")
+  foreach(OPT ${CPU_BASELINE_FINAL})
+    # Skip optimizations whose FEATURE_ALIAS is defined but empty.
+    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
+      set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}
+    , CV_CPU_${OPT} \\")
+    endif()
+  endforeach()
+  # The trailing newline terminates the continued #define.
+  set(OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE}\n")
+
+  # NOTE(review): CPU_DISPATCH_<mode>_FORCE is not set anywhere in this section
+  # (ocv_compiler_optimization_options sets CPU_DISPATCH_<mode>_FORCED). Confirm
+  # which name is intended; it is left unchanged so the generated defines stay the same.
+  set(__dispatch_modes "")
+  foreach(OPT ${CPU_DISPATCH_FINAL})
+    list(APPEND __dispatch_modes ${CPU_DISPATCH_${OPT}_FORCE} ${OPT})
+  endforeach()
+  list(REMOVE_DUPLICATES __dispatch_modes)
+  set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "")
+  foreach(OPT ${__dispatch_modes})
+    set(OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE}
+#define CV_CPU_DISPATCH_COMPILE_${OPT} 1")
+  endforeach()
+
+  # Per-optimization helper macros. Each one resolves to an unconditional call when
+  # the optimization is compiled in, to a runtime-checked call when it is only
+  # dispatched, and to nothing otherwise.
+  set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "// AUTOGENERATED, DO NOT EDIT\n")
+  foreach(OPT ${CPU_ALL_OPTIMIZATIONS})
+    if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
+      set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT}
+#  define CV_CPU_HAS_SUPPORT_${OPT} 1
+#  define CV_CPU_CALL_${OPT}(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT}
+#  define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT}))
+#  define CV_CPU_CALL_${OPT}(...) if (CV_CPU_HAS_SUPPORT_${OPT}) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_${OPT} 0
+#  define CV_CPU_CALL_${OPT}(...)
+#endif
+")
+    endif()
+  endforeach()
+
+  # Use OpenCV_SOURCE_DIR rather than CMAKE_SOURCE_DIR: the latter points at the
+  # top-level project, which is a different tree when OpenCV is built as a subproject.
+  set(__file "${OpenCV_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h")
+  # Reset first: macro variables live in the caller's scope, so a value left over
+  # from elsewhere must not be compared when the file does not exist.
+  set(__content "")
+  if(EXISTS "${__file}")
+    file(READ "${__file}" __content)
+  endif()
+  # Rewrite only on change, so an up-to-date header keeps its timestamp.
+  if(__content STREQUAL OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE)
+    #message(STATUS "${__file} contains same content")
+  else()
+    file(WRITE "${__file}" "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}")
+    message(WARNING "${__file} is updated")
+  endif()
+endmacro()
+
+# Default for CV_ENABLE_UNROLLED: 1, unless optimizations are disabled or CV_ICC is
+# set. ocv_update() only assigns when the variable is not defined yet.
+if(NOT CV_DISABLE_OPTIMIZATION AND NOT CV_ICC)
+  ocv_update(CV_ENABLE_UNROLLED 1)
+else()
+  ocv_update(CV_ENABLE_UNROLLED 0)
+endif()
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -31,24 +31,21 @@ endif()
 if(MINGW OR (X86 AND UNIX AND NOT APPLE))
  # mingw compiler is known to produce unstable SSE code with -O3 hence we are trying to use -O2 instead
  if(CMAKE_COMPILER_IS_GNUCXX)
-    foreach(flags CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-      string(REPLACE "-O3" "-O2" ${flags} "${${flags}}")
-    endforeach()
-  endif()
-
-  if(CMAKE_COMPILER_IS_GNUCC)
-    foreach(flags CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG)
+    foreach(flags
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG)
      string(REPLACE "-O3" "-O2" ${flags} "${${flags}}")
    endforeach()
  endif()
 endif()

 if(MSVC)
-  string(REGEX REPLACE "^  *| * $" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  string(REGEX REPLACE "^  *| * $" "" CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT}")
+  string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS)
+  string(STRIP "${CMAKE_CXX_FLAGS_INIT}" CMAKE_CXX_FLAGS_INIT)
  if(CMAKE_CXX_FLAGS STREQUAL CMAKE_CXX_FLAGS_INIT)
    # override cmake default exception handling option
-    string(REPLACE "/EHsc" "/EHa" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "/EHsc" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHa")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}"  CACHE STRING "Flags used by the compiler during all build types." FORCE)
  endif()
 endif()
@ -63,9 +60,6 @@ set(OPENCV_EXTRA_EXE_LINKER_FLAGS_RELEASE "")
 set(OPENCV_EXTRA_EXE_LINKER_FLAGS_DEBUG "")

 macro(add_extra_compiler_option option)
-  if(CMAKE_BUILD_TYPE)
-    set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
-  endif()
  ocv_check_flag_support(CXX "${option}" _varname "${OPENCV_EXTRA_CXX_FLAGS} ${ARGN}")
  if(${_varname})
    set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} ${option}")
@ -77,6 +71,12 @@ macro(add_extra_compiler_option option)
  endif()
 endmacro()

+macro(add_extra_compiler_option_force option)  # appends `option` to the extra flags without any compiler probe
+  set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} ${option}")  # C++ flags
+  set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} ${option}")  # C flags
+endmacro()
+
+
 # Gets environment variable and puts its value to the corresponding preprocessor definition
 # Useful for WINRT that has no access to environment variables
 macro(add_env_definitions option)
@ -102,7 +102,11 @@ if(MINGW)
 endif()

 if(CV_ICC AND NOT ENABLE_FAST_MATH)
-  add_extra_compiler_option("-fp-model precise")
+  if(MSVC)
+    add_extra_compiler_option("/fp:precise")
+  else()
+    add_extra_compiler_option("-fp-model precise")
+  endif()
 endif()

 if(CMAKE_COMPILER_IS_GNUCXX)
@ -141,7 +145,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
  endif()

  # We need pthread's
-  if(UNIX AND NOT ANDROID AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX))
+  if(UNIX AND NOT ANDROID AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX)) # TODO
    add_extra_compiler_option(-pthread)
  endif()

@ -170,83 +174,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
  if(ENABLE_FAST_MATH)
    add_extra_compiler_option(-ffast-math)
  endif()
-  if(ENABLE_POWERPC)
-    add_extra_compiler_option("-mcpu=G3 -mtune=G5")
-  endif()
-  if(ENABLE_SSE)
-    add_extra_compiler_option(-msse)
-  endif()
-  if(ENABLE_SSE2)
-    add_extra_compiler_option(-msse2)
-  elseif(X86 OR X86_64)
-    add_extra_compiler_option(-mno-sse2)
-  endif()
-  if(ARM)
-    add_extra_compiler_option("-mfp16-format=ieee")
-  endif(ARM)
-  if(ENABLE_NEON)
-    add_extra_compiler_option("-mfpu=neon")
-  endif()
-  if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
-    add_extra_compiler_option("-mfpu=vfpv3")
-  endif()
-
-  # SSE3 and further should be disabled under MingW because it generates compiler errors
-  if(NOT MINGW)
-    if(ENABLE_AVX)
-      add_extra_compiler_option(-mavx)
-    elseif(X86 OR X86_64)
-      add_extra_compiler_option(-mno-avx)
-    endif()
-    if(ENABLE_AVX2)
-      add_extra_compiler_option(-mavx2)
-
-      if(ENABLE_FMA3)
-        add_extra_compiler_option(-mfma)
-      endif()
-    endif()
-
-    # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
-    if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
-      if(ENABLE_SSE3)
-        add_extra_compiler_option(-msse3)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-sse3)
-      endif()
-
-      if(ENABLE_SSSE3)
-        add_extra_compiler_option(-mssse3)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-ssse3)
-      endif()
-
-      if(ENABLE_SSE41)
-        add_extra_compiler_option(-msse4.1)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-sse4.1)
-      endif()
-
-      if(ENABLE_SSE42)
-        add_extra_compiler_option(-msse4.2)
-      elseif(X86 OR X86_64)
-        add_extra_compiler_option(-mno-sse4.2)
-      endif()
-
-      if(ENABLE_POPCNT)
-        add_extra_compiler_option(-mpopcnt)
-      endif()
-    endif()
-  endif(NOT MINGW)
-
-  if(X86 OR X86_64)
-    if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)
-      if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
-        add_extra_compiler_option(-mfpmath=sse)# !! important - be on the same wave with x64 compilers
-      else()
-        add_extra_compiler_option(-mfpmath=387)
-      endif()
-    endif()
-  endif()

  # Profiling?
  if(ENABLE_PROFILING)
@ -257,7 +184,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
      string(REPLACE "-fomit-frame-pointer" "" ${flags} "${${flags}}")
      string(REPLACE "-ffunction-sections" "" ${flags} "${${flags}}")
    endforeach()
-  elseif(NOT APPLE AND NOT ANDROID)
+  elseif(NOT ((IOS OR ANDROID) AND NOT BUILD_SHARED_LIBS))
    # Remove unreferenced functions: function level linking
    add_extra_compiler_option(-ffunction-sections)
  endif()
@ -265,6 +192,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
  if(ENABLE_COVERAGE)
    set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} --coverage")
    set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} --coverage")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage")
  endif()

  if(ENABLE_INSTRUMENTATION)
@ -296,41 +224,6 @@ if(MSVC)
    set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
  endif()

-  if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
-  endif()
-  if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
-  endif()
-
-  if(ENABLE_SSE4_1 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE4.1")
-  endif()
-
-  if(ENABLE_SSE3 AND CV_ICC AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE3")
-  endif()
-
-  if(NOT MSVC64)
-    # 64-bit MSVC compiler uses SSE/SSE2 by default
-    if(ENABLE_SSE2 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE2")
-    endif()
-    if(ENABLE_SSE AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
-      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:SSE")
-    endif()
-  endif()
-
-  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2)
-    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
-  endif()
-
-  if(X86 OR X86_64)
-    if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND ENABLE_SSE2)
-      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /fp:fast") # !! important - be on the same wave with x64 compilers
-    endif()
-  endif()
-
  if(OPENCV_WARNINGS_ARE_ERRORS)
    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /WX")
  endif()
@ -353,6 +246,16 @@ if(NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX AND NOT ANDROID)
  set(OPENCV_EXTRA_FLAGS "-fPIC ${OPENCV_EXTRA_FLAGS}")
 endif()

+include(cmake/OpenCVCompilerOptimizations.cmake)
+
+if(COMMAND ocv_compiler_optimization_options)
+  ocv_compiler_optimization_options()
+endif()
+
+if(COMMAND ocv_compiler_optimization_options_finalize)
+  ocv_compiler_optimization_options_finalize()
+endif()
+
 # Add user supplied extra options (optimization, etc...)
 # ==========================================================
 set(OPENCV_EXTRA_FLAGS         "${OPENCV_EXTRA_FLAGS}"         CACHE INTERNAL "Extra compiler options")
@ -370,6 +273,7 @@ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_OPENCV_GCC_VERSION_NUM GREATER 399)
  add_extra_compiler_option(-fvisibility-inlines-hidden)
 endif()

+# TODO !!!!!
 if(NOT OPENCV_FP16_DISABLE AND NOT IOS)
  if(ARM AND ENABLE_NEON)
    set(FP16_OPTION "-mfpu=neon-fp16")
@ -378,7 +282,7 @@ if(NOT OPENCV_FP16_DISABLE AND NOT IOS)
  endif()
  try_compile(__VALID_FP16
    "${OpenCV_BINARY_DIR}"
-    "${OpenCV_SOURCE_DIR}/cmake/checks/fp16.cpp"
+    "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp"
    COMPILE_DEFINITIONS "-DCHECK_FP16" "${FP16_OPTION}"
    OUTPUT_VARIABLE TRY_OUT
    )
--- a/cmake/OpenCVFindMKL.cmake
+++ b/cmake/OpenCVFindMKL.cmake
@ -48,7 +48,7 @@ endif()
 #check current MKL_ROOT_DIR
 if(NOT MKL_ROOT_DIR OR NOT EXISTS ${MKL_ROOT_DIR}/include/mkl.h)
    set(mkl_root_paths ${MKL_ROOT_DIR})
-    if(DEFINED $ENV{MKLROOT})
+    if(DEFINED ENV{MKLROOT})
        list(APPEND mkl_root_paths $ENV{MKLROOT})
    endif()
    if(WIN32)
--- a/cmake/OpenCVGenHeaders.cmake
+++ b/cmake/OpenCVGenHeaders.cmake
@ -3,6 +3,10 @@ configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/cvconfig.h.in" "${OPENCV_CO
 configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/cvconfig.h.in" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/cvconfig.h")
 install(FILES "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2 COMPONENT dev)

+# platform-specific config file
+ocv_compiler_optimization_fill_cpu_config()
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/cv_cpu_config.h.in" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cv_cpu_config.h")
+
 # ----------------------------------------------------------------------------
 #  opencv_modules.hpp based on actual modules list
 # ----------------------------------------------------------------------------
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -65,6 +65,7 @@ foreach(mod ${OPENCV_MODULES_BUILD} ${OPENCV_MODULES_DISABLED_USER} ${OPENCV_MOD
  unset(OPENCV_MODULE_${mod}_PRIVATE_OPT_DEPS CACHE)
  unset(OPENCV_MODULE_${mod}_LINK_DEPS CACHE)
  unset(OPENCV_MODULE_${mod}_WRAPPERS CACHE)
+  unset(OPENCV_DEPENDANT_TARGETS_${mod} CACHE)
 endforeach()

 # clean modules info which needs to be recalculated
@ -648,6 +649,8 @@ macro(ocv_set_module_sources)
  # use full paths for module to be independent from the module location
  ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS)

+  ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
+
  set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}")
  set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
 endmacro()
--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@ -65,6 +65,9 @@ MACRO(_PCH_GET_COMPILE_FLAGS _out_compile_flags)
        ocv_is_opencv_directory(__result ${item})
        if(__result)
          LIST(APPEND ${_out_compile_flags} "${_PCH_include_prefix}\"${item}\"")
+        elseif(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
+               item MATCHES "/usr/include$")
+          # workaround for GCC 6.x bug
        else()
          LIST(APPEND ${_out_compile_flags} "${_PCH_isystem_prefix}\"${item}\"")
        endif()
@ -75,6 +78,9 @@ MACRO(_PCH_GET_COMPILE_FLAGS _out_compile_flags)
        ocv_is_opencv_directory(__result ${item})
        if(__result)
          LIST(APPEND ${_out_compile_flags} "${_PCH_include_prefix}\"${item}\"")
+        elseif(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
+               item MATCHES "/usr/include$")
+          # workaround for GCC 6.x bug
        else()
          LIST(APPEND ${_out_compile_flags} "${_PCH_isystem_prefix}\"${item}\"")
        endif()
@ -328,7 +334,10 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input)

        get_target_property(_sources ${_targetName} SOURCES)
        foreach(src ${_sources})
-          if(NOT "${src}" MATCHES "\\.mm$")
+          if(NOT "${src}" MATCHES "\\.mm$"
+               AND NOT "${src}" MATCHES "\\.h$" AND NOT "${src}" MATCHES "\\.hpp$" # header files
+               AND NOT "${src}" MATCHES "^\$" # CMake generator expressions
+          )
            get_source_file_property(oldProps "${src}" COMPILE_FLAGS)
            if(NOT oldProps)
              set(newProperties "/Yu\"${_input}\" /FI\"${_input}\"")
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@ -37,7 +37,11 @@ endmacro()

 macro(ocv_update VAR)
  if(NOT DEFINED ${VAR})
-    set(${VAR} ${ARGN})
+    if("x${ARGN}" STREQUAL "x")
+      set(${VAR} "")
+    else()
+      set(${VAR} ${ARGN})
+    endif()
  else()
    #ocv_debug_message("Preserve old value for ${VAR}: ${${VAR}}")
  endif()
@ -151,8 +155,15 @@ function(ocv_append_target_property target prop)
  endif()
 endfunction()

+function(ocv_append_dependant_targets target)  # registers the extra arguments as dependant targets of `target`
+  #ocv_debug_message("ocv_append_dependant_targets(${target} ${ARGN})")
+  _ocv_fix_target(target)  # helper defined elsewhere; it may rewrite `target` before it is used in the variable name
+  set(OPENCV_DEPENDANT_TARGETS_${target} "${OPENCV_DEPENDANT_TARGETS_${target}};${ARGN}" CACHE INTERNAL "" FORCE)  # kept in the cache; a leading empty element appears on first use
+endfunction()
+
 # adds include directories in such way that directories from the OpenCV source tree go first
 function(ocv_target_include_directories target)
+  #ocv_debug_message("ocv_target_include_directories(${target} ${ARGN})")
  _ocv_fix_target(target)
  set(__params "")
  if(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0" AND
@ -173,6 +184,11 @@ function(ocv_target_include_directories target)
  else()
    if(TARGET ${target})
      target_include_directories(${target} PRIVATE ${__params})
+      if(OPENCV_DEPENDANT_TARGETS_${target})
+        foreach(t ${OPENCV_DEPENDANT_TARGETS_${target}})
+          target_include_directories(${t} PRIVATE ${__params})
+        endforeach()
+      endif()
    else()
      set(__new_inc "${OCV_TARGET_INCLUDE_DIRS_${target}};${__params}")
      set(OCV_TARGET_INCLUDE_DIRS_${target} "${__new_inc}" CACHE INTERNAL "")
@ -205,8 +221,11 @@ set(OCV_COMPILER_FAIL_REGEX
  )

 MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
+  set(_fname "${ARGN}")
  if(NOT DEFINED ${RESULT})
-    if("_${LANG}_" MATCHES "_CXX_")
+    if(_fname)
+      # nothing
+    elseif("_${LANG}_" MATCHES "_CXX_")
      set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx")
      if("${CMAKE_CXX_FLAGS} ${FLAG} " MATCHES "-Werror " OR "${CMAKE_CXX_FLAGS} ${FLAG} " MATCHES "-Werror=unknown-pragmas ")
        FILE(WRITE "${_fname}" "int main() { return 0; }\n")
@ -231,10 +250,17 @@ MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
      unset(_fname)
    endif()
    if(_fname)
-      MESSAGE(STATUS "Performing Test ${RESULT}")
+      if(NOT "x${ARGN}" STREQUAL "x")
+        file(RELATIVE_PATH __msg "${CMAKE_SOURCE_DIR}" "${ARGN}")
+        set(__msg " (check file: ${__msg})")
+      else()
+        set(__msg "")
+      endif()
+      MESSAGE(STATUS "Performing Test ${RESULT}${__msg}")
      TRY_COMPILE(${RESULT}
        "${CMAKE_BINARY_DIR}"
        "${_fname}"
+        CMAKE_FLAGS "-DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}"   # CMP0056 do this on new CMake
        COMPILE_DEFINITIONS "${FLAG}"
        OUTPUT_VARIABLE OUTPUT)

@ -278,7 +304,11 @@ MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
  endif()
 ENDMACRO()

-macro(ocv_check_flag_support lang flag varname)
+macro(ocv_check_flag_support lang flag varname base_options)
+  if(CMAKE_BUILD_TYPE)
+    set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
+  endif()
+
  if("_${lang}_" MATCHES "_CXX_")
    set(_lang CXX)
  elseif("_${lang}_" MATCHES "_C_")
@ -293,7 +323,7 @@ macro(ocv_check_flag_support lang flag varname)
  string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}")
  string(REGEX REPLACE " -|-|=| |\\." "_" ${varname} "${${varname}}")

-  ocv_check_compiler_flag("${_lang}" "${ARGN} ${flag}" ${${varname}})
+  ocv_check_compiler_flag("${_lang}" "${base_options} ${flag}" ${${varname}} ${ARGN})
 endmacro()

 # turns off warnings
@ -327,7 +357,7 @@ macro(ocv_warnings_disable)
            string(REPLACE "${warning}" "" ${var} "${${var}}")
            string(REPLACE "-W" "-Wno-" warning "${warning}")
          endif()
-          ocv_check_flag_support(${var} "${warning}" _varname)
+          ocv_check_flag_support(${var} "${warning}" _varname "")
          if(${_varname})
            set(${var} "${${var}} ${warning}")
          endif()
@ -342,7 +372,7 @@ macro(ocv_warnings_disable)
          else()
            string(REPLACE "-wd" "-Qwd" warning "${warning}")
          endif()
-          ocv_check_flag_support(${var} "${warning}" _varname)
+          ocv_check_flag_support(${var} "${warning}" _varname "")
          if(${_varname})
            set(${var} "${${var}} ${warning}")
          endif()
@ -357,7 +387,7 @@ macro(ocv_warnings_disable)
 endmacro()

 macro(add_apple_compiler_options the_module)
-  ocv_check_flag_support(OBJCXX "-fobjc-exceptions" HAVE_OBJC_EXCEPTIONS)
+  ocv_check_flag_support(OBJCXX "-fobjc-exceptions" HAVE_OBJC_EXCEPTIONS "")
  if(HAVE_OBJC_EXCEPTIONS)
    foreach(source ${OPENCV_MODULE_${the_module}_SOURCES})
      if("${source}" MATCHES "\\.mm$")
@ -903,6 +933,11 @@ function(_ocv_append_target_includes target)
    if (TARGET ${target}_object)
      target_include_directories(${target}_object PRIVATE ${OCV_TARGET_INCLUDE_DIRS_${target}})
    endif()
+    if(OPENCV_DEPENDANT_TARGETS_${target})
+      foreach(t ${OPENCV_DEPENDANT_TARGETS_${target}})
+        target_include_directories(${t} PRIVATE ${OCV_TARGET_INCLUDE_DIRS_${target}})
+      endforeach()
+    endif()
    unset(OCV_TARGET_INCLUDE_DIRS_${target} CACHE)
  endif()
 endfunction()
--- a/cmake/checks/cpu_avx.cpp
+++ b/cmake/checks/cpu_avx.cpp
@ -0,0 +1,9 @@
+#if !defined __AVX__ // MSVC supports this flag since MSVS 2013
+#error "__AVX__ define is missing"
+#endif
+#include <immintrin.h>
+void test() // never called: the probe only needs the AVX intrinsic below to compile
+{
+    __m256 a = _mm256_set1_ps(0.0f);
+}
+int main() { return 0; }
--- a/cmake/checks/cpu_avx2.cpp
+++ b/cmake/checks/cpu_avx2.cpp
@ -0,0 +1,10 @@
+#if !defined __AVX2__ // MSVC supports this flag since MSVS 2013
+#error "__AVX2__ define is missing"
+#endif
+#include <immintrin.h>
+void test() // never called: the probe only needs the 256-bit integer load below to compile
+{
+    int data[8] = {0,0,0,0, 0,0,0,0};
+    __m256i a = _mm256_loadu_si256((const __m256i *)data);
+}
+int main() { return 0; }
--- a/cmake/checks/cpu_avx512.cpp
+++ b/cmake/checks/cpu_avx512.cpp
@ -0,0 +1,10 @@
+#if defined __AVX512__ || defined __AVX512F__ // either macro enables the probe body
+#include <immintrin.h>
+void test() // never called: the probe only needs the 512-bit intrinsic below to compile
+{
+    __m512i zmm = _mm512_setzero_si512();
+}
+#else
+#error "AVX512 is not supported"
+#endif
+int main() { return 0; }
--- a/cmake/checks/cpu_fp16.cpp
+++ b/cmake/checks/cpu_fp16.cpp
@ -1,6 +1,6 @@
 #include <stdio.h>

-#if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700)
+#if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700) || (defined __INTEL_COMPILER && defined __AVX__)
 #include <immintrin.h>
 int test()
 {
--- a/cmake/checks/cpu_popcnt.cpp
+++ b/cmake/checks/cpu_popcnt.cpp
@ -0,0 +1,15 @
+// Compiler probe for POPCNT support.
+// _mm_popcnt_u64 is only available on 64-bit targets, so the 32-bit intrinsic is
+// always exercised and the 64-bit one is added only where it exists.
+#include <nmmintrin.h>
+#ifndef _MSC_VER
+#include <popcntintrin.h>
+#endif
+int main() {
+    int i = _mm_popcnt_u32(1);
+#if defined __x86_64__ || defined _M_X64
+    i += static_cast<int>(_mm_popcnt_u64(1));
+#endif
+    (void)i; // the value is irrelevant, only successful compilation matters
+    return 0;
+}
--- a/cmake/checks/cpu_sse.cpp
+++ b/cmake/checks/cpu_sse.cpp
@ -0,0 +1,2 @@
+#include <xmmintrin.h> // the probe passes when this intrinsics header can be included
+int main() { return 0; }
--- a/cmake/checks/cpu_sse2.cpp
+++ b/cmake/checks/cpu_sse2.cpp
@ -0,0 +1,2 @@
+#include <emmintrin.h> // the probe passes when this intrinsics header can be included
+int main() { return 0; }
--- a/cmake/checks/cpu_sse3.cpp
+++ b/cmake/checks/cpu_sse3.cpp
@ -0,0 +1,7 @@
+#include <pmmintrin.h>
+int main() { // compile-only probe: the computed values are never used
+    __m128 u, v;
+    u = _mm_set1_ps(0.0f);
+    v = _mm_moveldup_ps(u); // SSE3
+    return 0;
+}
--- a/cmake/checks/cpu_sse41.cpp
+++ b/cmake/checks/cpu_sse41.cpp
@ -0,0 +1,6 @@
+#include <smmintrin.h>
+int main() { // compile-only probe: the computed values are never used
+    __m128i a = _mm_setzero_si128(), b = _mm_setzero_si128();
+    __m128i c = _mm_packus_epi32(a, b); // intrinsic under test
+    return 0;
+}
--- a/cmake/checks/cpu_sse42.cpp
+++ b/cmake/checks/cpu_sse42.cpp
@ -0,0 +1,9 @
+// Compiler probe for SSE4.2 support.
+// Uses a CRC32 intrinsic: it belongs to SSE4.2 itself and is available on 32-bit
+// targets, unlike _mm_popcnt_u64 (64-bit only; POPCNT has its own probe file).
+#include <nmmintrin.h>
+int main() {
+    unsigned int res = _mm_crc32_u8(1, 2);
+    (void)res; // the value is irrelevant, only successful compilation matters
+    return 0;
+}
--- a/cmake/checks/cpu_ssse3.cpp
+++ b/cmake/checks/cpu_ssse3.cpp
@ -0,0 +1,7 @@
+#include <tmmintrin.h>
+const double v = 0; // NOTE(review): not referenced anywhere in this probe
+int main() { // compile-only probe: the computed values are never used
+    __m128i a = _mm_setzero_si128();
+    __m128i b = _mm_abs_epi32(a); // intrinsic under test
+    return 0;
+}
--- a/cmake/checks/openvx_refenum_test.cpp
+++ b/cmake/checks/openvx_refenum_test.cpp
@ -0,0 +1,5 @@
+#include <VX/vx.h>
+int main() // compiles only when the three VX_REFERENCE_* names used below are declared
+{
+    return VX_REFERENCE_COUNT == VX_REFERENCE_TYPE ? VX_REFERENCE_NAME : 0;
+}
--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@ -9,7 +9,7 @@ if (NOT cl_list)
  message(FATAL_ERROR "Can't find OpenCL kernels in directory: ${CL_DIR}")
 endif()

-string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
+string(REGEX REPLACE "\\.cpp$" ".hpp" OUTPUT_HPP "${OUTPUT}")
 get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)

 if("${MODULE_NAME}" STREQUAL "ocl")
--- a/cmake/templates/cv_cpu_config.h.in
+++ b/cmake/templates/cv_cpu_config.h.in
@ -0,0 +1,5 @@
+// OpenCV CPU baseline features
+@OPENCV_CPU_BASELINE_DEFINITIONS_CONFIGMAKE@
+
+// OpenCV supported CPU dispatched features
+@OPENCV_CPU_DISPATCH_DEFINITIONS_CONFIGMAKE@
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@ -1,6 +1,15 @@
+#ifndef OPENCV_CVCONFIG_H_INCLUDED
+#define OPENCV_CVCONFIG_H_INCLUDED
+
 /* OpenCV compiled as static or dynamic libs */
 #cmakedefine BUILD_SHARED_LIBS

+/* OpenCV intrinsics optimized code */
+#cmakedefine CV_ENABLE_INTRINSICS
+
+/* OpenCV additional optimized code */
+#cmakedefine CV_DISABLE_OPTIMIZATION
+
 /* Compile for 'real' NVIDIA GPU architectures */
 #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"

@ -206,3 +215,7 @@

 /* OpenVX */
 #cmakedefine HAVE_OPENVX
+
+
+
+#endif // OPENCV_CVCONFIG_H_INCLUDED
--- a/doc/py_tutorials/py_calib3d/py_epipolar_geometry/py_epipolar_geometry.markdown
+++ b/doc/py_tutorials/py_calib3d/py_epipolar_geometry/py_epipolar_geometry.markdown
@ -86,7 +86,7 @@ kp1, des1 = sift.detectAndCompute(img1,None)
 kp2, des2 = sift.detectAndCompute(img2,None)

 # FLANN parameters
-FLANN_INDEX_KDTREE = 0
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 search_params = dict(checks=50)

--- a/doc/py_tutorials/py_core/py_basic_ops/py_basic_ops.markdown
+++ b/doc/py_tutorials/py_core/py_basic_ops/py_basic_ops.markdown
@ -8,7 +8,7 @@ Learn to:

 -   Access pixel values and modify them
 -   Access image properties
-   Setting Region of Image (ROI)
+-   Setting Region of Interest (ROI)
 -   Splitting and Merging images

 Almost all the operations in this section is mainly related to Numpy rather than OpenCV. A good
--- a/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
+++ b/doc/py_tutorials/py_feature2d/py_feature_homography/py_feature_homography.markdown
@ -50,7 +50,7 @@ sift = cv2.xfeatures2d.SIFT_create()
 kp1, des1 = sift.detectAndCompute(img1,None)
 kp2, des2 = sift.detectAndCompute(img2,None)

-FLANN_INDEX_KDTREE = 0
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 search_params = dict(checks = 50)

--- a/doc/py_tutorials/py_feature2d/py_features_meaning/py_features_meaning.markdown
+++ b/doc/py_tutorials/py_feature2d/py_features_meaning/py_features_meaning.markdown
@ -10,7 +10,7 @@ corners are important etc.
 Explanation
 -----------

-Most of you will have played the jigsaw puzzle games. You get a lot of small pieces of a images,
+Most of you will have played the jigsaw puzzle games. You get a lot of small pieces of an image,
 where you need to assemble them correctly to form a big real image. **The question is, how you do
 it?** What about the projecting the same theory to a computer program so that computer can play
 jigsaw puzzles? If the computer can play jigsaw puzzles, why can't we give a lot of real-life images
--- a/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
+++ b/doc/py_tutorials/py_feature2d/py_matcher/py_matcher.markdown
@ -148,11 +148,13 @@ its related parameters etc. First one is IndexParams. For various algorithms, th
 passed is explained in FLANN docs. As a summary, for algorithms like SIFT, SURF etc. you can pass
 following:
@code{.py}
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
@endcode
 While using ORB, you can pass the following. The commented values are recommended as per the docs,
 but it didn't provide required results in some cases. Other values worked fine.:
@code{.py}
+FLANN_INDEX_LSH = 6
 index_params= dict(algorithm = FLANN_INDEX_LSH,
                   table_number = 6, # 12
                   key_size = 12,     # 20
@ -179,7 +181,7 @@ kp1, des1 = sift.detectAndCompute(img1,None)
 kp2, des2 = sift.detectAndCompute(img2,None)

 # FLANN parameters
-FLANN_INDEX_KDTREE = 0
+FLANN_INDEX_KDTREE = 1
 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 5)
 search_params = dict(checks=50)   # or pass empty dictionary

--- a/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown
+++ b/doc/tutorials/objdetect/cascade_classifier/cascade_classifier.markdown
@ -19,8 +19,6 @@ Code

 This tutorial code's is shown lines below. You can also download it from
 [here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/objectDetection/objectDetection.cpp)
-. The second version (using LBP for face detection) can be [found
-here](https://github.com/opencv/opencv/tree/master/samples/cpp/tutorial_code/objectDetection/objectDetection2.cpp)
@include samples/cpp/tutorial_code/objectDetection/objectDetection.cpp

 Explanation
@ -34,8 +32,8 @@ Result

    ![](images/Cascade_Classifier_Tutorial_Result_Haar.jpg)

-    Remember to copy the files *haarcascade_frontalface_alt.xml* and
-    *haarcascade_eye_tree_eyeglasses.xml* in your current directory. They are located in
+    Make sure the program can find the files *haarcascade_frontalface_alt.xml* and
+    *haarcascade_eye_tree_eyeglasses.xml*. They are located in
    *opencv/data/haarcascades*

 -#  This is the result of using the file *lbpcascade_frontalface.xml* (LBP trained) for the face
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@ -714,6 +714,30 @@ found, or as colored corners connected with lines if the board was found.
 CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSize,
                                         InputArray corners, bool patternWasFound );

+struct CV_EXPORTS_W_SIMPLE CirclesGridFinderParameters
+{
+    CV_WRAP CirclesGridFinderParameters();
+    CV_PROP_RW cv::Size2f densityNeighborhoodSize;
+    CV_PROP_RW float minDensity;
+    CV_PROP_RW int kmeansAttempts;
+    CV_PROP_RW int minDistanceToAddKeypoint;
+    CV_PROP_RW int keypointScale;
+    CV_PROP_RW float minGraphConfidence;
+    CV_PROP_RW float vertexGain;
+    CV_PROP_RW float vertexPenalty;
+    CV_PROP_RW float existingVertexGain;
+    CV_PROP_RW float edgeGain;
+    CV_PROP_RW float edgePenalty;
+    CV_PROP_RW float convexHullFactor;
+    CV_PROP_RW float minRNGEdgeSwitchDist;
+
+    enum GridType
+    {
+      SYMMETRIC_GRID, ASYMMETRIC_GRID
+    };
+    GridType gridType;
+};
+
 /** @brief Finds centers in the grid of circles.

@param image grid view of input circles; it must be an 8-bit grayscale or color image.
@ -726,6 +750,7 @@ CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSiz
 -   **CALIB_CB_CLUSTERING** uses a special algorithm for grid detection. It is more robust to
 perspective distortions but much more sensitive to background clutter.
@param blobDetector feature detector that finds blobs like dark circles on light background.
+@param parameters struct for finding circles in a grid pattern.

 The function attempts to determine whether the input image contains a grid of circles. If it is, the
 function locates centers of the circles. The function returns a non-zero value if all of the centers
@ -745,6 +770,12 @@ Sample usage of detecting and drawing the centers of circles: :
@note The function requires white space (like a square-thick border, the wider the better) around
 the board to make the detection more robust in various environments.
 */
+CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
+                                   OutputArray centers, int flags,
+                                   const Ptr<FeatureDetector> &blobDetector,
+                                   CirclesGridFinderParameters parameters);
+
+/** @overload */
 CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
                                   OutputArray centers, int flags = CALIB_CB_SYMMETRIC_GRID,
                                   const Ptr<FeatureDetector> &blobDetector = SimpleBlobDetector::create());
@ -1433,6 +1464,28 @@ CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray point
                            double focal = 1.0, Point2d pp = Point2d(0, 0),
                            InputOutputArray mask = noArray() );

+/** @overload
+@param E The input essential matrix.
+@param points1 Array of N 2D points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1.
+@param cameraMatrix Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+Note that this function assumes that points1 and points2 are feature points from cameras with the
+same camera matrix.
+@param R Recovered relative rotation.
+@param t Recovered relative translation.
+@param distanceThresh threshold distance which is used to filter out far away points (i.e. infinite points).
+@param mask Input/output mask for inliers in points1 and points2.
+:   If it is not empty, then it marks inliers in points1 and points2 for the given essential
+matrix E. Only these inliers will be used to recover pose. The output mask contains only the
+inliers which pass the cheirality check.
+@param triangulatedPoints 3d points which were reconstructed by triangulation.
+ */
+
+CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
+                            InputArray cameraMatrix, OutputArray R, OutputArray t, double distanceThresh, InputOutputArray mask = noArray(),
+                            OutputArray triangulatedPoints = noArray());
+
 /** @brief For points in an image of a stereo pair, computes the corresponding epilines in the other image.

@param points Input points. \f$N \times 1\f$ or \f$1 \times N\f$ matrix of type CV_32FC2 or
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@ -442,7 +442,7 @@ int cvFindChessboardCorners( const void* arr, CvSize pattern_size,

    Mat img = cvarrToMat((CvMat*)arr).clone();

-    if( img.depth() != CV_8U || (img.channels() != 1 && img.channels() != 3) )
+    if( img.depth() != CV_8U || (img.channels() != 1 && img.channels() != 3 && img.channels() != 4) )
       CV_Error( CV_StsUnsupportedFormat, "Only 8-bit grayscale or color images are supported" );

    if( pattern_size.width <= 2 || pattern_size.height <= 2 )
@ -2093,7 +2093,8 @@ void cv::drawChessboardCorners( InputOutputArray _image, Size patternSize,
 }

 bool cv::findCirclesGrid( InputArray _image, Size patternSize,
-                          OutputArray _centers, int flags, const Ptr<FeatureDetector> &blobDetector )
+                          OutputArray _centers, int flags, const Ptr<FeatureDetector> &blobDetector,
+                          CirclesGridFinderParameters parameters)
 {
    CV_INSTRUMENT_REGION()

@ -2120,13 +2121,6 @@ bool cv::findCirclesGrid( InputArray _image, Size patternSize,
      return !centers.empty();
    }

-    CirclesGridFinderParameters parameters;
-    parameters.vertexPenalty = -0.6f;
-    parameters.vertexGain = 1;
-    parameters.existingVertexGain = 10000;
-    parameters.edgeGain = 1;
-    parameters.edgePenalty = -0.6f;
-
    if(flags & CALIB_CB_ASYMMETRIC_GRID)
      parameters.gridType = CirclesGridFinderParameters::ASYMMETRIC_GRID;
    if(flags & CALIB_CB_SYMMETRIC_GRID)
@ -2192,4 +2186,10 @@ bool cv::findCirclesGrid( InputArray _image, Size patternSize,
    return false;
 }

+bool cv::findCirclesGrid( InputArray _image, Size patternSize,
+                          OutputArray _centers, int flags, const Ptr<FeatureDetector> &blobDetector)
+{
+    return cv::findCirclesGrid(_image, patternSize, _centers, flags, blobDetector, CirclesGridFinderParameters());
+}
+
 /* End of file. */
--- a/modules/calib3d/src/circlesgrid.cpp
+++ b/modules/calib3d/src/circlesgrid.cpp
@ -551,11 +551,11 @@ CirclesGridFinderParameters::CirclesGridFinderParameters()
  keypointScale = 1;

  minGraphConfidence = 9;
-  vertexGain = 2;
-  vertexPenalty = -5;
+  vertexGain = 1;
+  vertexPenalty = -0.6f;
  edgeGain = 1;
-  edgePenalty = -5;
-  existingVertexGain = 0;
+  edgePenalty = -0.6f;
+  existingVertexGain = 10000;

  minRNGEdgeSwitchDist = 5.f;
  gridType = SYMMETRIC_GRID;
--- a/modules/calib3d/src/circlesgrid.hpp
+++ b/modules/calib3d/src/circlesgrid.hpp
@ -119,35 +119,11 @@ struct Path
  }
 };

-struct CirclesGridFinderParameters
-{
-  CirclesGridFinderParameters();
-  cv::Size2f densityNeighborhoodSize;
-  float minDensity;
-  int kmeansAttempts;
-  int minDistanceToAddKeypoint;
-  int keypointScale;
-  float minGraphConfidence;
-  float vertexGain;
-  float vertexPenalty;
-  float existingVertexGain;
-  float edgeGain;
-  float edgePenalty;
-  float convexHullFactor;
-  float minRNGEdgeSwitchDist;
-
-  enum GridType
-  {
-    SYMMETRIC_GRID, ASYMMETRIC_GRID
-  };
-  GridType gridType;
-};
-
 class CirclesGridFinder
 {
 public:
  CirclesGridFinder(cv::Size patternSize, const std::vector<cv::Point2f> &testKeypoints,
-                    const CirclesGridFinderParameters &parameters = CirclesGridFinderParameters());
+                    const cv::CirclesGridFinderParameters &parameters = cv::CirclesGridFinderParameters());
  bool findHoles();
  static cv::Mat rectifyGrid(cv::Size detectedGridSize, const std::vector<cv::Point2f>& centers, const std::vector<
      cv::Point2f> &keypoint, std::vector<cv::Point2f> &warpedKeypoints);
@ -211,7 +187,7 @@ private:
  std::vector<std::vector<size_t> > *smallHoles;

  const cv::Size_<size_t> patternSize;
-  CirclesGridFinderParameters parameters;
+  cv::CirclesGridFinderParameters parameters;

  CirclesGridFinder& operator=(const CirclesGridFinder&);
  CirclesGridFinder(const CirclesGridFinder&);
--- a/modules/calib3d/src/five-point.cpp
+++ b/modules/calib3d/src/five-point.cpp
@ -458,8 +458,9 @@ cv::Mat cv::findEssentialMat( InputArray _points1, InputArray _points2, double f
    return cv::findEssentialMat(_points1, _points2, cameraMatrix, method, prob, threshold, _mask);
 }

-int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, InputArray _cameraMatrix,
-                     OutputArray _R, OutputArray _t, InputOutputArray _mask)
+int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2,
+                            InputArray _cameraMatrix, OutputArray _R, OutputArray _t, double distanceThresh,
+                     InputOutputArray _mask, OutputArray triangulatedPoints)
 {
    CV_INSTRUMENT_REGION()

@ -506,51 +507,60 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
    // Notice here a threshold dist is used to filter
    // out far away points (i.e. infinite points) since
    // there depth may vary between postive and negtive.
-    double dist = 50.0;
+    std::vector<Mat> allTriangulations(4);
    Mat Q;
+
    triangulatePoints(P0, P1, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[0]);
    Mat mask1 = Q.row(2).mul(Q.row(3)) > 0;
    Q.row(0) /= Q.row(3);
    Q.row(1) /= Q.row(3);
    Q.row(2) /= Q.row(3);
    Q.row(3) /= Q.row(3);
-    mask1 = (Q.row(2) < dist) & mask1;
+    mask1 = (Q.row(2) < distanceThresh) & mask1;
    Q = P1 * Q;
    mask1 = (Q.row(2) > 0) & mask1;
-    mask1 = (Q.row(2) < dist) & mask1;
+    mask1 = (Q.row(2) < distanceThresh) & mask1;

    triangulatePoints(P0, P2, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[1]);
    Mat mask2 = Q.row(2).mul(Q.row(3)) > 0;
    Q.row(0) /= Q.row(3);
    Q.row(1) /= Q.row(3);
    Q.row(2) /= Q.row(3);
    Q.row(3) /= Q.row(3);
-    mask2 = (Q.row(2) < dist) & mask2;
+    mask2 = (Q.row(2) < distanceThresh) & mask2;
    Q = P2 * Q;
    mask2 = (Q.row(2) > 0) & mask2;
-    mask2 = (Q.row(2) < dist) & mask2;
+    mask2 = (Q.row(2) < distanceThresh) & mask2;

    triangulatePoints(P0, P3, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[2]);
    Mat mask3 = Q.row(2).mul(Q.row(3)) > 0;
    Q.row(0) /= Q.row(3);
    Q.row(1) /= Q.row(3);
    Q.row(2) /= Q.row(3);
    Q.row(3) /= Q.row(3);
-    mask3 = (Q.row(2) < dist) & mask3;
+    mask3 = (Q.row(2) < distanceThresh) & mask3;
    Q = P3 * Q;
    mask3 = (Q.row(2) > 0) & mask3;
-    mask3 = (Q.row(2) < dist) & mask3;
+    mask3 = (Q.row(2) < distanceThresh) & mask3;

    triangulatePoints(P0, P4, points1, points2, Q);
+    if(triangulatedPoints.needed())
+        Q.copyTo(allTriangulations[3]);
    Mat mask4 = Q.row(2).mul(Q.row(3)) > 0;
    Q.row(0) /= Q.row(3);
    Q.row(1) /= Q.row(3);
    Q.row(2) /= Q.row(3);
    Q.row(3) /= Q.row(3);
-    mask4 = (Q.row(2) < dist) & mask4;
+    mask4 = (Q.row(2) < distanceThresh) & mask4;
    Q = P4 * Q;
    mask4 = (Q.row(2) > 0) & mask4;
-    mask4 = (Q.row(2) < dist) & mask4;
+    mask4 = (Q.row(2) < distanceThresh) & mask4;

    mask1 = mask1.t();
    mask2 = mask2.t();
@ -583,6 +593,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp

    if (good1 >= good2 && good1 >= good3 && good1 >= good4)
    {
+        if(triangulatedPoints.needed()) allTriangulations[0].copyTo(triangulatedPoints);
        R1.copyTo(_R);
        t.copyTo(_t);
        if (_mask.needed()) mask1.copyTo(_mask);
@ -590,6 +601,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
    }
    else if (good2 >= good1 && good2 >= good3 && good2 >= good4)
    {
+        if(triangulatedPoints.needed()) allTriangulations[1].copyTo(triangulatedPoints);
        R2.copyTo(_R);
        t.copyTo(_t);
        if (_mask.needed()) mask2.copyTo(_mask);
@ -597,6 +609,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
    }
    else if (good3 >= good1 && good3 >= good2 && good3 >= good4)
    {
+        if(triangulatedPoints.needed()) allTriangulations[2].copyTo(triangulatedPoints);
        t = -t;
        R1.copyTo(_R);
        t.copyTo(_t);
@ -605,6 +618,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
    }
    else
    {
+        if(triangulatedPoints.needed()) allTriangulations[3].copyTo(triangulatedPoints);
        t = -t;
        R2.copyTo(_R);
        t.copyTo(_t);
@ -613,6 +627,12 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, Inp
    }
 }

+int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, InputArray _cameraMatrix,
+                     OutputArray _R, OutputArray _t, InputOutputArray _mask)
+{
+    return cv::recoverPose(E, _points1, _points2, _cameraMatrix, _R, _t, 50, _mask);
+}
+
 int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2, OutputArray _R,
                     OutputArray _t, double focal, Point2d pp, InputOutputArray _mask)
 {
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -626,7 +626,7 @@ then pass the matrix to calcCovarMatrix .
@param src input array that should have from 1 to 4 channels so that the results can be stored in
 Scalar_ 's.
@param mean output parameter: calculated mean value.
-@param stddev output parameter: calculateded standard deviation.
+@param stddev output parameter: calculated standard deviation.
@param mask optional operation mask.
@sa  countNonZero, mean, norm, minMaxLoc, calcCovarMatrix
 */
@ -1639,7 +1639,7 @@ CV_EXPORTS_W void mulTransposed( InputArray src, OutputArray dst, bool aTa,

 The function cv::transpose transposes the matrix src :
 \f[\texttt{dst} (i,j) =  \texttt{src} (j,i)\f]
-@note No complex conjugation is done in case of a complex matrix. It it
+@note No complex conjugation is done in case of a complex matrix. It
 should be done separately if needed.
@param src input array.
@param dst output array of the same type as src.
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -239,6 +239,10 @@ enum DftFlags {
        into a real array and inverse transformation is executed, the function treats the input as a
        packed complex-conjugate symmetrical array, and the output will also be a real array). */
    DFT_REAL_OUTPUT    = 32,
+    /** specifies that the input is complex. If this flag is set, the input must have 2 channels.
+        On the other hand, for backwards compatibility reasons, if the input has 2 channels, it is
+        already considered complex. */
+    DFT_COMPLEX_INPUT  = 64,
    /** performs an inverse 1D or 2D transform instead of the default forward transform. */
    DCT_INVERSE        = DFT_INVERSE,
    /** performs a forward or inverse transform of every individual row of the input
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@ -327,6 +327,34 @@ The function does not reallocate memory if the matrix has proper attributes alre
 */
 CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);

+/** @brief BufferPool for use with CUDA streams
+
+ * BufferPool utilizes cuda::Stream's allocator to create new buffers. It is
+ * particularly useful when BufferPoolUsage is set to true, or a custom
+ * allocator is specified for the cuda::Stream, and you want to implement your
+ * own stream based functions utilizing the same underlying GPU memory
+ * management.
+ */
+class CV_EXPORTS BufferPool
+{
+public:
+
+    //! Gets the BufferPool for the given stream.
+    explicit BufferPool(Stream& stream);
+
+    //! Allocates a new GpuMat of given size and type.
+    GpuMat getBuffer(int rows, int cols, int type);
+
+    //! Allocates a new GpuMat of given size and type.
+    GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
+
+    //! Returns the allocator associated with the stream.
+    Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }
+
+private:
+    Ptr<GpuMat::Allocator> allocator_;
+};
+
 //! BufferPool management (must be called before Stream creation)
 CV_EXPORTS void setBufferPoolUsage(bool on);
 CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
@ -479,6 +507,9 @@ public:
    //! creates a new asynchronous stream
    Stream();

+    //! creates a new asynchronous stream with custom allocator
+    Stream(const Ptr<GpuMat::Allocator>& allocator);
+
    /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
    */
    bool queryIfComplete() const;
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@ -0,0 +1,166 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if defined __OPENCV_BUILD \
+
+#include "cv_cpu_config.h"
+#include "cv_cpu_helper.h"
+
+#if defined CV_ENABLE_INTRINSICS \
+    && !defined CV_DISABLE_OPTIMIZATION \
+    && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \
+
+#ifdef CV_CPU_COMPILE_SSE2
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE3
+#  include <pmmintrin.h>
+#  define CV_SSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSSE3
+#  include <tmmintrin.h>
+#  define CV_SSSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_1
+#  include <smmintrin.h>
+#  define CV_SSE4_1 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_2
+#  include <nmmintrin.h>
+#  define CV_SSE4_2 1
+#endif
+#ifdef CV_CPU_COMPILE_POPCNT
+#  ifdef _MSC_VER
+#    include <nmmintrin.h>
+#    if defined(_M_X64)
+#      define CV_POPCNT_U64 _mm_popcnt_u64
+#    endif
+#    define CV_POPCNT_U32 _mm_popcnt_u32
+#  else
+#    include <popcntintrin.h>
+#    if defined(__x86_64__)
+#      define CV_POPCNT_U64 __builtin_popcountll
+#    endif
+#    define CV_POPCNT_U32 __builtin_popcount
+#  endif
+#  define CV_POPCNT 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX
+#  include <immintrin.h>
+#  define CV_AVX 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX2
+#  include <immintrin.h>
+#  define CV_AVX2 1
+#endif
+#ifdef CV_CPU_COMPILE_FMA3
+#  define CV_FMA3 1
+#endif
+
+#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
+# include <Intrin.h>
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+#if defined(__ARM_NEON__) || defined(__aarch64__)
+#  include <arm_neon.h>
+#endif
+
+#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
+
+#endif // __OPENCV_BUILD
+
+
+
+#if !defined __OPENCV_BUILD // Compatibility code
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#elif (defined WIN32 || defined _WIN32) && defined(_M_ARM)
+# include <Intrin.h>
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+#endif // !__OPENCV_BUILD (Compatibility code)
+
+
+
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
+#ifndef CV_SSE
+#  define CV_SSE 0
+#endif
+#ifndef CV_SSE2
+#  define CV_SSE2 0
+#endif
+#ifndef CV_SSE3
+#  define CV_SSE3 0
+#endif
+#ifndef CV_SSSE3
+#  define CV_SSSE3 0
+#endif
+#ifndef CV_SSE4_1
+#  define CV_SSE4_1 0
+#endif
+#ifndef CV_SSE4_2
+#  define CV_SSE4_2 0
+#endif
+#ifndef CV_POPCNT
+#  define CV_POPCNT 0
+#endif
+#ifndef CV_AVX
+#  define CV_AVX 0
+#endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA512
+#  define CV_AVX_512IFMA512 0
+#endif
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+
+#ifndef CV_NEON
+#  define CV_NEON 0
+#endif
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@ -0,0 +1,133 @@
+// AUTOGENERATED, DO NOT EDIT
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
+#  define CV_CPU_HAS_SUPPORT_SSE 1
+#  define CV_CPU_CALL_SSE(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
+#  define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
+#  define CV_CPU_CALL_SSE(...) if (CV_CPU_HAS_SUPPORT_SSE) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE 0
+#  define CV_CPU_CALL_SSE(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
+#  define CV_CPU_HAS_SUPPORT_SSE2 1
+#  define CV_CPU_CALL_SSE2(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
+#  define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
+#  define CV_CPU_CALL_SSE2(...) if (CV_CPU_HAS_SUPPORT_SSE2) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE2 0
+#  define CV_CPU_CALL_SSE2(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
+#  define CV_CPU_HAS_SUPPORT_SSE3 1
+#  define CV_CPU_CALL_SSE3(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
+#  define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
+#  define CV_CPU_CALL_SSE3(...) if (CV_CPU_HAS_SUPPORT_SSE3) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE3 0
+#  define CV_CPU_CALL_SSE3(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
+#  define CV_CPU_HAS_SUPPORT_SSSE3 1
+#  define CV_CPU_CALL_SSSE3(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
+#  define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
+#  define CV_CPU_CALL_SSSE3(...) if (CV_CPU_HAS_SUPPORT_SSSE3) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSSE3 0
+#  define CV_CPU_CALL_SSSE3(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 1
+#  define CV_CPU_CALL_SSE4_1(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
+#  define CV_CPU_CALL_SSE4_1(...) if (CV_CPU_HAS_SUPPORT_SSE4_1) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 0
+#  define CV_CPU_CALL_SSE4_1(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 1
+#  define CV_CPU_CALL_SSE4_2(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
+#  define CV_CPU_CALL_SSE4_2(...) if (CV_CPU_HAS_SUPPORT_SSE4_2) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 0
+#  define CV_CPU_CALL_SSE4_2(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
+#  define CV_CPU_HAS_SUPPORT_POPCNT 1
+#  define CV_CPU_CALL_POPCNT(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
+#  define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
+#  define CV_CPU_CALL_POPCNT(...) if (CV_CPU_HAS_SUPPORT_POPCNT) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_POPCNT 0
+#  define CV_CPU_CALL_POPCNT(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
+#  define CV_CPU_HAS_SUPPORT_AVX 1
+#  define CV_CPU_CALL_AVX(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
+#  define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
+#  define CV_CPU_CALL_AVX(...) if (CV_CPU_HAS_SUPPORT_AVX) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_AVX 0
+#  define CV_CPU_CALL_AVX(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
+#  define CV_CPU_HAS_SUPPORT_FP16 1
+#  define CV_CPU_CALL_FP16(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
+#  define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
+#  define CV_CPU_CALL_FP16(...) if (CV_CPU_HAS_SUPPORT_FP16) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_FP16 0
+#  define CV_CPU_CALL_FP16(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
+#  define CV_CPU_HAS_SUPPORT_AVX2 1
+#  define CV_CPU_CALL_AVX2(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
+#  define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
+#  define CV_CPU_CALL_AVX2(...) if (CV_CPU_HAS_SUPPORT_AVX2) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_AVX2 0
+#  define CV_CPU_CALL_AVX2(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
+#  define CV_CPU_HAS_SUPPORT_FMA3 1
+#  define CV_CPU_CALL_FMA3(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
+#  define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
+#  define CV_CPU_CALL_FMA3(...) if (CV_CPU_HAS_SUPPORT_FMA3) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_FMA3 0
+#  define CV_CPU_CALL_FMA3(...)
+#endif
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
+#  define CV_CPU_HAS_SUPPORT_NEON 1
+#  define CV_CPU_CALL_NEON(...) return __VA_ARGS__
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
+#  define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
+#  define CV_CPU_CALL_NEON(...) if (CV_CPU_HAS_SUPPORT_NEON) return __VA_ARGS__
+#else
+#  define CV_CPU_HAS_SUPPORT_NEON 0
+#  define CV_CPU_CALL_NEON(...)
+#endif
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -48,6 +48,10 @@
 //! @addtogroup core_utils
 //! @{

+#ifdef __OPENCV_BUILD
+#include "cvconfig.h"
+#endif
+
 #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
 #  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
 #endif
@ -59,10 +63,6 @@
 #undef abs
 #undef Complex

-#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
-#  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
-#endif
-
 #include <limits.h>
 #include "opencv2/core/hal/interface.h"

@ -88,7 +88,7 @@
 #  endif
 #endif

-#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
+#if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED)
 #  define CV_ENABLE_UNROLLED 0
 #else
 #  define CV_ENABLE_UNROLLED 1
@ -161,150 +161,9 @@ enum CpuFeatures {
    CPU_NEON            = 100
 };

-// do not include SSE/AVX/NEON headers for NVCC compiler
-#ifndef __CUDACC__

-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#  include <emmintrin.h>
-#  define CV_MMX 1
-#  define CV_SSE 1
-#  define CV_SSE2 1
-#  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <pmmintrin.h>
-#    define CV_SSE3 1
-#  endif
-#  if defined __SSSE3__  || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <tmmintrin.h>
-#    define CV_SSSE3 1
-#  endif
-#  if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <smmintrin.h>
-#    define CV_SSE4_1 1
-#  endif
-#  if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <nmmintrin.h>
-#    define CV_SSE4_2 1
-#  endif
-#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    ifdef _MSC_VER
-#      include <nmmintrin.h>
-#      if defined(_M_X64)
-#        define CV_POPCNT_U64 _mm_popcnt_u64
-#      endif
-#      define CV_POPCNT_U32 _mm_popcnt_u32
-#    else
-#      include <popcntintrin.h>
-#      if defined(__x86_64__)
-#        define CV_POPCNT_U64 __builtin_popcountll
-#      endif
-#      define CV_POPCNT_U32 __builtin_popcount
-#    endif
-#    define CV_POPCNT 1
-#  endif
-#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
-// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
-// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
-#    include <immintrin.h>
-#    define CV_AVX 1
-#    if defined(_XCR_XFEATURE_ENABLED_MASK)
-#      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
-#    else
-#      define __xgetbv() 0
-#    endif
-#  endif
-#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
-#    include <immintrin.h>
-#    define CV_AVX2 1
-#    if defined __FMA__
-#      define CV_FMA3 1
-#    endif
-#  endif
-#endif
+#include "cv_cpu_dispatch.h"

-#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
-# include <Intrin.h>
-# include <arm_neon.h>
-# define CV_NEON 1
-# define CPU_HAS_NEON_FEATURE (true)
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
-#  include <arm_neon.h>
-#  define CV_NEON 1
-#endif
-
-#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
-#  define CV_VFP 1
-#endif
-
-#endif // __CUDACC__
-
-#ifndef CV_POPCNT
-#define CV_POPCNT 0
-#endif
-#ifndef CV_MMX
-#  define CV_MMX 0
-#endif
-#ifndef CV_SSE
-#  define CV_SSE 0
-#endif
-#ifndef CV_SSE2
-#  define CV_SSE2 0
-#endif
-#ifndef CV_SSE3
-#  define CV_SSE3 0
-#endif
-#ifndef CV_SSSE3
-#  define CV_SSSE3 0
-#endif
-#ifndef CV_SSE4_1
-#  define CV_SSE4_1 0
-#endif
-#ifndef CV_SSE4_2
-#  define CV_SSE4_2 0
-#endif
-#ifndef CV_AVX
-#  define CV_AVX 0
-#endif
-#ifndef CV_AVX2
-#  define CV_AVX2 0
-#endif
-#ifndef CV_FMA3
-#  define CV_FMA3 0
-#endif
-#ifndef CV_AVX_512F
-#  define CV_AVX_512F 0
-#endif
-#ifndef CV_AVX_512BW
-#  define CV_AVX_512BW 0
-#endif
-#ifndef CV_AVX_512CD
-#  define CV_AVX_512CD 0
-#endif
-#ifndef CV_AVX_512DQ
-#  define CV_AVX_512DQ 0
-#endif
-#ifndef CV_AVX_512ER
-#  define CV_AVX_512ER 0
-#endif
-#ifndef CV_AVX_512IFMA512
-#  define CV_AVX_512IFMA512 0
-#endif
-#ifndef CV_AVX_512PF
-#  define CV_AVX_512PF 0
-#endif
-#ifndef CV_AVX_512VBMI
-#  define CV_AVX_512VBMI 0
-#endif
-#ifndef CV_AVX_512VL
-#  define CV_AVX_512VL 0
-#endif
-
-#ifndef CV_NEON
-#  define CV_NEON 0
-#endif
-
-#ifndef CV_VFP
-#  define CV_VFP 0
-#endif

 /* fundamental constants */
 #define CV_PI   3.1415926535897932384626433832795
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@ -47,6 +47,12 @@

 #include "opencv2/core/cvdef.h"

+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+#include <emmintrin.h>
+#endif
+
+
 //! @addtogroup core_utils
 //! @{

@ -68,7 +74,7 @@
 #  include "tegra_round.hpp"
 #endif

-#if CV_VFP
+#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
    // 1. general scheme
    #define ARM_ROUND(_value, _asm_string) \
        int res; \
@ -84,7 +90,7 @@
    #endif
    // 3. version for float
    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
-#endif // CV_VFP
+#endif

 /** @brief Rounds floating-point number to the nearest integer

@ -95,7 +101,7 @@ CV_INLINE int
 cvRound( double value )
 {
 #if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
-    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
    __m128d t = _mm_set_sd( value );
    return _mm_cvtsd_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@ -110,7 +116,7 @@ cvRound( double value )
        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
    TEGRA_ROUND_DBL(value);
 #elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
+# if defined ARM_ROUND_DBL
    ARM_ROUND_DBL(value);
 # else
    return (int)lrint(value);
@ -132,18 +138,8 @@ cvRound( double value )
 */
 CV_INLINE int cvFloor( double value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
-#elif defined __GNUC__
    int i = (int)value;
    return i - (i > value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
 }

 /** @brief Rounds floating-point number to the nearest integer not smaller than the original.
@ -155,18 +151,8 @@ CV_INLINE int cvFloor( double value )
 */
 CV_INLINE int cvCeil( double value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
-#elif defined __GNUC__
    int i = (int)value;
    return i + (i < value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
 }

 /** @brief Determines if the argument is Not A Number.
@ -202,8 +188,8 @@ CV_INLINE int cvIsInf( double value )
 /** @overload */
 CV_INLINE int cvRound(float value)
 {
-#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
-      defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
    __m128 t = _mm_set_ss( value );
    return _mm_cvtss_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@ -218,7 +204,7 @@ CV_INLINE int cvRound(float value)
        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
    TEGRA_ROUND_FLT(value);
 #elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
+# if defined ARM_ROUND_FLT
    ARM_ROUND_FLT(value);
 # else
    return (int)lrintf(value);
@ -239,18 +225,8 @@ CV_INLINE int cvRound( int value )
 /** @overload */
 CV_INLINE int cvFloor( float value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128 t = _mm_set_ss( value );
-    int i = _mm_cvtss_si32(t);
-    return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
-#elif defined __GNUC__
    int i = (int)value;
    return i - (i > value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
 }

 /** @overload */
@ -262,18 +238,8 @@ CV_INLINE int cvFloor( int value )
 /** @overload */
 CV_INLINE int cvCeil( float value )
 {
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128 t = _mm_set_ss( value );
-    int i = _mm_cvtss_si32(t);
-    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
-#elif defined __GNUC__
    int i = (int)value;
    return i + (i < value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
 }

 /** @overload */
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -73,8 +73,8 @@ It is defined as:
    typedef const _InputArray& InputArray;
@endcode
 where _InputArray is a class that can be constructed from `Mat`, `Mat_<T>`, `Matx<T, m, n>`,
-`std::vector<T>`, `std::vector<std::vector<T> >` or `std::vector<Mat>`. It can also be constructed
-from a matrix expression.
+`std::vector<T>`, `std::vector<std::vector<T> >`, `std::vector<Mat>`, `std::vector<Mat_<T> >`,
+`UMat`, `std::vector<UMat>` or `double`. It can also be constructed from a matrix expression.

 Since this is mostly implementation-level class, and its interface may change in future versions, we
 do not describe it in details. There are a few key things, though, that should be kept in mind:
@ -660,7 +660,7 @@ sub-matrices.

 - Use MATLAB-style array initializers, zeros(), ones(), eye(), for example:
@code
-    // create a double-precision identity martix and add it to M.
+    // create a double-precision identity matrix and add it to M.
    M += Mat::eye(M.rows, M.cols, CV_64F);
@endcode

@ -693,7 +693,7 @@ If you need to process a whole row of a 2D array, the most efficient way is to g
 the row first, and then just use the plain C operator [] :
@code
    // compute sum of positive matrix elements
-    // (assuming that M isa double-precision matrix)
+    // (assuming that M is a double-precision matrix)
    double sum=0;
    for(int i = 0; i < M.rows; i++)
    {
@ -1085,6 +1085,29 @@ public:
      immediately below the main one.
    - `d>0` is a diagonal from the upper half. For example, d=1 means the diagonal is set
      immediately above the main one.
+    For example:
+    @code
+        Mat m = (Mat_<int>(3,3) <<
+                    1,2,3,
+                    4,5,6,
+                    7,8,9);
+        Mat d0 = m.diag(0);
+        Mat d1 = m.diag(1);
+        Mat d_1 = m.diag(-1);
+    @endcode
+    The resulting matrices are
+    @code
+     d0 =
+       [1;
+        5;
+        9]
+     d1 =
+       [2;
+        6]
+     d_1 =
+       [4;
+        8]
+    @endcode
     */
    Mat diag(int d=0) const;

@ -2287,9 +2310,9 @@ public:
    UMat colRange(int startcol, int endcol) const;
    UMat colRange(const Range& r) const;
    //! ... for the specified diagonal
-    // (d=0 - the main diagonal,
-    //  >0 - a diagonal from the lower half,
-    //  <0 - a diagonal from the upper half)
+    //! (d=0 - the main diagonal,
+    //!  >0 - a diagonal from the upper half,
+    //!  <0 - a diagonal from the lower half)
    UMat diag(int d=0) const;
    //! constructs a square diagonal matrix which main diagonal is vector "d"
    static UMat diag(const UMat& d);
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@ -1634,14 +1634,14 @@ Mat_<_Tp> Mat_<_Tp>::operator()(const std::vector<Range>& ranges) const
 template<typename _Tp> inline
 _Tp* Mat_<_Tp>::operator [](int y)
 {
-    CV_DbgAssert( 0 <= y && y < rows );
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
    return (_Tp*)(data + y*step.p[0]);
 }

 template<typename _Tp> inline
 const _Tp* Mat_<_Tp>::operator [](int y) const
 {
-    CV_DbgAssert( 0 <= y && y < rows );
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
    return (const _Tp*)(data + y*step.p[0]);
 }

--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@ -590,11 +590,12 @@ Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp
 template<typename _Tp, int m, int n> inline
 Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
 {
-    CV_StaticAssert(channels == 14, "Matx should have at least 14 elements.");
+    CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements.");
    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
    val[12] = v12; val[13] = v13;
+    for (int i = 14; i < channels; i++) val[i] = _Tp(0);
 }


--- a/modules/core/include/opencv2/core/persistence.hpp
+++ b/modules/core/include/opencv2/core/persistence.hpp
@ -1055,6 +1055,20 @@ void write(FileStorage& fs, const String& name, const Range& r )
    write(fs, r);
 }

+static inline
+void write(FileStorage& fs, const String& name, const KeyPoint& r ) // named overload: stores the KeyPoint r under the key `name`
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // wraps the fields in a SEQ+FLOW structure called `name`; NOTE(review): presumably the structure is closed when ws is destroyed - confirm against WriteStructContext
+    write(fs, r); // delegates to the unnamed write(fs, KeyPoint) overload to emit the fields
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const DMatch& r ) // named overload: stores the DMatch r under the key `name`
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // wraps the fields in a SEQ+FLOW structure called `name`; NOTE(review): presumably the structure is closed when ws is destroyed - confirm against WriteStructContext
+    write(fs, r); // delegates to the unnamed write(fs, DMatch) overload to emit the fields
+}
+
 template<typename _Tp> static inline
 void write( FileStorage& fs, const String& name, const std::vector<_Tp>& vec )
 {
@ -1245,6 +1259,14 @@ void operator >> (const FileNode& n, std::vector<KeyPoint>& vec)
 {
    read(n, vec);
 }
+
+static inline
+void operator >> (const FileNode& n, KeyPoint& kpt) // reads a single KeyPoint from the node n into kpt
+{
+    FileNodeIterator it = n.begin(); // iterator positioned at the start of n
+    it >> kpt.pt.x >> kpt.pt.y >> kpt.size >> kpt.angle >> kpt.response >> kpt.octave >> kpt.class_id; // seven values consumed in this fixed order; NOTE(review): presumably mirrors the order used when a KeyPoint is written - confirm
+}
+
 /** @brief Reads DMatch from a file storage.
 */
 //It needs special handling because it contains two types of fields, int & float.
@ -1254,6 +1276,13 @@ void operator >> (const FileNode& n, std::vector<DMatch>& vec)
    read(n, vec);
 }

+static inline
+void operator >> (const FileNode& n, DMatch& m) // reads a single DMatch from the node n into m
+{
+    FileNodeIterator it = n.begin(); // iterator positioned at the start of n
+    it >> m.queryIdx >> m.trainIdx >> m.imgIdx >> m.distance; // four values consumed in this fixed order; NOTE(review): presumably mirrors the order used when a DMatch is written - confirm
+}
+
 //! @} FileNode

 //! @relates cv::FileNodeIterator
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@ -102,20 +102,6 @@ static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The

 namespace cv { namespace cuda
 {
-    class CV_EXPORTS BufferPool
-    {
-    public:
-        explicit BufferPool(Stream& stream);
-
-        GpuMat getBuffer(int rows, int cols, int type);
-        GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
-
-        GpuMat::Allocator* getAllocator() const { return allocator_; }
-
-    private:
-        GpuMat::Allocator* allocator_;
-    };
-
    static inline void checkNppError(int code, const char* file, const int line, const char* func)
    {
        if (code < 0)
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@ -507,7 +507,7 @@ void Mat::forEach_impl(const Functor& operation) {
                    this->rowCall2(row, COLS);
                }
            } else {
-                std::vector<int> idx(COLS); /// idx is modified in this->rowCall
+                std::vector<int> idx(DIMS); /// idx is modified in this->rowCall
                idx[DIMS - 2] = range.start - 1;

                for (int line_num = range.start; line_num < range.end; ++line_num) {
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@ -282,9 +282,10 @@ public:
    cudaStream_t stream;
    bool ownStream;

-    Ptr<StackAllocator> stackAllocator;
+    Ptr<GpuMat::Allocator> allocator;

    Impl();
+    Impl(const Ptr<GpuMat::Allocator>& allocator);
    explicit Impl(cudaStream_t stream);

    ~Impl();
@ -295,17 +296,23 @@ cv::cuda::Stream::Impl::Impl() : stream(0), ownStream(false)
    cudaSafeCall( cudaStreamCreate(&stream) );
    ownStream = true;

-    stackAllocator = makePtr<StackAllocator>(stream);
+    allocator = makePtr<StackAllocator>(stream);
+}
+
+cv::cuda::Stream::Impl::Impl(const Ptr<GpuMat::Allocator>& allocator) : stream(0), ownStream(false), allocator(allocator) // keeps the caller-supplied allocator; the member named allocator is initialized from the parameter of the same name
+{
+    cudaSafeCall( cudaStreamCreate(&stream) ); // create a new CUDA stream for this Impl
+    ownStream = true; // set after the creation call; the destructor only acts on the stream when stream && ownStream
 }

 cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_), ownStream(false)
 {
-    stackAllocator = makePtr<StackAllocator>(stream);
+    allocator = makePtr<StackAllocator>(stream);
 }

 cv::cuda::Stream::Impl::~Impl()
 {
-    stackAllocator.release();
+    allocator.release();

    if (stream && ownStream)
    {
@ -417,6 +424,16 @@ cv::cuda::Stream::Stream()
 #endif
 }

+/**
+ * Constructs a stream whose implementation object is built from the
+ * caller-supplied allocator.
+ *
+ * In a build without HAVE_CUDA the constructor does nothing except call
+ * throw_no_cuda().
+ */
+cv::cuda::Stream::Stream(const Ptr<GpuMat::Allocator>& allocator)
+{
+#ifdef HAVE_CUDA
+    impl_ = makePtr<Impl>(allocator);
+#else
+    (void) allocator;  // parameter is unused in this configuration
+    throw_no_cuda();
+#endif
+}
+
 bool cv::cuda::Stream::queryIfComplete() const
 {
 #ifndef HAVE_CUDA
@ -668,20 +685,33 @@ void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCoun
 #endif
 }

-#ifdef HAVE_CUDA
-
-cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator.get())
+#ifndef HAVE_CUDA
+cv::cuda::BufferPool::BufferPool(Stream& stream) // build without CUDA: the constructor only reports the missing support
+{
+    (void) stream; // parameter is unused in this configuration
+    throw_no_cuda();
+}
+#else
+cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->allocator) // takes the allocator held by the stream's Impl
 {
 }
+#endif

 GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
 {
+#ifndef HAVE_CUDA
+    (void) rows;
+    (void) cols;
+    (void) type;
+    throw_no_cuda();
+    return GpuMat();
+#else
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
+#endif
 }

-#endif

 ////////////////////////////////////////////////////////////////
 // Event
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@ -3342,6 +3342,9 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )

    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

+    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.
+    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );
+
    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -1395,7 +1395,7 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
    {
        const std::vector<uchar>& v = *(const std::vector<uchar>*)obj;

-        size_t n = v.size(), esz = CV_ELEM_SIZE(flags);
+        size_t n = size().width, esz = CV_ELEM_SIZE(flags);
        int t = CV_MAT_DEPTH(flags), cn = CV_MAT_CN(flags);
        mv.resize(n);

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -3434,7 +3434,7 @@ int Kernel::set(int i, const KernelArg& arg)
            if( !(arg.flags & KernelArg::NO_SIZE) )
            {
                int cols = u3d.cols*arg.wscale/arg.iwscale;
-                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices) == CL_SUCCESS);
                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
                i += 3;
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -272,8 +272,8 @@ struct CoreTLSData

    RNG rng;
 //#ifdef HAVE_OPENCL
-    int device;
-    ocl::Queue oclQueue;
+    int device; // device index of an array of devices in a context, see also Device::getDefault
+    ocl::Queue oclQueue; // the queue used for running a kernel, see also getQueue, Kernel::run
    int useOpenCL; // 1 - use, 0 - do not use, -1 - auto/not initialized
 //#endif
    int useIPP; // 1 - use, 0 - do not use, -1 - auto/not initialized
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -237,24 +237,81 @@ void Exception::formatMessage()
        msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
 }

+static const char* g_hwFeatureNames[CV_HARDWARE_MAX_FEATURE] = { NULL }; // feature id -> printable name; entries are NULL until HWFeatures::initializeNames() assigns them
+
+/**
+ * Looks up the printable name of a hardware feature.
+ *
+ * @param id feature identifier used as an index into g_hwFeatureNames
+ * @return the name stored for that id, or NULL when the id lies outside
+ *         [0, CV_HARDWARE_MAX_FEATURE) or no name has been assigned to it
+ */
+static const char* getHWFeatureName(int id)
+{
+    // Reject negative ids too: they would index before the start of the table.
+    return (id >= 0 && id < CV_HARDWARE_MAX_FEATURE) ? g_hwFeatureNames[id] : NULL;
+}
+/**
+ * Same lookup as getHWFeatureName(), but never returns NULL: an id without
+ * a name yields the placeholder string instead.
+ */
+static const char* getHWFeatureNameSafe(int id)
+{
+    if (const char* featureName = getHWFeatureName(id))
+        return featureName;
+    return "Unknown feature";
+}
+
 struct HWFeatures
 {
    enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };

-    HWFeatures(void)
+    HWFeatures(bool run_initialize = false)
    {
-        memset( have, 0, sizeof(have) );
-        x86_family = 0;
+        memset( have, 0, sizeof(have[0]) * MAX_FEATURE );
+        if (run_initialize)
+            initialize();
    }

-    static HWFeatures initialize(void)
+    static void initializeNames()
    {
-        HWFeatures f;
+        for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
+        {
+            g_hwFeatureNames[i] = 0;
+        }
+        g_hwFeatureNames[CPU_MMX] = "MMX";
+        g_hwFeatureNames[CPU_SSE] = "SSE";
+        g_hwFeatureNames[CPU_SSE2] = "SSE2";
+        g_hwFeatureNames[CPU_SSE3] = "SSE3";
+        g_hwFeatureNames[CPU_SSSE3] = "SSSE3";
+        g_hwFeatureNames[CPU_SSE4_1] = "SSE4.1";
+        g_hwFeatureNames[CPU_SSE4_2] = "SSE4.2";
+        g_hwFeatureNames[CPU_POPCNT] = "POPCNT";
+        g_hwFeatureNames[CPU_FP16] = "FP16";
+        g_hwFeatureNames[CPU_AVX] = "AVX";
+        g_hwFeatureNames[CPU_AVX2] = "AVX2";
+        g_hwFeatureNames[CPU_FMA3] = "FMA3";
+
+        g_hwFeatureNames[CPU_AVX_512F] = "AVX512F";
+        g_hwFeatureNames[CPU_AVX_512BW] = "AVX512BW";
+        g_hwFeatureNames[CPU_AVX_512CD] = "AVX512CD";
+        g_hwFeatureNames[CPU_AVX_512DQ] = "AVX512DQ";
+        g_hwFeatureNames[CPU_AVX_512ER] = "AVX512ER";
+        g_hwFeatureNames[CPU_AVX_512IFMA512] = "AVX512IFMA";
+        g_hwFeatureNames[CPU_AVX_512PF] = "AVX512PF";
+        g_hwFeatureNames[CPU_AVX_512VBMI] = "AVX512VBMI";
+        g_hwFeatureNames[CPU_AVX_512VL] = "AVX512VL";
+
+        g_hwFeatureNames[CPU_NEON] = "NEON";
+    }
+
+    void initialize(void)
+    {
+#ifndef WINRT
+        if (getenv("OPENCV_DUMP_CONFIG"))
+        {
+            fprintf(stderr, "\nOpenCV build configuration is:\n%s\n",
+                cv::getBuildInformation().c_str());
+        }
+#endif
+
+        initializeNames();
+
        int cpuid_data[4] = { 0, 0, 0, 0 };
+        int cpuid_data_ex[4] = { 0, 0, 0, 0 };

    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    #define OPENCV_HAVE_X86_CPUID 1
        __cpuid(cpuid_data, 1);
    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    #define OPENCV_HAVE_X86_CPUID 1
        #ifdef __x86_64__
        asm __volatile__
        (
@ -278,33 +335,36 @@ struct HWFeatures
        #endif
    #endif

-        f.x86_family = (cpuid_data[0] >> 8) & 15;
-        if( f.x86_family >= 6 )
+    #ifdef OPENCV_HAVE_X86_CPUID
+        int x86_family = (cpuid_data[0] >> 8) & 15;
+        if( x86_family >= 6 )
        {
-            f.have[CV_CPU_MMX]    = (cpuid_data[3] & (1 << 23)) != 0;
-            f.have[CV_CPU_SSE]    = (cpuid_data[3] & (1<<25)) != 0;
-            f.have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
-            f.have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
-            f.have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
-            f.have[CV_CPU_FMA3]  = (cpuid_data[2] & (1<<12)) != 0;
-            f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
-            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
-            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
-            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
-            f.have[CV_CPU_FP16]   = (cpuid_data[2] & (1<<29)) != 0;
+            have[CV_CPU_MMX]    = (cpuid_data[3] & (1<<23)) != 0;
+            have[CV_CPU_SSE]    = (cpuid_data[3] & (1<<25)) != 0;
+            have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
+            have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
+            have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
+            have[CV_CPU_FMA3]   = (cpuid_data[2] & (1<<12)) != 0;
+            have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
+            have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
+            have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
+            have[CV_CPU_AVX]    = (cpuid_data[2] & (1<<28)) != 0;
+            have[CV_CPU_FP16]   = (cpuid_data[2] & (1<<29)) != 0;

            // make the second call to the cpuid command in order to get
            // information about extended features like AVX2
        #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
-            __cpuidex(cpuid_data, 7, 0);
+        #define OPENCV_HAVE_X86_CPUID_EX 1
+            __cpuidex(cpuid_data_ex, 7, 0);
        #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+        #define OPENCV_HAVE_X86_CPUID_EX 1
            #ifdef __x86_64__
            asm __volatile__
            (
             "movl $7, %%eax\n\t"
             "movl $0, %%ecx\n\t"
             "cpuid\n\t"
-             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+             :[eax]"=a"(cpuid_data_ex[0]),[ebx]"=b"(cpuid_data_ex[1]),[ecx]"=c"(cpuid_data_ex[2]),[edx]"=d"(cpuid_data_ex[3])
             :
             : "cc"
            );
@ -317,29 +377,76 @@ struct HWFeatures
             "cpuid\n\t"
             "movl %%ebx, %0\n\t"
             "popl %%ebx\n\t"
-             : "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
+             : "=r"(cpuid_data_ex[1]), "=c"(cpuid_data_ex[2])
             :
             : "cc"
            );
            #endif
        #endif
-            f.have[CV_CPU_AVX2]   = (cpuid_data[1] & (1<<5)) != 0;

-            f.have[CV_CPU_AVX_512F]       = (cpuid_data[1] & (1<<16)) != 0;
-            f.have[CV_CPU_AVX_512DQ]      = (cpuid_data[1] & (1<<17)) != 0;
-            f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
-            f.have[CV_CPU_AVX_512PF]      = (cpuid_data[1] & (1<<26)) != 0;
-            f.have[CV_CPU_AVX_512ER]      = (cpuid_data[1] & (1<<27)) != 0;
-            f.have[CV_CPU_AVX_512CD]      = (cpuid_data[1] & (1<<28)) != 0;
-            f.have[CV_CPU_AVX_512BW]      = (cpuid_data[1] & (1<<30)) != 0;
-            f.have[CV_CPU_AVX_512VL]      = (cpuid_data[1] & (1<<31)) != 0;
-            f.have[CV_CPU_AVX_512VBMI]    = (cpuid_data[2] &  (1<<1)) != 0;
+        #ifdef OPENCV_HAVE_X86_CPUID_EX
+            have[CV_CPU_AVX2]   = (cpuid_data_ex[1] & (1<<5)) != 0;
+
+            have[CV_CPU_AVX_512F]       = (cpuid_data_ex[1] & (1<<16)) != 0;
+            have[CV_CPU_AVX_512DQ]      = (cpuid_data_ex[1] & (1<<17)) != 0;
+            have[CV_CPU_AVX_512IFMA512] = (cpuid_data_ex[1] & (1<<21)) != 0;
+            have[CV_CPU_AVX_512PF]      = (cpuid_data_ex[1] & (1<<26)) != 0;
+            have[CV_CPU_AVX_512ER]      = (cpuid_data_ex[1] & (1<<27)) != 0;
+            have[CV_CPU_AVX_512CD]      = (cpuid_data_ex[1] & (1<<28)) != 0;
+            have[CV_CPU_AVX_512BW]      = (cpuid_data_ex[1] & (1<<30)) != 0;
+            have[CV_CPU_AVX_512VL]      = (cpuid_data_ex[1] & (1<<31)) != 0;
+            have[CV_CPU_AVX_512VBMI]    = (cpuid_data_ex[2] & (1<<1)) != 0;
+        #else
+            CV_UNUSED(cpuid_data_ex);
+        #endif
+
+            bool have_AVX_OS_support = true;
+            bool have_AVX512_OS_support = true;
+            if (!(cpuid_data[2] & (1<<27)))
+                have_AVX_OS_support = false; // OS uses XSAVE_XRSTORE and CPU support AVX
+            else
+            {
+                int xcr0 = 0;
+            #ifdef _XCR_XFEATURE_ENABLED_MASK // requires immintrin.h
+                xcr0 = (int)_xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+            #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+                __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
+            #endif
+                if ((xcr0 & 0x6) != 0x6)
+                    have_AVX_OS_support = false; // YMM registers
+                if ((xcr0 & 0xe6) != 0xe6)
+                    have_AVX512_OS_support = false; // ZMM registers
+            }
+
+            if (!have_AVX_OS_support)
+            {
+                have[CV_CPU_AVX] = false;
+                have[CV_CPU_FP16] = false;
+                have[CV_CPU_AVX2] = false;
+                have[CV_CPU_FMA3] = false;
+            }
+            if (!have_AVX_OS_support || !have_AVX512_OS_support)
+            {
+                have[CV_CPU_AVX_512F] = false;
+                have[CV_CPU_AVX_512BW] = false;
+                have[CV_CPU_AVX_512CD] = false;
+                have[CV_CPU_AVX_512DQ] = false;
+                have[CV_CPU_AVX_512ER] = false;
+                have[CV_CPU_AVX_512IFMA512] = false;
+                have[CV_CPU_AVX_512PF] = false;
+                have[CV_CPU_AVX_512VBMI] = false;
+                have[CV_CPU_AVX_512VL] = false;
+            }
        }
+    #else
+        CV_UNUSED(cpuid_data);
+        CV_UNUSED(cpuid_data_ex);
+    #endif // OPENCV_HAVE_X86_CPUID

    #if defined ANDROID || defined __linux__
    #ifdef __aarch64__
-        f.have[CV_CPU_NEON] = true;
-        f.have[CV_CPU_FP16] = true;
+        have[CV_CPU_NEON] = true;
+        have[CV_CPU_FP16] = true;
    #elif defined __arm__
        int cpufile = open("/proc/self/auxv", O_RDONLY);

@ -352,8 +459,8 @@ struct HWFeatures
            {
                if (auxv.a_type == AT_HWCAP)
                {
-                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
-                    f.have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0;
+                    have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
+                    have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0;
                    break;
                }
            }
@ -363,21 +470,133 @@ struct HWFeatures
    #endif
    #elif (defined __clang__ || defined __APPLE__)
    #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
-        f.have[CV_CPU_NEON] = true;
+        have[CV_CPU_NEON] = true;
    #endif
    #if (defined __ARM_FP  && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
-        f.have[CV_CPU_FP16] = true;
+        have[CV_CPU_FP16] = true;
    #endif
    #endif

-        return f;
+        int baseline_features[] = { CV_CPU_BASELINE_FEATURES };
+        if (!checkFeatures(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0])))
+        {
+            fprintf(stderr, "\n"
+                    "******************************************************************\n"
+                    "* FATAL ERROR:                                                   *\n"
+                    "* This OpenCV build doesn't support current CPU/HW configuration *\n"
+                    "*                                                                *\n"
+                    "* Use OPENCV_DUMP_CONFIG=1 environment variable for details      *\n"
+                    "******************************************************************\n");
+            fprintf(stderr, "\nRequired baseline features:\n");
+            checkFeatures(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0]), true);
+            CV_ErrorNoReturn(cv::Error::StsAssert, "Missing support for required CPU baseline features. Check OpenCV build configuration and required CPU/HW setup.");
+        }
+
+        readSettings(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0]));
+    }
+
+    /**
+     * Verifies that every feature ID listed in `features` is marked available in have[].
+     * Entries equal to 0 are skipped. When `dump` is set, one status line per inspected
+     * feature is written to stderr.
+     *
+     * @param features Array of feature IDs to inspect.
+     * @param count    Number of entries in `features`.
+     * @param dump     Print an "OK" / "NOT AVAILABLE" line for each inspected feature.
+     * @return true when none of the inspected features is missing.
+     */
+    bool checkFeatures(const int* features, int count, bool dump = false)
+    {
+        bool allPresent = true;
+        for (int idx = 0; idx < count; ++idx)
+        {
+            const int id = features[idx];
+            if (id == 0)
+                continue; // zero entries are not looked up
+
+            const bool available = have[id];
+            if (!available)
+                allPresent = false;
+            if (dump)
+                fprintf(stderr, available ? "%s - OK\n" : "%s - NOT AVAILABLE\n", getHWFeatureNameSafe(id));
+        }
+        return allPresent;
+    }
+
+    /** Tells whether `c` is one of the delimiter characters ',', ';' or '-'. */
+    static inline bool isSymbolSeparator(char c)
+    {
+        switch (c)
+        {
+        case ',':
+        case ';':
+        case '-':
+            return true;
+        default:
+            return false;
+        }
+    }
+
+    /**
+     * Applies the user's request to switch off CPU features at runtime.
+     *
+     * The list is taken from the OPENCV_CPU_DISABLE environment variable (on WINRT no
+     * variable is read, so nothing is disabled). Names are delimited by the characters
+     * accepted by isSymbolSeparator() and compared against g_hwFeatureNames; every match
+     * clears the corresponding have[] entry. Diagnostics are written to stderr.
+     *
+     * @param baseline_features IDs of the features treated as baseline; used only to warn
+     *                          when one of them is being disabled.
+     * @param baseline_count    Number of entries in baseline_features.
+     */
+    void readSettings(const int* baseline_features, int baseline_count)
+    {
+        bool dump = true;
+        const char* disabled_features =
+#ifndef WINRT
+                getenv("OPENCV_CPU_DISABLE");
+#else
+                NULL;
+#endif
+        if (disabled_features && disabled_features[0] != 0)
+        {
+            const char* start = disabled_features;
+            for (;;)
+            {
+                // Skip any run of separators in front of the next name.
+                while (start[0] != 0 && isSymbolSeparator(start[0]))
+                {
+                    start++;
+                }
+                if (start[0] == 0)
+                    break;
+                // Advance `end` to the first separator or the terminator: [start, end) is one name.
+                const char* end = start;
+                while (end[0] != 0 && !isSymbolSeparator(end[0]))
+                {
+                    end++;
+                }
+                // Defensive only: `start` points at a non-separator, non-NUL character here,
+                // so `end` has moved at least one step.
+                if (end == start)
+                    continue;
+                cv::String feature(start, end);
+                start = end;
+
+                CV_Assert(feature.size() > 0);
+
+                bool found = false;
+                for (int i = 0; i < CV_HARDWARE_MAX_FEATURE; i++)
+                {
+                    if (!g_hwFeatureNames[i]) continue;
+                    size_t len = strlen(g_hwFeatureNames[i]);
+                    if (len != feature.size()) continue;
+                    if (feature.compare(g_hwFeatureNames[i]) == 0)
+                    {
+                        bool isBaseline = false;
+                        for (int k = 0; k < baseline_count; k++)
+                        {
+                            if (baseline_features[k] == i)
+                            {
+                                isBaseline = true;
+                                break;
+                            }
+                        }
+                        if (isBaseline)
+                        {
+                            if (dump) fprintf(stderr, "OPENCV: Trying to disable baseline CPU feature: '%s'. This has very limited effect, because code optimizations for this feature are executed unconditionally in the most cases.\n", getHWFeatureNameSafe(i));
+                        }
+                        if (!have[i])
+                        {
+                            if (dump) fprintf(stderr, "OPENCV: Trying to disable unavailable CPU feature on the current platform: '%s'.\n", getHWFeatureNameSafe(i));
+                        }
+                        // The entry is cleared whether or not a warning was printed above.
+                        have[i] = false;
+
+                        found = true;
+                        break;
+                    }
+                }
+                if (!found)
+                {
+                    if (dump) fprintf(stderr, "OPENCV: Trying to disable unknown CPU feature: '%s'.\n", feature.c_str());
+                }
+            }
+        }
    }

-    int x86_family;
    bool have[MAX_FEATURE+1];
 };

-static HWFeatures  featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
+static HWFeatures  featuresEnabled(true), featuresDisabled = HWFeatures(false);
 static HWFeatures* currentFeatures = &featuresEnabled;

 bool checkHardwareSupport(int feature)
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@ -1014,7 +1014,7 @@ TEST(Core_InputOutput, filestorage_yaml_advanvced_type_heading)
    ASSERT_EQ(cv::norm(inputMatrix, actualMatrix, NORM_INF), 0.);
 }

-TEST(Core_InputOutput, filestorage_keypoints_io)
+TEST(Core_InputOutput, filestorage_keypoints_vec_vec_io)
 {
    vector<vector<KeyPoint> > kptsVec;
    vector<KeyPoint> kpts;
@ -1051,36 +1051,111 @@ TEST(Core_InputOutput, filestorage_keypoints_io)
    }
 }

-TEST(Core_InputOutput, filestorage_dmatch_io)
+TEST(Core_InputOutput, FileStorage_DMatch)
 {
-    vector<vector<DMatch> > matchesVec;
-    vector<DMatch> matches;
-    matches.push_back(DMatch(1, 0, 10, 11.5f));
-    matches.push_back(DMatch(2, 1, 11, 21.5f));
-    matchesVec.push_back(matches);
-    matches.clear();
-    matches.push_back(DMatch(22, 10, 1, 1.5f));
-    matchesVec.push_back(matches);
+    // Round-trips a single DMatch through an in-memory YAML storage (MEMORY flag).
+    cv::FileStorage fs("dmatch.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);

-    FileStorage writer("", FileStorage::WRITE + FileStorage::MEMORY + FileStorage::FORMAT_XML);
-    writer << "dmatches" << matchesVec;
-    String content = writer.releaseAndGetString();
+    cv::DMatch d(1, 2, 3, -1.5f);

-    FileStorage reader(content, FileStorage::READ + FileStorage::MEMORY);
-    vector<vector<DMatch> > readKptsVec;
-    reader["dmatches"] >> readKptsVec;
+    // The serialized text itself is pinned, not just the values read back.
+    EXPECT_NO_THROW(fs << "d" << d);
+    cv::String fs_result = fs.releaseAndGetString();
+    EXPECT_STREQ(fs_result.c_str(), "%YAML:1.0\n---\nd: [ 1, 2, 3, -1.5000000000000000e+00 ]\n");

-    ASSERT_EQ(matchesVec.size(), readKptsVec.size());
+    cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);

-    for(size_t i = 0; i < matchesVec.size(); i++)
+    cv::DMatch d_read;
+    ASSERT_NO_THROW(fs_read["d"] >> d_read);
+
+    // Every field must survive the write/read cycle unchanged.
+    EXPECT_EQ(d.queryIdx, d_read.queryIdx);
+    EXPECT_EQ(d.trainIdx, d_read.trainIdx);
+    EXPECT_EQ(d.imgIdx, d_read.imgIdx);
+    EXPECT_EQ(d.distance, d_read.distance);
+}
+
+// Round-trips a vector of DMatch through an in-memory YAML storage and pins the
+// serialized text (all elements are written into one flat sequence).
+TEST(Core_InputOutput, FileStorage_DMatch_vector)
+{
+    cv::FileStorage fs("dmatch.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+
+    cv::DMatch d1(1, 2, 3, -1.5f);
+    cv::DMatch d2(2, 3, 4, 1.5f);
+    cv::DMatch d3(3, 2, 1, 0.5f);
+    std::vector<cv::DMatch> dv;
+    dv.push_back(d1);
+    dv.push_back(d2);
+    dv.push_back(d3);
+
+    EXPECT_NO_THROW(fs << "dv" << dv);
+    cv::String fs_result = fs.releaseAndGetString();
+    EXPECT_STREQ(fs_result.c_str(),
+"%YAML:1.0\n"
+"---\n"
+"dv: [ 1, 2, 3, -1.5000000000000000e+00, 2, 3, 4, 1.5000000000000000e+00,\n"
+"    3, 2, 1, 5.0000000000000000e-01 ]\n"
+);
+
+    cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
+
+    std::vector<cv::DMatch> dv_read;
+    ASSERT_NO_THROW(fs_read["dv"] >> dv_read);
+
+    // The size check is fatal (ASSERT) so the element loop below never indexes past dv_read.
+    ASSERT_EQ(dv.size(), dv_read.size());
+    for (size_t i = 0; i < dv.size(); i++)
     {
-        ASSERT_EQ(matchesVec[i].size(), readKptsVec[i].size());
-        for(size_t j = 0; j < matchesVec[i].size(); j++)
+        EXPECT_EQ(dv[i].queryIdx, dv_read[i].queryIdx);
+        EXPECT_EQ(dv[i].trainIdx, dv_read[i].trainIdx);
+        EXPECT_EQ(dv[i].imgIdx, dv_read[i].imgIdx);
+        EXPECT_EQ(dv[i].distance, dv_read[i].distance);
+    }
+}
+
+// Round-trips a vector of DMatch vectors through an in-memory YAML storage, pins the
+// serialized text (one sequence entry per inner vector) and compares every field.
+TEST(Core_InputOutput, FileStorage_DMatch_vector_vector)
+{
+    cv::FileStorage fs("dmatch.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+
+    cv::DMatch d1(1, 2, 3, -1.5f);
+    cv::DMatch d2(2, 3, 4, 1.5f);
+    cv::DMatch d3(3, 2, 1, 0.5f);
+    std::vector<cv::DMatch> dv1;
+    dv1.push_back(d1);
+    dv1.push_back(d2);
+    dv1.push_back(d3);
+
+    std::vector<cv::DMatch> dv2;
+    dv2.push_back(d3);
+    dv2.push_back(d1);
+
+    std::vector< std::vector<cv::DMatch> > dvv;
+    dvv.push_back(dv1);
+    dvv.push_back(dv2);
+
+    EXPECT_NO_THROW(fs << "dvv" << dvv);
+    cv::String fs_result = fs.releaseAndGetString();
+    EXPECT_STREQ(fs_result.c_str(),
+"%YAML:1.0\n"
+"---\n"
+"dvv:\n"
+"   - [ 1, 2, 3, -1.5000000000000000e+00, 2, 3, 4, 1.5000000000000000e+00,\n"
+"       3, 2, 1, 5.0000000000000000e-01 ]\n"
+"   - [ 3, 2, 1, 5.0000000000000000e-01, 1, 2, 3, -1.5000000000000000e+00 ]\n"
+);
+
+    cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
+
+    std::vector< std::vector<cv::DMatch> > dvv_read;
+    ASSERT_NO_THROW(fs_read["dvv"] >> dvv_read);
+
+    ASSERT_EQ(dvv.size(), dvv_read.size());
+    for (size_t j = 0; j < dvv.size(); j++)
+    {
+        const std::vector<cv::DMatch>& dv = dvv[j];
+        const std::vector<cv::DMatch>& dv_read = dvv_read[j];
+        // Compare the INNER vector sizes: the element loop below indexes dv_read with
+        // indices taken from dv, so a shorter dv_read must stop the test here.
+        ASSERT_EQ(dv.size(), dv_read.size());
+        for (size_t i = 0; i < dv.size(); i++)
         {
-            ASSERT_FLOAT_EQ(matchesVec[i][j].distance, readKptsVec[i][j].distance);
-            ASSERT_EQ(matchesVec[i][j].imgIdx, readKptsVec[i][j].imgIdx);
-            ASSERT_EQ(matchesVec[i][j].queryIdx, readKptsVec[i][j].queryIdx);
-            ASSERT_EQ(matchesVec[i][j].trainIdx, readKptsVec[i][j].trainIdx);
+            EXPECT_EQ(dv[i].queryIdx, dv_read[i].queryIdx);
+            EXPECT_EQ(dv[i].trainIdx, dv_read[i].trainIdx);
+            EXPECT_EQ(dv[i].imgIdx, dv_read[i].imgIdx);
+            EXPECT_EQ(dv[i].distance, dv_read[i].distance);
         }
     }
 }
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@ -659,6 +659,18 @@ struct InitializerFunctor{
    }
 };

+/// Initializer for the cv::Mat::forEach test (5 dimensional case): stores the
+/// element's own index in the element, one coordinate per channel.
+template<typename Pixel>
+struct InitializerFunctor5D{
+    void operator()(Pixel & pixel, const int * idx) const {
+        for (int channel = 0; channel < 5; ++channel)
+            pixel[channel] = idx[channel];
+    }
+};
+
 void Core_ArrayOpTest::run( int /* start_from */)
 {
    int errcount = 0;
@ -736,6 +748,57 @@ void Core_ArrayOpTest::run( int /* start_from */)
        }
    }

+    // test cv::Mat::forEach
+    // with a matrix that has more dimensions than columns
+    // See https://github.com/opencv/opencv/issues/8447
+    {
+        const int dims[5] = { 2, 2, 2, 2, 2 };
+        typedef cv::Vec<int, 5> Pixel;
+
+        cv::Mat a = cv::Mat::zeros(5, dims, CV_32SC(5));
+        InitializerFunctor5D<Pixel> initializer;
+
+        a.forEach<Pixel>(initializer);
+
+        uint64 total = 0;
+        bool error_reported = false;
+        for (int i0 = 0; i0 < dims[0]; ++i0) {
+            for (int i1 = 0; i1 < dims[1]; ++i1) {
+                for (int i2 = 0; i2 < dims[2]; ++i2) {
+                    for (int i3 = 0; i3 < dims[3]; ++i3) {
+                        for (int i4 = 0; i4 < dims[4]; ++i4) {
+                            const int i[5] = { i0, i1, i2, i3, i4 };
+                            Pixel& pixel = a.at<Pixel>(i);
+                            if (pixel[0] != i0 || pixel[1] != i1 || pixel[2] != i2 || pixel[3] != i3 || pixel[4] != i4) {
+                                if (!error_reported) {
+                                    ts->printf(cvtest::TS::LOG, "forEach is not correct.\n"
+                                        "First error detected at position (%d, %d, %d, %d, %d), got value (%d, %d, %d, %d, %d).\n",
+                                        i0, i1, i2, i3, i4,
+                                        pixel[0], pixel[1], pixel[2], pixel[3], pixel[4]);
+                                    error_reported = true;
+                                }
+                                errcount++;
+                            }
+                            total += pixel[0];
+                            total += pixel[1];
+                            total += pixel[2];
+                            total += pixel[3];
+                            total += pixel[4];
+                        }
+                    }
+                }
+            }
+        }
+        uint64 total2 = 0;
+        for (size_t i = 0; i < sizeof(dims) / sizeof(dims[0]); ++i) {
+            total2 += ((dims[i] - 1) * dims[i] / 2) * dims[0] * dims[1] * dims[2] * dims[3] * dims[4] / dims[i];
+        }
+        if (total != total2) {
+            ts->printf(cvtest::TS::LOG, "forEach is not correct because total is invalid.\n");
+            errcount++;
+        }
+    }
+
    RNG rng;
    const int MAX_DIM = 5, MAX_DIM_SZ = 10;
    // sparse matrix operations
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@ -788,6 +788,7 @@ CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArr
 (obtained from dft_size ).
 -   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
 cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the input is complex, with 2 channels.
 -   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
 real-complex transform, so the destination matrix must be real.
@param stream Stream for the asynchronous version.
@ -813,6 +814,35 @@ instead of the width.
 */
 CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());

+/** @brief Base class for DFT operator as a cv::Algorithm. :
+
+Instances are obtained from cuda::createDFT, which fixes the transform size and flags;
+compute can then be called repeatedly with different data.
+ */
+class CV_EXPORTS DFT : public Algorithm
+{
+public:
+    /** @brief Computes an FFT of a given image.
+
+    @param image Source image. The CUFFT-based implementation accepts CV_32FC1 (real) and
+    CV_32FC2 (complex) images; the channel count has to agree with the DFT_COMPLEX_INPUT
+    flag the object was created with.
+    @param result Result image.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void compute(InputArray image, OutputArray result, Stream& stream = Stream::Null()) = 0;
+};
+
+/** @brief Creates implementation for cuda::DFT.
+
+@param dft_size The image size.
+@param flags Optional flags:
+-   **DFT_ROWS** transforms each individual row of the source matrix.
+-   **DFT_SCALE** scales the result: divide it by the number of elements in the transform
+(obtained from dft_size ).
+-   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
+cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** Specifies that inputs will be complex with 2 channels.
+-   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
+real-complex transform, so the destination matrix must be real.
+
+@return Pointer to the created cuda::DFT object.
+
+@note DFT_COMPLEX_OUTPUT is not supported, and a real-to-real transform (neither
+DFT_COMPLEX_INPUT set nor DFT_REAL_OUTPUT cleared) is rejected. When the library is built
+without CUFFT the function raises an error instead of returning an object.
+ */
+CV_EXPORTS Ptr<DFT> createDFT(Size dft_size, int flags);
+
 /** @brief Base class for convolution (or cross-correlation) operator. :
 */
 class CV_EXPORTS Convolution : public Algorithm
--- a/modules/cudaarithm/src/arithm.cpp
+++ b/modules/cudaarithm/src/arithm.cpp
@ -286,111 +286,146 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
 }

 //////////////////////////////////////////////////////////////////////////////
-// dft
+// DFT function

 void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
+{
+    // A two-channel source is treated as complex data, so the matching flag is set here
+    // on the caller's behalf.
+    const bool complex_src = getInputMat(_src, stream).channels() == 2;
+    if (complex_src)
+        flags |= DFT_COMPLEX_INPUT;
+
+    // One-shot use: the DFT object exists only for the duration of this call.
+    createDFT(dft_size, flags)->compute(_src, _dst, stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// DFT algorithm
+
+#ifdef HAVE_CUFFT
+
+namespace
+{
+
+    /**
+     * CUFFT-backed implementation of cuda::DFT.
+     *
+     * The CUFFT plan is created once in the constructor for a fixed size and set of
+     * flags, reused by every compute() call, and destroyed with the object.
+     */
+    class DFTImpl : public DFT
+    {
+        // dft_size is the size requested by the caller; dft_size_opt is the size the plan
+        // is built for (a single-column input is handled as a single row).
+        Size dft_size, dft_size_opt;
+        bool is_1d_input, is_row_dft, is_scaled_dft, is_inverse, is_complex_input, is_complex_output;
+
+        cufftType dft_type;
+        cufftHandle plan;
+
+    public:
+        /**
+         * @param dft_size Size of the transform.
+         * @param flags    Combination of DFT_ROWS, DFT_SCALE, DFT_INVERSE,
+         *                 DFT_COMPLEX_INPUT and DFT_REAL_OUTPUT.
+         */
+        DFTImpl(Size dft_size, int flags)
+            : dft_size(dft_size),
+              dft_size_opt(dft_size),
+              is_1d_input((dft_size.height == 1) || (dft_size.width == 1)),
+              is_row_dft((flags & DFT_ROWS) != 0),
+              is_scaled_dft((flags & DFT_SCALE) != 0),
+              is_inverse((flags & DFT_INVERSE) != 0),
+              is_complex_input((flags & DFT_COMPLEX_INPUT) != 0),
+              is_complex_output(!(flags & DFT_REAL_OUTPUT)),
+              dft_type(!is_complex_input ? CUFFT_R2C : (is_complex_output ? CUFFT_C2C : CUFFT_C2R))
+        {
+            // We don't support unpacked output (in the case of real input)
+            CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
+
+            // We don't support real-to-real transform
+            CV_Assert( is_complex_input || is_complex_output );
+
+            if (is_1d_input && !is_row_dft)
+            {
+                // If the source matrix is single column handle it as single row
+                dft_size_opt.width = std::max(dft_size.width, dft_size.height);
+                dft_size_opt.height = std::min(dft_size.width, dft_size.height);
+            }
+
+            CV_Assert( dft_size_opt.width > 1 );
+
+            // 1D and row-wise transforms use a batched 1D plan (one batch entry per row).
+            if (is_1d_input || is_row_dft)
+                cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
+            else
+                cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+        }
+
+        ~DFTImpl()
+        {
+            // A destructor must not let an error escape: cufftSafeCall presumably reports
+            // failures by raising an OpenCV exception (it is defined outside this file),
+            // which would terminate the program if thrown from here, e.g. during stack
+            // unwinding. The plan is therefore released on a best-effort basis.
+            (void) cufftDestroy(plan);
+        }
+
+        /**
+         * Runs the planned transform on _src and stores the result in _dst.
+         *
+         * @param _src   CV_32FC1 or CV_32FC2 input; the channel count must match the
+         *               DFT_COMPLEX_INPUT flag given at construction.
+         * @param _dst   Output; (re)created as a continuous matrix of the proper type.
+         * @param stream Stream the CUFFT plan and the helper operations are issued on.
+         */
+        void compute(InputArray _src, OutputArray _dst, Stream& stream)
+        {
+            GpuMat src = getInputMat(_src, stream);
+
+            CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+            CV_Assert( is_complex_input == (src.channels() == 2) );
+
+            // Make sure here we work with the continuous input,
+            // as CUFFT can't handle gaps
+            GpuMat src_cont;
+            if (src.isContinuous())
+            {
+                src_cont = src;
+            }
+            else
+            {
+                BufferPool pool(stream);
+                src_cont.allocator = pool.getAllocator();
+                createContinuous(src.rows, src.cols, src.type(), src_cont);
+                src.copyTo(src_cont, stream);
+            }
+
+            cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
+
+            if (is_complex_input)
+            {
+                if (is_complex_output)
+                {
+                    createContinuous(dft_size, CV_32FC2, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2C(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
+                            is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+                }
+                else
+                {
+                    createContinuous(dft_size, CV_32F, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2R(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
+                }
+            }
+            else
+            {
+                // We could swap dft_size for efficiency. Here we must reflect it
+                if (dft_size == dft_size_opt)
+                    createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
+                else
+                    createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
+
+                GpuMat dst = _dst.getGpuMat();
+
+                cufftSafeCall(cufftExecR2C(
+                                  plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
+            }
+
+            // DFT_SCALE: divide the result by the number of elements of the transform.
+            if (is_scaled_dft)
+                cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+        }
+    };
+}
+
+#endif
+
+// Factory for cuda::DFT. Without CUFFT it raises StsNotImplemented (the return that
+// follows CV_Error is presumably there only to keep compilers quiet); with CUFFT it
+// returns a DFTImpl constructed from dft_size and flags.
+Ptr<DFT> cv::cuda::createDFT(Size dft_size, int flags)
 {
 #ifndef HAVE_CUFFT
-    (void) _src;
-    (void) _dst;
    (void) dft_size;
    (void) flags;
-    (void) stream;
-    throw_no_cuda();
+    CV_Error(Error::StsNotImplemented, "The library was build without CUFFT");
+    return Ptr<DFT>();
 #else
-    GpuMat src = getInputMat(_src, stream);
-
-    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
-
-    // We don't support unpacked output (in the case of real input)
-    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
-
-    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
-    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
-    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
-    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
-    const bool is_complex_input  = src.channels() == 2;
-    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
-
-    // We don't support real-to-real transform
-    CV_Assert( is_complex_input || is_complex_output );
-
-    // Make sure here we work with the continuous input,
-    // as CUFFT can't handle gaps
-    GpuMat src_cont;
-    if (src.isContinuous())
-    {
-        src_cont = src;
-    }
-    else
-    {
-        BufferPool pool(stream);
-        src_cont.allocator = pool.getAllocator();
-        createContinuous(src.rows, src.cols, src.type(), src_cont);
-        src.copyTo(src_cont, stream);
-    }
-
-    Size dft_size_opt = dft_size;
-    if (is_1d_input && !is_row_dft)
-    {
-        // If the source matrix is single column handle it as single row
-        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
-        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
-    }
-
-    CV_Assert( dft_size_opt.width > 1 );
-
-    cufftType dft_type = CUFFT_R2C;
-    if (is_complex_input)
-        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
-
-    cufftHandle plan;
-    if (is_1d_input || is_row_dft)
-        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
-    else
-        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
-
-    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
-
-    if (is_complex_input)
-    {
-        if (is_complex_output)
-        {
-            createContinuous(dft_size, CV_32FC2, _dst);
-            GpuMat dst = _dst.getGpuMat();
-
-            cufftSafeCall(cufftExecC2C(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
-                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
-        }
-        else
-        {
-            createContinuous(dft_size, CV_32F, _dst);
-            GpuMat dst = _dst.getGpuMat();
-
-            cufftSafeCall(cufftExecC2R(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
-        }
-    }
-    else
-    {
-        // We could swap dft_size for efficiency. Here we must reflect it
-        if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
-        else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
-
-        GpuMat dst = _dst.getGpuMat();
-
-        cufftSafeCall(cufftExecR2C(
-                plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
-    }
-
-    cufftSafeCall( cufftDestroy(plan) );
-
-    if (is_scaled_dft)
-        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
-
+    return makePtr<DFTImpl>(dft_size, flags);
 #endif
 }

--- a/modules/cudaarithm/test/test_arithm.cpp
+++ b/modules/cudaarithm/test/test_arithm.cpp
@ -250,6 +250,33 @@ CUDA_TEST_P(Dft, C2C)
    }
 }

+// Checks the reusable cuda::DFT object: one plan is created up front and applied to
+// several random complex inputs, each compared against the CPU cv::dft result.
+CUDA_TEST_P(Dft, Algorithm)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+
+    // The inputs below are two-channel (CV_32FC2), so the object must be created for
+    // complex input: compute() rejects data whose channel count disagrees with the flag.
+    int flags = cv::DFT_COMPLEX_INPUT;
+    cv::Ptr<cv::cuda::DFT> dft = cv::cuda::createDFT(cv::Size(cols, rows), flags);
+
+    for (int i = 0; i < 5; ++i)
+    {
+        SCOPED_TRACE("dft algorithm");
+
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
+
+        cv::cuda::GpuMat d_b;
+        dft->compute(loadMat(a), d_b);
+
+        cv::Mat b_gold;
+        cv::dft(a, b_gold, flags);
+
+        ASSERT_EQ(CV_32F, d_b.depth());
+        ASSERT_EQ(2, d_b.channels());
+        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
+    }
+}
+
 namespace
 {
    void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@ -201,6 +201,15 @@ CV_EXPORTS void alphaComp(InputArray img1, InputArray img2, OutputArray dst, int
 */
 CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null());

+/** @brief Calculates histogram for one channel 8-bit image confined in given mask.
+
+Only pixels whose mask element is non-zero are counted. An empty mask counts every pixel.
+
+@param src Source image with CV_8UC1 type.
+@param mask A mask image same size as src and of type CV_8UC1.
+@param hist Destination histogram with one row, 256 columns, and the CV_32SC1 type.
+@param stream Stream for the asynchronous version.
+ */
+CV_EXPORTS void calcHist(InputArray src, InputArray mask, OutputArray hist, Stream& stream = Stream::Null());
+
 /** @brief Equalizes the histogram of a grayscale image.

@param src Source image with CV_8UC1 type.
--- a/modules/cudaimgproc/src/cuda/hist.cu
+++ b/modules/cudaimgproc/src/cuda/hist.cu
@ -105,6 +105,72 @@ namespace hist
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
+
+    // Masked 256-bin histogram of an 8-bit image.
+    // Each block accumulates into a shared-memory histogram and then adds its non-zero
+    // bins to the global `hist`. Every thread clears and later flushes the bin indexed by
+    // its linear thread id, so the block is expected to hold exactly 256 threads (the
+    // host launcher in this file uses 32x8). One thread row (threadIdx.y) processes one
+    // image row.
+    // NOTE(review): rows are read as 32-bit words, which presumes src/mask row starts are
+    // 4-byte aligned — confirm for inputs that are sub-matrices.
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    {
+        __shared__ int shist[256];
+
+        const int y = blockIdx.x * blockDim.y + threadIdx.y;
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        shist[tid] = 0;
+        __syncthreads();
+
+        if (y < rows)
+        {
+            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
+            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+
+            // Main part: four pixels (one 32-bit word) per iteration; each byte of the
+            // mask word gates the corresponding byte of the data word.
+            const int cols_4 = cols / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
+            {
+                unsigned int data = rowPtr[x];
+                unsigned int m = maskRowPtr[x];
+
+                if ((m >>  0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
+
+                if ((m >>  8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
+
+                if ((m >>  16) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
+
+                if ((m >>  24) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
+            }
+
+            // Tail: the last cols % 4 pixels of the row are handled byte-wise by the
+            // first thread of the row.
+            if (cols % 4 != 0 && threadIdx.x == 0)
+            {
+                for (int x = cols_4 * 4; x < cols; ++x)
+                {
+                    unsigned int data = ((const uchar*)rowPtr)[x];
+                    unsigned int m = ((const uchar*)maskRowPtr)[x];
+
+                    if (m)
+                        Emulation::smem::atomicAdd(&shist[data], 1);
+                }
+            }
+        }
+
+        // All shared-memory updates of the block must be finished before bins are flushed.
+        __syncthreads();
+
+        const int histVal = shist[tid];
+        if (histVal > 0)
+            ::atomicAdd(hist + tid, histVal);
+    }
+
+    // Host-side launcher of the masked histogram kernel: 32x8 threads per block, one
+    // block per 8 image rows. On the default stream (0) the call also waits for the
+    // device to finish.
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    {
+        const dim3 threads(32, 8);
+        const dim3 blocks(divUp(src.rows, threads.y));
+
+        histogram256Kernel<<<blocks, threads, 0, stream>>>(
+            src.data, src.cols, src.rows, src.step,
+            mask.data, mask.step,
+            hist);
+        cudaSafeCall( cudaGetLastError() );
+
+        const bool onDefaultStream = (stream == 0);
+        if (onDefaultStream)
+        {
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
 }

 /////////////////////////////////////////////////////////////////////////
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@ -69,20 +69,32 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
 namespace hist
 {
    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
 }

 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
+{
+    // Unmasked variant: forward to the masked overload with an empty mask.
+    const GpuMat noMask;
+    calcHist(_src, noMask, _hist, stream);
+}
+
+// Masked histogram of a CV_8UC1 image: _hist is created as a 1x256 CV_32SC1 matrix and
+// zeroed on `stream` before the kernel runs. An empty mask selects the unmasked kernel.
+void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, Stream& stream)
 {
    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

    CV_Assert( src.type() == CV_8UC1 );
+    // A non-empty mask must match the source in type and size.
+    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );
+    CV_Assert( mask.empty() || mask.size() == src.size() );

    _hist.create(1, 256, CV_32SC1);
    GpuMat hist = _hist.getGpuMat();

    hist.setTo(Scalar::all(0), stream);

-    hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+    if (mask.empty())
+        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+    else
+        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
 }

 ////////////////////////////////////////////////////////////////////////
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@ -136,6 +136,49 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES));

+// Fixture for the masked cuda::calcHist test, parameterized by device and image size.
+PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
+{
+    cv::cuda::DeviceInfo devInfo;
+
+    cv::Size size;
+
+    // Reads the test parameters and makes the selected device current.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+// Compares the masked GPU histogram with the CPU cv::calcHist result; the two must
+// match exactly (tolerance 0).
+CUDA_TEST_P(CalcHistWithMask, Accuracy)
+{
+    // Random image and mask; the top-left quadrant of the mask is cleared so that a
+    // part of the image has to be left out of the histogram.
+    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat mask = randomMat(size, CV_8UC1);
+    const cv::Rect topLeftQuadrant(0, 0, size.width / 2, size.height / 2);
+    mask(topLeftQuadrant).setTo(0);
+
+    cv::cuda::GpuMat hist;
+    cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
+
+    // CPU reference: 256 bins over [0, 256) for channel 0, using the same mask.
+    const int channelIdx[] = {0};
+    const int binCount[] = {256};
+    const float valueRange[] = {0.0f, 256.0f};
+    const float* rangeList[] = {valueRange};
+
+    cv::Mat hist_gold;
+    cv::calcHist(&src, 1, channelIdx, mask, hist_gold, 1, binCount, rangeList);
+
+    // Bring the reference to the layout of the GPU result: one row of 32-bit integers.
+    hist_gold = hist_gold.reshape(1, 1);
+    hist_gold.convertTo(hist_gold, CV_32S);
+
+    EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES));
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // EqualizeHist

--- a/modules/features2d/CMakeLists.txt
+++ b/modules/features2d/CMakeLists.txt
@ -1,2 +1,2 @@
 set(the_description "2D Features Framework")
-ocv_define_module(features2d opencv_imgproc opencv_ml opencv_flann OPTIONAL opencv_highgui WRAP java python)
+ocv_define_module(features2d opencv_imgproc opencv_flann OPTIONAL opencv_highgui WRAP java python)
--- a/modules/features2d/test/test_precomp.hpp
+++ b/modules/features2d/test/test_precomp.hpp
@ -13,7 +13,6 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/features2d.hpp"
 #include "opencv2/imgcodecs.hpp"
-#include "opencv2/ml.hpp"
 #include <iostream>

 #endif
--- a/modules/flann/include/opencv2/flann/kdtree_index.h
+++ b/modules/flann/include/opencv2/flann/kdtree_index.h
@ -125,7 +125,12 @@ public:
        /* Construct the randomized trees. */
        for (int i = 0; i < trees_; i++) {
            /* Randomize the order of vectors to allow for unbiased sampling. */
+#ifndef OPENCV_FLANN_USE_STD_RAND
+            cv::randShuffle(vind_);
+#else
            std::random_shuffle(vind_.begin(), vind_.end());
+#endif
+
            tree_roots_[i] = divideTree(&vind_[0], int(size_) );
        }
    }
--- a/modules/flann/include/opencv2/flann/lsh_table.h
+++ b/modules/flann/include/opencv2/flann/lsh_table.h
@ -350,7 +350,11 @@ inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int
    // A bit brutal but fast to code
    std::vector<size_t> indices(feature_size * CHAR_BIT);
    for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
+#ifndef OPENCV_FLANN_USE_STD_RAND
+    cv::randShuffle(indices);
+#else
    std::random_shuffle(indices.begin(), indices.end());
+#endif

    // Generate a random set of order of subsignature_size_ bits
    for (unsigned int i = 0; i < key_size_; ++i) {
--- a/modules/flann/include/opencv2/flann/random.h
+++ b/modules/flann/include/opencv2/flann/random.h
@ -40,13 +40,31 @@
 namespace cvflann
 {

+inline int rand()  // random int source for this header; backend selected by OPENCV_FLANN_USE_STD_RAND
+{
+#ifndef OPENCV_FLANN_USE_STD_RAND
+#   if INT_MAX == RAND_MAX
+    int v = cv::theRNG().next() & INT_MAX;  // clear the sign bit; RAND_MAX + 1 would overflow int in this configuration
+#   else
+    int v = cv::theRNG().uniform(0, RAND_MAX + 1);  // upper bound is exclusive, so v lies in [0, RAND_MAX]
+#   endif
+#else
+    int v = std::rand();  // opt-in: use the C library generator
+#endif // OPENCV_FLANN_USE_STD_RAND
+    return v;
+}
+
 /**
 * Seeds the random number generator
 *  @param seed Random seed
 */
 inline void seed_random(unsigned int seed)
 {
-    srand(seed);
+#ifndef OPENCV_FLANN_USE_STD_RAND
+    cv::theRNG() = cv::RNG(seed);  // overwrite the RNG returned by cv::theRNG() with a freshly seeded one
+#else
+    std::srand(seed);  // opt-in: seed the C library generator instead
+#endif
 }

 /*
@ -60,7 +78,7 @@ inline void seed_random(unsigned int seed)
 */
 inline double rand_double(double high = 1.0, double low = 0)
 {
-    return low + ((high-low) * (std::rand() / (RAND_MAX + 1.0)));
+    return low + ((high-low) * (rand() / (RAND_MAX + 1.0)));  // unqualified rand(): presumably the cvflann wrapper added in this header, confirm the namespace
 }

 /**
@ -71,7 +89,7 @@ inline double rand_double(double high = 1.0, double low = 0)
 */
 inline int rand_int(int high = RAND_MAX, int low = 0)
 {
-    return low + (int) ( double(high-low) * (std::rand() / (RAND_MAX + 1.0)));
+    return low + (int) ( double(high-low) * (rand() / (RAND_MAX + 1.0)));  // unqualified rand(): presumably the cvflann wrapper added in this header, confirm the namespace
 }

 /**
@ -107,7 +125,11 @@ public:
        for (int i = 0; i < size_; ++i) vals_[i] = i;

        // shuffle the elements in the array
+#ifndef OPENCV_FLANN_USE_STD_RAND
+        cv::randShuffle(vals_);
+#else
        std::random_shuffle(vals_.begin(), vals_.end());
+#endif

        counter_ = 0;
    }
--- a/modules/flann/misc/python/pyopencv_flann.hpp
+++ b/modules/flann/misc/python/pyopencv_flann.hpp
@ -23,6 +23,9 @@ bool pyopencv_to(PyObject *o, cv::flann::IndexParams& p, const char *name)
    PyObject* item = NULL;
    Py_ssize_t pos = 0;

+    if (!o || o == Py_None)
+        return true;
+
    if(PyDict_Check(o)) {
        while(PyDict_Next(o, &pos, &key, &item)) {
            if( !PyString_Check(key) ) {
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@ -1,5 +1,5 @@
 set(the_description "High-level GUI and Media I/O")
-ocv_add_module(highgui opencv_imgproc OPTIONAL opencv_imgcodecs opencv_videoio WRAP python)
+ocv_add_module(highgui opencv_imgproc opencv_imgcodecs OPTIONAL opencv_videoio WRAP python)

 # ----------------------------------------------------------------------------
 #  CMake file for highgui. See root CMakeLists.txt
@ -65,7 +65,7 @@ elseif(HAVE_QT)

  list(APPEND HIGHGUI_LIBRARIES ${QT_LIBRARIES})
  list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_QT.cpp ${_MOC_OUTFILES} ${_RCC_OUTFILES})
-  ocv_check_flag_support(CXX -Wno-missing-declarations _have_flag)
+  ocv_check_flag_support(CXX -Wno-missing-declarations _have_flag "")
  if(${_have_flag})
    set_source_files_properties(${_RCC_OUTFILES} PROPERTIES COMPILE_FLAGS -Wno-missing-declarations)
  endif()
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@ -50,10 +50,8 @@
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/highgui/highgui_c.h"

-#ifdef HAVE_OPENCV_IMGCODECS
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/imgcodecs/imgcodecs_c.h"
-#endif

 #include <stdlib.h>
 #include <stdio.h>
--- a/modules/highgui/src/window_QT.h
+++ b/modules/highgui/src/window_QT.h
@ -42,6 +42,10 @@

 #include "precomp.hpp"

+#ifndef _DEBUG
+#define QT_NO_DEBUG_OUTPUT
+#endif
+
 #if defined( HAVE_QT_OPENGL )
 #include <QtOpenGL>
 #include <QGLWidget>
--- a/modules/imgcodecs/src/ios_conversions.mm
+++ b/modules/imgcodecs/src/ios_conversions.mm
@ -53,7 +53,7 @@ void UIImageToMat(const UIImage* image, cv::Mat& m, bool alphaExist);
 UIImage* MatToUIImage(const cv::Mat& image) {

    NSData *data = [NSData dataWithBytes:image.data
-                                  length:image.elemSize()*image.total()];
+                                  length:image.step.p[0] * image.rows];

    CGColorSpaceRef colorSpace;

@ -73,7 +73,7 @@ UIImage* MatToUIImage(const cv::Mat& image) {
    // Creating CGImage from cv::Mat
    CGImageRef imageRef = CGImageCreate(image.cols,
                                        image.rows,
-                                        8,
+                                        8 * image.elemSize1(),
                                        8 * image.elemSize(),
                                        image.step.p[0],
                                        colorSpace,
@ -97,7 +97,7 @@ UIImage* MatToUIImage(const cv::Mat& image) {
 void UIImageToMat(const UIImage* image,
                         cv::Mat& m, bool alphaExist) {
    CGColorSpaceRef colorSpace = CGImageGetColorSpace(image.CGImage);
-    CGFloat cols = image.size.width, rows = image.size.height;
+    CGFloat cols = CGImageGetWidth(image.CGImage), rows = CGImageGetHeight(image.CGImage);
    CGContextRef contextRef;
    CGBitmapInfo bitmapInfo = kCGImageAlphaPremultipliedLast;
    if (CGColorSpaceGetModel(colorSpace) == kCGColorSpaceModelMonochrome)
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -452,6 +452,20 @@ enum ContourApproximationModes {
    CHAIN_APPROX_TC89_KCOS = 4
 };

+/** @brief Shape matching methods
+
+\f$A\f$ denotes object1, \f$B\f$ denotes object2
+
+\f$\begin{array}{l} m^A_i =  \mathrm{sign} (h^A_i)  \cdot \log{h^A_i} \\ m^B_i =  \mathrm{sign} (h^B_i)  \cdot \log{h^B_i} \end{array}\f$
+
+and \f$h^A_i, h^B_i\f$ are the Hu moments of \f$A\f$ and \f$B\f$ , respectively.
+*/
+enum ShapeMatchModes {  // values for the `method` argument of matchShapes
+    CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
+    CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
+    CONTOURS_MATCH_I3  =3  //!< \f[I_3(A,B) =  \max _{i=1...7}  \frac{ \left| m^A_i - m^B_i \right| }{ \left| m^A_i \right| }\f]
+};
+
 //! @} imgproc_shape

 //! Variants of a Hough transform
@ -2588,9 +2602,8 @@ The function supports multi-channel images. Each channel is processed independen
 The functions accumulate\* can be used, for example, to collect statistics of a scene background
 viewed by a still camera and for the further foreground-background segmentation.

-@param src Input image as 1- or 3-channel, 8-bit or 32-bit floating point.
-@param dst %Accumulator image with the same number of channels as input image, 32-bit or 64-bit
-floating-point.
+@param src Input image of type CV_8UC(n), CV_16UC(n), CV_32FC(n) or CV_64FC(n), where n is a positive integer.
+@param dst %Accumulator image with the same number of channels as input image, and a depth of CV_32F or CV_64F.
@param mask Optional operation mask.

@sa  accumulateSquare, accumulateProduct, accumulateWeighted
@ -3916,7 +3929,7 @@ The function compares two shapes. All three implemented methods use the Hu invar

@param contour1 First contour or grayscale image.
@param contour2 Second contour or grayscale image.
-@param method Comparison method, see ::ShapeMatchModes
+@param method Comparison method, see cv::ShapeMatchModes
@param parameter Method-specific parameter (not supported now).
 */
 CV_EXPORTS_W double matchShapes( InputArray contour1, InputArray contour2,
@ -4081,7 +4094,13 @@ CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
 //! Detects position, translation and rotation
 CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();

-//! Performs linear blending of two images
+//! Performs linear blending of two images:
+//! \f[ \texttt{dst}(i,j) = \texttt{weights1}(i,j)*\texttt{src1}(i,j) + \texttt{weights2}(i,j)*\texttt{src2}(i,j) \f]
+//! @param src1 It has a type of CV_8UC(n) or CV_32FC(n), where n is a positive integer.
+//! @param src2 It has the same type and size as src1.
+//! @param weights1 It has a type of CV_32FC1 and the same size as src1.
+//! @param weights2 It has a type of CV_32FC1 and the same size as src1.
+//! @param dst It is created if it does not have the same size and type as src1.
 CV_EXPORTS void blendLinear(InputArray src1, InputArray src2, InputArray weights1, InputArray weights2, OutputArray dst);

 //! @addtogroup imgproc_colormap
--- a/modules/imgproc/include/opencv2/imgproc/types_c.h
+++ b/modules/imgproc/include/opencv2/imgproc/types_c.h
@ -501,15 +501,8 @@ enum
    CV_POLY_APPROX_DP = 0
 };

-/** @brief Shape matching methods
-
-\f$A\f$ denotes object1,\f$B\f$ denotes object2
-
-\f$\begin{array}{l} m^A_i =  \mathrm{sign} (h^A_i)  \cdot \log{h^A_i} \\ m^B_i =  \mathrm{sign} (h^B_i)  \cdot \log{h^B_i} \end{array}\f$
-
-and \f$h^A_i, h^B_i\f$ are the Hu moments of \f$A\f$ and \f$B\f$ , respectively.
-*/
-enum ShapeMatchModes
+/** Shape matching methods */
+enum
 {
    CV_CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
    CV_CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -1649,7 +1649,7 @@ struct VResizeLanczos4
    {
        CastOp castOp;
        VecOp vecOp;
-        int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
+        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
@ -1657,7 +1657,7 @@ struct VResizeLanczos4
            const WT* S = src[0];
            WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;

-            for( k = 1; k < 8; k++ )
+            for( int k = 1; k < 8; k++ )
            {
                b = beta[k]; S = src[k];
                s0 += S[x]*b; s1 += S[x+1]*b;
@ -3533,14 +3533,13 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                          int borderType, const Scalar& _borderValue )
 {
    Size ssize = _src.size(), dsize = _dst.size();
-    int cn = _src.channels();
+    const int cn = _src.channels();
    const T* S0 = _src.ptr<T>();
+    T cval[CV_CN_MAX];
    size_t sstep = _src.step/sizeof(S0[0]);
-    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
-        saturate_cast<T>(_borderValue[1]),
-        saturate_cast<T>(_borderValue[2]),
-        saturate_cast<T>(_borderValue[3]));
-    int dx, dy;
+
+    for(int k = 0; k < cn; k++ )
+        cval[k] = saturate_cast<T>(_borderValue[k & 3]);

    unsigned width1 = ssize.width, height1 = ssize.height;

@ -3550,14 +3549,14 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
        dsize.height = 1;
    }

-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);

        if( cn == 1 )
        {
-            for( dx = 0; dx < dsize.width; dx++ )
+            for(int dx = 0; dx < dsize.width; dx++ )
            {
                int sx = XY[dx*2], sy = XY[dx*2+1];
                if( (unsigned)sx < width1 && (unsigned)sy < height1 )
@ -3583,9 +3582,9 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
        }
        else
        {
-            for( dx = 0; dx < dsize.width; dx++, D += cn )
+            for(int dx = 0; dx < dsize.width; dx++, D += cn )
            {
-                int sx = XY[dx*2], sy = XY[dx*2+1], k;
+                int sx = XY[dx*2], sy = XY[dx*2+1];
                const T *S;
                if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                {
@ -3602,7 +3601,7 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                    else
                    {
                        S = S0 + sy*sstep + sx*cn;
-                        for( k = 0; k < cn; k++ )
+                        for(int k = 0; k < cn; k++ )
                            D[k] = S[k];
                    }
                }
@ -3622,7 +3621,7 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
                        sy = borderInterpolate(sy, ssize.height, borderType);
                        S = S0 + sy*sstep + sx*cn;
                    }
-                    for( k = 0; k < cn; k++ )
+                    for(int k = 0; k < cn; k++ )
                        D[k] = S[k];
                }
            }
@ -3852,16 +3851,15 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
    typedef typename CastOp::rtype T;
    typedef typename CastOp::type1 WT;
    Size ssize = _src.size(), dsize = _dst.size();
-    int k, cn = _src.channels();
+    const int cn = _src.channels();
    const AT* wtab = (const AT*)_wtab;
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
    T cval[CV_CN_MAX];
-    int dx, dy;
    CastOp castOp;
    VecOp vecOp;

-    for( k = 0; k < cn; k++ )
+    for(int k = 0; k < cn; k++ )
        cval[k] = saturate_cast<T>(_borderValue[k & 3]);

    unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
@ -3871,7 +3869,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
        width1 = std::max(ssize.width-2, 0);
 #endif

-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
@ -3879,7 +3877,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
        int X0 = 0;
        bool prevInlier = false;

-        for( dx = 0; dx <= dsize.width; dx++ )
+        for(int dx = 0; dx <= dsize.width; dx++ )
        {
            bool curInlier = dx < dsize.width ?
                (unsigned)XY[dx*2] < width1 &&
@ -3948,7 +3946,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                        int sx = XY[dx*2], sy = XY[dx*2+1];
                        const AT* w = wtab + FXY[dx]*4;
                        const T* S = S0 + sy*sstep + sx*cn;
-                        for( k = 0; k < cn; k++ )
+                        for(int k = 0; k < cn; k++ )
                        {
                            WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
                            D[k] = castOp(t0);
@ -4012,7 +4010,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                            (sx >= ssize.width || sx+1 < 0 ||
                             sy >= ssize.height || sy+1 < 0) )
                        {
-                            for( k = 0; k < cn; k++ )
+                            for(int k = 0; k < cn; k++ )
                                D[k] = cval[k];
                        }
                        else
@ -4046,7 +4044,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                                v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
                                v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
                            }
-                            for( k = 0; k < cn; k++ )
+                            for(int k = 0; k < cn; k++ )
                                D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
                        }
                    }
@ -4064,16 +4062,16 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
    typedef typename CastOp::rtype T;
    typedef typename CastOp::type1 WT;
    Size ssize = _src.size(), dsize = _dst.size();
-    int cn = _src.channels();
+    const int cn = _src.channels();
    const AT* wtab = (const AT*)_wtab;
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
-    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
-        saturate_cast<T>(_borderValue[1]),
-        saturate_cast<T>(_borderValue[2]),
-        saturate_cast<T>(_borderValue[3]));
-    int dx, dy;
+    T cval[CV_CN_MAX];
    CastOp castOp;
+
+    for(int k = 0; k < cn; k++ )
+        cval[k] = saturate_cast<T>(_borderValue[k & 3]);
+
    int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;

    unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
@ -4084,21 +4082,20 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
        dsize.height = 1;
    }

-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
        const ushort* FXY = _fxy.ptr<ushort>(dy);

-        for( dx = 0; dx < dsize.width; dx++, D += cn )
+        for(int dx = 0; dx < dsize.width; dx++, D += cn )
        {
            int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
            const AT* w = wtab + FXY[dx]*16;
-            int i, k;
            if( (unsigned)sx < width1 && (unsigned)sy < height1 )
            {
                const T* S = S0 + sy*sstep + sx*cn;
-                for( k = 0; k < cn; k++ )
+                for(int k = 0; k < cn; k++ )
                {
                    WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
                    S += sstep;
@ -4123,21 +4120,21 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
                    (sx >= ssize.width || sx+4 <= 0 ||
                    sy >= ssize.height || sy+4 <= 0))
                {
-                    for( k = 0; k < cn; k++ )
+                    for(int k = 0; k < cn; k++ )
                        D[k] = cval[k];
                    continue;
                }

-                for( i = 0; i < 4; i++ )
+                for(int i = 0; i < 4; i++ )
                {
                    x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
                    y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
                }

-                for( k = 0; k < cn; k++, S0++, w -= 16 )
+                for(int k = 0; k < cn; k++, S0++, w -= 16 )
                {
                    WT cv = cval[k], sum = cv*ONE;
-                    for( i = 0; i < 4; i++, w += 4 )
+                    for(int i = 0; i < 4; i++, w += 4 )
                    {
                        int yi = y[i];
                        const T* S = S0 + yi*sstep;
@ -4169,16 +4166,16 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
    typedef typename CastOp::rtype T;
    typedef typename CastOp::type1 WT;
    Size ssize = _src.size(), dsize = _dst.size();
-    int cn = _src.channels();
+    const int cn = _src.channels();
    const AT* wtab = (const AT*)_wtab;
    const T* S0 = _src.ptr<T>();
    size_t sstep = _src.step/sizeof(S0[0]);
-    Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
-        saturate_cast<T>(_borderValue[1]),
-        saturate_cast<T>(_borderValue[2]),
-        saturate_cast<T>(_borderValue[3]));
-    int dx, dy;
+    T cval[CV_CN_MAX];
    CastOp castOp;
+
+    for(int k = 0; k < cn; k++ )
+        cval[k] = saturate_cast<T>(_borderValue[k & 3]);
+
    int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;

    unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
@ -4189,21 +4186,20 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
        dsize.height = 1;
    }

-    for( dy = 0; dy < dsize.height; dy++ )
+    for(int dy = 0; dy < dsize.height; dy++ )
    {
        T* D = _dst.ptr<T>(dy);
        const short* XY = _xy.ptr<short>(dy);
        const ushort* FXY = _fxy.ptr<ushort>(dy);

-        for( dx = 0; dx < dsize.width; dx++, D += cn )
+        for(int dx = 0; dx < dsize.width; dx++, D += cn )
        {
            int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
            const AT* w = wtab + FXY[dx]*64;
            const T* S = S0 + sy*sstep + sx*cn;
-            int i, k;
            if( (unsigned)sx < width1 && (unsigned)sy < height1 )
            {
-                for( k = 0; k < cn; k++ )
+                for(int k = 0; k < cn; k++ )
                {
                    WT sum = 0;
                    for( int r = 0; r < 8; r++, S += sstep, w += 8 )
@ -4226,21 +4222,21 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
                    (sx >= ssize.width || sx+8 <= 0 ||
                    sy >= ssize.height || sy+8 <= 0))
                {
-                    for( k = 0; k < cn; k++ )
+                    for(int k = 0; k < cn; k++ )
                        D[k] = cval[k];
                    continue;
                }

-                for( i = 0; i < 8; i++ )
+                for(int i = 0; i < 8; i++ )
                {
                    x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
                    y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
                }

-                for( k = 0; k < cn; k++, S0++, w -= 64 )
+                for(int k = 0; k < cn; k++, S0++, w -= 64 )
                {
                    WT cv = cval[k], sum = cv*ONE;
-                    for( i = 0; i < 8; i++, w += 8 )
+                    for(int i = 0; i < 8; i++, w += 8 )
                    {
                        int yi = y[i];
                        const T* S1 = S0 + yi*sstep;
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@ -160,6 +160,7 @@ enum
 #define CAT(x, y) __CAT(x, y)

 #define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+#define DATA_TYPE_3 CAT(DATA_TYPE, 3)

 ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////

@ -182,7 +183,7 @@ __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offs
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
-                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE_3 src_pix = vload3(0, src);
 #ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
 #else
@ -256,7 +257,7 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
-                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE_3 src_pix = vload3(0, src);
                DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;

 #ifdef DEPTH_5
--- a/modules/imgproc/src/undistort.cpp
+++ b/modules/imgproc/src/undistort.cpp
@ -476,8 +476,6 @@ static Point2f mapPointSpherical(const Point2f& p, float alpha, Vec4d* J, int pr

 static Point2f invMapPointSpherical(Point2f _p, float alpha, int projType)
 {
-    static int avgiter = 0, avgn = 0;
-
    double eps = 1e-12;
    Vec2d p(_p.x, _p.y), q(_p.x, _p.y), err;
    Vec4d J;
@ -502,14 +500,6 @@ static Point2f invMapPointSpherical(Point2f _p, float alpha, int projType)
        //q -= Vec2d((J.t()*J).inv()*(J.t()*err));
    }

-    if( i < maxiter )
-    {
-        avgiter += i;
-        avgn++;
-        if( avgn == 1500 )
-            printf("avg iters = %g\n", (double)avgiter/avgn);
-    }
-
    return i < maxiter ? Point2f((float)q[0], (float)q[1]) : Point2f(-FLT_MAX, -FLT_MAX);
 }

--- a/modules/imgproc/test/test_imgwarp.cpp
+++ b/modules/imgproc/test/test_imgwarp.cpp
@ -1686,22 +1686,35 @@ TEST(Resize, Area_half)

 TEST(Imgproc_Warp, multichannel)
 {
+    static const int inter_types[] = {INTER_NEAREST, INTER_AREA, INTER_CUBIC,
+                                      INTER_LANCZOS4, INTER_LINEAR};
+    static const int inter_n = sizeof(inter_types) / sizeof(int);  // number of entries in inter_types
+
+    static const int border_types[] = {BORDER_CONSTANT, BORDER_DEFAULT,
+                                       BORDER_REFLECT, BORDER_REPLICATE,
+                                       BORDER_WRAP, BORDER_WRAP};  // NOTE(review): BORDER_WRAP is listed twice, so it is drawn twice as often - confirm intent
+    static const int border_n = sizeof(border_types) / sizeof(int);  // number of entries in border_types
+
    RNG& rng = theRNG();
-    for( int iter = 0; iter < 30; iter++ )
+    for( int iter = 0; iter < 100; iter++ )
    {
+        int inter = inter_types[rng.uniform(0, inter_n)];  // random interpolation mode for this iteration
+        int border = border_types[rng.uniform(0, border_n)];  // random border mode for this iteration
        int width = rng.uniform(3, 333);
        int height = rng.uniform(3, 333);
-        int cn = rng.uniform(1, 10);
+        int cn = rng.uniform(1, 15);
+        if(inter == INTER_CUBIC || inter == INTER_LANCZOS4)
+            cn = rng.uniform(1, 5);  // 1..4 channels: per this commit's description, bicubic/Lanczos4 warps assert on more than 4
        Mat src(height, width, CV_8UC(cn)), dst;
        //randu(src, 0, 256);
        src.setTo(0.);

-        Mat rot = getRotationMatrix2D(Point2f(0.f, 0.f), 1, 1);
-        warpAffine(src, dst, rot, src.size());
+        Mat rot = getRotationMatrix2D(Point2f(0.f, 0.f), 1.0, 1.0);
+        warpAffine(src, dst, rot, src.size(), inter, border);
        ASSERT_EQ(0.0, norm(dst, NORM_INF));
        Mat rot2 = Mat::eye(3, 3, rot.type());
        rot.copyTo(rot2.rowRange(0, 2));
-        warpPerspective(src, dst, rot2, src.size());
+        warpPerspective(src, dst, rot2, src.size(), inter, border);
        ASSERT_EQ(0.0, norm(dst, NORM_INF));
    }
 }
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@ -14,7 +14,8 @@ class_ignore_list = (
    #core
    "FileNode", "FileStorage", "KDTree", "KeyPoint", "DMatch",
    #features2d
-    "SimpleBlobDetector"
+    "SimpleBlobDetector",
+    "CirclesGridFinderParameters"
 )

 const_ignore_list = (
@ -862,10 +863,13 @@ class ClassInfo(GeneralInfo):
        self.j_code = StringIO()
        self.jn_code = StringIO()
        self.cpp_code = StringIO();
-        if self.name != Module:
-            self.j_code.write(T_JAVA_START_INHERITED if self.base else T_JAVA_START_ORPHAN)
+        if self.base:
+            self.j_code.write(T_JAVA_START_INHERITED)
        else:
-            self.j_code.write(T_JAVA_START_MODULE)
+            if self.name != Module:
+                self.j_code.write(T_JAVA_START_ORPHAN)
+            else:
+                self.j_code.write(T_JAVA_START_MODULE)
        # misc handling
        if self.name == 'Core':
            self.imports.add("java.lang.String")
@ -962,11 +966,11 @@ class JavaWrapperGenerator(object):
            logging.info('ignored: %s', classinfo)
            return
        name = classinfo.name
-        if self.isWrapped(name):
+        if self.isWrapped(name) and not classinfo.base:
            logging.warning('duplicated: %s', classinfo)
            return
        self.classes[name] = classinfo
-        if name in type_dict:
+        if name in type_dict and not classinfo.base:
            logging.warning('duplicated: %s', classinfo)
            return
        type_dict[name] = \
@ -1520,7 +1524,7 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname
                ci.jn_code.write( ManualFuncs[ci.name][func]["jn_code"] )
                ci.cpp_code.write( ManualFuncs[ci.name][func]["cpp_code"] )

-        if ci.name != self.Module:
+        if ci.name != self.Module or ci.base:
            # finalize()
            ci.j_code.write(
 """
--- a/modules/java/generator/src/java/android+AsyncServiceHelper.java
+++ b/modules/java/generator/src/java/android+AsyncServiceHelper.java
@ -131,7 +131,7 @@ class AsyncServiceHelper
                }
                public void cancel()
                {
-                    Log.d(TAG, "Wating for OpenCV canceled by user");
+                    Log.d(TAG, "Waiting for OpenCV canceled by user");
                    mServiceInstallationProgress = false;
                    int Status = LoaderCallbackInterface.INSTALL_CANCELED;
                    Log.d(TAG, "Init finished with status " + Status);
@ -197,7 +197,7 @@ class AsyncServiceHelper
                                        if (mEngineService.installVersion(mOpenCVersion))
                                        {
                                            mLibraryInstallationProgress = true;
-                                            Log.d(TAG, "Package installation statred");
+                                            Log.d(TAG, "Package installation started");
                                            Log.d(TAG, "Unbind from service");
                                            mAppContext.unbindService(mServiceConnection);
                                        }
@ -228,7 +228,7 @@ class AsyncServiceHelper
                                    mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INSTALL_CANCELED);
                                }
                                public void wait_install() {
-                                    Log.e(TAG, "Instalation was not started! Nothing to wait!");
+                                    Log.e(TAG, "Installation was not started! Nothing to wait!");
                                }
                            };

--- a/modules/java/generator/src/java/android+BaseLoaderCallback.java
+++ b/modules/java/generator/src/java/android+BaseLoaderCallback.java
@ -43,7 +43,7 @@ public abstract class BaseLoaderCallback implements LoaderCallbackInterface {
            /** Package installation has been canceled. **/
            case LoaderCallbackInterface.INSTALL_CANCELED:
            {
-                Log.d(TAG, "OpenCV library instalation was canceled by user");
+                Log.d(TAG, "OpenCV library installation was canceled by user");
                finish();
            } break;
            /** Application is incompatible with this version of OpenCV Manager. Possibly, a service update is required. **/
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@ -38,7 +38,7 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
    private Bitmap mCacheBitmap;
    private CvCameraViewListener2 mListener;
    private boolean mSurfaceExist;
-    private Object mSyncObject = new Object();
+    private final Object mSyncObject = new Object();

    protected int mFrameWidth;
    protected int mFrameHeight;
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@ -275,7 +275,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
            synchronized (this) {
                this.notify();
            }
-            Log.d(TAG, "Wating for thread");
+            Log.d(TAG, "Waiting for thread");
            if (mThread != null)
                mThread.join();
        } catch (InterruptedException e) {
--- a/modules/java/pure_test/CMakeLists.txt
+++ b/modules/java/pure_test/CMakeLists.txt
@ -1,4 +1,6 @@
-if(NOT ANT_EXECUTABLE)
+if(NOT ANT_EXECUTABLE
+  OR NOT BUILD_opencv_imgcodecs
+  OR NOT BUILD_opencv_calib3d)
  return()
 endif()

--- a/modules/ml/include/opencv2/ml.hpp
+++ b/modules/ml/include/opencv2/ml.hpp
@ -104,7 +104,7 @@ enum SampleTypes
 It is used for optimizing statmodel accuracy by varying model parameters, the accuracy estimate
 being computed by cross-validation.
 */
-class CV_EXPORTS ParamGrid
+class CV_EXPORTS_W ParamGrid
 {
 public:
    /** @brief Default constructor */
@ -112,8 +112,8 @@ public:
    /** @brief Constructor with parameters */
    ParamGrid(double _minVal, double _maxVal, double _logStep);

-    double minVal; //!< Minimum value of the statmodel parameter. Default value is 0.
-    double maxVal; //!< Maximum value of the statmodel parameter. Default value is 0.
+    CV_PROP_RW double minVal; //!< Minimum value of the statmodel parameter. Default value is 0.
+    CV_PROP_RW double maxVal; //!< Maximum value of the statmodel parameter. Default value is 0.
    /** @brief Logarithmic step for iterating the statmodel parameter.

    The grid determines the following iteration sequence of the statmodel parameter values:
@ -122,7 +122,15 @@ public:
    \f[\texttt{minVal} * \texttt{logStep} ^n <  \texttt{maxVal}\f]
    The grid is logarithmic, so logStep must always be greater then 1. Default value is 1.
    */
-    double logStep;
+    CV_PROP_RW double logStep;
+
+    /** @brief Creates a ParamGrid Ptr that can be given to the %SVM::trainAuto method
+
+    @param minVal minimum value of the parameter grid
+    @param maxVal maximum value of the parameter grid
+    @param logstep Logarithmic step for iterating the statmodel parameter
+    */
+    CV_WRAP static Ptr<ParamGrid> create(double minVal=0., double maxVal=0., double logstep=1.);
 };

 /** @brief Class encapsulating training data.
@ -683,14 +691,54 @@ public:
    the usual %SVM with parameters specified in params is executed.
     */
    virtual bool trainAuto( const Ptr<TrainData>& data, int kFold = 10,
-                    ParamGrid Cgrid = SVM::getDefaultGrid(SVM::C),
-                    ParamGrid gammaGrid  = SVM::getDefaultGrid(SVM::GAMMA),
-                    ParamGrid pGrid      = SVM::getDefaultGrid(SVM::P),
-                    ParamGrid nuGrid     = SVM::getDefaultGrid(SVM::NU),
-                    ParamGrid coeffGrid  = SVM::getDefaultGrid(SVM::COEF),
-                    ParamGrid degreeGrid = SVM::getDefaultGrid(SVM::DEGREE),
+                    ParamGrid Cgrid = getDefaultGrid(C),
+                    ParamGrid gammaGrid  = getDefaultGrid(GAMMA),
+                    ParamGrid pGrid      = getDefaultGrid(P),
+                    ParamGrid nuGrid     = getDefaultGrid(NU),
+                    ParamGrid coeffGrid  = getDefaultGrid(COEF),
+                    ParamGrid degreeGrid = getDefaultGrid(DEGREE),
                    bool balanced=false) = 0;

+    /** @brief Trains an %SVM with optimal parameters
+
+    @param samples training samples
+    @param layout See ml::SampleTypes.
+    @param responses vector of responses associated with the training samples.
+    @param kFold Cross-validation parameter. The training set is divided into kFold subsets. One
+        subset is used to test the model, the others form the train set. So, the %SVM algorithm is executed kFold times.
+    @param Cgrid grid for C
+    @param gammaGrid grid for gamma
+    @param pGrid grid for p
+    @param nuGrid grid for nu
+    @param coeffGrid grid for coeff
+    @param degreeGrid grid for degree
+    @param balanced If true and the problem is 2-class classification then the method creates more
+        balanced cross-validation subsets that is proportions between classes in subsets are close
+        to such proportion in the whole train dataset.
+
+    The method trains the %SVM model automatically by choosing the optimal parameters C, gamma, p,
+    nu, coef0, degree. Parameters are considered optimal when the cross-validation
+    estimate of the test set error is minimal.
+
+    This function only makes use of SVM::getDefaultGrid for parameter optimization and thus only
+    offers rudimentary parameter options.
+
+    This function works for the classification (SVM::C_SVC or SVM::NU_SVC) as well as for the
+    regression (SVM::EPS_SVR or SVM::NU_SVR). If it is SVM::ONE_CLASS, no optimization is made and
+    the usual %SVM with parameters specified in params is executed.
+    */
+    CV_WRAP bool trainAuto(InputArray samples,
+            int layout,
+            InputArray responses,
+            int kFold = 10,
+            Ptr<ParamGrid> Cgrid = SVM::getDefaultGridPtr(SVM::C),
+            Ptr<ParamGrid> gammaGrid  = SVM::getDefaultGridPtr(SVM::GAMMA),
+            Ptr<ParamGrid> pGrid      = SVM::getDefaultGridPtr(SVM::P),
+            Ptr<ParamGrid> nuGrid     = SVM::getDefaultGridPtr(SVM::NU),
+            Ptr<ParamGrid> coeffGrid  = SVM::getDefaultGridPtr(SVM::COEF),
+            Ptr<ParamGrid> degreeGrid = SVM::getDefaultGridPtr(SVM::DEGREE),
+            bool balanced=false);
+
    /** @brief Retrieves all the support vectors

    The method returns all the support vectors as a floating-point matrix, where support vectors are
@ -733,6 +781,16 @@ public:
     */
    static ParamGrid getDefaultGrid( int param_id );

+    /** @brief Generates a grid for %SVM parameters.
+
+    @param param_id %SVM parameter ID; it must be one of the SVM::ParamTypes values. The grid is
+    generated for the parameter with this ID.
+
+    The function generates a grid pointer for the specified parameter of the %SVM algorithm.
+    The grid may be passed to the function SVM::trainAuto.
+     */
+    CV_WRAP static Ptr<ParamGrid> getDefaultGridPtr( int param_id );
+
    /** Creates empty model.
    Use StatModel::train to train the model. Since %SVM has several parameters, you may want to
    find the best parameters for your problem, it can be done with SVM::trainAuto. */
--- a/modules/ml/src/inner_functions.cpp
+++ b/modules/ml/src/inner_functions.cpp
@ -50,6 +50,10 @@ ParamGrid::ParamGrid(double _minVal, double _maxVal, double _logStep)
    logStep = std::max(_logStep, 1.);
 }

+Ptr<ParamGrid> ParamGrid::create(double minval, double maxval, double logstep) {
+  return makePtr<ParamGrid>(minval, maxval, logstep);
+}
+
 bool StatModel::empty() const { return !isTrained(); }

 int StatModel::getVarCount() const { return 0; }
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@ -362,6 +362,12 @@ static void sortSamplesByClasses( const Mat& _samples, const Mat& _responses,

 //////////////////////// SVM implementation //////////////////////////////

+Ptr<ParamGrid> SVM::getDefaultGridPtr( int param_id)
+{
+  ParamGrid grid = getDefaultGrid(param_id); // this is not a nice solution..
+  return makePtr<ParamGrid>(grid.minVal, grid.maxVal, grid.logStep);
+}
+
 ParamGrid SVM::getDefaultGrid( int param_id )
 {
    ParamGrid grid;
@ -1920,6 +1926,24 @@ public:
        bool returnDFVal;
    };

+    bool trainAuto_(InputArray samples, int layout,
+            InputArray responses, int kfold, Ptr<ParamGrid> Cgrid,
+            Ptr<ParamGrid> gammaGrid, Ptr<ParamGrid> pGrid, Ptr<ParamGrid> nuGrid,
+            Ptr<ParamGrid> coeffGrid, Ptr<ParamGrid> degreeGrid, bool balanced)
+    {
+        Ptr<TrainData> data = TrainData::create(samples, layout, responses);
+        return this->trainAuto(
+                data, kfold,
+                *Cgrid.get(),
+                *gammaGrid.get(),
+                *pGrid.get(),
+                *nuGrid.get(),
+                *coeffGrid.get(),
+                *degreeGrid.get(),
+                balanced);
+    }
+
+
    float predict( InputArray _samples, OutputArray _results, int flags ) const
    {
        float result = 0;
@ -2281,6 +2305,19 @@ Mat SVM::getUncompressedSupportVectors() const
    return this_->getUncompressedSupportVectors_();
 }

+bool SVM::trainAuto(InputArray samples, int layout,
+            InputArray responses, int kfold, Ptr<ParamGrid> Cgrid,
+            Ptr<ParamGrid> gammaGrid, Ptr<ParamGrid> pGrid, Ptr<ParamGrid> nuGrid,
+            Ptr<ParamGrid> coeffGrid, Ptr<ParamGrid> degreeGrid, bool balanced)
+{
+  SVMImpl* this_ = dynamic_cast<SVMImpl*>(this);
+  if (!this_) {
+    CV_Error(Error::StsNotImplemented, "the class is not SVMImpl");
+  }
+  return this_->trainAuto_(samples, layout, responses,
+    kfold, Cgrid, gammaGrid, pGrid, nuGrid, coeffGrid, degreeGrid, balanced);
+}
+
 }
 }

--- a/modules/objdetect/CMakeLists.txt
+++ b/modules/objdetect/CMakeLists.txt
@ -1,2 +1,2 @@
 set(the_description "Object Detection")
-ocv_define_module(objdetect opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_highgui WRAP java python)
+ocv_define_module(objdetect opencv_core opencv_imgproc WRAP java python)
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@ -824,10 +824,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
                               CvPoint pt, double& stage_sum, int start_stage )
 {
 #ifdef CV_HAAR_USE_AVX
-    bool haveAVX = false;
-    if(cv::checkHardwareSupport(CV_CPU_AVX))
-    if(__xgetbv()&0x6)// Check if the OS will save the YMM registers
-       haveAVX = true;
+    bool haveAVX = cv::checkHardwareSupport(CV_CPU_AVX);
 #else
 #  ifdef CV_HAAR_USE_SSE
    bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
--- a/modules/objdetect/src/precomp.hpp
+++ b/modules/objdetect/src/precomp.hpp
@ -46,16 +46,8 @@
 #include "opencv2/objdetect.hpp"
 #include "opencv2/imgproc.hpp"

-#include "opencv2/ml.hpp"
-
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/ocl.hpp"
-
-#include "opencv2/opencv_modules.hpp"
-#ifdef HAVE_OPENCV_HIGHGUI
-#  include "opencv2/highgui.hpp"
-#endif
-
 #include "opencv2/core/private.hpp"

 #ifdef HAVE_TEGRA_OPTIMIZATION
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@ -798,6 +798,21 @@ PyObject* pyopencv_from(const Size& sz)
    return Py_BuildValue("(ii)", sz.width, sz.height);
 }

+template<>
+bool pyopencv_to(PyObject* obj, Size_<float>& sz, const char* name)
+{
+    (void)name;
+    if(!obj || obj == Py_None)
+        return true;
+    return PyArg_ParseTuple(obj, "ff", &sz.width, &sz.height) > 0;
+}
+
+template<>
+PyObject* pyopencv_from(const Size_<float>& sz)
+{
+    return Py_BuildValue("(ff)", sz.width, sz.height);
+}
+
 template<>
 bool pyopencv_to(PyObject* obj, Rect& r, const char* name)
 {
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@ -8,6 +8,6 @@ set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
 if(BUILD_SHARED_LIBS AND BUILD_opencv_world AND OPENCV_WORLD_EXCLUDE_EXTRA_MODULES)
  set(STITCHING_CONTRIB_DEPS "")
 endif()
-ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect
+ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d
                  OPTIONAL opencv_cudaarithm opencv_cudafilters opencv_cudafeatures2d opencv_cudalegacy ${STITCHING_CONTRIB_DEPS}
                  WRAP python)
--- a/modules/stitching/perf/perf_stich.cpp
+++ b/modules/stitching/perf/perf_stich.cpp
@ -2,6 +2,8 @@
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/opencv_modules.hpp"

+#include "opencv2/core/ocl.hpp"
+
 using namespace std;
 using namespace cv;
 using namespace perf;
@ -161,6 +163,9 @@ PERF_TEST_P(stitchDatasets, affine, testing::Combine(AFFINE_DATASETS, TEST_DETEC
        Ptr<Stitcher> stitcher = Stitcher::create(Stitcher::SCANS, false);
        stitcher->setFeaturesFinder(featuresFinder);

+        if (cv::ocl::useOpenCL())
+            cv::theRNG() = cv::RNG(12345); // prevent fails of Windows OpenCL builds (see #8294)
+
        startTimer();
        stitcher->stitch(imgs, pano);
        stopTimer();
--- a/Show More
+++ b/Show More