diff --git a/cmake/OpenCVCompilerDefenses.cmake b/cmake/OpenCVCompilerDefenses.cmake
index c4563ff0c1..62029ea38b 100644
--- a/cmake/OpenCVCompilerDefenses.cmake
+++ b/cmake/OpenCVCompilerDefenses.cmake
@@ -5,7 +5,7 @@
 # - https://wiki.debian.org/Hardening
 # - https://wiki.gentoo.org/wiki/Hardened/Toolchain
 # - https://docs.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks
-
+# - https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/BufferOverflows.html
 
 set(OPENCV_LINKER_DEFENSES_FLAGS_COMMON "")
 
@@ -44,6 +44,12 @@ if(MSVC)
   if(NOT X86_64)
     set(OPENCV_LINKER_DEFENSES_FLAGS_COMMON "${OPENCV_LINKER_DEFENSES_FLAGS_COMMON} /safeseh")
   endif()
+elseif(CV_CLANG)
+  ocv_add_defense_compiler_flag("-fstack-protector-strong")
+  ocv_add_defense_compiler_flag_release("-D_FORTIFY_SOURCE=2")
+  if(NOT APPLE)  # the "-z" linker options below are not added on Apple platforms
+    set(OPENCV_LINKER_DEFENSES_FLAGS_COMMON "${OPENCV_LINKER_DEFENSES_FLAGS_COMMON} -z noexecstack -z relro -z now")
+  endif()
 elseif(CV_GCC)
   if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
     ocv_add_defense_compiler_flag("-fstack-protector")
diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index f82159ea3a..8247a9a53c 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -4,6 +4,15 @@
 # SSE4_1 / SSE4_2 / POPCNT
 # AVX / AVX2 / AVX_512F
 # FMA3
+#
+# CPU features groups:
+# AVX512_COMMON (Common instructions AVX-512F/CD for all CPUs that support AVX-512)
+# AVX512_KNL (Knights Landing with AVX-512F/CD/ER/PF)
+# AVX512_KNM (Knights Mill with AVX-512F/CD/ER/PF/4FMAPS/4VNNIW/VPOPCNTDQ)
+# AVX512_SKX (Skylake-X with AVX-512F/CD/BW/DQ/VL)
+# AVX512_CNL (Cannon Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI)
+# AVX512_CEL (Cascade Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI)
+# AVX512_ICL (Ice Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI/VBMI2/BITALG/VPOPCNTDQ/VPCLMULQDQ*/GFNI*/VAES*)
 
 # ppc64le arch:
 # VSX  (always available on Power8)
@@ -33,7 +42,8 @@
 #
 # CPU_{opt}_ENABLED_DEFAULT=ON/OFF - has compiler support without additional flag (CPU_BASELINE_DETECT=ON only)
 
-set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_SKX")
+set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
+list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CEL;AVX512_ICL")
 list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
@@ -152,9 +162,15 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
 endif()
 
 if(X86 OR X86_64)
-  ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_SKX")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CEL;AVX512_ICL")
 
-  ocv_update(CPU_AVX512_SKX_GROUP "AVX_512F;AVX_512CD;AVX_512BW;AVX_512DQ;AVX_512VL")
+  ocv_update(CPU_AVX512_COMMON_GROUP "AVX_512F;AVX_512CD")
+  ocv_update(CPU_AVX512_KNL_GROUP "AVX512_COMMON;AVX512_KNL_EXTRA")
+  ocv_update(CPU_AVX512_KNM_GROUP "AVX512_KNL;AVX512_KNM_EXTRA;AVX_512VPOPCNTDQ")
+  ocv_update(CPU_AVX512_SKX_GROUP "AVX512_COMMON;AVX_512VL;AVX_512BW;AVX_512DQ")
+  ocv_update(CPU_AVX512_CNL_GROUP "AVX512_SKX;AVX_512IFMA;AVX_512VBMI")
+  ocv_update(CPU_AVX512_CEL_GROUP "AVX512_CNL;AVX_512VNNI")
+  ocv_update(CPU_AVX512_ICL_GROUP "AVX512_CEL;AVX_512VBMI2;AVX_512BITALG;AVX_512VPOPCNTDQ") # ? VPCLMULQDQ, GFNI, VAES
 
   ocv_update(CPU_SSE_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
   ocv_update(CPU_SSE2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
@@ -167,9 +183,22 @@ if(X86 OR X86_64)
   ocv_update(CPU_AVX2_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
   ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
   ocv_update(CPU_AVX_512F_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512.cpp")
+  ocv_update(CPU_AVX512_COMMON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512common.cpp")
+  ocv_update(CPU_AVX512_KNL_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512knl.cpp")
+  ocv_update(CPU_AVX512_KNM_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512knm.cpp")
   ocv_update(CPU_AVX512_SKX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512skx.cpp")
+  ocv_update(CPU_AVX512_CNL_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512cnl.cpp")
+  ocv_update(CPU_AVX512_CEL_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512cel.cpp")
+  ocv_update(CPU_AVX512_ICL_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_avx512icl.cpp")
 
   if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE)
+    ocv_update(CPU_AVX512_ICL_IMPLIES "AVX512_CEL")
+    ocv_update(CPU_AVX512_CEL_IMPLIES "AVX512_CNL")
+    ocv_update(CPU_AVX512_CNL_IMPLIES "AVX512_SKX")
+    ocv_update(CPU_AVX512_SKX_IMPLIES "AVX512_COMMON")
+    ocv_update(CPU_AVX512_KNM_IMPLIES "AVX512_KNL")
+    ocv_update(CPU_AVX512_KNL_IMPLIES "AVX512_COMMON")
+    ocv_update(CPU_AVX512_COMMON_IMPLIES "AVX_512F")
     ocv_update(CPU_AVX_512F_IMPLIES "AVX2")
     ocv_update(CPU_AVX_512F_FORCE "") # Don't force other optimizations
     ocv_update(CPU_AVX2_IMPLIES "AVX;FMA3;FP16")
@@ -192,10 +221,10 @@ if(X86 OR X86_64)
       ocv_update(CPU_${name}_FLAGS_NAME "${name}")
       if(MSVC)
         set(enable_flags "${msvc_flags}")
-        set(flags_conflict "/arch:[^ ]+")
+        set(flags_conflict "/arch:[^ ]*|/Qx:[^ ]+")
       else()
         set(enable_flags "${unix_flags}")
-        set(flags_conflict "-msse[^ ]*|-mssse3|-mavx[^ ]*|-march[^ ]+")
+        set(flags_conflict "-msse[^ ]*|-mssse3|-mavx[^ ]*|-march[^ ]*|-x[^ ]+")
       endif()
       ocv_update(CPU_${name}_FLAGS_ON "${enable_flags}")
       if(flags_conflict)
@@ -215,8 +244,14 @@ if(X86 OR X86_64)
     if(NOT X86_64) # x64 compiler doesn't support /arch:sse
       ocv_intel_compiler_optimization_option(SSE "-msse" "/arch:SSE")
     endif()
-    ocv_intel_compiler_optimization_option(AVX_512F "-march=common-avx512" "/arch:COMMON-AVX512")
-    ocv_intel_compiler_optimization_option(AVX512_SKX "-march=core-avx512" "/arch:CORE-AVX512")
+    ocv_intel_compiler_optimization_option(AVX_512F "-xCOMMON-AVX512" "/Qx:COMMON-AVX512")
+    ocv_intel_compiler_optimization_option(AVX512_COMMON "-xCOMMON-AVX512" "/Qx:COMMON-AVX512")
+    ocv_intel_compiler_optimization_option(AVX512_KNL "-xKNL" "/Qx:KNL")
+    ocv_intel_compiler_optimization_option(AVX512_KNM "-xKNM" "/Qx:KNM")
+    ocv_intel_compiler_optimization_option(AVX512_SKX "-xSKYLAKE-AVX512" "/Qx:SKYLAKE-AVX512")
+    ocv_intel_compiler_optimization_option(AVX512_CNL "-xCANNONLAKE" "/Qx:CANNONLAKE")
+    ocv_intel_compiler_optimization_option(AVX512_CEL "-xCASCADELAKE" "/Qx:CASCADELAKE")
+    ocv_intel_compiler_optimization_option(AVX512_ICL "-xICELAKE-CLIENT" "/Qx:ICELAKE-CLIENT")
   elseif(CV_GCC OR CV_CLANG)
     ocv_update(CPU_AVX2_FLAGS_ON "-mavx2")
     ocv_update(CPU_FP16_FLAGS_ON "-mf16c")
@@ -230,12 +265,21 @@ if(X86 OR X86_64)
     ocv_update(CPU_SSE2_FLAGS_ON "-msse2")
     ocv_update(CPU_SSE_FLAGS_ON "-msse")
     if(NOT (CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0"))  # GCC >= 5.0
-      # -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl -mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi
       ocv_update(CPU_AVX_512F_FLAGS_ON "-mavx512f")
-      ocv_update(CPU_AVX512_SKX_FLAGS_ON "-mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq")
+      ocv_update(CPU_AVX_512CD_FLAGS_ON "-mavx512cd")
+      ocv_update(CPU_AVX512_KNL_EXTRA_FLAGS_ON "-mavx512er -mavx512pf")
+      ocv_update(CPU_AVX512_KNM_EXTRA_FLAGS_ON "-mavx5124fmaps -mavx5124vnniw")
+      ocv_update(CPU_AVX_512BW_FLAGS_ON "-mavx512bw")
+      ocv_update(CPU_AVX_512DQ_FLAGS_ON "-mavx512dq")
+      ocv_update(CPU_AVX_512VL_FLAGS_ON "-mavx512vl")
+      ocv_update(CPU_AVX_512IFMA_FLAGS_ON "-mavx512ifma")
+      ocv_update(CPU_AVX_512VBMI_FLAGS_ON "-mavx512vbmi")
+      ocv_update(CPU_AVX_512VNNI_FLAGS_ON "-mavx512vnni")
+      ocv_update(CPU_AVX_512VBMI2_FLAGS_ON "-mavx512vbmi2")
+      ocv_update(CPU_AVX_512BITALG_FLAGS_ON "-mavx512bitalg")
+      ocv_update(CPU_AVX_512VPOPCNTDQ_FLAGS_ON "-mavx512vpopcntdq")
     else()
       ocv_update(CPU_AVX_512F_SUPPORTED OFF)
-      ocv_update(CPU_AVX512_SKX_SUPPORTED OFF)
     endif()
   elseif(MSVC)
     ocv_update(CPU_AVX2_FLAGS_ON "/arch:AVX2")
@@ -336,6 +380,10 @@ if(CV_DISABLE_OPTIMIZATION)
   set(CPU_DISPATCH_REQUIRE "")
 endif()
 
+if("x${CPU_DISPATCH}" STREQUAL "xALL")
+  set(CPU_DISPATCH "${CPU_KNOWN_OPTIMIZATIONS}")
+endif()
+
 macro(ocv_check_compiler_optimization OPT)
   if(NOT DEFINED CPU_${OPT}_SUPPORTED)
     if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE)
@@ -408,6 +456,7 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
   if(NOT DEFINED CPU_${OPT}_FORCE)
     set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}")
   endif()
+  #message("${OPT}: CPU_${OPT}_FLAGS_ON=${CPU_${OPT}_FLAGS_ON}")
 endforeach()
 
 if(_add_native_flag)
@@ -809,19 +858,19 @@ macro(__ocv_add_dispatched_file filename target_src_var src_directory dst_direct
         file(WRITE "${__file}" "${__codestr}")
       endif()
 
-      if(";${CPU_DISPATCH};" MATCHES "${OPT}" OR __CPU_DISPATCH_INCLUDE_ALL)
+      if(";${CPU_DISPATCH_FINAL};" MATCHES ";${OPT};" OR __CPU_DISPATCH_INCLUDE_ALL)  # ';' anchors: AVX must not match AVX2
         if(EXISTS "${src_directory}/${filename}.${OPT_LOWER}.cpp")
           message(STATUS "Using overrided ${OPT} source: ${src_directory}/${filename}.${OPT_LOWER}.cpp")
         else()
           list(APPEND ${target_src_var} "${__file}")
         endif()
-      endif()
 
-      set(__declarations_str "${__declarations_str}
+        set(__declarations_str "${__declarations_str}
 #define CV_CPU_DISPATCH_MODE ${OPT}
 #include \"opencv2/core/private/cv_cpu_include_simd_declarations.hpp\"
 ")
-      set(__dispatch_modes "${OPT}, ${__dispatch_modes}")
+        set(__dispatch_modes "${OPT}, ${__dispatch_modes}")
+      endif()
     endforeach()
 
     set(__declarations_str "${__declarations_str}
diff --git a/cmake/checks/cpu_avx512cel.cpp b/cmake/checks/cpu_avx512cel.cpp
new file mode 100644
index 0000000000..e372cf9a45
--- /dev/null
+++ b/cmake/checks/cpu_avx512cel.cpp
@@ -0,0 +1,11 @@
+#if defined __AVX512__ || defined __AVX512F__
+#include <immintrin.h>
+void test()
+{
+    __m512i a, b, c;
+    a = _mm512_dpwssd_epi32(a, b, c);
+}
+#else
+#error "AVX512-CEL is not supported"
+#endif
+int main() { return 0; }
\ No newline at end of file
diff --git a/cmake/checks/cpu_avx512cnl.cpp b/cmake/checks/cpu_avx512cnl.cpp
new file mode 100644
index 0000000000..480a312fe5
--- /dev/null
+++ b/cmake/checks/cpu_avx512cnl.cpp
@@ -0,0 +1,12 @@
+#if defined __AVX512__ || defined __AVX512F__
+#include <immintrin.h>
+void test()
+{
+    __m512i a, b, c;
+    a = _mm512_madd52hi_epu64(a, b, c);
+    a = _mm512_permutexvar_epi8(a, b);
+}
+#else
+#error "AVX512-CNL is not supported"
+#endif
+int main() { return 0; }
\ No newline at end of file
diff --git a/cmake/checks/cpu_avx512common.cpp b/cmake/checks/cpu_avx512common.cpp
new file mode 100644
index 0000000000..1754a95399
--- /dev/null
+++ b/cmake/checks/cpu_avx512common.cpp
@@ -0,0 +1,14 @@
+#if defined __AVX512__ || defined __AVX512F__
+#include <immintrin.h>
+void test()
+{
+    __m512i zmm = _mm512_setzero_si512();
+    zmm = _mm512_lzcnt_epi32(zmm);
+#if defined __GNUC__ && defined __x86_64__
+    asm volatile ("" : : : "zmm16", "zmm17", "zmm18", "zmm19");
+#endif
+}
+#else
+#error "AVX512-COMMON is not supported"
+#endif
+int main() { return 0; }
diff --git a/cmake/checks/cpu_avx512icl.cpp b/cmake/checks/cpu_avx512icl.cpp
new file mode 100644
index 0000000000..a67f5f35d4
--- /dev/null
+++ b/cmake/checks/cpu_avx512icl.cpp
@@ -0,0 +1,13 @@
+#if defined __AVX512__ || defined __AVX512F__
+#include <immintrin.h>
+void test()
+{
+    __m512i a, b, c;
+    a = _mm512_popcnt_epi8(a);
+    a = _mm512_shrdv_epi64(a, b, c);
+    a = _mm512_popcnt_epi64(a);
+}
+#else
+#error "AVX512-ICL is not supported"
+#endif
+int main() { return 0; }
\ No newline at end of file
diff --git a/cmake/checks/cpu_avx512knl.cpp b/cmake/checks/cpu_avx512knl.cpp
new file mode 100644
index 0000000000..f0eaa646e3
--- /dev/null
+++ b/cmake/checks/cpu_avx512knl.cpp
@@ -0,0 +1,16 @@
+#if defined __AVX512__ || defined __AVX512F__
+#include <immintrin.h>
+
+void test()
+{
+    int* base;
+    __m512i idx;
+    __mmask16 m16;
+    __m512 f;
+    _mm512_mask_prefetch_i32gather_ps(idx, m16, base, 1, _MM_HINT_T1);
+    f = _mm512_rsqrt28_ps(f);
+}
+#else
+#error "AVX512-KNL is not supported"
+#endif
+int main() { return 0; }
\ No newline at end of file
diff --git a/cmake/checks/cpu_avx512knm.cpp b/cmake/checks/cpu_avx512knm.cpp
new file mode 100644
index 0000000000..18b2bf53e8
--- /dev/null
+++ b/cmake/checks/cpu_avx512knm.cpp
@@ -0,0 +1,17 @@
+#if defined __AVX512__ || defined __AVX512F__
+#include <immintrin.h>
+void test()
+{
+    __m512 a, b, c, d, e;
+    __m512i ai, bi, ci, di, ei, fi;
+    __m128  *mem;
+    __m128i *memi;
+    __mmask16 m;
+    a  = _mm512_4fnmadd_ps(a, b, c, d, e, mem);
+    ai = _mm512_4dpwssd_epi32(ai, bi, ci, di, ei, memi);
+    ai = _mm512_popcnt_epi64(ai);
+}
+#else
+#error "AVX512-KNM is not supported"
+#endif
+int main() { return 0; }
\ No newline at end of file
diff --git a/doc/opencv.bib b/doc/opencv.bib
index e2af456532..fd1b60dfd1 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -195,6 +195,21 @@
   volume = {9},
   publisher = {Walter de Gruyter}
 }
+@article{Chaumette06,
+  author = {Chaumette, Fran{\c c}ois and Hutchinson, S.},
+  title = {{Visual servo control, Part I: Basic approaches}},
+  url = {https://hal.inria.fr/inria-00350283},
+  journal = {{IEEE Robotics and Automation Magazine}},
+  publisher = {{Institute of Electrical and Electronics Engineers}},
+  volume = {13},
+  number = {4},
+  pages = {82-90},
+  year = {2006},
+  pdf = {https://hal.inria.fr/inria-00350283/file/2006_ieee_ram_chaumette.pdf},
+  hal_id = {inria-00350283},
+  hal_version = {v1},
+}
+
 @article{Daniilidis98,
   author = {Konstantinos Daniilidis},
   title = {Hand-Eye Calibration Using Dual Quaternions},
@@ -242,6 +257,12 @@
   publisher = {IEEE},
   url = {http://alumni.media.mit.edu/~jdavis/Publications/publications_402.pdf}
 }
+@misc{Eade13,
+  author = {Eade, Ethan},
+  title = {Gauss-Newton / Levenberg-Marquardt Optimization},
+  year = {2013},
+  url = {http://ethaneade.com/optimization.pdf}
+}
 @inproceedings{EM11,
   author = {Gastal, Eduardo SL and Oliveira, Manuel M},
   title = {Domain transform for edge-aware image and video processing},
@@ -596,10 +617,14 @@
   title = {ROF and TV-L1 denoising with Primal-Dual algorithm},
   url = {http://znah.net/rof-and-tv-l1-denoising-with-primal-dual-algorithm.html}
 }
-@misc{VandLec,
-  author = {Vandenberghe, Lieven},
-  title = {QR Factorization},
-  url = {http://www.seas.ucla.edu/~vandenbe/133A/lectures/qr.pdf}
+@misc{Madsen04,
+  author = {K. Madsen and H. B. Nielsen and O. Tingleff},
+  title = {Methods for Non-Linear Least Squares Problems (2nd ed.)},
+  year = {2004},
+  pages = {60},
+  publisher = {Informatics and Mathematical Modelling, Technical University of Denmark, {DTU}},
+  address = {Richard Petersens Plads, Building 321, {DK-}2800 Kgs. Lyngby},
+  url = {http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf}
 }
 @article{MHT2011,
   author = {Getreuer, Pascal},
@@ -645,6 +670,23 @@
   title = {Deeper understanding of the homography decomposition for vision-based control},
   year = {2007}
 }
+@article{Marchand16,
+  author = {Marchand, Eric and Uchiyama, Hideaki and Spindler, Fabien},
+  title = {{Pose Estimation for Augmented Reality: A Hands-On Survey}},
+  url = {https://hal.inria.fr/hal-01246370},
+  journal = {{IEEE Transactions on Visualization and Computer Graphics}},
+  publisher = {{Institute of Electrical and Electronics Engineers}},
+  volume = {22},
+  number = {12},
+  pages = {2633--2651},
+  year = {2016},
+  month = Dec,
+  doi = {10.1109/TVCG.2015.2513408},
+  keywords = {homography ; SLAM ; motion estimation ; Index Terms-Survey ; augmented reality ; vision-based camera localization ; pose estimation ; PnP ; keypoint matching ; code examples},
+  pdf = {https://hal.inria.fr/hal-01246370/file/survey-ieee-v2.pdf},
+  hal_id = {hal-01246370},
+  hal_version = {v1},
+}
 @article{Matas00,
   author = {Matas, Jiri and Galambos, Charles and Kittler, Josef},
   title = {Robust detection of lines using the progressive probabilistic hough transform},
@@ -915,6 +957,11 @@
   volume = {2},
   publisher = {IEEE}
 }
+@misc{VandLec,
+  author = {Vandenberghe, Lieven},
+  title = {QR Factorization},
+  url = {http://www.seas.ucla.edu/~vandenbe/133A/lectures/qr.pdf}
+}
 @inproceedings{V03,
   author = {Kwatra, Vivek and Sch{\"o}dl, Arno and Essa, Irfan and Turk, Greg and Bobick, Aaron},
   title = {Graphcut textures: image and video synthesis using graph cuts},
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 264c976b2f..5f6d3c048a 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -383,8 +383,11 @@ public:
          modified using setMaxIters() method.
     */
     static Ptr<LMSolver> create(const Ptr<LMSolver::Callback>& cb, int maxIters);
+    static Ptr<LMSolver> create(const Ptr<LMSolver::Callback>& cb, int maxIters, double eps);
 };
 
+
+
 /** @brief Finds a perspective transformation between two planes.
 
 @param srcPoints Coordinates of the points in the original plane, a matrix of the type CV_32FC2
@@ -842,6 +845,65 @@ CV_EXPORTS_W int solveP3P( InputArray objectPoints, InputArray imagePoints,
                            OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
                            int flags );
 
+/** @brief Refine a pose (the translation and the rotation that transform a 3D point expressed in the object coordinate frame
+to the camera coordinate frame) from 3D-2D point correspondences, starting from an initial solution.
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or 1xN/Nx1 3-channel,
+where N is the number of points. vector\<Point3f\> can also be passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2f\> can also be passed here.
+@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
+4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec , brings points from
+the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
+@param tvec Input/Output translation vector. Input values are used as an initial solution.
+@param criteria Criteria when to stop the Levenberg-Marquardt iterative algorithm.
+
+The function refines the object pose given at least 3 object points, their corresponding image
+projections, an initial solution for the rotation and translation vector,
+as well as the camera matrix and the distortion coefficients.
+The function minimizes the projection error with respect to the rotation and the translation vectors, according
+to a Levenberg-Marquardt iterative minimization @cite Madsen04 @cite Eade13 process.
+ */
+CV_EXPORTS_W void solvePnPRefineLM( InputArray objectPoints, InputArray imagePoints,
+                                    InputArray cameraMatrix, InputArray distCoeffs,
+                                    InputOutputArray rvec, InputOutputArray tvec,
+                                    TermCriteria criteria = TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 20, FLT_EPSILON));
+
+/** @brief Refine a pose (the translation and the rotation that transform a 3D point expressed in the object coordinate frame
+to the camera coordinate frame) from 3D-2D point correspondences, starting from an initial solution.
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or 1xN/Nx1 3-channel,
+where N is the number of points. vector\<Point3f\> can also be passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2f\> can also be passed here.
+@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
+4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec , brings points from
+the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
+@param tvec Input/Output translation vector. Input values are used as an initial solution.
+@param criteria Criteria when to stop the iterative virtual visual servoing scheme.
+@param VVSlambda Gain for the virtual visual servoing control law, equivalent to the \f$\alpha\f$
+gain in the Gauss-Newton formulation.
+
+The function refines the object pose given at least 3 object points, their corresponding image
+projections, an initial solution for the rotation and translation vector,
+as well as the camera matrix and the distortion coefficients.
+The function minimizes the projection error with respect to the rotation and the translation vectors, using a
+virtual visual servoing (VVS) @cite Chaumette06 @cite Marchand16 scheme.
+ */
+CV_EXPORTS_W void solvePnPRefineVVS( InputArray objectPoints, InputArray imagePoints,
+                                     InputArray cameraMatrix, InputArray distCoeffs,
+                                     InputOutputArray rvec, InputOutputArray tvec,
+                                     TermCriteria criteria = TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 20, FLT_EPSILON),
+                                     double VVSlambda = 1);
+
 /** @brief Finds an initial camera matrix from 3D-2D point correspondences.
 
 @param objectPoints Vector of vectors of the calibration pattern points in the calibration pattern
diff --git a/modules/calib3d/src/levmarq.cpp b/modules/calib3d/src/levmarq.cpp
index 4e59f043a8..0d339ccf79 100644
--- a/modules/calib3d/src/levmarq.cpp
+++ b/modules/calib3d/src/levmarq.cpp
@@ -81,11 +81,11 @@ class LMSolverImpl CV_FINAL : public LMSolver
 {
 public:
     LMSolverImpl() : maxIters(100) { init(); }
-    LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters) : cb(_cb), maxIters(_maxIters) { init(); }
+    LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters) : cb(_cb), epsx(FLT_EPSILON), epsf(FLT_EPSILON), maxIters(_maxIters) { init(); }
+    LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters, double _eps) : cb(_cb), epsx(_eps), epsf(_eps), maxIters(_maxIters) { init(); }
 
     void init()
     {
-        epsx = epsf = FLT_EPSILON;
         printInterval = 0;
     }
 
@@ -215,4 +215,9 @@ Ptr<LMSolver> LMSolver::create(const Ptr<LMSolver::Callback>& cb, int maxIters)
     return makePtr<LMSolverImpl>(cb, maxIters);
 }
 
+Ptr<LMSolver> LMSolver::create(const Ptr<LMSolver::Callback>& cb, int maxIters, double eps)
+{
+    return makePtr<LMSolverImpl>(cb, maxIters, eps);
+}
+
 }
diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp
index 0f76e8c452..58c16f40cc 100644
--- a/modules/calib3d/src/solvepnp.cpp
+++ b/modules/calib3d/src/solvepnp.cpp
@@ -456,4 +456,271 @@ int solveP3P( InputArray _opoints, InputArray _ipoints,
     return solutions;
 }
 
+// LMSolver callback used by the LM pose refinement; the parameter vector is a 6x1 column [rvec; tvec].
+class SolvePnPRefineLMCallback CV_FINAL : public LMSolver::Callback
+{
+public:
+    SolvePnPRefineLMCallback(InputArray _opoints, InputArray _ipoints, InputArray _cameraMatrix, InputArray _distCoeffs)
+    {
+        objectPoints = _opoints.getMat();
+        imagePoints = _ipoints.getMat();
+        cameraMatrix = _cameraMatrix.getMat();
+        distCoeffs = _distCoeffs.getMat();
+        npoints = std::max(objectPoints.checkVector(3, CV_32F), objectPoints.checkVector(3, CV_64F));
+        imagePoints0 = imagePoints.reshape(1, npoints*2);
+    }
+
+    bool compute(InputArray _param, OutputArray _err, OutputArray _Jac) const CV_OVERRIDE
+    {
+        const bool needJac = _Jac.needed();
+        Mat param = _param.getMat();
+        _err.create(npoints*2, 1, CV_64FC1);
+        if (needJac)
+            _Jac.create(npoints*2, param.rows, CV_64FC1);
+
+        Mat rvec = param(Rect(0, 0, 1, 3)), tvec = param(Rect(0, 3, 1, 3));
+
+        Mat J, projectedPts;
+        projectPoints(objectPoints, rvec, tvec, cameraMatrix, distCoeffs, projectedPts, needJac ? J : noArray());
+
+        if (needJac)
+        {
+            // Copy the leading Jac.cols columns of the projectPoints() Jacobian, row by row
+            Mat Jac = _Jac.getMat();
+            for (int r = 0; r < Jac.rows; r++)
+            {
+                const double* src = J.ptr<double>(r);
+                double* dst = Jac.ptr<double>(r);
+                for (int c = 0; c < Jac.cols; c++)
+                    dst[c] = src[c];
+            }
+        }
+
+        // Residuals: projected points minus the observed image points, stacked as 2*npoints rows
+        Mat err = _err.getMat();
+        err = projectedPts.reshape(1, npoints*2) - imagePoints0;
+
+        return true;
+    }
+
+    Mat objectPoints, imagePoints, imagePoints0;
+    Mat cameraMatrix, distCoeffs;
+    int npoints;
+};
+
+/**
+ * @brief Fill the interaction matrix and the feature vector of the object points for the current pose.
+ * @param objectPoints 3D object points, one point per row.
+ * @param R Current estimated rotation matrix.
+ * @param tvec Current estimated translation vector.
+ * @param L Interaction matrix for a vector of point features (two rows per point), filled in place.
+ * @param s Normalized coordinates (x, y) of the points expressed in the camera frame, filled in place.
+ */
+static void computeInteractionMatrixAndResiduals(const Mat& objectPoints, const Mat& R, const Mat& tvec,
+                                                 Mat& L, Mat& s)
+{
+    const int npoints = objectPoints.rows;
+
+    for (int i = 0; i < npoints; i++)
+    {
+        // Object point i expressed in the camera frame
+        Mat ptInCam = R * objectPoints.row(i).t() + tvec;
+        const double Zi = ptInCam.at<double>(2,0);
+        const double xi = ptInCam.at<double>(0,0) / Zi;
+        const double yi = ptInCam.at<double>(1,0) / Zi;
+
+        s.at<double>(2*i,0) = xi;
+        s.at<double>(2*i+1,0) = yi;
+
+        double* Lx = L.ptr<double>(2*i);
+        Lx[0] = -1 / Zi;
+        Lx[1] = 0;
+        Lx[2] = xi / Zi;
+        Lx[3] = xi*yi;
+        Lx[4] = -(1 + xi*xi);
+        Lx[5] = yi;
+
+        double* Ly = L.ptr<double>(2*i+1);
+        Ly[0] = 0;
+        Ly[1] = -1 / Zi;
+        Ly[2] = yi / Zi;
+        Ly[3] = 1 + yi*yi;
+        Ly[4] = -xi*yi;
+        Ly[5] = -xi;
+    }
+}
+
+/**
+ * @brief Inverse of the exponential map from se(3) to SE(3).
+ * @param twist A twist (v, w) stored as a 6x1 vector of doubles: the linear part v = (vx, vy, vz)
+ * followed by the angular part w = (wx, wy, wz).
+ * @param R1 Rotation matrix of the inverse of exp(twist), i.e. the transpose of the rotation of exp(twist).
+ * @param t1 Translation vector of the inverse of exp(twist).
+ */
+static void exponentialMapToSE3Inv(const Mat& twist, Mat& R1, Mat& t1)
+{
+    //see Exponential Map in http://ethaneade.com/lie.pdf
+    /*
+    \begin{align*}
+    \boldsymbol{\delta} &= \left( \mathbf{u}, \boldsymbol{\omega} \right ) \in se(3) \\
+    \mathbf{u}, \boldsymbol{\omega} &\in \mathbb{R}^3 \\
+    \theta &= \sqrt{ \boldsymbol{\omega}^T \boldsymbol{\omega} } \\
+    A &= \frac{\sin \theta}{\theta} \\
+    B &= \frac{1 - \cos \theta}{\theta^2} \\
+    C &= \frac{1-A}{\theta^2} \\
+    \mathbf{R} &= \mathbf{I} + A \boldsymbol{\omega}_{\times} + B \boldsymbol{\omega}_{\times}^2 \\
+    \mathbf{V} &= \mathbf{I} + B \boldsymbol{\omega}_{\times} + C \boldsymbol{\omega}_{\times}^2 \\
+    \exp \begin{pmatrix}
+    \mathbf{u} \\
+    \boldsymbol{\omega}
+    \end{pmatrix} &=
+    \left(
+    \begin{array}{c|c}
+    \mathbf{R} & \mathbf{V} \mathbf{u} \\ \hline
+    \mathbf{0} & 1
+    \end{array}
+    \right )
+    \end{align*}
+    */
+    double vx = twist.at<double>(0,0);
+    double vy = twist.at<double>(1,0);
+    double vz = twist.at<double>(2,0);
+    double wx = twist.at<double>(3,0);
+    double wy = twist.at<double>(4,0);
+    double wz = twist.at<double>(5,0);
+
+    Matx31d rvec(wx, wy, wz);
+    Mat R;
+    Rodrigues(rvec, R);
+
+    double theta = sqrt(wx*wx + wy*wy + wz*wz);
+    double sinc = std::fabs(theta) < 1e-8 ? 1 : sin(theta) / theta;
+    double mcosc = (std::fabs(theta) < 1e-8) ? 0.5 : (1-cos(theta)) / (theta*theta);
+    double msinc = (std::fabs(theta) < 1e-8) ? (1/6.0) : (1-sinc) / (theta*theta);
+
+    Matx31d dt;
+    dt(0) = vx*(sinc + wx*wx*msinc) + vy*(wx*wy*msinc - wz*mcosc) + vz*(wx*wz*msinc + wy*mcosc);
+    dt(1) = vx*(wx*wy*msinc + wz*mcosc) + vy*(sinc + wy*wy*msinc) + vz*(wy*wz*msinc - wx*mcosc);
+    dt(2) = vx*(wx*wz*msinc - wy*mcosc) + vy*(wy*wz*msinc + wx*mcosc) + vz*(sinc + wz*wz*msinc);
+
+    R1 = R.t();
+    t1 = -R1 * dt;
+}
+
+enum SolvePnPRefineMethod {
+    SOLVEPNP_REFINE_LM   = 0,
+    SOLVEPNP_REFINE_VVS  = 1
+};
+
+// Common implementation of solvePnPRefineLM() and solvePnPRefineVVS(): refines _rvec/_tvec in place.
+// _rvec/_tvec may be 3x1 or 1x3 (CV_32F or CV_64F); _vvslambda is only used by SOLVEPNP_REFINE_VVS.
+static void solvePnPRefine(InputArray _objectPoints, InputArray _imagePoints,
+                           InputArray _cameraMatrix, InputArray _distCoeffs,
+                           InputOutputArray _rvec, InputOutputArray _tvec,
+                           SolvePnPRefineMethod _flags,
+                           TermCriteria _criteria=TermCriteria(TermCriteria::EPS+TermCriteria::COUNT, 20, FLT_EPSILON),
+                           double _vvslambda=1)
+{
+    CV_INSTRUMENT_REGION();
+
+    Mat opoints_ = _objectPoints.getMat(), ipoints_ = _imagePoints.getMat();
+    Mat opoints, ipoints;
+    opoints_.convertTo(opoints, CV_64F);
+    ipoints_.convertTo(ipoints, CV_64F);
+    int npoints = opoints.checkVector(3, CV_64F);
+    CV_Assert( npoints >= 3 && npoints == ipoints.checkVector(2, CV_64F) );
+    CV_Assert( !_rvec.empty() && !_tvec.empty() );
+
+    int rtype = _rvec.type(), ttype = _tvec.type();
+    Size rsize = _rvec.size(), tsize = _tvec.size();
+    CV_Assert( (rtype == CV_32FC1 || rtype == CV_64FC1) &&
+               (ttype == CV_32FC1 || ttype == CV_64FC1) );
+    CV_Assert( (rsize == Size(1, 3) || rsize == Size(3, 1)) &&
+               (tsize == Size(1, 3) || tsize == Size(3, 1)) );
+
+    Mat cameraMatrix0 = _cameraMatrix.getMat();
+    Mat distCoeffs0 = _distCoeffs.getMat();
+    Mat cameraMatrix = Mat_<double>(cameraMatrix0);
+    Mat distCoeffs = Mat_<double>(distCoeffs0);
+
+    // Both methods index rvec/tvec as 3x1 columns, so work on CV_64F column copies of the inputs.
+    Mat rvec0 = _rvec.getMat(), tvec0 = _tvec.getMat();
+    Mat rvec, tvec;
+    rvec0.convertTo(rvec, CV_64F);
+    tvec0.convertTo(tvec, CV_64F);
+    rvec = rvec.reshape(1, 3);
+    tvec = tvec.reshape(1, 3);
+
+    if (_flags == SOLVEPNP_REFINE_LM)
+    {
+        Mat params(6, 1, CV_64FC1);
+        for (int i = 0; i < 3; i++)
+        {
+            params.at<double>(i,0) = rvec.at<double>(i,0);
+            params.at<double>(i+3,0) = tvec.at<double>(i,0);
+        }
+
+        LMSolver::create(makePtr<SolvePnPRefineLMCallback>(opoints, ipoints, cameraMatrix, distCoeffs), _criteria.maxCount, _criteria.epsilon)->run(params);
+
+        rvec = params.rowRange(0, 3);
+        tvec = params.rowRange(3, 6);
+    }
+    else if (_flags == SOLVEPNP_REFINE_VVS)
+    {
+        vector<Point2d> ipoints_normalized;
+        undistortPoints(ipoints, ipoints_normalized, cameraMatrix, distCoeffs);
+        Mat sd = Mat(ipoints_normalized).reshape(1, npoints*2);
+        Mat objectPoints0 = opoints.reshape(1, npoints);
+        Mat L(npoints*2, 6, CV_64FC1), s(npoints*2, 1, CV_64FC1);
+
+        double residuals_1 = std::numeric_limits<double>::max(), residuals = 0;
+        Mat R;
+        Rodrigues(rvec, R);
+        for (int iter = 0; iter < _criteria.maxCount; iter++)
+        {
+            computeInteractionMatrixAndResiduals(objectPoints0, R, tvec, L, s);
+            Mat err = s - sd;
+
+            Mat Lp = L.inv(cv::DECOMP_SVD);
+            Mat dq = -_vvslambda * Lp * err;
+
+            Mat R1, t1;
+            exponentialMapToSE3Inv(dq, R1, t1);
+            R = R1 * R;
+            tvec = R1 * tvec + t1;
+
+            residuals_1 = residuals;
+            Mat res = err.t()*err;
+            residuals = res.at<double>(0,0);
+
+            if (std::fabs(residuals - residuals_1) < _criteria.epsilon)
+                break;
+        }
+
+        Rodrigues(R, rvec);
+    }
+
+    // Write the refined pose back using the caller's original layout (3x1 or 1x3) and depth.
+    rvec.reshape(1, rvec0.rows).convertTo(rvec0, rvec0.depth());
+    tvec.reshape(1, tvec0.rows).convertTo(tvec0, tvec0.depth());
+}
+
+void solvePnPRefineLM(InputArray _objectPoints, InputArray _imagePoints,
+                      InputArray _cameraMatrix, InputArray _distCoeffs,
+                      InputOutputArray _rvec, InputOutputArray _tvec,
+                      TermCriteria _criteria)
+{
+    CV_INSTRUMENT_REGION();
+    solvePnPRefine(_objectPoints, _imagePoints, _cameraMatrix, _distCoeffs, _rvec, _tvec, SOLVEPNP_REFINE_LM, _criteria);
+}
+
+void solvePnPRefineVVS(InputArray _objectPoints, InputArray _imagePoints,
+                       InputArray _cameraMatrix, InputArray _distCoeffs,
+                       InputOutputArray _rvec, InputOutputArray _tvec,
+                       TermCriteria _criteria, double _VVSlambda)
+{
+    CV_INSTRUMENT_REGION();
+    solvePnPRefine(_objectPoints, _imagePoints, _cameraMatrix, _distCoeffs, _rvec, _tvec, SOLVEPNP_REFINE_VVS, _criteria, _VVSlambda);
+}
+
 }
diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp
index 2359fa9282..adf7758c92 100644
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@@ -589,4 +589,330 @@ TEST(Calib3d_SolvePnP, iterativeInitialGuess3pts)
     }
 }
 
+TEST(Calib3d_SolvePnP, refine3pts) // solvePnPRefineLM / solvePnPRefineVVS on a 3-point planar target (all z = 0), noise-free projections
+{
+    { // double-precision inputs
+        Matx33d intrinsics(605.4, 0.0, 317.35,
+                           0.0, 601.2, 242.63,
+                           0.0, 0.0, 1.0);
+
+        double L = 0.1; // half side length of the target
+        vector<Point3d> p3d;
+        p3d.push_back(Point3d(-L, -L, 0.0));
+        p3d.push_back(Point3d(L, -L, 0.0));
+        p3d.push_back(Point3d(L, L, 0.0));
+
+        Mat rvec_ground_truth = (Mat_<double>(3,1) << 0.3, -0.2, 0.75);
+        Mat tvec_ground_truth = (Mat_<double>(3,1) << 0.15, -0.2, 1.5);
+
+        vector<Point2d> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d); // synthetic observations generated from the ground-truth pose
+
+        { // LM refinement, starting from a pose offset from the ground truth
+            Mat rvec_est = (Mat_<double>(3,1) << 0.2, -0.1, 0.6);
+            Mat tvec_est = (Mat_<double>(3,1) << 0.05, -0.05, 1.0);
+
+            solvePnPRefineLM(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Levenberg-Marquardt" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6); // largest per-component error must not exceed 1e-6
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+        { // VVS refinement, same starting pose as the LM case above
+            Mat rvec_est = (Mat_<double>(3,1) << 0.2, -0.1, 0.6);
+            Mat tvec_est = (Mat_<double>(3,1) << 0.05, -0.05, 1.0);
+
+            solvePnPRefineVVS(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Virtual Visual Servoing" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+    }
+
+    { // single-precision inputs (float intrinsics, points, rvec and tvec)
+        Matx33f intrinsics(605.4f, 0.0f, 317.35f,
+                           0.0f, 601.2f, 242.63f,
+                           0.0f, 0.0f, 1.0f);
+
+        float L = 0.1f;
+        vector<Point3f> p3d;
+        p3d.push_back(Point3f(-L, -L, 0.0f));
+        p3d.push_back(Point3f(L, -L, 0.0f));
+        p3d.push_back(Point3f(L, L, 0.0f));
+
+        Mat rvec_ground_truth = (Mat_<float>(3,1) << -0.75f, 0.4f, 0.34f);
+        Mat tvec_ground_truth = (Mat_<float>(3,1) << -0.15f, 0.35f, 1.58f);
+
+        vector<Point2f> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d);
+
+        { // LM refinement, starting from a pose offset from the ground truth
+            Mat rvec_est = (Mat_<float>(3,1) << -0.5f, 0.2f, 0.2f);
+            Mat tvec_est = (Mat_<float>(3,1) << 0.0f, 0.2f, 1.0f);
+
+            solvePnPRefineLM(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Levenberg-Marquardt" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6); // same 1e-6 bound as the double case, although the data here is float
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+        { // VVS refinement, same starting pose as the LM case above
+            Mat rvec_est = (Mat_<float>(3,1) << -0.5f, 0.2f, 0.2f);
+            Mat tvec_est = (Mat_<float>(3,1) << 0.0f, 0.2f, 1.0f);
+
+            solvePnPRefineVVS(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Virtual Visual Servoing" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+    }
+}
+
+TEST(Calib3d_SolvePnP, refine) // runs solvePnP(ITERATIVE), solvePnPRefineLM and solvePnPRefineVVS on the same 5-point non-planar target
+{
+    // double precision, noise-free projections
+    {
+        Matx33d intrinsics(605.4, 0.0, 317.35,
+                           0.0, 601.2, 242.63,
+                           0.0, 0.0, 1.0);
+
+        double L = 0.1;
+        vector<Point3d> p3d;
+        p3d.push_back(Point3d(-L, -L, 0.0));
+        p3d.push_back(Point3d(L, -L, 0.0));
+        p3d.push_back(Point3d(L, L, 0.0));
+        p3d.push_back(Point3d(-L, L, L/2)); // the last two points leave the z = 0 plane
+        p3d.push_back(Point3d(0, 0, -L/2));
+
+        Mat rvec_ground_truth = (Mat_<double>(3,1) << 0.3, -0.2, 0.75);
+        Mat tvec_ground_truth = (Mat_<double>(3,1) << 0.15, -0.2, 1.5);
+
+        vector<Point2d> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d); // synthetic observations generated from the ground-truth pose
+
+        { // solvePnP with SOLVEPNP_ITERATIVE, seeded with the initial guess below (the `true` argument)
+            Mat rvec_est = (Mat_<double>(3,1) << 0.1, -0.1, 0.1);
+            Mat tvec_est = (Mat_<double>(3,1) << 0.0, -0.5, 1.0);
+
+            solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, true, SOLVEPNP_ITERATIVE);
+
+            cout << "\nmethod: Levenberg-Marquardt (C API)" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6); // largest per-component error must not exceed 1e-6
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+        { // solvePnPRefineLM from the same initial guess
+            Mat rvec_est = (Mat_<double>(3,1) << 0.1, -0.1, 0.1);
+            Mat tvec_est = (Mat_<double>(3,1) << 0.0, -0.5, 1.0);
+
+            solvePnPRefineLM(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Levenberg-Marquardt (C++ API)" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+        { // solvePnPRefineVVS from the same initial guess
+            Mat rvec_est = (Mat_<double>(3,1) << 0.1, -0.1, 0.1);
+            Mat tvec_est = (Mat_<double>(3,1) << 0.0, -0.5, 1.0);
+
+            solvePnPRefineVVS(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Virtual Visual Servoing" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+    }
+
+    // single precision, noise-free projections
+    {
+        Matx33f intrinsics(605.4f, 0.0f, 317.35f,
+                           0.0f, 601.2f, 242.63f,
+                           0.0f, 0.0f, 1.0f);
+
+        float L = 0.1f;
+        vector<Point3f> p3d;
+        p3d.push_back(Point3f(-L, -L, 0.0f));
+        p3d.push_back(Point3f(L, -L, 0.0f));
+        p3d.push_back(Point3f(L, L, 0.0f));
+        p3d.push_back(Point3f(-L, L, L/2));
+        p3d.push_back(Point3f(0, 0, -L/2));
+
+        Mat rvec_ground_truth = (Mat_<float>(3,1) << -0.75f, 0.4f, 0.34f);
+        Mat tvec_ground_truth = (Mat_<float>(3,1) << -0.15f, 0.35f, 1.58f);
+
+        vector<Point2f> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d);
+
+        { // solvePnP with SOLVEPNP_ITERATIVE, seeded with the initial guess below
+            Mat rvec_est = (Mat_<float>(3,1) << -0.1f, 0.1f, 0.1f);
+            Mat tvec_est = (Mat_<float>(3,1) << 0.0f, 0.0f, 1.0f);
+
+            solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, true, SOLVEPNP_ITERATIVE);
+
+            cout << "\nmethod: Levenberg-Marquardt (C API)" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6); // same 1e-6 bound as the double case, although the data here is float
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+        { // solvePnPRefineLM from the same initial guess
+            Mat rvec_est = (Mat_<float>(3,1) << -0.1f, 0.1f, 0.1f);
+            Mat tvec_est = (Mat_<float>(3,1) << 0.0f, 0.0f, 1.0f);
+
+            solvePnPRefineLM(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Levenberg-Marquardt (C++ API)" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+        { // solvePnPRefineVVS from the same initial guess
+            Mat rvec_est = (Mat_<float>(3,1) << -0.1f, 0.1f, 0.1f);
+            Mat tvec_est = (Mat_<float>(3,1) << 0.0f, 0.0f, 1.0f);
+
+            solvePnPRefineVVS(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est);
+
+            cout << "\nmethod: Virtual Visual Servoing" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+        }
+    }
+
+    // refinement of an EPnP estimate computed from noisy image points (double precision)
+    {
+        Matx33d intrinsics(605.4, 0.0, 317.35,
+                           0.0, 601.2, 242.63,
+                           0.0, 0.0, 1.0);
+
+        double L = 0.1;
+        vector<Point3d> p3d;
+        p3d.push_back(Point3d(-L, -L, 0.0));
+        p3d.push_back(Point3d(L, -L, 0.0));
+        p3d.push_back(Point3d(L, L, 0.0));
+        p3d.push_back(Point3d(-L, L, L/2));
+        p3d.push_back(Point3d(0, 0, -L/2));
+
+        Mat rvec_ground_truth = (Mat_<double>(3,1) << 0.3, -0.2, 0.75);
+        Mat tvec_ground_truth = (Mat_<double>(3,1) << 0.15, -0.2, 1.5);
+
+        vector<Point2d> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d);
+
+        // perturb both coordinates of every projected point with rng.gaussian(5e-2)
+        RNG& rng = theRNG();
+        for (size_t i = 0; i < p2d.size(); i++)
+        {
+            p2d[i].x += rng.gaussian(5e-2);
+            p2d[i].y += rng.gaussian(5e-2);
+        }
+
+        Mat rvec_est, tvec_est; // EPnP result; each sub-block below refines its own clone of it
+        solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, false, SOLVEPNP_EPNP);
+
+        { // refine with solvePnP(SOLVEPNP_ITERATIVE), seeded with the EPnP pose
+
+            Mat rvec_est_refine = rvec_est.clone(), tvec_est_refine = tvec_est.clone();
+            solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est_refine, tvec_est_refine, true, SOLVEPNP_ITERATIVE);
+
+            cout << "\nmethod: Levenberg-Marquardt (C API)" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est (EPnP): " << rvec_est.t() << std::endl;
+            cout << "rvec_est_refine: " << rvec_est_refine.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est (EPnP): " << tvec_est.t() << std::endl;
+            cout << "tvec_est_refine: " << tvec_est_refine.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-2); // bound on the unrefined EPnP estimate (re-checked in every sub-block)
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-3);
+
+            EXPECT_LT(cvtest::norm(rvec_ground_truth, rvec_est_refine, NORM_INF), cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF)); // refined pose must be strictly closer to ground truth than EPnP
+            EXPECT_LT(cvtest::norm(tvec_ground_truth, tvec_est_refine, NORM_INF), cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF));
+        }
+        { // refine with solvePnPRefineLM, seeded with the EPnP pose
+            Mat rvec_est_refine = rvec_est.clone(), tvec_est_refine = tvec_est.clone();
+            solvePnPRefineLM(p3d, p2d, intrinsics, noArray(), rvec_est_refine, tvec_est_refine);
+
+            cout << "\nmethod: Levenberg-Marquardt (C++ API)" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "rvec_est_refine: " << rvec_est_refine.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+            cout << "tvec_est_refine: " << tvec_est_refine.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-2);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-3);
+
+            EXPECT_LT(cvtest::norm(rvec_ground_truth, rvec_est_refine, NORM_INF), cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF));
+            EXPECT_LT(cvtest::norm(tvec_ground_truth, tvec_est_refine, NORM_INF), cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF));
+        }
+        { // refine with solvePnPRefineVVS, seeded with the EPnP pose
+            Mat rvec_est_refine = rvec_est.clone(), tvec_est_refine = tvec_est.clone();
+            solvePnPRefineVVS(p3d, p2d, intrinsics, noArray(), rvec_est_refine, tvec_est_refine);
+
+            cout << "\nmethod: Virtual Visual Servoing" << endl;
+            cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+            cout << "rvec_est: " << rvec_est.t() << std::endl;
+            cout << "rvec_est_refine: " << rvec_est_refine.t() << std::endl;
+            cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+            cout << "tvec_est: " << tvec_est.t() << std::endl;
+            cout << "tvec_est_refine: " << tvec_est_refine.t() << std::endl;
+
+            EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-2);
+            EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-3);
+
+            EXPECT_LT(cvtest::norm(rvec_ground_truth, rvec_est_refine, NORM_INF), cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF));
+            EXPECT_LT(cvtest::norm(tvec_ground_truth, tvec_est_refine, NORM_INF), cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF));
+        }
+    }
+}
+
 }} // namespace
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index 7f6d6b0fb9..483cc8f269 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -87,9 +87,41 @@
 #  include <immintrin.h>
 #  define CV_AVX_512F 1
 #endif
+#ifdef CV_CPU_COMPILE_AVX512_COMMON // group: AVX-512F/CD common to all AVX-512 CPUs; sets the group flag plus the CD feature flag
+#  define CV_AVX512_COMMON 1
+#  define CV_AVX_512CD 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_KNL // group: Knights Landing; adds the ER and PF feature flags
+#  define CV_AVX512_KNL 1
+#  define CV_AVX_512ER 1
+#  define CV_AVX_512PF 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_KNM // group: Knights Mill; adds 4FMAPS, 4VNNIW and VPOPCNTDQ
+#  define CV_AVX512_KNM 1
+#  define CV_AVX_5124FMAPS 1
+#  define CV_AVX_5124VNNIW 1
+#  define CV_AVX_512VPOPCNTDQ 1 // also defined by the ICL group below with the same value, so defining it twice is a benign redefinition
+#endif
 #ifdef CV_CPU_COMPILE_AVX512_SKX
-#  include <immintrin.h>
 #  define CV_AVX512_SKX 1
+#  define CV_AVX_512VL 1
+#  define CV_AVX_512BW 1
+#  define CV_AVX_512DQ 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_CNL // group: Cannon Lake; adds IFMA and VBMI
+#  define CV_AVX512_CNL 1
+#  define CV_AVX_512IFMA 1
+#  define CV_AVX_512VBMI 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_CEL // group: Cascade Lake; adds VNNI
+#  define CV_AVX512_CEL 1
+#  define CV_AVX_512VNNI 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_ICL // group: Ice Lake; adds VBMI2, BITALG and VPOPCNTDQ
+#  define CV_AVX512_ICL 1
+#  define CV_AVX_512VBMI2 1
+#  define CV_AVX_512BITALG 1
+#  define CV_AVX_512VPOPCNTDQ 1
 #endif
 #ifdef CV_CPU_COMPILE_FMA3
 #  define CV_FMA3 1
@@ -223,9 +255,10 @@ struct VZeroUpperGuard {
 #ifndef CV_AVX_512ER
 #  define CV_AVX_512ER 0
 #endif
-#ifndef CV_AVX_512IFMA512
-#  define CV_AVX_512IFMA512 0
+#ifndef CV_AVX_512IFMA // new spelling of the flag; falls back to 0 when nothing above defined it
+#  define CV_AVX_512IFMA 0
 #endif
+#define CV_AVX_512IFMA512 CV_AVX_512IFMA // deprecated alias: old name now expands to the new flag, so existing CV_AVX_512IFMA512 checks keep working
 #ifndef CV_AVX_512PF
 #  define CV_AVX_512PF 0
 #endif
@@ -235,9 +268,45 @@ struct VZeroUpperGuard {
 #ifndef CV_AVX_512VL
 #  define CV_AVX_512VL 0
 #endif
+#ifndef CV_AVX_5124FMAPS // every block below gives the flag a value of 0 when no CV_CPU_COMPILE_* section earlier in this header set it to 1
+#  define CV_AVX_5124FMAPS 0
+#endif
+#ifndef CV_AVX_5124VNNIW
+#  define CV_AVX_5124VNNIW 0
+#endif
+#ifndef CV_AVX_512VPOPCNTDQ
+#  define CV_AVX_512VPOPCNTDQ 0
+#endif
+#ifndef CV_AVX_512VNNI
+#  define CV_AVX_512VNNI 0
+#endif
+#ifndef CV_AVX_512VBMI2
+#  define CV_AVX_512VBMI2 0
+#endif
+#ifndef CV_AVX_512BITALG
+#  define CV_AVX_512BITALG 0
+#endif
+#ifndef CV_AVX512_COMMON // from here on: group-level flags (CV_AVX512_<GROUP>) rather than single-feature flags
+#  define CV_AVX512_COMMON 0
+#endif
+#ifndef CV_AVX512_KNL
+#  define CV_AVX512_KNL 0
+#endif
+#ifndef CV_AVX512_KNM
+#  define CV_AVX512_KNM 0
+#endif
 #ifndef CV_AVX512_SKX
 #  define CV_AVX512_SKX 0
 #endif
+#ifndef CV_AVX512_CNL
+#  define CV_AVX512_CNL 0
+#endif
+#ifndef CV_AVX512_CEL
+#  define CV_AVX512_CEL 0
+#endif
+#ifndef CV_AVX512_ICL
+#  define CV_AVX512_ICL 0
+#endif
 
 #ifndef CV_NEON
 #  define CV_NEON 0
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
index ad1339796d..90e0e9b9e3 100644
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -252,6 +252,69 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...)  CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_COMMON // case 1: CV_CPU_COMPILE_AVX512_COMMON set -> support is a compile-time constant 1, no runtime check
+#  define CV_TRY_AVX512_COMMON 1
+#  define CV_CPU_FORCE_AVX512_COMMON 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON 1
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args) return (opt_AVX512_COMMON::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_COMMON // case 2: dispatch build -> opt_AVX512_COMMON:: is called only when cv::checkHardwareSupport() reports the group
+#  define CV_TRY_AVX512_COMMON 1
+#  define CV_CPU_FORCE_AVX512_COMMON 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON (cv::checkHardwareSupport(CV_CPU_AVX512_COMMON))
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
+#else // case 3: group not built (or optimizations/intrinsics disabled) -> flags are 0 and the call macros expand to nothing
+#  define CV_TRY_AVX512_COMMON 0
+#  define CV_CPU_FORCE_AVX512_COMMON 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON 0
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_COMMON(fn, args, mode, ...)  CV_CPU_CALL_AVX512_COMMON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNL // case 1: CV_CPU_COMPILE_AVX512_KNL set -> support is a compile-time constant 1, no runtime check
+#  define CV_TRY_AVX512_KNL 1
+#  define CV_CPU_FORCE_AVX512_KNL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL 1
+#  define CV_CPU_CALL_AVX512_KNL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args) return (opt_AVX512_KNL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNL // case 2: dispatch build -> opt_AVX512_KNL:: is called only when cv::checkHardwareSupport() reports the group
+#  define CV_TRY_AVX512_KNL 1
+#  define CV_CPU_FORCE_AVX512_KNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL (cv::checkHardwareSupport(CV_CPU_AVX512_KNL))
+#  define CV_CPU_CALL_AVX512_KNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
+#else // case 3: group not built (or optimizations/intrinsics disabled) -> flags are 0 and the call macros expand to nothing
+#  define CV_TRY_AVX512_KNL 0
+#  define CV_CPU_FORCE_AVX512_KNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL 0
+#  define CV_CPU_CALL_AVX512_KNL(fn, args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_KNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNM // case 1: CV_CPU_COMPILE_AVX512_KNM set -> support is a compile-time constant 1, no runtime check
+#  define CV_TRY_AVX512_KNM 1
+#  define CV_CPU_FORCE_AVX512_KNM 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM 1
+#  define CV_CPU_CALL_AVX512_KNM(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args) return (opt_AVX512_KNM::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNM // case 2: dispatch build -> opt_AVX512_KNM:: is called only when cv::checkHardwareSupport() reports the group
+#  define CV_TRY_AVX512_KNM 1
+#  define CV_CPU_FORCE_AVX512_KNM 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM (cv::checkHardwareSupport(CV_CPU_AVX512_KNM))
+#  define CV_CPU_CALL_AVX512_KNM(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
+#else // case 3: group not built (or optimizations/intrinsics disabled) -> flags are 0 and the call macros expand to nothing
+#  define CV_TRY_AVX512_KNM 0
+#  define CV_CPU_FORCE_AVX512_KNM 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM 0
+#  define CV_CPU_CALL_AVX512_KNM(fn, args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNM(fn, args, mode, ...)  CV_CPU_CALL_AVX512_KNM(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX
 #  define CV_TRY_AVX512_SKX 1
 #  define CV_CPU_FORCE_AVX512_SKX 1
@@ -273,6 +336,69 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_AVX512_SKX(fn, args, mode, ...)  CV_CPU_CALL_AVX512_SKX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CNL // case 1: CV_CPU_COMPILE_AVX512_CNL set -> support is a compile-time constant 1, no runtime check
+#  define CV_TRY_AVX512_CNL 1
+#  define CV_CPU_FORCE_AVX512_CNL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL 1
+#  define CV_CPU_CALL_AVX512_CNL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args) return (opt_AVX512_CNL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CNL // case 2: dispatch build -> opt_AVX512_CNL:: is called only when cv::checkHardwareSupport() reports the group
+#  define CV_TRY_AVX512_CNL 1
+#  define CV_CPU_FORCE_AVX512_CNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL (cv::checkHardwareSupport(CV_CPU_AVX512_CNL))
+#  define CV_CPU_CALL_AVX512_CNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
+#else // case 3: group not built (or optimizations/intrinsics disabled) -> flags are 0 and the call macros expand to nothing
+#  define CV_TRY_AVX512_CNL 0
+#  define CV_CPU_FORCE_AVX512_CNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL 0
+#  define CV_CPU_CALL_AVX512_CNL(fn, args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_CNL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_CNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CEL // case 1: CV_CPU_COMPILE_AVX512_CEL set -> support is a compile-time constant 1, no runtime check
+#  define CV_TRY_AVX512_CEL 1
+#  define CV_CPU_FORCE_AVX512_CEL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_CEL 1
+#  define CV_CPU_CALL_AVX512_CEL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_CEL_(fn, args) return (opt_AVX512_CEL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CEL // case 2: dispatch build -> opt_AVX512_CEL:: is called only when cv::checkHardwareSupport() reports the group
+#  define CV_TRY_AVX512_CEL 1
+#  define CV_CPU_FORCE_AVX512_CEL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CEL (cv::checkHardwareSupport(CV_CPU_AVX512_CEL))
+#  define CV_CPU_CALL_AVX512_CEL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CEL) return (opt_AVX512_CEL::fn args)
+#  define CV_CPU_CALL_AVX512_CEL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CEL) return (opt_AVX512_CEL::fn args)
+#else // case 3: group not built (or optimizations/intrinsics disabled) -> flags are 0 and the call macros expand to nothing
+#  define CV_TRY_AVX512_CEL 0
+#  define CV_CPU_FORCE_AVX512_CEL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CEL 0
+#  define CV_CPU_CALL_AVX512_CEL(fn, args)
+#  define CV_CPU_CALL_AVX512_CEL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_CEL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_CEL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_ICL // case 1: CV_CPU_COMPILE_AVX512_ICL set -> support is a compile-time constant 1, no runtime check
+#  define CV_TRY_AVX512_ICL 1
+#  define CV_CPU_FORCE_AVX512_ICL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL 1
+#  define CV_CPU_CALL_AVX512_ICL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args) return (opt_AVX512_ICL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_ICL // case 2: dispatch build -> opt_AVX512_ICL:: is called only when cv::checkHardwareSupport() reports the group
+#  define CV_TRY_AVX512_ICL 1
+#  define CV_CPU_FORCE_AVX512_ICL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL (cv::checkHardwareSupport(CV_CPU_AVX512_ICL))
+#  define CV_CPU_CALL_AVX512_ICL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
+#else // case 3: group not built (or optimizations/intrinsics disabled) -> flags are 0 and the call macros expand to nothing
+#  define CV_TRY_AVX512_ICL 0
+#  define CV_CPU_FORCE_AVX512_ICL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL 0
+#  define CV_CPU_CALL_AVX512_ICL(fn, args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
 #  define CV_TRY_NEON 1
 #  define CV_CPU_FORCE_NEON 1
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index caa4a9e4c7..0b301623b0 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -235,6 +235,12 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 #define CV_CPU_AVX_512PF        19
 #define CV_CPU_AVX_512VBMI      20
 #define CV_CPU_AVX_512VL        21
+#define CV_CPU_AVX_512VBMI2     22 // IDs 22-27 carry the same numbers as the matching CPU_AVX_* entries of enum CpuFeatures; keep both lists in sync
+#define CV_CPU_AVX_512VNNI      23
+#define CV_CPU_AVX_512BITALG    24
+#define CV_CPU_AVX_512VPOPCNTDQ 25
+#define CV_CPU_AVX_5124VNNIW    26
+#define CV_CPU_AVX_5124FMAPS    27
 
 #define CV_CPU_NEON             100
 
@@ -243,6 +249,12 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 
 // CPU features groups
 #define CV_CPU_AVX512_SKX       256
+#define CV_CPU_AVX512_COMMON    257 // AVX-512F/CD, common to all AVX-512 CPUs
+#define CV_CPU_AVX512_KNL       258 // Knights Landing group
+#define CV_CPU_AVX512_KNM       259 // Knights Mill group
+#define CV_CPU_AVX512_CNL       260 // Cannon Lake group
+#define CV_CPU_AVX512_CEL       261 // Cascade Lake group; NOTE(review): runtime detection derives this from CNL (IFMA/VBMI) plus VNNI - confirm real Cascade Lake CPUs report IFMA/VBMI, otherwise they never match this group
+#define CV_CPU_AVX512_ICL       262 // Ice Lake group
 
 // when adding to this list remember to update the following enum
 #define CV_HARDWARE_MAX_FEATURE 512
@@ -273,6 +285,12 @@ enum CpuFeatures {
     CPU_AVX_512PF       = 19,
     CPU_AVX_512VBMI     = 20,
     CPU_AVX_512VL       = 21,
+    CPU_AVX_512VBMI2    = 22,
+    CPU_AVX_512VNNI     = 23,
+    CPU_AVX_512BITALG   = 24,
+    CPU_AVX_512VPOPCNTDQ= 25,
+    CPU_AVX_5124VNNIW   = 26,
+    CPU_AVX_5124FMAPS   = 27,
 
     CPU_NEON            = 100,
 
@@ -280,6 +298,12 @@ enum CpuFeatures {
     CPU_VSX3            = 201,
 
     CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
+    CPU_AVX512_COMMON   = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
+    CPU_AVX512_KNL      = 258, //!< Knights Landing with AVX-512F/CD/ER/PF
+    CPU_AVX512_KNM      = 259, //!< Knights Mill with AVX-512F/CD/ER/PF/4FMAPS/4VNNIW/VPOPCNTDQ
+    CPU_AVX512_CNL      = 260, //!< Cannon Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI
+    CPU_AVX512_CEL      = 261, //!< Cascade Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI
+    CPU_AVX512_ICL      = 262, //!< Ice Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI/VBMI2/BITALG/VPOPCNTDQ
 
     CPU_MAX_FEATURE     = 512  // see CV_HARDWARE_MAX_FEATURE
 };
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 894dc6cbef..1be4a54959 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -356,6 +356,12 @@ struct HWFeatures
         g_hwFeatureNames[CPU_AVX_512PF] = "AVX512PF";
         g_hwFeatureNames[CPU_AVX_512VBMI] = "AVX512VBMI";
         g_hwFeatureNames[CPU_AVX_512VL] = "AVX512VL";
+        g_hwFeatureNames[CPU_AVX_512VBMI2] = "AVX512VBMI2";
+        g_hwFeatureNames[CPU_AVX_512VNNI] = "AVX512VNNI";
+        g_hwFeatureNames[CPU_AVX_512BITALG] = "AVX512BITALG";
+        g_hwFeatureNames[CPU_AVX_512VPOPCNTDQ] = "AVX512VPOPCNTDQ";
+        g_hwFeatureNames[CPU_AVX_5124VNNIW] = "AVX5124VNNIW";
+        g_hwFeatureNames[CPU_AVX_5124FMAPS] = "AVX5124FMAPS";
 
         g_hwFeatureNames[CPU_NEON] = "NEON";
 
@@ -363,6 +369,12 @@ struct HWFeatures
         g_hwFeatureNames[CPU_VSX3] = "VSX3";
 
         g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
+        g_hwFeatureNames[CPU_AVX512_COMMON] = "AVX512-COMMON"; // COMMON is detected like the other groups, so it needs a printable name as well
+        g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
+        g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
+        g_hwFeatureNames[CPU_AVX512_CNL] = "AVX512-CNL";
+        g_hwFeatureNames[CPU_AVX512_CEL] = "AVX512-CEL";
+        g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL";
     }
 
     void initialize(void)
@@ -404,15 +415,21 @@ struct HWFeatures
 
             have[CV_CPU_AVX2]   = (cpuid_data_ex[1] & (1<<5)) != 0;
 
-            have[CV_CPU_AVX_512F]       = (cpuid_data_ex[1] & (1<<16)) != 0;
-            have[CV_CPU_AVX_512DQ]      = (cpuid_data_ex[1] & (1<<17)) != 0;
-            have[CV_CPU_AVX_512IFMA512] = (cpuid_data_ex[1] & (1<<21)) != 0;
-            have[CV_CPU_AVX_512PF]      = (cpuid_data_ex[1] & (1<<26)) != 0;
-            have[CV_CPU_AVX_512ER]      = (cpuid_data_ex[1] & (1<<27)) != 0;
-            have[CV_CPU_AVX_512CD]      = (cpuid_data_ex[1] & (1<<28)) != 0;
-            have[CV_CPU_AVX_512BW]      = (cpuid_data_ex[1] & (1<<30)) != 0;
-            have[CV_CPU_AVX_512VL]      = (cpuid_data_ex[1] & (1<<31)) != 0;
-            have[CV_CPU_AVX_512VBMI]    = (cpuid_data_ex[2] & (1<<1)) != 0;
+            have[CV_CPU_AVX_512F]         = (cpuid_data_ex[1] & (1<<16)) != 0;
+            have[CV_CPU_AVX_512DQ]        = (cpuid_data_ex[1] & (1<<17)) != 0;
+            have[CV_CPU_AVX_512IFMA]      = (cpuid_data_ex[1] & (1<<21)) != 0;
+            have[CV_CPU_AVX_512PF]        = (cpuid_data_ex[1] & (1<<26)) != 0;
+            have[CV_CPU_AVX_512ER]        = (cpuid_data_ex[1] & (1<<27)) != 0;
+            have[CV_CPU_AVX_512CD]        = (cpuid_data_ex[1] & (1<<28)) != 0;
+            have[CV_CPU_AVX_512BW]        = (cpuid_data_ex[1] & (1<<30)) != 0;
+            have[CV_CPU_AVX_512VL]        = (cpuid_data_ex[1] & (1<<31)) != 0;
+            have[CV_CPU_AVX_512VBMI]      = (cpuid_data_ex[2] & (1<<1))  != 0;
+            have[CV_CPU_AVX_512VBMI2]     = (cpuid_data_ex[2] & (1<<6))  != 0;
+            have[CV_CPU_AVX_512VNNI]      = (cpuid_data_ex[2] & (1<<11)) != 0;
+            have[CV_CPU_AVX_512BITALG]    = (cpuid_data_ex[2] & (1<<12)) != 0;
+            have[CV_CPU_AVX_512VPOPCNTDQ] = (cpuid_data_ex[2] & (1<<14)) != 0;
+            have[CV_CPU_AVX_5124VNNIW]    = (cpuid_data_ex[3] & (1<<2))  != 0;
+            have[CV_CPU_AVX_5124FMAPS]    = (cpuid_data_ex[3] & (1<<3))  != 0;
 
             bool have_AVX_OS_support = true;
             bool have_AVX512_OS_support = true;
@@ -446,15 +463,38 @@ struct HWFeatures
                 have[CV_CPU_AVX_512CD] = false;
                 have[CV_CPU_AVX_512DQ] = false;
                 have[CV_CPU_AVX_512ER] = false;
-                have[CV_CPU_AVX_512IFMA512] = false;
+                have[CV_CPU_AVX_512IFMA] = false;
                 have[CV_CPU_AVX_512PF] = false;
                 have[CV_CPU_AVX_512VBMI] = false;
                 have[CV_CPU_AVX_512VL] = false;
+                have[CV_CPU_AVX_512VBMI2] = false;
+                have[CV_CPU_AVX_512VNNI] = false;
+                have[CV_CPU_AVX_512BITALG] = false;
+                have[CV_CPU_AVX_512VPOPCNTDQ] = false;
+                have[CV_CPU_AVX_5124VNNIW] = false;
+                have[CV_CPU_AVX_5124FMAPS] = false;
             }
 
-            if (have[CV_CPU_AVX_512F])
+            have[CV_CPU_AVX512_COMMON] = have[CV_CPU_AVX_512F] && have[CV_CPU_AVX_512CD];
+            if (have[CV_CPU_AVX512_COMMON])
             {
-                have[CV_CPU_AVX512_SKX] = have[CV_CPU_AVX_512F] & have[CV_CPU_AVX_512CD] & have[CV_CPU_AVX_512BW] & have[CV_CPU_AVX_512DQ] & have[CV_CPU_AVX_512VL];
+                have[CV_CPU_AVX512_KNL] = have[CV_CPU_AVX_512ER]  && have[CV_CPU_AVX_512PF];
+                have[CV_CPU_AVX512_KNM] = have[CV_CPU_AVX512_KNL] && have[CV_CPU_AVX_5124FMAPS] &&
+                                          have[CV_CPU_AVX_5124VNNIW] && have[CV_CPU_AVX_512VPOPCNTDQ];
+                have[CV_CPU_AVX512_SKX] = have[CV_CPU_AVX_512BW] && have[CV_CPU_AVX_512DQ] && have[CV_CPU_AVX_512VL];
+                have[CV_CPU_AVX512_CNL] = have[CV_CPU_AVX512_SKX] && have[CV_CPU_AVX_512IFMA] && have[CV_CPU_AVX_512VBMI];
+                have[CV_CPU_AVX512_CEL] = have[CV_CPU_AVX512_CNL] && have[CV_CPU_AVX_512VNNI];
+                have[CV_CPU_AVX512_ICL] = have[CV_CPU_AVX512_CEL] && have[CV_CPU_AVX_512VBMI2] &&
+                                          have[CV_CPU_AVX_512BITALG] && have[CV_CPU_AVX_512VPOPCNTDQ];
+            }
+            else
+            {
+                have[CV_CPU_AVX512_KNL] = false;
+                have[CV_CPU_AVX512_KNM] = false;
+                have[CV_CPU_AVX512_SKX] = false;
+                have[CV_CPU_AVX512_CNL] = false;
+                have[CV_CPU_AVX512_CEL] = false;
+                have[CV_CPU_AVX512_ICL] = false;
             }
         }
     #endif // CV_CPUID_X86
@@ -621,11 +661,14 @@ struct HWFeatures
                         }
                         if (isBaseline)
                         {
-                            if (dump) fprintf(stderr, "OPENCV: Trying to disable baseline CPU feature: '%s'. This has very limited effect, because code optimizations for this feature are executed unconditionally in the most cases.\n", getHWFeatureNameSafe(i));
+                            if (dump) fprintf(stderr, "OPENCV: Trying to disable baseline CPU feature: '%s'. "
+                                                      "This has very limited effect, because code optimizations for this feature are executed unconditionally "
+                                                      "in the most cases.\n", getHWFeatureNameSafe(i));
                         }
                         if (!have[i])
                         {
-                            if (dump) fprintf(stderr, "OPENCV: Trying to disable unavailable CPU feature on the current platform: '%s'.\n", getHWFeatureNameSafe(i));
+                            if (dump) fprintf(stderr, "OPENCV: Trying to disable unavailable CPU feature on the current platform: '%s'.\n",
+                                getHWFeatureNameSafe(i));
                         }
                         have[i] = false;
 
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index bc4c782ca7..95f5b57de6 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -126,7 +126,7 @@ public:
             inpShape.push_back(inputs[0].size[i]);
             outShape.push_back(outputs[0].size[i]);
         }
-        getConvPoolPaddings(inpShape, outShape, kernel_size, strides, padMode, dilations, pads_begin, pads_end);
+        getConvPoolPaddings(inpShape, kernel_size, strides, padMode, pads_begin, pads_end);
         if (pads_begin.size() == 2) {
             for (int i = 0; i < pads_begin.size(); i++) {
                 if (pads_begin[i] != pads_end[i])
@@ -1331,7 +1331,7 @@ public:
             inpShape.push_back(inputs[0].size[i]);
             outShape.push_back(outputs[0].size[i]);
         }
-        getConvPoolPaddings(outShape, inpShape, kernel_size, strides, padMode, dilations, pads_begin, pads_end);
+        getConvPoolPaddings(outShape, kernel_size, strides, padMode, pads_begin, pads_end);
         if (pads_begin.size() == 2) {
             for (int i = 0; i < pads_begin.size(); i++) {
                 if (pads_begin[i] != pads_end[i])
diff --git a/modules/dnn/src/layers/layers_common.cpp b/modules/dnn/src/layers/layers_common.cpp
index 627f79c784..29d863d2ad 100644
--- a/modules/dnn/src/layers/layers_common.cpp
+++ b/modules/dnn/src/layers/layers_common.cpp
@@ -214,25 +214,25 @@ void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>
     }
 }
 
-void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<int>& out,
-                         const std::vector<size_t>& kernel, const std::vector<size_t>& strides,
-                         const String &padMode, const std::vector<size_t>& dilation,
+void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                         const std::vector<size_t>& strides, const String &padMode,
                          std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end)
 {
-    if (padMode == "VALID")
+    if (padMode == "VALID" || padMode == "SAME")
     {
         pads_begin.assign(kernel.size(), 0);
         pads_end.assign(kernel.size(), 0);
     }
-    else if (padMode == "SAME")
+    if (padMode == "SAME")
     {
-        CV_Assert_N(kernel.size() == dilation.size(), kernel.size() == strides.size(),
-                    kernel.size() == inp.size(), kernel.size() == out.size());
-        pads_begin.resize(kernel.size());
-        pads_end.resize(kernel.size());
+        CV_Assert_N(kernel.size() == strides.size(), kernel.size() == inp.size());
         for (int i = 0; i < pads_begin.size(); i++) {
-            int pad = ((out[i] - 1) * strides[i] + dilation[i] * (kernel[i] - 1) + 1 - inp[i]) / 2;
-            pads_begin[i] = pads_end[i] = std::max(0, pad);
+            // Axes whose stride exceeds the kernel (this occurs in test cases)
+            // keep the zero padding assigned above.
+            if (strides[i] > kernel[i])
+                continue;
+            int pad = (kernel[i] - 1 - (inp[i] - 1 + strides[i]) % strides[i]) / 2;
+            pads_begin[i] = pads_end[i] = pad;
         }
     }
 }
diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp
index fd1e430a54..26c1ce62d5 100644
--- a/modules/dnn/src/layers/layers_common.hpp
+++ b/modules/dnn/src/layers/layers_common.hpp
@@ -69,9 +69,8 @@ void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>
                           const std::vector<size_t>& stride, const String &padMode,
                           const std::vector<size_t>& dilation, std::vector<int>& out);
 
- void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<int>& out,
-                          const std::vector<size_t>& kernel, const std::vector<size_t>& strides,
-                          const String &padMode, const std::vector<size_t>& dilation,
+ void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& strides, const String &padMode,
                           std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
 }
 }
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index b087cb0219..7316347f2e 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -144,7 +144,7 @@ public:
             kernel_size = std::vector<size_t>(inp.begin(), inp.end());
         }
 
-        getConvPoolPaddings(inp, out, kernel_size, strides, padMode, std::vector<size_t>(kernel_size.size(), 1), pads_begin, pads_end);
+        getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end);
         if (pads_begin.size() == 2) {
             pad_t = pads_begin[0];
             pad_l = pads_begin[1];
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index 2657b0c40e..9e072dc91d 100644
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -958,7 +958,16 @@ Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
     // NOTE: Inference Engine sizes are reversed.
     std::vector<size_t> dims = blob->dims();
     std::vector<int> size(dims.rbegin(), dims.rend());
-    return Mat(size, CV_32F, (void*)blob->buffer());
+
+    int type = -1;
+    switch (blob->precision())
+    {
+        case InferenceEngine::Precision::FP32: type = CV_32F; break;
+        case InferenceEngine::Precision::U8: type = CV_8U; break;
+        default:
+            CV_Error(Error::StsNotImplemented, "Unsupported blob precision");
+    }
+    return Mat(size, type, (void*)blob->buffer());
 }
 
 bool InfEngineBackendLayer::getMemoryShapes(const std::vector<MatShape> &inputs,
diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
index aaa1c09ee4..8e57d557db 100644
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@@ -770,43 +770,47 @@ void RemoveIdentityOps(tensorflow::GraphDef& net)
     }
 }
 
-Mat getTensorContent(const tensorflow::TensorProto &tensor)
+Mat getTensorContent(const tensorflow::TensorProto &tensor, bool copy)
 {
     const std::string& content = tensor.tensor_content();
+    Mat m;
     switch (tensor.dtype())
     {
         case tensorflow::DT_FLOAT:
         {
             if (!content.empty())
-                return Mat(1, content.size() / sizeof(float), CV_32FC1, (void*)content.c_str()).clone();
+                m = Mat(1, content.size() / sizeof(float), CV_32FC1, (void*)content.c_str());
             else
             {
                 const RepeatedField<float>& field = tensor.float_val();
                 CV_Assert(!field.empty());
-                return Mat(1, field.size(), CV_32FC1, (void*)field.data()).clone();
+                m = Mat(1, field.size(), CV_32FC1, (void*)field.data());
             }
+            break;
         }
         case tensorflow::DT_DOUBLE:
         {
             if (!content.empty())
-                return Mat(1, content.size() / sizeof(double), CV_64FC1, (void*)content.c_str()).clone();
+                m = Mat(1, content.size() / sizeof(double), CV_64FC1, (void*)content.c_str());
             else
             {
                 const RepeatedField<double>& field = tensor.double_val();
                 CV_Assert(!field.empty());
-                return Mat(1, field.size(), CV_64FC1, (void*)field.data()).clone();
+                m = Mat(1, field.size(), CV_64FC1, (void*)field.data());
             }
+            break;
         }
         case tensorflow::DT_INT32:
         {
             if (!content.empty())
-                return Mat(1, content.size() / sizeof(int32_t), CV_32SC1, (void*)content.c_str()).clone();
+                m = Mat(1, content.size() / sizeof(int32_t), CV_32SC1, (void*)content.c_str());
             else
             {
                 const RepeatedField<int32_t>& field = tensor.int_val();
                 CV_Assert(!field.empty());
-                return Mat(1, field.size(), CV_32SC1, (void*)field.data()).clone();
+                m = Mat(1, field.size(), CV_32SC1, (void*)field.data());
             }
+            break;
         }
         case tensorflow::DT_HALF:
         {
@@ -825,20 +829,20 @@ Mat getTensorContent(const tensorflow::TensorProto &tensor)
             }
             // Reinterpret as a signed shorts just for a convertFp16 call.
             Mat halfsSigned(halfs.size(), CV_16SC1, halfs.data);
-            Mat floats(halfs.size(), CV_32FC1);
-            convertFp16(halfsSigned, floats);
-            return floats;
+            convertFp16(halfsSigned, m);
+            break;
         }
         case tensorflow::DT_QUINT8:
         {
             CV_Assert(!content.empty());
-            return Mat(1, content.size(), CV_8UC1, (void*)content.c_str()).clone();
+            m = Mat(1, content.size(), CV_8UC1, (void*)content.c_str());
+            break;
         }
         default:
             CV_Error(Error::StsError, "Tensor's data type is not supported");
             break;
     }
-    return Mat();
+    return copy ? m.clone() : m;
 }
 
 void releaseTensor(tensorflow::TensorProto* tensor)
diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.hpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.hpp
index 986fc3c06e..8a77dda6d4 100644
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.hpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.hpp
@@ -21,7 +21,7 @@ void RemoveIdentityOps(tensorflow::GraphDef& net);
 
 void simplifySubgraphs(tensorflow::GraphDef& net);
 
-Mat getTensorContent(const tensorflow::TensorProto &tensor);
+Mat getTensorContent(const tensorflow::TensorProto &tensor, bool copy = true);
 
 void releaseTensor(tensorflow::TensorProto* tensor);
 
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 298d532dc7..41985c834d 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -109,7 +109,7 @@ void parseTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
 
     dstBlob.create(shape, CV_32F);
 
-    Mat tensorContent = getTensorContent(tensor);
+    Mat tensorContent = getTensorContent(tensor, /*no copy*/false);
     int size = tensorContent.total();
     CV_Assert(size == (int)dstBlob.total());
 
@@ -509,7 +509,7 @@ void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &ds
 
     dstBlob.create(shape, CV_32F);
 
-    Mat tensorContent = getTensorContent(tensor);
+    Mat tensorContent = getTensorContent(tensor, /*no copy*/false);
     int size = tensorContent.total();
     CV_Assert(size == (int)dstBlob.total());
 
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 7dad46d331..ea2929c20c 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -345,11 +345,12 @@ TEST(Net, forwardAndRetrieve)
 #ifdef HAVE_INF_ENGINE
 // This test runs network in synchronous mode for different inputs and then
 // runs the same model asynchronously for the same inputs.
-typedef testing::TestWithParam<Target> Async;
+typedef testing::TestWithParam<tuple<int, Target> > Async;
 TEST_P(Async, set_and_forward_single)
 {
     static const int kTimeout = 5000;  // in milliseconds.
-    const int target = GetParam();
+    const int dtype = get<0>(GetParam());
+    const int target = get<1>(GetParam());
 
     const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";
     const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
@@ -367,8 +368,8 @@ TEST_P(Async, set_and_forward_single)
     int blobSize[] = {2, 6, 75, 113};
     for (int i = 0; i < numInputs; ++i)
     {
-        inputs[i].create(4, &blobSize[0], CV_32FC1);
-        randu(inputs[i], 0.0f, 1.0f);
+        inputs[i].create(4, &blobSize[0], dtype);
+        randu(inputs[i], 0, 255);
     }
 
     // Run synchronously.
@@ -394,7 +395,8 @@ TEST_P(Async, set_and_forward_single)
 TEST_P(Async, set_and_forward_all)
 {
     static const int kTimeout = 5000;  // in milliseconds.
-    const int target = GetParam();
+    const int dtype = get<0>(GetParam());
+    const int target = get<1>(GetParam());
 
     const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";
     const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
@@ -413,8 +415,8 @@ TEST_P(Async, set_and_forward_all)
     int blobSize[] = {2, 6, 75, 113};
     for (int i = 0; i < numInputs; ++i)
     {
-        inputs[i].create(4, &blobSize[0], CV_32FC1);
-        randu(inputs[i], 0.0f, 1.0f);
+        inputs[i].create(4, &blobSize[0], dtype);
+        randu(inputs[i], 0, 255);
     }
 
     // Run synchronously.
@@ -441,7 +443,10 @@ TEST_P(Async, set_and_forward_all)
     }
 }
 
-INSTANTIATE_TEST_CASE_P(/**/, Async, testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)));
+INSTANTIATE_TEST_CASE_P(/**/, Async, Combine(
+  Values(CV_32F, CV_8U),
+  testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
+));
 #endif  // HAVE_INF_ENGINE
 
 }} // namespace
diff --git a/modules/ml/doc/ml_intro.markdown b/modules/ml/doc/ml_intro.markdown
index 3bc9b068f2..fb4f1a7bd7 100644
--- a/modules/ml/doc/ml_intro.markdown
+++ b/modules/ml/doc/ml_intro.markdown
@@ -25,7 +25,7 @@ components:
     vector responses.
 -   Another optional component is the mask of missing measurements. Most algorithms require all the
     components in all the training samples be valid, but some other algorithms, such as decision
-    tress, can handle the cases of missing measurements.
+    trees, can handle the cases of missing measurements.
 -   In the case of classification problem user may want to give different weights to different
     classes. This is useful, for example, when:
     -   user wants to shift prediction accuracy towards lower false-alarm rate or higher hit-rate.
diff --git a/samples/dnn/tf_text_graph_ssd.py b/samples/dnn/tf_text_graph_ssd.py
index 35207ca3cd..730e0bbc25 100644
--- a/samples/dnn/tf_text_graph_ssd.py
+++ b/samples/dnn/tf_text_graph_ssd.py
@@ -274,7 +274,8 @@ def createSSDGraph(modelPath, configPath, outputPath):
 
     num_matched_layers = 0
     for node in graph_def.node:
-        if re.match('BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \
+        if re.match('BoxPredictor_\d/BoxEncodingPredictor/convolution', node.name) or \
+           re.match('BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \
            re.match('WeightSharedConvolutionalBoxPredictor(_\d)*/BoxPredictor/Conv2D', node.name):
             node.addAttr('loc_pred_transposed', True)
             num_matched_layers += 1