diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80e1e085ad..9a56a15281 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,8 +252,8 @@ OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BL
 OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" WITH_CUDA
   VISIBLE_IF WITH_CUDA
   VERIFY HAVE_NVCUVID)
-OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" (NOT CV_DISABLE_OPTIMIZATION)
-  VISIBLE_IF NOT WINRT AND NOT CMAKE_CROSSCOMPILING
+OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" (NOT CV_DISABLE_OPTIMIZATION AND NOT CMAKE_CROSSCOMPILING)
+  VISIBLE_IF NOT WINRT
   VERIFY HAVE_EIGEN)
 OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" (NOT ANDROID)
   VISIBLE_IF NOT IOS AND NOT WINRT
diff --git a/apps/traincascade/haarfeatures.cpp b/apps/traincascade/haarfeatures.cpp
index f2d18229e8..c151ee7963 100644
--- a/apps/traincascade/haarfeatures.cpp
+++ b/apps/traincascade/haarfeatures.cpp
@@ -153,14 +153,14 @@ void CvHaarEvaluator::generateFeatures()
                     {
                         features.push_back( Feature( offset, false,
                             x,    y, dx*3, dy, -1,
-                            x+dx, y, dx  , dy, +3 ) );
+                            x+dx, y, dx  , dy, +2 ) );
                     }
                     // haar_y3
                     if ( (x+dx <= winSize.width) && (y+dy*3 <= winSize.height) )
                     {
                         features.push_back( Feature( offset, false,
                             x, y,    dx, dy*3, -1,
-                            x, y+dy, dx, dy,   +3 ) );
+                            x, y+dy, dx, dy,   +2 ) );
                     }
                     if( mode != CvHaarFeatureParams::BASIC )
                     {
diff --git a/cmake/OpenCVFindLAPACK.cmake b/cmake/OpenCVFindLAPACK.cmake
index 684818027e..342bebc723 100644
--- a/cmake/OpenCVFindLAPACK.cmake
+++ b/cmake/OpenCVFindLAPACK.cmake
@@ -31,27 +31,33 @@ macro(ocv_lapack_check)
   else()
     # adding proxy opencv_lapack.h header
     set(CBLAS_H_PROXY_PATH ${CMAKE_BINARY_DIR}/opencv_lapack.h)
-    if((APPLE OR OPENCV_SKIP_LAPACK_EXTERN_C) AND NOT OPENCV_FORCE_LAPACK_EXTERN_C)
-        set(_lapack_include_str_extern_C "")
-        set(_lapack_include_str_extern_C_end "")
-    else()
-        set(_lapack_include_str_extern_C "extern \"C\" {\n")
-        set(_lapack_include_str_extern_C_end "}\n")
+
+    set(_lapack_add_extern_c NOT (APPLE OR OPENCV_SKIP_LAPACK_EXTERN_C) OR OPENCV_FORCE_LAPACK_EXTERN_C)
+
+    set(_lapack_content "// This file is auto-generated\n")
+    if(${_lapack_add_extern_c})
+      list(APPEND _lapack_content "extern \"C\" {")
     endif()
-    set(_lapack_include_str "${_lapack_include_str_extern_C}\#include \"${OPENCV_CBLAS_H_PATH_${_lapack_impl}}\"")
+    if(NOT OPENCV_SKIP_LAPACK_MSVC_FIX)
+      list(APPEND _lapack_content "
+#ifdef _MSC_VER
+#include <complex.h>
+#define lapack_complex_float _Fcomplex
+#define lapack_complex_double _Dcomplex
+#endif
+")
+    endif()
+    list(APPEND _lapack_content "#include \"${OPENCV_CBLAS_H_PATH_${_lapack_impl}}\"")
     if(NOT "${OPENCV_CBLAS_H_PATH_${_lapack_impl}}" STREQUAL "${OPENCV_LAPACKE_H_PATH_${_lapack_impl}}")
-      set(_lapack_include_str "${_lapack_include_str}\n#include \"${OPENCV_LAPACKE_H_PATH_${_lapack_impl}}\"")
+      list(APPEND _lapack_content "#include \"${OPENCV_LAPACKE_H_PATH_${_lapack_impl}}\"")
     endif()
-    set(_lapack_include_str "${_lapack_include_str}\n${_lapack_include_str_extern_C_end}")
-    # update file contents (if required)
-    set(__content_str "")
-    if(EXISTS "${CBLAS_H_PROXY_PATH}")
-      file(READ "${CBLAS_H_PROXY_PATH}" __content_str)
-    endif()
-    if(NOT " ${__content_str}" STREQUAL " ${_lapack_include_str}")
-      file(WRITE "${CBLAS_H_PROXY_PATH}" "${_lapack_include_str}")
+    if(${_lapack_add_extern_c})
+      list(APPEND _lapack_content "}")
     endif()
 
+    string(REPLACE ";" "\n" _lapack_content "${_lapack_content}")
+    ocv_update_file("${CBLAS_H_PROXY_PATH}" "${_lapack_content}")
+
     try_compile(__VALID_LAPACK
         "${OpenCV_BINARY_DIR}"
         "${OpenCV_SOURCE_DIR}/cmake/checks/lapack_check.cpp"
diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index 27b5d77e92..e3593d4dc9 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -64,7 +64,7 @@ if(WITH_GTK AND NOT HAVE_QT)
   if(WITH_OPENGL AND NOT HAVE_GTK3)
     ocv_check_modules(GTKGLEXT gtkglext-1.0)
     if(HAVE_GTKGLEXT)
-      ocv_append_build_options(GTKGLEXT GTHREAD)
+      ocv_append_build_options(HIGHGUI GTKGLEXT)
     endif()
   endif()
 endif()
diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake
index 67978f9210..a658bf6bdc 100644
--- a/cmake/OpenCVFindLibsPerf.cmake
+++ b/cmake/OpenCVFindLibsPerf.cmake
@@ -40,19 +40,67 @@ To eliminate this warning remove WITH_CUDA=ON CMake configuration option.
 endif(WITH_CUDA)
 
 # --- Eigen ---
-if(WITH_EIGEN)
-  find_path(EIGEN_INCLUDE_PATH "Eigen/Core"
-            PATHS /usr/local /opt /usr $ENV{EIGEN_ROOT}/include ENV ProgramFiles ENV ProgramW6432
-            PATH_SUFFIXES include/eigen3 include/eigen2 Eigen/include/eigen3 Eigen/include/eigen2
-            DOC "The path to Eigen3/Eigen2 headers"
-            CMAKE_FIND_ROOT_PATH_BOTH)
+if(WITH_EIGEN AND NOT HAVE_EIGEN)
+  find_package(Eigen3 QUIET)
 
-  if(EIGEN_INCLUDE_PATH)
-    ocv_include_directories(${EIGEN_INCLUDE_PATH})
-    ocv_parse_header("${EIGEN_INCLUDE_PATH}/Eigen/src/Core/util/Macros.h" EIGEN_VERSION_LINES EIGEN_WORLD_VERSION EIGEN_MAJOR_VERSION EIGEN_MINOR_VERSION)
-    set(HAVE_EIGEN 1)
+  if(Eigen3_FOUND)
+    if(TARGET Eigen3::Eigen)
+      # Use Eigen3 imported target if possible
+      list(APPEND OPENCV_LINKER_LIBS Eigen3::Eigen)
+      set(HAVE_EIGEN 1)
+    else()
+      if(DEFINED EIGEN3_INCLUDE_DIRS)
+        set(EIGEN_INCLUDE_PATH ${EIGEN3_INCLUDE_DIRS})
+        set(HAVE_EIGEN 1)
+      elseif(DEFINED EIGEN3_INCLUDE_DIR)
+        set(EIGEN_INCLUDE_PATH ${EIGEN3_INCLUDE_DIR})
+        set(HAVE_EIGEN 1)
+      endif()
+    endif()
+    if(HAVE_EIGEN)
+      if(DEFINED EIGEN3_WORLD_VERSION)  # CMake module
+        set(EIGEN_WORLD_VERSION ${EIGEN3_WORLD_VERSION})
+        set(EIGEN_MAJOR_VERSION ${EIGEN3_MAJOR_VERSION})
+        set(EIGEN_MINOR_VERSION ${EIGEN3_MINOR_VERSION})
+      else()  # Eigen config file
+        set(EIGEN_WORLD_VERSION ${EIGEN3_VERSION_MAJOR})
+        set(EIGEN_MAJOR_VERSION ${EIGEN3_VERSION_MINOR})
+        set(EIGEN_MINOR_VERSION ${EIGEN3_VERSION_PATCH})
+      endif()
+    endif()
   endif()
-endif(WITH_EIGEN)
+
+  if(NOT HAVE_EIGEN)  # Fallback: the Eigen3 package lookup above did not set HAVE_EIGEN, so look for the headers directly
+    if(NOT EIGEN_INCLUDE_PATH OR NOT EXISTS "${EIGEN_INCLUDE_PATH}")  # search only when no valid path is already set
+      set(__find_paths "")
+      set(__find_path_extra_options "")
+      if(NOT CMAKE_CROSSCOMPILING)
+        list(APPEND __find_paths /opt)  # host-only hint, skipped when cross-compiling
+      endif()
+      if(DEFINED ENV{EIGEN_ROOT})
+        set(__find_paths "$ENV{EIGEN_ROOT}/include")  # an explicit EIGEN_ROOT replaces every other hint
+        list(APPEND __find_path_extra_options NO_DEFAULT_PATH)  # and disables the default search locations
+      else()
+        list(APPEND __find_paths ENV ProgramFiles ENV ProgramW6432)  # append (not set) so the /opt hint above is kept
+      endif()
+      find_path(EIGEN_INCLUDE_PATH "Eigen/Core"
+                PATHS ${__find_paths}
+                PATH_SUFFIXES include/eigen3 include/eigen2 Eigen/include/eigen3 Eigen/include/eigen2
+                DOC "The path to Eigen3/Eigen2 headers"
+                ${__find_path_extra_options}
+      )
+    endif()
+    if(EIGEN_INCLUDE_PATH AND EXISTS "${EIGEN_INCLUDE_PATH}")
+      ocv_parse_header("${EIGEN_INCLUDE_PATH}/Eigen/src/Core/util/Macros.h" EIGEN_VERSION_LINES EIGEN_WORLD_VERSION EIGEN_MAJOR_VERSION EIGEN_MINOR_VERSION)
+      set(HAVE_EIGEN 1)
+    endif()
+  endif()
+endif()
+if(HAVE_EIGEN)
+  if(EIGEN_INCLUDE_PATH AND EXISTS "${EIGEN_INCLUDE_PATH}")
+    ocv_include_directories(SYSTEM ${EIGEN_INCLUDE_PATH})
+  endif()
+endif()
 
 # --- Clp ---
 # Ubuntu: sudo apt-get install coinor-libclp-dev coinor-libcoinutils-dev
diff --git a/doc/opencv.bib b/doc/opencv.bib
index fd1b60dfd1..e861e5b756 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -209,7 +209,21 @@
   hal_id = {inria-00350283},
   hal_version = {v1},
 }
-
+@article{Collins14,
+  year = {2014},
+  issn = {0920-5691},
+  journal = {International Journal of Computer Vision},
+  volume = {109},
+  number = {3},
+  doi = {10.1007/s11263-014-0725-5},
+  title = {Infinitesimal Plane-Based Pose Estimation},
+  url = {http://dx.doi.org/10.1007/s11263-014-0725-5},
+  publisher = {Springer US},
+  keywords = {Plane; Pose; SfM; PnP; Homography},
+  author = {Collins, Toby and Bartoli, Adrien},
+  pages = {252-286},
+  language = {English}
+}
 @article{Daniilidis98,
   author = {Konstantinos Daniilidis},
   title = {Hand-Eye Calibration Using Dual Quaternions},
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown b/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown
index 5fbbff27a3..1ef8443306 100644
--- a/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown
+++ b/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown
@@ -21,7 +21,6 @@ train_data, and next 250 samples as test_data. So let's prepare them first.
 @code{.py}
 import numpy as np
 import cv2 as cv
-from matplotlib import pyplot as plt
 
 img = cv.imread('digits.png')
 gray = cv.cvtColor(img,cv.COLOR_BGR2GRAY)
@@ -89,7 +88,6 @@ alphabets directly.
 @code{.py}
 import cv2 as cv
 import numpy as np
-import matplotlib.pyplot as plt
 
 # Load the data, converters convert the letter to a number
 data= np.loadtxt('letter-recognition.data', dtype= 'float32', delimiter = ',',
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index b812d5928e..a70b968e47 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -231,13 +231,25 @@ enum { LMEDS  = 4, //!< least-median of squares algorithm
        RHO    = 16 //!< RHO algorithm
      };
 
-enum { SOLVEPNP_ITERATIVE = 0,
-       SOLVEPNP_EPNP      = 1, //!< EPnP: Efficient Perspective-n-Point Camera Pose Estimation @cite lepetit2009epnp
-       SOLVEPNP_P3P       = 2, //!< Complete Solution Classification for the Perspective-Three-Point Problem @cite gao2003complete
-       SOLVEPNP_DLS       = 3, //!< A Direct Least-Squares (DLS) Method for PnP  @cite hesch2011direct
-       SOLVEPNP_UPNP      = 4, //!< Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation @cite penate2013exhaustive
-       SOLVEPNP_AP3P      = 5, //!< An Efficient Algebraic Solution to the Perspective-Three-Point Problem @cite Ke17
-       SOLVEPNP_MAX_COUNT      //!< Used for count
+enum SolvePnPMethod {
+    SOLVEPNP_ITERATIVE   = 0, //!< Iterative method based on a Levenberg-Marquardt optimization that minimizes the reprojection error
+    SOLVEPNP_EPNP        = 1, //!< EPnP: Efficient Perspective-n-Point Camera Pose Estimation @cite lepetit2009epnp
+    SOLVEPNP_P3P         = 2, //!< Complete Solution Classification for the Perspective-Three-Point Problem @cite gao2003complete
+    SOLVEPNP_DLS         = 3, //!< A Direct Least-Squares (DLS) Method for PnP  @cite hesch2011direct
+    SOLVEPNP_UPNP        = 4, //!< Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation @cite penate2013exhaustive
+    SOLVEPNP_AP3P        = 5, //!< An Efficient Algebraic Solution to the Perspective-Three-Point Problem @cite Ke17
+    SOLVEPNP_IPPE        = 6, //!< Infinitesimal Plane-Based Pose Estimation @cite Collins14 \n
+                              //!< Object points must be coplanar.
+    SOLVEPNP_IPPE_SQUARE = 7, //!< Infinitesimal Plane-Based Pose Estimation @cite Collins14 \n
+                              //!< This is a special case suitable for marker pose estimation.\n
+                              //!< 4 coplanar object points must be defined in the following order:
+                              //!<   - point 0: [-squareLength / 2,  squareLength / 2, 0]
+                              //!<   - point 1: [ squareLength / 2,  squareLength / 2, 0]
+                              //!<   - point 2: [ squareLength / 2, -squareLength / 2, 0]
+                              //!<   - point 3: [-squareLength / 2, -squareLength / 2, 0]
+#ifndef CV_DOXYGEN
+    SOLVEPNP_MAX_COUNT        //!< Used for count
+#endif
 };
 
 enum { CALIB_CB_ADAPTIVE_THRESH = 1,
@@ -610,6 +622,17 @@ Check @ref tutorial_homography "the corresponding tutorial" for more details
 */
 
 /** @brief Finds an object pose from 3D-2D point correspondences.
+This function returns the rotation and the translation vectors that transform a 3D point expressed in the object
+coordinate frame to the camera coordinate frame, using different methods:
+- P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): need 4 input points to return a unique solution.
+- @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar.
+- @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation.
+Number of input points must be 4. Object points must be defined in the following order:
+  - point 0: [-squareLength / 2,  squareLength / 2, 0]
+  - point 1: [ squareLength / 2,  squareLength / 2, 0]
+  - point 2: [ squareLength / 2, -squareLength / 2, 0]
+  - point 3: [-squareLength / 2, -squareLength / 2, 0]
+- for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
 
 @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
 1xN/Nx1 3-channel, where N is the number of points. vector\<Point3f\> can be also passed here.
@@ -620,14 +643,14 @@ where N is the number of points. vector\<Point2f\> can be also passed here.
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
 assumed.
-@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec , brings points from
+@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
 the model coordinate system to the camera coordinate system.
 @param tvec Output translation vector.
 @param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses
 the provided rvec and tvec values as initial approximations of the rotation and translation
 vectors, respectively, and further optimizes them.
 @param flags Method for solving a PnP problem:
--   **SOLVEPNP_ITERATIVE** Iterative method is based on Levenberg-Marquardt optimization. In
+-   **SOLVEPNP_ITERATIVE** Iterative method is based on a Levenberg-Marquardt optimization. In
 this case the function finds such a pose that minimizes reprojection error, that is the sum
 of squared distances between the observed projections imagePoints and the projected (using
 projectPoints ) objectPoints .
@@ -637,18 +660,24 @@ In this case the function requires exactly four object and image points.
 -   **SOLVEPNP_AP3P** Method is based on the paper of T. Ke, S. Roumeliotis
 "An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).
 In this case the function requires exactly four object and image points.
--   **SOLVEPNP_EPNP** Method has been introduced by F.Moreno-Noguer, V.Lepetit and P.Fua in the
+-   **SOLVEPNP_EPNP** Method has been introduced by F. Moreno-Noguer, V. Lepetit and P. Fua in the
 paper "EPnP: Efficient Perspective-n-Point Camera Pose Estimation" (@cite lepetit2009epnp).
--   **SOLVEPNP_DLS** Method is based on the paper of Joel A. Hesch and Stergios I. Roumeliotis.
+-   **SOLVEPNP_DLS** Method is based on the paper of J. Hesch and S. Roumeliotis.
 "A Direct Least-Squares (DLS) Method for PnP" (@cite hesch2011direct).
--   **SOLVEPNP_UPNP** Method is based on the paper of A.Penate-Sanchez, J.Andrade-Cetto,
-F.Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length
+-   **SOLVEPNP_UPNP** Method is based on the paper of A. Penate-Sanchez, J. Andrade-Cetto,
+F. Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length
 Estimation" (@cite penate2013exhaustive). In this case the function also estimates the parameters \f$f_x\f$ and \f$f_y\f$
 assuming that both have the same value. Then the cameraMatrix is updated with the estimated
 focal length.
--   **SOLVEPNP_AP3P** Method is based on the paper of Tong Ke and Stergios I. Roumeliotis.
-"An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17). In this case the
-function requires exactly four object and image points.
+-   **SOLVEPNP_IPPE** Method is based on the paper of T. Collins and A. Bartoli.
+"Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method requires coplanar object points.
+-   **SOLVEPNP_IPPE_SQUARE** Method is based on the paper of T. Collins and A. Bartoli.
+"Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method is suitable for marker pose estimation.
+It requires 4 coplanar object points defined in the following order:
+  - point 0: [-squareLength / 2,  squareLength / 2, 0]
+  - point 1: [ squareLength / 2,  squareLength / 2, 0]
+  - point 2: [ squareLength / 2, -squareLength / 2, 0]
+  - point 3: [-squareLength / 2, -squareLength / 2, 0]
 
 The function estimates the object pose given a set of object points, their corresponding image
 projections, as well as the camera matrix and the distortion coefficients, see the figure below
@@ -704,7 +733,7 @@ using the perspective projection model \f$ \Pi \f$ and the camera intrinsic para
   \end{align*}
 \f]
 
-The estimated pose is thus the rotation (`rvec`) and the translation (`tvec`) vectors that allow to transform
+The estimated pose is thus the rotation (`rvec`) and the translation (`tvec`) vectors that allow transforming
 a 3D point expressed in the world frame into the camera frame:
 
 \f[
@@ -765,6 +794,13 @@ a 3D point expressed in the world frame into the camera frame:
    -   With **SOLVEPNP_ITERATIVE** method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points
        are sufficient to compute a pose but there are up to 4 solutions). The initial solution should be close to the
        global solution to converge.
+   -   With **SOLVEPNP_IPPE** input points must be >= 4 and object points must be coplanar.
+   -   With **SOLVEPNP_IPPE_SQUARE** this is a special case suitable for marker pose estimation.
+       Number of input points must be 4. Object points must be defined in the following order:
+         - point 0: [-squareLength / 2,  squareLength / 2, 0]
+         - point 1: [ squareLength / 2,  squareLength / 2, 0]
+         - point 2: [ squareLength / 2, -squareLength / 2, 0]
+         - point 3: [-squareLength / 2, -squareLength / 2, 0]
  */
 CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints,
                             InputArray cameraMatrix, InputArray distCoeffs,
@@ -782,10 +818,10 @@ where N is the number of points. vector\<Point2f\> can be also passed here.
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
 assumed.
-@param rvec Output rotation vector (see Rodrigues ) that, together with tvec , brings points from
+@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
 the model coordinate system to the camera coordinate system.
 @param tvec Output translation vector.
-@param useExtrinsicGuess Parameter used for SOLVEPNP_ITERATIVE. If true (1), the function uses
+@param useExtrinsicGuess Parameter used for @ref SOLVEPNP_ITERATIVE. If true (1), the function uses
 the provided rvec and tvec values as initial approximations of the rotation and translation
 vectors, respectively, and further optimizes them.
 @param iterationsCount Number of iterations.
@@ -794,12 +830,12 @@ is the maximum allowed distance between the observed and computed point projecti
 an inlier.
 @param confidence The probability that the algorithm produces a useful result.
 @param inliers Output vector that contains indices of inliers in objectPoints and imagePoints .
-@param flags Method for solving a PnP problem (see solvePnP ).
+@param flags Method for solving a PnP problem (see @ref solvePnP ).
 
 The function estimates an object pose given a set of object points, their corresponding image
 projections, as well as the camera matrix and the distortion coefficients. This function finds such
 a pose that minimizes reprojection error, that is, the sum of squared distances between the observed
-projections imagePoints and the projected (using projectPoints ) objectPoints. The use of RANSAC
+projections imagePoints and the projected (using @ref projectPoints ) objectPoints. The use of RANSAC
 makes the function resistant to outliers.
 
 @note
@@ -819,6 +855,7 @@ CV_EXPORTS_W bool solvePnPRansac( InputArray objectPoints, InputArray imagePoint
                                   bool useExtrinsicGuess = false, int iterationsCount = 100,
                                   float reprojectionError = 8.0, double confidence = 0.99,
                                   OutputArray inliers = noArray(), int flags = SOLVEPNP_ITERATIVE );
+
 /** @brief Finds an object pose from 3 3D-2D point correspondences.
 
 @param objectPoints Array of object points in the object coordinate space, 3x3 1-channel or
@@ -830,17 +867,20 @@ CV_EXPORTS_W bool solvePnPRansac( InputArray objectPoints, InputArray imagePoint
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
 assumed.
-@param rvecs Output rotation vectors (see Rodrigues ) that, together with tvecs , brings points from
+@param rvecs Output rotation vectors (see @ref Rodrigues ) that, together with tvecs, brings points from
 the model coordinate system to the camera coordinate system. A P3P problem has up to 4 solutions.
 @param tvecs Output translation vectors.
 @param flags Method for solving a P3P problem:
 -   **SOLVEPNP_P3P** Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang
 "Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete).
--   **SOLVEPNP_AP3P** Method is based on the paper of Tong Ke and Stergios I. Roumeliotis.
+-   **SOLVEPNP_AP3P** Method is based on the paper of T. Ke and S. Roumeliotis.
 "An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).
 
 The function estimates the object pose given 3 object points, their corresponding image
 projections, as well as the camera matrix and the distortion coefficients.
+
+@note
+The solutions are sorted by reprojection errors (lowest to highest).
  */
 CV_EXPORTS_W int solveP3P( InputArray objectPoints, InputArray imagePoints,
                            InputArray cameraMatrix, InputArray distCoeffs,
@@ -859,7 +899,7 @@ where N is the number of points. vector\<Point2f\> can also be passed here.
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
 assumed.
-@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec , brings points from
+@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
 the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
 @param tvec Input/Output translation vector. Input values are used as an initial solution.
 @param criteria Criteria when to stop the Levenberg-Marquard iterative algorithm.
@@ -887,12 +927,12 @@ where N is the number of points. vector\<Point2f\> can also be passed here.
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
 assumed.
-@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec , brings points from
+@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
 the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
 @param tvec Input/Output translation vector. Input values are used as an initial solution.
 @param criteria Criteria when to stop the Levenberg-Marquard iterative algorithm.
 @param VVSlambda Gain for the virtual visual servoing control law, equivalent to the \f$\alpha\f$
-gain in the Gauss-Newton formulation.
+gain in the Damped Gauss-Newton formulation.
 
 The function refines the object pose given at least 3 object points, their corresponding image
 projections, an initial solution for the rotation and translation vector,
@@ -906,6 +946,202 @@ CV_EXPORTS_W void solvePnPRefineVVS( InputArray objectPoints, InputArray imagePo
                                      TermCriteria criteria = TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 20, FLT_EPSILON),
                                      double VVSlambda = 1);
 
+/** @brief Finds an object pose from 3D-2D point correspondences.
+This function returns a list of all the possible solutions (a solution is a <rotation vector, translation vector>
+couple), depending on the number of input points and the chosen method:
+- P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): 3 or 4 input points. Number of returned solutions can be between 0 and 4 with 3 input points.
+- @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar. Returns 2 solutions.
+- @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation.
+Number of input points must be 4 and 2 solutions are returned. Object points must be defined in the following order:
+  - point 0: [-squareLength / 2,  squareLength / 2, 0]
+  - point 1: [ squareLength / 2,  squareLength / 2, 0]
+  - point 2: [ squareLength / 2, -squareLength / 2, 0]
+  - point 3: [-squareLength / 2, -squareLength / 2, 0]
+- for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
+Only 1 solution is returned.
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
+1xN/Nx1 3-channel, where N is the number of points. vector\<Point3f\> can be also passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2f\> can be also passed here.
+@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
+4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvecs Vector of output rotation vectors (see @ref Rodrigues ) that, together with tvecs, brings points from
+the model coordinate system to the camera coordinate system.
+@param tvecs Vector of output translation vectors.
+@param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses
+the provided rvec and tvec values as initial approximations of the rotation and translation
+vectors, respectively, and further optimizes them.
+@param flags Method for solving a PnP problem:
+-   **SOLVEPNP_ITERATIVE** Iterative method is based on a Levenberg-Marquardt optimization. In
+this case the function finds such a pose that minimizes reprojection error, that is the sum
+of squared distances between the observed projections imagePoints and the projected (using
+projectPoints ) objectPoints .
+-   **SOLVEPNP_P3P** Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang
+"Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete).
+In this case the function requires exactly four object and image points.
+-   **SOLVEPNP_AP3P** Method is based on the paper of T. Ke, S. Roumeliotis
+"An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).
+In this case the function requires exactly four object and image points.
+-   **SOLVEPNP_EPNP** Method has been introduced by F. Moreno-Noguer, V. Lepetit and P. Fua in the
+paper "EPnP: Efficient Perspective-n-Point Camera Pose Estimation" (@cite lepetit2009epnp).
+-   **SOLVEPNP_DLS** Method is based on the paper of J. Hesch and S. Roumeliotis.
+"A Direct Least-Squares (DLS) Method for PnP" (@cite hesch2011direct).
+-   **SOLVEPNP_UPNP** Method is based on the paper of A. Penate-Sanchez, J. Andrade-Cetto,
+F. Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length
+Estimation" (@cite penate2013exhaustive). In this case the function also estimates the parameters \f$f_x\f$ and \f$f_y\f$
+assuming that both have the same value. Then the cameraMatrix is updated with the estimated
+focal length.
+-   **SOLVEPNP_IPPE** Method is based on the paper of T. Collins and A. Bartoli.
+"Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method requires coplanar object points.
+-   **SOLVEPNP_IPPE_SQUARE** Method is based on the paper of T. Collins and A. Bartoli.
+"Infinitesimal Plane-Based Pose Estimation" (@cite Collins14). This method is suitable for marker pose estimation.
+It requires 4 coplanar object points defined in the following order:
+  - point 0: [-squareLength / 2,  squareLength / 2, 0]
+  - point 1: [ squareLength / 2,  squareLength / 2, 0]
+  - point 2: [ squareLength / 2, -squareLength / 2, 0]
+  - point 3: [-squareLength / 2, -squareLength / 2, 0]
+@param rvec Rotation vector used to initialize an iterative PnP refinement algorithm, when flag is SOLVEPNP_ITERATIVE
+and useExtrinsicGuess is set to true.
+@param tvec Translation vector used to initialize an iterative PnP refinement algorithm, when flag is SOLVEPNP_ITERATIVE
+and useExtrinsicGuess is set to true.
+@param reprojectionError Optional vector of reprojection error, that is the RMS error
+(\f$ \text{RMSE} = \sqrt{\frac{\sum_{i}^{N} \left ( \hat{y_i} - y_i \right )^2}{N}} \f$) between the input image points
+and the 3D object points projected with the estimated pose.
+
+The function estimates the object pose given a set of object points, their corresponding image
+projections, as well as the camera matrix and the distortion coefficients, see the figure below
+(more precisely, the X-axis of the camera frame is pointing to the right, the Y-axis downward
+and the Z-axis forward).
+
+![](pnp.jpg)
+
+Points expressed in the world frame \f$ \bf{X}_w \f$ are projected into the image plane \f$ \left[ u, v \right] \f$
+using the perspective projection model \f$ \Pi \f$ and the camera intrinsic parameters matrix \f$ \bf{A} \f$:
+
+\f[
+  \begin{align*}
+  \begin{bmatrix}
+  u \\
+  v \\
+  1
+  \end{bmatrix} &=
+  \bf{A} \hspace{0.1em} \Pi \hspace{0.2em} ^{c}\bf{M}_w
+  \begin{bmatrix}
+  X_{w} \\
+  Y_{w} \\
+  Z_{w} \\
+  1
+  \end{bmatrix} \\
+  \begin{bmatrix}
+  u \\
+  v \\
+  1
+  \end{bmatrix} &=
+  \begin{bmatrix}
+  f_x & 0 & c_x \\
+  0 & f_y & c_y \\
+  0 & 0 & 1
+  \end{bmatrix}
+  \begin{bmatrix}
+  1 & 0 & 0 & 0 \\
+  0 & 1 & 0 & 0 \\
+  0 & 0 & 1 & 0
+  \end{bmatrix}
+  \begin{bmatrix}
+  r_{11} & r_{12} & r_{13} & t_x \\
+  r_{21} & r_{22} & r_{23} & t_y \\
+  r_{31} & r_{32} & r_{33} & t_z \\
+  0 & 0 & 0 & 1
+  \end{bmatrix}
+  \begin{bmatrix}
+  X_{w} \\
+  Y_{w} \\
+  Z_{w} \\
+  1
+  \end{bmatrix}
+  \end{align*}
+\f]
+
+The estimated pose is thus the rotation (`rvec`) and the translation (`tvec`) vectors that allow transforming
+a 3D point expressed in the world frame into the camera frame:
+
+\f[
+  \begin{align*}
+  \begin{bmatrix}
+  X_c \\
+  Y_c \\
+  Z_c \\
+  1
+  \end{bmatrix} &=
+  \hspace{0.2em} ^{c}\bf{M}_w
+  \begin{bmatrix}
+  X_{w} \\
+  Y_{w} \\
+  Z_{w} \\
+  1
+  \end{bmatrix} \\
+  \begin{bmatrix}
+  X_c \\
+  Y_c \\
+  Z_c \\
+  1
+  \end{bmatrix} &=
+  \begin{bmatrix}
+  r_{11} & r_{12} & r_{13} & t_x \\
+  r_{21} & r_{22} & r_{23} & t_y \\
+  r_{31} & r_{32} & r_{33} & t_z \\
+  0 & 0 & 0 & 1
+  \end{bmatrix}
+  \begin{bmatrix}
+  X_{w} \\
+  Y_{w} \\
+  Z_{w} \\
+  1
+  \end{bmatrix}
+  \end{align*}
+\f]
+
+@note
+   -   An example of how to use solvePnP for planar augmented reality can be found at
+        opencv_source_code/samples/python/plane_ar.py
+   -   If you are using Python:
+        - Numpy array slices won't work as input because solvePnP requires contiguous
+        arrays (enforced by the assertion using cv::Mat::checkVector() around line 55 of
+        modules/calib3d/src/solvepnp.cpp version 2.4.9)
+        - The P3P algorithm requires image points to be in an array of shape (N,1,2) due
+        to its calling of cv::undistortPoints (around line 75 of modules/calib3d/src/solvepnp.cpp version 2.4.9)
+        which requires 2-channel information.
+        - Thus, given some data D = np.array(...) where D.shape = (N,M), in order to use a subset of
+        it as, e.g., imagePoints, one must effectively copy it into a new array: imagePoints =
+        np.ascontiguousarray(D[:,:2]).reshape((N,1,2))
+   -   The methods **SOLVEPNP_DLS** and **SOLVEPNP_UPNP** cannot be used as the current implementations are
+       unstable and sometimes give completely wrong results. If you pass one of these two
+       flags, **SOLVEPNP_EPNP** method will be used instead.
+   -   The minimum number of points is 4 in the general case. In the case of **SOLVEPNP_P3P** and **SOLVEPNP_AP3P**
+       methods, it is required to use exactly 4 points (the first 3 points are used to estimate all the solutions
+       of the P3P problem, the last one is used to retain the best solution that minimizes the reprojection error).
+   -   With **SOLVEPNP_ITERATIVE** method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points
+       are sufficient to compute a pose but there are up to 4 solutions). The initial solution should be close to the
+       global solution to converge.
+   -   With **SOLVEPNP_IPPE** input points must be >= 4 and object points must be coplanar.
+   -   With **SOLVEPNP_IPPE_SQUARE** this is a special case suitable for marker pose estimation.
+       Number of input points must be 4. Object points must be defined in the following order:
+         - point 0: [-squareLength / 2,  squareLength / 2, 0]
+         - point 1: [ squareLength / 2,  squareLength / 2, 0]
+         - point 2: [ squareLength / 2, -squareLength / 2, 0]
+         - point 3: [-squareLength / 2, -squareLength / 2, 0]
+ */
+CV_EXPORTS_W int solvePnPGeneric( InputArray objectPoints, InputArray imagePoints,
+                                  InputArray cameraMatrix, InputArray distCoeffs,
+                                  OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                  bool useExtrinsicGuess = false, SolvePnPMethod flags = SOLVEPNP_ITERATIVE,
+                                  InputArray rvec = noArray(), InputArray tvec = noArray(),
+                                  OutputArray reprojectionError = noArray() );
+
 /** @brief Finds an initial camera matrix from 3D-2D point correspondences.
 
 @param objectPoints Vector of vectors of the calibration pattern points in the calibration pattern
@@ -1041,7 +1277,7 @@ CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSiz
 @param distCoeffs Input vector of distortion coefficients
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
 4, 5, 8, 12 or 14 elements. If the vector is empty, the zero distortion coefficients are assumed.
-@param rvec Rotation vector (see @ref Rodrigues ) that, together with tvec , brings points from
+@param rvec Rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
 the model coordinate system to the camera coordinate system.
 @param tvec Translation vector.
 @param length Length of the painted axes in the same unit than tvec (usually in meters).
diff --git a/modules/calib3d/src/ap3p.cpp b/modules/calib3d/src/ap3p.cpp
index 7b86834db8..11171f81a6 100644
--- a/modules/calib3d/src/ap3p.cpp
+++ b/modules/calib3d/src/ap3p.cpp
@@ -1,3 +1,4 @@
+#include "precomp.hpp"
 #include "ap3p.h"
 
 #include <cmath>
@@ -154,10 +155,11 @@ ap3p::ap3p(double _fx, double _fy, double _cx, double _cy) {
 // worldPoints: The positions of the 3 feature points stored as column vectors
 // solutionsR: 4 possible solutions of rotation matrix of the world w.r.t the camera frame
 // solutionsT: 4 possible solutions of translation of the world origin w.r.t the camera frame
-int ap3p::computePoses(const double featureVectors[3][3],
-                       const double worldPoints[3][3],
+int ap3p::computePoses(const double featureVectors[3][4],
+                       const double worldPoints[3][4],
                        double solutionsR[4][3][3],
-                       double solutionsT[4][3]) {
+                       double solutionsT[4][3],
+                       bool p4p) {
 
     //world point vectors
     double w1[3] = {worldPoints[0][0], worldPoints[1][0], worldPoints[2][0]};
@@ -246,6 +248,13 @@ int ap3p::computePoses(const double featureVectors[3][3],
     double b3p[3];
     vect_scale((delta / k3b3), b3, b3p);
 
+    double X3 = worldPoints[0][3];
+    double Y3 = worldPoints[1][3];
+    double Z3 = worldPoints[2][3];
+    double mu3 = featureVectors[0][3];
+    double mv3 = featureVectors[1][3];
+    double reproj_errors[4];
+
     int nb_solutions = 0;
     for (int i = 0; i < 4; ++i) {
         double ctheta1p = s[i];
@@ -290,9 +299,29 @@ int ap3p::computePoses(const double featureVectors[3][3],
         solutionsR[nb_solutions][1][2] = R[2][1];
         solutionsR[nb_solutions][2][2] = R[2][2];
 
+        if (p4p) {
+            double X3p = solutionsR[nb_solutions][0][0] * X3 + solutionsR[nb_solutions][0][1] * Y3 + solutionsR[nb_solutions][0][2] * Z3 + solutionsT[nb_solutions][0];
+            double Y3p = solutionsR[nb_solutions][1][0] * X3 + solutionsR[nb_solutions][1][1] * Y3 + solutionsR[nb_solutions][1][2] * Z3 + solutionsT[nb_solutions][1];
+            double Z3p = solutionsR[nb_solutions][2][0] * X3 + solutionsR[nb_solutions][2][1] * Y3 + solutionsR[nb_solutions][2][2] * Z3 + solutionsT[nb_solutions][2];
+            double mu3p = X3p / Z3p;
+            double mv3p = Y3p / Z3p;
+            reproj_errors[nb_solutions] = (mu3p - mu3) * (mu3p - mu3) + (mv3p - mv3) * (mv3p - mv3);
+        }
+
         nb_solutions++;
     }
 
+    //sort the solutions
+    if (p4p) {
+        for (int i = 1; i < nb_solutions; i++) {
+            for (int j = i; j > 0 && reproj_errors[j-1] > reproj_errors[j]; j--) {
+                std::swap(reproj_errors[j], reproj_errors[j-1]);
+                std::swap(solutionsR[j], solutionsR[j-1]);
+                std::swap(solutionsT[j], solutionsT[j-1]);
+            }
+        }
+    }
+
     return nb_solutions;
 }
 
@@ -311,9 +340,10 @@ bool ap3p::solve(cv::Mat &R, cv::Mat &tvec, const cv::Mat &opoints, const cv::Ma
     else
         extract_points<cv::Point3d, cv::Point2f>(opoints, ipoints, points);
 
-    bool result = solve(rotation_matrix, translation, points[0], points[1], points[2], points[3], points[4], points[5],
-                        points[6], points[7], points[8], points[9], points[10], points[11], points[12], points[13],
-                        points[14],
+    bool result = solve(rotation_matrix, translation,
+                        points[0], points[1], points[2], points[3], points[4],
+                        points[5], points[6], points[7], points[8], points[9],
+                        points[10], points[11], points[12], points[13],points[14],
                         points[15], points[16], points[17], points[18], points[19]);
     cv::Mat(3, 1, CV_64F, translation).copyTo(tvec);
     cv::Mat(3, 3, CV_64F, rotation_matrix).copyTo(R);
@@ -335,10 +365,13 @@ int ap3p::solve(std::vector<cv::Mat> &Rs, std::vector<cv::Mat> &tvecs, const cv:
     else
         extract_points<cv::Point3d, cv::Point2f>(opoints, ipoints, points);
 
+    const bool p4p = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F)) == 4;
     int solutions = solve(rotation_matrix, translation,
                           points[0], points[1], points[2], points[3], points[4],
                           points[5], points[6], points[7], points[8], points[9],
-                          points[10], points[11], points[12], points[13], points[14]);
+                          points[10], points[11], points[12], points[13], points[14],
+                          points[15], points[16], points[17], points[18], points[19],
+                          p4p);
 
     for (int i = 0; i < solutions; i++) {
         cv::Mat R, tvec;
@@ -353,42 +386,33 @@ int ap3p::solve(std::vector<cv::Mat> &Rs, std::vector<cv::Mat> &tvecs, const cv:
 }
 
 bool
-ap3p::solve(double R[3][3], double t[3], double mu0, double mv0, double X0, double Y0, double Z0, double mu1,
-            double mv1,
-            double X1, double Y1, double Z1, double mu2, double mv2, double X2, double Y2, double Z2, double mu3,
-            double mv3, double X3, double Y3, double Z3) {
+ap3p::solve(double R[3][3], double t[3],
+            double mu0, double mv0, double X0, double Y0, double Z0,
+            double mu1, double mv1, double X1, double Y1, double Z1,
+            double mu2, double mv2, double X2, double Y2, double Z2,
+            double mu3, double mv3, double X3, double Y3, double Z3) {
     double Rs[4][3][3], ts[4][3];
 
-    int n = solve(Rs, ts, mu0, mv0, X0, Y0, Z0, mu1, mv1, X1, Y1, Z1, mu2, mv2, X2, Y2, Z2);
+    const bool p4p = true;
+    int n = solve(Rs, ts, mu0, mv0, X0, Y0, Z0, mu1, mv1, X1, Y1, Z1, mu2, mv2, X2, Y2, Z2, mu3, mv3, X3, Y3, Z3, p4p);
     if (n == 0)
         return false;
 
-    int ns = 0;
-    double min_reproj = 0;
-    for (int i = 0; i < n; i++) {
-        double X3p = Rs[i][0][0] * X3 + Rs[i][0][1] * Y3 + Rs[i][0][2] * Z3 + ts[i][0];
-        double Y3p = Rs[i][1][0] * X3 + Rs[i][1][1] * Y3 + Rs[i][1][2] * Z3 + ts[i][1];
-        double Z3p = Rs[i][2][0] * X3 + Rs[i][2][1] * Y3 + Rs[i][2][2] * Z3 + ts[i][2];
-        double mu3p = cx + fx * X3p / Z3p;
-        double mv3p = cy + fy * Y3p / Z3p;
-        double reproj = (mu3p - mu3) * (mu3p - mu3) + (mv3p - mv3) * (mv3p - mv3);
-        if (i == 0 || min_reproj > reproj) {
-            ns = i;
-            min_reproj = reproj;
-        }
-    }
-
     for (int i = 0; i < 3; i++) {
         for (int j = 0; j < 3; j++)
-            R[i][j] = Rs[ns][i][j];
-        t[i] = ts[ns][i];
+            R[i][j] = Rs[0][i][j];
+        t[i] = ts[0][i];
     }
 
     return true;
 }
 
-int ap3p::solve(double R[4][3][3], double t[4][3], double mu0, double mv0, double X0, double Y0, double Z0, double mu1,
-                double mv1, double X1, double Y1, double Z1, double mu2, double mv2, double X2, double Y2, double Z2) {
+int ap3p::solve(double R[4][3][3], double t[4][3],
+                double mu0, double mv0, double X0, double Y0, double Z0,
+                double mu1, double mv1, double X1, double Y1, double Z1,
+                double mu2, double mv2, double X2, double Y2, double Z2,
+                double mu3, double mv3, double X3, double Y3, double Z3,
+                bool p4p) {
     double mk0, mk1, mk2;
     double norm;
 
@@ -413,13 +437,17 @@ int ap3p::solve(double R[4][3][3], double t[4][3], double mu0, double mv0, doubl
     mu2 *= mk2;
     mv2 *= mk2;
 
-    double featureVectors[3][3] = {{mu0, mu1, mu2},
-                                   {mv0, mv1, mv2},
-                                   {mk0, mk1, mk2}};
-    double worldPoints[3][3] = {{X0, X1, X2},
-                                {Y0, Y1, Y2},
-                                {Z0, Z1, Z2}};
+    mu3 = inv_fx * mu3 - cx_fx;
+    mv3 = inv_fy * mv3 - cy_fy;
+    double mk3 = 1; //not used
 
-    return computePoses(featureVectors, worldPoints, R, t);
+    double featureVectors[3][4] = {{mu0, mu1, mu2, mu3},
+                                   {mv0, mv1, mv2, mv3},
+                                   {mk0, mk1, mk2, mk3}};
+    double worldPoints[3][4] = {{X0, X1, X2, X3},
+                                {Y0, Y1, Y2, Y3},
+                                {Z0, Z1, Z2, Z3}};
+
+    return computePoses(featureVectors, worldPoints, R, t, p4p);
 }
 }
diff --git a/modules/calib3d/src/ap3p.h b/modules/calib3d/src/ap3p.h
index df44198115..c044c6fd32 100644
--- a/modules/calib3d/src/ap3p.h
+++ b/modules/calib3d/src/ap3p.h
@@ -1,7 +1,7 @@
 #ifndef P3P_P3P_H
 #define P3P_P3P_H
 
-#include "precomp.hpp"
+#include <opencv2/core.hpp>
 
 namespace cv {
 class ap3p {
@@ -18,7 +18,7 @@ private:
     void extract_points(const cv::Mat &opoints, const cv::Mat &ipoints, std::vector<double> &points) {
         points.clear();
         int npoints = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F));
-        points.resize(5*npoints);
+        points.resize(5*4); //resize vector to fit for p4p case
         for (int i = 0; i < npoints; i++) {
             points[i * 5] = ipoints.at<IpointType>(i).x * fx + cx;
             points[i * 5 + 1] = ipoints.at<IpointType>(i).y * fy + cy;
@@ -26,6 +26,12 @@ private:
             points[i * 5 + 3] = opoints.at<OpointType>(i).y;
             points[i * 5 + 4] = opoints.at<OpointType>(i).z;
         }
+        //Fill vectors with unused values for p3p case
+        for (int i = npoints; i < 4; i++) {
+            for (int j = 0; j < 5; j++) {
+                points[i * 5 + j] = 0;
+            }
+        }
     }
 
     void init_inverse_parameters();
@@ -45,7 +51,9 @@ public:
     int solve(double R[4][3][3], double t[4][3],
               double mu0, double mv0, double X0, double Y0, double Z0,
               double mu1, double mv1, double X1, double Y1, double Z1,
-              double mu2, double mv2, double X2, double Y2, double Z2);
+              double mu2, double mv2, double X2, double Y2, double Z2,
+              double mu3, double mv3, double X3, double Y3, double Z3,
+              bool p4p);
 
     bool solve(double R[3][3], double t[3],
                double mu0, double mv0, double X0, double Y0, double Z0,
@@ -59,8 +67,8 @@ public:
     // worldPoints: Positions of the 3 feature points stored as column vectors
     // solutionsR: 4 possible solutions of rotation matrix of the world w.r.t the camera frame
     // solutionsT: 4 possible solutions of translation of the world origin w.r.t the camera frame
-    int computePoses(const double featureVectors[3][3], const double worldPoints[3][3], double solutionsR[4][3][3],
-                     double solutionsT[4][3]);
+    int computePoses(const double featureVectors[3][4], const double worldPoints[3][4], double solutionsR[4][3][3],
+                     double solutionsT[4][3], bool p4p);
 
 };
 }
diff --git a/modules/calib3d/src/ippe.cpp b/modules/calib3d/src/ippe.cpp
new file mode 100644
index 0000000000..74a2864525
--- /dev/null
+++ b/modules/calib3d/src/ippe.cpp
@@ -0,0 +1,1100 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This file is based on file issued with the following license:
+
+/*============================================================================
+
+Copyright 2017 Toby Collins
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this
+   list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "precomp.hpp"
+#include "ippe.hpp"
+
+namespace cv {
+namespace IPPE {
+PoseSolver::PoseSolver() : IPPE_SMALL(1e-3) //the only work done here is initialising the member IPPE_SMALL to 1e-3
+{
+}
+
+void PoseSolver::solveGeneric(InputArray _objectPoints, InputArray _imagePoints, OutputArray _rvec1, OutputArray _tvec1,
+                              float& err1, OutputArray _rvec2, OutputArray _tvec2, float& err2)
+{
+    //Estimates both candidate poses of the object and returns them, in the order chosen by sortPosesByReprojError(),
+    //as rotation vectors (_rvec1, _rvec2) and translations (_tvec1, _tvec2) together with err1 and err2.
+    //single precision image points are promoted to double, any other type is passed through untouched:
+    const Mat imagePoints = _imagePoints.getMat();
+    Mat pointsAsDouble;
+    if (imagePoints.type() != CV_32FC2)
+        pointsAsDouble = imagePoints;
+    else
+        imagePoints.convertTo(pointsAsDouble, CV_64F);
+
+    //the two (still unordered) pose hypotheses:
+    Mat poseA, poseB;
+    solveGeneric(_objectPoints, pointsAsDouble, poseA, poseB);
+
+    //order the hypotheses by reprojection error; err1 and err2 are filled in by the same call:
+    Mat first, second;
+    sortPosesByReprojError(_objectPoints, pointsAsDouble, poseA, poseB, first, second, err1, err2);
+
+    //rotation: rows 0..2 x columns 0..2 of each pose, converted to a rotation vector
+    const Range rows(0, 3);
+    rot2vec(first(rows, Range(0, 3)), _rvec1);
+    rot2vec(second(rows, Range(0, 3)), _rvec2);
+
+    //translation: rows 0..2 of column 3 of each pose
+    first(rows, Range(3, 4)).copyTo(_tvec1);
+    second(rows, Range(3, 4)).copyTo(_tvec2);
+}
+
+//Computes the two candidate poses (_Ma, _Mb) of a planar object from its 3D points and the matching normalized image points.
+void PoseSolver::solveGeneric(InputArray _objectPoints, InputArray _normalizedInputPoints, OutputArray _Ma, OutputArray _Mb)
+{
+    //argument checking (both point sets must be 1xN or Nx1, at least 4 object points, same count on both sides):
+    size_t n = static_cast<size_t>(_objectPoints.rows() * _objectPoints.cols()); //number of points
+    int objType = _objectPoints.type();
+    int type_input = _normalizedInputPoints.type();
+
+    CV_CheckType(objType, objType == CV_32FC3 || objType == CV_64FC3,
+                 "Type of _objectPoints must be CV_32FC3 or CV_64FC3" );
+    CV_CheckType(type_input, type_input == CV_32FC2 || type_input == CV_64FC2,
+                 "Type of _normalizedInputPoints must be CV_32FC2 or CV_64FC2" );
+    CV_Assert(_objectPoints.rows() == 1 || _objectPoints.cols() == 1);
+    CV_Assert(_objectPoints.rows() >= 4 || _objectPoints.cols() >= 4);
+    CV_Assert(_normalizedInputPoints.rows() == 1 || _normalizedInputPoints.cols() == 1);
+    CV_Assert(static_cast<size_t>(_normalizedInputPoints.rows() * _normalizedInputPoints.cols()) == n); //one image point per object point
+
+    Mat normalizedInputPoints;
+    if (type_input == CV_32FC2)
+    {
+        _normalizedInputPoints.getMat().convertTo(normalizedInputPoints, CV_64F);
+    }
+    else
+    {
+        normalizedInputPoints = _normalizedInputPoints.getMat();
+    }
+
+    Mat objectInputPoints;
+    if (objType == CV_32FC3) //decided by the type of the object points (not of the image points, which are never 3-channel)
+    {
+        _objectPoints.getMat().convertTo(objectInputPoints, CV_64F);
+    }
+    else
+    {
+        objectInputPoints = _objectPoints.getMat();
+    }
+
+    Mat canonicalObjPoints;
+    Mat MmodelPoints2Canonical;
+
+    //transform object points to the canonical position (zero centred and on the plane z=0):
+    makeCanonicalObjectPoints(objectInputPoints, canonicalObjPoints, MmodelPoints2Canonical);
+
+    //compute the homography mapping the model's points to normalizedInputPoints
+    Matx33d H;
+    HomographyHO::homographyHO(canonicalObjPoints, _normalizedInputPoints, H);
+
+    //now solve
+    Mat MaCanon, MbCanon;
+    solveCanonicalForm(canonicalObjPoints, normalizedInputPoints, H, MaCanon, MbCanon);
+
+    //transform computed poses to account for canonical transform:
+    Mat Ma = MaCanon * MmodelPoints2Canonical;
+    Mat Mb = MbCanon * MmodelPoints2Canonical;
+
+    //output poses:
+    Ma.copyTo(_Ma);
+    Mb.copyTo(_Mb);
+}
+
+void PoseSolver::solveCanonicalForm(InputArray _canonicalObjPoints, InputArray _normalizedInputPoints, const Matx33d& H,
+                                    OutputArray _Ma, OutputArray _Mb)
+{
+    //Builds the two 4x4 pose solutions (_Ma, _Mb) for object points in canonical position from the homography H.
+    //allocate both poses as 4x4 double matrices that are zero everywhere except for the bottom-right element:
+    _Ma.create(4, 4, CV_64FC1);
+    Mat poseA = _Ma.getMat();
+    poseA.setTo(0);
+    poseA.at<double>(3, 3) = 1;
+    _Mb.create(4, 4, CV_64FC1);
+    Mat poseB = _Mb.getMat();
+    poseB.setTo(0);
+    poseB.at<double>(3, 3) = 1;
+
+    //transformation of (0,0) into the image:
+    const double v0 = H(0, 2);
+    const double v1 = H(1, 2);
+
+    //Jacobian J of the homography at (0,0):
+    const double j00 = H(0, 0) - H(2, 0) * v0;
+    const double j01 = H(0, 1) - H(2, 1) * v0;
+    const double j10 = H(1, 0) - H(2, 0) * v1;
+    const double j11 = H(1, 1) - H(2, 1) * v1;
+
+    //Ra and Rb are views of rows 0..2 x columns 0..2 of the poses, handed to computeRotations() as its outputs:
+    const Range top3(0, 3);
+    Mat Ra = poseA(top3, Range(0, 3));
+    Mat Rb = poseB(top3, Range(0, 3));
+    computeRotations(j00, j01, j10, j11, v0, v1, Ra, Rb);
+
+    //ta and tb are views of rows 0..2 of column 3; each rotation solution gets its own translation solution:
+    Mat ta = poseA(top3, Range(3, 4));
+    Mat tb = poseB(top3, Range(3, 4));
+    computeTranslation(_canonicalObjPoints, _normalizedInputPoints, Ra, ta);
+    computeTranslation(_canonicalObjPoints, _normalizedInputPoints, Rb, tb);
+}
+
+void PoseSolver::solveSquare(InputArray _objectPoints, InputArray _imagePoints, OutputArray _rvec1, OutputArray _tvec1,
+                             float& err1, OutputArray _rvec2, OutputArray _tvec2, float& err2)
+{   //Computes two poses of a 4-corner square from its object and image points; outputs are ordered by sortPosesByReprojError().
+    //allocate outputs (3x1 double rotation vectors and translations):
+    _rvec1.create(3, 1, CV_64FC1);
+    _tvec1.create(3, 1, CV_64FC1);
+    _rvec2.create(3, 1, CV_64FC1);
+    _tvec2.create(3, 1, CV_64FC1);
+
+    Mat objectPoints2D;
+
+    //generate the 2D object points (x and y of the four corners, z is dropped), always stored as doubles:
+    objectPoints2D.create(1, 4, CV_64FC2);
+    Mat objectPoints = _objectPoints.getMat();
+    double squareLength; //distance between corner 0 and corner 1, measured on x and y only
+    if (objectPoints.depth() == CV_32F) //single precision input: corners are read as Vec3f
+    {
+        objectPoints2D.ptr<Vec2d>(0)[0] = Vec2d(objectPoints.ptr<Vec3f>(0)[0](0), objectPoints.ptr<Vec3f>(0)[0](1));
+        objectPoints2D.ptr<Vec2d>(0)[1] = Vec2d(objectPoints.ptr<Vec3f>(0)[1](0), objectPoints.ptr<Vec3f>(0)[1](1));
+        objectPoints2D.ptr<Vec2d>(0)[2] = Vec2d(objectPoints.ptr<Vec3f>(0)[2](0), objectPoints.ptr<Vec3f>(0)[2](1));
+        objectPoints2D.ptr<Vec2d>(0)[3] = Vec2d(objectPoints.ptr<Vec3f>(0)[3](0), objectPoints.ptr<Vec3f>(0)[3](1));
+
+        squareLength = sqrt( (objectPoints.ptr<Vec3f>(0)[1](0) - objectPoints.ptr<Vec3f>(0)[0](0))*
+                             (objectPoints.ptr<Vec3f>(0)[1](0) - objectPoints.ptr<Vec3f>(0)[0](0)) +
+                             (objectPoints.ptr<Vec3f>(0)[1](1) - objectPoints.ptr<Vec3f>(0)[0](1))*
+                             (objectPoints.ptr<Vec3f>(0)[1](1) - objectPoints.ptr<Vec3f>(0)[0](1)) );
+    }
+    else //every other depth is read as Vec3d - NOTE(review): the depth is not verified to be CV_64F here
+    {
+        objectPoints2D.ptr<Vec2d>(0)[0] = Vec2d(objectPoints.ptr<Vec3d>(0)[0](0), objectPoints.ptr<Vec3d>(0)[0](1));
+        objectPoints2D.ptr<Vec2d>(0)[1] = Vec2d(objectPoints.ptr<Vec3d>(0)[1](0), objectPoints.ptr<Vec3d>(0)[1](1));
+        objectPoints2D.ptr<Vec2d>(0)[2] = Vec2d(objectPoints.ptr<Vec3d>(0)[2](0), objectPoints.ptr<Vec3d>(0)[2](1));
+        objectPoints2D.ptr<Vec2d>(0)[3] = Vec2d(objectPoints.ptr<Vec3d>(0)[3](0), objectPoints.ptr<Vec3d>(0)[3](1));
+
+        squareLength = sqrt( (objectPoints.ptr<Vec3d>(0)[1](0) - objectPoints.ptr<Vec3d>(0)[0](0))*
+                             (objectPoints.ptr<Vec3d>(0)[1](0) - objectPoints.ptr<Vec3d>(0)[0](0)) +
+                             (objectPoints.ptr<Vec3d>(0)[1](1) - objectPoints.ptr<Vec3d>(0)[0](1))*
+                             (objectPoints.ptr<Vec3d>(0)[1](1) - objectPoints.ptr<Vec3d>(0)[0](1)) );
+    }
+
+    Mat H; //homography from canonical object points to normalized pixels
+
+    Mat normalizedInputPoints; //image points as doubles: CV_32FC2 input is converted, any other type is used as is
+    if (_imagePoints.getMat().type() == CV_32FC2)
+    {
+        _imagePoints.getMat().convertTo(normalizedInputPoints, CV_64F);
+    }
+    else
+    {
+        normalizedInputPoints = _imagePoints.getMat();
+    }
+
+    //compute H from the image points and half the side length of the square:
+    homographyFromSquarePoints(normalizedInputPoints, squareLength / 2.0, H);
+
+    //now solve (with the 2D object points built above)
+    Mat Ma, Mb;
+    solveCanonicalForm(objectPoints2D, normalizedInputPoints, H, Ma, Mb);
+
+    //sort poses according to reprojection error (this call receives the original 3D _objectPoints):
+    Mat M1, M2;
+    sortPosesByReprojError(_objectPoints, normalizedInputPoints, Ma, Mb, M1, M2, err1, err2);
+
+    //fill outputs: rows 0..2 x columns 0..2 become the rotation vectors, rows 0..2 of column 3 the translations
+    rot2vec(M1.colRange(0, 3).rowRange(0, 3), _rvec1);
+    rot2vec(M2.colRange(0, 3).rowRange(0, 3), _rvec2);
+
+    M1.colRange(3, 4).rowRange(0, 3).copyTo(_tvec1);
+    M2.colRange(3, 4).rowRange(0, 3).copyTo(_tvec2);
+}
+
+void PoseSolver::generateSquareObjectCorners3D(double squareLength, OutputArray _objectPoints)
+{
+    //1x4 CV_64FC3 corners (-h, h, 0), (h, h, 0), (h, -h, 0), (-h, -h, 0) with h = squareLength / 2
+    const double signX[4] = {-1.0, 1.0, 1.0, -1.0}, signY[4] = {1.0, 1.0, -1.0, -1.0};
+    _objectPoints.create(1, 4, CV_64FC3);
+    Mat corners = _objectPoints.getMat();
+    for (int i = 0; i < 4; i++)
+        corners.at<Vec3d>(0, i) = Vec3d(signX[i] * squareLength / 2.0, signY[i] * squareLength / 2.0, 0.0);
+}
+
+void PoseSolver::generateSquareObjectCorners2D(double squareLength, OutputArray _objectPoints)
+{
+    //1x4 CV_64FC2 corners (-h, h), (h, h), (h, -h), (-h, -h) with h = squareLength / 2
+    const double signX[4] = {-1.0, 1.0, 1.0, -1.0}, signY[4] = {1.0, 1.0, -1.0, -1.0};
+    _objectPoints.create(1, 4, CV_64FC2);
+    Mat corners = _objectPoints.getMat();
+    for (int i = 0; i < 4; i++)
+        corners.at<Vec2d>(0, i) = Vec2d(signX[i] * squareLength / 2.0, signY[i] * squareLength / 2.0);
+}
+
+double PoseSolver::meanSceneDepth(InputArray _objectPoints, InputArray _rvec, InputArray _tvec)
+{
+    //Returns the mean of the z coordinates obtained by applying the pose (_rvec, _tvec) to every object point.
+    CV_CheckType(_objectPoints.type(), _objectPoints.type() == CV_64FC3,
+                 "Type of _objectPoints must be CV_64FC3" );
+
+    const int pointCount = _objectPoints.rows() * _objectPoints.cols();
+    const Mat objectPoints = _objectPoints.getMat();
+    const Mat translation = _tvec.getMat();
+
+    //rotation matrix equivalent to the rotation vector:
+    Mat rotation;
+    Rodrigues(_rvec, rotation);
+
+    double depthSum = 0;
+    for (int idx = 0; idx < pointCount; ++idx)
+    {
+        //apply the pose to point idx: transformed = rotation * point + translation
+        Mat point(objectPoints.at<Point3d>(idx));
+        Mat transformed = rotation * point + translation;
+
+        //read the third element with the accessor that matches the depth of the result:
+        const bool isDouble = transformed.depth() == CV_64F;
+        depthSum += isDouble ? transformed.at<double>(2) : static_cast<double>(transformed.at<float>(2));
+    }
+
+    return depthSum / static_cast<double>(pointCount);
+}
+
+//Converts the 3x3 CV_64FC1 rotation matrix _R into the 3x1 rotation vector _r (rotation axis scaled by the angle).
+void PoseSolver::rot2vec(InputArray _R, OutputArray _r)
+{
+    CV_CheckType(_R.type(), _R.type() == CV_64FC1,
+                 "Type of _R must be CV_64FC1" );
+    CV_Assert(_R.rows() == 3 && _R.cols() == 3);
+
+    _r.create(3, 1, CV_64FC1);
+    Mat R = _R.getMat();
+    Mat rvec = _r.getMat();
+
+    //cos(angle) = (trace(R) - 1) / 2; rounding can push it slightly outside [-1, 1], where acos() yields NaN, so clamp it:
+    double trace = R.at<double>(0, 0) + R.at<double>(1, 1) + R.at<double>(2, 2);
+    double cos_w = (trace - 1.0) / 2.0;
+    cos_w = cos_w > 1.0 ? 1.0 : (cos_w < -1.0 ? -1.0 : cos_w);
+    double w_norm = acos(cos_w);
+    double eps = std::numeric_limits<float>::epsilon();
+    if (w_norm < eps) //rotation is the identity
+    {
+        rvec.setTo(0);
+    }
+    else
+    {
+        //the scale divides by sin(w_norm), so it is only evaluated once the zero-angle case has been ruled out
+        double d = 1 / (2 * sin(w_norm)) * w_norm;
+        rvec.at<double>(0) = d * (R.at<double>(2, 1) - R.at<double>(1, 2));
+        rvec.at<double>(1) = d * (R.at<double>(0, 2) - R.at<double>(2, 0));
+        rvec.at<double>(2) = d * (R.at<double>(1, 0) - R.at<double>(0, 1));
+    }
+}
+
+void PoseSolver::computeTranslation(InputArray _objectPoints, InputArray _normalizedImgPoints, InputArray _R, OutputArray _t)
+{   //Solves for the 3x1 translation _t that goes with the rotation _R, given 2D object points and normalized image points.
+    //This is solved by building the linear system At = b, where t corresponds to the (unknown) translation.
+    //This is then inverted with the associated normal equations to give t = inv(transpose(A)*A)*transpose(A)*b
+    //For efficiency we only store the coefficients of (transpose(A)*A) and (transpose(A)*b)
+
+    CV_CheckType(_objectPoints.type(), _objectPoints.type() == CV_64FC2,
+                 "Type of _objectPoints must be CV_64FC2" );
+    CV_CheckType(_normalizedImgPoints.type(), _normalizedImgPoints.type() == CV_64FC2,
+                 "Type of _normalizedImgPoints must be CV_64FC2" );
+    CV_CheckType(_R.type(), _R.type() == CV_64FC1,
+                 "Type of _R must be CV_64FC1" );
+    CV_Assert(_R.rows() == 3 && _R.cols() == 3);
+    CV_Assert(_objectPoints.rows() == 1 || _objectPoints.cols() == 1);
+    CV_Assert(_normalizedImgPoints.rows() == 1 || _normalizedImgPoints.cols() == 1);
+
+    size_t n = static_cast<size_t>(_normalizedImgPoints.rows() * _normalizedImgPoints.cols()); //number of image points
+    CV_Assert(n == static_cast<size_t>(_objectPoints.rows() * _objectPoints.cols())); //must equal the number of object points
+
+    Mat objectPoints = _objectPoints.getMat();
+    Mat imgPoints = _normalizedImgPoints.getMat();
+
+    _t.create(3, 1, CV_64FC1);
+
+    Mat R = _R.getMat();
+
+    //coefficients of (transpose(A)*A); entries (0,1) and (1,0) are not stored and do not appear in the formulas below
+    double ATA00 = static_cast<double>(n);
+    double ATA02 = 0;
+    double ATA11 = static_cast<double>(n);
+    double ATA12 = 0;
+    double ATA20 = 0;
+    double ATA21 = 0;
+    double ATA22 = 0;
+
+    //coefficients of (transpose(A)*b)
+    double ATb0 = 0;
+    double ATb1 = 0;
+    double ATb2 = 0;
+
+    //now loop through each point and increment the coefficients:
+    for (int i = 0; i < static_cast<int>(n); i++)
+    {
+        const Vec2d& objPt = objectPoints.at<Vec2d>(i);
+        double rx = R.at<double>(0, 0) * objPt(0) + R.at<double>(0, 1) * objPt(1); //(rx, ry, rz): first two columns of R applied to the object point
+        double ry = R.at<double>(1, 0) * objPt(0) + R.at<double>(1, 1) * objPt(1);
+        double rz = R.at<double>(2, 0) * objPt(0) + R.at<double>(2, 1) * objPt(1);
+
+        const Vec2d& imgPt = imgPoints.at<Vec2d>(i);
+        double a2 = -imgPt(0); //negated image x
+        double b2 = -imgPt(1); //negated image y
+
+        ATA02 = ATA02 + a2;
+        ATA12 = ATA12 + b2;
+        ATA20 = ATA20 + a2;
+        ATA21 = ATA21 + b2;
+        ATA22 = ATA22 + a2 * a2 + b2 * b2;
+
+        double bx = -a2 * rz - rx; //the two right-hand side entries contributed by this point
+        double by = -b2 * rz - ry;
+
+        ATb0 = ATb0 + bx;
+        ATb1 = ATb1 + by;
+        ATb2 = ATb2 + a2 * bx + b2 * by;
+    }
+
+    double detAInv = 1.0 / (ATA00 * ATA11 * ATA22 - ATA00 * ATA12 * ATA21 - ATA02 * ATA11 * ATA20); //NOTE(review): no guard against a zero denominator
+
+    //S is the adjugate of transpose(A)*A, so that inv(transpose(A)*A) = detAInv * S
+    //construct S:
+    double S00 = ATA11 * ATA22 - ATA12 * ATA21;
+    double S01 = ATA02 * ATA21;
+    double S02 = -ATA02 * ATA11;
+    double S10 = ATA12 * ATA20;
+    double S11 = ATA00 * ATA22 - ATA02 * ATA20;
+    double S12 = -ATA00 * ATA12;
+    double S20 = -ATA11 * ATA20;
+    double S21 = -ATA00 * ATA21;
+    double S22 = ATA00 * ATA11;
+
+    //solve t: t = detAInv * S * (transpose(A)*b)
+    Mat t = _t.getMat();
+    t.at<double>(0) = detAInv * (S00 * ATb0 + S01 * ATb1 + S02 * ATb2);
+    t.at<double>(1) = detAInv * (S10 * ATb0 + S11 * ATb1 + S12 * ATb2);
+    t.at<double>(2) = detAInv * (S20 * ATb0 + S21 * ATb1 + S22 * ATb2);
+}
+
+void PoseSolver::computeRotations(double j00, double j01, double j10, double j11, double p, double q, OutputArray _R1, OutputArray _R2)
+{   //Computes the two candidate 3x3 CV_64FC1 rotations _R1 and _R2 from the 2x2 entries j00..j11 and the point (p, q).
+    //This is fairly optimized code which makes it hard to understand. The matlab code is certainly easier to read.
+    _R1.create(3, 3, CV_64FC1);
+    _R2.create(3, 3, CV_64FC1);
+
+    Matx33d Rv;
+    Matx31d v(p, q, 1);
+    rotateVec2ZAxis(v,Rv);
+    Rv = Rv.t(); //the transpose of the matrix returned by rotateVec2ZAxis() is what is used from here on
+
+    //setup the 2x2 SVD decomposition:
+    double rv00 = Rv(0,0);
+    double rv01 = Rv(0,1);
+    double rv02 = Rv(0,2);
+
+    double rv10 = Rv(1,0);
+    double rv11 = Rv(1,1);
+    double rv12 = Rv(1,2);
+
+    double rv20 = Rv(2,0);
+    double rv21 = Rv(2,1);
+    double rv22 = Rv(2,2);
+
+    double b00 = rv00 - p * rv20; //2x2 matrix B built from Rv, p and q
+    double b01 = rv01 - p * rv21;
+    double b10 = rv10 - q * rv20;
+    double b11 = rv11 - q * rv21;
+
+    double dtinv = 1.0 / ((b00 * b11 - b01 * b10)); //1 / det(B) - NOTE(review): no guard against a zero determinant
+
+    double binv00 = dtinv * b11; //inverse of B
+    double binv01 = -dtinv * b01;
+    double binv10 = -dtinv * b10;
+    double binv11 = dtinv * b00;
+
+    double a00 = binv00 * j00 + binv01 * j10; //A = inv(B) * J
+    double a01 = binv00 * j01 + binv01 * j11;
+    double a10 = binv10 * j00 + binv11 * j10;
+    double a11 = binv10 * j01 + binv11 * j11;
+
+    //compute the largest singular value of A:
+    double ata00 = a00 * a00 + a01 * a01; //entries of the symmetric matrix A * transpose(A)
+    double ata01 = a00 * a10 + a01 * a11;
+    double ata11 = a10 * a10 + a11 * a11;
+
+    double gamma2 = 0.5 * (ata00 + ata11 + sqrt((ata00 - ata11) * (ata00 - ata11) + 4.0 * ata01 * ata01)); //its largest eigenvalue
+    if (gamma2 < 0)
+        CV_Error(Error::StsNoConv, "gamma2 is negative.");
+
+    double gamma = sqrt(gamma2);
+
+    if (std::fabs(gamma) < std::numeric_limits<float>::epsilon())
+        CV_Error(Error::StsNoConv, "gamma is zero.");
+
+    //reconstruct the full rotation matrices:
+    double rtilde00 = a00 / gamma; //A scaled by 1 / gamma
+    double rtilde01 = a01 / gamma;
+    double rtilde10 = a10 / gamma;
+    double rtilde11 = a11 / gamma;
+
+    double rtilde00_2 = rtilde00 * rtilde00;
+    double rtilde01_2 = rtilde01 * rtilde01;
+    double rtilde10_2 = rtilde10 * rtilde10;
+    double rtilde11_2 = rtilde11 * rtilde11;
+
+    double b0 = sqrt(-rtilde00_2 - rtilde10_2 + 1); //NOTE(review): a slightly negative argument (rounding) would give NaN; not clamped
+    double b1 = sqrt(-rtilde01_2 - rtilde11_2 + 1);
+    double sp = (-rtilde00 * rtilde01 - rtilde10 * rtilde11);
+
+    if (sp < 0) //the sign of b1 follows the sign of sp
+    {
+        b1 = -b1;
+    }
+
+    //store results:
+    Mat R1 = _R1.getMat();
+    Mat R2 = _R2.getMat();
+
+    R1.at<double>(0, 0) = (rtilde00)*rv00 + (rtilde10)*rv01 + (b0)*rv02; //first solution: uses (b0, b1)
+    R1.at<double>(0, 1) = (rtilde01)*rv00 + (rtilde11)*rv01 + (b1)*rv02;
+    R1.at<double>(0, 2) = (b1 * rtilde10 - b0 * rtilde11) * rv00 + (b0 * rtilde01 - b1 * rtilde00) * rv01 + (rtilde00 * rtilde11 - rtilde01 * rtilde10) * rv02;
+    R1.at<double>(1, 0) = (rtilde00)*rv10 + (rtilde10)*rv11 + (b0)*rv12;
+    R1.at<double>(1, 1) = (rtilde01)*rv10 + (rtilde11)*rv11 + (b1)*rv12;
+    R1.at<double>(1, 2) = (b1 * rtilde10 - b0 * rtilde11) * rv10 + (b0 * rtilde01 - b1 * rtilde00) * rv11 + (rtilde00 * rtilde11 - rtilde01 * rtilde10) * rv12;
+    R1.at<double>(2, 0) = (rtilde00)*rv20 + (rtilde10)*rv21 + (b0)*rv22;
+    R1.at<double>(2, 1) = (rtilde01)*rv20 + (rtilde11)*rv21 + (b1)*rv22;
+    R1.at<double>(2, 2) = (b1 * rtilde10 - b0 * rtilde11) * rv20 + (b0 * rtilde01 - b1 * rtilde00) * rv21 + (rtilde00 * rtilde11 - rtilde01 * rtilde10) * rv22;
+
+    R2.at<double>(0, 0) = (rtilde00)*rv00 + (rtilde10)*rv01 + (-b0) * rv02; //second solution: same formulas with (-b0, -b1)
+    R2.at<double>(0, 1) = (rtilde01)*rv00 + (rtilde11)*rv01 + (-b1) * rv02;
+    R2.at<double>(0, 2) = (b0 * rtilde11 - b1 * rtilde10) * rv00 + (b1 * rtilde00 - b0 * rtilde01) * rv01 + (rtilde00 * rtilde11 - rtilde01 * rtilde10) * rv02;
+    R2.at<double>(1, 0) = (rtilde00)*rv10 + (rtilde10)*rv11 + (-b0) * rv12;
+    R2.at<double>(1, 1) = (rtilde01)*rv10 + (rtilde11)*rv11 + (-b1) * rv12;
+    R2.at<double>(1, 2) = (b0 * rtilde11 - b1 * rtilde10) * rv10 + (b1 * rtilde00 - b0 * rtilde01) * rv11 + (rtilde00 * rtilde11 - rtilde01 * rtilde10) * rv12;
+    R2.at<double>(2, 0) = (rtilde00)*rv20 + (rtilde10)*rv21 + (-b0) * rv22;
+    R2.at<double>(2, 1) = (rtilde01)*rv20 + (rtilde11)*rv21 + (-b1) * rv22;
+    R2.at<double>(2, 2) = (b0 * rtilde11 - b1 * rtilde10) * rv20 + (b1 * rtilde00 - b0 * rtilde01) * rv21 + (rtilde00 * rtilde11 - rtilde01 * rtilde10) * rv22;
+}
+
+void PoseSolver::homographyFromSquarePoints(InputArray _targetPoints, double halfLength, OutputArray H_)
+{
+    CV_CheckType(_targetPoints.type(), _targetPoints.type() == CV_32FC2 || _targetPoints.type() == CV_64FC2,
+                 "Type of _targetPoints must be CV_32FC2 or CV_64FC2" );
+    // NOTE(review): only the element type is validated; elements 0..3 are read below without a count check.
+    Mat pts = _targetPoints.getMat();
+    // Each coordinate is stored negated (p = -target) and, for float input, widened to double.
+    double p1x, p1y;
+    double p2x, p2y;
+    double p3x, p3y;
+    double p4x, p4y;
+
+    if (_targetPoints.type() == CV_32FC2)
+    {
+        p1x = -pts.at<Vec2f>(0)(0);
+        p1y = -pts.at<Vec2f>(0)(1);
+
+        p2x = -pts.at<Vec2f>(1)(0);
+        p2y = -pts.at<Vec2f>(1)(1);
+
+        p3x = -pts.at<Vec2f>(2)(0);
+        p3y = -pts.at<Vec2f>(2)(1);
+
+        p4x = -pts.at<Vec2f>(3)(0);
+        p4y = -pts.at<Vec2f>(3)(1);
+    }
+    else
+    {
+        p1x = -pts.at<Vec2d>(0)(0);
+        p1y = -pts.at<Vec2d>(0)(1);
+
+        p2x = -pts.at<Vec2d>(1)(0);
+        p2y = -pts.at<Vec2d>(1)(1);
+
+        p3x = -pts.at<Vec2d>(2)(0);
+        p3y = -pts.at<Vec2d>(2)(1);
+
+        p4x = -pts.at<Vec2d>(3)(0);
+        p4y = -pts.at<Vec2d>(3)(1);
+    }
+
+    // Analytic solution: det is the common denominator shared by the entries H(0,0)..H(2,1).
+    double det = (halfLength * (p1x * p2y - p2x * p1y - p1x * p4y + p2x * p3y - p3x * p2y + p4x * p1y + p3x * p4y - p4x * p3y));
+    if (abs(det) < 1e-9)
+        CV_Error(Error::StsNoConv, "Determinant is zero!");
+    double detsInv = -1 / det; // note the sign: the entries below are scaled by -1/det
+
+    Matx33d H;
+    H(0, 0) = detsInv * (p1x * p3x * p2y - p2x * p3x * p1y - p1x * p4x * p2y + p2x * p4x * p1y - p1x * p3x * p4y + p1x * p4x * p3y + p2x * p3x * p4y - p2x * p4x * p3y);
+    H(0, 1) = detsInv * (p1x * p2x * p3y - p1x * p3x * p2y - p1x * p2x * p4y + p2x * p4x * p1y + p1x * p3x * p4y - p3x * p4x * p1y - p2x * p4x * p3y + p3x * p4x * p2y);
+    H(0, 2) = detsInv * halfLength * (p1x * p2x * p3y - p2x * p3x * p1y - p1x * p2x * p4y + p1x * p4x * p2y - p1x * p4x * p3y + p3x * p4x * p1y + p2x * p3x * p4y - p3x * p4x * p2y);
+    H(1, 0) = detsInv * (p1x * p2y * p3y - p2x * p1y * p3y - p1x * p2y * p4y + p2x * p1y * p4y - p3x * p1y * p4y + p4x * p1y * p3y + p3x * p2y * p4y - p4x * p2y * p3y);
+    H(1, 1) = detsInv * (p2x * p1y * p3y - p3x * p1y * p2y - p1x * p2y * p4y + p4x * p1y * p2y + p1x * p3y * p4y - p4x * p1y * p3y - p2x * p3y * p4y + p3x * p2y * p4y);
+    H(1, 2) = detsInv * halfLength * (p1x * p2y * p3y - p3x * p1y * p2y - p2x * p1y * p4y + p4x * p1y * p2y - p1x * p3y * p4y + p3x * p1y * p4y + p2x * p3y * p4y - p4x * p2y * p3y);
+    H(2, 0) = -detsInv * (p1x * p3y - p3x * p1y - p1x * p4y - p2x * p3y + p3x * p2y + p4x * p1y + p2x * p4y - p4x * p2y);
+    H(2, 1) = detsInv * (p1x * p2y - p2x * p1y - p1x * p3y + p3x * p1y + p2x * p4y - p4x * p2y - p3x * p4y + p4x * p3y);
+    H(2, 2) = 1.0; // the bottom-right entry is fixed to 1
+    // Copy the fixed-size result into the caller's output array.
+    Mat(H, false).copyTo(H_);
+}
+
+void PoseSolver::makeCanonicalObjectPoints(InputArray _objectPoints, OutputArray _canonicalObjPoints, OutputArray _MmodelPoints2Canonical)
+{
+    int objType = _objectPoints.type();
+    CV_CheckType(objType, objType == CV_32FC3 || objType == CV_64FC3,
+                 "Type of _objectPoints must be CV_32FC3 or CV_64FC3" );
+    // NOTE(review): n == 0 is not rejected here; the centroid computation below would then divide by zero.
+    int n = _objectPoints.rows() * _objectPoints.cols();
+    // The canonical points are written as a 1 x n array of 2-channel doubles.
+    _canonicalObjPoints.create(1, n, CV_64FC2);
+
+    Mat objectPoints = _objectPoints.getMat();
+    Mat canonicalObjPoints = _canonicalObjPoints.getMat();
+    // UZero: the points as columns of a 3 x n matrix (mean-centred further down).
+    Mat UZero(3, n, CV_64FC1);
+
+    double xBar = 0;
+    double yBar = 0;
+    double zBar = 0;
+    bool isOnZPlane = true; // cleared as soon as one raw (un-centred) |z| exceeds IPPE_SMALL
+    for (int i = 0; i < n; i++)
+    {
+        double x, y, z;
+        if (objType == CV_32FC3)
+        {
+            x = static_cast<double>(objectPoints.at<Vec3f>(i)[0]);
+            y = static_cast<double>(objectPoints.at<Vec3f>(i)[1]);
+            z = static_cast<double>(objectPoints.at<Vec3f>(i)[2]);
+        }
+        else
+        {
+            x = objectPoints.at<Vec3d>(i)[0];
+            y = objectPoints.at<Vec3d>(i)[1];
+            z = objectPoints.at<Vec3d>(i)[2];
+        }
+
+        if (abs(z) > IPPE_SMALL)
+        {
+            isOnZPlane = false;
+        }
+        // Accumulate the coordinate sums for the centroid and store the raw point.
+        xBar += x;
+        yBar += y;
+        zBar += z;
+
+        UZero.at<double>(0, i) = x;
+        UZero.at<double>(1, i) = y;
+        UZero.at<double>(2, i) = z;
+    }
+    xBar = xBar / static_cast<double>(n);
+    yBar = yBar / static_cast<double>(n);
+    zBar = zBar / static_cast<double>(n);
+    // Shift every point by the centroid so that the set has zero mean.
+    for (int i = 0; i < n; i++)
+    {
+        UZero.at<double>(0, i) -= xBar;
+        UZero.at<double>(1, i) -= yBar;
+        UZero.at<double>(2, i) -= zBar;
+    }
+    // MCenter is the homogeneous 4x4 translation by minus the centroid.
+    Matx44d MCenter = Matx44d::eye();
+    MCenter(0, 3) = -xBar;
+    MCenter(1, 3) = -yBar;
+    MCenter(2, 3) = -zBar;
+
+    if (isOnZPlane)
+    {
+        // No rotation needed: MmodelPoints2Canonical is MCenter alone, and the canonical points are the centred x,y.
+        Mat(MCenter, false).copyTo(_MmodelPoints2Canonical);
+        for (int i = 0; i < n; i++)
+        {
+            canonicalObjPoints.at<Vec2d>(i)[0] = UZero.at<double>(0, i);
+            canonicalObjPoints.at<Vec2d>(i)[1] = UZero.at<double>(1, i);
+        }
+    }
+    else
+    {
+        Mat UZeroAligned(3, n, CV_64FC1);
+        Matx33d R; //rotation that rotates objectPoints to the plane z=0
+
+        if (!computeObjextSpaceR3Pts(objectPoints,R))
+        {
+            //we could not compute R, probably because there is a duplicate point in {objectPoints(0),objectPoints(1),objectPoints(2)}.
+            //So we compute it with the SVD (which is slower):
+            computeObjextSpaceRSvD(UZero,R);
+        }
+
+        UZeroAligned = R * UZero;
+        // Keep x,y of the rotated points and fail if any rotated z is still above IPPE_SMALL.
+        for (int i = 0; i < n; i++)
+        {
+            canonicalObjPoints.at<Vec2d>(i)[0] = UZeroAligned.at<double>(0, i);
+            canonicalObjPoints.at<Vec2d>(i)[1] = UZeroAligned.at<double>(1, i);
+            if (abs(UZeroAligned.at<double>(2, i)) > IPPE_SMALL)
+                CV_Error(Error::StsNoConv, "Cannot transform object points to the plane z=0!");
+        }
+        // Lift R into a homogeneous 4x4 matrix, then compose it with the centring: Mb = MRot * MCenter.
+        Matx44d MRot = Matx44d::zeros();
+        MRot(3, 3) = 1;
+
+        for (int i = 0; i < 3; i++)
+        {
+            for (int j = 0; j < 3; j++)
+            {
+                MRot(i,j) = R(i,j);
+            }
+        }
+        Matx44d Mb = MRot * MCenter;
+        Mat(Mb, false).copyTo(_MmodelPoints2Canonical);
+    }
+}
+
+void PoseSolver::evalReprojError(InputArray _objectPoints, InputArray _imagePoints, InputArray _M, float& err)
+{
+    // RMS reprojection error of pose _M: the object points are projected with an identity
+    // camera matrix and no distortion, then compared with _imagePoints.
+    const Mat pose = _M.getMat();
+    const Mat observed = _imagePoints.getMat();
+
+    // Rotation block (rows 0-2, cols 0-2) as a rotation vector; the translation is column 3.
+    Mat rotVec;
+    rot2vec(pose(Range(0, 3), Range(0, 3)), rotVec);
+
+    const Mat identityK = Mat::eye(3, 3, CV_64FC1);
+    const Mat noDistortion;
+    Mat predicted;
+    projectPoints(_objectPoints, rotVec, pose(Range(0, 3), Range(3, 4)), identityK, noDistortion, predicted);
+
+    const int count = _objectPoints.rows() * _objectPoints.cols();
+    const bool predictedIsFloat = predicted.depth() == CV_32F;
+
+    // Accumulate squared residuals in single precision. The observed points are always
+    // read as Vec2d; the projected points are read according to their actual depth.
+    err = 0;
+    for (int k = 0; k < count; ++k)
+    {
+        const Vec2d& obs = observed.at<Vec2d>(k);
+        const float dx = predictedIsFloat ? predicted.at<Vec2f>(k)[0] - static_cast<float>(obs[0])
+                                          : static_cast<float>(predicted.at<Vec2d>(k)[0] - obs[0]);
+        const float dy = predictedIsFloat ? predicted.at<Vec2f>(k)[1] - static_cast<float>(obs[1])
+                                          : static_cast<float>(predicted.at<Vec2d>(k)[1] - obs[1]);
+        err += dx * dx + dy * dy;
+    }
+    err = sqrt(err / (2.0f * count));
+}
+
+void PoseSolver::sortPosesByReprojError(InputArray _objectPoints, InputArray _imagePoints, InputArray _Ma, InputArray _Mb,
+                                        OutputArray _M1, OutputArray _M2, float& err1, float& err2)
+{
+    // Score both candidate poses against the same correspondences, then hand them
+    // back ordered by reprojection error (lowest first).
+    float errOfA, errOfB;
+    evalReprojError(_objectPoints, _imagePoints, _Ma, errOfA);
+    evalReprojError(_objectPoints, _imagePoints, _Mb, errOfB);
+
+    // _Ma is ranked first only when it is strictly better; on a tie, or when the
+    // comparison is false because an error is NaN, _Mb is ranked first.
+    const bool aIsBetter = errOfA < errOfB;
+    InputArray lowest = aIsBetter ? _Ma : _Mb;
+    InputArray highest = aIsBetter ? _Mb : _Ma;
+
+    // Lower-error pose (copied into _M1) and its error.
+    err1 = aIsBetter ? errOfA : errOfB;
+    lowest.copyTo(_M1);
+
+    // Higher-error pose (copied into _M2) and its error.
+    err2 = aIsBetter ? errOfB : errOfA;
+    highest.copyTo(_M2);
+}
+
+void PoseSolver::rotateVec2ZAxis(const Matx31d& a, Matx33d& Ra)
+{   // Writes into Ra the proper rotation (det = +1) that takes the direction of a onto the z axis (0,0,1).
+    double ax = a(0);
+    double ay = a(1);
+    double az = a(2);
+    // Normalize a; the caller must pass a non-zero vector (nrm == 0 is not guarded here).
+    double nrm = sqrt(ax*ax + ay*ay + az*az);
+    ax = ax/nrm;
+    ay = ay/nrm;
+    az = az/nrm;
+    // c is the cosine of the angle between a and the z axis.
+    double c = az;
+    // a is (numerically) -z: the closed form below divides by (1+c), so handle this case explicitly.
+    if (abs(1.0+c) < std::numeric_limits<float>::epsilon())
+    {
+        Ra = Matx33d::zeros();
+        Ra(0,0) = 1.0;
+        Ra(1,1) = -1.0; // half-turn about x, diag(1,-1,-1): sends -z to +z with det = +1 (diag(1,1,-1) would be a reflection)
+        Ra(2,2) = -1.0;
+    }
+    else
+    {
+        double d = 1.0/(1.0+c);
+        double ax2 = ax*ax;
+        double ay2 = ay*ay;
+        double axay = ax*ay;
+        // Ra = I + [w]x + [w]x^2 / (1+c), with w = a x z = (ay, -ax, 0).
+        Ra(0,0) = -ax2*d + 1.0;
+        Ra(0,1) = -axay*d;
+        Ra(0,2) = -ax;
+
+        Ra(1,0) = -axay*d;
+        Ra(1,1) = -ay2*d + 1.0;
+        Ra(1,2) = -ay;
+
+        Ra(2,0) = ax;
+        Ra(2,1) = ay;
+        Ra(2,2) = 1.0 - (ax2 + ay2)*d;
+    }
+}
+
+bool PoseSolver::computeObjextSpaceR3Pts(InputArray _objectPoints, Matx33d& R)
+{
+    // Cross-product method: the normal of the triangle spanned by the first three
+    // object points is rotated onto the z axis, which gives R.
+    // Returns true on success. Returns false, leaving R untouched, when the normal's
+    // length is not above IPPE_SMALL.
+    // Only points 0, 1 and 2 are read.
+
+    const Mat objectPoints = _objectPoints.getMat();
+    const bool isFloat = objectPoints.type() == CV_32FC3;
+
+    // pt[k][c] is coordinate c (0 = x, 1 = y, 2 = z) of point k, widened to double.
+    // Any type other than CV_32FC3 is read as Vec3d.
+    double pt[3][3];
+    for (int k = 0; k < 3; ++k)
+    {
+        for (int c = 0; c < 3; ++c)
+        {
+            pt[k][c] = isFloat ? objectPoints.at<Vec3f>(k)[c] : objectPoints.at<Vec3d>(k)[c];
+        }
+    }
+
+    // Two edge vectors sharing point 0.
+    // u = P0 - P1
+    const double ux = pt[0][0] - pt[1][0];
+    const double uy = pt[0][1] - pt[1][1];
+    const double uz = pt[0][2] - pt[1][2];
+
+    // v = P0 - P2
+    const double vx = pt[0][0] - pt[2][0];
+    const double vy = pt[0][1] - pt[2][1];
+    const double vz = pt[0][2] - pt[2][2];
+
+    // n = u x v, the (unnormalized) plane normal
+    double nx = uy*vz - vy*uz;
+    double ny = vx*uz - ux*vz;
+    double nz = ux*vy - vx*uy;
+
+    // A normal that is not longer than IPPE_SMALL means the three points are
+    // (nearly) collinear or coincident; the caller then has to find R another way.
+    const double nrm = sqrt(nx*nx+ ny*ny + nz*nz);
+    if (!(nrm > IPPE_SMALL))
+    {
+        return false;
+    }
+
+    // Normalize the normal to unit length.
+    nx = nx/nrm;
+    ny = ny/nrm;
+    nz = nz/nrm;
+
+    // R is the rotation that takes the unit normal to the z axis, so the rotated
+    // points end up in a plane of constant z.
+    Matx31d normal(nx, ny, nz);
+    rotateVec2ZAxis(normal,R);
+    return true;
+}
+
+void PoseSolver::computeObjextSpaceRSvD(InputArray _objectPointsZeroMean, OutputArray _R)
+{   // SVD-based fallback: derives the rotation to the plane z=0 from the 3x3 matrix X * X^T of the zero-mean points X.
+    _R.create(3, 3, CV_64FC1);
+    Mat R = _R.getMat();
+
+    //we could not compute R with the first three points, so let's use the SVD
+    SVD s;
+    Mat W, U, VT;
+    s.compute(_objectPointsZeroMean.getMat() * _objectPointsZeroMean.getMat().t(), W, U, VT);
+    double s3 = W.at<double>(2); // with cv::SVD's descending ordering, W(2) is the smallest singular value
+    double s2 = W.at<double>(1); // and W(1) the middle one
+
+    //check if points are coplanar (NOTE(review): s2 == 0 makes the ratio NaN, which also fails the assert):
+    CV_Assert(s3 / s2 < IPPE_SMALL);
+    // NOTE(review): relies on this MatExpr assignment writing into R's existing 3x3 CV_64F buffer (shared with _R) rather than rebinding R - confirm.
+    R = U.t();
+    if (determinant(R) < 0)
+    {
+        //this ensures R is a rotation matrix and not a general unitary matrix:
+        R.at<double>(2, 0) = -R.at<double>(2, 0);
+        R.at<double>(2, 1) = -R.at<double>(2, 1);
+        R.at<double>(2, 2) = -R.at<double>(2, 2);
+    }
+}
+} //namespace IPPE
+
+namespace HomographyHO {
+void normalizeDataIsotropic(InputArray _Data, OutputArray _DataN, OutputArray _T, OutputArray _Ti)
+{   // Centres the 2D points on their centroid and scales them isotropically; also returns the 3x3 transforms T and Ti.
+    Mat Data = _Data.getMat();
+    int numPoints = Data.rows * Data.cols;
+    CV_Assert(Data.rows == 1 || Data.cols == 1);
+    CV_Assert(Data.channels() == 2 || Data.channels() == 3);
+    CV_Assert(numPoints >= 4);
+
+    int dataType = _Data.type();
+    CV_CheckType(dataType, dataType == CV_32FC2 || dataType == CV_32FC3 || dataType == CV_64FC2 || dataType == CV_64FC3,
+                 "Type of _Data must be one of CV_32FC2, CV_32FC3, CV_64FC2, CV_64FC3");
+    // DataN holds the normalized points as columns of a 2 x numPoints matrix of doubles.
+    _DataN.create(2, numPoints, CV_64FC1);
+
+    _T.create(3, 3, CV_64FC1);
+    _Ti.create(3, 3, CV_64FC1);
+
+    Mat DataN = _DataN.getMat();
+    Mat T = _T.getMat();
+    Mat Ti = _Ti.getMat();
+    // Start from all-zero 3x3 matrices; only the non-zero entries are filled in at the end.
+    _T.setTo(0);
+    _Ti.setTo(0);
+    // Pass 1: centroid (xm, ym). Only the first two channels are read, so z is ignored for 3-channel input.
+    int numChannels = Data.channels();
+    double xm = 0;
+    double ym = 0;
+    for (int i = 0; i < numPoints; i++)
+    {
+        if (numChannels == 2)
+        {
+            if (dataType == CV_32FC2)
+            {
+                xm = xm + Data.at<Vec2f>(i)[0];
+                ym = ym + Data.at<Vec2f>(i)[1];
+            }
+            else
+            {
+                xm = xm + Data.at<Vec2d>(i)[0];
+                ym = ym + Data.at<Vec2d>(i)[1];
+            }
+        }
+        else
+        {
+            if (dataType == CV_32FC3)
+            {
+                xm = xm + Data.at<Vec3f>(i)[0];
+                ym = ym + Data.at<Vec3f>(i)[1];
+            }
+            else
+            {
+                xm = xm + Data.at<Vec3d>(i)[0];
+                ym = ym + Data.at<Vec3d>(i)[1];
+            }
+        }
+    }
+    xm = xm / static_cast<double>(numPoints);
+    ym = ym / static_cast<double>(numPoints);
+    // Pass 2: centre the points and accumulate kappa, the sum of squared distances to the centroid.
+    double kappa = 0;
+    double xh, yh;
+
+    for (int i = 0; i < numPoints; i++)
+    {
+
+        if (numChannels == 2)
+        {
+            if (dataType == CV_32FC2)
+            {
+                xh = Data.at<Vec2f>(i)[0] - xm;
+                yh = Data.at<Vec2f>(i)[1] - ym;
+            }
+            else
+            {
+                xh = Data.at<Vec2d>(i)[0] - xm;
+                yh = Data.at<Vec2d>(i)[1] - ym;
+            }
+        }
+        else
+        {
+            if (dataType == CV_32FC3)
+            {
+                xh = Data.at<Vec3f>(i)[0] - xm;
+                yh = Data.at<Vec3f>(i)[1] - ym;
+            }
+            else
+            {
+                xh = Data.at<Vec3d>(i)[0] - xm;
+                yh = Data.at<Vec3d>(i)[1] - ym;
+            }
+        }
+
+        DataN.at<double>(0, i) = xh;
+        DataN.at<double>(1, i) = yh;
+        kappa = kappa + xh * xh + yh * yh;
+    }
+    double beta = sqrt(2 * numPoints / kappa); // after scaling, the mean squared distance to the origin is 2; kappa == 0 (all points identical) is not guarded
+    DataN = DataN * beta;
+    // T maps normalized coordinates back to the input frame: x = x_n / beta + xm (same for y).
+    T.at<double>(0, 0) = 1.0 / beta;
+    T.at<double>(1, 1) = 1.0 / beta;
+
+    T.at<double>(0, 2) = xm;
+    T.at<double>(1, 2) = ym;
+
+    T.at<double>(2, 2) = 1;
+    // Ti is the inverse of T: x_n = beta * (x - xm) (same for y).
+    Ti.at<double>(0, 0) = beta;
+    Ti.at<double>(1, 1) = beta;
+
+    Ti.at<double>(0, 2) = -beta * xm;
+    Ti.at<double>(1, 2) = -beta * ym;
+
+    Ti.at<double>(2, 2) = 1;
+}
+
+void homographyHO(InputArray _srcPoints, InputArray _targPoints, Matx33d& H)
+{   // Estimates the homography H mapping _srcPoints to _targPoints, scaled so that H(2,2) == 1.
+    Mat DataA, DataB, TA, TAi, TB, TBi;
+    // Normalize both point sets: DataA/DataB are 2 x n, TA/TB undo the normalization and TAi/TBi apply it.
+    HomographyHO::normalizeDataIsotropic(_srcPoints, DataA, TA, TAi);
+    HomographyHO::normalizeDataIsotropic(_targPoints, DataB, TB, TBi);
+
+    int n = DataA.cols;
+    CV_Assert(n == DataB.cols);
+    // C1..C4: per-point products -b_x*a_x, -b_x*a_y, -b_y*a_x, -b_y*a_y (a from DataA, b from DataB).
+    Mat C1(1, n, CV_64FC1);
+    Mat C2(1, n, CV_64FC1);
+    Mat C3(1, n, CV_64FC1);
+    Mat C4(1, n, CV_64FC1);
+
+    double mC1 = 0, mC2 = 0, mC3 = 0, mC4 = 0;
+
+    for (int i = 0; i < n; i++)
+    {
+        C1.at<double>(0, i) = -DataB.at<double>(0, i) * DataA.at<double>(0, i);
+        C2.at<double>(0, i) = -DataB.at<double>(0, i) * DataA.at<double>(1, i);
+        C3.at<double>(0, i) = -DataB.at<double>(1, i) * DataA.at<double>(0, i);
+        C4.at<double>(0, i) = -DataB.at<double>(1, i) * DataA.at<double>(1, i);
+
+        mC1 += C1.at<double>(0, i);
+        mC2 += C2.at<double>(0, i);
+        mC3 += C3.at<double>(0, i);
+        mC4 += C4.at<double>(0, i);
+    }
+    // Turn the accumulated sums into means.
+    mC1 /= n;
+    mC2 /= n;
+    mC3 /= n;
+    mC4 /= n;
+    // Mx / My: mean-centred product columns plus the negated target x (resp. y) coordinate.
+    Mat Mx(n, 3, CV_64FC1);
+    Mat My(n, 3, CV_64FC1);
+
+    for (int i = 0; i < n; i++)
+    {
+        Mx.at<double>(i, 0) = C1.at<double>(0, i) - mC1;
+        Mx.at<double>(i, 1) = C2.at<double>(0, i) - mC2;
+        Mx.at<double>(i, 2) = -DataB.at<double>(0, i);
+
+        My.at<double>(i, 0) = C3.at<double>(0, i) - mC3;
+        My.at<double>(i, 1) = C4.at<double>(0, i) - mC4;
+        My.at<double>(i, 2) = -DataB.at<double>(1, i);
+    }
+    // Closed-form inverse of the 2x2 matrix DataA * DataA^T; dt is its determinant (not checked for zero).
+    Mat DataAT, DataADataAT;
+
+    transpose(DataA, DataAT);
+    DataADataAT = DataA * DataAT;
+    double dt = DataADataAT.at<double>(0, 0) * DataADataAT.at<double>(1, 1) - DataADataAT.at<double>(0, 1) * DataADataAT.at<double>(1, 0);
+
+    Mat DataADataATi(2, 2, CV_64FC1);
+    DataADataATi.at<double>(0, 0) = DataADataAT.at<double>(1, 1) / dt;
+    DataADataATi.at<double>(0, 1) = -DataADataAT.at<double>(0, 1) / dt;
+    DataADataATi.at<double>(1, 0) = -DataADataAT.at<double>(1, 0) / dt;
+    DataADataATi.at<double>(1, 1) = DataADataAT.at<double>(0, 0) / dt;
+    // Pp = (DataA * DataA^T)^-1 * DataA
+    Mat Pp = DataADataATi * DataA;
+
+    Mat Bx = Pp * Mx;
+    Mat By = Pp * My;
+
+    Mat Ex = DataAT * Bx;
+    Mat Ey = DataAT * By;
+    // D stacks the residuals Mx - DataA^T * Bx (rows 0..n-1) and My - DataA^T * By (rows n..2n-1).
+    Mat D(2 * n, 3, CV_64FC1);
+
+    for (int i = 0; i < n; i++)
+    {
+        D.at<double>(i, 0) = Mx.at<double>(i, 0) - Ex.at<double>(i, 0);
+        D.at<double>(i, 1) = Mx.at<double>(i, 1) - Ex.at<double>(i, 1);
+        D.at<double>(i, 2) = Mx.at<double>(i, 2) - Ex.at<double>(i, 2);
+
+        D.at<double>(i + n, 0) = My.at<double>(i, 0) - Ey.at<double>(i, 0);
+        D.at<double>(i + n, 1) = My.at<double>(i, 1) - Ey.at<double>(i, 1);
+        D.at<double>(i + n, 2) = My.at<double>(i, 2) - Ey.at<double>(i, 2);
+    }
+    // Eigen-decompose the 3x3 matrix D^T * D.
+    Mat DT, DDT;
+    transpose(D, DT);
+    DDT = DT * D;
+
+    Mat S, U;
+    eigen(DDT, S, U);
+    // Row 2 of U gives (h7, h8, h9); with cv::eigen's descending eigenvalue order this is the smallest-eigenvalue eigenvector.
+    Mat h789(3, 1, CV_64FC1);
+    h789.at<double>(0, 0) = U.at<double>(2, 0);
+    h789.at<double>(1, 0) = U.at<double>(2, 1);
+    h789.at<double>(2, 0) = U.at<double>(2, 2);
+    // Back-substitute for the remaining homography entries.
+    Mat h12 = -Bx * h789;
+    Mat h45 = -By * h789;
+
+    double h3 = -(mC1 * h789.at<double>(0, 0) + mC2 * h789.at<double>(1, 0));
+    double h6 = -(mC3 * h789.at<double>(0, 0) + mC4 * h789.at<double>(1, 0));
+
+    H(0, 0) = h12.at<double>(0, 0);
+    H(0, 1) = h12.at<double>(1, 0);
+    H(0, 2) = h3;
+
+    H(1, 0) = h45.at<double>(0, 0);
+    H(1, 1) = h45.at<double>(1, 0);
+    H(1, 2) = h6;
+
+    H(2, 0) = h789.at<double>(0, 0);
+    H(2, 1) = h789.at<double>(1, 0);
+    H(2, 2) = h789.at<double>(2, 0);
+    // Undo the normalization (TB * H * TAi), then scale so that H(2,2) == 1; H(2,2) == 0 is not guarded.
+    H = Mat(TB * H * TAi);
+    double h22_inv = 1 / H(2, 2);
+    H = H * h22_inv;
+}
+}
+} //namespace cv
diff --git a/modules/calib3d/src/ippe.hpp b/modules/calib3d/src/ippe.hpp
new file mode 100644
index 0000000000..6dc76f59a6
--- /dev/null
+++ b/modules/calib3d/src/ippe.hpp
@@ -0,0 +1,259 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This file is based on file issued with the following license:
+
+/*============================================================================
+
+Copyright 2017 Toby Collins
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this
+   list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef OPENCV_CALIB3D_IPPE_HPP
+#define OPENCV_CALIB3D_IPPE_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace IPPE {
+
+class PoseSolver {
+public:
+    /**
+     * @brief PoseSolver constructor
+     */
+    PoseSolver();
+
+    /**
+     * @brief                Finds the two possible poses of a planar object given a set of correspondences and their respective reprojection errors.
+     *                       The poses are sorted with the first having the lowest reprojection error.
+     * @param objectPoints   Array of 4 or more coplanar object points defined in object coordinates.
+     *                       1xN/Nx1 3-channel (float or double) where N is the number of points
+     * @param imagePoints    Array of corresponding image points, 1xN/Nx1 2-channel. Points are in normalized pixel coordinates.
+     * @param rvec1          First rotation solution (3x1 rotation vector)
+     * @param tvec1          First translation solution (3x1 vector)
+     * @param reprojErr1     Reprojection error of first solution
+     * @param rvec2          Second rotation solution (3x1 rotation vector)
+     * @param tvec2          Second translation solution (3x1 vector)
+     * @param reprojErr2     Reprojection error of second solution
+     */
+    void solveGeneric(InputArray objectPoints, InputArray imagePoints, OutputArray rvec1, OutputArray tvec1,
+                      float& reprojErr1, OutputArray rvec2, OutputArray tvec2, float& reprojErr2);
+
+    /**
+     * @brief                   Finds the two possible poses of a square planar object and their respective reprojection errors using IPPE.
+     *                          The poses are sorted so that the first one is the one with the lowest reprojection error.
+     *
+     * @param objectPoints      Array of 4 coplanar object points defined in the following object coordinates:
+     *                            - point 0: [-squareLength / 2.0, squareLength / 2.0, 0]
+     *                            - point 1: [squareLength / 2.0, squareLength / 2.0, 0]
+     *                            - point 2: [squareLength / 2.0, -squareLength / 2.0, 0]
+     *                            - point 3: [-squareLength / 2.0, -squareLength / 2.0, 0]
+     *                          1xN/Nx1 3-channel (float or double) where N is the number of points
+     * @param imagePoints       Array of corresponding image points, 1xN/Nx1 2-channel. Points are in normalized pixel coordinates.
+     * @param rvec1             First rotation solution (3x1 rotation vector)
+     * @param tvec1             First translation solution (3x1 vector)
+     * @param reprojErr1        Reprojection error of first solution
+     * @param rvec2             Second rotation solution (3x1 rotation vector)
+     * @param tvec2             Second translation solution (3x1 vector)
+     * @param reprojErr2        Reprojection error of second solution
+     */
+    void solveSquare(InputArray objectPoints, InputArray imagePoints, OutputArray rvec1, OutputArray tvec1,
+                     float& reprojErr1, OutputArray rvec2, OutputArray tvec2, float& reprojErr2);
+
+private:
+    /**
+     * @brief                         Finds the two possible poses of a planar object given a set of correspondences in normalized pixel coordinates.
+     *                                These poses are **NOT** sorted on reprojection error. Note that the returned poses are object-to-camera transforms, and not camera-to-object transforms.
+     * @param objectPoints            Array of 4 or more coplanar object points defined in object coordinates. 1xN/Nx1 3-channel (float or double).
+     * @param normalizedImagePoints   Array of corresponding image points in normalized pixel coordinates, 1xN/Nx1 2-channel (float or double).
+     * @param Ma                      First pose solution (unsorted)
+     * @param Mb                      Second pose solution (unsorted)
+     */
+    void solveGeneric(InputArray objectPoints, InputArray normalizedImagePoints, OutputArray Ma, OutputArray Mb);
+
+    /**
+     * @brief                         Finds the two possible poses of a planar object in its canonical position, given a set of correspondences in normalized pixel coordinates.
+     *                                These poses are **NOT** sorted on reprojection error. Note that the returned poses are object-to-camera transforms, and not camera-to-object transforms.
+     * @param canonicalObjPoints      Array of 4 or more coplanar object points defined in object coordinates. 1xN/Nx1 3-channel (double) where N is the number of points
+     * @param normalizedInputPoints   Array of corresponding image points in normalized pixel coordinates, 1xN/Nx1 2-channel (double) where N is the number of points
+     * @param H                       Homography mapping canonicalObjPoints to normalizedInputPoints.
+     * @param Ma                      First pose solution (unsorted)
+     * @param Mb                      Second pose solution (unsorted)
+     */
+    void solveCanonicalForm(InputArray canonicalObjPoints, InputArray normalizedInputPoints, const Matx33d& H,
+                            OutputArray Ma, OutputArray Mb);
+
+    /**
+     * @brief                           Computes the translation solution for a given rotation solution
+     * @param objectPoints              Array of corresponding object points, 1xN/Nx1 3-channel where N is the number of points
+     * @param normalizedImgPoints       Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
+     * @param R                         Rotation solution (3x1 rotation vector)
+     * @param t                         Translation solution (3x1 vector)
+     */
+    void computeTranslation(InputArray objectPoints, InputArray normalizedImgPoints, InputArray R, OutputArray t);
+
+    /**
+     * @brief                           Computes the two rotation solutions from the Jacobian of a homography matrix H at a point (ux,uy) on the object plane.
+     *                                  For highest accuracy the Jacobian should be computed at the centroid of the point correspondences (see the IPPE paper for the explanation of this).
+     *                                  For a point (ux,uy) on the object plane, suppose the homography H maps (ux,uy) to a point (p,q) in the image (in normalized pixel coordinates).
+     *                                  The Jacobian matrix [J00, J01; J10,J11] is the Jacobian of the mapping evaluated at (ux,uy).
+     * @param j00                       Homography jacobian coefficient at (ux,uy)
+     * @param j01                       Homography jacobian coefficient at (ux,uy)
+     * @param j10                       Homography jacobian coefficient at (ux,uy)
+     * @param j11                       Homography jacobian coefficient at (ux,uy)
+     * @param p                         The x coordinate of point (ux,uy) mapped into the image (undistorted and normalized position)
+     * @param q                         The y coordinate of point (ux,uy) mapped into the image (undistorted and normalized position)
+    */
+    void computeRotations(double j00, double j01, double j10, double j11, double p, double q, OutputArray _R1, OutputArray _R2);
+
+    /**
+     * @brief                         Closed-form solution for the homography mapping with four corner correspondences of a square (it maps source points to target points).
+     *                                The source points are the four corners of a zero-centred square defined by:
+     *                                  - point 0: [-squareLength / 2.0, squareLength / 2.0]
+     *                                  - point 1: [squareLength / 2.0, squareLength / 2.0]
+     *                                  - point 2: [squareLength / 2.0, -squareLength / 2.0]
+     *                                  - point 3: [-squareLength / 2.0, -squareLength / 2.0]
+     *
+     * @param targetPoints            Array of four corresponding target points, 1x4/4x1 2-channel. Note that the points should be ordered to correspond with points 0, 1, 2 and 3.
+     * @param halfLength              The square's half length (i.e. squareLength/2.0)
+     * @param H                       Homography mapping the source points to the target points, 3x3 single channel
+    */
+    void homographyFromSquarePoints(InputArray targetPoints, double halfLength, OutputArray H);
+
+    /**
+     * @brief                  Fast conversion from a rotation matrix to a rotation vector using Rodrigues' formula
+     * @param R                Input rotation matrix, 3x3 1-channel (double)
+     * @param r                Output rotation vector, 3x1/1x3 1-channel (double)
+     */
+    void rot2vec(InputArray R, OutputArray r);
+
+    /**
+     * @brief                         Takes a set of planar object points and transforms them to 'canonical' object coordinates. This is when they have zero mean and are on the plane z=0
+     * @param objectPoints            Array of 4 or more coplanar object points defined in object coordinates. 1xN/Nx1 3-channel (float or double) where N is the number of points
+     * @param canonicalObjectPoints   Object points in canonical coordinates 1xN/Nx1 2-channel (double)
+     * @param MobjectPoints2Canonical Transform matrix mapping _objectPoints to _canonicalObjectPoints: 4x4 1-channel (double)
+     */
+    void makeCanonicalObjectPoints(InputArray objectPoints, OutputArray canonicalObjectPoints, OutputArray MobjectPoints2Canonical);
+
+    /**
+     * @brief                         Evaluates the Root Mean Squared (RMS) reprojection error of a pose solution.
+     * @param objectPoints            Array of 4 or more coplanar object points defined in object coordinates. 1xN/Nx1 3-channel (float or double) where N is the number of points
+     * @param imagePoints             Array of corresponding image points, 1xN/Nx1 2-channel. This can either be in pixel coordinates or normalized pixel coordinates.
+     * @param M                       Pose matrix from 3D object to camera coordinates: 4x4 1-channel (double)
+     * @param err                     RMS reprojection error
+     */
+    void evalReprojError(InputArray objectPoints, InputArray imagePoints, InputArray M, float& err);
+
+    /**
+     * @brief                         Sorts two pose solutions according to their RMS reprojection error (lowest first).
+     * @param objectPoints            Array of 4 or more coplanar object points defined in object coordinates. 1xN/Nx1 3-channel (float or double) where N is the number of points
+     * @param imagePoints             Array of corresponding image points, 1xN/Nx1 2-channel.  This can either be in pixel coordinates or normalized pixel coordinates.
+     * @param Ma                      Pose matrix 1: 4x4 1-channel
+     * @param Mb                      Pose matrix 2: 4x4 1-channel
+     * @param M1                      Member of {Ma,Mb} with lowest RMS reprojection error. Performs deep copy.
+     * @param M2                      Member of {Ma,Mb} with highest RMS reprojection error. Performs deep copy.
+     * @param err1                    RMS reprojection error of _M1
+     * @param err2                    RMS reprojection error of _M2
+     */
+    void sortPosesByReprojError(InputArray objectPoints, InputArray imagePoints, InputArray Ma, InputArray Mb, OutputArray M1, OutputArray M2, float& err1, float& err2);
+
+    /**
+     * @brief                         Finds the rotation _Ra that rotates a vector _a to the z axis (0,0,1)
+     * @param a                       vector: 3x1 mat (double)
+     * @param Ra                      Rotation: 3x3 mat (double)
+     */
+    void rotateVec2ZAxis(const Matx31d& a, Matx33d& Ra);
+
+    /**
+     * @brief                         Computes the rotation _R that rotates the object points to the plane z=0. This uses the cross-product method with the first three object points.
+     * @param objectPoints            Array of N>=3 coplanar object points defined in object coordinates. 1xN/Nx1 3-channel (float or double) where N is the number of points
+     * @param R                       Rotation Mat: 3x3 (double)
+     * @return                        Success (true) or failure (false)
+     */
+    bool computeObjextSpaceR3Pts(InputArray objectPoints, Matx33d& R);
+
+    /**
+     * @brief                          Computes the rotation _R that rotates the object points to the plane z=0. This uses a singular value decomposition (SVD) of the zero-meaned object points.
+     * @param objectPointsZeroMean     Zero-meaned coplanar object points: 3xN matrix (double) where N>=3
+     * @param R                        Rotation Mat: 3x3 (double)
+     */
+    void computeObjextSpaceRSvD(InputArray objectPointsZeroMean, OutputArray R);
+
+    /**
+     * @brief                   Generates the 4 object points of a square planar object
+     * @param squareLength      The square's length (which is also its width) in object coordinate units (e.g. millimeters, meters, etc.)
+     * @param objectPoints      Set of 4 object points (1x4 3-channel double)
+     */
+    void generateSquareObjectCorners3D(double squareLength, OutputArray objectPoints);
+
+    /**
+     * @brief                   Generates the 4 object points of a square planar object, without including the z-component (which is z=0 for all points).
+     * @param squareLength      The square's length (which is also its width) in object coordinate units (e.g. millimeters, meters, etc.)
+     * @param objectPoints      Set of 4 object points (1x4 2-channel double)
+     */
+    void generateSquareObjectCorners2D(double squareLength, OutputArray objectPoints);
+
+    /**
+     * @brief                   Computes the average depth of an object given its pose in camera coordinates
+     * @param objectPoints      Object points defined in 3D object space
+     * @param rvec              Rotation component of pose
+     * @param tvec              Translation component of pose
+     * @return                  average depth of the object
+     */
+    double meanSceneDepth(InputArray objectPoints, InputArray rvec, InputArray tvec);
+
+    //! a small constant used to test 'small' values close to zero.
+    double IPPE_SMALL;
+};
+} //namespace IPPE
+
+namespace HomographyHO {
+
+/**
+* @brief                   Computes the best-fitting homography matrix from source to target points using Harker and O'Leary's method:
+*                          Harker, M., O'Leary, P., Computation of Homographies, Proceedings of the British Machine Vision Conference 2005, Oxford, England.
+*                          This is not the author's implementation.
+* @param srcPoints         Array of source points: 1xN/Nx1 2-channel (float or double) where N is the number of points
+* @param targPoints        Array of target points: 1xN/Nx1 2-channel (float or double)
+* @param H                 Homography from source to target: 3x3 1-channel (double)
+*/
+void homographyHO(InputArray srcPoints, InputArray targPoints, Matx33d& H);
+
+/**
+* @brief                      Performs data normalization before homography estimation. For details see Hartley, R., Zisserman, A., Multiple View Geometry in Computer Vision,
+*                             Cambridge University Press, Cambridge, 2001
+* @param Data                 Array of source data points: 1xN/Nx1 2-channel (float or double) where N is the number of points
+* @param DataN                Normalized data points: 1xN/Nx1 2-channel (float or double) where N is the number of points
+* @param T                    Homogeneous transform from source to normalized: 3x3 1-channel (double)
+* @param Ti                   Homogeneous transform from normalized to source: 3x3 1-channel (double)
+*/
+void normalizeDataIsotropic(InputArray Data, OutputArray DataN, OutputArray T, OutputArray Ti);
+
+} //namespace HomographyHO
+} //namespace cv
+#endif
diff --git a/modules/calib3d/src/p3p.cpp b/modules/calib3d/src/p3p.cpp
index 7521e6b167..8ee0f490c7 100644
--- a/modules/calib3d/src/p3p.cpp
+++ b/modules/calib3d/src/p3p.cpp
@@ -49,9 +49,11 @@ bool p3p::solve(cv::Mat& R, cv::Mat& tvec, const cv::Mat& opoints, const cv::Mat
     else
         extract_points<cv::Point3d,cv::Point2f>(opoints, ipoints, points);
 
-    bool result = solve(rotation_matrix, translation, points[0], points[1], points[2], points[3], points[4], points[5],
-          points[6], points[7], points[8], points[9], points[10], points[11], points[12], points[13], points[14],
-          points[15], points[16], points[17], points[18], points[19]);
+    bool result = solve(rotation_matrix, translation,
+                        points[0], points[1], points[2], points[3], points[4],
+                        points[5], points[6], points[7], points[8], points[9],
+                        points[10], points[11], points[12], points[13], points[14],
+                        points[15], points[16], points[17], points[18], points[19]);
     cv::Mat(3, 1, CV_64F, translation).copyTo(tvec);
     cv::Mat(3, 3, CV_64F, rotation_matrix).copyTo(R);
     return result;
@@ -75,10 +77,13 @@ int p3p::solve(std::vector<cv::Mat>& Rs, std::vector<cv::Mat>& tvecs, const cv::
     else
         extract_points<cv::Point3d,cv::Point2f>(opoints, ipoints, points);
 
+    const bool p4p = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F)) == 4;
     int solutions = solve(rotation_matrix, translation,
                           points[0], points[1], points[2], points[3], points[4],
                           points[5], points[6], points[7], points[8], points[9],
-                          points[10], points[11], points[12], points[13], points[14]);
+                          points[10], points[11], points[12], points[13], points[14],
+                          points[15], points[16], points[17], points[18], points[19],
+                          p4p);
 
     for (int i = 0; i < solutions; i++) {
         cv::Mat R, tvec;
@@ -100,39 +105,27 @@ bool p3p::solve(double R[3][3], double t[3],
 {
     double Rs[4][3][3], ts[4][3];
 
-    int n = solve(Rs, ts, mu0, mv0, X0, Y0, Z0,  mu1, mv1, X1, Y1, Z1, mu2, mv2, X2, Y2, Z2);
+    const bool p4p = true;
+    int n = solve(Rs, ts, mu0, mv0, X0, Y0, Z0,  mu1, mv1, X1, Y1, Z1, mu2, mv2, X2, Y2, Z2, mu3, mv3, X3, Y3, Z3, p4p);
 
     if (n == 0)
         return false;
 
-    int ns = 0;
-    double min_reproj = 0;
-    for(int i = 0; i < n; i++) {
-        double X3p = Rs[i][0][0] * X3 + Rs[i][0][1] * Y3 + Rs[i][0][2] * Z3 + ts[i][0];
-        double Y3p = Rs[i][1][0] * X3 + Rs[i][1][1] * Y3 + Rs[i][1][2] * Z3 + ts[i][1];
-        double Z3p = Rs[i][2][0] * X3 + Rs[i][2][1] * Y3 + Rs[i][2][2] * Z3 + ts[i][2];
-        double mu3p = cx + fx * X3p / Z3p;
-        double mv3p = cy + fy * Y3p / Z3p;
-        double reproj = (mu3p - mu3) * (mu3p - mu3) + (mv3p - mv3) * (mv3p - mv3);
-        if (i == 0 || min_reproj > reproj) {
-            ns = i;
-            min_reproj = reproj;
-        }
-    }
-
     for(int i = 0; i < 3; i++) {
         for(int j = 0; j < 3; j++)
-            R[i][j] = Rs[ns][i][j];
-        t[i] = ts[ns][i];
+            R[i][j] = Rs[0][i][j];
+        t[i] = ts[0][i];
     }
 
     return true;
 }
 
 int p3p::solve(double R[4][3][3], double t[4][3],
-    double mu0, double mv0,   double X0, double Y0, double Z0,
-    double mu1, double mv1,   double X1, double Y1, double Z1,
-    double mu2, double mv2,   double X2, double Y2, double Z2)
+               double mu0, double mv0,   double X0, double Y0, double Z0,
+               double mu1, double mv1,   double X1, double Y1, double Z1,
+               double mu2, double mv2,   double X2, double Y2, double Z2,
+               double mu3, double mv3,   double X3, double Y3, double Z3,
+               bool p4p)
 {
     double mk0, mk1, mk2;
     double norm;
@@ -152,6 +145,9 @@ int p3p::solve(double R[4][3][3], double t[4][3],
     norm = sqrt(mu2 * mu2 + mv2 * mv2 + 1);
     mk2 = 1. / norm; mu2 *= mk2; mv2 *= mk2;
 
+    mu3 = inv_fx * mu3 - cx_fx;
+    mv3 = inv_fy * mv3 - cy_fy;
+
     double distances[3];
     distances[0] = sqrt( (X1 - X2) * (X1 - X2) + (Y1 - Y2) * (Y1 - Y2) + (Z1 - Z2) * (Z1 - Z2) );
     distances[1] = sqrt( (X0 - X2) * (X0 - X2) + (Y0 - Y2) * (Y0 - Y2) + (Z0 - Z2) * (Z0 - Z2) );
@@ -167,6 +163,7 @@ int p3p::solve(double R[4][3][3], double t[4][3],
     int n = solve_for_lengths(lengths, distances, cosines);
 
     int nb_solutions = 0;
+    double reproj_errors[4];
     for(int i = 0; i < n; i++) {
         double M_orig[3][3];
 
@@ -185,9 +182,29 @@ int p3p::solve(double R[4][3][3], double t[4][3],
         if (!align(M_orig, X0, Y0, Z0, X1, Y1, Z1, X2, Y2, Z2, R[nb_solutions], t[nb_solutions]))
             continue;
 
+        if (p4p) {
+            double X3p = R[nb_solutions][0][0] * X3 + R[nb_solutions][0][1] * Y3 + R[nb_solutions][0][2] * Z3 + t[nb_solutions][0];
+            double Y3p = R[nb_solutions][1][0] * X3 + R[nb_solutions][1][1] * Y3 + R[nb_solutions][1][2] * Z3 + t[nb_solutions][1];
+            double Z3p = R[nb_solutions][2][0] * X3 + R[nb_solutions][2][1] * Y3 + R[nb_solutions][2][2] * Z3 + t[nb_solutions][2];
+            double mu3p = X3p / Z3p;
+            double mv3p = Y3p / Z3p;
+            reproj_errors[nb_solutions] = (mu3p - mu3) * (mu3p - mu3) + (mv3p - mv3) * (mv3p - mv3);
+        }
+
         nb_solutions++;
     }
 
+    if (p4p) {
+        //sort the solutions
+        for (int i = 1; i < nb_solutions; i++) {
+            for (int j = i; j > 0 && reproj_errors[j-1] > reproj_errors[j]; j--) {
+                std::swap(reproj_errors[j], reproj_errors[j-1]);
+                std::swap(R[j], R[j-1]);
+                std::swap(t[j], t[j-1]);
+            }
+        }
+    }
+
     return nb_solutions;
 }
 
diff --git a/modules/calib3d/src/p3p.h b/modules/calib3d/src/p3p.h
index 9c7f7ec987..93e867d479 100644
--- a/modules/calib3d/src/p3p.h
+++ b/modules/calib3d/src/p3p.h
@@ -15,7 +15,9 @@ class p3p
   int solve(double R[4][3][3], double t[4][3],
             double mu0, double mv0,   double X0, double Y0, double Z0,
             double mu1, double mv1,   double X1, double Y1, double Z1,
-            double mu2, double mv2,   double X2, double Y2, double Z2);
+            double mu2, double mv2,   double X2, double Y2, double Z2,
+            double mu3, double mv3,   double X3, double Y3, double Z3,
+            bool p4p);
   bool solve(double R[3][3], double t[3],
              double mu0, double mv0,   double X0, double Y0, double Z0,
              double mu1, double mv1,   double X1, double Y1, double Z1,
@@ -36,7 +38,7 @@ class p3p
   {
       points.clear();
       int npoints = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F));
-      points.resize(5*npoints);
+      points.resize(5*4); //resize vector to fit for p4p case
       for(int i = 0; i < npoints; i++)
       {
           points[i*5] = ipoints.at<IpointType>(i).x*fx + cx;
@@ -45,6 +47,12 @@ class p3p
           points[i*5+3] = opoints.at<OpointType>(i).y;
           points[i*5+4] = opoints.at<OpointType>(i).z;
       }
+      //Fill vectors with unused values for p3p case
+      for (int i = npoints; i < 4; i++) {
+          for (int j = 0; j < 5; j++) {
+              points[i * 5 + j] = 0;
+          }
+      }
   }
   void init_inverse_parameters();
   int solve_for_lengths(double lengths[4][3], double distances[3], double cosines[3]);
diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp
index 58c16f40cc..aa7332d69f 100644
--- a/modules/calib3d/src/solvepnp.cpp
+++ b/modules/calib3d/src/solvepnp.cpp
@@ -46,12 +46,44 @@
 #include "epnp.h"
 #include "p3p.h"
 #include "ap3p.h"
+#include "ippe.hpp"
 #include "calib3d_c_api.h"
 
-#include <iostream>
-
 namespace cv
 {
+#if defined _DEBUG || defined CV_STATIC_ANALYSIS
+static bool isPlanarObjectPoints(InputArray _objectPoints, double threshold)
+{
+    CV_CheckType(_objectPoints.type(), _objectPoints.type() == CV_32FC3 || _objectPoints.type() == CV_64FC3,
+                 "Type of _objectPoints must be CV_32FC3 or CV_64FC3");
+    Mat objectPoints;
+    if (_objectPoints.type() == CV_32FC3)
+    {
+        _objectPoints.getMat().convertTo(objectPoints, CV_64F);
+    }
+    else
+    {
+        objectPoints = _objectPoints.getMat();
+    }
+    // Planarity test on double-precision points: subtract the mean, then view the data as an nbPts-row, 1-channel matrix X.
+    Scalar meanValues = mean(objectPoints);
+    int nbPts = objectPoints.checkVector(3, CV_64F);
+    Mat objectPointsCentred = objectPoints - meanValues;
+    objectPointsCentred = objectPointsCentred.reshape(1, nbPts);
+    // Decompose MM = X^T * X; w receives its singular values.
+    Mat w, u, vt;
+    Mat MM = objectPointsCentred.t() * objectPointsCentred;
+    SVDecomp(MM, w, u, vt);
+    // Reported as planar when the third singular value is below threshold times the second one.
+    return (w.at<double>(2) < w.at<double>(1) * threshold);
+}
+// Returns true when a and b differ by strictly less than eps.
+static bool approxEqual(double a, double b, double eps)
+{
+    return std::fabs(a-b) < eps;
+}
+#endif
+
 void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray distCoeffs,
                    InputArray rvec, InputArray tvec, float length, int thickness)
 {
@@ -80,120 +112,24 @@ void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray d
     line(image, imagePoints[0], imagePoints[3], Scalar(255, 0, 0), thickness);
 }
 
-bool solvePnP( InputArray _opoints, InputArray _ipoints,
-               InputArray _cameraMatrix, InputArray _distCoeffs,
-               OutputArray _rvec, OutputArray _tvec, bool useExtrinsicGuess, int flags )
+bool solvePnP( InputArray opoints, InputArray ipoints,
+               InputArray cameraMatrix, InputArray distCoeffs,
+               OutputArray rvec, OutputArray tvec, bool useExtrinsicGuess, int flags )
 {
     CV_INSTRUMENT_REGION();
 
-    Mat opoints = _opoints.getMat(), ipoints = _ipoints.getMat();
-    int npoints = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F));
-    CV_Assert( ( (npoints >= 4) || (npoints == 3 && flags == SOLVEPNP_ITERATIVE && useExtrinsicGuess) )
-               && npoints == std::max(ipoints.checkVector(2, CV_32F), ipoints.checkVector(2, CV_64F)) );
+    vector<Mat> rvecs, tvecs;
+    int solutions = solvePnPGeneric(opoints, ipoints, cameraMatrix, distCoeffs, rvecs, tvecs, useExtrinsicGuess, (SolvePnPMethod)flags, rvec, tvec);
+    // Copy the first returned solution into rvec/tvec, keeping their depth when they are non-empty (CV_64F otherwise).
-    Mat rvec, tvec;
-    if( flags != SOLVEPNP_ITERATIVE )
-        useExtrinsicGuess = false;
-
-    if( useExtrinsicGuess )
+    if (solutions > 0)
     {
-        int rtype = _rvec.type(), ttype = _tvec.type();
-        Size rsize = _rvec.size(), tsize = _tvec.size();
-        CV_Assert( (rtype == CV_32F || rtype == CV_64F) &&
-                   (ttype == CV_32F || ttype == CV_64F) );
-        CV_Assert( (rsize == Size(1, 3) || rsize == Size(3, 1)) &&
-                   (tsize == Size(1, 3) || tsize == Size(3, 1)) );
+        int rdepth = rvec.empty() ? CV_64F : rvec.depth();
+        int tdepth = tvec.empty() ? CV_64F : tvec.depth();
+        rvecs[0].convertTo(rvec, rdepth);
+        tvecs[0].convertTo(tvec, tdepth);
     }
-    else
-    {
-        int mtype = CV_64F;
-        // use CV_32F if all PnP inputs are CV_32F and outputs are empty
-        if (_ipoints.depth() == _cameraMatrix.depth() && _ipoints.depth() == _opoints.depth() &&
-            _rvec.empty() && _tvec.empty())
-            mtype = _opoints.depth();
 
-        _rvec.create(3, 1, mtype);
-        _tvec.create(3, 1, mtype);
-    }
-    rvec = _rvec.getMat();
-    tvec = _tvec.getMat();
-
-    Mat cameraMatrix0 = _cameraMatrix.getMat();
-    Mat distCoeffs0 = _distCoeffs.getMat();
-    Mat cameraMatrix = Mat_<double>(cameraMatrix0);
-    Mat distCoeffs = Mat_<double>(distCoeffs0);
-    bool result = false;
-
-    if (flags == SOLVEPNP_EPNP || flags == SOLVEPNP_DLS || flags == SOLVEPNP_UPNP)
-    {
-        Mat undistortedPoints;
-        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
-        epnp PnP(cameraMatrix, opoints, undistortedPoints);
-
-        Mat R;
-        PnP.compute_pose(R, tvec);
-        Rodrigues(R, rvec);
-        result = true;
-    }
-    else if (flags == SOLVEPNP_P3P)
-    {
-        CV_Assert( npoints == 4);
-        Mat undistortedPoints;
-        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
-        p3p P3Psolver(cameraMatrix);
-
-        Mat R;
-        result = P3Psolver.solve(R, tvec, opoints, undistortedPoints);
-        if (result)
-            Rodrigues(R, rvec);
-    }
-    else if (flags == SOLVEPNP_AP3P)
-    {
-        CV_Assert( npoints == 4);
-        Mat undistortedPoints;
-        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
-        ap3p P3Psolver(cameraMatrix);
-
-        Mat R;
-        result = P3Psolver.solve(R, tvec, opoints, undistortedPoints);
-        if (result)
-            Rodrigues(R, rvec);
-    }
-    else if (flags == SOLVEPNP_ITERATIVE)
-    {
-        CvMat c_objectPoints = cvMat(opoints), c_imagePoints = cvMat(ipoints);
-        CvMat c_cameraMatrix = cvMat(cameraMatrix), c_distCoeffs = cvMat(distCoeffs);
-        CvMat c_rvec = cvMat(rvec), c_tvec = cvMat(tvec);
-        cvFindExtrinsicCameraParams2(&c_objectPoints, &c_imagePoints, &c_cameraMatrix,
-                                     (c_distCoeffs.rows && c_distCoeffs.cols) ? &c_distCoeffs : 0,
-                                     &c_rvec, &c_tvec, useExtrinsicGuess );
-        result = true;
-    }
-    /*else if (flags == SOLVEPNP_DLS)
-    {
-        Mat undistortedPoints;
-        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
-
-        dls PnP(opoints, undistortedPoints);
-
-        Mat R, rvec = _rvec.getMat(), tvec = _tvec.getMat();
-        bool result = PnP.compute_pose(R, tvec);
-        if (result)
-            Rodrigues(R, rvec);
-        return result;
-    }
-    else if (flags == SOLVEPNP_UPNP)
-    {
-        upnp PnP(cameraMatrix, opoints, ipoints);
-
-        Mat R, rvec = _rvec.getMat(), tvec = _tvec.getMat();
-        PnP.compute_pose(R, tvec);
-        Rodrigues(R, rvec);
-        return true;
-    }*/
-    else
-        CV_Error(CV_StsBadArg, "The flags argument must be one of SOLVEPNP_ITERATIVE, SOLVEPNP_P3P, SOLVEPNP_EPNP or SOLVEPNP_DLS");
-    return result;
+    return solutions > 0;
 }
 
 class PnPRansacCallback CV_FINAL : public PointSetRegistrator::Callback
@@ -258,10 +194,10 @@ public:
 };
 
 bool solvePnPRansac(InputArray _opoints, InputArray _ipoints,
-                        InputArray _cameraMatrix, InputArray _distCoeffs,
-                        OutputArray _rvec, OutputArray _tvec, bool useExtrinsicGuess,
-                        int iterationsCount, float reprojectionError, double confidence,
-                        OutputArray _inliers, int flags)
+                    InputArray _cameraMatrix, InputArray _distCoeffs,
+                    OutputArray _rvec, OutputArray _tvec, bool useExtrinsicGuess,
+                    int iterationsCount, float reprojectionError, double confidence,
+                    OutputArray _inliers, int flags)
 {
     CV_INSTRUMENT_REGION();
 
@@ -410,7 +346,8 @@ int solveP3P( InputArray _opoints, InputArray _ipoints,
 
     Mat opoints = _opoints.getMat(), ipoints = _ipoints.getMat();
     int npoints = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F));
-    CV_Assert( npoints == 3 && npoints == std::max(ipoints.checkVector(2, CV_32F), ipoints.checkVector(2, CV_64F)) );
+    CV_Assert( npoints == std::max(ipoints.checkVector(2, CV_32F), ipoints.checkVector(2, CV_64F)) );
+    CV_Assert( npoints == 3 || npoints == 4 );
     CV_Assert( flags == SOLVEPNP_P3P || flags == SOLVEPNP_AP3P );
 
     Mat cameraMatrix0 = _cameraMatrix.getMat();
@@ -420,7 +357,7 @@ int solveP3P( InputArray _opoints, InputArray _ipoints,
 
     Mat undistortedPoints;
     undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
-    std::vector<Mat> Rs, ts;
+    std::vector<Mat> Rs, ts, rvecs;
 
     int solutions = 0;
     if (flags == SOLVEPNP_P3P)
@@ -438,19 +375,91 @@ int solveP3P( InputArray _opoints, InputArray _ipoints,
         return 0;
     }
 
-    if (_rvecs.needed()) {
-        _rvecs.create(solutions, 1, CV_64F);
+    Mat objPts, imgPts;
+    opoints.convertTo(objPts, CV_64F);
+    ipoints.convertTo(imgPts, CV_64F);
+    if (imgPts.cols > 1)
+    {
+        imgPts = imgPts.reshape(1);
+        imgPts = imgPts.t();
     }
+    else
+        imgPts = imgPts.reshape(1, 2*imgPts.rows);
 
-    if (_tvecs.needed()) {
-        _tvecs.create(solutions, 1, CV_64F);
-    }
-
-    for (int i = 0; i < solutions; i++) {
+    vector<double> reproj_errors(solutions);
+    for (size_t i = 0; i < reproj_errors.size(); i++)
+    {
         Mat rvec;
         Rodrigues(Rs[i], rvec);
-        _tvecs.getMatRef(i) = ts[i];
-        _rvecs.getMatRef(i) = rvec;
+        rvecs.push_back(rvec);
+
+        Mat projPts;
+        projectPoints(objPts, rvec, ts[i], _cameraMatrix, _distCoeffs, projPts);
+
+        projPts = projPts.reshape(1, 2*projPts.rows);
+        Mat err = imgPts - projPts;
+
+        err = err.t() * err;
+        reproj_errors[i] = err.at<double>(0,0);
+    }
+
+    //sort the solutions
+    for (int i = 1; i < solutions; i++)
+    {
+        for (int j = i; j > 0 && reproj_errors[j-1] > reproj_errors[j]; j--)
+        {
+            std::swap(reproj_errors[j], reproj_errors[j-1]);
+            std::swap(rvecs[j], rvecs[j-1]);
+            std::swap(ts[j], ts[j-1]);
+        }
+    }
+
+    int depthRot = _rvecs.fixedType() ? _rvecs.depth() : CV_64F;
+    int depthTrans = _tvecs.fixedType() ? _tvecs.depth() : CV_64F;
+    _rvecs.create(solutions, 1, CV_MAKETYPE(depthRot, _rvecs.fixedType() && _rvecs.kind() == _InputArray::STD_VECTOR ? 3 : 1));
+    _tvecs.create(solutions, 1, CV_MAKETYPE(depthTrans, _tvecs.fixedType() && _tvecs.kind() == _InputArray::STD_VECTOR ? 3 : 1));
+
+    for (int i = 0; i < solutions; i++)
+    {
+        Mat rvec0, tvec0;
+        if (depthRot == CV_64F)
+            rvec0 = rvecs[i];
+        else
+            rvecs[i].convertTo(rvec0, depthRot);
+
+        if (depthTrans == CV_64F)
+            tvec0 = ts[i];
+        else
+            ts[i].convertTo(tvec0, depthTrans);
+
+        if (_rvecs.fixedType() && _rvecs.kind() == _InputArray::STD_VECTOR)
+        {
+            Mat rref = _rvecs.getMat_();
+
+            if (_rvecs.depth() == CV_32F)
+                rref.at<Vec3f>(0,i) = Vec3f(rvec0.at<float>(0,0), rvec0.at<float>(1,0), rvec0.at<float>(2,0));
+            else
+                rref.at<Vec3d>(0,i) = Vec3d(rvec0.at<double>(0,0), rvec0.at<double>(1,0), rvec0.at<double>(2,0));
+        }
+        else
+        {
+            _rvecs.getMatRef(i) = rvec0;
+        }
+
+        if (_tvecs.fixedType() && _tvecs.kind() == _InputArray::STD_VECTOR)
+        {
+
+            Mat tref = _tvecs.getMat_();
+
+            if (_tvecs.depth() == CV_32F)
+                tref.at<Vec3f>(0,i) = Vec3f(tvec0.at<float>(0,0), tvec0.at<float>(1,0), tvec0.at<float>(2,0));
+            else
+                tref.at<Vec3d>(0,i) = Vec3d(tvec0.at<double>(0,0), tvec0.at<double>(1,0), tvec0.at<double>(2,0));
+        }
+        else
+        {
+            _tvecs.getMatRef(i) = tvec0;
+        }
     }
 
     return solutions;
@@ -723,4 +732,314 @@ void solvePnPRefineVVS(InputArray _objectPoints, InputArray _imagePoints,
     solvePnPRefine(_objectPoints, _imagePoints, _cameraMatrix, _distCoeffs, _rvec, _tvec, SOLVEPNP_REFINE_VVS, _criteria, _VVSlambda);
 }
 
+int solvePnPGeneric( InputArray _opoints, InputArray _ipoints,
+                     InputArray _cameraMatrix, InputArray _distCoeffs,
+                     OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs,
+                     bool useExtrinsicGuess, SolvePnPMethod flags,
+                     InputArray _rvec, InputArray _tvec,
+                     OutputArray reprojectionError) {
+    CV_INSTRUMENT_REGION();
+    // Runs the solver selected by flags, stores every pose it yields in _rvecs/_tvecs and returns how many there are.
+    Mat opoints = _opoints.getMat(), ipoints = _ipoints.getMat();
+    int npoints = std::max(opoints.checkVector(3, CV_32F), opoints.checkVector(3, CV_64F));
+    CV_Assert( ( (npoints >= 4) || (npoints == 3 && flags == SOLVEPNP_ITERATIVE && useExtrinsicGuess) )
+               && npoints == std::max(ipoints.checkVector(2, CV_32F), ipoints.checkVector(2, CV_64F)) );
+    // An extrinsic guess (_rvec/_tvec) is only honoured by SOLVEPNP_ITERATIVE.
+    if( flags != SOLVEPNP_ITERATIVE )
+        useExtrinsicGuess = false;
+
+    if (useExtrinsicGuess)
+        CV_Assert( !_rvec.empty() && !_tvec.empty() );
+
+    if( useExtrinsicGuess )
+    {
+        int rtype = _rvec.type(), ttype = _tvec.type();
+        Size rsize = _rvec.size(), tsize = _tvec.size();
+        CV_Assert( (rtype == CV_32FC1 || rtype == CV_64FC1) &&
+                   (ttype == CV_32FC1 || ttype == CV_64FC1) );
+        CV_Assert( (rsize == Size(1, 3) || rsize == Size(3, 1)) &&
+                   (tsize == Size(1, 3) || tsize == Size(3, 1)) );
+    }
+
+    Mat cameraMatrix0 = _cameraMatrix.getMat();
+    Mat distCoeffs0 = _distCoeffs.getMat();
+    Mat cameraMatrix = Mat_<double>(cameraMatrix0);
+    Mat distCoeffs = Mat_<double>(distCoeffs0);
+    // Candidate poses collected from whichever solver branch runs below.
+    vector<Mat> vec_rvecs, vec_tvecs;
+    if (flags == SOLVEPNP_EPNP || flags == SOLVEPNP_DLS || flags == SOLVEPNP_UPNP)
+    {
+        Mat undistortedPoints;
+        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
+        epnp PnP(cameraMatrix, opoints, undistortedPoints);
+
+        Mat rvec, tvec, R;
+        PnP.compute_pose(R, tvec);
+        Rodrigues(R, rvec);
+
+        vec_rvecs.push_back(rvec);
+        vec_tvecs.push_back(tvec);
+    }
+    else if (flags == SOLVEPNP_P3P || flags == SOLVEPNP_AP3P)
+    {
+        vector<Mat> rvecs, tvecs;
+        solveP3P(_opoints, _ipoints, _cameraMatrix, _distCoeffs, rvecs, tvecs, flags);
+        vec_rvecs.insert(vec_rvecs.end(), rvecs.begin(), rvecs.end());
+        vec_tvecs.insert(vec_tvecs.end(), tvecs.begin(), tvecs.end());
+    }
+    else if (flags == SOLVEPNP_ITERATIVE)
+    {
+        Mat rvec, tvec;
+        if (useExtrinsicGuess)
+        {
+            rvec = _rvec.getMat();
+            tvec = _tvec.getMat();
+        }
+        else
+        {
+            rvec.create(3, 1, CV_64FC1);
+            tvec.create(3, 1, CV_64FC1);
+        }
+
+        CvMat c_objectPoints = cvMat(opoints), c_imagePoints = cvMat(ipoints);
+        CvMat c_cameraMatrix = cvMat(cameraMatrix), c_distCoeffs = cvMat(distCoeffs);
+        CvMat c_rvec = cvMat(rvec), c_tvec = cvMat(tvec);
+        cvFindExtrinsicCameraParams2(&c_objectPoints, &c_imagePoints, &c_cameraMatrix,
+                                     (c_distCoeffs.rows && c_distCoeffs.cols) ? &c_distCoeffs : 0,
+                                     &c_rvec, &c_tvec, useExtrinsicGuess );
+
+        vec_rvecs.push_back(rvec);
+        vec_tvecs.push_back(tvec);
+    }
+    else if (flags == SOLVEPNP_IPPE)
+    {
+        CV_DbgAssert(isPlanarObjectPoints(opoints, 1e-3));
+        Mat undistortedPoints;
+        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
+
+        IPPE::PoseSolver poseSolver;
+        Mat rvec1, tvec1, rvec2, tvec2;
+        float reprojErr1, reprojErr2;
+        try
+        {
+            poseSolver.solveGeneric(opoints, undistortedPoints, rvec1, tvec1, reprojErr1, rvec2, tvec2, reprojErr2);
+
+            if (reprojErr1 < reprojErr2)
+            {
+                vec_rvecs.push_back(rvec1);
+                vec_tvecs.push_back(tvec1);
+
+                vec_rvecs.push_back(rvec2);
+                vec_tvecs.push_back(tvec2);
+            }
+            else
+            {
+                vec_rvecs.push_back(rvec2);
+                vec_tvecs.push_back(tvec2);
+
+                vec_rvecs.push_back(rvec1);
+                vec_tvecs.push_back(tvec1);
+            }
+        }
+        catch (...) { } // best effort: a solver exception simply yields no solution
+    }
+    else if (flags == SOLVEPNP_IPPE_SQUARE)
+    {
+        CV_Assert(npoints == 4);
+
+#if defined _DEBUG || defined CV_STATIC_ANALYSIS
+        double Xs[4][3];
+        if (opoints.depth() == CV_32F)
+        {
+            for (int i = 0; i < 4; i++)
+            {
+                for (int j = 0; j < 3; j++)
+                {
+                    Xs[i][j] = opoints.ptr<Vec3f>(0)[i](j);
+                }
+            }
+        }
+        else
+        {
+            for (int i = 0; i < 4; i++)
+            {
+                for (int j = 0; j < 3; j++)
+                {
+                    Xs[i][j] = opoints.ptr<Vec3d>(0)[i](j);
+                }
+            }
+        }
+
+        const double equalThreshold = 1e-9;
+        //Z must be zero
+        for (int i = 0; i < 4; i++)
+        {
+            CV_DbgCheck(Xs[i][2], approxEqual(Xs[i][2], 0, equalThreshold), "Z object point coordinate must be zero!");
+        }
+        //Y0 == Y1 && Y2 == Y3
+        CV_DbgCheck(Xs[0][1], approxEqual(Xs[0][1], Xs[1][1], equalThreshold), "Object points must be: Y0 == Y1!");
+        CV_DbgCheck(Xs[2][1], approxEqual(Xs[2][1], Xs[3][1], equalThreshold), "Object points must be: Y2 == Y3!");
+        //X0 == X3 && X1 == X2
+        CV_DbgCheck(Xs[0][0], approxEqual(Xs[0][0], Xs[3][0], equalThreshold), "Object points must be: X0 == X3!");
+        CV_DbgCheck(Xs[1][0], approxEqual(Xs[1][0], Xs[2][0], equalThreshold), "Object points must be: X1 == X2!");
+        //X1 == Y1 && X3 == Y3
+        CV_DbgCheck(Xs[1][0], approxEqual(Xs[1][0], Xs[1][1], equalThreshold), "Object points must be: X1 == Y1!");
+        CV_DbgCheck(Xs[3][0], approxEqual(Xs[3][0], Xs[3][1], equalThreshold), "Object points must be: X3 == Y3!");
+#endif
+
+        Mat undistortedPoints;
+        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
+
+        IPPE::PoseSolver poseSolver;
+        Mat rvec1, tvec1, rvec2, tvec2;
+        float reprojErr1, reprojErr2;
+        try
+        {
+            poseSolver.solveSquare(opoints, undistortedPoints, rvec1, tvec1, reprojErr1, rvec2, tvec2, reprojErr2);
+
+            if (reprojErr1 < reprojErr2)
+            {
+                vec_rvecs.push_back(rvec1);
+                vec_tvecs.push_back(tvec1);
+
+                vec_rvecs.push_back(rvec2);
+                vec_tvecs.push_back(tvec2);
+            }
+            else
+            {
+                vec_rvecs.push_back(rvec2);
+                vec_tvecs.push_back(tvec2);
+
+                vec_rvecs.push_back(rvec1);
+                vec_tvecs.push_back(tvec1);
+            }
+        } catch (...) { } // best effort: a solver exception simply yields no solution
+    }
+    /*else if (flags == SOLVEPNP_DLS)
+    {
+        Mat undistortedPoints;
+        undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
+
+        dls PnP(opoints, undistortedPoints);
+
+        Mat rvec, tvec, R;
+        bool result = PnP.compute_pose(R, tvec);
+        if (result)
+        {
+            Rodrigues(R, rvec);
+            vec_rvecs.push_back(rvec);
+            vec_tvecs.push_back(tvec);
+        }
+    }
+    else if (flags == SOLVEPNP_UPNP)
+    {
+        upnp PnP(cameraMatrix, opoints, ipoints);
+
+        Mat rvec, tvec, R;
+        PnP.compute_pose(R, tvec);
+        Rodrigues(R, rvec);
+        vec_rvecs.push_back(rvec);
+        vec_tvecs.push_back(tvec);
+    }*/
+    else
+        CV_Error(CV_StsBadArg, "The flags argument must be one of SOLVEPNP_ITERATIVE, SOLVEPNP_P3P, SOLVEPNP_AP3P, SOLVEPNP_EPNP, SOLVEPNP_DLS, SOLVEPNP_UPNP, SOLVEPNP_IPPE or SOLVEPNP_IPPE_SQUARE");
+
+    CV_Assert(vec_rvecs.size() == vec_tvecs.size());
+
+    int solutions = static_cast<int>(vec_rvecs.size());
+    // Output depth follows a fixed-type destination, otherwise CV_64F; fixed-type std::vector outputs get 3-channel elements.
+    int depthRot = _rvecs.fixedType() ? _rvecs.depth() : CV_64F;
+    int depthTrans = _tvecs.fixedType() ? _tvecs.depth() : CV_64F;
+    _rvecs.create(solutions, 1, CV_MAKETYPE(depthRot, _rvecs.fixedType() && _rvecs.kind() == _InputArray::STD_VECTOR ? 3 : 1));
+    _tvecs.create(solutions, 1, CV_MAKETYPE(depthTrans, _tvecs.fixedType() && _tvecs.kind() == _InputArray::STD_VECTOR ? 3 : 1));
+
+    for (int i = 0; i < solutions; i++)
+    {
+        Mat rvec0, tvec0;
+        if (depthRot == CV_64F)
+            rvec0 = vec_rvecs[i];
+        else
+            vec_rvecs[i].convertTo(rvec0, depthRot);
+
+        if (depthTrans == CV_64F)
+            tvec0 = vec_tvecs[i];
+        else
+            vec_tvecs[i].convertTo(tvec0, depthTrans);
+
+        if (_rvecs.fixedType() && _rvecs.kind() == _InputArray::STD_VECTOR)
+        {
+            Mat rref = _rvecs.getMat_();
+
+            if (_rvecs.depth() == CV_32F)
+                rref.at<Vec3f>(0,i) = Vec3f(rvec0.at<float>(0,0), rvec0.at<float>(1,0), rvec0.at<float>(2,0));
+            else
+                rref.at<Vec3d>(0,i) = Vec3d(rvec0.at<double>(0,0), rvec0.at<double>(1,0), rvec0.at<double>(2,0));
+        }
+        else
+        {
+            _rvecs.getMatRef(i) = rvec0;
+        }
+
+        if (_tvecs.fixedType() && _tvecs.kind() == _InputArray::STD_VECTOR)
+        {
+
+            Mat tref = _tvecs.getMat_();
+
+            if (_tvecs.depth() == CV_32F)
+                tref.at<Vec3f>(0,i) = Vec3f(tvec0.at<float>(0,0), tvec0.at<float>(1,0), tvec0.at<float>(2,0));
+            else
+                tref.at<Vec3d>(0,i) = Vec3d(tvec0.at<double>(0,0), tvec0.at<double>(1,0), tvec0.at<double>(2,0));
+        }
+        else
+        {
+            _tvecs.getMatRef(i) = tvec0;
+        }
+    }
+    // Optional output: one RMS reprojection error per solution. An empty destination without a fixed type defaults to CV_64FC1.
+    if (reprojectionError.needed())
+    {
+        int type = (reprojectionError.fixedType() || !reprojectionError.empty()) ? reprojectionError.type() : CV_64FC1;
+        reprojectionError.create(solutions, 1, type);
+        CV_CheckType(reprojectionError.type(), type == CV_32FC1 || type == CV_64FC1,
+                     "Type of reprojectionError must be CV_32FC1 or CV_64FC1!");
+        // CV_32F inputs are promoted to CV_64F before projecting.
+        Mat objectPoints, imagePoints;
+        if (_opoints.depth() == CV_32F)
+        {
+            _opoints.getMat().convertTo(objectPoints, CV_64F);
+        }
+        else
+        {
+            objectPoints = _opoints.getMat();
+        }
+        if (_ipoints.depth() == CV_32F)
+        {
+            _ipoints.getMat().convertTo(imagePoints, CV_64F);
+        }
+        else
+        {
+            imagePoints = _ipoints.getMat();
+        }
+        // Per solution: L2 norm of the residual divided by sqrt(2 * number of points).
+        for (size_t i = 0; i < vec_rvecs.size(); i++)
+        {
+            vector<Point2d> projectedPoints;
+            projectPoints(objectPoints, vec_rvecs[i], vec_tvecs[i], cameraMatrix, distCoeffs, projectedPoints);
+            double rmse = norm(projectedPoints, imagePoints, NORM_L2) / sqrt(2*projectedPoints.size());
+
+            Mat err = reprojectionError.getMat(); // single-index at<>() below is valid for both an Nx1 Mat and a 1xN vector header
+            if (type == CV_32F)
+            {
+                err.at<float>(static_cast<int>(i)) = static_cast<float>(rmse);
+            }
+            else
+            {
+                err.at<double>(static_cast<int>(i)) = rmse;
+            }
+        }
+    }
+
+    return solutions;
+}
+
 }
diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp
index adf7758c92..77a5d5df8d 100644
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@@ -44,10 +44,161 @@
 
 namespace opencv_test { namespace {
 
+//Statistics Helpers
+struct ErrorInfo // translation/rotation error pair for one trial, sortable by combined magnitude
+{
+    ErrorInfo(double errT, double errR) : errorTrans(errT), errorRot(errR) {}
+
+    // Strict ordering on the Euclidean norm of (errorTrans, errorRot).
+    bool operator<(const ErrorInfo& e) const
+    {
+        const double ownNorm = sqrt(errorTrans*errorTrans + errorRot*errorRot);
+        const double otherNorm = sqrt(e.errorTrans*e.errorTrans + e.errorRot*e.errorRot);
+        return ownNorm < otherNorm;
+    }
+
+    double errorTrans; // translation component
+    double errorRot;   // rotation component
+};
+
+//Try to find the translation and rotation thresholds to achieve a predefined percentage (a fraction in [0, 1]) of success.
+//Since a success is defined by error_trans < trans_thresh && error_rot < rot_thresh this just gives an idea of the values
+//to use. Both outputs are set to -1 when the input vectors are empty or have different sizes.
+static void findThreshold(const std::vector<double>& v_trans, const std::vector<double>& v_rot, double percentage,
+                          double& transThresh, double& rotThresh)
+{
+    if (v_trans.empty() || v_rot.empty() || v_trans.size() != v_rot.size())
+    {
+        transThresh = -1;
+        rotThresh = -1;
+        return;
+    }
+
+    std::vector<ErrorInfo> error_info;
+    error_info.reserve(v_trans.size());
+    for (size_t i = 0; i < v_trans.size(); i++)
+        error_info.push_back(ErrorInfo(v_trans[i], v_rot[i]));
+
+    std::sort(error_info.begin(), error_info.end());
+    //Clamp: percentage == 1.0 would index one past the end, and a negative value must not be cast to size_t
+    const double fraction = std::min(std::max(percentage, 0.0), 1.0);
+    const size_t idx = std::min(error_info.size() - 1, static_cast<size_t>(error_info.size() * fraction));
+    transThresh = error_info[idx].errorTrans;
+    rotThresh = error_info[idx].errorRot;
+}
+
+static double getMax(const std::vector<double>& v) // largest element of v, or 0.0 when v is empty
+{
+    return v.empty() ? 0.0 : *std::max_element(v.begin(), v.end()); // max_element returns end() for an empty range; dereferencing it is undefined
+}
+
+static double getMean(const std::vector<double>& v)
+{
+    // Arithmetic mean of v; an empty vector yields 0.0 instead of dividing by zero.
+    if (!v.empty())
+    {
+        const double total = std::accumulate(v.begin(), v.end(), 0.0);
+        return total / v.size();
+    }
+    return 0.0;
+}
+
+static double getMedian(const std::vector<double>& v)
+{
+    // Median of v; for an even count the two middle values are averaged. Empty input yields 0.0.
+    if (v.empty())
+    {
+        return 0.0;
+    }
+
+    // nth_element reorders its range, so operate on a private copy.
+    std::vector<double> scratch(v);
+    const size_t count = scratch.size();
+    const size_t mid = count / 2;
+
+    std::nth_element(scratch.begin(), scratch.begin() + mid, scratch.end());
+    const double upperMid = scratch[mid];
+    if (count % 2 != 0)
+    {
+        return upperMid;
+    }
+
+    std::nth_element(scratch.begin(), scratch.begin() + mid - 1, scratch.end());
+    return 0.5 * (upperMid + scratch[mid - 1]);
+}
+
+static void generatePose(const vector<Point3d>& points, Mat& rvec, Mat& tvec, RNG& rng, int nbTrials=10) // draws a random pose into rvec/tvec (each 3x1 CV_64FC1), retrying up to nbTrials times until every point has positive depth
+{
+    const double minVal = 1.0e-3;
+    const double maxVal = 1.0;
+    rvec.create(3, 1, CV_64FC1);
+    tvec.create(3, 1, CV_64FC1);
+
+    bool validPose = false;
+    for (int trial = 0; trial < nbTrials && !validPose; trial++) // if no trial succeeds, the last drawn pose is left in rvec/tvec without any error signal
+    {
+        for (int i = 0; i < 3; i++)
+        {
+            rvec.at<double>(i,0) = rng.uniform(minVal, maxVal);
+            tvec.at<double>(i,0) = (i == 2) ? rng.uniform(minVal*10, maxVal) : rng.uniform(-maxVal, maxVal); // z is drawn between minVal*10 and maxVal (kept positive); x and y between -maxVal and maxVal
+        }
+
+        Mat R;
+        cv::Rodrigues(rvec, R); // converts rvec into the matrix R applied to the points below
+        bool positiveDepth = true;
+        for (size_t i = 0; i < points.size() && positiveDepth; i++) // stops at the first point that fails the depth check
+        {
+            Matx31d objPts(points[i].x, points[i].y, points[i].z);
+            Mat camPts = R*objPts + tvec; // the point after applying the candidate pose
+            if (camPts.at<double>(2,0) <= 0) // third (z) coordinate must be strictly positive
+            {
+                positiveDepth = false;
+            }
+        }
+        validPose = positiveDepth;
+    }
+}
+
+static void generatePose(const vector<Point3f>& points, Mat& rvec, Mat& tvec, RNG& rng, int nbTrials=10)
+{
+    // Float overload: widen each point to double precision, then defer to the Point3d version.
+    vector<Point3d> widened;
+    widened.reserve(points.size());
+    for (vector<Point3f>::const_iterator it = points.begin(); it != points.end(); ++it)
+    {
+        widened.push_back(Point3d(it->x, it->y, it->z));
+    }
+    generatePose(widened, rvec, tvec, rng, nbTrials);
+}
+
+static std::string printMethod(int method) // human-readable label for a solvePnP method id, used in the test log output
+{
+    switch (method) { // enumerators instead of literal 0..7, so the labels follow the enum if it is ever reordered
+    case SOLVEPNP_ITERATIVE:
+        return "SOLVEPNP_ITERATIVE";
+    case SOLVEPNP_EPNP:
+        return "SOLVEPNP_EPNP";
+    case SOLVEPNP_P3P:
+        return "SOLVEPNP_P3P";
+    case SOLVEPNP_DLS:
+        return "SOLVEPNP_DLS (remapped to SOLVEPNP_EPNP)";
+    case SOLVEPNP_UPNP:
+        return "SOLVEPNP_UPNP (remapped to SOLVEPNP_EPNP)";
+    case SOLVEPNP_AP3P:
+        return "SOLVEPNP_AP3P";
+    case SOLVEPNP_IPPE:
+        return "SOLVEPNP_IPPE";
+    case SOLVEPNP_IPPE_SQUARE:
+        return "SOLVEPNP_IPPE_SQUARE";
+    default:
+        return "Unknown value";
+    }
+}
+
 class CV_solvePnPRansac_Test : public cvtest::BaseTest
 {
 public:
-    CV_solvePnPRansac_Test()
+    CV_solvePnPRansac_Test(bool planar_=false, bool planarTag_=false) : planar(planar_), planarTag(planarTag_)
     {
         eps[SOLVEPNP_ITERATIVE] = 1.0e-2;
         eps[SOLVEPNP_EPNP] = 1.0e-2;
@@ -61,10 +212,10 @@ public:
     ~CV_solvePnPRansac_Test() {}
 protected:
     void generate3DPointCloud(vector<Point3f>& points,
-        Point3f pmin = Point3f(-1, -1, 5),
-        Point3f pmax = Point3f(1, 1, 10))
+                              Point3f pmin = Point3f(-1, -1, 5),
+                              Point3f pmax = Point3f(1, 1, 10))
     {
-        RNG rng = cv::theRNG(); // fix the seed to use "fixed" input 3D points
+        RNG& rng = theRNG(); // fix the seed to use "fixed" input 3D points
 
         for (size_t i = 0; i < points.size(); i++)
         {
@@ -75,6 +226,44 @@ protected:
         }
     }
 
+    void generatePlanarPointCloud(vector<Point3f>& points, // filled in place with coplanar points: 4 square corners when planarTag is set, otherwise points.size() random points
+                                  Point2f pmin = Point2f(-1, -1),
+                                  Point2f pmax = Point2f(1, 1))
+    {
+        RNG& rng = theRNG(); // reference to the shared default generator; nothing here reseeds it
+
+        if (planarTag)
+        {
+            const float squareLength_2 = rng.uniform(0.01f, pmax.x) / 2; // half side length; only pmax.x is used in this branch, pmin is ignored
+            points.clear(); // the incoming size is discarded: exactly four corners are produced
+            points.push_back(Point3f(-squareLength_2, squareLength_2, 0));
+            points.push_back(Point3f(squareLength_2, squareLength_2, 0));
+            points.push_back(Point3f(squareLength_2, -squareLength_2, 0));
+            points.push_back(Point3f(-squareLength_2, -squareLength_2, 0));
+        }
+        else
+        {
+            Mat rvec_double, tvec_double;
+            generatePose(points, rvec_double, tvec_double, rng); // note: called on `points` before the loop below assigns them
+
+            Mat rvec, tvec, R;
+            rvec_double.convertTo(rvec, CV_32F);
+            tvec_double.convertTo(tvec, CV_32F);
+            cv::Rodrigues(rvec, R);
+
+            for (size_t i = 0; i < points.size(); i++)
+            {
+                float x = rng.uniform(pmin.x, pmax.x);
+                float y = rng.uniform(pmin.y, pmax.y);
+                float z = 0; // every point starts in the z = 0 plane, then the same R and tvec are applied to all of them
+
+                Matx31f pt(x, y, z);
+                Mat pt_trans = R * pt + tvec;
+                points[i] = Point3f(pt_trans.at<float>(0,0), pt_trans.at<float>(1,0), pt_trans.at<float>(2,0));
+            }
+        }
+    }
+
     void generateCameraMatrix(Mat& cameraMatrix, RNG& rng)
     {
         const double fcMinVal = 1e-3;
@@ -95,32 +284,34 @@ protected:
             distCoeffs.at<double>(i,0) = rng.uniform(0.0, 1.0e-6);
     }
 
-    void generatePose(Mat& rvec, Mat& tvec, RNG& rng)
+    virtual bool runTest(RNG& rng, int mode, int method, const vector<Point3f>& points, double& errorTrans, double& errorRot)
     {
-        const double minVal = 1.0e-3;
-        const double maxVal = 1.0;
-        rvec.create(3, 1, CV_64FC1);
-        tvec.create(3, 1, CV_64FC1);
-        for (int i = 0; i < 3; i++)
+        if ((!planar && method == SOLVEPNP_IPPE) || method == SOLVEPNP_IPPE_SQUARE)
         {
-            rvec.at<double>(i,0) = rng.uniform(minVal, maxVal);
-            tvec.at<double>(i,0) = rng.uniform(minVal, maxVal/10);
+            return true;
         }
-    }
 
-    virtual bool runTest(RNG& rng, int mode, int method, const vector<Point3f>& points, const double* epsilon, double& maxError)
-    {
         Mat rvec, tvec;
         vector<int> inliers;
         Mat trueRvec, trueTvec;
         Mat intrinsics, distCoeffs;
         generateCameraMatrix(intrinsics, rng);
-        if (method == 4) intrinsics.at<double>(1,1) = intrinsics.at<double>(0,0);
+        //UPnP is mapped to EPnP
+        //Uncomment this when UPnP is fixed
+//        if (method == SOLVEPNP_UPNP)
+//        {
+//            intrinsics.at<double>(1,1) = intrinsics.at<double>(0,0);
+//        }
         if (mode == 0)
+        {
             distCoeffs = Mat::zeros(4, 1, CV_64FC1);
+        }
         else
+        {
             generateDistCoeffs(distCoeffs, rng);
-        generatePose(trueRvec, trueTvec, rng);
+        }
+
+        generatePose(points, trueRvec, trueTvec, rng);
 
         vector<Point2f> projectedPoints;
         projectedPoints.resize(points.size());
@@ -138,11 +329,9 @@ protected:
         bool isTestSuccess = inliers.size() >= points.size()*0.95;
 
         double rvecDiff = cvtest::norm(rvec, trueRvec, NORM_L2), tvecDiff = cvtest::norm(tvec, trueTvec, NORM_L2);
-        isTestSuccess = isTestSuccess && rvecDiff < epsilon[method] && tvecDiff < epsilon[method];
-        double error = rvecDiff > tvecDiff ? rvecDiff : tvecDiff;
-        //cout << error << " " << inliers.size() << " " << eps[method] << endl;
-        if (error > maxError)
-            maxError = error;
+        isTestSuccess = isTestSuccess && rvecDiff < eps[method] && tvecDiff < eps[method];
+        errorTrans = tvecDiff;
+        errorRot = rvecDiff;
 
         return isTestSuccess;
     }
@@ -152,68 +341,184 @@ protected:
         ts->set_failed_test_info(cvtest::TS::OK);
 
         vector<Point3f> points, points_dls;
-        points.resize(pointsCount);
-        generate3DPointCloud(points);
+        points.resize(static_cast<size_t>(pointsCount));
 
-        RNG rng = ts->get_rng();
+        if (planar || planarTag)
+        {
+            generatePlanarPointCloud(points);
+        }
+        else
+        {
+            generate3DPointCloud(points);
+        }
 
+        RNG& rng = ts->get_rng();
 
         for (int mode = 0; mode < 2; mode++)
         {
             for (int method = 0; method < SOLVEPNP_MAX_COUNT; method++)
             {
-                double maxError = 0;
+                //To get the same input for each methods
+                RNG rngCopy = rng;
+                std::vector<double> vec_errorTrans, vec_errorRot;
+                vec_errorTrans.reserve(static_cast<size_t>(totalTestsCount));
+                vec_errorRot.reserve(static_cast<size_t>(totalTestsCount));
+
                 int successfulTestsCount = 0;
                 for (int testIndex = 0; testIndex < totalTestsCount; testIndex++)
                 {
-                    if (runTest(rng, mode, method, points, eps, maxError))
+                    double errorTrans = 0, errorRot = 0; //runTest() can return before writing either output (skipped method, failed estimate)
+                    if (runTest(rngCopy, mode, method, points, errorTrans, errorRot))
                     {
                         successfulTestsCount++;
                     }
+                    vec_errorTrans.push_back(errorTrans);
+                    vec_errorRot.push_back(errorRot);
                 }
+
+                double maxErrorTrans = getMax(vec_errorTrans);
+                double maxErrorRot = getMax(vec_errorRot);
+                double meanErrorTrans = getMean(vec_errorTrans);
+                double meanErrorRot = getMean(vec_errorRot);
+                double medianErrorTrans = getMedian(vec_errorTrans);
+                double medianErrorRot = getMedian(vec_errorRot);
+
                 if (successfulTestsCount < 0.7*totalTestsCount)
                 {
-                    ts->printf( cvtest::TS::LOG, "Invalid accuracy for method %d, failed %d tests from %d, maximum error equals %f, distortion mode equals %d\n",
-                        method, totalTestsCount - successfulTestsCount, totalTestsCount, maxError, mode);
+                    ts->printf(cvtest::TS::LOG, "Invalid accuracy for %s, failed %d tests from %d, %s, "
+                                                "maxErrT: %f, maxErrR: %f, "
+                                                "meanErrT: %f, meanErrR: %f, "
+                                                "medErrT: %f, medErrR: %f\n",
+                               printMethod(method).c_str(), totalTestsCount - successfulTestsCount, totalTestsCount, printMode(mode).c_str(),
+                               maxErrorTrans, maxErrorRot, meanErrorTrans, meanErrorRot, medianErrorTrans, medianErrorRot);
                     ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
                 }
-                cout << "mode: " << mode << ", method: " << method << " -> "
+                cout << "mode: " << printMode(mode) << ", method: " << printMethod(method) << " -> "
                      << ((double)successfulTestsCount / totalTestsCount) * 100 << "%"
-                     << " (err < " << maxError << ")" << endl;
+                     << " (maxErrT: " << maxErrorTrans << ", maxErrR: " << maxErrorRot
+                     << ", meanErrT: " << meanErrorTrans << ", meanErrR: " << meanErrorRot
+                     << ", medErrT: " << medianErrorTrans << ", medErrR: " << medianErrorRot << ")" << endl;
+                double transThres, rotThresh;
+                findThreshold(vec_errorTrans, vec_errorRot, 0.7, transThres, rotThresh);
+                cout << "approximate translation threshold for 0.7: " << transThres
+                     << ", approximate rotation threshold for 0.7: " << rotThresh << endl;
             }
+            cout << endl;
+        }
+    }
+    std::string printMode(int mode) // log label for the distortion mode: 0 means no distortion, any other value means distortion
+    {
+        switch (mode) {
+        case 0:
+            return "no distortion";
+        case 1:
+        default:
+            return "distortion";
+        }
+    }
     double eps[SOLVEPNP_MAX_COUNT];
     int totalTestsCount;
     int pointsCount;
+    bool planar;
+    bool planarTag;
 };
 
 class CV_solvePnP_Test : public CV_solvePnPRansac_Test
 {
 public:
-    CV_solvePnP_Test()
+    CV_solvePnP_Test(bool planar_=false, bool planarTag_=false) : CV_solvePnPRansac_Test(planar_, planarTag_)
     {
         eps[SOLVEPNP_ITERATIVE] = 1.0e-6;
         eps[SOLVEPNP_EPNP] = 1.0e-6;
         eps[SOLVEPNP_P3P] = 2.0e-4;
         eps[SOLVEPNP_AP3P] = 1.0e-4;
-        eps[SOLVEPNP_DLS] = 1.0e-4;
-        eps[SOLVEPNP_UPNP] = 1.0e-4;
+        eps[SOLVEPNP_DLS] = 1.0e-6; //DLS is remapped to EPnP, so we use the same threshold
+        eps[SOLVEPNP_UPNP] = 1.0e-6; //UPnP is remapped to EPnP, so we use the same threshold
+        eps[SOLVEPNP_IPPE] = 1.0e-6;
+        eps[SOLVEPNP_IPPE_SQUARE] = 1.0e-6;
+
         totalTestsCount = 1000;
+
+        //Number of object points for each configuration:
+        //planarTag -> 4 (checked first, it wins over planar),
+        //planar -> 30, neither flag -> 500
+        if (planarTag)
+        {
+            pointsCount = 4;
+        }
+        else if (planar)
+        {
+            pointsCount = 30;
+        }
+        else
+        {
+            pointsCount = 500;
+        }
     }
 
     ~CV_solvePnP_Test() {}
 protected:
-    virtual bool runTest(RNG& rng, int mode, int method, const vector<Point3f>& points, const double* epsilon, double& maxError)
+    virtual bool runTest(RNG& rng, int mode, int method, const vector<Point3f>& points, double& errorTrans, double& errorRot)
     {
-        Mat rvec, tvec;
+        if ((!planar && (method == SOLVEPNP_IPPE || method == SOLVEPNP_IPPE_SQUARE)) ||
+            (!planarTag && method == SOLVEPNP_IPPE_SQUARE))
+        {
+            errorTrans = -1;
+            errorRot = -1;
+            //SOLVEPNP_IPPE and SOLVEPNP_IPPE_SQUARE need planar object
+            return true;
+        }
+
+        //Tune thresholds...
+        double epsilon_trans[SOLVEPNP_MAX_COUNT];
+        memcpy(epsilon_trans, eps, SOLVEPNP_MAX_COUNT * sizeof(*epsilon_trans));
+
+        double epsilon_rot[SOLVEPNP_MAX_COUNT];
+        memcpy(epsilon_rot, eps, SOLVEPNP_MAX_COUNT * sizeof(*epsilon_rot));
+
+        if (planar)
+        {
+            if (mode == 0)
+            {
+                epsilon_trans[SOLVEPNP_EPNP] = 5.0e-3;
+                epsilon_trans[SOLVEPNP_DLS] = 5.0e-3;
+                epsilon_trans[SOLVEPNP_UPNP] = 5.0e-3;
+
+                epsilon_rot[SOLVEPNP_EPNP] = 5.0e-3;
+                epsilon_rot[SOLVEPNP_DLS] = 5.0e-3;
+                epsilon_rot[SOLVEPNP_UPNP] = 5.0e-3;
+            }
+            else
+            {
+                epsilon_trans[SOLVEPNP_ITERATIVE] = 1e-4;
+                epsilon_trans[SOLVEPNP_EPNP] = 5e-3;
+                epsilon_trans[SOLVEPNP_DLS] = 5e-3;
+                epsilon_trans[SOLVEPNP_UPNP] = 5e-3;
+                epsilon_trans[SOLVEPNP_P3P] = 1e-4;
+                epsilon_trans[SOLVEPNP_AP3P] = 1e-4;
+                epsilon_trans[SOLVEPNP_IPPE] = 1e-4;
+                epsilon_trans[SOLVEPNP_IPPE_SQUARE] = 1e-4;
+
+                epsilon_rot[SOLVEPNP_ITERATIVE] = 1e-4;
+                epsilon_rot[SOLVEPNP_EPNP] = 5e-3;
+                epsilon_rot[SOLVEPNP_DLS] = 5e-3;
+                epsilon_rot[SOLVEPNP_UPNP] = 5e-3;
+                epsilon_rot[SOLVEPNP_P3P] = 1e-4;
+                epsilon_rot[SOLVEPNP_AP3P] = 1e-4;
+                epsilon_rot[SOLVEPNP_IPPE] = 1e-4;
+                epsilon_rot[SOLVEPNP_IPPE_SQUARE] = 1e-4;
+            }
+        }
+
         Mat trueRvec, trueTvec;
         Mat intrinsics, distCoeffs;
         generateCameraMatrix(intrinsics, rng);
-        if (method == SOLVEPNP_DLS)
-        {
-            intrinsics.at<double>(1,1) = intrinsics.at<double>(0,0);
-        }
+        //UPnP is mapped to EPnP
+        //Uncomment this when UPnP is fixed
+//        if (method == SOLVEPNP_UPNP)
+//        {
+//            intrinsics.at<double>(1,1) = intrinsics.at<double>(0,0);
+//        }
         if (mode == 0)
         {
             distCoeffs = Mat::zeros(4, 1, CV_64FC1);
@@ -222,7 +527,8 @@ protected:
         {
             generateDistCoeffs(distCoeffs, rng);
         }
-        generatePose(trueRvec, trueTvec, rng);
+
+        generatePose(points, trueRvec, trueTvec, rng);
 
         std::vector<Point3f> opoints;
         switch(method)
@@ -231,9 +537,18 @@ protected:
             case SOLVEPNP_AP3P:
                 opoints = std::vector<Point3f>(points.begin(), points.begin()+4);
                 break;
-            case SOLVEPNP_UPNP:
-                opoints = std::vector<Point3f>(points.begin(), points.begin()+50);
-                break;
+                //UPnP is mapped to EPnP
+                //Uncomment this when UPnP is fixed
+//            case SOLVEPNP_UPNP:
+//                if (points.size() > 50)
+//                {
+//                    opoints = std::vector<Point3f>(points.begin(), points.begin()+50);
+//                }
+//                else
+//                {
+//                    opoints = points;
+//                }
+//                break;
             default:
                 opoints = points;
                 break;
@@ -243,20 +558,19 @@ protected:
         projectedPoints.resize(opoints.size());
         projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
 
+        Mat rvec, tvec;
         bool isEstimateSuccess = solvePnP(opoints, projectedPoints, intrinsics, distCoeffs, rvec, tvec, false, method);
-        if (isEstimateSuccess == false)
+
+        if (!isEstimateSuccess)
         {
-            return isEstimateSuccess;
+            return false;
         }
 
         double rvecDiff = cvtest::norm(rvec, trueRvec, NORM_L2), tvecDiff = cvtest::norm(tvec, trueTvec, NORM_L2);
-        bool isTestSuccess = rvecDiff < epsilon[method] && tvecDiff < epsilon[method];
+        bool isTestSuccess = rvecDiff < epsilon_rot[method] && tvecDiff < epsilon_trans[method];
 
-        double error = rvecDiff > tvecDiff ? rvecDiff : tvecDiff;
-        if (error > maxError)
-        {
-            maxError = error;
-        }
+        errorTrans = tvecDiff;
+        errorRot = rvecDiff;
 
         return isTestSuccess;
     }
@@ -264,95 +578,129 @@ protected:
 
 class CV_solveP3P_Test : public CV_solvePnPRansac_Test
 {
- public:
-  CV_solveP3P_Test()
-  {
-    eps[SOLVEPNP_P3P] = 2.0e-4;
-    eps[SOLVEPNP_AP3P] = 1.0e-4;
-    totalTestsCount = 1000;
-  }
-
-  ~CV_solveP3P_Test() {}
- protected:
-  virtual bool runTest(RNG& rng, int mode, int method, const vector<Point3f>& points, const double* epsilon, double& maxError)
-  {
-    std::vector<Mat> rvecs, tvecs;
-    Mat trueRvec, trueTvec;
-    Mat intrinsics, distCoeffs;
-    generateCameraMatrix(intrinsics, rng);
-    if (mode == 0)
-      distCoeffs = Mat::zeros(4, 1, CV_64FC1);
-    else
-      generateDistCoeffs(distCoeffs, rng);
-    generatePose(trueRvec, trueTvec, rng);
-
-    std::vector<Point3f> opoints;
-    opoints = std::vector<Point3f>(points.begin(), points.begin()+3);
-
-    vector<Point2f> projectedPoints;
-    projectedPoints.resize(opoints.size());
-    projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
-
-    int num_of_solutions = solveP3P(opoints, projectedPoints, intrinsics, distCoeffs, rvecs, tvecs, method);
-    if (num_of_solutions != (int) rvecs.size() || num_of_solutions != (int) tvecs.size() || num_of_solutions == 0)
-      return false;
-
-    bool isTestSuccess = false;
-    double error = DBL_MAX;
-    for (unsigned int i = 0; i < rvecs.size() && !isTestSuccess; ++i) {
-      double rvecDiff = cvtest::norm(rvecs[i], trueRvec, NORM_L2);
-      double tvecDiff = cvtest::norm(tvecs[i], trueTvec, NORM_L2);
-      isTestSuccess = rvecDiff < epsilon[method] && tvecDiff < epsilon[method];
-      error = std::min(error, std::max(rvecDiff, tvecDiff));
-    }
-
-    if (error > maxError)
-      maxError = error;
-
-    return isTestSuccess;
-  }
-
-  virtual void run(int)
-  {
-    ts->set_failed_test_info(cvtest::TS::OK);
-
-    vector<Point3f> points;
-    points.resize(pointsCount);
-    generate3DPointCloud(points);
-
-    const int methodsCount = 2;
-    int methods[] = {SOLVEPNP_P3P, SOLVEPNP_AP3P};
-    RNG rng = ts->get_rng();
-
-    for (int mode = 0; mode < 2; mode++)
+public:
+    CV_solveP3P_Test()
     {
-      for (int method = 0; method < methodsCount; method++)
-      {
-        double maxError = 0;
-        int successfulTestsCount = 0;
-        for (int testIndex = 0; testIndex < totalTestsCount; testIndex++)
-        {
-          if (runTest(rng, mode, methods[method], points, eps, maxError))
-            successfulTestsCount++;
-        }
-        if (successfulTestsCount < 0.7*totalTestsCount)
-        {
-          ts->printf( cvtest::TS::LOG, "Invalid accuracy for method %d, failed %d tests from %d, maximum error equals %f, distortion mode equals %d\n",
-                      method, totalTestsCount - successfulTestsCount, totalTestsCount, maxError, mode);
-          ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
-        }
-        cout << "mode: " << mode << ", method: " << method << " -> "
-             << ((double)successfulTestsCount / totalTestsCount) * 100 << "%"
-             << " (err < " << maxError << ")" << endl;
-      }
+        eps[SOLVEPNP_P3P] = 2.0e-4;
+        eps[SOLVEPNP_AP3P] = 1.0e-4;
+        totalTestsCount = 1000;
+    }
+
+    ~CV_solveP3P_Test() {}
+protected:
+    //One solveP3P trial on the first three points. When solutions are returned, errorTrans / errorRot receive the smallest
+    //translation / rotation difference to the true pose among the solutions examined; otherwise they are left untouched.
+    virtual bool runTest(RNG& rng, int mode, int method, const vector<Point3f>& points, double& errorTrans, double& errorRot)
+    {
+        std::vector<Mat> rvecs, tvecs;
+        Mat trueRvec, trueTvec;
+        Mat intrinsics, distCoeffs;
+        generateCameraMatrix(intrinsics, rng);
+        if (mode == 0)
+        {
+            distCoeffs = Mat::zeros(4, 1, CV_64FC1);
+        }
+        else
+        {
+            generateDistCoeffs(distCoeffs, rng);
+        }
+        generatePose(points, trueRvec, trueTvec, rng);
+
+        std::vector<Point3f> opoints(points.begin(), points.begin()+3);
+
+        vector<Point2f> projectedPoints;
+        projectedPoints.resize(opoints.size());
+        projectPoints(opoints, trueRvec, trueTvec, intrinsics, distCoeffs, projectedPoints);
+
+        int num_of_solutions = solveP3P(opoints, projectedPoints, intrinsics, distCoeffs, rvecs, tvecs, method);
+        if (num_of_solutions != (int) rvecs.size() || num_of_solutions != (int) tvecs.size() || num_of_solutions == 0)
+        {
+            return false;
+        }
+
+        errorTrans = errorRot = DBL_MAX; //seed the running minima high: the caller passes them in as 0, which std::min would never replace
+        bool isTestSuccess = false;
+        for (size_t i = 0; i < rvecs.size() && !isTestSuccess; i++) {
+            double rvecDiff = cvtest::norm(rvecs[i], trueRvec, NORM_L2);
+            double tvecDiff = cvtest::norm(tvecs[i], trueTvec, NORM_L2);
+            isTestSuccess = rvecDiff < eps[method] && tvecDiff < eps[method];
+            errorTrans = std::min(errorTrans, tvecDiff);
+            errorRot = std::min(errorRot, rvecDiff);
+        }
+        return isTestSuccess;
+    }
+
+    //Accuracy test for solveP3P: totalTestsCount random trials for each distortion mode and each of the two P3P methods.
+    virtual void run(int)
+    {
+        ts->set_failed_test_info(cvtest::TS::OK);
+
+        vector<Point3f> points(static_cast<size_t>(pointsCount));
+        generate3DPointCloud(points);
+
+        const int methodsCount = 2;
+        int methods[] = {SOLVEPNP_P3P, SOLVEPNP_AP3P};
+        RNG rng = ts->get_rng();
+
+        for (int mode = 0; mode < 2; mode++)
+        {
+            for (int method = 0; method < methodsCount; method++)
+            {
+                //To get the same input for each methods: every method restarts from the same generator state
+                RNG rngCopy = rng;
+                std::vector<double> vec_errorTrans, vec_errorRot;
+                vec_errorTrans.reserve(static_cast<size_t>(totalTestsCount));
+                vec_errorRot.reserve(static_cast<size_t>(totalTestsCount));
+
+                int successfulTestsCount = 0;
+                for (int testIndex = 0; testIndex < totalTestsCount; testIndex++)
+                {
+                    double errorTrans = 0, errorRot = 0;
+                    if (runTest(rngCopy, mode, methods[method], points, errorTrans, errorRot))
+                    {
+                        successfulTestsCount++;
+                    }
+                    vec_errorTrans.push_back(errorTrans);
+                    vec_errorRot.push_back(errorRot);
+                }
+
+                double maxErrorTrans = getMax(vec_errorTrans);
+                double maxErrorRot = getMax(vec_errorRot);
+                double meanErrorTrans = getMean(vec_errorTrans);
+                double meanErrorRot = getMean(vec_errorRot);
+                double medianErrorTrans = getMedian(vec_errorTrans);
+                double medianErrorRot = getMedian(vec_errorRot);
+
+                if (successfulTestsCount < 0.7*totalTestsCount)
+                {
+                    ts->printf(cvtest::TS::LOG, "Invalid accuracy for %s, failed %d tests from %d, %s, "
+                                                "maxErrT: %f, maxErrR: %f, "
+                                                "meanErrT: %f, meanErrR: %f, "
+                                                "medErrT: %f, medErrR: %f\n",
+                               printMethod(methods[method]).c_str(), totalTestsCount - successfulTestsCount, totalTestsCount, printMode(mode).c_str(),
+                               maxErrorTrans, maxErrorRot, meanErrorTrans, meanErrorRot, medianErrorTrans, medianErrorRot);
+                    ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
+                }
+                cout << "mode: " << printMode(mode) << ", method: " << printMethod(methods[method]) << " -> "
+                     << ((double)successfulTestsCount / totalTestsCount) * 100 << "%"
+                     << " (maxErrT: " << maxErrorTrans << ", maxErrR: " << maxErrorRot
+                     << ", meanErrT: " << meanErrorTrans << ", meanErrR: " << meanErrorRot
+                     << ", medErrT: " << medianErrorTrans << ", medErrR: " << medianErrorRot << ")" << endl;
+                double transThres, rotThresh;
+                findThreshold(vec_errorTrans, vec_errorRot, 0.7, transThres, rotThresh);
+                cout << "approximate translation threshold for 0.7: " << transThres
+                     << ", approximate rotation threshold for 0.7: " << rotThresh << endl;
+            }
+        }
+    }
-  }
 };
 
 
 TEST(Calib3d_SolveP3P, accuracy) { CV_solveP3P_Test test; test.safe_run();}
 TEST(Calib3d_SolvePnPRansac, accuracy) { CV_solvePnPRansac_Test test; test.safe_run(); }
 TEST(Calib3d_SolvePnP, accuracy) { CV_solvePnP_Test test; test.safe_run(); }
+TEST(Calib3d_SolvePnP, accuracy_planar) { CV_solvePnP_Test test(true); test.safe_run(); } // planar_ = true, planarTag_ left at its default (false)
+TEST(Calib3d_SolvePnP, accuracy_planar_tag) { CV_solvePnP_Test test(true, true); test.safe_run(); } // planar_ = true and planarTag_ = true
 
 TEST(Calib3d_SolvePnPRansac, concurrency)
 {
@@ -367,6 +715,7 @@ TEST(Calib3d_SolvePnPRansac, concurrency)
     camera_mat.at<float>(1, 0) = 0.f;
     camera_mat.at<float>(2, 0) = 0.f;
     camera_mat.at<float>(2, 1) = 0.f;
+    camera_mat.at<float>(2, 2) = 1.f;
 
     Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
 
@@ -420,7 +769,7 @@ TEST(Calib3d_SolvePnPRansac, input_type)
 {
     const int numPoints = 10;
     Matx33d intrinsics(5.4794130238156129e+002, 0., 2.9835545700043139e+002, 0.,
-        5.4817724002728005e+002, 2.3062194051986233e+002, 0., 0., 1.);
+                       5.4817724002728005e+002, 2.3062194051986233e+002, 0., 0., 1.);
 
     std::vector<cv::Point3f> points3d;
     std::vector<cv::Point2f> points2d;
@@ -455,7 +804,7 @@ TEST(Calib3d_SolvePnPRansac, input_type)
     EXPECT_LE(cvtest::norm(t1, t4, NORM_INF), 1e-6);
 }
 
-TEST(Calib3d_SolvePnP, double_support)
+TEST(Calib3d_SolvePnPRansac, double_support)
 {
     Matx33d intrinsics(5.4794130238156129e+002, 0., 2.9835545700043139e+002, 0.,
                        5.4817724002728005e+002, 2.3062194051986233e+002, 0., 0., 1.);
@@ -466,15 +815,15 @@ TEST(Calib3d_SolvePnP, double_support)
     for (int i = 0; i < 10 ; i+=2)
     {
         points3d.push_back(cv::Point3d(5+i, 3, 2));
-        points3dF.push_back(cv::Point3d(5+i, 3, 2));
+        points3dF.push_back(cv::Point3f(static_cast<float>(5+i), 3, 2));
         points3d.push_back(cv::Point3d(5+i, 3+i, 2+i));
-        points3dF.push_back(cv::Point3d(5+i, 3+i, 2+i));
+        points3dF.push_back(cv::Point3f(static_cast<float>(5+i), static_cast<float>(3+i), static_cast<float>(2+i)));
         points2d.push_back(cv::Point2d(0, i));
-        points2dF.push_back(cv::Point2d(0, i));
+        points2dF.push_back(cv::Point2f(0, static_cast<float>(i)));
         points2d.push_back(cv::Point2d(-i, i));
-        points2dF.push_back(cv::Point2d(-i, i));
+        points2dF.push_back(cv::Point2f(static_cast<float>(-i), static_cast<float>(i)));
     }
-    Mat R,t, RF, tF;
+    Mat R, t, RF, tF;
     vector<int> inliers;
 
     solvePnPRansac(points3dF, points2dF, intrinsics, cv::Mat(), RF, tF, true, 100, 8.f, 0.999, inliers, cv::SOLVEPNP_P3P);
@@ -484,6 +833,367 @@ TEST(Calib3d_SolvePnP, double_support)
     EXPECT_LE(cvtest::norm(t, Mat_<double>(tF), NORM_INF), 1e-3);
 }
 
+TEST(Calib3d_SolvePnP, input_type) // mixes float/double inputs and outputs for every solver flag; checks output depth and accuracy
+{
+    Matx33d intrinsics(5.4794130238156129e+002, 0., 2.9835545700043139e+002, 0.,
+                       5.4817724002728005e+002, 2.3062194051986233e+002, 0., 0., 1.);
+    vector<Point3d> points3d_;
+    vector<Point3f> points3dF_;
+    //Cube with corners at +/-0.1 on each axis (l is negative, so -l is the positive coordinate)
+    const float l = -0.1f;
+    //Front face
+    points3d_.push_back(Point3d(-l, -l, -l));
+    points3dF_.push_back(Point3f(-l, -l, -l));
+    points3d_.push_back(Point3d(l, -l, -l));
+    points3dF_.push_back(Point3f(l, -l, -l));
+    points3d_.push_back(Point3d(l, l, -l));
+    points3dF_.push_back(Point3f(l, l, -l));
+    points3d_.push_back(Point3d(-l, l, -l));
+    points3dF_.push_back(Point3f(-l, l, -l));
+    //Back face
+    points3d_.push_back(Point3d(-l, -l, l));
+    points3dF_.push_back(Point3f(-l, -l, l));
+    points3d_.push_back(Point3d(l, -l, l));
+    points3dF_.push_back(Point3f(l, -l, l));
+    points3d_.push_back(Point3d(l, l, l));
+    points3dF_.push_back(Point3f(l, l, l));
+    points3d_.push_back(Point3d(-l, l, l));
+    points3dF_.push_back(Point3f(-l, l, l));
+
+    Mat trueRvec = (Mat_<double>(3,1) << 0.1, -0.25, 0.467);
+    Mat trueTvec = (Mat_<double>(3,1) << -0.21, 0.12, 0.746);
+
+    for (int method = 0; method < SOLVEPNP_MAX_COUNT; method++) // every solver flag value below SOLVEPNP_MAX_COUNT
+    {
+        vector<Point3d> points3d;
+        vector<Point2d> points2d;
+        vector<Point3f> points3dF;
+        vector<Point2f> points2dF;
+
+        if (method == SOLVEPNP_IPPE || method == SOLVEPNP_IPPE_SQUARE)
+        {
+            const float tagSize_2 = 0.05f / 2; // half side of the planar (z = 0) square fed to the IPPE variants
+            points3d.push_back(Point3d(-tagSize_2,  tagSize_2, 0));
+            points3d.push_back(Point3d( tagSize_2,  tagSize_2, 0));
+            points3d.push_back(Point3d( tagSize_2, -tagSize_2, 0));
+            points3d.push_back(Point3d(-tagSize_2, -tagSize_2, 0));
+
+            points3dF.push_back(Point3f(-tagSize_2,  tagSize_2, 0));
+            points3dF.push_back(Point3f( tagSize_2,  tagSize_2, 0));
+            points3dF.push_back(Point3f( tagSize_2, -tagSize_2, 0));
+            points3dF.push_back(Point3f(-tagSize_2, -tagSize_2, 0));
+        }
+        else if (method == SOLVEPNP_P3P || method == SOLVEPNP_AP3P)
+        {
+            points3d = vector<Point3d>(points3d_.begin(), points3d_.begin()+4); // first four cube corners (front face) only
+            points3dF = vector<Point3f>(points3dF_.begin(), points3dF_.begin()+4);
+        }
+        else
+        {
+            points3d = points3d_;
+            points3dF = points3dF_;
+        }
+
+        projectPoints(points3d, trueRvec, trueTvec, intrinsics, noArray(), points2d);
+        projectPoints(points3dF, trueRvec, trueTvec, intrinsics, noArray(), points2dF);
+
+        //solvePnP
+        {
+            Mat R, t, RF, tF;
+
+            solvePnP(points3dF, points2dF, Matx33f(intrinsics), Mat(), RF, tF, false, method);
+            solvePnP(points3d, points2d, intrinsics, Mat(), R, t, false, method);
+
+            //By default rvec and tvec must be returned in double precision
+            EXPECT_EQ(RF.type(), tF.type());
+            EXPECT_EQ(RF.type(), CV_64FC1);
+
+            EXPECT_EQ(R.type(), t.type());
+            EXPECT_EQ(R.type(), CV_64FC1);
+
+            EXPECT_LE(cvtest::norm(R, RF, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t, tF, NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, R, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, RF, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, tF, NORM_INF), 1e-3);
+        }
+        {
+            Mat R1, t1, R2, t2;
+
+            solvePnP(points3dF, points2d, intrinsics, Mat(), R1, t1, false, method);
+            solvePnP(points3d, points2dF, intrinsics, Mat(), R2, t2, false, method);
+
+            //By default rvec and tvec must be returned in double precision
+            EXPECT_EQ(R1.type(), t1.type());
+            EXPECT_EQ(R1.type(), CV_64FC1);
+
+            EXPECT_EQ(R2.type(), t2.type());
+            EXPECT_EQ(R2.type(), CV_64FC1);
+
+            EXPECT_LE(cvtest::norm(R1, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t1, t2, NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, R1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t2, NORM_INF), 1e-3);
+        }
+        {
+            Mat R1(3,1,CV_32FC1), t1(3,1,CV_64FC1);
+            Mat R2(3,1,CV_64FC1), t2(3,1,CV_32FC1);
+
+            solvePnP(points3dF, points2d, intrinsics, Mat(), R1, t1, false, method);
+            solvePnP(points3d, points2dF, intrinsics, Mat(), R2, t2, false, method);
+
+            //If preallocated (not empty), rvec and tvec must keep the depth they were created with
+            EXPECT_EQ(R1.type(), CV_32FC1);
+            EXPECT_EQ(t1.type(), CV_64FC1);
+
+            EXPECT_EQ(R2.type(), CV_64FC1);
+            EXPECT_EQ(t2.type(), CV_32FC1);
+
+            EXPECT_LE(cvtest::norm(Mat_<double>(R1), R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t1, Mat_<double>(t2), NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, Mat_<double>(R1), NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, Mat_<double>(t2), NORM_INF), 1e-3);
+        }
+        {
+            Matx31f R1, t2;
+            Matx31d R2, t1;
+
+            solvePnP(points3dF, points2d, intrinsics, Mat(), R1, t1, false, method);
+            solvePnP(points3d, points2dF, intrinsics, Mat(), R2, t2, false, method);
+
+            Matx31d R1d(R1(0), R1(1), R1(2));
+            Matx31d t2d(t2(0), t2(1), t2(2));
+
+            EXPECT_LE(cvtest::norm(R1d, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t1, t2d, NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, R1d, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t2d, NORM_INF), 1e-3);
+        }
+
+        //solvePnPGeneric
+        {
+            vector<Mat> Rs, ts, RFs, tFs;
+
+            int res1 = solvePnPGeneric(points3dF, points2dF, Matx33f(intrinsics), Mat(), RFs, tFs, false, (SolvePnPMethod)method);
+            int res2 = solvePnPGeneric(points3d, points2d, intrinsics, Mat(), Rs, ts, false, (SolvePnPMethod)method);
+
+            ASSERT_GT(res1, 0); // fatal on purpose: front() below is undefined on an empty result vector
+            ASSERT_GT(res2, 0);
+
+            Mat R = Rs.front(), t = ts.front(), RF = RFs.front(), tF = tFs.front();
+
+            //By default rvecs and tvecs must be returned in double precision
+            EXPECT_EQ(RF.type(), tF.type());
+            EXPECT_EQ(RF.type(), CV_64FC1);
+
+            EXPECT_EQ(R.type(), t.type());
+            EXPECT_EQ(R.type(), CV_64FC1);
+
+            EXPECT_LE(cvtest::norm(R, RF, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t, tF, NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, R, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, RF, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, tF, NORM_INF), 1e-3);
+        }
+        {
+            vector<Mat> R1s, t1s, R2s, t2s;
+
+            int res1 = solvePnPGeneric(points3dF, points2d, intrinsics, Mat(), R1s, t1s, false, (SolvePnPMethod)method);
+            int res2 = solvePnPGeneric(points3d, points2dF, intrinsics, Mat(), R2s, t2s, false, (SolvePnPMethod)method);
+
+            ASSERT_GT(res1, 0);
+            ASSERT_GT(res2, 0);
+
+            Mat R1 = R1s.front(), t1 = t1s.front(), R2 = R2s.front(), t2 = t2s.front();
+
+            //By default rvecs and tvecs must be returned in double precision
+            EXPECT_EQ(R1.type(), t1.type());
+            EXPECT_EQ(R1.type(), CV_64FC1);
+
+            EXPECT_EQ(R2.type(), t2.type());
+            EXPECT_EQ(R2.type(), CV_64FC1);
+
+            EXPECT_LE(cvtest::norm(R1, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t1, t2, NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, R1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t2, NORM_INF), 1e-3);
+        }
+        {
+            vector<Mat_<float> > R1s, t2s;
+            vector<Mat_<double> > R2s, t1s;
+
+            int res1 = solvePnPGeneric(points3dF, points2d, intrinsics, Mat(), R1s, t1s, false, (SolvePnPMethod)method);
+            int res2 = solvePnPGeneric(points3d, points2dF, intrinsics, Mat(), R2s, t2s, false, (SolvePnPMethod)method);
+
+            ASSERT_GT(res1, 0);
+            ASSERT_GT(res2, 0);
+
+            Mat R1 = R1s.front(), t1 = t1s.front();
+            Mat R2 = R2s.front(), t2 = t2s.front();
+
+            //With typed output containers, rvecs and tvecs must come back in the container's element precision
+            EXPECT_EQ(R1.type(), CV_32FC1);
+            EXPECT_EQ(t1.type(), CV_64FC1);
+
+            EXPECT_EQ(R2.type(), CV_64FC1);
+            EXPECT_EQ(t2.type(), CV_32FC1);
+
+            EXPECT_LE(cvtest::norm(Mat_<double>(R1), R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t1, Mat_<double>(t2), NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, Mat_<double>(R1), NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, Mat_<double>(t2), NORM_INF), 1e-3);
+        }
+        {
+            vector<Matx31f> R1s, t2s;
+            vector<Matx31d> R2s, t1s;
+
+            int res1 = solvePnPGeneric(points3dF, points2d, intrinsics, Mat(), R1s, t1s, false, (SolvePnPMethod)method);
+            int res2 = solvePnPGeneric(points3d, points2dF, intrinsics, Mat(), R2s, t2s, false, (SolvePnPMethod)method);
+
+            ASSERT_GT(res1, 0);
+            ASSERT_GT(res2, 0);
+
+            Matx31f R1 = R1s.front(), t2 = t2s.front();
+            Matx31d R2 = R2s.front(), t1 = t1s.front();
+            Matx31d R1d(R1(0), R1(1), R1(2)), t2d(t2(0), t2(1), t2(2));
+
+            EXPECT_LE(cvtest::norm(R1d, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(t1, t2d, NORM_INF), 1e-3);
+
+            EXPECT_LE(cvtest::norm(trueRvec, R1d, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+            EXPECT_LE(cvtest::norm(trueTvec, t2d, NORM_INF), 1e-3);
+        }
+
+        if (method == SOLVEPNP_P3P || method == SOLVEPNP_AP3P)
+        {
+            //solveP3P
+            {
+                vector<Mat> Rs, ts, RFs, tFs;
+
+                int res1 = solveP3P(points3dF, points2dF, Matx33f(intrinsics), Mat(), RFs, tFs, (SolvePnPMethod)method);
+                int res2 = solveP3P(points3d, points2d, intrinsics, Mat(), Rs, ts, (SolvePnPMethod)method);
+
+                ASSERT_GT(res1, 0); // fatal on purpose: front() below is undefined on an empty result vector
+                ASSERT_GT(res2, 0);
+
+                Mat R = Rs.front(), t = ts.front(), RF = RFs.front(), tF = tFs.front();
+
+                //By default rvecs and tvecs must be returned in double precision
+                EXPECT_EQ(RF.type(), tF.type());
+                EXPECT_EQ(RF.type(), CV_64FC1);
+
+                EXPECT_EQ(R.type(), t.type());
+                EXPECT_EQ(R.type(), CV_64FC1);
+
+                EXPECT_LE(cvtest::norm(R, RF, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(t, tF, NORM_INF), 1e-3);
+
+                EXPECT_LE(cvtest::norm(trueRvec, R, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, t, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueRvec, RF, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, tF, NORM_INF), 1e-3);
+            }
+            {
+                vector<Mat> R1s, t1s, R2s, t2s;
+
+                int res1 = solveP3P(points3dF, points2d, intrinsics, Mat(), R1s, t1s, (SolvePnPMethod)method);
+                int res2 = solveP3P(points3d, points2dF, intrinsics, Mat(), R2s, t2s, (SolvePnPMethod)method);
+
+                ASSERT_GT(res1, 0);
+                ASSERT_GT(res2, 0);
+
+                Mat R1 = R1s.front(), t1 = t1s.front(), R2 = R2s.front(), t2 = t2s.front();
+
+                //By default rvecs and tvecs must be returned in double precision
+                EXPECT_EQ(R1.type(), t1.type());
+                EXPECT_EQ(R1.type(), CV_64FC1);
+
+                EXPECT_EQ(R2.type(), t2.type());
+                EXPECT_EQ(R2.type(), CV_64FC1);
+
+                EXPECT_LE(cvtest::norm(R1, R2, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(t1, t2, NORM_INF), 1e-3);
+
+                EXPECT_LE(cvtest::norm(trueRvec, R1, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, t2, NORM_INF), 1e-3);
+            }
+            {
+                vector<Mat_<float> > R1s, t2s;
+                vector<Mat_<double> > R2s, t1s;
+
+                int res1 = solveP3P(points3dF, points2d, intrinsics, Mat(), R1s, t1s, (SolvePnPMethod)method);
+                int res2 = solveP3P(points3d, points2dF, intrinsics, Mat(), R2s, t2s, (SolvePnPMethod)method);
+
+                ASSERT_GT(res1, 0);
+                ASSERT_GT(res2, 0);
+
+                Mat R1 = R1s.front(), t1 = t1s.front();
+                Mat R2 = R2s.front(), t2 = t2s.front();
+
+                //With typed output containers, rvecs and tvecs must come back in the container's element precision
+                EXPECT_EQ(R1.type(), CV_32FC1);
+                EXPECT_EQ(t1.type(), CV_64FC1);
+
+                EXPECT_EQ(R2.type(), CV_64FC1);
+                EXPECT_EQ(t2.type(), CV_32FC1);
+
+                EXPECT_LE(cvtest::norm(Mat_<double>(R1), R2, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(t1, Mat_<double>(t2), NORM_INF), 1e-3);
+
+                EXPECT_LE(cvtest::norm(trueRvec, Mat_<double>(R1), NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, Mat_<double>(t2), NORM_INF), 1e-3);
+            }
+            {
+                vector<Matx31f> R1s, t2s;
+                vector<Matx31d> R2s, t1s;
+
+                int res1 = solveP3P(points3dF, points2d, intrinsics, Mat(), R1s, t1s, (SolvePnPMethod)method);
+                int res2 = solveP3P(points3d, points2dF, intrinsics, Mat(), R2s, t2s, (SolvePnPMethod)method);
+
+                ASSERT_GT(res1, 0);
+                ASSERT_GT(res2, 0);
+
+                Matx31f R1 = R1s.front(), t2 = t2s.front();
+                Matx31d R2 = R2s.front(), t1 = t1s.front();
+                Matx31d R1d(R1(0), R1(1), R1(2)), t2d(t2(0), t2(1), t2(2));
+
+                EXPECT_LE(cvtest::norm(R1d, R2, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(t1, t2d, NORM_INF), 1e-3);
+
+                EXPECT_LE(cvtest::norm(trueRvec, R1d, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, t1, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueRvec, R2, NORM_INF), 1e-3);
+                EXPECT_LE(cvtest::norm(trueTvec, t2d, NORM_INF), 1e-3);
+            }
+        }
+    }
+}
+
 TEST(Calib3d_SolvePnP, translation)
 {
     Mat cameraIntrinsic = Mat::eye(3,3, CV_32FC1);
@@ -548,13 +1258,16 @@ TEST(Calib3d_SolvePnP, iterativeInitialGuess3pts)
 
         solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, true, SOLVEPNP_ITERATIVE);
 
-        std::cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
-        std::cout << "rvec_est: " << rvec_est.t() << std::endl;
-        std::cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
-        std::cout << "tvec_est: " << tvec_est.t() << std::endl;
+        cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+        cout << "rvec_est: " << rvec_est.t() << std::endl;
+        cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+        cout << "tvec_est: " << tvec_est.t() << std::endl;
 
         EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
         EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+
+        EXPECT_EQ(rvec_est.type(), CV_64FC1);
+        EXPECT_EQ(tvec_est.type(), CV_64FC1);
     }
 
     {
@@ -579,13 +1292,230 @@ TEST(Calib3d_SolvePnP, iterativeInitialGuess3pts)
 
         solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, true, SOLVEPNP_ITERATIVE);
 
-        std::cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
-        std::cout << "rvec_est: " << rvec_est.t() << std::endl;
-        std::cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
-        std::cout << "tvec_est: " << tvec_est.t() << std::endl;
+        cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+        cout << "rvec_est: " << rvec_est.t() << std::endl;
+        cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+        cout << "tvec_est: " << tvec_est.t() << std::endl;
 
         EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
         EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+
+        EXPECT_EQ(rvec_est.type(), CV_32FC1);
+        EXPECT_EQ(tvec_est.type(), CV_32FC1);
+    }
+}
+
+TEST(Calib3d_SolvePnP, iterativeInitialGuess) // SOLVEPNP_ITERATIVE started from a caller-supplied pose, once in double and once in float
+{
+    { // double-precision case: intrinsics, points and initial guess are all double
+        Matx33d intrinsics(605.4, 0.0, 317.35,
+                           0.0, 601.2, 242.63,
+                           0.0, 0.0, 1.0);
+
+        double L = 0.1;
+        vector<Point3d> p3d; // five non-coplanar object points (the last two leave the z = 0 plane)
+        p3d.push_back(Point3d(-L, -L, 0.0));
+        p3d.push_back(Point3d(L, -L, 0.0));
+        p3d.push_back(Point3d(L, L, 0.0));
+        p3d.push_back(Point3d(-L, L, L/2));
+        p3d.push_back(Point3d(0, 0, -L/2));
+
+        Mat rvec_ground_truth = (Mat_<double>(3,1) << 0.3, -0.2, 0.75);
+        Mat tvec_ground_truth = (Mat_<double>(3,1) << 0.15, -0.2, 1.5);
+
+        vector<Point2d> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d); // synthetic image points from the true pose
+
+        Mat rvec_est = (Mat_<double>(3,1) << 0.1, -0.1, 0.1); // initial guess, overwritten in place by solvePnP
+        Mat tvec_est = (Mat_<double>(3,1) << 0.0, -0.5, 1.0);
+
+        solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, true, SOLVEPNP_ITERATIVE); // 'true': presumably useExtrinsicGuess - confirm against the solvePnP signature
+
+        cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+        cout << "rvec_est: " << rvec_est.t() << std::endl;
+        cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+        cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+        EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+        EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+
+        EXPECT_EQ(rvec_est.type(), CV_64FC1); // the double initial guess must stay double
+        EXPECT_EQ(tvec_est.type(), CV_64FC1);
+    }
+
+    { // single-precision case: same scenario with float intrinsics, points and initial guess
+        Matx33f intrinsics(605.4f, 0.0f, 317.35f,
+                           0.0f, 601.2f, 242.63f,
+                           0.0f, 0.0f, 1.0f);
+
+        float L = 0.1f;
+        vector<Point3f> p3d; // same five-point layout as the double case
+        p3d.push_back(Point3f(-L, -L, 0.0f));
+        p3d.push_back(Point3f(L, -L, 0.0f));
+        p3d.push_back(Point3f(L, L, 0.0f));
+        p3d.push_back(Point3f(-L, L, L/2));
+        p3d.push_back(Point3f(0, 0, -L/2));
+
+        Mat rvec_ground_truth = (Mat_<float>(3,1) << -0.75f, 0.4f, 0.34f);
+        Mat tvec_ground_truth = (Mat_<float>(3,1) << -0.15f, 0.35f, 1.58f);
+
+        vector<Point2f> p2d;
+        projectPoints(p3d, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d); // synthetic image points from the true pose
+
+        Mat rvec_est = (Mat_<float>(3,1) << -0.1f, 0.1f, 0.1f); // initial guess, overwritten in place by solvePnP
+        Mat tvec_est = (Mat_<float>(3,1) << 0.0f, 0.0f, 1.0f);
+
+        solvePnP(p3d, p2d, intrinsics, noArray(), rvec_est, tvec_est, true, SOLVEPNP_ITERATIVE);
+
+        cout << "rvec_ground_truth: " << rvec_ground_truth.t() << std::endl;
+        cout << "rvec_est: " << rvec_est.t() << std::endl;
+        cout << "tvec_ground_truth: " << tvec_ground_truth.t() << std::endl;
+        cout << "tvec_est: " << tvec_est.t() << std::endl;
+
+        EXPECT_LE(cvtest::norm(rvec_ground_truth, rvec_est, NORM_INF), 1e-6);
+        EXPECT_LE(cvtest::norm(tvec_ground_truth, tvec_est, NORM_INF), 1e-6);
+
+        EXPECT_EQ(rvec_est.type(), CV_32FC1); // the float initial guess must stay float
+        EXPECT_EQ(tvec_est.type(), CV_32FC1);
+    }
+}
+
+TEST(Calib3d_SolvePnP, generic) // solvePnPGeneric on random poses for every solver flag, once with double and once with float data
+{
+    { // double-precision inputs
+        Matx33d intrinsics(605.4, 0.0, 317.35,
+                           0.0, 601.2, 242.63,
+                           0.0, 0.0, 1.0);
+
+        double L = 0.1;
+        vector<Point3d> p3d_; // first four points: planar square (z = 0); last two leave the plane
+        p3d_.push_back(Point3d(-L, L, 0));
+        p3d_.push_back(Point3d(L, L, 0));
+        p3d_.push_back(Point3d(L, -L, 0));
+        p3d_.push_back(Point3d(-L, -L, 0));
+        p3d_.push_back(Point3d(-L, L, L/2));
+        p3d_.push_back(Point3d(0, 0, -L/2));
+
+        const int ntests = 10;
+        for (int numTest = 0; numTest < ntests; numTest++)
+        {
+            Mat rvec_ground_truth;
+            Mat tvec_ground_truth;
+            generatePose(p3d_, rvec_ground_truth, tvec_ground_truth, theRNG());
+
+            vector<Point2d> p2d_;
+            projectPoints(p3d_, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2d_);
+
+            for (int method = 0; method < SOLVEPNP_MAX_COUNT; method++)
+            {
+                vector<Mat> rvecs_est;
+                vector<Mat> tvecs_est;
+
+                vector<Point3d> p3d;
+                vector<Point2d> p2d;
+                if (method == SOLVEPNP_P3P || method == SOLVEPNP_AP3P ||
+                    method == SOLVEPNP_IPPE || method == SOLVEPNP_IPPE_SQUARE)
+                {
+                    p3d = vector<Point3d>(p3d_.begin(), p3d_.begin()+4); // these flags get only the four coplanar points
+                    p2d = vector<Point2d>(p2d_.begin(), p2d_.begin()+4);
+                }
+                else
+                {
+                    p3d = p3d_;
+                    p2d = p2d_;
+                }
+
+                vector<double> reprojectionErrors;
+                solvePnPGeneric(p3d, p2d, intrinsics, noArray(), rvecs_est, tvecs_est, false, (SolvePnPMethod)method,
+                                noArray(), noArray(), reprojectionErrors);
+
+                EXPECT_TRUE(!rvecs_est.empty());
+                EXPECT_TRUE(rvecs_est.size() == tvecs_est.size() && tvecs_est.size() == reprojectionErrors.size());
+
+                for (size_t i = 1; i < reprojectionErrors.size(); i++) // start at 1: size()-1 would wrap around for an empty vector
+                {
+                    EXPECT_GE(reprojectionErrors[i], reprojectionErrors[i-1]); // errors must be in non-decreasing order
+                }
+
+                bool isTestSuccess = false; // passes as soon as any returned solution matches the ground truth
+                for (size_t i = 0; i < rvecs_est.size() && !isTestSuccess; i++) {
+                    double rvecDiff = cvtest::norm(rvecs_est[i], rvec_ground_truth, NORM_L2);
+                    double tvecDiff = cvtest::norm(tvecs_est[i], tvec_ground_truth, NORM_L2);
+                    const double threshold = method == SOLVEPNP_P3P ? 1e-2 : 1e-4; // P3P is given a looser tolerance
+                    isTestSuccess = rvecDiff < threshold && tvecDiff < threshold;
+                }
+
+                EXPECT_TRUE(isTestSuccess);
+            }
+        }
+    }
+
+    { // single-precision inputs, same scenario
+        Matx33f intrinsics(605.4f, 0.0f, 317.35f,
+                           0.0f, 601.2f, 242.63f,
+                           0.0f, 0.0f, 1.0f);
+
+        float L = 0.1f;
+        vector<Point3f> p3f_; // same six-point layout as the double case
+        p3f_.push_back(Point3f(-L, L, 0));
+        p3f_.push_back(Point3f(L, L, 0));
+        p3f_.push_back(Point3f(L, -L, 0));
+        p3f_.push_back(Point3f(-L, -L, 0));
+        p3f_.push_back(Point3f(-L, L, L/2));
+        p3f_.push_back(Point3f(0, 0, -L/2));
+
+        const int ntests = 10;
+        for (int numTest = 0; numTest < ntests; numTest++)
+        {
+            Mat rvec_ground_truth;
+            Mat tvec_ground_truth;
+            generatePose(p3f_, rvec_ground_truth, tvec_ground_truth, theRNG());
+
+            vector<Point2f> p2f_;
+            projectPoints(p3f_, rvec_ground_truth, tvec_ground_truth, intrinsics, noArray(), p2f_);
+
+            for (int method = 0; method < SOLVEPNP_MAX_COUNT; method++)
+            {
+                vector<Mat> rvecs_est;
+                vector<Mat> tvecs_est;
+
+                vector<Point3f> p3f;
+                vector<Point2f> p2f;
+                if (method == SOLVEPNP_P3P || method == SOLVEPNP_AP3P ||
+                    method == SOLVEPNP_IPPE || method == SOLVEPNP_IPPE_SQUARE)
+                {
+                    p3f = vector<Point3f>(p3f_.begin(), p3f_.begin()+4); // these flags get only the four coplanar points
+                    p2f = vector<Point2f>(p2f_.begin(), p2f_.begin()+4);
+                }
+                else
+                {
+                    p3f = p3f_;
+                    p2f = p2f_;
+                }
+
+                vector<double> reprojectionErrors;
+                solvePnPGeneric(p3f, p2f, intrinsics, noArray(), rvecs_est, tvecs_est, false, (SolvePnPMethod)method,
+                                noArray(), noArray(), reprojectionErrors);
+
+                EXPECT_TRUE(!rvecs_est.empty());
+                EXPECT_TRUE(rvecs_est.size() == tvecs_est.size() && tvecs_est.size() == reprojectionErrors.size());
+
+                for (size_t i = 1; i < reprojectionErrors.size(); i++) // start at 1: size()-1 would wrap around for an empty vector
+                {
+                    EXPECT_GE(reprojectionErrors[i], reprojectionErrors[i-1]); // errors must be in non-decreasing order
+                }
+
+                bool isTestSuccess = false; // passes as soon as any returned solution matches the ground truth
+                for (size_t i = 0; i < rvecs_est.size() && !isTestSuccess; i++) {
+                    double rvecDiff = cvtest::norm(rvecs_est[i], rvec_ground_truth, NORM_L2);
+                    double tvecDiff = cvtest::norm(tvecs_est[i], tvec_ground_truth, NORM_L2);
+                    const double threshold = method == SOLVEPNP_P3P ? 1e-2 : 1e-4; // P3P is given a looser tolerance
+                    isTestSuccess = rvecDiff < threshold && tvecDiff < threshold;
+                }
+
+                EXPECT_TRUE(isTestSuccess);
+            }
+        }
     }
 }
 
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index f484e9e108..5d76f52494 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -188,7 +188,7 @@ enum NormTypes {
                  norm =  \forkthree
                  { \| \texttt{src1} \| _{L_2} ^{2} = \sum_I \texttt{src1}(I)^2} {if  \(\texttt{normType} = \texttt{NORM_L2SQR}\)}
                  { \| \texttt{src1} - \texttt{src2} \| _{L_2} ^{2} =  \sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2 }{if  \(\texttt{normType} = \texttt{NORM_L2SQR}\) }
-                 { \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2}\) }
+                 { \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2SQR}\) }
                  \f]
                  */
                  NORM_L2SQR     = 5,
diff --git a/modules/core/misc/java/src/java/core+CvType.java b/modules/core/misc/java/src/java/core+CvType.java
index a03b794bb9..fcf616fe02 100644
--- a/modules/core/misc/java/src/java/core+CvType.java
+++ b/modules/core/misc/java/src/java/core+CvType.java
@@ -34,11 +34,11 @@ public final class CvType {
 
     public static final int makeType(int depth, int channels) {
         if (channels <= 0 || channels >= CV_CN_MAX) {
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Channels count should be 1.." + (CV_CN_MAX - 1));
         }
         if (depth < 0 || depth >= CV_DEPTH_MAX) {
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Data type depth should be 0.." + (CV_DEPTH_MAX - 1));
         }
         return (depth & (CV_DEPTH_MAX - 1)) + ((channels - 1) << CV_CN_SHIFT);
@@ -103,7 +103,7 @@ public final class CvType {
         case CV_64F:
             return 8 * channels(type);
         default:
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Unsupported CvType value: " + type);
         }
     }
@@ -136,7 +136,7 @@ public final class CvType {
             s = "CV_16F";
             break;
         default:
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Unsupported CvType value: " + type);
         }
 
diff --git a/modules/core/misc/java/src/java/core+Mat.java b/modules/core/misc/java/src/java/core+Mat.java
index e42fca9897..3bcb1ee9f7 100644
--- a/modules/core/misc/java/src/java/core+Mat.java
+++ b/modules/core/misc/java/src/java/core+Mat.java
@@ -11,7 +11,7 @@ public class Mat {
     public Mat(long addr)
     {
         if (addr == 0)
-            throw new java.lang.UnsupportedOperationException("Native object address is NULL");
+            throw new UnsupportedOperationException("Native object address is NULL");
         nativeObj = addr;
     }
 
@@ -1074,7 +1074,7 @@ public class Mat {
     public int put(int row, int col, double... data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1086,7 +1086,7 @@ public class Mat {
     public int put(int[] idx, double... data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1100,7 +1100,7 @@ public class Mat {
     public int put(int row, int col, float[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1108,14 +1108,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32F) {
             return nPutF(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(idx,data)
     public int put(int[] idx, float[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1125,14 +1125,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32F) {
             return nPutFIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(row,col,data)
     public int put(int row, int col, int[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1140,14 +1140,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32S) {
             return nPutI(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(idx,data)
     public int put(int[] idx, int[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1157,14 +1157,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32S) {
             return nPutIIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(row,col,data)
     public int put(int row, int col, short[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1172,14 +1172,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_16U || CvType.depth(t) == CvType.CV_16S) {
             return nPutS(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(idx,data)
     public int put(int[] idx, short[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1189,14 +1189,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_16U || CvType.depth(t) == CvType.CV_16S) {
             return nPutSIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(row,col,data)
     public int put(int row, int col, byte[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1204,14 +1204,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) {
             return nPutB(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(idx,data)
     public int put(int[] idx, byte[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1221,14 +1221,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) {
             return nPutBIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(row,col,data,offset,length)
     public int put(int row, int col, byte[] data, int offset, int length) {
         int t = type();
         if (data == null || length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
-                            (data == null ? 0 : data.length) +
+                            (data == null ? 0 : length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1236,14 +1236,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) {
             return nPutBwOffset(nativeObj, row, col, length, offset, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::put(idx,data,offset,length)
     public int put(int[] idx, byte[] data, int offset, int length) {
         int t = type();
         if (data == null || length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
-                            (data == null ? 0 : data.length) +
+                            (data == null ? 0 : length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1253,14 +1253,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) {
             return nPutBwIdxOffset(nativeObj, idx, length, offset, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(row,col,data)
     public int get(int row, int col, byte[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1268,14 +1268,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) {
             return nGetB(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(idx,data)
     public int get(int[] idx, byte[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1285,14 +1285,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) {
             return nGetBIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(row,col,data)
     public int get(int row, int col, short[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1300,14 +1300,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_16U || CvType.depth(t) == CvType.CV_16S) {
             return nGetS(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(idx,data)
     public int get(int[] idx, short[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1317,14 +1317,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_16U || CvType.depth(t) == CvType.CV_16S) {
             return nGetSIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(row,col,data)
     public int get(int row, int col, int[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1332,14 +1332,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32S) {
             return nGetI(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(idx,data)
     public int get(int[] idx, int[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1349,14 +1349,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32S) {
             return nGetIIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(row,col,data)
     public int get(int row, int col, float[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1364,14 +1364,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32F) {
             return nGetF(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(idx,data)
     public int get(int[] idx, float[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1381,14 +1381,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_32F) {
             return nGetFIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(row,col,data)
     public int get(int row, int col, double[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1396,14 +1396,14 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_64F) {
             return nGetD(nativeObj, row, col, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(idx,data)
     public int get(int[] idx, double[] data) {
         int t = type();
         if (data == null || data.length % CvType.channels(t) != 0)
-            throw new java.lang.UnsupportedOperationException(
+            throw new UnsupportedOperationException(
                     "Provided data element number (" +
                             (data == null ? 0 : data.length) +
                             ") should be multiple of the Mat channels count (" +
@@ -1413,7 +1413,7 @@ public class Mat {
         if (CvType.depth(t) == CvType.CV_64F) {
             return nGetDIdx(nativeObj, idx, data.length, data);
         }
-        throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t);
+        throw new UnsupportedOperationException("Mat data type is not compatible: " + t);
     }
 
     // javadoc:Mat::get(row,col)
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 02cc825cff..dce2bd7b73 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -816,6 +816,7 @@ CV__DNN_INLINE_NS_BEGIN
       *                  * `*.t7` | `*.net` (Torch, http://torch.ch/)
       *                  * `*.weights` (Darknet, https://pjreddie.com/darknet/)
       *                  * `*.bin` (DLDT, https://software.intel.com/openvino-toolkit)
+      *                  * `*.onnx` (ONNX, https://onnx.ai/)
       * @param[in] config Text file contains network configuration. It could be a
       *                   file with the following extensions:
       *                  * `*.prototxt` (Caffe, http://caffe.berkeleyvision.org/)
@@ -864,6 +865,23 @@ CV__DNN_INLINE_NS_BEGIN
      */
     CV_EXPORTS_W Net readNetFromONNX(const String &onnxFile);
 
+    /** @brief Reads a network model from <a href="https://onnx.ai/">ONNX</a>
+     *         in-memory buffer.
+     *  @param buffer memory address of the first byte of the buffer.
+     *  @param sizeBuffer size of the buffer.
+     *  @returns Network object that is ready to do forward; throws an exception
+     *        in failure cases.
+     */
+    CV_EXPORTS Net readNetFromONNX(const char* buffer, size_t sizeBuffer);
+
+    /** @brief Reads a network model from <a href="https://onnx.ai/">ONNX</a>
+     *         in-memory buffer.
+     *  @param buffer in-memory buffer that stores the ONNX model bytes.
+     *  @returns Network object that is ready to do forward; throws an exception
+     *        in failure cases.
+     */
+    CV_EXPORTS_W Net readNetFromONNX(const std::vector<uchar>& buffer);
+
     /** @brief Creates blob from .pb file.
      *  @param path to the .pb file with input tensor.
      *  @returns Mat.
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 4c69c247c4..b2fd75aef1 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -29,6 +29,8 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
 public:
     Mat weights_, bias_;
     UMat umat_weight, umat_bias;
+    mutable int dims;
+
 
     BatchNormLayerImpl(const LayerParams& params)
     {
@@ -142,6 +144,7 @@ public:
                          std::vector<MatShape> &outputs,
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
+        dims = inputs[0].size();
         if (!useGlobalStats && inputs[0][0] != 1)
             CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
         Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
@@ -150,9 +153,9 @@ public:
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV ||
+        return (backendId == DNN_BACKEND_OPENCV) ||
                (backendId == DNN_BACKEND_HALIDE && haveHalide()) ||
-               (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine());
+               (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4));
     }
 
 #ifdef HAVE_OPENCL
@@ -178,11 +181,12 @@ public:
         }
 
         UMat &inpBlob = inputs[0];
-        CV_Assert(inpBlob.dims == 2 || inpBlob.dims == 4);
         int groups = inpBlob.size[0];
         int channels = inpBlob.size[1];
-        int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
-        int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;
+        int planeSize = 1;
+        for (int i = 2; i < inpBlob.dims; i++) {
+            planeSize *= inpBlob.size[i];
+        }
 
         String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
         for (size_t ii = 0; ii < outputs.size(); ii++)
@@ -196,7 +200,7 @@ public:
             }
             else
             {
-                MatShape s = shape(groups * channels, rows * cols);
+                MatShape s = shape(groups * channels, planeSize);
                 UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
                 UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
                 int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
@@ -248,9 +252,10 @@ public:
         CV_Assert(inputs.size() == 1);
 
         Mat &inpBlob = inputs[0];
-        CV_Assert(inpBlob.dims == 2 || inpBlob.dims == 4);
-        int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
-        int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;
+        int planeSize = 1;
+        for (int i = 2; i < inpBlob.dims; i++) {
+            planeSize *= inpBlob.size[i];
+        }
 
         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
@@ -262,8 +267,8 @@ public:
                 {
                     float w = weights_.at<float>(n);
                     float b = bias_.at<float>(n);
-                    Mat inpBlobPlane(rows, cols, CV_32F, inpBlob.ptr<float>(num, n));
-                    Mat outBlobPlane(rows, cols, CV_32F, outBlob.ptr<float>(num, n));
+                    Mat inpBlobPlane(1, planeSize, CV_32F, inpBlob.ptr<float>(num, n));
+                    Mat outBlobPlane(1, planeSize, CV_32F, outBlob.ptr<float>(num, n));
                     inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
                 }
             }
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index e55d7cd5a4..ac91907c5d 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -57,6 +57,24 @@ public:
             CV_Error(Error::StsUnsupportedFormat, "Failed to parse onnx model");
     }
 
+    ONNXImporter(const char* buffer, size_t sizeBuffer)
+    {
+        struct _Buf : public std::streambuf
+        {
+            _Buf(const char* buffer, size_t sizeBuffer)
+            {
+                char* p = const_cast<char*>(buffer);
+                setg(p, p, p + sizeBuffer);
+            }
+        };
+
+        _Buf buf(buffer, sizeBuffer);
+        std::istream input(&buf);
+
+        if (!model_proto.ParseFromIstream(&input))
+            CV_Error(Error::StsUnsupportedFormat, "Failed to parse onnx model from in-memory byte array.");
+    }
+
     void populateNet(Net dstNet);
 };
 
@@ -768,37 +786,42 @@ void ONNXImporter::populateNet(Net dstNet)
             }
             replaceLayerParam(layerParams, "mode", "interpolation");
         }
+        else if (layer_type == "LogSoftmax")
+        {
+            layerParams.type = "Softmax";
+            layerParams.set("log_softmax", true);
+        }
         else
         {
             for (int j = 0; j < node_proto.input_size(); j++) {
                 if (layer_id.find(node_proto.input(j)) == layer_id.end())
                     layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
             }
-         }
+        }
 
-         int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
-         layer_id.insert(std::make_pair(layerParams.name, LayerInfo(id, 0)));
+        int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
+        layer_id.insert(std::make_pair(layerParams.name, LayerInfo(id, 0)));
 
 
-         std::vector<MatShape> layerInpShapes, layerOutShapes, layerInternalShapes;
-         for (int j = 0; j < node_proto.input_size(); j++) {
-             layerId = layer_id.find(node_proto.input(j));
-             if (layerId != layer_id.end()) {
-                 dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, j);
-                 // Collect input shapes.
-                 shapeIt = outShapes.find(node_proto.input(j));
-                 CV_Assert(shapeIt != outShapes.end());
-                 layerInpShapes.push_back(shapeIt->second);
-             }
-         }
+        std::vector<MatShape> layerInpShapes, layerOutShapes, layerInternalShapes;
+        for (int j = 0; j < node_proto.input_size(); j++) {
+            layerId = layer_id.find(node_proto.input(j));
+            if (layerId != layer_id.end()) {
+                dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, j);
+                // Collect input shapes.
+                shapeIt = outShapes.find(node_proto.input(j));
+                CV_Assert(shapeIt != outShapes.end());
+                layerInpShapes.push_back(shapeIt->second);
+            }
+        }
 
-         // Compute shape of output blob for this layer.
-         Ptr<Layer> layer = dstNet.getLayer(id);
-         layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes);
-         CV_Assert(!layerOutShapes.empty());
-         outShapes[layerParams.name] = layerOutShapes[0];
-     }
- }
+        // Compute shape of output blob for this layer.
+        Ptr<Layer> layer = dstNet.getLayer(id);
+        layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes);
+        CV_Assert(!layerOutShapes.empty());
+        outShapes[layerParams.name] = layerOutShapes[0];
+    }
+}
 
 Net readNetFromONNX(const String& onnxFile)
 {
@@ -808,6 +831,19 @@ Net readNetFromONNX(const String& onnxFile)
     return net;
 }
 
+Net readNetFromONNX(const char* buffer, size_t sizeBuffer)
+{
+    ONNXImporter onnxImporter(buffer, sizeBuffer);
+    Net net;
+    onnxImporter.populateNet(net);
+    return net;
+}
+
+Net readNetFromONNX(const std::vector<uchar>& buffer)
+{
+    return readNetFromONNX(reinterpret_cast<const char*>(buffer.data()), buffer.size());
+}
+
 Mat readTensorFromONNX(const String& path)
 {
     opencv_onnx::TensorProto tensor_proto = opencv_onnx::TensorProto();
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 41985c834d..5f34096534 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1423,6 +1423,48 @@ void TFImporter::populateNet(Net dstNet)
 
             connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
         }
+        else if (type == "StridedSlice")
+        {
+            CV_Assert(layer.input_size() == 4);
+            Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
+            Mat ends = getTensorContent(getConstBlob(layer, value_id, 2));
+            Mat strides = getTensorContent(getConstBlob(layer, value_id, 3));
+            CV_CheckTypeEQ(begins.type(), CV_32SC1, "");
+            CV_CheckTypeEQ(ends.type(), CV_32SC1, "");
+            CV_CheckTypeEQ(strides.type(), CV_32SC1, "");
+            const int num = (int)begins.total();
+            CV_Assert_N(num == (int)ends.total(), num == (int)strides.total());
+
+            // A set bit i in begin_mask/end_mask means "ignore the given bound
+            // and use the widest possible range along dimension i".
+            int begin_mask = getLayerAttr(layer, "begin_mask").i();
+            int end_mask = getLayerAttr(layer, "end_mask").i();
+            for (int i = 0; i < num; ++i)
+            {
+                if (begin_mask & (1 << i))
+                    begins.at<int>(i) = 0;
+                if (end_mask & (1 << i))
+                    ends.at<int>(i) = -1;
+                if (strides.at<int>(i) != 1)
+                    CV_Error(Error::StsNotImplemented,
+                             format("StridedSlice with stride %d", strides.at<int>(i)));
+            }
+            if (begins.total() == 4 && getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
+            {
+                // Swap NHWC parameters' order to NCHW.
+                std::swap(begins.at<int>(2), begins.at<int>(3));
+                std::swap(begins.at<int>(1), begins.at<int>(2));
+                std::swap(ends.at<int>(2), ends.at<int>(3));
+                std::swap(ends.at<int>(1), ends.at<int>(2));
+            }
+            layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total()));
+            layerParams.set("end", DictValue::arrayInt((int*)ends.data, ends.total()));
+
+            int id = dstNet.addLayer(name, "Slice", layerParams);
+            layer_id[name] = id;
+
+            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        }
         else if (type == "Mul")
         {
             bool haveConst = false;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index f1b0a81e8e..e66012c304 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -167,6 +167,13 @@ TEST_P(Test_ONNX_layers, BatchNormalization)
     testONNXModels("batch_norm");
 }
 
+TEST_P(Test_ONNX_layers, BatchNormalization3D)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
+        throw SkipTestException("");
+    testONNXModels("batch_norm_3d");
+}
+
 TEST_P(Test_ONNX_layers, Transpose)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE &&
@@ -238,6 +245,12 @@ TEST_P(Test_ONNX_layers, Reshape)
     testONNXModels("unsqueeze");
 }
 
+TEST_P(Test_ONNX_layers, Softmax)
+{
+    testONNXModels("softmax");
+    testONNXModels("log_softmax", npy, 0, 0, false, false);
+}
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
 
 class Test_ONNX_nets : public Test_ONNX_layers {};
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index e662da53e7..1a70e8f471 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -188,6 +188,13 @@ TEST_P(Test_TensorFlow_layers, batch_norm)
     runTensorFlowNet("mvn_batch_norm_1x1");
 }
 
+TEST_P(Test_TensorFlow_layers, batch_norm3D)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
+        throw SkipTestException("");
+    runTensorFlowNet("batch_norm3d");
+}
+
 TEST_P(Test_TensorFlow_layers, slim_batch_norm)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE)
@@ -656,6 +663,7 @@ TEST_P(Test_TensorFlow_layers, slice)
         (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
         throw SkipTestException("");
     runTensorFlowNet("slice_4d");
+    runTensorFlowNet("strided_slice");
 }
 
 TEST_P(Test_TensorFlow_layers, softmax)
diff --git a/modules/imgproc/misc/java/src/java/imgproc+Moments.java b/modules/imgproc/misc/java/src/java/imgproc+Moments.java
index 2eeebc9875..5c3d94c78c 100644
--- a/modules/imgproc/misc/java/src/java/imgproc+Moments.java
+++ b/modules/imgproc/misc/java/src/java/imgproc+Moments.java
@@ -1,7 +1,5 @@
 package org.opencv.imgproc;
 
-import java.lang.Math;
-
 //javadoc:Moments
 public class Moments {
 
diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp
index cb5c0fdf53..e488d26a8e 100644
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
@@ -56,63 +56,42 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab
     return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
 }
 
-#if CV_NEON
-template<typename _Tp> static inline void splineInterpolate(float32x4_t& v_x, const _Tp* tab, int n)
+#if CV_SIMD
+
+template<typename _Tp> static inline cv::v_float32 splineInterpolate(const cv::v_float32& x, const _Tp* tab, int n)
 {
-    int32x4_t v_ix = vcvtq_s32_f32(vminq_f32(vmaxq_f32(v_x, vdupq_n_f32(0)), vdupq_n_f32(n - 1)));
-    v_x = vsubq_f32(v_x, vcvtq_f32_s32(v_ix));
-    v_ix = vshlq_n_s32(v_ix, 2);
+    using namespace cv;
+    v_int32 ix = v_min(v_max(v_trunc(x), vx_setzero_s32()), vx_setall_s32(n-1));
+    cv::v_float32 xx = x - v_cvt_f32(ix);
+    ix = ix << 2;
 
-    int CV_DECL_ALIGNED(16) ix[4];
-    vst1q_s32(ix, v_ix);
+    v_float32 t[4];
+    // assume that v_float32::nlanes == v_int32::nlanes
+    if(v_float32::nlanes == 4)
+    {
+#if CV_SIMD_WIDTH == 16
+        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4];
+        v_store_aligned(idx, ix);
+        v_float32x4 tt[4];
+        tt[0] = v_load(tab + idx[0]);
+        tt[1] = v_load(tab + idx[1]);
+        tt[2] = v_load(tab + idx[2]);
+        tt[3] = v_load(tab + idx[3]);
+        v_transpose4x4(tt[0], tt[1], tt[2], tt[3],
+                        t[0],  t[1],  t[2],  t[3]);
+#endif
+    }
+    else
+    {
+        t[0] = v_lut(tab + 0, ix);
+        t[1] = v_lut(tab + 1, ix);
+        t[2] = v_lut(tab + 2, ix);
+        t[3] = v_lut(tab + 3, ix);
+    }
 
-    float32x4_t v_tab0 = vld1q_f32(tab + ix[0]);
-    float32x4_t v_tab1 = vld1q_f32(tab + ix[1]);
-    float32x4_t v_tab2 = vld1q_f32(tab + ix[2]);
-    float32x4_t v_tab3 = vld1q_f32(tab + ix[3]);
-
-    float32x4x2_t v01 = vtrnq_f32(v_tab0, v_tab1);
-    float32x4x2_t v23 = vtrnq_f32(v_tab2, v_tab3);
-
-    v_tab0 = vcombine_f32(vget_low_f32(v01.val[0]), vget_low_f32(v23.val[0]));
-    v_tab1 = vcombine_f32(vget_low_f32(v01.val[1]), vget_low_f32(v23.val[1]));
-    v_tab2 = vcombine_f32(vget_high_f32(v01.val[0]), vget_high_f32(v23.val[0]));
-    v_tab3 = vcombine_f32(vget_high_f32(v01.val[1]), vget_high_f32(v23.val[1]));
-
-    v_x = vmlaq_f32(v_tab0, vmlaq_f32(v_tab1, vmlaq_f32(v_tab2, v_tab3, v_x), v_x), v_x);
+    return v_fma(v_fma(v_fma(t[3], xx, t[2]), xx, t[1]), xx, t[0]);
 }
-#elif CV_SSE2
-template<typename _Tp> static inline void splineInterpolate(__m128& v_x, const _Tp* tab, int n)
-{
-    __m128i v_ix = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(v_x, _mm_setzero_ps()), _mm_set1_ps(float(n - 1))));
-    v_x = _mm_sub_ps(v_x, _mm_cvtepi32_ps(v_ix));
-    v_ix = _mm_slli_epi32(v_ix, 2);
 
-    int CV_DECL_ALIGNED(16) ix[4];
-    _mm_store_si128((__m128i *)ix, v_ix);
-
-    __m128 v_tab0 = _mm_loadu_ps(tab + ix[0]);
-    __m128 v_tab1 = _mm_loadu_ps(tab + ix[1]);
-    __m128 v_tab2 = _mm_loadu_ps(tab + ix[2]);
-    __m128 v_tab3 = _mm_loadu_ps(tab + ix[3]);
-
-    __m128 v_tmp0 = _mm_unpacklo_ps(v_tab0, v_tab1);
-    __m128 v_tmp1 = _mm_unpacklo_ps(v_tab2, v_tab3);
-    __m128 v_tmp2 = _mm_unpackhi_ps(v_tab0, v_tab1);
-    __m128 v_tmp3 = _mm_unpackhi_ps(v_tab2, v_tab3);
-
-    v_tab0 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0x44);
-    v_tab2 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0x44);
-    v_tab1 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0xee);
-    v_tab3 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0xee);
-
-    __m128 v_l = _mm_mul_ps(v_x, v_tab3);
-    v_l = _mm_add_ps(v_l, v_tab2);
-    v_l = _mm_mul_ps(v_l, v_x);
-    v_l = _mm_add_ps(v_l, v_tab1);
-    v_l = _mm_mul_ps(v_l, v_x);
-    v_x = _mm_add_ps(v_l, v_tab0);
-}
 #endif
 
 namespace cv
@@ -201,7 +180,6 @@ template<typename _Tp> struct RGB2XYZ_f
     float coeffs[9];
 };
 
-#if CV_NEON
 
 template <>
 struct RGB2XYZ_f<float>
@@ -218,175 +196,59 @@ struct RGB2XYZ_f<float>
             std::swap(coeffs[3], coeffs[5]);
             std::swap(coeffs[6], coeffs[8]);
         }
-
-        v_c0 = vdupq_n_f32(coeffs[0]);
-        v_c1 = vdupq_n_f32(coeffs[1]);
-        v_c2 = vdupq_n_f32(coeffs[2]);
-        v_c3 = vdupq_n_f32(coeffs[3]);
-        v_c4 = vdupq_n_f32(coeffs[4]);
-        v_c5 = vdupq_n_f32(coeffs[5]);
-        v_c6 = vdupq_n_f32(coeffs[6]);
-        v_c7 = vdupq_n_f32(coeffs[7]);
-        v_c8 = vdupq_n_f32(coeffs[8]);
     }
 
     void operator()(const float* src, float* dst, int n) const
     {
-        int scn = srccn, i = 0;
+        CV_INSTRUMENT_REGION();
+
+        int scn = srccn;
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-
-        n *= 3;
-
-        if (scn == 3)
-            for ( ; i <= n - 12; i += 12, src += 12)
+        int i = 0;
+#if CV_SIMD
+        const int vsize = v_float32::nlanes;
+        v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
+        v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
+        v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
+        for( ; i <= n-vsize;
+             i += vsize, src += scn*vsize, dst += vsize*3)
+        {
+            v_float32 b, g, r, a;
+            if(scn == 4)
             {
-                float32x4x3_t v_src = vld3q_f32(src), v_dst;
-                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
-                v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
-                v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
-                vst3q_f32(dst + i, v_dst);
+                v_load_deinterleave(src, b, g, r, a);
             }
-        else
-            for ( ; i <= n - 12; i += 12, src += 16)
+            else // scn == 3
             {
-                float32x4x4_t v_src = vld4q_f32(src);
-                float32x4x3_t v_dst;
-                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
-                v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
-                v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
-                vst3q_f32(dst + i, v_dst);
+                v_load_deinterleave(src, b, g, r);
             }
 
-        for ( ; i < n; i += 3, src += scn)
-        {
-            float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
-            float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
-            float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
-            dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
+            v_float32 x, y, z;
+            x = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
+            y = v_fma(b, vc3, v_fma(g, vc4, r*vc5));
+            z = v_fma(b, vc6, v_fma(g, vc7, r*vc8));
+
+            v_store_interleave(dst, x, y, z);
         }
-    }
-
-    int srccn;
-    float coeffs[9];
-    float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
-};
-
-#elif CV_SSE2
-
-template <>
-struct RGB2XYZ_f<float>
-{
-    typedef float channel_type;
-
-    RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
-    {
-        for(int i = 0; i < 9; i++)
-            coeffs[i] = _coeffs ? _coeffs[i] : (float)sRGB2XYZ_D65[i];
-        if(blueIdx == 0)
-        {
-            std::swap(coeffs[0], coeffs[2]);
-            std::swap(coeffs[3], coeffs[5]);
-            std::swap(coeffs[6], coeffs[8]);
-        }
-
-        v_c0 = _mm_set1_ps(coeffs[0]);
-        v_c1 = _mm_set1_ps(coeffs[1]);
-        v_c2 = _mm_set1_ps(coeffs[2]);
-        v_c3 = _mm_set1_ps(coeffs[3]);
-        v_c4 = _mm_set1_ps(coeffs[4]);
-        v_c5 = _mm_set1_ps(coeffs[5]);
-        v_c6 = _mm_set1_ps(coeffs[6]);
-        v_c7 = _mm_set1_ps(coeffs[7]);
-        v_c8 = _mm_set1_ps(coeffs[8]);
-
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
-    void process(__m128 v_r, __m128 v_g, __m128 v_b,
-                 __m128 & v_x, __m128 & v_y, __m128 & v_z) const
-    {
-        v_x = _mm_mul_ps(v_r, v_c0);
-        v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1));
-        v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2));
-
-        v_y = _mm_mul_ps(v_r, v_c3);
-        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4));
-        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5));
-
-        v_z = _mm_mul_ps(v_r, v_c6);
-        v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7));
-        v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8));
-    }
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int scn = srccn, i = 0;
-        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-
-        n *= 3;
-
-        if (haveSIMD)
-        {
-            for ( ; i <= n - 24; i += 24, src += 8 * scn)
-            {
-                __m128 v_r0 = _mm_loadu_ps(src);
-                __m128 v_r1 = _mm_loadu_ps(src + 4);
-                __m128 v_g0 = _mm_loadu_ps(src + 8);
-                __m128 v_g1 = _mm_loadu_ps(src + 12);
-                __m128 v_b0 = _mm_loadu_ps(src + 16);
-                __m128 v_b1 = _mm_loadu_ps(src + 20);
-
-                if (scn == 4)
-                {
-                    __m128 v_a0 = _mm_loadu_ps(src + 24);
-                    __m128 v_a1 = _mm_loadu_ps(src + 28);
-
-                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
-                                        v_b0, v_b1, v_a0, v_a1);
-                }
-                else
-                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
-
-                __m128 v_x0, v_y0, v_z0;
-                process(v_r0, v_g0, v_b0,
-                        v_x0, v_y0, v_z0);
-
-                __m128 v_x1, v_y1, v_z1;
-                process(v_r1, v_g1, v_b1,
-                        v_x1, v_y1, v_z1);
-
-                _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
-
-                _mm_storeu_ps(dst + i, v_x0);
-                _mm_storeu_ps(dst + i + 4, v_x1);
-                _mm_storeu_ps(dst + i + 8, v_y0);
-                _mm_storeu_ps(dst + i + 12, v_y1);
-                _mm_storeu_ps(dst + i + 16, v_z0);
-                _mm_storeu_ps(dst + i + 20, v_z1);
-            }
-        }
-
-        for ( ; i < n; i += 3, src += scn)
-        {
-            float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
-            float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
-            float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
-            dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
-        }
-    }
-
-    int srccn;
-    float coeffs[9];
-    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
-    bool haveSIMD;
-};
-
-
 #endif
+        for( ; i < n; i++, src += scn, dst += 3)
+        {
+            float b = src[0], g = src[1], r = src[2];
+
+            float X = saturate_cast<float>(b*C0 + g*C1 + r*C2);
+            float Y = saturate_cast<float>(b*C3 + g*C4 + r*C5);
+            float Z = saturate_cast<float>(b*C6 + g*C7 + r*C8);
+
+            dst[0] = X; dst[1] = Y; dst[2] = Z;
+        }
+    }
+
+    int srccn;
+    float coeffs[9];
+};
+
 
 template<typename _Tp> struct RGB2XYZ_i
 {
@@ -424,235 +286,244 @@ template<typename _Tp> struct RGB2XYZ_i
     int coeffs[9];
 };
 
-#if CV_NEON
 
 template <>
 struct RGB2XYZ_i<uchar>
 {
     typedef uchar channel_type;
+    static const int shift = xyz_shift;
 
     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
     {
         for( int i = 0; i < 9; i++ )
-            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : sRGB2XYZ_D65_i[i];
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << shift)) : sRGB2XYZ_D65_i[i];
         if(blueIdx == 0)
         {
             std::swap(coeffs[0], coeffs[2]);
             std::swap(coeffs[3], coeffs[5]);
             std::swap(coeffs[6], coeffs[8]);
         }
-
-        v_c0 = vdup_n_u16(coeffs[0]);
-        v_c1 = vdup_n_u16(coeffs[1]);
-        v_c2 = vdup_n_u16(coeffs[2]);
-        v_c3 = vdup_n_u16(coeffs[3]);
-        v_c4 = vdup_n_u16(coeffs[4]);
-        v_c5 = vdup_n_u16(coeffs[5]);
-        v_c6 = vdup_n_u16(coeffs[6]);
-        v_c7 = vdup_n_u16(coeffs[7]);
-        v_c8 = vdup_n_u16(coeffs[8]);
-        v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
     }
     void operator()(const uchar * src, uchar * dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int scn = srccn, i = 0;
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
 
-        for ( ; i <= n - 24; i += 24, src += scn * 8)
+#if CV_SIMD
+        const int vsize = v_uint8::nlanes;
+        int descaleShift = 1 << (shift-1);
+        v_int16 vdescale = vx_setall_s16((short)descaleShift);
+        v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1;
+        v_int16 dummy;
+        v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), cxbg, dummy);
+        v_zip(vx_setall_s16((short)C2), vx_setall_s16(        1), cxr1, dummy);
+        v_zip(vx_setall_s16((short)C3), vx_setall_s16((short)C4), cybg, dummy);
+        v_zip(vx_setall_s16((short)C5), vx_setall_s16(        1), cyr1, dummy);
+        v_zip(vx_setall_s16((short)C6), vx_setall_s16((short)C7), czbg, dummy);
+        v_zip(vx_setall_s16((short)C8), vx_setall_s16(        1), czr1, dummy);
+
+        for( ; i <= n-vsize;
+             i += vsize, src += scn*vsize, dst += 3*vsize)
         {
-            uint8x8x3_t v_dst;
-            uint16x8x3_t v_src16;
-
-            if (scn == 3)
+            v_uint8 b, g, r, a;
+            if(scn == 4)
             {
-                uint8x8x3_t v_src = vld3_u8(src);
-                v_src16.val[0] = vmovl_u8(v_src.val[0]);
-                v_src16.val[1] = vmovl_u8(v_src.val[1]);
-                v_src16.val[2] = vmovl_u8(v_src.val[2]);
+                v_load_deinterleave(src, b, g, r, a);
             }
-            else
+            else // scn == 3
             {
-                uint8x8x4_t v_src = vld4_u8(src);
-                v_src16.val[0] = vmovl_u8(v_src.val[0]);
-                v_src16.val[1] = vmovl_u8(v_src.val[1]);
-                v_src16.val[2] = vmovl_u8(v_src.val[2]);
+                v_load_deinterleave(src, b, g, r);
             }
 
-            uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
-                       v_s1 = vget_low_u16(v_src16.val[1]),
-                       v_s2 = vget_low_u16(v_src16.val[2]);
+            v_uint16 b0, b1, g0, g1, r0, r1;
+            v_expand(b, b0, b1);
+            v_expand(g, g0, g1);
+            v_expand(r, r0, r1);
 
-            uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
-            v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
-            v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
+            v_int16 sb0, sb1, sg0, sg1, sr0, sr1;
+            sr0 = v_reinterpret_as_s16(r0); sr1 = v_reinterpret_as_s16(r1);
+            sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1);
+            sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1);
 
-            v_s0 = vget_high_u16(v_src16.val[0]),
-            v_s1 = vget_high_u16(v_src16.val[1]),
-            v_s2 = vget_high_u16(v_src16.val[2]);
+            v_int16 bg[4], rd[4];
+            v_zip(sb0, sg0, bg[0], bg[1]);
+            v_zip(sb1, sg1, bg[2], bg[3]);
+            v_zip(sr0, vdescale, rd[0], rd[1]);
+            v_zip(sr1, vdescale, rd[2], rd[3]);
 
-            uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
-            v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
-            v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
+            v_uint32 vx[4], vy[4], vz[4];
+            for(int j = 0; j < 4; j++)
+            {
+                vx[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cxbg) + v_dotprod(rd[j], cxr1)) >> shift;
+                vy[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cybg) + v_dotprod(rd[j], cyr1)) >> shift;
+                vz[j] = v_reinterpret_as_u32(v_dotprod(bg[j], czbg) + v_dotprod(rd[j], czr1)) >> shift;
+            }
 
-            v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
-            v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
-            v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
+            v_uint16 x0, x1, y0, y1, z0, z1;
+            x0 = v_pack(vx[0], vx[1]);
+            x1 = v_pack(vx[2], vx[3]);
+            y0 = v_pack(vy[0], vy[1]);
+            y1 = v_pack(vy[2], vy[3]);
+            z0 = v_pack(vz[0], vz[1]);
+            z1 = v_pack(vz[2], vz[3]);
 
-            vst3_u8(dst + i, v_dst);
+            v_uint8 x, y, z;
+            x = v_pack(x0, x1);
+            y = v_pack(y0, y1);
+            z = v_pack(z0, z1);
+
+            v_store_interleave(dst, x, y, z);
         }
+#endif
 
-        for ( ; i < n; i += 3, src += scn)
+        for ( ; i < n; i++, src += scn, dst += 3)
         {
-            int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
-            int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
-            int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
-            dst[i] = saturate_cast<uchar>(X);
-            dst[i+1] = saturate_cast<uchar>(Y);
-            dst[i+2] = saturate_cast<uchar>(Z);
+            uchar b = src[0], g = src[1], r = src[2];
+
+            int X = CV_DESCALE(b*C0 + g*C1 + r*C2, shift);
+            int Y = CV_DESCALE(b*C3 + g*C4 + r*C5, shift);
+            int Z = CV_DESCALE(b*C6 + g*C7 + r*C8, shift);
+            dst[0] = saturate_cast<uchar>(X);
+            dst[1] = saturate_cast<uchar>(Y);
+            dst[2] = saturate_cast<uchar>(Z);
         }
     }
 
     int srccn, coeffs[9];
-    uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
-    uint32x4_t v_delta;
 };
 
+
 template <>
 struct RGB2XYZ_i<ushort>
 {
     typedef ushort channel_type;
+    static const int shift = xyz_shift;
+    static const int fix_shift = (int)(sizeof(short)*8 - shift);
 
     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
     {
         for( int i = 0; i < 9; i++ )
-            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : sRGB2XYZ_D65_i[i];
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << shift)) : sRGB2XYZ_D65_i[i];
         if(blueIdx == 0)
         {
             std::swap(coeffs[0], coeffs[2]);
             std::swap(coeffs[3], coeffs[5]);
             std::swap(coeffs[6], coeffs[8]);
         }
-
-        v_c0 = vdup_n_u16(coeffs[0]);
-        v_c1 = vdup_n_u16(coeffs[1]);
-        v_c2 = vdup_n_u16(coeffs[2]);
-        v_c3 = vdup_n_u16(coeffs[3]);
-        v_c4 = vdup_n_u16(coeffs[4]);
-        v_c5 = vdup_n_u16(coeffs[5]);
-        v_c6 = vdup_n_u16(coeffs[6]);
-        v_c7 = vdup_n_u16(coeffs[7]);
-        v_c8 = vdup_n_u16(coeffs[8]);
-        v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
     }
 
     void operator()(const ushort * src, ushort * dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int scn = srccn, i = 0;
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
+#if CV_SIMD
+        const int vsize = v_uint16::nlanes;
+        const int descaleShift = 1 << (shift-1);
+        v_int16 vdescale = vx_setall_s16(descaleShift);
+        v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
+        v_int16 vc3 = vx_setall_s16((short)C3), vc4 = vx_setall_s16((short)C4), vc5 = vx_setall_s16((short)C5);
+        v_int16 vc6 = vx_setall_s16((short)C6), vc7 = vx_setall_s16((short)C7), vc8 = vx_setall_s16((short)C8);
+        v_int16 zero = vx_setzero_s16(), one = vx_setall_s16(1);
+        v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1;
+        v_int16 dummy;
+        v_zip(vc0, vc1, cxbg, dummy);
+        v_zip(vc2, one, cxr1, dummy);
+        v_zip(vc3, vc4, cybg, dummy);
+        v_zip(vc5, one, cyr1, dummy);
+        v_zip(vc6, vc7, czbg, dummy);
+        v_zip(vc8, one, czr1, dummy);
 
-        for ( ; i <= n - 24; i += 24, src += scn * 8)
+        for (; i <= n-vsize;
+             i += vsize, src += scn*vsize, dst += 3*vsize)
         {
-            uint16x8x3_t v_src, v_dst;
-
-            if (scn == 3)
-                v_src = vld3q_u16(src);
-            else
+            v_uint16 b, g, r, a;
+            if(scn == 4)
             {
-                uint16x8x4_t v_src4 = vld4q_u16(src);
-                v_src.val[0] = v_src4.val[0];
-                v_src.val[1] = v_src4.val[1];
-                v_src.val[2] = v_src4.val[2];
+                v_load_deinterleave(src, b, g, r, a);
+            }
+            else // scn == 3
+            {
+                v_load_deinterleave(src, b, g, r);
             }
 
-            uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
-                       v_s1 = vget_low_u16(v_src.val[1]),
-                       v_s2 = vget_low_u16(v_src.val[2]);
+            v_int16 sb, sg, sr;
+            sr = v_reinterpret_as_s16(r);
+            sg = v_reinterpret_as_s16(g);
+            sb = v_reinterpret_as_s16(b);
 
-            uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
-            v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
-            v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
+            // fixing 16bit signed multiplication
+            v_int16 xmr, xmg, xmb;
+            v_int16 ymr, ymg, ymb;
+            v_int16 zmr, zmg, zmb;
 
-            v_s0 = vget_high_u16(v_src.val[0]),
-            v_s1 = vget_high_u16(v_src.val[1]),
-            v_s2 = vget_high_u16(v_src.val[2]);
+            v_int16 mr = sr < zero, mg = sg < zero, mb = sb < zero;
 
-            uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
-            v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
-            v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
+            xmb = mb & vc0;
+            xmg = mg & vc1;
+            xmr = mr & vc2;
+            ymb = mb & vc3;
+            ymg = mg & vc4;
+            ymr = mr & vc5;
+            zmb = mb & vc6;
+            zmg = mg & vc7;
+            zmr = mr & vc8;
 
-            v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
-            v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
-            v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
+            v_int32 xfix0, xfix1, yfix0, yfix1, zfix0, zfix1;
+            v_expand(xmr + xmg + xmb, xfix0, xfix1);
+            v_expand(ymr + ymg + ymb, yfix0, yfix1);
+            v_expand(zmr + zmg + zmb, zfix0, zfix1);
 
-            vst3q_u16(dst + i, v_dst);
+            xfix0 = xfix0 << 16;
+            xfix1 = xfix1 << 16;
+            yfix0 = yfix0 << 16;
+            yfix1 = yfix1 << 16;
+            zfix0 = zfix0 << 16;
+            zfix1 = zfix1 << 16;
+
+            v_int16 bg0, bg1, rd0, rd1;
+            v_zip(sb, sg, bg0, bg1);
+            v_zip(sr, vdescale, rd0, rd1);
+
+            v_uint32 x0, x1, y0, y1, z0, z1;
+
+            x0 = v_reinterpret_as_u32(v_dotprod(bg0, cxbg) + v_dotprod(rd0, cxr1) + xfix0) >> shift;
+            x1 = v_reinterpret_as_u32(v_dotprod(bg1, cxbg) + v_dotprod(rd1, cxr1) + xfix1) >> shift;
+            y0 = v_reinterpret_as_u32(v_dotprod(bg0, cybg) + v_dotprod(rd0, cyr1) + yfix0) >> shift;
+            y1 = v_reinterpret_as_u32(v_dotprod(bg1, cybg) + v_dotprod(rd1, cyr1) + yfix1) >> shift;
+            z0 = v_reinterpret_as_u32(v_dotprod(bg0, czbg) + v_dotprod(rd0, czr1) + zfix0) >> shift;
+            z1 = v_reinterpret_as_u32(v_dotprod(bg1, czbg) + v_dotprod(rd1, czr1) + zfix1) >> shift;
+
+            v_uint16 x, y, z;
+            x = v_pack(x0, x1);
+            y = v_pack(y0, y1);
+            z = v_pack(z0, z1);
+
+            v_store_interleave(dst, x, y, z);
         }
-
-        for ( ; i <= n - 12; i += 12, src += scn * 4)
+#endif
+        for ( ; i < n; i++, src += scn, dst += 3)
         {
-            uint16x4x3_t v_dst;
-            uint16x4_t v_s0, v_s1, v_s2;
-
-            if (scn == 3)
-            {
-                uint16x4x3_t v_src = vld3_u16(src);
-                v_s0 = v_src.val[0];
-                v_s1 = v_src.val[1];
-                v_s2 = v_src.val[2];
-            }
-            else
-            {
-                uint16x4x4_t v_src = vld4_u16(src);
-                v_s0 = v_src.val[0];
-                v_s1 = v_src.val[1];
-                v_s2 = v_src.val[2];
-            }
-
-            uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-
-            v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
-            v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
-            v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
-
-            vst3_u16(dst + i, v_dst);
-        }
-
-        for ( ; i < n; i += 3, src += scn)
-        {
-            int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
-            int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
-            int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
-            dst[i] = saturate_cast<ushort>(X);
-            dst[i+1] = saturate_cast<ushort>(Y);
-            dst[i+2] = saturate_cast<ushort>(Z);
+            ushort b = src[0], g = src[1], r = src[2];
+            int X = CV_DESCALE(b*C0 + g*C1 + r*C2, shift);
+            int Y = CV_DESCALE(b*C3 + g*C4 + r*C5, shift);
+            int Z = CV_DESCALE(b*C6 + g*C7 + r*C8, shift);
+            dst[0] = saturate_cast<ushort>(X);
+            dst[1] = saturate_cast<ushort>(Y);
+            dst[2] = saturate_cast<ushort>(Z);
         }
     }
 
     int srccn, coeffs[9];
-    uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
-    uint32x4_t v_delta;
 };
 
-#endif
 
 template<typename _Tp> struct XYZ2RGB_f
 {
@@ -693,7 +564,6 @@ template<typename _Tp> struct XYZ2RGB_f
     float coeffs[9];
 };
 
-#if CV_SSE2
 
 template <>
 struct XYZ2RGB_f<float>
@@ -711,113 +581,61 @@ struct XYZ2RGB_f<float>
             std::swap(coeffs[1], coeffs[7]);
             std::swap(coeffs[2], coeffs[8]);
         }
-
-        v_c0 = _mm_set1_ps(coeffs[0]);
-        v_c1 = _mm_set1_ps(coeffs[1]);
-        v_c2 = _mm_set1_ps(coeffs[2]);
-        v_c3 = _mm_set1_ps(coeffs[3]);
-        v_c4 = _mm_set1_ps(coeffs[4]);
-        v_c5 = _mm_set1_ps(coeffs[5]);
-        v_c6 = _mm_set1_ps(coeffs[6]);
-        v_c7 = _mm_set1_ps(coeffs[7]);
-        v_c8 = _mm_set1_ps(coeffs[8]);
-
-        v_alpha = _mm_set1_ps(ColorChannel<float>::max());
-
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
-    void process(__m128 v_x, __m128 v_y, __m128 v_z,
-                 __m128 & v_r, __m128 & v_g, __m128 & v_b) const
-    {
-        v_b = _mm_mul_ps(v_x, v_c0);
-        v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1));
-        v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2));
-
-        v_g = _mm_mul_ps(v_x, v_c3);
-        v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4));
-        v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5));
-
-        v_r = _mm_mul_ps(v_x, v_c6);
-        v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7));
-        v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8));
     }
 
     void operator()(const float* src, float* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int dcn = dstcn;
         float alpha = ColorChannel<float>::max();
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
         int i = 0;
-
-        if (haveSIMD)
+#if CV_SIMD
+        const int vsize = v_float32::nlanes;
+        v_float32 valpha = vx_setall_f32(alpha);
+        v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
+        v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
+        v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
+        for( ; i <= n-vsize;
+             i += vsize, src += 3*vsize, dst += dcn*vsize)
         {
-            for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
+            v_float32 x, y, z;
+            v_load_deinterleave(src, x, y, z);
+
+            v_float32 b, g, r;
+            b = v_fma(x, vc0, v_fma(y, vc1, z*vc2));
+            g = v_fma(x, vc3, v_fma(y, vc4, z*vc5));
+            r = v_fma(x, vc6, v_fma(y, vc7, z*vc8));
+
+            if(dcn == 4)
             {
-                __m128 v_x0 = _mm_loadu_ps(src + i);
-                __m128 v_x1 = _mm_loadu_ps(src + i + 4);
-                __m128 v_y0 = _mm_loadu_ps(src + i + 8);
-                __m128 v_y1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_z0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_z1 = _mm_loadu_ps(src + i + 20);
-
-                _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
-
-                __m128 v_r0, v_g0, v_b0;
-                process(v_x0, v_y0, v_z0,
-                        v_r0, v_g0, v_b0);
-
-                __m128 v_r1, v_g1, v_b1;
-                process(v_x1, v_y1, v_z1,
-                        v_r1, v_g1, v_b1);
-
-                __m128 v_a0 = v_alpha, v_a1 = v_alpha;
-
-                if (dcn == 4)
-                    _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1,
-                                      v_r0, v_r1, v_a0, v_a1);
-                else
-                    _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
-
-                _mm_storeu_ps(dst, v_b0);
-                _mm_storeu_ps(dst + 4, v_b1);
-                _mm_storeu_ps(dst + 8, v_g0);
-                _mm_storeu_ps(dst + 12, v_g1);
-                _mm_storeu_ps(dst + 16, v_r0);
-                _mm_storeu_ps(dst + 20, v_r1);
-
-                if (dcn == 4)
-                {
-                    _mm_storeu_ps(dst + 24, v_a0);
-                    _mm_storeu_ps(dst + 28, v_a1);
-                }
+                v_store_interleave(dst, b, g, r, valpha);
+            }
+            else // dcn == 3
+            {
+                v_store_interleave(dst, b, g, r);
             }
-
         }
-
-        for( ; i < n; i += 3, dst += dcn)
+#endif
+        for( ; i < n; i++, src += 3, dst += dcn)
         {
-            float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2;
-            float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5;
-            float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8;
+            float x = src[0], y = src[1], z = src[2];
+            float B = saturate_cast<float>(x*C0 + y*C1 + z*C2);
+            float G = saturate_cast<float>(x*C3 + y*C4 + z*C5);
+            float R = saturate_cast<float>(x*C6 + y*C7 + z*C8);
             dst[0] = B; dst[1] = G; dst[2] = R;
             if( dcn == 4 )
                 dst[3] = alpha;
         }
     }
+
     int dstcn, blueIdx;
     float coeffs[9];
-
-    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
-    __m128 v_alpha;
-    bool haveSIMD;
 };
 
-#endif // CV_SSE2
-
 
 template<typename _Tp> struct XYZ2RGB_i
 {
@@ -859,18 +677,18 @@ template<typename _Tp> struct XYZ2RGB_i
     int coeffs[9];
 };
 
-#if CV_NEON
 
 template <>
 struct XYZ2RGB_i<uchar>
 {
     typedef uchar channel_type;
+    static const int shift = xyz_shift;
 
     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
     : dstcn(_dstcn), blueIdx(_blueIdx)
     {
         for(int i = 0; i < 9; i++)
-            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : XYZ2sRGB_D65_i[i];
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << shift)) : XYZ2sRGB_D65_i[i];
 
         if(blueIdx == 0)
         {
@@ -878,87 +696,90 @@ struct XYZ2RGB_i<uchar>
             std::swap(coeffs[1], coeffs[7]);
             std::swap(coeffs[2], coeffs[8]);
         }
-
-        v_c0 = vdup_n_s16(coeffs[0]);
-        v_c1 = vdup_n_s16(coeffs[1]);
-        v_c2 = vdup_n_s16(coeffs[2]);
-        v_c3 = vdup_n_s16(coeffs[3]);
-        v_c4 = vdup_n_s16(coeffs[4]);
-        v_c5 = vdup_n_s16(coeffs[5]);
-        v_c6 = vdup_n_s16(coeffs[6]);
-        v_c7 = vdup_n_s16(coeffs[7]);
-        v_c8 = vdup_n_s16(coeffs[8]);
-        v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
-        v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
     }
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int dcn = dstcn, i = 0;
         uchar alpha = ColorChannel<uchar>::max();
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
+#if CV_SIMD
+        const int vsize = v_uint8::nlanes;
+        const int descaleShift = 1 << (shift - 1);
+        v_uint8 valpha = vx_setall_u8(alpha);
+        v_int16 vdescale = vx_setall_s16(descaleShift);
+        v_int16 cbxy, cbz1, cgxy, cgz1, crxy, crz1;
+        v_int16 dummy;
+        v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), cbxy, dummy);
+        v_zip(vx_setall_s16((short)C2), vx_setall_s16(        1), cbz1, dummy);
+        v_zip(vx_setall_s16((short)C3), vx_setall_s16((short)C4), cgxy, dummy);
+        v_zip(vx_setall_s16((short)C5), vx_setall_s16(        1), cgz1, dummy);
+        v_zip(vx_setall_s16((short)C6), vx_setall_s16((short)C7), crxy, dummy);
+        v_zip(vx_setall_s16((short)C8), vx_setall_s16(        1), crz1, dummy);
 
-        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+        for ( ; i <= n-vsize;
+              i += vsize, src += 3*vsize, dst += dcn*vsize)
         {
-            uint8x8x3_t v_src = vld3_u8(src + i);
-            int16x8x3_t v_src16;
-            v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
-            v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
-            v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+            v_uint8 x, y, z;
+            v_load_deinterleave(src, x, y, z);
 
-            int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
-                       v_s1 = vget_low_s16(v_src16.val[1]),
-                       v_s2 = vget_low_s16(v_src16.val[2]);
+            v_uint16 ux0, ux1, uy0, uy1, uz0, uz1;
+            v_expand(x, ux0, ux1);
+            v_expand(y, uy0, uy1);
+            v_expand(z, uz0, uz1);
+            v_int16 x0, x1, y0, y1, z0, z1;
+            x0 = v_reinterpret_as_s16(ux0);
+            x1 = v_reinterpret_as_s16(ux1);
+            y0 = v_reinterpret_as_s16(uy0);
+            y1 = v_reinterpret_as_s16(uy1);
+            z0 = v_reinterpret_as_s16(uz0);
+            z1 = v_reinterpret_as_s16(uz1);
 
-            int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
-            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
-            v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
+            v_int32 b[4], g[4], r[4];
 
-            v_s0 = vget_high_s16(v_src16.val[0]),
-            v_s1 = vget_high_s16(v_src16.val[1]),
-            v_s2 = vget_high_s16(v_src16.val[2]);
+            v_int16 xy[4], zd[4];
+            v_zip(x0, y0, xy[0], xy[1]);
+            v_zip(x1, y1, xy[2], xy[3]);
+            v_zip(z0, vdescale, zd[0], zd[1]);
+            v_zip(z1, vdescale, zd[2], zd[3]);
 
-            int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
-            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
-            v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
-
-            uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
-            uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
-            uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
-
-            if (dcn == 3)
+            for(int j = 0; j < 4; j++)
             {
-                uint8x8x3_t v_dst;
-                v_dst.val[0] = v_b;
-                v_dst.val[1] = v_g;
-                v_dst.val[2] = v_r;
-                vst3_u8(dst, v_dst);
+                b[j] = (v_dotprod(xy[j], cbxy) + v_dotprod(zd[j], cbz1)) >> shift;
+                g[j] = (v_dotprod(xy[j], cgxy) + v_dotprod(zd[j], cgz1)) >> shift;
+                r[j] = (v_dotprod(xy[j], crxy) + v_dotprod(zd[j], crz1)) >> shift;
             }
-            else
+
+            v_uint16 b0, b1, g0, g1, r0, r1;
+            b0 = v_pack_u(b[0], b[1]); b1 = v_pack_u(b[2], b[3]);
+            g0 = v_pack_u(g[0], g[1]); g1 = v_pack_u(g[2], g[3]);
+            r0 = v_pack_u(r[0], r[1]); r1 = v_pack_u(r[2], r[3]);
+
+            v_uint8 bb, gg, rr;
+            bb = v_pack(b0, b1);
+            gg = v_pack(g0, g1);
+            rr = v_pack(r0, r1);
+
+            if(dcn == 4)
             {
-                uint8x8x4_t v_dst;
-                v_dst.val[0] = v_b;
-                v_dst.val[1] = v_g;
-                v_dst.val[2] = v_r;
-                v_dst.val[3] = v_alpha;
-                vst4_u8(dst, v_dst);
+                v_store_interleave(dst, bb, gg, rr, valpha);
+            }
+            else // dcn == 3
+            {
+                v_store_interleave(dst, bb, gg, rr);
             }
         }
-
-        for ( ; i < n; i += 3, dst += dcn)
+#endif
+        for ( ; i < n; i++, src += 3, dst += dcn)
         {
-            int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
-            int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
-            int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
+            uchar x = src[0], y = src[1], z = src[2];
+            int B = CV_DESCALE(x*C0 + y*C1 + z*C2, shift);
+            int G = CV_DESCALE(x*C3 + y*C4 + z*C5, shift);
+            int R = CV_DESCALE(x*C6 + y*C7 + z*C8, shift);
             dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
             dst[2] = saturate_cast<uchar>(R);
             if( dcn == 4 )
@@ -967,22 +788,20 @@ struct XYZ2RGB_i<uchar>
     }
     int dstcn, blueIdx;
     int coeffs[9];
-
-    int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
-    uint8x8_t v_alpha;
-    int32x4_t v_delta;
 };
 
+
 template <>
 struct XYZ2RGB_i<ushort>
 {
     typedef ushort channel_type;
+    static const int shift = xyz_shift;
 
     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
     : dstcn(_dstcn), blueIdx(_blueIdx)
     {
         for(int i = 0; i < 9; i++)
-            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : XYZ2sRGB_D65_i[i];
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << shift)) : XYZ2sRGB_D65_i[i];
 
         if(blueIdx == 0)
         {
@@ -990,120 +809,104 @@ struct XYZ2RGB_i<ushort>
             std::swap(coeffs[1], coeffs[7]);
             std::swap(coeffs[2], coeffs[8]);
         }
-
-        v_c0 = vdupq_n_s32(coeffs[0]);
-        v_c1 = vdupq_n_s32(coeffs[1]);
-        v_c2 = vdupq_n_s32(coeffs[2]);
-        v_c3 = vdupq_n_s32(coeffs[3]);
-        v_c4 = vdupq_n_s32(coeffs[4]);
-        v_c5 = vdupq_n_s32(coeffs[5]);
-        v_c6 = vdupq_n_s32(coeffs[6]);
-        v_c7 = vdupq_n_s32(coeffs[7]);
-        v_c8 = vdupq_n_s32(coeffs[8]);
-        v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
-        v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
-        v_alpha2 = vget_low_u16(v_alpha);
     }
 
     void operator()(const ushort* src, ushort* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int dcn = dstcn, i = 0;
         ushort alpha = ColorChannel<ushort>::max();
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
+#if CV_SIMD
+        const int vsize = v_uint16::nlanes;
+        const int descaleShift = 1 << (shift-1);
+        v_uint16 valpha = vx_setall_u16(alpha);
+        v_int16 vdescale = vx_setall_s16(descaleShift);
+        v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
+        v_int16 vc3 = vx_setall_s16((short)C3), vc4 = vx_setall_s16((short)C4), vc5 = vx_setall_s16((short)C5);
+        v_int16 vc6 = vx_setall_s16((short)C6), vc7 = vx_setall_s16((short)C7), vc8 = vx_setall_s16((short)C8);
+        v_int16 zero = vx_setzero_s16(), one = vx_setall_s16(1);
+        v_int16 cbxy, cbz1, cgxy, cgz1, crxy, crz1;
+        v_int16 dummy;
+        v_zip(vc0, vc1, cbxy, dummy);
+        v_zip(vc2, one, cbz1, dummy);
+        v_zip(vc3, vc4, cgxy, dummy);
+        v_zip(vc5, one, cgz1, dummy);
+        v_zip(vc6, vc7, crxy, dummy);
+        v_zip(vc8, one, crz1, dummy);
 
-        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+        for( ; i <= n-vsize;
+             i += vsize, src += 3*vsize, dst += dcn*vsize)
         {
-            uint16x8x3_t v_src = vld3q_u16(src + i);
-            int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
-                      v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
-                      v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+            v_uint16 x, y, z;
+            v_load_deinterleave(src, x, y, z);
 
-            int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
-            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
-            v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
+            v_int16 sx, sy, sz;
+            sx = v_reinterpret_as_s16(x);
+            sy = v_reinterpret_as_s16(y);
+            sz = v_reinterpret_as_s16(z);
 
-            v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
-            v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
-            v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+            // fixing 16bit signed multiplication: ushort lanes >= 0x8000 read as negative int16, so (coeff << 16) is added back for them
+            v_int16 mx = sx < zero, my = sy < zero, mz = sz < zero;
 
-            int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
-            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
-            v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
+            v_int16 bmx, bmy, bmz;
+            v_int16 gmx, gmy, gmz;
+            v_int16 rmx, rmy, rmz;
 
-            uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
-            uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
-            uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
+            bmx = mx & vc0;
+            bmy = my & vc1;
+            bmz = mz & vc2;
+            gmx = mx & vc3;
+            gmy = my & vc4;
+            gmz = mz & vc5;
+            rmx = mx & vc6;
+            rmy = my & vc7;
+            rmz = mz & vc8;
 
-            if (dcn == 3)
+            v_int32 bfix0, bfix1, gfix0, gfix1, rfix0, rfix1;
+            v_expand(bmx + bmy + bmz, bfix0, bfix1);
+            v_expand(gmx + gmy + gmz, gfix0, gfix1);
+            v_expand(rmx + rmy + rmz, rfix0, rfix1);
+
+            bfix0 = bfix0 << 16; bfix1 = bfix1 << 16;
+            gfix0 = gfix0 << 16; gfix1 = gfix1 << 16;
+            rfix0 = rfix0 << 16; rfix1 = rfix1 << 16;
+
+            v_int16 xy0, xy1, zd0, zd1;
+            v_zip(sx, sy, xy0, xy1);
+            v_zip(sz, vdescale, zd0, zd1);
+
+            v_int32 b0, b1, g0, g1, r0, r1;
+
+            b0 = (v_dotprod(xy0, cbxy) + v_dotprod(zd0, cbz1) + bfix0) >> shift;
+            b1 = (v_dotprod(xy1, cbxy) + v_dotprod(zd1, cbz1) + bfix1) >> shift;
+            g0 = (v_dotprod(xy0, cgxy) + v_dotprod(zd0, cgz1) + gfix0) >> shift;
+            g1 = (v_dotprod(xy1, cgxy) + v_dotprod(zd1, cgz1) + gfix1) >> shift;
+            r0 = (v_dotprod(xy0, crxy) + v_dotprod(zd0, crz1) + rfix0) >> shift;
+            r1 = (v_dotprod(xy1, crxy) + v_dotprod(zd1, crz1) + rfix1) >> shift;
+
+            v_uint16 b, g, r;
+            b = v_pack_u(b0, b1); g = v_pack_u(g0, g1); r = v_pack_u(r0, r1);
+
+            if(dcn == 4)
             {
-                uint16x8x3_t v_dst;
-                v_dst.val[0] = v_b;
-                v_dst.val[1] = v_g;
-                v_dst.val[2] = v_r;
-                vst3q_u16(dst, v_dst);
+                v_store_interleave(dst, b, g, r, valpha);
             }
-            else
+            else // dcn == 3
             {
-                uint16x8x4_t v_dst;
-                v_dst.val[0] = v_b;
-                v_dst.val[1] = v_g;
-                v_dst.val[2] = v_r;
-                v_dst.val[3] = v_alpha;
-                vst4q_u16(dst, v_dst);
+                v_store_interleave(dst, b, g, r);
             }
         }
-
-        for ( ; i <= n - 12; i += 12, dst += dcn * 4)
+#endif
+        for ( ; i < n; i++, src += 3, dst += dcn)
         {
-            uint16x4x3_t v_src = vld3_u16(src + i);
-            int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
-                      v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
-                      v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
-
-            int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
-            int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
-            int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
-            v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
-            v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
-            v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
-
-            uint16x4_t v_b = vqmovun_s32(v_X);
-            uint16x4_t v_g = vqmovun_s32(v_Y);
-            uint16x4_t v_r = vqmovun_s32(v_Z);
-
-            if (dcn == 3)
-            {
-                uint16x4x3_t v_dst;
-                v_dst.val[0] = v_b;
-                v_dst.val[1] = v_g;
-                v_dst.val[2] = v_r;
-                vst3_u16(dst, v_dst);
-            }
-            else
-            {
-                uint16x4x4_t v_dst;
-                v_dst.val[0] = v_b;
-                v_dst.val[1] = v_g;
-                v_dst.val[2] = v_r;
-                v_dst.val[3] = v_alpha2;
-                vst4_u16(dst, v_dst);
-            }
-        }
-
-        for ( ; i < n; i += 3, dst += dcn)
-        {
-            int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
-            int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
-            int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
+            ushort x = src[0], y = src[1], z = src[2];
+            int B = CV_DESCALE(x*C0 + y*C1 + z*C2, shift);
+            int G = CV_DESCALE(x*C3 + y*C4 + z*C5, shift);
+            int R = CV_DESCALE(x*C6 + y*C7 + z*C8, shift);
             dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
             dst[2] = saturate_cast<ushort>(R);
             if( dcn == 4 )
@@ -1112,16 +915,8 @@ struct XYZ2RGB_i<ushort>
     }
     int dstcn, blueIdx;
     int coeffs[9];
-
-    int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
-    uint16x4_t v_alpha2;
-    uint16x8_t v_alpha;
 };
 
-#endif
-
-
-
 
 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
 
@@ -1482,8 +1277,8 @@ static void initLabTabs()
                 y = cvRound(fy*fy*fy/softfloat(LUT_BASE*LUT_BASE));
             }
 
-            LabToYF_b[i*2  ] = (ushort)y;   // 2260 <= y <= BASE
-            LabToYF_b[i*2+1] = (ushort)ify; // 0 <= ify <= BASE
+            LabToYF_b[i*2  ] = (ushort)y;   // 0 <= y <= BASE
+            LabToYF_b[i*2+1] = (ushort)ify; // 2260 <= ify <= BASE
         }
 
         //Lookup table for a,b to x,z conversion
@@ -1563,7 +1358,7 @@ static inline void trilinearInterpolate(int cx, int cy, int cz, const int16_t* L
     c = CV_DESCALE(c, trilinear_shift*3);
 }
 
-#if CV_SIMD128
+#if CV_SIMD_WIDTH == 16
 
 // 8 inValues are in [0; LAB_BASE]
 static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uint16x8& inY, const v_uint16x8& inZ,
@@ -1652,7 +1447,93 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
 #undef DOT_SHIFT_PACK
 }
 
-#endif // CV_SIMD128
+#elif CV_SIMD
+
+// Variable-width (v_uint16) variant; inValues are in [0; LAB_BASE], one LUT interpolation per 16-bit lane
+static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint16& inY, const v_uint16& inZ,
+                                              const int16_t* LUT,
+                                              v_uint16& outA, v_uint16& outB, v_uint16& outC)
+{
+    const int vsize = v_uint16::nlanes; // pixels handled per call (one per 16-bit lane)
+
+    // LUT index of the cube's origin point along each axis
+    v_uint16 tx = inX >> (lab_base_shift - lab_lut_shift);
+    v_uint16 ty = inY >> (lab_base_shift - lab_lut_shift);
+    v_uint16 tz = inZ >> (lab_base_shift - lab_lut_shift);
+
+    v_uint32 btmp00, btmp01, btmp10, btmp11, btmp20, btmp21;
+    v_uint32 baseIdx0, baseIdx1;
+    // baseIdx = tx*(3*8)+ty*(3*8*LAB_LUT_DIM)+tz*(3*8*LAB_LUT_DIM*LAB_LUT_DIM)
+    v_mul_expand(tx, vx_setall_u16(3*8), btmp00, btmp01);
+    v_mul_expand(ty, vx_setall_u16(3*8*LAB_LUT_DIM), btmp10, btmp11);
+    v_mul_expand(tz, vx_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), btmp20, btmp21);
+    baseIdx0 = btmp00 + btmp10 + btmp20;
+    baseIdx1 = btmp01 + btmp11 + btmp21;
+
+    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize]; // spilled so the per-pixel loop below can offset LUT
+    v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0);
+    v_store_aligned(vbaseIdx + 1*vsize/2, baseIdx1);
+
+    // fracX, fracY, fracZ are [0; TRILINEAR_BASE)
+    const uint16_t bitMask = (1 << trilinear_shift) - 1;
+    v_uint16 bitMaskReg = vx_setall_u16(bitMask);
+    v_uint16 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+    v_uint16 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+    v_uint16 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+
+    // trilinearIdx = 8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z
+    v_uint32 trilinearIdx0, trilinearIdx1;
+    v_uint32 fracX0, fracX1, fracY0, fracY1, fracZ0, fracZ1;
+    v_expand(fracX, fracX0, fracX1);
+    v_expand(fracY, fracY0, fracY1);
+    v_expand(fracZ, fracZ0, fracZ1);
+
+    trilinearIdx0 = (fracX0 << 3) + (fracY0 << (3+trilinear_shift)) + (fracZ0 << (3+trilinear_shift*2));
+    trilinearIdx1 = (fracX1 << 3) + (fracY1 << (3+trilinear_shift)) + (fracZ1 << (3+trilinear_shift*2));
+
+    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize]; // spilled offsets into trilinearLUT, one per pixel
+    v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0);
+    v_store_aligned(vtrilinearIdx + 1*vsize/2, trilinearIdx1);
+
+    v_uint32 a0, a1, b0, b1, c0, c1;
+
+    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize], vb[vsize], vc[vsize];
+    for(int j = 0; j < vsize; j++) // per-pixel work uses fixed 8-lane (v_int16x8) loads regardless of register width
+    {
+        const int16_t* baseLUT = LUT + vbaseIdx[j];
+
+        v_int16x8 aa, bb, cc;
+        aa = v_load(baseLUT); // 8 LUT entries contributing to outA
+        bb = v_load(baseLUT + 8); // next 8 entries, contributing to outB
+        cc = v_load(baseLUT + 16); // next 8 entries, contributing to outC
+
+        v_int16x8 w = v_load(trilinearLUT + vtrilinearIdx[j]); // 8 weights selected by the fractional position
+
+        va[j] = v_reduce_sum(v_dotprod(aa, w)); // weighted sum of the 8 entries
+        vb[j] = v_reduce_sum(v_dotprod(bb, w));
+        vc[j] = v_reduce_sum(v_dotprod(cc, w));
+    }
+
+    a0 = vx_load_aligned(va + 0*vsize/2);
+    a1 = vx_load_aligned(va + 1*vsize/2);
+    b0 = vx_load_aligned(vb + 0*vsize/2);
+    b1 = vx_load_aligned(vb + 1*vsize/2);
+    c0 = vx_load_aligned(vc + 0*vsize/2);
+    c1 = vx_load_aligned(vc + 1*vsize/2);
+
+    // CV_DESCALE: add half of the divisor, then shift right by trilinear_shift*3
+    const v_uint32 descaleShift = vx_setall_u32(1 << (trilinear_shift*3 - 1));
+    a0 = (a0 + descaleShift) >> (trilinear_shift*3);
+    a1 = (a1 + descaleShift) >> (trilinear_shift*3);
+    b0 = (b0 + descaleShift) >> (trilinear_shift*3);
+    b1 = (b1 + descaleShift) >> (trilinear_shift*3);
+    c0 = (c0 + descaleShift) >> (trilinear_shift*3);
+    c1 = (c1 + descaleShift) >> (trilinear_shift*3);
+
+    outA = v_pack(a0, a1); outB = v_pack(b0, b1); outC = v_pack(c0, c1); // narrow each uint32 pair into one uint16 vector
+}
+
+#endif // CV_SIMD
 
 
 struct RGB2Lab_b
@@ -1663,7 +1544,6 @@ struct RGB2Lab_b
               const float* _whitept, bool _srgb)
     : srccn(_srccn), srgb(_srgb)
     {
-        static volatile int _3 = 3;
         initLabTabs();
 
         softdouble whitePt[3];
@@ -1674,7 +1554,7 @@ struct RGB2Lab_b
                 whitePt[i] = D65[i];
 
         static const softdouble lshift(1 << lab_shift);
-        for( int i = 0; i < _3; i++ )
+        for( int i = 0; i < 3; i++ )
         {
             softdouble c[3];
             for(int j = 0; j < 3; j++)
@@ -1693,6 +1573,8 @@ struct RGB2Lab_b
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         const int Lscale = (116*255+50)/100;
         const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
         const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
@@ -1700,10 +1582,158 @@ struct RGB2Lab_b
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
 
         i = 0;
-        for(; i < n; i += 3, src += scn )
+
+#if CV_SIMD
+        const int vsize = v_uint8::nlanes; // pixels processed per SIMD iteration
+        const int xyzDescaleShift = 1 << (lab_shift - 1); // rounding term of CV_DESCALE(..., lab_shift)
+        v_int16 vXYZdescale = vx_setall_s16(xyzDescaleShift);
+        v_int16 cxrg, cxb1, cyrg, cyb1, czrg, czb1;
+        v_int16 dummy;
+        v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), cxrg, dummy); // (C0, C1) pairs, matched against (R, G) in v_dotprod
+        v_zip(vx_setall_s16((short)C2), vx_setall_s16(        1), cxb1, dummy); // (C2, 1) pairs, matched against (B, rounding term)
+        v_zip(vx_setall_s16((short)C3), vx_setall_s16((short)C4), cyrg, dummy);
+        v_zip(vx_setall_s16((short)C5), vx_setall_s16(        1), cyb1, dummy);
+        v_zip(vx_setall_s16((short)C6), vx_setall_s16((short)C7), czrg, dummy);
+        v_zip(vx_setall_s16((short)C8), vx_setall_s16(        1), czb1, dummy);
+        const int labDescaleShift = 1 << (lab_shift2 - 1); // rounding term of CV_DESCALE(..., lab_shift2)
+
+        for( ; i <= n - vsize;
+             i += vsize , src += scn*vsize, dst += 3*vsize)
+        {
+            v_uint8 R, G, B, A; // A only receives the 4th channel on load; it is not used afterwards
+            if(scn == 4)
+            {
+                v_load_deinterleave(src, R, G, B, A);
+            }
+            else // scn == 3
+            {
+                v_load_deinterleave(src, R, G, B);
+            }
+
+            // gamma substitution using tab
+            v_uint16 drgb[6];
+            // drgb[0..5] => [R0 R1 G0 G1 B0 B1]
+            v_expand(R, drgb[0], drgb[1]);
+            v_expand(G, drgb[2], drgb[3]);
+            v_expand(B, drgb[4], drgb[5]);
+
+            // qrgb[0..11] => [4 per R, 4 per G, 4 per B]
+            v_uint32 qrgb[12];
+            for(int k = 0; k < 6; k++)
+            {
+                v_expand(drgb[k], qrgb[k*2+0], qrgb[k*2+1]);
+            }
+
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[vsize*3]; // 32-bit channel values spilled to memory to serve as tab indices
+            for(int k = 0; k < 12; k++)
+            {
+                v_store_aligned(vdrgb + k*vsize/4, qrgb[k]);
+            }
+
+            v_uint16 trgb[6];
+            for(int k = 0; k < 6; k++)
+            {
+                trgb[k] = vx_lut(tab, (const int*)vdrgb + k*vsize/2); // gather vsize/2 table entries per vector
+            }
+
+            v_int16 rgbs[6];
+            for(int k = 0; k < 6; k++)
+            {
+                rgbs[k] = v_reinterpret_as_s16(trgb[k]);
+            }
+            v_int16 sB0, sB1, sG0, sG1, sR0, sR1;
+            sR0 = rgbs[0]; sR1 = rgbs[1];
+            sG0 = rgbs[2]; sG1 = rgbs[3];
+            sB0 = rgbs[4]; sB1 = rgbs[5];
+
+            v_int16 rg[4], bd[4];
+            v_zip(sR0, sG0, rg[0], rg[1]); // (R, G) pairs for the first half of v_dotprod
+            v_zip(sR1, sG1, rg[2], rg[3]);
+            v_zip(sB0, vXYZdescale, bd[0], bd[1]); // (B, rounding term) pairs for the second half
+            v_zip(sB1, vXYZdescale, bd[2], bd[3]);
+
+            // [X, Y, Z] = CV_DESCALE(R*C_ + G*C_ + B*C_, lab_shift)
+            v_uint32 x[4], y[4], z[4];
+            for(int j = 0; j < 4; j++)
+            {
+                x[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cxrg) + v_dotprod(bd[j], cxb1)) >> lab_shift;
+                y[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cyrg) + v_dotprod(bd[j], cyb1)) >> lab_shift;
+                z[j] = v_reinterpret_as_u32(v_dotprod(rg[j], czrg) + v_dotprod(bd[j], czb1)) >> lab_shift;
+            }
+
+            // [fX, fY, fZ] = LabCbrtTab_b[vx, vy, vz]
+            // vxyz layout: [4 vectors of X, 4 vectors of Y, 4 vectors of Z]
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[vsize*3];
+            for(int j = 0; j < 4; j++)
+            {
+                v_store_aligned(vxyz + (0*4+j)*vsize/4, x[j]);
+                v_store_aligned(vxyz + (1*4+j)*vsize/4, y[j]);
+                v_store_aligned(vxyz + (2*4+j)*vsize/4, z[j]);
+            }
+            // fxyz[0..5] => [X0, X1, Y0, Y1, Z0, Z1]
+            v_uint16 fxyz[2*3];
+            for(int j = 0; j < 2*3; j++)
+            {
+                fxyz[j] = vx_lut(LabCbrtTab_b, (const int*)vxyz + j*vsize/2);
+            }
+
+            v_int16 fX0, fX1, fY0, fY1, fZ0, fZ1;
+            fX0 = v_reinterpret_as_s16(fxyz[0]), fX1 = v_reinterpret_as_s16(fxyz[1]);
+            fY0 = v_reinterpret_as_s16(fxyz[2]), fY1 = v_reinterpret_as_s16(fxyz[3]);
+            fZ0 = v_reinterpret_as_s16(fxyz[4]), fZ1 = v_reinterpret_as_s16(fxyz[5]);
+
+            v_uint16 Ldiff0 = fxyz[2], Ldiff1 = fxyz[3]; // the Y entries (fY), kept unsigned for the L formula
+
+            v_uint8 L, a, b;
+
+            // L = (Lscale*Ldiff + (Lshift + labDescaleShift)) >> lab_shift2;
+            v_uint32 vL[4];
+            v_uint16 vLscale = vx_setall_u16(Lscale);
+            v_mul_expand(Ldiff0, vLscale, vL[0], vL[1]);
+            v_mul_expand(Ldiff1, vLscale, vL[2], vL[3]);
+            v_uint32 vLshift = vx_setall_u32((uint32_t)(Lshift + labDescaleShift)); // Lshift is negative; unsigned wrap-around makes the add below act as a subtraction
+            for(int k = 0; k < 4; k++)
+            {
+                vL[k] = (vL[k] + vLshift) >> lab_shift2;
+            }
+            v_uint16 L0, L1;
+            L0 = v_pack(vL[0], vL[1]);
+            L1 = v_pack(vL[2], vL[3]);
+
+            L = v_pack(L0, L1);
+
+            // a = (500*(fX - fY) + (128*(1 << lab_shift2) + labDescaleShift)) >> lab_shift2;
+            // b = (200*(fY - fZ) + (128*(1 << lab_shift2) + labDescaleShift)) >> lab_shift2;
+            v_int16 adiff0 = v_sub_wrap(fX0, fY0), adiff1 = v_sub_wrap(fX1, fY1);
+            v_int16 bdiff0 = v_sub_wrap(fY0, fZ0), bdiff1 = v_sub_wrap(fY1, fZ1);
+
+            // ab[0..7] => [4 for a, 4 for b]
+            v_int32 ab[8];
+            v_int16 v500 = vx_setall_s16(500);
+            v_mul_expand(adiff0, v500, ab[0], ab[1]);
+            v_mul_expand(adiff1, v500, ab[2], ab[3]);
+            v_int16 v200 = vx_setall_s16(200);
+            v_mul_expand(bdiff0, v200, ab[4], ab[5]);
+            v_mul_expand(bdiff1, v200, ab[6], ab[7]);
+            v_int32 abShift = vx_setall_s32(128*(1 << lab_shift2) + labDescaleShift);
+            for(int k = 0; k < 8; k++)
+            {
+                ab[k] = (ab[k] + abShift) >> lab_shift2;
+            }
+            v_int16 a0, a1, b0, b1;
+            a0 = v_pack(ab[0], ab[1]); a1 = v_pack(ab[2], ab[3]);
+            b0 = v_pack(ab[4], ab[5]); b1 = v_pack(ab[6], ab[7]);
+
+            a = v_pack_u(a0, a1); // signed 16-bit -> unsigned 8-bit narrowing
+            b = v_pack_u(b0, b1);
+
+            v_store_interleave(dst, L, a, b);
+        }
+#endif
+
+        for(; i < n; i++, src += scn, dst += 3 )
         {
             int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
             int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
@@ -1714,9 +1744,9 @@ struct RGB2Lab_b
             int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
             int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
 
-            dst[i] = saturate_cast<uchar>(L);
-            dst[i+1] = saturate_cast<uchar>(a);
-            dst[i+2] = saturate_cast<uchar>(b);
+            dst[0] = saturate_cast<uchar>(L);
+            dst[1] = saturate_cast<uchar>(a);
+            dst[2] = saturate_cast<uchar>(b);
         }
     }
 
@@ -1734,7 +1764,6 @@ struct RGB2Lab_f
               const float* _whitept, bool _srgb)
     : srccn(_srccn), srgb(_srgb), blueIdx(_blueIdx)
     {
-        volatile int _3 = 3;
         initLabTabs();
 
         useInterpolation = (!_coeffs && !_whitept && srgb && enableRGB2LabInterpolation);
@@ -1750,7 +1779,7 @@ struct RGB2Lab_f
                                softdouble::one(),
                                softdouble::one() / whitePt[2] };
 
-        for( int i = 0; i < _3; i++ )
+        for( int i = 0; i < 3; i++ )
         {
             softfloat c[3];
             for(int k = 0; k < 3; k++)
@@ -1769,44 +1798,47 @@ struct RGB2Lab_f
 
     void operator()(const float* src, float* dst, int n) const
     {
-        int i, scn = srccn, bIdx = blueIdx;
+        CV_INSTRUMENT_REGION();
+
+        int scn = srccn, bIdx = blueIdx;
         float gscale = GammaTabScale;
         const float* gammaTab = srgb ? sRGBGammaTab : 0;
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
 
-        i = 0;
         if(useInterpolation)
         {
+            int i = 0;
+            n *= 3;
 
-#if CV_SIMD128
+#if CV_SIMD
             if(enablePackedLab)
             {
-                static const int nPixels = 4*2;
+                const int vsize = v_float32::nlanes;
+                static const int nPixels = vsize*2;
                 for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
                 {
-                    v_float32x4 rvec0, gvec0, bvec0, rvec1, gvec1, bvec1;
-                    v_float32x4 dummy0, dummy1;
+                    v_float32 rvec0, gvec0, bvec0, rvec1, gvec1, bvec1;
                     if(scn == 3)
                     {
-                        v_load_deinterleave(src, rvec0, gvec0, bvec0);
-                        v_load_deinterleave(src + scn*4, rvec1, gvec1, bvec1);
+                        v_load_deinterleave(src + 0*vsize, rvec0, gvec0, bvec0);
+                        v_load_deinterleave(src + 3*vsize, rvec1, gvec1, bvec1);
                     }
                     else // scn == 4
                     {
-                        v_load_deinterleave(src, rvec0, gvec0, bvec0, dummy0);
-                        v_load_deinterleave(src + scn*4, rvec1, gvec1, bvec1, dummy1);
+                        v_float32 dummy0, dummy1;
+                        v_load_deinterleave(src + 0*vsize, rvec0, gvec0, bvec0, dummy0);
+                        v_load_deinterleave(src + 4*vsize, rvec1, gvec1, bvec1, dummy1);
                     }
 
                     if(bIdx)
                     {
-                        dummy0 = rvec0; rvec0 = bvec0; bvec0 = dummy0;
-                        dummy1 = rvec1; rvec1 = bvec1; bvec1 = dummy1;
+                        swap(rvec0, bvec0);
+                        swap(rvec1, bvec1);
                     }
 
-                    v_float32x4 zerof = v_setzero_f32(), onef = v_setall_f32(1.0f);
+                    v_float32 zerof = vx_setzero_f32(), onef = vx_setall_f32(1.0f);
                     /* clip() */
                     #define clipv(r) (r) = v_min(v_max((r), zerof), onef)
                     clipv(rvec0); clipv(rvec1);
@@ -1814,58 +1846,55 @@ struct RGB2Lab_f
                     clipv(bvec0); clipv(bvec1);
                     #undef clipv
                     /* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */
-                    v_float32x4 basef = v_setall_f32(LAB_BASE);
+                    v_float32 basef = vx_setall_f32(LAB_BASE);
                     rvec0 *= basef, gvec0 *= basef, bvec0 *= basef;
                     rvec1 *= basef, gvec1 *= basef, bvec1 *= basef;
 
-                    v_int32x4 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1;
+                    v_int32 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1;
                     irvec0 = v_round(rvec0); irvec1 = v_round(rvec1);
                     igvec0 = v_round(gvec0); igvec1 = v_round(gvec1);
                     ibvec0 = v_round(bvec0); ibvec1 = v_round(bvec1);
 
-                    v_int16x8 irvec, igvec, ibvec;
-                    irvec = v_pack(irvec0, irvec1);
-                    igvec = v_pack(igvec0, igvec1);
-                    ibvec = v_pack(ibvec0, ibvec1);
+                    v_uint16 uirvec = v_pack_u(irvec0, irvec1);
+                    v_uint16 uigvec = v_pack_u(igvec0, igvec1);
+                    v_uint16 uibvec = v_pack_u(ibvec0, ibvec1);
 
-                    v_uint16x8 uirvec = v_reinterpret_as_u16(irvec);
-                    v_uint16x8 uigvec = v_reinterpret_as_u16(igvec);
-                    v_uint16x8 uibvec = v_reinterpret_as_u16(ibvec);
-
-                    v_uint16x8 ui_lvec, ui_avec, ui_bvec;
+                    v_uint16 ui_lvec, ui_avec, ui_bvec;
                     trilinearPackedInterpolate(uirvec, uigvec, uibvec, LABLUVLUTs16.RGB2LabLUT_s16, ui_lvec, ui_avec, ui_bvec);
-                    v_int16x8 i_lvec = v_reinterpret_as_s16(ui_lvec);
-                    v_int16x8 i_avec = v_reinterpret_as_s16(ui_avec);
-                    v_int16x8 i_bvec = v_reinterpret_as_s16(ui_bvec);
+                    v_int16 i_lvec = v_reinterpret_as_s16(ui_lvec);
+                    v_int16 i_avec = v_reinterpret_as_s16(ui_avec);
+                    v_int16 i_bvec = v_reinterpret_as_s16(ui_bvec);
 
                     /* float L = iL*1.0f/LAB_BASE, a = ia*1.0f/LAB_BASE, b = ib*1.0f/LAB_BASE; */
-                    v_int32x4 i_lvec0, i_avec0, i_bvec0, i_lvec1, i_avec1, i_bvec1;
+                    v_int32 i_lvec0, i_avec0, i_bvec0, i_lvec1, i_avec1, i_bvec1;
                     v_expand(i_lvec, i_lvec0, i_lvec1);
                     v_expand(i_avec, i_avec0, i_avec1);
                     v_expand(i_bvec, i_bvec0, i_bvec1);
 
-                    v_float32x4 l_vec0, a_vec0, b_vec0, l_vec1, a_vec1, b_vec1;
+                    v_float32 l_vec0, a_vec0, b_vec0, l_vec1, a_vec1, b_vec1;
                     l_vec0 = v_cvt_f32(i_lvec0); l_vec1 = v_cvt_f32(i_lvec1);
                     a_vec0 = v_cvt_f32(i_avec0); a_vec1 = v_cvt_f32(i_avec1);
                     b_vec0 = v_cvt_f32(i_bvec0); b_vec1 = v_cvt_f32(i_bvec1);
 
                     /* dst[i] = L*100.0f */
-                    l_vec0 = l_vec0*v_setall_f32(100.0f/LAB_BASE);
-                    l_vec1 = l_vec1*v_setall_f32(100.0f/LAB_BASE);
+                    v_float32 v100dBase = vx_setall_f32(100.0f/LAB_BASE);
+                    l_vec0 = l_vec0*v100dBase;
+                    l_vec1 = l_vec1*v100dBase;
                     /*
                     dst[i + 1] = a*256.0f - 128.0f;
                     dst[i + 2] = b*256.0f - 128.0f;
                     */
-                    a_vec0 = a_vec0*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
-                    a_vec1 = a_vec1*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
-                    b_vec0 = b_vec0*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
-                    b_vec1 = b_vec1*v_setall_f32(256.0f/LAB_BASE) - v_setall_f32(128.0f);
+                    v_float32 v256dBase = vx_setall_f32(256.0f/LAB_BASE), vm128 = vx_setall_f32(-128.f);
+                    a_vec0 = v_fma(a_vec0, v256dBase, vm128);
+                    a_vec1 = v_fma(a_vec1, v256dBase, vm128);
+                    b_vec0 = v_fma(b_vec0, v256dBase, vm128);
+                    b_vec1 = v_fma(b_vec1, v256dBase, vm128);
 
-                    v_store_interleave(dst + i, l_vec0, a_vec0, b_vec0);
-                    v_store_interleave(dst + i + 3*4, l_vec1, a_vec1, b_vec1);
+                    v_store_interleave(dst + i + 0*vsize, l_vec0, a_vec0, b_vec0);
+                    v_store_interleave(dst + i + 3*vsize, l_vec1, a_vec1, b_vec1);
                 }
             }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 
             for(; i < n; i += 3, src += scn)
             {
@@ -1883,35 +1912,112 @@ struct RGB2Lab_f
                 dst[i + 2] = b*256.0f - 128.0f;
             }
         }
-
-        static const float _a = (softfloat(16) / softfloat(116));
-        for (; i < n; i += 3, src += scn )
+        else
         {
-            float R = clip(src[0]);
-            float G = clip(src[1]);
-            float B = clip(src[2]);
-
-            if (gammaTab)
+            static const float _a = (softfloat(16) / softfloat(116));
+            int i = 0;
+#if CV_SIMD
+            const int vsize = v_float32::nlanes;
+            const int nrepeats = vsize == 4 ? 2 : 1;
+            v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
+            v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
+            v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
+            for( ; i <= n - vsize*nrepeats;
+                 i += vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats)
             {
-                R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
-                G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
-                B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
+                v_float32 R[nrepeats], G[nrepeats], B[nrepeats], A;
+                if(scn == 4)
+                {
+                    for (int k = 0; k < nrepeats; k++)
+                    {
+                        v_load_deinterleave(src + k*4*vsize, R[k], G[k], B[k], A);
+                    }
+                }
+                else // scn == 3
+                {
+                    for (int k = 0; k < nrepeats; k++)
+                    {
+                        v_load_deinterleave(src + k*3*vsize, R[k], G[k], B[k]);
+                    }
+                }
+
+                v_float32 one = vx_setall_f32(1.0f), z = vx_setzero_f32();
+                for (int k = 0; k < nrepeats; k++)
+                {
+                    R[k] = v_max(z, v_min(R[k], one));
+                    G[k] = v_max(z, v_min(G[k], one));
+                    B[k] = v_max(z, v_min(B[k], one));
+                }
+
+                if(gammaTab)
+                {
+                    v_float32 vgscale = vx_setall_f32(gscale);
+                    for (int k = 0; k < nrepeats; k++)
+                    {
+                        R[k] = splineInterpolate(R[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
+                        G[k] = splineInterpolate(G[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
+                        B[k] = splineInterpolate(B[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
+                    }
+                }
+
+                v_float32 X[nrepeats], Y[nrepeats], Z[nrepeats];
+                v_float32 FX[nrepeats], FY[nrepeats], FZ[nrepeats];
+                for (int k = 0; k < nrepeats; k++)
+                {
+                    X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
+                    Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
+                    Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
+
+                    // use spline interpolation instead of direct calculation
+                    v_float32 vTabScale = vx_setall_f32(LabCbrtTabScale);
+                    FX[k] = splineInterpolate(X[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                    FY[k] = splineInterpolate(Y[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                    FZ[k] = splineInterpolate(Z[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                }
+
+                v_float32 L[nrepeats], a[nrepeats], b[nrepeats];
+                for (int k = 0; k < nrepeats; k++)
+                {
+                    // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
+                    v_float32 mask = Y[k] > (vx_setall_f32(0.008856f));
+                    v_float32 v116 = vx_setall_f32(116.f), vm16 = vx_setall_f32(-16.f);
+                    L[k] = v_select(mask, v_fma(v116, FY[k], vm16), vx_setall_f32(903.3f)*Y[k]);
+                    a[k] = vx_setall_f32(500.f) * (FX[k] - FY[k]);
+                    b[k] = vx_setall_f32(200.f) * (FY[k] - FZ[k]);
+
+                    v_store_interleave(dst + k*3*vsize, L[k], a[k], b[k]);
+                }
             }
-            float X = R*C0 + G*C1 + B*C2;
-            float Y = R*C3 + G*C4 + B*C5;
-            float Z = R*C6 + G*C7 + B*C8;
-            // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
-            float FX = X > 0.008856f ? cubeRoot(X) : (7.787f * X + _a);
-            float FY = Y > 0.008856f ? cubeRoot(Y) : (7.787f * Y + _a);
-            float FZ = Z > 0.008856f ? cubeRoot(Z) : (7.787f * Z + _a);
+#endif
 
-            float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
-            float a = 500.f * (FX - FY);
-            float b = 200.f * (FY - FZ);
+            for (; i < n; i++, src += scn, dst += 3 )
+            {
+                float R = clip(src[0]);
+                float G = clip(src[1]);
+                float B = clip(src[2]);
 
-            dst[i] = L;
-            dst[i + 1] = a;
-            dst[i + 2] = b;
+                if (gammaTab)
+                {
+                    R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
+                    G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
+                    B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
+                }
+                float X = R*C0 + G*C1 + B*C2;
+                float Y = R*C3 + G*C4 + B*C5;
+                float Z = R*C6 + G*C7 + B*C8;
+                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
+                float FX = X > 0.008856f ? cubeRoot(X) : (7.787f * X + _a);
+                float FY = Y > 0.008856f ? cubeRoot(Y) : (7.787f * Y + _a);
+                float FZ = Z > 0.008856f ? cubeRoot(Z) : (7.787f * Z + _a);
+
+                float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
+                float a = 500.f * (FX - FY);
+                float b = 200.f * (FY - FZ);
+
+                dst[0] = L;
+                dst[1] = a;
+                dst[2] = b;
+            }
         }
     }
 
@@ -1957,104 +2063,12 @@ struct Lab2RGBfloat
 
         lThresh = softfloat(8); // 0.008856f * 903.3f  = (6/29)^3*(29/3)^3 = 8
         fThresh = softfloat(6)/softfloat(29); // 7.787f * 0.008856f + 16.0f / 116.0f = 6/29
-
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
     }
 
-    #if CV_SSE2
-    void process(__m128& v_li0, __m128& v_li1, __m128& v_ai0,
-                 __m128& v_ai1, __m128& v_bi0, __m128& v_bi1) const
-    {
-        // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
-        __m128 v_y00 = _mm_mul_ps(v_li0, _mm_set1_ps(1.0f/903.3f));
-        __m128 v_y01 = _mm_mul_ps(v_li1, _mm_set1_ps(1.0f/903.3f));
-        __m128 v_fy00 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(7.787f), v_y00), _mm_set1_ps(16.0f/116.0f));
-        __m128 v_fy01 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(7.787f), v_y01), _mm_set1_ps(16.0f/116.0f));
-
-        __m128 v_fy10 = _mm_mul_ps(_mm_add_ps(v_li0, _mm_set1_ps(16.0f)), _mm_set1_ps(1.0f/116.0f));
-        __m128 v_fy11 = _mm_mul_ps(_mm_add_ps(v_li1, _mm_set1_ps(16.0f)), _mm_set1_ps(1.0f/116.0f));
-        __m128 v_y10 = _mm_mul_ps(_mm_mul_ps(v_fy10, v_fy10), v_fy10);
-        __m128 v_y11 = _mm_mul_ps(_mm_mul_ps(v_fy11, v_fy11), v_fy11);
-
-        __m128 v_cmpli0 = _mm_cmple_ps(v_li0, _mm_set1_ps(lThresh));
-        __m128 v_cmpli1 = _mm_cmple_ps(v_li1, _mm_set1_ps(lThresh));
-        v_y00 = _mm_and_ps(v_cmpli0, v_y00);
-        v_y01 = _mm_and_ps(v_cmpli1, v_y01);
-        v_fy00 = _mm_and_ps(v_cmpli0, v_fy00);
-        v_fy01 = _mm_and_ps(v_cmpli1, v_fy01);
-        v_y10 = _mm_andnot_ps(v_cmpli0, v_y10);
-        v_y11 = _mm_andnot_ps(v_cmpli1, v_y11);
-        v_fy10 = _mm_andnot_ps(v_cmpli0, v_fy10);
-        v_fy11 = _mm_andnot_ps(v_cmpli1, v_fy11);
-        __m128 v_y0 = _mm_or_ps(v_y00, v_y10);
-        __m128 v_y1 = _mm_or_ps(v_y01, v_y11);
-        __m128 v_fy0 = _mm_or_ps(v_fy00, v_fy10);
-        __m128 v_fy1 = _mm_or_ps(v_fy01, v_fy11);
-
-        __m128 v_fxz00 = _mm_add_ps(v_fy0, _mm_mul_ps(v_ai0, _mm_set1_ps(0.002f)));
-        __m128 v_fxz01 = _mm_add_ps(v_fy1, _mm_mul_ps(v_ai1, _mm_set1_ps(0.002f)));
-        __m128 v_fxz10 = _mm_sub_ps(v_fy0, _mm_mul_ps(v_bi0, _mm_set1_ps(0.005f)));
-        __m128 v_fxz11 = _mm_sub_ps(v_fy1, _mm_mul_ps(v_bi1, _mm_set1_ps(0.005f)));
-
-        __m128 v_fxz000 = _mm_mul_ps(_mm_sub_ps(v_fxz00, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
-        __m128 v_fxz001 = _mm_mul_ps(_mm_sub_ps(v_fxz01, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
-        __m128 v_fxz010 = _mm_mul_ps(_mm_sub_ps(v_fxz10, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
-        __m128 v_fxz011 = _mm_mul_ps(_mm_sub_ps(v_fxz11, _mm_set1_ps(16.0f/116.0f)), _mm_set1_ps(1.0f/7.787f));
-
-        __m128 v_fxz100 = _mm_mul_ps(_mm_mul_ps(v_fxz00, v_fxz00), v_fxz00);
-        __m128 v_fxz101 = _mm_mul_ps(_mm_mul_ps(v_fxz01, v_fxz01), v_fxz01);
-        __m128 v_fxz110 = _mm_mul_ps(_mm_mul_ps(v_fxz10, v_fxz10), v_fxz10);
-        __m128 v_fxz111 = _mm_mul_ps(_mm_mul_ps(v_fxz11, v_fxz11), v_fxz11);
-
-        __m128 v_cmpfxz00 = _mm_cmple_ps(v_fxz00, _mm_set1_ps(fThresh));
-        __m128 v_cmpfxz01 = _mm_cmple_ps(v_fxz01, _mm_set1_ps(fThresh));
-        __m128 v_cmpfxz10 = _mm_cmple_ps(v_fxz10, _mm_set1_ps(fThresh));
-        __m128 v_cmpfxz11 = _mm_cmple_ps(v_fxz11, _mm_set1_ps(fThresh));
-        v_fxz000 = _mm_and_ps(v_cmpfxz00, v_fxz000);
-        v_fxz001 = _mm_and_ps(v_cmpfxz01, v_fxz001);
-        v_fxz010 = _mm_and_ps(v_cmpfxz10, v_fxz010);
-        v_fxz011 = _mm_and_ps(v_cmpfxz11, v_fxz011);
-        v_fxz100 = _mm_andnot_ps(v_cmpfxz00, v_fxz100);
-        v_fxz101 = _mm_andnot_ps(v_cmpfxz01, v_fxz101);
-        v_fxz110 = _mm_andnot_ps(v_cmpfxz10, v_fxz110);
-        v_fxz111 = _mm_andnot_ps(v_cmpfxz11, v_fxz111);
-        __m128 v_x0 = _mm_or_ps(v_fxz000, v_fxz100);
-        __m128 v_x1 = _mm_or_ps(v_fxz001, v_fxz101);
-        __m128 v_z0 = _mm_or_ps(v_fxz010, v_fxz110);
-        __m128 v_z1 = _mm_or_ps(v_fxz011, v_fxz111);
-
-        __m128 v_ro0 = _mm_mul_ps(_mm_set1_ps(coeffs[0]), v_x0);
-        __m128 v_ro1 = _mm_mul_ps(_mm_set1_ps(coeffs[0]), v_x1);
-        __m128 v_go0 = _mm_mul_ps(_mm_set1_ps(coeffs[3]), v_x0);
-        __m128 v_go1 = _mm_mul_ps(_mm_set1_ps(coeffs[3]), v_x1);
-        __m128 v_bo0 = _mm_mul_ps(_mm_set1_ps(coeffs[6]), v_x0);
-        __m128 v_bo1 = _mm_mul_ps(_mm_set1_ps(coeffs[6]), v_x1);
-        v_ro0 = _mm_add_ps(v_ro0, _mm_mul_ps(_mm_set1_ps(coeffs[1]), v_y0));
-        v_ro1 = _mm_add_ps(v_ro1, _mm_mul_ps(_mm_set1_ps(coeffs[1]), v_y1));
-        v_go0 = _mm_add_ps(v_go0, _mm_mul_ps(_mm_set1_ps(coeffs[4]), v_y0));
-        v_go1 = _mm_add_ps(v_go1, _mm_mul_ps(_mm_set1_ps(coeffs[4]), v_y1));
-        v_bo0 = _mm_add_ps(v_bo0, _mm_mul_ps(_mm_set1_ps(coeffs[7]), v_y0));
-        v_bo1 = _mm_add_ps(v_bo1, _mm_mul_ps(_mm_set1_ps(coeffs[7]), v_y1));
-        v_ro0 = _mm_add_ps(v_ro0, _mm_mul_ps(_mm_set1_ps(coeffs[2]), v_z0));
-        v_ro1 = _mm_add_ps(v_ro1, _mm_mul_ps(_mm_set1_ps(coeffs[2]), v_z1));
-        v_go0 = _mm_add_ps(v_go0, _mm_mul_ps(_mm_set1_ps(coeffs[5]), v_z0));
-        v_go1 = _mm_add_ps(v_go1, _mm_mul_ps(_mm_set1_ps(coeffs[5]), v_z1));
-        v_bo0 = _mm_add_ps(v_bo0, _mm_mul_ps(_mm_set1_ps(coeffs[8]), v_z0));
-        v_bo1 = _mm_add_ps(v_bo1, _mm_mul_ps(_mm_set1_ps(coeffs[8]), v_z1));
-
-        v_li0 = _mm_min_ps(_mm_max_ps(v_ro0, _mm_setzero_ps()), _mm_set1_ps(1.0f));
-        v_li1 = _mm_min_ps(_mm_max_ps(v_ro1, _mm_setzero_ps()), _mm_set1_ps(1.0f));
-        v_ai0 = _mm_min_ps(_mm_max_ps(v_go0, _mm_setzero_ps()), _mm_set1_ps(1.0f));
-        v_ai1 = _mm_min_ps(_mm_max_ps(v_go1, _mm_setzero_ps()), _mm_set1_ps(1.0f));
-        v_bi0 = _mm_min_ps(_mm_max_ps(v_bo0, _mm_setzero_ps()), _mm_set1_ps(1.0f));
-        v_bi1 = _mm_min_ps(_mm_max_ps(v_bo1, _mm_setzero_ps()), _mm_set1_ps(1.0f));
-    }
-    #endif
-
     void operator()(const float* src, float* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int i = 0, dcn = dstcn;
         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
         float gscale = GammaTabScale;
@@ -2062,76 +2076,137 @@ struct Lab2RGBfloat
         C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
         C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
         float alpha = ColorChannel<float>::max();
-        n *= 3;
 
-        #if CV_SSE2
-        if (haveSIMD)
+#if CV_SIMD
+        const int vsize = v_float32::nlanes;
+        const int nrepeats = 2;
+        v_float32 v16_116 = vx_setall_f32(16.0f / 116.0f);
+        for( ; i <= n-vsize*nrepeats;
+               i += vsize*nrepeats, src += 3*vsize*nrepeats, dst += dcn*vsize*nrepeats)
         {
-            for (; i <= n - 24; i += 24, dst += dcn * 8)
+            v_float32 li[nrepeats], ai[nrepeats], bi[nrepeats];
+            for(int k = 0; k < nrepeats; k++)
             {
-                __m128 v_li0 = _mm_loadu_ps(src + i +  0);
-                __m128 v_li1 = _mm_loadu_ps(src + i +  4);
-                __m128 v_ai0 = _mm_loadu_ps(src + i +  8);
-                __m128 v_ai1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_bi0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_bi1 = _mm_loadu_ps(src + i + 20);
+                v_load_deinterleave(src + k*3*vsize, li[k], ai[k], bi[k]);
+            }
 
-                _mm_deinterleave_ps(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1);
+            v_float32 x[nrepeats], y[nrepeats], z[nrepeats], fy[nrepeats];
+            v_float32 limask[nrepeats];
+            v_float32 vlThresh = vx_setall_f32(lThresh);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                limask[k] = li[k] <= vlThresh;
+            }
+            v_float32 ylo[nrepeats], yhi[nrepeats], fylo[nrepeats], fyhi[nrepeats];
+            // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
+            v_float32 vinv903 = vx_setall_f32(1.f/903.3f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                ylo[k] = li[k] * vinv903;
+            }
+            v_float32 v7787 = vx_setall_f32(7.787f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                fylo[k] = v_fma(v7787, ylo[k], v16_116);
+            }
+            v_float32 v16 = vx_setall_f32(16.0f), vinv116 = vx_setall_f32(1.f/116.0f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                fyhi[k] = (li[k] + v16) * vinv116;
+            }
+            for(int k = 0; k < nrepeats; k++)
+            {
+                yhi[k] = fyhi[k] * fyhi[k] * fyhi[k];
+            }
+            for(int k = 0; k < nrepeats; k++)
+            {
+                y[k]  = v_select(limask[k], ylo[k],  yhi[k]);
+                fy[k] = v_select(limask[k], fylo[k], fyhi[k]);
+            }
 
-                process(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1);
-
-                if (gammaTab)
+            v_float32 fxz[nrepeats*2];
+            v_float32 vpinv500 = vx_setall_f32( 1.f/500.f);
+            v_float32 vninv200 = vx_setall_f32(-1.f/200.f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                fxz[k*2+0] = v_fma(ai[k], vpinv500, fy[k]);
+                fxz[k*2+1] = v_fma(bi[k], vninv200, fy[k]);
+            }
+            v_float32 vfTresh = vx_setall_f32(fThresh);
+            v_float32 vinv7787 = vx_setall_f32(1.f/7.787f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                for (int j = 0; j < 2; j++)
                 {
-                    __m128 v_gscale = _mm_set1_ps(gscale);
-                    v_li0 = _mm_mul_ps(v_li0, v_gscale);
-                    v_li1 = _mm_mul_ps(v_li1, v_gscale);
-                    v_ai0 = _mm_mul_ps(v_ai0, v_gscale);
-                    v_ai1 = _mm_mul_ps(v_ai1, v_gscale);
-                    v_bi0 = _mm_mul_ps(v_bi0, v_gscale);
-                    v_bi1 = _mm_mul_ps(v_bi1, v_gscale);
+                    v_float32 f = fxz[k*2+j];
+                    v_float32 fmask = f <= vfTresh;
+                    v_float32 flo = (f - v16_116) * vinv7787;
+                    v_float32 fhi = f*f*f;
+                    fxz[k*2+j] = v_select(fmask, flo, fhi);
+                }
+            }
+            for(int k = 0; k < nrepeats; k++)
+            {
+                x[k] = fxz[k*2+0], z[k] = fxz[k*2+1];
+            }
+            v_float32 ro[nrepeats], go[nrepeats], bo[nrepeats];
+            v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
+            v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
+            v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], vc2 * z[k]));
+                go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], vc5 * z[k]));
+                bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], vc8 * z[k]));
+            }
+            v_float32 one = vx_setall_f32(1.f), zero = vx_setzero_f32();
+            for(int k = 0; k < nrepeats; k++)
+            {
+                ro[k] = v_max(zero, v_min(ro[k], one));
+                go[k] = v_max(zero, v_min(go[k], one));
+                bo[k] = v_max(zero, v_min(bo[k], one));
+            }
 
-                    splineInterpolate(v_li0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_li1, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_ai0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_ai1, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_bi0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_bi1, gammaTab, GAMMA_TAB_SIZE);
+            if (gammaTab)
+            {
+                v_float32 vgscale = vx_setall_f32(gscale);
+                for(int k = 0; k < nrepeats; k++)
+                {
+                    ro[k] *= vgscale;
+                    go[k] *= vgscale;
+                    bo[k] *= vgscale;
                 }
 
-                if( dcn == 4 )
+                for(int k = 0; k < nrepeats; k++)
                 {
-                    __m128 v_a0 = _mm_set1_ps(alpha);
-                    __m128 v_a1 = _mm_set1_ps(alpha);
-                    _mm_interleave_ps(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1, v_a0, v_a1);
-
-                    _mm_storeu_ps(dst +  0, v_li0);
-                    _mm_storeu_ps(dst +  4, v_li1);
-                    _mm_storeu_ps(dst +  8, v_ai0);
-                    _mm_storeu_ps(dst + 12, v_ai1);
-                    _mm_storeu_ps(dst + 16, v_bi0);
-                    _mm_storeu_ps(dst + 20, v_bi1);
-                    _mm_storeu_ps(dst + 24, v_a0);
-                    _mm_storeu_ps(dst + 28, v_a1);
+                    ro[k] = splineInterpolate(ro[k], gammaTab, GAMMA_TAB_SIZE);
+                    go[k] = splineInterpolate(go[k], gammaTab, GAMMA_TAB_SIZE);
+                    bo[k] = splineInterpolate(bo[k], gammaTab, GAMMA_TAB_SIZE);
                 }
-                else
-                {
-                    _mm_interleave_ps(v_li0, v_li1, v_ai0, v_ai1, v_bi0, v_bi1);
+            }
 
-                    _mm_storeu_ps(dst +  0, v_li0);
-                    _mm_storeu_ps(dst +  4, v_li1);
-                    _mm_storeu_ps(dst +  8, v_ai0);
-                    _mm_storeu_ps(dst + 12, v_ai1);
-                    _mm_storeu_ps(dst + 16, v_bi0);
-                    _mm_storeu_ps(dst + 20, v_bi1);
+            if(dcn == 4)
+            {
+                v_float32 valpha = vx_setall_f32(alpha);
+                for(int k = 0; k < nrepeats; k++)
+                {
+                    v_store_interleave(dst + 4*vsize*k, ro[k], go[k], bo[k], valpha);
+                }
+            }
+            else // dcn == 3
+            {
+                for(int k = 0; k < nrepeats; k++)
+                {
+                    v_store_interleave(dst + 3*vsize*k, ro[k], go[k], bo[k]);
                 }
             }
         }
-        #endif
-        for (; i < n; i += 3, dst += dcn)
+#endif
+        for (; i < n; i++, src += 3, dst += dcn)
         {
-            float li = src[i];
-            float ai = src[i + 1];
-            float bi = src[i + 2];
+            float li = src[0];
+            float ai = src[1];
+            float bi = src[2];
 
             // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
             float y, fy;
@@ -2180,9 +2255,6 @@ struct Lab2RGBfloat
     bool srgb;
     float lThresh;
     float fThresh;
-    #if CV_SSE2
-    bool haveSIMD;
-    #endif
     int blueIdx;
 };
 
@@ -2204,7 +2276,7 @@ struct Lab2RGBinteger
 
     Lab2RGBinteger( int _dstcn, int blueIdx, const float* _coeffs,
                     const float* _whitept, bool srgb )
-    : dstcn(_dstcn)
+    : dstcn(_dstcn), issRGB(srgb)
     {
         softdouble whitePt[3];
         for(int i = 0; i < 3; i++)
@@ -2227,8 +2299,6 @@ struct Lab2RGBinteger
             coeffs[i+3]             = cvRound(lshift*c[1]*whitePt[i]);
             coeffs[i+(blueIdx^2)*3] = cvRound(lshift*c[2]*whitePt[i]);
         }
-
-        tab = srgb ? sRGBInvGammaTab_b : linearInvGammaTab_b;
     }
 
     // L, a, b should be in their natural range
@@ -2268,63 +2338,75 @@ struct Lab2RGBinteger
         go = max(0, min((int)INV_GAMMA_TAB_SIZE-1, go));
         bo = max(0, min((int)INV_GAMMA_TAB_SIZE-1, bo));
 
-        ro = tab[ro];
-        go = tab[go];
-        bo = tab[bo];
+        if(issRGB)
+        {
+            ushort* tab = sRGBInvGammaTab_b;
+            ro = tab[ro];
+            go = tab[go];
+            bo = tab[bo];
+        }
+        else
+        {
+            // rgb = (rgb*255) >> inv_gamma_shift
+            ro = ((ro << 8) - ro) >> inv_gamma_shift;
+            go = ((go << 8) - go) >> inv_gamma_shift;
+            bo = ((bo << 8) - bo) >> inv_gamma_shift;
+        }
     }
 
-    // L, a, b should be in their natural range
-    inline void processLabToXYZ(const v_uint8x16& lv, const v_uint8x16& av, const v_uint8x16& bv,
-                                v_int32x4& xiv00, v_int32x4& yiv00, v_int32x4& ziv00,
-                                v_int32x4& xiv01, v_int32x4& yiv01, v_int32x4& ziv01,
-                                v_int32x4& xiv10, v_int32x4& yiv10, v_int32x4& ziv10,
-                                v_int32x4& xiv11, v_int32x4& yiv11, v_int32x4& ziv11) const
+#if CV_SIMD
+    inline void processLabToXYZ(const v_uint8& l, const v_uint8& a, const v_uint8& b,
+                                v_int32 (&xiv)[4], v_int32 (&y)[4], v_int32 (&ziv)[4]) const
     {
-        v_uint16x8 lv0, lv1;
-        v_expand(lv, lv0, lv1);
+        v_uint16 l0, l1;
+        v_expand(l, l0, l1);
+        v_int32 lq[4];
+        v_expand(v_reinterpret_as_s16(l0), lq[0], lq[1]);
+        v_expand(v_reinterpret_as_s16(l1), lq[2], lq[3]);
+
         // Load Y and IFY values from lookup-table
         // y = LabToYF_b[L_value*2], ify = LabToYF_b[L_value*2 + 1]
-        // LabToYF_b[i*2  ] = y;   // 2260 <= y <= BASE
-        // LabToYF_b[i*2+1] = ify; // 0 <= ify <= BASE
-        uint16_t CV_DECL_ALIGNED(16) v_lv0[8], v_lv1[8];
-        v_store_aligned(v_lv0, (lv0 << 1)); v_store_aligned(v_lv1, (lv1 << 1));
-        v_int16x8 ify0, ify1;
+        // LabToYF_b[i*2  ] = y;   // 0 <= y <= BASE
+        // LabToYF_b[i*2+1] = ify; // 2260 <= ify <= BASE
+        v_int32 yf[4];
+        v_int32 ify[4];
+        v_int32 mask16 = vx_setall_s32(0xFFFF);
+        for(int k = 0; k < 4; k++)
+        {
+            yf[k] = v_lut((const int*)LabToYF_b, lq[k]);
+            y[k]   = yf[k] & mask16;
+            ify[k] = v_reinterpret_as_s32(v_reinterpret_as_u32(yf[k]) >> 16);
+        }
 
-        yiv00 = v_int32x4(LabToYF_b[v_lv0[0]  ], LabToYF_b[v_lv0[1]  ], LabToYF_b[v_lv0[2]  ], LabToYF_b[v_lv0[3]  ]);
-        yiv01 = v_int32x4(LabToYF_b[v_lv0[4]  ], LabToYF_b[v_lv0[5]  ], LabToYF_b[v_lv0[6]  ], LabToYF_b[v_lv0[7]  ]);
-        yiv10 = v_int32x4(LabToYF_b[v_lv1[0]  ], LabToYF_b[v_lv1[1]  ], LabToYF_b[v_lv1[2]  ], LabToYF_b[v_lv1[3]  ]);
-        yiv11 = v_int32x4(LabToYF_b[v_lv1[4]  ], LabToYF_b[v_lv1[5]  ], LabToYF_b[v_lv1[6]  ], LabToYF_b[v_lv1[7]  ]);
+        v_int16 ify0, ify1;
+        ify0 = v_pack(ify[0], ify[1]);
+        ify1 = v_pack(ify[2], ify[3]);
 
-        ify0 = v_int16x8(LabToYF_b[v_lv0[0]+1], LabToYF_b[v_lv0[1]+1], LabToYF_b[v_lv0[2]+1], LabToYF_b[v_lv0[3]+1],
-                         LabToYF_b[v_lv0[4]+1], LabToYF_b[v_lv0[5]+1], LabToYF_b[v_lv0[6]+1], LabToYF_b[v_lv0[7]+1]);
-        ify1 = v_int16x8(LabToYF_b[v_lv1[0]+1], LabToYF_b[v_lv1[1]+1], LabToYF_b[v_lv1[2]+1], LabToYF_b[v_lv1[3]+1],
-                         LabToYF_b[v_lv1[4]+1], LabToYF_b[v_lv1[5]+1], LabToYF_b[v_lv1[6]+1], LabToYF_b[v_lv1[7]+1]);
-
-        v_int16x8 adiv0, adiv1, bdiv0, bdiv1;
-        v_uint16x8 av0, av1, bv0, bv1;
-        v_expand(av, av0, av1); v_expand(bv, bv0, bv1);
+        v_int16 adiv0, adiv1, bdiv0, bdiv1;
+        v_uint16 a0, a1, b0, b1;
+        v_expand(a, a0, a1); v_expand(b, b0, b1);
         //adiv = aa*BASE/500 - 128*BASE/500, bdiv = bb*BASE/200 - 128*BASE/200;
         //approximations with reasonable precision
-        v_uint16x8 mulA = v_setall_u16(53687);
-        v_uint32x4 ma00, ma01, ma10, ma11;
-        v_uint32x4 addA = v_setall_u32(1 << 7);
-        v_mul_expand((av0 + (av0 << 2)), mulA, ma00, ma01);
-        v_mul_expand((av1 + (av1 << 2)), mulA, ma10, ma11);
-        adiv0 = v_reinterpret_as_s16(v_pack(((ma00 + addA) >> 13), ((ma01 + addA) >> 13)));
-        adiv1 = v_reinterpret_as_s16(v_pack(((ma10 + addA) >> 13), ((ma11 + addA) >> 13)));
+        v_uint16 mulA = vx_setall_u16(53687);
+        v_uint32 ma[4];
+        v_uint32 addA = vx_setall_u32(1 << 7);
+        v_mul_expand((a0 + (a0 << 2)), mulA, ma[0], ma[1]);
+        v_mul_expand((a1 + (a1 << 2)), mulA, ma[2], ma[3]);
+        adiv0 = v_reinterpret_as_s16(v_pack(((ma[0] + addA) >> 13), ((ma[1] + addA) >> 13)));
+        adiv1 = v_reinterpret_as_s16(v_pack(((ma[2] + addA) >> 13), ((ma[3] + addA) >> 13)));
 
-        v_uint16x8 mulB = v_setall_u16(41943);
-        v_uint32x4 mb00, mb01, mb10, mb11;
-        v_uint32x4 addB = v_setall_u32(1 << 4);
-        v_mul_expand(bv0, mulB, mb00, mb01);
-        v_mul_expand(bv1, mulB, mb10, mb11);
-        bdiv0 = v_reinterpret_as_s16(v_pack((mb00 + addB) >> 9, (mb01 + addB) >> 9));
-        bdiv1 = v_reinterpret_as_s16(v_pack((mb10 + addB) >> 9, (mb11 + addB) >> 9));
+        v_uint16 mulB = vx_setall_u16(41943);
+        v_uint32 mb[4];
+        v_uint32 addB = vx_setall_u32(1 << 4);
+        v_mul_expand(b0, mulB, mb[0], mb[1]);
+        v_mul_expand(b1, mulB, mb[2], mb[3]);
+        bdiv0 = v_reinterpret_as_s16(v_pack((mb[0] + addB) >> 9, (mb[1] + addB) >> 9));
+        bdiv1 = v_reinterpret_as_s16(v_pack((mb[2] + addB) >> 9, (mb[3] + addB) >> 9));
 
         // 0 <= adiv <= 8356, 0 <= bdiv <= 20890
         /* x = ifxz[0]; y = y; z = ifxz[1]; */
-        v_uint16x8 xiv0, xiv1, ziv0, ziv1;
-        v_int16x8 vSubA = v_setall_s16(-128*BASE/500 - minABvalue), vSubB = v_setall_s16(128*BASE/200-1 - minABvalue);
+        v_uint16 xiv0, xiv1, ziv0, ziv1;
+        v_int16 vSubA = vx_setall_s16(-128*BASE/500 - minABvalue), vSubB = vx_setall_s16(128*BASE/200-1 - minABvalue);
 
         // int ifxz[] = {ify + adiv, ify - bdiv};
         // ifxz[k] = abToXZ_b[ifxz[k]-minABvalue];
@@ -2333,214 +2415,131 @@ struct Lab2RGBinteger
         ziv0 = v_reinterpret_as_u16(v_add_wrap(v_sub_wrap(ify0, bdiv0), vSubB));
         ziv1 = v_reinterpret_as_u16(v_add_wrap(v_sub_wrap(ify1, bdiv1), vSubB));
 
-        uint16_t CV_DECL_ALIGNED(16) v_x0[8], v_x1[8], v_z0[8], v_z1[8];
-        v_store_aligned(v_x0, xiv0 ); v_store_aligned(v_x1, xiv1 );
-        v_store_aligned(v_z0, ziv0 ); v_store_aligned(v_z1, ziv1 );
+        v_uint32 uxiv[4], uziv[4];
+        v_expand(xiv0, uxiv[0], uxiv[1]);
+        v_expand(xiv1, uxiv[2], uxiv[3]);
+        v_expand(ziv0, uziv[0], uziv[1]);
+        v_expand(ziv1, uziv[2], uziv[3]);
 
-        xiv00 = v_int32x4(abToXZ_b[v_x0[0]], abToXZ_b[v_x0[1]], abToXZ_b[v_x0[2]], abToXZ_b[v_x0[3]]);
-        xiv01 = v_int32x4(abToXZ_b[v_x0[4]], abToXZ_b[v_x0[5]], abToXZ_b[v_x0[6]], abToXZ_b[v_x0[7]]);
-        xiv10 = v_int32x4(abToXZ_b[v_x1[0]], abToXZ_b[v_x1[1]], abToXZ_b[v_x1[2]], abToXZ_b[v_x1[3]]);
-        xiv11 = v_int32x4(abToXZ_b[v_x1[4]], abToXZ_b[v_x1[5]], abToXZ_b[v_x1[6]], abToXZ_b[v_x1[7]]);
-        ziv00 = v_int32x4(abToXZ_b[v_z0[0]], abToXZ_b[v_z0[1]], abToXZ_b[v_z0[2]], abToXZ_b[v_z0[3]]);
-        ziv01 = v_int32x4(abToXZ_b[v_z0[4]], abToXZ_b[v_z0[5]], abToXZ_b[v_z0[6]], abToXZ_b[v_z0[7]]);
-        ziv10 = v_int32x4(abToXZ_b[v_z1[0]], abToXZ_b[v_z1[1]], abToXZ_b[v_z1[2]], abToXZ_b[v_z1[3]]);
-        ziv11 = v_int32x4(abToXZ_b[v_z1[4]], abToXZ_b[v_z1[5]], abToXZ_b[v_z1[6]], abToXZ_b[v_z1[7]]);
+        for(int k = 0; k < 4; k++)
+        {
+            xiv[k] = v_lut(abToXZ_b, v_reinterpret_as_s32(uxiv[k]));
+            ziv[k] = v_lut(abToXZ_b, v_reinterpret_as_s32(uziv[k]));
+        }
         // abToXZ_b[i-minABvalue] = v; // -1335 <= v <= 88231
     }
-
-    void operator()(const float* src, float* dst, int n) const
-    {
-        int dcn = dstcn;
-        float alpha = ColorChannel<float>::max();
-
-        int i = 0;
-
-#if CV_SIMD128
-        if(enablePackedLab)
-        {
-            v_float32x4 vldiv  = v_setall_f32(256.f/100.0f);
-            v_float32x4 vf255  = v_setall_f32(255.f);
-            static const int nPixels = 16;
-            for(; i <= n*3-3*nPixels; i += 3*nPixels, dst += dcn*nPixels)
-            {
-                /*
-                int L = saturate_cast<int>(src[i]*BASE/100.0f);
-                int a = saturate_cast<int>(src[i + 1]*BASE/256);
-                int b = saturate_cast<int>(src[i + 2]*BASE/256);
-                */
-                v_float32x4 vl[4], va[4], vb[4];
-                for(int k = 0; k < 4; k++)
-                {
-                    v_load_deinterleave(src + i + k*3*4, vl[k], va[k], vb[k]);
-                    vl[k] *= vldiv;
-                }
-
-                v_int32x4 ivl[4], iva[4], ivb[4];
-                for(int k = 0; k < 4; k++)
-                {
-                    ivl[k] = v_round(vl[k]), iva[k] = v_round(va[k]), ivb[k] = v_round(vb[k]);
-                }
-                v_int16x8 ivl16[2], iva16[2], ivb16[2];
-                ivl16[0] = v_pack(ivl[0], ivl[1]); iva16[0] = v_pack(iva[0], iva[1]); ivb16[0] = v_pack(ivb[0], ivb[1]);
-                ivl16[1] = v_pack(ivl[2], ivl[3]); iva16[1] = v_pack(iva[2], iva[3]); ivb16[1] = v_pack(ivb[2], ivb[3]);
-                v_uint8x16 ivl8, iva8, ivb8;
-                ivl8 = v_reinterpret_as_u8(v_pack(ivl16[0], ivl16[1]));
-                iva8 = v_reinterpret_as_u8(v_pack(iva16[0], iva16[1]));
-                ivb8 = v_reinterpret_as_u8(v_pack(ivb16[0], ivb16[1]));
-
-                v_int32x4 ixv[4], iyv[4], izv[4];
-
-                processLabToXYZ(ivl8, iva8, ivb8, ixv[0], iyv[0], izv[0],
-                                                  ixv[1], iyv[1], izv[1],
-                                                  ixv[2], iyv[2], izv[2],
-                                                  ixv[3], iyv[3], izv[3]);
-                /*
-                ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift);
-                go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift);
-                bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift);
-                */
-                v_int32x4 C0 = v_setall_s32(coeffs[0]), C1 = v_setall_s32(coeffs[1]), C2 = v_setall_s32(coeffs[2]);
-                v_int32x4 C3 = v_setall_s32(coeffs[3]), C4 = v_setall_s32(coeffs[4]), C5 = v_setall_s32(coeffs[5]);
-                v_int32x4 C6 = v_setall_s32(coeffs[6]), C7 = v_setall_s32(coeffs[7]), C8 = v_setall_s32(coeffs[8]);
-                v_int32x4 descaleShift = v_setall_s32(1 << (shift-1)), tabsz = v_setall_s32((int)INV_GAMMA_TAB_SIZE-1);
-                for(int k = 0; k < 4; k++)
-                {
-                    v_int32x4 i_r, i_g, i_b;
-                    v_uint32x4 r_vecs, g_vecs, b_vecs;
-                    i_r = (ixv[k]*C0 + iyv[k]*C1 + izv[k]*C2 + descaleShift) >> shift;
-                    i_g = (ixv[k]*C3 + iyv[k]*C4 + izv[k]*C5 + descaleShift) >> shift;
-                    i_b = (ixv[k]*C6 + iyv[k]*C7 + izv[k]*C8 + descaleShift) >> shift;
-
-                    //limit indices in table and then substitute
-                    //ro = tab[ro]; go = tab[go]; bo = tab[bo];
-                    int32_t CV_DECL_ALIGNED(16) rshifts[4], gshifts[4], bshifts[4];
-                    v_int32x4 rs = v_max(v_setzero_s32(), v_min(tabsz, i_r));
-                    v_int32x4 gs = v_max(v_setzero_s32(), v_min(tabsz, i_g));
-                    v_int32x4 bs = v_max(v_setzero_s32(), v_min(tabsz, i_b));
-
-                    v_store_aligned(rshifts, rs);
-                    v_store_aligned(gshifts, gs);
-                    v_store_aligned(bshifts, bs);
-
-                    r_vecs = v_uint32x4(tab[rshifts[0]], tab[rshifts[1]], tab[rshifts[2]], tab[rshifts[3]]);
-                    g_vecs = v_uint32x4(tab[gshifts[0]], tab[gshifts[1]], tab[gshifts[2]], tab[gshifts[3]]);
-                    b_vecs = v_uint32x4(tab[bshifts[0]], tab[bshifts[1]], tab[bshifts[2]], tab[bshifts[3]]);
-
-                    v_float32x4 v_r, v_g, v_b;
-                    v_r = v_cvt_f32(v_reinterpret_as_s32(r_vecs))/vf255;
-                    v_g = v_cvt_f32(v_reinterpret_as_s32(g_vecs))/vf255;
-                    v_b = v_cvt_f32(v_reinterpret_as_s32(b_vecs))/vf255;
-
-                    if(dcn == 4)
-                    {
-                        v_store_interleave(dst + k*dcn*4, v_b, v_g, v_r, v_setall_f32(alpha));
-                    }
-                    else // dcn == 3
-                    {
-                        v_store_interleave(dst + k*dcn*4, v_b, v_g, v_r);
-                    }
-                }
-            }
-        }
 #endif
 
-        for(; i < n*3; i += 3, dst += dcn)
-        {
-            int ro, go, bo;
-            process((uchar)(src[i + 0]*255.f/100.f), (uchar)src[i + 1], (uchar)src[i + 2], ro, go, bo);
-
-            dst[0] = bo/255.f;
-            dst[1] = go/255.f;
-            dst[2] = ro/255.f;
-            if(dcn == 4)
-                dst[3] = alpha;
-        }
-    }
-
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int i, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
+
         i = 0;
 
-#if CV_SIMD128
+#if CV_SIMD
         if(enablePackedLab)
         {
-            static const int nPixels = 8*2;
-            for(; i <= n*3-3*nPixels; i += 3*nPixels, dst += dcn*nPixels)
+            bool srgb = issRGB;
+            ushort* tab = sRGBInvGammaTab_b;
+            const int vsize = v_uint8::nlanes;
+            v_uint8 valpha = vx_setall_u8(alpha);
+            v_int32 vc[9];
+            for(int k = 0; k < 9; k++)
             {
-                /*
-                    int L = src[i + 0];
-                    int a = src[i + 1];
-                    int b = src[i + 2];
-                */
-                v_uint8x16 u8l, u8a, u8b;
-                v_load_deinterleave(src + i, u8l, u8a, u8b);
+                vc[k] = vx_setall_s32(coeffs[k]);
+            }
+            const int descaleShift = 1 << (shift-1);
+            v_int32 vdescale = vx_setall_s32(descaleShift);
+            for ( ; i <= n-vsize;
+                  i += vsize, src += 3*vsize, dst += dcn*vsize)
+            {
+                v_uint8 l, a, b;
+                v_load_deinterleave(src, l, a, b);
 
-                v_int32x4 xiv[4], yiv[4], ziv[4];
-                processLabToXYZ(u8l, u8a, u8b, xiv[0], yiv[0], ziv[0],
-                                               xiv[1], yiv[1], ziv[1],
-                                               xiv[2], yiv[2], ziv[2],
-                                               xiv[3], yiv[3], ziv[3]);
-                /*
-                        ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift);
-                        go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift);
-                        bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift);
-                */
-                v_int32x4 C0 = v_setall_s32(coeffs[0]), C1 = v_setall_s32(coeffs[1]), C2 = v_setall_s32(coeffs[2]);
-                v_int32x4 C3 = v_setall_s32(coeffs[3]), C4 = v_setall_s32(coeffs[4]), C5 = v_setall_s32(coeffs[5]);
-                v_int32x4 C6 = v_setall_s32(coeffs[6]), C7 = v_setall_s32(coeffs[7]), C8 = v_setall_s32(coeffs[8]);
-                v_int32x4 descaleShift = v_setall_s32(1 << (shift-1));
-                v_int32x4 tabsz = v_setall_s32((int)INV_GAMMA_TAB_SIZE-1);
-                v_uint32x4 r_vecs[4], g_vecs[4], b_vecs[4];
+                v_int32 xq[4], yq[4], zq[4];
+                processLabToXYZ(l, a, b, xq, yq, zq);
+
+                // x, y, z exceed 2^16 so we cannot do v_mul_expand or v_dotprod
+                v_int32 rq[4], gq[4], bq[4];
                 for(int k = 0; k < 4; k++)
                 {
-                    v_int32x4 i_r, i_g, i_b;
-                    i_r = (xiv[k]*C0 + yiv[k]*C1 + ziv[k]*C2 + descaleShift) >> shift;
-                    i_g = (xiv[k]*C3 + yiv[k]*C4 + ziv[k]*C5 + descaleShift) >> shift;
-                    i_b = (xiv[k]*C6 + yiv[k]*C7 + ziv[k]*C8 + descaleShift) >> shift;
-
-                    //limit indices in table and then substitute
-                    //ro = tab[ro]; go = tab[go]; bo = tab[bo];
-                    int32_t CV_DECL_ALIGNED(16) rshifts[4], gshifts[4], bshifts[4];
-                    v_int32x4 rs = v_max(v_setzero_s32(), v_min(tabsz, i_r));
-                    v_int32x4 gs = v_max(v_setzero_s32(), v_min(tabsz, i_g));
-                    v_int32x4 bs = v_max(v_setzero_s32(), v_min(tabsz, i_b));
-
-                    v_store_aligned(rshifts, rs);
-                    v_store_aligned(gshifts, gs);
-                    v_store_aligned(bshifts, bs);
-
-                    r_vecs[k] = v_uint32x4(tab[rshifts[0]], tab[rshifts[1]], tab[rshifts[2]], tab[rshifts[3]]);
-                    g_vecs[k] = v_uint32x4(tab[gshifts[0]], tab[gshifts[1]], tab[gshifts[2]], tab[gshifts[3]]);
-                    b_vecs[k] = v_uint32x4(tab[bshifts[0]], tab[bshifts[1]], tab[bshifts[2]], tab[bshifts[3]]);
+                    rq[k] = (vc[0] * xq[k] + vc[1] * yq[k] + vc[2] * zq[k] + vdescale) >> shift;
+                    gq[k] = (vc[3] * xq[k] + vc[4] * yq[k] + vc[5] * zq[k] + vdescale) >> shift;
+                    bq[k] = (vc[6] * xq[k] + vc[7] * yq[k] + vc[8] * zq[k] + vdescale) >> shift;
                 }
 
-                v_uint16x8 u_rvec0 = v_pack(r_vecs[0], r_vecs[1]), u_rvec1 = v_pack(r_vecs[2], r_vecs[3]);
-                v_uint16x8 u_gvec0 = v_pack(g_vecs[0], g_vecs[1]), u_gvec1 = v_pack(g_vecs[2], g_vecs[3]);
-                v_uint16x8 u_bvec0 = v_pack(b_vecs[0], b_vecs[1]), u_bvec1 = v_pack(b_vecs[2], b_vecs[3]);
-
-                v_uint8x16 u8_b, u8_g, u8_r;
-                u8_b = v_pack(u_bvec0, u_bvec1);
-                u8_g = v_pack(u_gvec0, u_gvec1);
-                u8_r = v_pack(u_rvec0, u_rvec1);
-
-                if(dcn == 4)
+                //limit indices in table and then substitute
+                //ro = tab[ro]; go = tab[go]; bo = tab[bo];
+                v_int32 z = vx_setzero_s32(), up = vx_setall_s32((int)INV_GAMMA_TAB_SIZE-1);
+                for (int k = 0; k < 4; k++)
                 {
-                    v_store_interleave(dst, u8_b, u8_g, u8_r, v_setall_u8(alpha));
+                    rq[k] = v_max(z, v_min(up, rq[k]));
+                    gq[k] = v_max(z, v_min(up, gq[k]));
+                    bq[k] = v_max(z, v_min(up, bq[k]));
+                }
+
+                v_uint16 rgb[6];
+                if(srgb)
+                {
+                    // [RRR... , GGG... , BBB...]
+                    int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[vsize*3];
+                    for (int k = 0; k < 4; k++)
+                        v_store_aligned(vidx + 0*vsize + k*vsize/4, rq[k]);
+                    for (int k = 0; k < 4; k++)
+                        v_store_aligned(vidx + 1*vsize + k*vsize/4, gq[k]);
+                    for (int k = 0; k < 4; k++)
+                        v_store_aligned(vidx + 2*vsize + k*vsize/4, bq[k]);
+
+                    rgb[0] = vx_lut(tab, vidx + 0*vsize/2);
+                    rgb[1] = vx_lut(tab, vidx + 1*vsize/2);
+                    rgb[2] = vx_lut(tab, vidx + 2*vsize/2);
+                    rgb[3] = vx_lut(tab, vidx + 3*vsize/2);
+                    rgb[4] = vx_lut(tab, vidx + 4*vsize/2);
+                    rgb[5] = vx_lut(tab, vidx + 5*vsize/2);
                 }
                 else
                 {
-                    v_store_interleave(dst, u8_b, u8_g, u8_r);
+                    // rgb = (rgb*255) >> inv_gamma_shift
+                    for(int k = 0; k < 4; k++)
+                    {
+                        rq[k] = ((rq[k] << 8) - rq[k]) >> inv_gamma_shift;
+                        gq[k] = ((gq[k] << 8) - gq[k]) >> inv_gamma_shift;
+                        bq[k] = ((bq[k] << 8) - bq[k]) >> inv_gamma_shift;
+                    }
+                    rgb[0] = v_reinterpret_as_u16(v_pack(rq[0], rq[1]));
+                    rgb[1] = v_reinterpret_as_u16(v_pack(rq[2], rq[3]));
+                    rgb[2] = v_reinterpret_as_u16(v_pack(gq[0], gq[1]));
+                    rgb[3] = v_reinterpret_as_u16(v_pack(gq[2], gq[3]));
+                    rgb[4] = v_reinterpret_as_u16(v_pack(bq[0], bq[1]));
+                    rgb[5] = v_reinterpret_as_u16(v_pack(bq[2], bq[3]));
+                }
+
+                v_uint16 R0, R1, G0, G1, B0, B1;
+
+                v_uint8 R, G, B;
+                R = v_pack(rgb[0], rgb[1]);
+                G = v_pack(rgb[2], rgb[3]);
+                B = v_pack(rgb[4], rgb[5]);
+
+                if(dcn == 4)
+                {
+                    v_store_interleave(dst, B, G, R, valpha);
+                }
+                else // dcn == 3
+                {
+                    v_store_interleave(dst, B, G, R);
                 }
             }
         }
 #endif
 
-        for (; i < n*3; i += 3, dst += dcn)
+        for (; i < n; i++, src += 3, dst += dcn)
         {
             int ro, go, bo;
-            process(src[i + 0], src[i + 1], src[i + 2], ro, go, bo);
+            process(src[0], src[1], src[2], ro, go, bo);
 
             dst[0] = saturate_cast<uchar>(bo);
             dst[1] = saturate_cast<uchar>(go);
@@ -2552,7 +2551,7 @@ struct Lab2RGBinteger
 
     int dstcn;
     int coeffs[9];
-    ushort* tab;
+    bool issRGB;
 };
 
 
@@ -2582,63 +2581,12 @@ struct Lab2RGB_b
     Lab2RGB_b( int _dstcn, int _blueIdx, const float* _coeffs,
                const float* _whitept, bool _srgb )
     : fcvt(3, _blueIdx, _coeffs, _whitept, _srgb ), icvt(_dstcn, _blueIdx, _coeffs, _whitept, _srgb), dstcn(_dstcn)
-    {
-        #if CV_NEON
-        v_scale_inv = vdupq_n_f32(100.f/255.f);
-        v_scale = vdupq_n_f32(255.f);
-        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
-        v_128 = vdupq_n_f32(128.0f);
-        #elif CV_SSE2
-        v_scale = _mm_set1_ps(255.f);
-        v_alpha = _mm_set1_ps(ColorChannel<uchar>::max());
-        v_zero = _mm_setzero_si128();
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
-    }
-
-    #if CV_SSE2
-    // 16s x 8
-    void process(__m128i v_r, __m128i v_g, __m128i v_b,
-                 const __m128& v_coeffs_, const __m128& v_res_,
-                 float * buf) const
-    {
-        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
-        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
-        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
-
-        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
-        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
-        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
-
-        __m128 v_coeffs = v_coeffs_;
-        __m128 v_res = v_res_;
-
-        v_r0 = _mm_sub_ps(_mm_mul_ps(v_r0, v_coeffs), v_res);
-        v_g1 = _mm_sub_ps(_mm_mul_ps(v_g1, v_coeffs), v_res);
-
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
-        v_res = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_res), 0x49));
-
-        v_r1 = _mm_sub_ps(_mm_mul_ps(v_r1, v_coeffs), v_res);
-        v_b0 = _mm_sub_ps(_mm_mul_ps(v_b0, v_coeffs), v_res);
-
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
-        v_res = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_res), 0x49));
-
-        v_g0 = _mm_sub_ps(_mm_mul_ps(v_g0, v_coeffs), v_res);
-        v_b1 = _mm_sub_ps(_mm_mul_ps(v_b1, v_coeffs), v_res);
-
-        _mm_store_ps(buf, v_r0);
-        _mm_store_ps(buf + 4, v_r1);
-        _mm_store_ps(buf + 8, v_g0);
-        _mm_store_ps(buf + 12, v_g1);
-        _mm_store_ps(buf + 16, v_b0);
-        _mm_store_ps(buf + 20, v_b1);
-    }
-    #endif
+    { }
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         if(enableBitExactness)
         {
             icvt(src, dst, n);
@@ -2647,11 +2595,31 @@ struct Lab2RGB_b
 
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
+#if CV_SIMD
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[3*BLOCK_SIZE];
+#else
         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
-        #if CV_SSE2
-        __m128 v_coeffs = _mm_set_ps(100.f/255.f, 1.f, 1.f, 100.f/255.f);
-        __m128 v_res = _mm_set_ps(0.f, 128.f, 128.f, 0.f);
-        #endif
+#endif
+
+        static const softfloat fl = softfloat(100)/f255;
+
+#if CV_SIMD
+        const int fsize = v_float32::nlanes;
+        v_float32 vl = vx_setall_f32((float)fl);
+        v_float32 va = vx_setall_f32(1.f);
+        v_float32 vb = vx_setall_f32(1.f);
+        v_float32 vaLow = vx_setall_f32(-128.f), vbLow = vx_setall_f32(-128.f);
+        //TODO: fix that when v_interleave is available
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
+        v_store_interleave(interTmpM, vl, va, vb);
+        v_store_interleave(interTmpA, vx_setzero_f32(), vaLow, vbLow);
+        v_float32 mluv[3], aluv[3];
+        for(int k = 0; k < 3; k++)
+        {
+            mluv[k] = vx_load_aligned(interTmpM + k*fsize);
+            aluv[k] = vx_load_aligned(interTmpA + k*fsize);
+        }
+#endif
 
         i = 0;
         for(; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
@@ -2659,129 +2627,89 @@ struct Lab2RGB_b
             int dn = std::min(n - i, (int)BLOCK_SIZE);
             j = 0;
 
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24)
+#if CV_SIMD
+            const int vsize = v_uint8::nlanes;
+            for( ; j <= (dn - vsize)*3; j += 3*vsize )
             {
-                uint8x8x3_t v_src = vld3_u8(src + j);
-                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
-                           v_t1 = vmovl_u8(v_src.val[1]),
-                           v_t2 = vmovl_u8(v_src.val[2]);
+                v_uint8 s0, s1, s2;
+                s0 = vx_load(src + j + 0*vsize);
+                s1 = vx_load(src + j + 1*vsize);
+                s2 = vx_load(src + j + 2*vsize);
 
-                float32x4x3_t v_dst;
-                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
-                v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128);
-                v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128);
-                vst3q_f32(buf + j, v_dst);
-
-                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
-                v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128);
-                v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
-                vst3q_f32(buf + j + 12, v_dst);
-            }
-            #elif CV_SSE2
-            if (haveSIMD)
-            {
-                for ( ; j <= (dn - 8) * 3; j += 24)
+                v_uint16 ss[6];
+                v_expand(s0, ss[0], ss[1]);
+                v_expand(s1, ss[2], ss[3]);
+                v_expand(s2, ss[4], ss[5]);
+                v_int32 vs[12];
+                for(int k = 0; k < 6; k++)
                 {
-                    __m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
-                    __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
+                    v_expand(v_reinterpret_as_s16(ss[k]), vs[k*2+0], vs[k*2+1]);
+                }
 
-                    process(_mm_unpacklo_epi8(v_src0, v_zero),
-                            _mm_unpackhi_epi8(v_src0, v_zero),
-                            _mm_unpacklo_epi8(v_src1, v_zero),
-                            v_coeffs, v_res,
-                            buf + j);
+                for(int bufp = 0; bufp < 12; bufp++)
+                {   // vs[bufp] carries fsize consecutive floats: step by whole vectors so stores neither overlap nor lose alignment
+                    v_store_aligned(buf + j + bufp*fsize, v_muladd(v_cvt_f32(vs[bufp]), mluv[bufp%3], aluv[bufp%3]));
+                }
             }
-            #endif
+#endif
 
             for( ; j < dn*3; j += 3 )
             {
-                buf[j] = src[j]*(100.f/255.f);
-                buf[j+1] = (float)(src[j+1] - 128);
-                buf[j+2] = (float)(src[j+2] - 128);
+                buf[j] = src[j]*((float)fl);
+                buf[j+1] = (float)(src[j+1] - 128.f);
+                buf[j+2] = (float)(src[j+2] - 128.f);
             }
+
             fcvt(buf, buf, dn);
+
             j = 0;
 
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
+#if CV_SIMD
+            static const int nBlock = 4*fsize;
+            v_float32 v255 = vx_setall_f32(255.f);
+            if(dcn == 4)
             {
-                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
-                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
-                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
-                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
-                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
-                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
-                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
+                v_uint8 valpha = vx_setall_u8(alpha);
+                for( ; j <= (dn-nBlock)*3;
+                     j += nBlock*3, dst += nBlock*dcn) // nBlock pixels per pass; dcn == 4 bytes are written for each of them
+                {
+                    v_float32 vf[4*3];
+                    for(int k = 0; k < 4; k++)
+                    {   // group k de-interleaves the next fsize pixels, i.e. the next 3*fsize floats of buf
+                        v_load_deinterleave(buf + j + k*3*fsize, vf[k*3+0], vf[k*3+1], vf[k*3+2]);
+                    }
 
-                if (dcn == 4)
-                {
-                    uint8x8x4_t v_dst;
-                    v_dst.val[0] = v_dst0;
-                    v_dst.val[1] = v_dst1;
-                    v_dst.val[2] = v_dst2;
-                    v_dst.val[3] = v_alpha;
-                    vst4_u8(dst, v_dst);
-                }
-                else
-                {
-                    uint8x8x3_t v_dst;
-                    v_dst.val[0] = v_dst0;
-                    v_dst.val[1] = v_dst1;
-                    v_dst.val[2] = v_dst2;
-                    vst3_u8(dst, v_dst);
+                    v_int32 vi[4*3];
+                    for(int k = 0; k < 4*3; k++)
+                    {
+                        vi[k] = v_round(vf[k]*v255);
+                    }
+
+                    v_uint8 rgb[3];
+                    for(int k = 0; k < 3; k++)
+                    {
+                        rgb[k] = v_pack_u(v_pack(vi[0*3+k], vi[1*3+k]),
+                                          v_pack(vi[2*3+k], vi[3*3+k]));
+                    }
+
+                    v_store_interleave(dst, rgb[0], rgb[1], rgb[2], valpha);
                 }
             }
-            #elif CV_SSE2
-            if (dcn == 3 && haveSIMD)
+            else // dcn == 3
             {
-                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
+                for(; j <= (dn - nBlock)*3; j += nBlock*3, dst += nBlock*3) // whole pixels only, so the scalar tail resumes with j % 3 == 0
                 {
-                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
-                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
-                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
-                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
-
-                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
-                                                     _mm_cvtps_epi32(v_src1));
-                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
-                                                     _mm_cvtps_epi32(v_src3));
-
-                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
+                    v_int32 vi[4*3]; // 3*nBlock floats in, 3*nBlock bytes out per iteration
+                    for(int k = 0; k < 4*3; k++)
+                    {
+                        // every channel gets the same scale-and-round, so buf is converted sequentially
+                        vi[k] = v_round(vx_load_aligned(buf + j + k*fsize)*v255);
+                    }
+                    for(int k = 0; k < 3; k++)
+                        v_store(dst + k*nBlock, v_pack_u(v_pack(vi[k*4+0], vi[k*4+1]), v_pack(vi[k*4+2], vi[k*4+3])));
                 }
-
-                int jr = j % 3;
-                if (jr)
-                    dst -= jr, j -= jr;
             }
-            else if (dcn == 4 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 12); j += 12, dst += 16)
-                {
-                    __m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
-                    __m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
-                    __m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
-
-                    __m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha);
-                    __m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha);
-
-                    __m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44));
-                    __m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78);
-                    __m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e));
-                    __m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78);
-
-                    __m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1);
-                    __m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3);
-
-                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
-                }
-
-                int jr = j % 3;
-                if (jr)
-                    dst -= jr, j -= jr;
-            }
-            #endif
+#endif
 
             for( ; j < dn*3; j += 3, dst += dcn )
             {
@@ -2796,15 +2724,6 @@ struct Lab2RGB_b
 
     Lab2RGBfloat   fcvt;
     Lab2RGBinteger icvt;
-    #if CV_NEON
-    float32x4_t v_scale, v_scale_inv, v_128;
-    uint8x8_t v_alpha;
-    #elif CV_SSE2
-    __m128 v_scale;
-    __m128 v_alpha;
-    __m128i v_zero;
-    bool haveSIMD;
-    #endif
     int dstcn;
 };
 
@@ -2818,17 +2737,16 @@ struct RGB2Luvfloat
                const float* whitept, bool _srgb )
     : srccn(_srccn), srgb(_srgb)
     {
-        volatile int i;
         initLabTabs();
 
         softdouble whitePt[3];
-        for( i = 0; i < 3; i++ )
+        for(int i = 0; i < 3; i++ )
             if(whitept)
                 whitePt[i] = softdouble(whitept[i]);
             else
                 whitePt[i] = D65[i];
 
-        for( i = 0; i < 3; i++ )
+        for(int i = 0; i < 3; i++ )
         {
             for(int j = 0; j < 3; j++)
                 if(_coeffs)
@@ -2851,241 +2769,105 @@ struct RGB2Luvfloat
         un = d*softfloat(13*4)*whitePt[0];
         vn = d*softfloat(13*9)*whitePt[1];
 
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
-
         CV_Assert(whitePt[1] == softdouble::one());
     }
 
-    #if CV_NEON
-    void process(float32x4x3_t& v_src) const
-    {
-        float32x4_t v_x = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], vdupq_n_f32(coeffs[0])), v_src.val[1], vdupq_n_f32(coeffs[1])), v_src.val[2], vdupq_n_f32(coeffs[2]));
-        float32x4_t v_y = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], vdupq_n_f32(coeffs[3])), v_src.val[1], vdupq_n_f32(coeffs[4])), v_src.val[2], vdupq_n_f32(coeffs[5]));
-        float32x4_t v_z = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], vdupq_n_f32(coeffs[6])), v_src.val[1], vdupq_n_f32(coeffs[7])), v_src.val[2], vdupq_n_f32(coeffs[8]));
-
-        v_src.val[0] = vmulq_f32(v_y, vdupq_n_f32(LabCbrtTabScale));
-        splineInterpolate(v_src.val[0], LabCbrtTab, LAB_CBRT_TAB_SIZE);
-
-        v_src.val[0] = vmlaq_f32(vdupq_n_f32(-16.f), v_src.val[0], vdupq_n_f32(116.f));
-
-        float32x4_t v_div = vmaxq_f32(vmlaq_f32(vmlaq_f32(v_x, vdupq_n_f32(15.f), v_y), vdupq_n_f32(3.f), v_z), vdupq_n_f32(FLT_EPSILON));
-        float32x4_t v_reciprocal = vrecpeq_f32(v_div);
-        v_reciprocal = vmulq_f32(vrecpsq_f32(v_div, v_reciprocal), v_reciprocal);
-        v_reciprocal = vmulq_f32(vrecpsq_f32(v_div, v_reciprocal), v_reciprocal);
-        float32x4_t v_d = vmulq_f32(vdupq_n_f32(52.f), v_reciprocal);
-
-        v_src.val[1] = vmulq_f32(v_src.val[0], vmlaq_f32(vdupq_n_f32(-un), v_x, v_d));
-        v_src.val[2] = vmulq_f32(v_src.val[0], vmlaq_f32(vdupq_n_f32(-vn), vmulq_f32(vdupq_n_f32(2.25f), v_y), v_d));
-    }
-    #elif CV_SSE2
-    void process(__m128& v_r0, __m128& v_r1, __m128& v_g0,
-                 __m128& v_g1, __m128& v_b0, __m128& v_b1) const
-    {
-        __m128 v_x0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[0]));
-        __m128 v_x1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[0]));
-        __m128 v_y0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[3]));
-        __m128 v_y1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[3]));
-        __m128 v_z0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[6]));
-        __m128 v_z1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[6]));
-
-        v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[1])));
-        v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[1])));
-        v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[4])));
-        v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[4])));
-        v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[7])));
-        v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[7])));
-
-        v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[2])));
-        v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[2])));
-        v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[5])));
-        v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[5])));
-        v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[8])));
-        v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[8])));
-
-        __m128 v_l0 = _mm_mul_ps(v_y0, _mm_set1_ps(LabCbrtTabScale));
-        __m128 v_l1 = _mm_mul_ps(v_y1, _mm_set1_ps(LabCbrtTabScale));
-        splineInterpolate(v_l0, LabCbrtTab, LAB_CBRT_TAB_SIZE);
-        splineInterpolate(v_l1, LabCbrtTab, LAB_CBRT_TAB_SIZE);
-
-        v_l0 = _mm_mul_ps(v_l0, _mm_set1_ps(116.0f));
-        v_l1 = _mm_mul_ps(v_l1, _mm_set1_ps(116.0f));
-        v_r0 = _mm_sub_ps(v_l0, _mm_set1_ps(16.0f));
-        v_r1 = _mm_sub_ps(v_l1, _mm_set1_ps(16.0f));
-
-        v_z0 = _mm_mul_ps(v_z0, _mm_set1_ps(3.0f));
-        v_z1 = _mm_mul_ps(v_z1, _mm_set1_ps(3.0f));
-        v_z0 = _mm_add_ps(v_z0, v_x0);
-        v_z1 = _mm_add_ps(v_z1, v_x1);
-        v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_y0, _mm_set1_ps(15.0f)));
-        v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_y1, _mm_set1_ps(15.0f)));
-        v_z0 = _mm_max_ps(v_z0, _mm_set1_ps(FLT_EPSILON));
-        v_z1 = _mm_max_ps(v_z1, _mm_set1_ps(FLT_EPSILON));
-        __m128 v_d0 = _mm_div_ps(_mm_set1_ps(52.0f), v_z0);
-        __m128 v_d1 = _mm_div_ps(_mm_set1_ps(52.0f), v_z1);
-
-        v_x0 = _mm_mul_ps(v_x0, v_d0);
-        v_x1 = _mm_mul_ps(v_x1, v_d1);
-        v_x0 = _mm_sub_ps(v_x0, _mm_set1_ps(un));
-        v_x1 = _mm_sub_ps(v_x1, _mm_set1_ps(un));
-        v_g0 = _mm_mul_ps(v_x0, v_r0);
-        v_g1 = _mm_mul_ps(v_x1, v_r1);
-
-        v_y0 = _mm_mul_ps(v_y0, v_d0);
-        v_y1 = _mm_mul_ps(v_y1, v_d1);
-        v_y0 = _mm_mul_ps(v_y0, _mm_set1_ps(2.25f));
-        v_y1 = _mm_mul_ps(v_y1, _mm_set1_ps(2.25f));
-        v_y0 = _mm_sub_ps(v_y0, _mm_set1_ps(vn));
-        v_y1 = _mm_sub_ps(v_y1, _mm_set1_ps(vn));
-        v_b0 = _mm_mul_ps(v_y0, v_r0);
-        v_b1 = _mm_mul_ps(v_y1, v_r1);
-    }
-    #endif
-
     void operator()(const float* src, float* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int i = 0, scn = srccn;
         float gscale = GammaTabScale;
         const float* gammaTab = srgb ? sRGBGammaTab : 0;
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        n *= 3;
 
-        #if CV_NEON
-        if (scn == 3)
+#if CV_SIMD
+        const int vsize = v_float32::nlanes;
+        const int nrepeats = vsize == 4 ? 2 : 1;
+        for( ; i <= n-vsize*nrepeats;
+             i+= vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats)
         {
-            for( ; i <= n - 12; i += 12, src += scn * 4 )
+            v_float32 R[nrepeats], G[nrepeats], B[nrepeats], A;
+            if(scn == 4)
             {
-                float32x4x3_t v_src = vld3q_f32(src);
-
-                v_src.val[0] = vmaxq_f32(v_src.val[0], vdupq_n_f32(0));
-                v_src.val[1] = vmaxq_f32(v_src.val[1], vdupq_n_f32(0));
-                v_src.val[2] = vmaxq_f32(v_src.val[2], vdupq_n_f32(0));
-
-                v_src.val[0] = vminq_f32(v_src.val[0], vdupq_n_f32(1));
-                v_src.val[1] = vminq_f32(v_src.val[1], vdupq_n_f32(1));
-                v_src.val[2] = vminq_f32(v_src.val[2], vdupq_n_f32(1));
-
-                if( gammaTab )
+                for (int k = 0; k < nrepeats; k++)
                 {
-                    v_src.val[0] = vmulq_f32(v_src.val[0], vdupq_n_f32(gscale));
-                    v_src.val[1] = vmulq_f32(v_src.val[1], vdupq_n_f32(gscale));
-                    v_src.val[2] = vmulq_f32(v_src.val[2], vdupq_n_f32(gscale));
-                    splineInterpolate(v_src.val[0], gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_src.val[1], gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_src.val[2], gammaTab, GAMMA_TAB_SIZE);
+                    v_load_deinterleave(src + k*4*vsize, R[k], G[k], B[k], A);
+                }
+            }
+            else // scn == 3
+            {
+                for (int k = 0; k < nrepeats; k++)
+                {
+                    v_load_deinterleave(src + k*3*vsize, R[k], G[k], B[k]);
+                }
+            }
+
+            v_float32 zero = vx_setzero_f32(), one = vx_setall_f32(1.f);
+            for (int k = 0; k < nrepeats; k++)
+            {
+                R[k] = v_min(v_max(R[k], zero), one);
+                G[k] = v_min(v_max(G[k], zero), one);
+                B[k] = v_min(v_max(B[k], zero), one);
+            }
+
+            if(gammaTab)
+            {
+                v_float32 vgscale = vx_setall_f32(gscale);
+                for (int k = 0; k < nrepeats; k++)
+                {
+                    R[k] *= vgscale;
+                    G[k] *= vgscale;
+                    B[k] *= vgscale;
                 }
 
-                process(v_src);
+                for (int k = 0; k < nrepeats; k++)
+                {
+                    R[k] = splineInterpolate(R[k], gammaTab, GAMMA_TAB_SIZE);
+                    G[k] = splineInterpolate(G[k], gammaTab, GAMMA_TAB_SIZE);
+                    B[k] = splineInterpolate(B[k], gammaTab, GAMMA_TAB_SIZE);
+                }
+            }
 
-                vst3q_f32(dst + i, v_src);
+            v_float32 X[nrepeats], Y[nrepeats], Z[nrepeats];
+            v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
+            v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
+            v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
+            for (int k = 0; k < nrepeats; k++)
+            {
+                X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
+                Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
+                Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
+            }
+
+            v_float32 L[nrepeats], u[nrepeats], v[nrepeats];
+            v_float32 vmun = vx_setall_f32(-un), vmvn = vx_setall_f32(-vn);
+            for (int k = 0; k < nrepeats; k++)
+            {
+                L[k] = splineInterpolate(Y[k]*vx_setall_f32(LabCbrtTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                // L = 116.f*L - 16.f;
+                L[k] = v_fma(L[k], vx_setall_f32(116.f), vx_setall_f32(-16.f));
+
+                v_float32 d;
+                // d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON)
+                d = v_fma(Y[k], vx_setall_f32(15.f), v_fma(Z[k], vx_setall_f32(3.f), X[k]));
+                d = vx_setall_f32(4.f*13.f) / v_max(d, vx_setall_f32(FLT_EPSILON));
+                // u = L*(X*d - un)
+                u[k] = L[k]*v_fma(X[k], d, vmun);
+                // v = L*((9*0.25f)*Y*d - vn);
+                v[k] = L[k]*v_fma(vx_setall_f32(9.f*0.25f)*Y[k], d, vmvn);
+            }
+
+            for (int k = 0; k < nrepeats; k++)
+            {
+                v_store_interleave(dst + k*3*vsize, L[k], u[k], v[k]);
             }
         }
-        else
-        {
-            for( ; i <= n - 12; i += 12, src += scn * 4 )
-            {
-                float32x4x4_t v_src = vld4q_f32(src);
+#endif
 
-                v_src.val[0] = vmaxq_f32(v_src.val[0], vdupq_n_f32(0));
-                v_src.val[1] = vmaxq_f32(v_src.val[1], vdupq_n_f32(0));
-                v_src.val[2] = vmaxq_f32(v_src.val[2], vdupq_n_f32(0));
-
-                v_src.val[0] = vminq_f32(v_src.val[0], vdupq_n_f32(1));
-                v_src.val[1] = vminq_f32(v_src.val[1], vdupq_n_f32(1));
-                v_src.val[2] = vminq_f32(v_src.val[2], vdupq_n_f32(1));
-
-                if( gammaTab )
-                {
-                    v_src.val[0] = vmulq_f32(v_src.val[0], vdupq_n_f32(gscale));
-                    v_src.val[1] = vmulq_f32(v_src.val[1], vdupq_n_f32(gscale));
-                    v_src.val[2] = vmulq_f32(v_src.val[2], vdupq_n_f32(gscale));
-                    splineInterpolate(v_src.val[0], gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_src.val[1], gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_src.val[2], gammaTab, GAMMA_TAB_SIZE);
-                }
-
-                float32x4x3_t v_dst;
-                v_dst.val[0] = v_src.val[0];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[2] = v_src.val[2];
-                process(v_dst);
-
-                vst3q_f32(dst + i, v_dst);
-            }
-        }
-        #elif CV_SSE2
-        if (haveSIMD)
-        {
-            for( ; i <= n - 24; i += 24, src += scn * 8 )
-            {
-                __m128 v_r0 = _mm_loadu_ps(src +  0);
-                __m128 v_r1 = _mm_loadu_ps(src +  4);
-                __m128 v_g0 = _mm_loadu_ps(src +  8);
-                __m128 v_g1 = _mm_loadu_ps(src + 12);
-                __m128 v_b0 = _mm_loadu_ps(src + 16);
-                __m128 v_b1 = _mm_loadu_ps(src + 20);
-
-                if (scn == 3)
-                {
-                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
-                }
-                else
-                {
-                    __m128 v_a0 = _mm_loadu_ps(src + 24);
-                    __m128 v_a1 = _mm_loadu_ps(src + 28);
-
-                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
-                }
-
-                v_r0 = _mm_max_ps(v_r0, _mm_setzero_ps());
-                v_r1 = _mm_max_ps(v_r1, _mm_setzero_ps());
-                v_g0 = _mm_max_ps(v_g0, _mm_setzero_ps());
-                v_g1 = _mm_max_ps(v_g1, _mm_setzero_ps());
-                v_b0 = _mm_max_ps(v_b0, _mm_setzero_ps());
-                v_b1 = _mm_max_ps(v_b1, _mm_setzero_ps());
-
-                v_r0 = _mm_min_ps(v_r0, _mm_set1_ps(1.f));
-                v_r1 = _mm_min_ps(v_r1, _mm_set1_ps(1.f));
-                v_g0 = _mm_min_ps(v_g0, _mm_set1_ps(1.f));
-                v_g1 = _mm_min_ps(v_g1, _mm_set1_ps(1.f));
-                v_b0 = _mm_min_ps(v_b0, _mm_set1_ps(1.f));
-                v_b1 = _mm_min_ps(v_b1, _mm_set1_ps(1.f));
-
-                if ( gammaTab )
-                {
-                    __m128 v_gscale = _mm_set1_ps(gscale);
-                    v_r0 = _mm_mul_ps(v_r0, v_gscale);
-                    v_r1 = _mm_mul_ps(v_r1, v_gscale);
-                    v_g0 = _mm_mul_ps(v_g0, v_gscale);
-                    v_g1 = _mm_mul_ps(v_g1, v_gscale);
-                    v_b0 = _mm_mul_ps(v_b0, v_gscale);
-                    v_b1 = _mm_mul_ps(v_b1, v_gscale);
-
-                    splineInterpolate(v_r0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_r1, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_g0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_g1, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_b0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_b1, gammaTab, GAMMA_TAB_SIZE);
-                }
-
-                process(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
-
-                _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
-
-                _mm_storeu_ps(dst + i +  0, v_r0);
-                _mm_storeu_ps(dst + i +  4, v_r1);
-                _mm_storeu_ps(dst + i +  8, v_g0);
-                _mm_storeu_ps(dst + i + 12, v_g1);
-                _mm_storeu_ps(dst + i + 16, v_b0);
-                _mm_storeu_ps(dst + i + 20, v_b1);
-            }
-        }
-        #endif
-        for( ; i < n; i += 3, src += scn )
+        for( ; i < n; i++, src += scn, dst += 3 )
         {
             float R = src[0], G = src[1], B = src[2];
             R = std::min(std::max(R, 0.f), 1.f);
@@ -3109,16 +2891,13 @@ struct RGB2Luvfloat
             float u = L*(X*d - un);
             float v = L*((9*0.25f)*Y*d - vn);
 
-            dst[i] = L; dst[i+1] = u; dst[i+2] = v;
+            dst[0] = L; dst[1] = u; dst[2] = v;
         }
     }
 
     int srccn;
     float coeffs[9], un, vn;
     bool srgb;
-    #if CV_SSE2
-    bool haveSIMD;
-    #endif
 };
 
 struct RGB2Luv_f
@@ -3176,95 +2955,14 @@ struct Luv2RGBfloat
         d = softfloat::one()/max(d, softfloat::eps());
         un = softfloat(4*13)*d*whitePt[0];
         vn = softfloat(9*13)*d*whitePt[1];
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
 
         CV_Assert(whitePt[1] == softdouble::one());
     }
 
-    #if CV_SSE2
-    void process(__m128& v_l0, __m128& v_l1, __m128& v_u0,
-                 __m128& v_u1, __m128& v_v0, __m128& v_v1) const
-    {
-        // L*(3./29.)^3
-        __m128 v_y00 = _mm_mul_ps(v_l0, _mm_set1_ps(1.0f/903.3f));
-        __m128 v_y01 = _mm_mul_ps(v_l1, _mm_set1_ps(1.0f/903.3f));
-        // ((L + 16)/116)^3
-        __m128 v_y10 = _mm_mul_ps(_mm_add_ps(v_l0, _mm_set1_ps(16.0f)), _mm_set1_ps(1.f/116.f));
-        __m128 v_y11 = _mm_mul_ps(_mm_add_ps(v_l1, _mm_set1_ps(16.0f)), _mm_set1_ps(1.f/116.f));
-        v_y10 = _mm_mul_ps(_mm_mul_ps(v_y10, v_y10), v_y10);
-        v_y11 = _mm_mul_ps(_mm_mul_ps(v_y11, v_y11), v_y11);
-        // Y = (L <= 8) ? Y0 : Y1;
-        __m128 v_cmpl0 = _mm_cmplt_ps(v_l0, _mm_set1_ps(8.f));
-        __m128 v_cmpl1 = _mm_cmplt_ps(v_l1, _mm_set1_ps(8.f));
-        v_y00 = _mm_and_ps(v_cmpl0, v_y00);
-        v_y01 = _mm_and_ps(v_cmpl1, v_y01);
-        v_y10 = _mm_andnot_ps(v_cmpl0, v_y10);
-        v_y11 = _mm_andnot_ps(v_cmpl1, v_y11);
-        __m128 v_y0 = _mm_or_ps(v_y00, v_y10);
-        __m128 v_y1 = _mm_or_ps(v_y01, v_y11);
-        // up = 3*(u + L*_un);
-        __m128 v_up0 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_add_ps(v_u0, _mm_mul_ps(v_l0, _mm_set1_ps(un))));
-        __m128 v_up1 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_add_ps(v_u1, _mm_mul_ps(v_l1, _mm_set1_ps(un))));
-        // vp = 0.25/(v + L*_vn);
-        __m128 v_vp0 = _mm_div_ps(_mm_set1_ps(0.25f), _mm_add_ps(v_v0, _mm_mul_ps(v_l0, _mm_set1_ps(vn))));
-        __m128 v_vp1 = _mm_div_ps(_mm_set1_ps(0.25f), _mm_add_ps(v_v1, _mm_mul_ps(v_l1, _mm_set1_ps(vn))));
-        // vp = max(-0.25, min(0.25, vp));
-        v_vp0 = _mm_max_ps(v_vp0, _mm_set1_ps(-0.25f));
-        v_vp1 = _mm_max_ps(v_vp1, _mm_set1_ps(-0.25f));
-        v_vp0 = _mm_min_ps(v_vp0, _mm_set1_ps( 0.25f));
-        v_vp1 = _mm_min_ps(v_vp1, _mm_set1_ps( 0.25f));
-        //X = 3*up*vp; // (*Y) is done later
-        __m128 v_x0 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_mul_ps(v_up0, v_vp0));
-        __m128 v_x1 = _mm_mul_ps(_mm_set1_ps(3.f), _mm_mul_ps(v_up1, v_vp1));
-        //Z = ((12*13*L - up)*vp - 5); // (*Y) is done later
-        __m128 v_z0 = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_set1_ps(12.f*13.f), v_l0), v_up0), v_vp0), _mm_set1_ps(5.f));
-        __m128 v_z1 = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_set1_ps(12.f*13.f), v_l1), v_up1), v_vp1), _mm_set1_ps(5.f));
-
-        // R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done
-        v_l0 = _mm_mul_ps(v_x0, _mm_set1_ps(coeffs[0]));
-        v_l1 = _mm_mul_ps(v_x1, _mm_set1_ps(coeffs[0]));
-        v_u0 = _mm_mul_ps(v_x0, _mm_set1_ps(coeffs[3]));
-        v_u1 = _mm_mul_ps(v_x1, _mm_set1_ps(coeffs[3]));
-        v_v0 = _mm_mul_ps(v_x0, _mm_set1_ps(coeffs[6]));
-        v_v1 = _mm_mul_ps(v_x1, _mm_set1_ps(coeffs[6]));
-        v_l0 = _mm_add_ps(v_l0, _mm_set1_ps(coeffs[1]));
-        v_l1 = _mm_add_ps(v_l1, _mm_set1_ps(coeffs[1]));
-        v_u0 = _mm_add_ps(v_u0, _mm_set1_ps(coeffs[4]));
-        v_u1 = _mm_add_ps(v_u1, _mm_set1_ps(coeffs[4]));
-        v_v0 = _mm_add_ps(v_v0, _mm_set1_ps(coeffs[7]));
-        v_v1 = _mm_add_ps(v_v1, _mm_set1_ps(coeffs[7]));
-        v_l0 = _mm_add_ps(v_l0, _mm_mul_ps(v_z0, _mm_set1_ps(coeffs[2])));
-        v_l1 = _mm_add_ps(v_l1, _mm_mul_ps(v_z1, _mm_set1_ps(coeffs[2])));
-        v_u0 = _mm_add_ps(v_u0, _mm_mul_ps(v_z0, _mm_set1_ps(coeffs[5])));
-        v_u1 = _mm_add_ps(v_u1, _mm_mul_ps(v_z1, _mm_set1_ps(coeffs[5])));
-        v_v0 = _mm_add_ps(v_v0, _mm_mul_ps(v_z0, _mm_set1_ps(coeffs[8])));
-        v_v1 = _mm_add_ps(v_v1, _mm_mul_ps(v_z1, _mm_set1_ps(coeffs[8])));
-        v_l0 = _mm_mul_ps(v_l0, v_y0);
-        v_l1 = _mm_mul_ps(v_l1, v_y1);
-        v_u0 = _mm_mul_ps(v_u0, v_y0);
-        v_u1 = _mm_mul_ps(v_u1, v_y1);
-        v_v0 = _mm_mul_ps(v_v0, v_y0);
-        v_v1 = _mm_mul_ps(v_v1, v_y1);
-
-        v_l0 = _mm_max_ps(v_l0, _mm_setzero_ps());
-        v_l1 = _mm_max_ps(v_l1, _mm_setzero_ps());
-        v_u0 = _mm_max_ps(v_u0, _mm_setzero_ps());
-        v_u1 = _mm_max_ps(v_u1, _mm_setzero_ps());
-        v_v0 = _mm_max_ps(v_v0, _mm_setzero_ps());
-        v_v1 = _mm_max_ps(v_v1, _mm_setzero_ps());
-        v_l0 = _mm_min_ps(v_l0, _mm_set1_ps(1.f));
-        v_l1 = _mm_min_ps(v_l1, _mm_set1_ps(1.f));
-        v_u0 = _mm_min_ps(v_u0, _mm_set1_ps(1.f));
-        v_u1 = _mm_min_ps(v_u1, _mm_set1_ps(1.f));
-        v_v0 = _mm_min_ps(v_v0, _mm_set1_ps(1.f));
-        v_v1 = _mm_min_ps(v_v1, _mm_set1_ps(1.f));
-    }
-    #endif
-
     void operator()(const float* src, float* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int i = 0, dcn = dstcn;
         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
         float gscale = GammaTabScale;
@@ -3273,73 +2971,111 @@ struct Luv2RGBfloat
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
         float alpha = ColorChannel<float>::max();
         float _un = un, _vn = vn;
-        n *= 3;
 
-        #if CV_SSE2
-        if (haveSIMD)
+#if CV_SIMD
+        const int vsize = v_float32::nlanes;
+        const int nrepeats = vsize == 4 ? 2 : 1;
+        for( ; i <= n - vsize*nrepeats;
+             i += vsize*nrepeats, src += vsize*3*nrepeats, dst += dcn*vsize*nrepeats)
         {
-            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
+            v_float32 L[nrepeats], u[nrepeats], v[nrepeats];
+            for (int k = 0; k < nrepeats; k++)
             {
-                __m128 v_l0 = _mm_loadu_ps(src + i +  0);
-                __m128 v_l1 = _mm_loadu_ps(src + i +  4);
-                __m128 v_u0 = _mm_loadu_ps(src + i +  8);
-                __m128 v_u1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_v0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_v1 = _mm_loadu_ps(src + i + 20);
+                v_load_deinterleave(src + k*vsize*3, L[k], u[k], v[k]);
+            }
 
-                _mm_deinterleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
+            v_float32 X[nrepeats], Y[nrepeats], Z[nrepeats];
 
-                process(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
+            v_float32 v16 = vx_setall_f32(16.f);
+            v_float32 v116inv = vx_setall_f32(1.f/116.f);
+            v_float32 v903inv = vx_setall_f32(1.0f/903.296296f); //(3./29.)^3
+            for (int k = 0; k < nrepeats; k++)
+            {
+                v_float32 Ylo, Yhi;
 
-                if( gammaTab )
+                // ((L + 16)/116)^3
+                Ylo = (L[k] + v16) * v116inv;
+                Ylo = Ylo*Ylo*Ylo;
+                // L*(3./29.)^3
+                Yhi = L[k] * v903inv;
+
+                // Y = (L >= 8) ? Ylo : Yhi;
+                Y[k] = v_select(L[k] >= vx_setall_f32(8.f), Ylo, Yhi);
+            }
+
+            v_float32 v4inv = vx_setall_f32(0.25f), v3 = vx_setall_f32(3.f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                v_float32 up, vp;
+
+                // up = 3*(u + L*_un);
+                up = v3*(v_fma(L[k], vx_setall_f32(_un), u[k]));
+                // vp = 0.25/(v + L*_vn);
+                vp = v4inv/(v_fma(L[k], vx_setall_f32(_vn), v[k]));
+
+                // vp = max(-0.25, min(0.25, vp));
+                vp = v_max(vx_setall_f32(-0.25f), v_min(v4inv, vp));
+
+                //X = 3*up*vp; // (*Y) is done later
+                X[k] = v3*up*vp;
+                //Z = ((12*13*L - up)*vp - 5); // (*Y) is done later
+                // xor flips the sign, works like unary minus
+                Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (vx_setall_f32(-0.f) ^ up)), vp, vx_setall_f32(-5.f));
+            }
+
+            v_float32 R[nrepeats], G[nrepeats], B[nrepeats];
+            v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
+            v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
+            v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                // R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done
+                R[k] = v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1))*Y[k];
+                G[k] = v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4))*Y[k];
+                B[k] = v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7))*Y[k];
+            }
+
+            v_float32 vzero = vx_setzero_f32(), v1 = vx_setall_f32(1.f);
+            for(int k = 0; k < nrepeats; k++)
+            {
+                R[k] = v_min(v_max(R[k], vzero), v1);
+                G[k] = v_min(v_max(G[k], vzero), v1);
+                B[k] = v_min(v_max(B[k], vzero), v1);
+            }
+
+            if(gammaTab)
+            {
+                v_float32 vgscale = vx_setall_f32(gscale);
+                for(int k = 0; k < nrepeats; k++)
                 {
-                    __m128 v_gscale = _mm_set1_ps(gscale);
-                    v_l0 = _mm_mul_ps(v_l0, v_gscale);
-                    v_l1 = _mm_mul_ps(v_l1, v_gscale);
-                    v_u0 = _mm_mul_ps(v_u0, v_gscale);
-                    v_u1 = _mm_mul_ps(v_u1, v_gscale);
-                    v_v0 = _mm_mul_ps(v_v0, v_gscale);
-                    v_v1 = _mm_mul_ps(v_v1, v_gscale);
-                    splineInterpolate(v_l0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_l1, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_u0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_u1, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_v0, gammaTab, GAMMA_TAB_SIZE);
-                    splineInterpolate(v_v1, gammaTab, GAMMA_TAB_SIZE);
+                    R[k] *= vgscale;
+                    G[k] *= vgscale;
+                    B[k] *= vgscale;
                 }
-
-                if( dcn == 4 )
+                for(int k = 0; k < nrepeats; k++)
                 {
-                    __m128 v_a0 = _mm_set1_ps(alpha);
-                    __m128 v_a1 = _mm_set1_ps(alpha);
-                    _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1, v_a0, v_a1);
-
-                    _mm_storeu_ps(dst +  0, v_l0);
-                    _mm_storeu_ps(dst +  4, v_l1);
-                    _mm_storeu_ps(dst +  8, v_u0);
-                    _mm_storeu_ps(dst + 12, v_u1);
-                    _mm_storeu_ps(dst + 16, v_v0);
-                    _mm_storeu_ps(dst + 20, v_v1);
-                    _mm_storeu_ps(dst + 24, v_a0);
-                    _mm_storeu_ps(dst + 28, v_a1);
+                    R[k] = splineInterpolate(R[k], gammaTab, GAMMA_TAB_SIZE);
+                    G[k] = splineInterpolate(G[k], gammaTab, GAMMA_TAB_SIZE);
+                    B[k] = splineInterpolate(B[k], gammaTab, GAMMA_TAB_SIZE);
                 }
-                else
+            }
+            for(int k = 0; k < nrepeats; k++)
+            {
+                if(dcn == 4)
                 {
-                    _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
-
-                    _mm_storeu_ps(dst +  0, v_l0);
-                    _mm_storeu_ps(dst +  4, v_l1);
-                    _mm_storeu_ps(dst +  8, v_u0);
-                    _mm_storeu_ps(dst + 12, v_u1);
-                    _mm_storeu_ps(dst + 16, v_v0);
-                    _mm_storeu_ps(dst + 20, v_v1);
+                    v_store_interleave(dst + k*vsize*4, R[k], G[k], B[k], vx_setall_f32(alpha));
+                }
+                else // dcn == 3
+                {
+                    v_store_interleave(dst + k*vsize*3, R[k], G[k], B[k]);
                 }
             }
         }
-        #endif
-        for( ; i < n; i += 3, dst += dcn )
+#endif
+
+        for( ; i < n; i++, src += 3,  dst += dcn )
         {
-            float L = src[i], u = src[i+1], v = src[i+2], X, Y, Z;
+            float L = src[0], u = src[1], v = src[2], X, Y, Z;
             if(L >= 8)
             {
                 Y = (L + 16.f) * (1.f/116.f);
@@ -3380,9 +3116,6 @@ struct Luv2RGBfloat
     int dstcn;
     float coeffs[9], un, vn;
     bool srgb;
-    #if CV_SSE2
-    bool haveSIMD;
-    #endif
 };
 
 
@@ -3417,69 +3150,72 @@ struct RGB2Luvinterpolate
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int i, scn = srccn, bIdx = blueIdx;
 
         i = 0; n *= 3;
 
-#if CV_SIMD128
+#if CV_SIMD
         if(enablePackedRGB2Luv)
         {
-            static const int nPixels = 8*2;
+            const int vsize = v_uint16::nlanes;
+            static const int nPixels = vsize*2;
             for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
             {
                 /*
                     int R = src[bIdx], G = src[1], B = src[bIdx^2];
-                    */
-                v_uint8x16 r16, g16, b16, dummy16;
+                */
+                v_uint8 r, g, b, dummy;
                 if(scn == 3)
                 {
-                    v_load_deinterleave(src, r16, g16, b16);
+                    v_load_deinterleave(src, r, g, b);
                 }
                 else // scn == 4
                 {
-                    v_load_deinterleave(src, r16, g16, b16, dummy16);
+                    v_load_deinterleave(src, r, g, b, dummy);
                 }
 
                 if(bIdx)
                 {
-                    dummy16 = r16; r16 = b16; b16 = dummy16;
+                    swap(r, b);
                 }
 
                 /*
                     static const int baseDiv = LAB_BASE/256;
                     R = R*baseDiv, G = G*baseDiv, B = B*baseDiv;
-                    */
-                v_uint16x8 r80, r81, g80, g81, b80, b81;
-                v_expand(r16, r80, r81);
-                v_expand(g16, g80, g81);
-                v_expand(b16, b80, b81);
-                r80 = r80 << (lab_base_shift - 8); r81 = r81 << (lab_base_shift - 8);
-                g80 = g80 << (lab_base_shift - 8); g81 = g81 << (lab_base_shift - 8);
-                b80 = b80 << (lab_base_shift - 8); b81 = b81 << (lab_base_shift - 8);
+                */
+                v_uint16 r0, r1, g0, g1, b0, b1;
+                v_expand(r, r0, r1);
+                v_expand(g, g0, g1);
+                v_expand(b, b0, b1);
+                r0 = r0 << (lab_base_shift - 8); r1 = r1 << (lab_base_shift - 8);
+                g0 = g0 << (lab_base_shift - 8); g1 = g1 << (lab_base_shift - 8);
+                b0 = b0 << (lab_base_shift - 8); b1 = b1 << (lab_base_shift - 8);
 
                 /*
                     int L, u, v;
                     trilinearInterpolate(R, G, B, RGB2LuvLUT_s16, L, u, v);
-                    */
-                v_uint16x8 l80, u80, v80, l81, u81, v81;
-                trilinearPackedInterpolate(r80, g80, b80, LABLUVLUTs16.RGB2LuvLUT_s16, l80, u80, v80);
-                trilinearPackedInterpolate(r81, g81, b81, LABLUVLUTs16.RGB2LuvLUT_s16, l81, u81, v81);
+                 */
+                v_uint16 l0, u0, v0, l1, u1, v1;
+                trilinearPackedInterpolate(r0, g0, b0, LABLUVLUTs16.RGB2LuvLUT_s16, l0, u0, v0);
+                trilinearPackedInterpolate(r1, g1, b1, LABLUVLUTs16.RGB2LuvLUT_s16, l1, u1, v1);
 
                 /*
-                    dst[i] = saturate_cast<uchar>(L/baseDiv);
+                    dst[i]   = saturate_cast<uchar>(L/baseDiv);
                     dst[i+1] = saturate_cast<uchar>(u/baseDiv);
                     dst[i+2] = saturate_cast<uchar>(v/baseDiv);
-                    */
-                l80 = l80 >> (lab_base_shift - 8); l81 = l81 >> (lab_base_shift - 8);
-                u80 = u80 >> (lab_base_shift - 8); u81 = u81 >> (lab_base_shift - 8);
-                v80 = v80 >> (lab_base_shift - 8); v81 = v81 >> (lab_base_shift - 8);
-                v_uint8x16 l16 = v_pack(l80, l81);
-                v_uint8x16 u16 = v_pack(u80, u81);
-                v_uint8x16 v16 = v_pack(v80, v81);
-                v_store_interleave(dst + i, l16, u16, v16);
+                 */
+                l0 = l0 >> (lab_base_shift - 8); l1 = l1 >> (lab_base_shift - 8);
+                u0 = u0 >> (lab_base_shift - 8); u1 = u1 >> (lab_base_shift - 8);
+                v0 = v0 >> (lab_base_shift - 8); v1 = v1 >> (lab_base_shift - 8);
+                v_uint8 l = v_pack(l0, l1);
+                v_uint8 u = v_pack(u0, u1);
+                v_uint8 v = v_pack(v0, v1);
+                v_store_interleave(dst + i, l, u, v);
             }
         }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 
         for(; i < n; i += 3, src += scn)
         {
@@ -3506,60 +3242,24 @@ struct RGB2Luvinterpolate
 struct RGB2Luv_b
 {
     typedef uchar channel_type;
+    static const int bufChannels = 3;
 
     RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
                const float* _whitept, bool _srgb )
     : srccn(_srccn),
-      fcvt(3, blueIdx, _coeffs, _whitept, _srgb),
+      fcvt(bufChannels, blueIdx, _coeffs, _whitept, _srgb),
       icvt(_srccn, blueIdx, _coeffs, _whitept, _srgb)
     {
+        // interpolation is used for sRGB input only: for linear RGB its error reaches 8 (of 255)
         useInterpolation = (!_coeffs && !_whitept && _srgb
                             && enableBitExactness
                             && enableRGB2LuvInterpolation);
-
-        #if CV_NEON
-        v_scale_inv = vdupq_n_f32(softfloat::one()/f255);
-        v_scale = vdupq_n_f32(f255/softfloat(100));
-        v_coeff1 = vdupq_n_f32(f255/uRange);
-        v_coeff2 = vdupq_n_f32(-uLow*f255/uRange);
-        v_coeff3 = vdupq_n_f32(f255/vRange);
-        v_coeff4 = vdupq_n_f32(-vLow*f255/vRange);
-        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
-        #elif CV_SSE2
-        v_zero = _mm_setzero_si128();
-        v_scale_inv = _mm_set1_ps(softfloat::one()/f255);
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-        #endif
     }
 
-    #if CV_SSE2
-    void process(const float * buf,
-                 __m128 & v_coeffs, __m128 & v_res, uchar * dst) const
-    {
-        __m128 v_l0f = _mm_load_ps(buf);
-        __m128 v_l1f = _mm_load_ps(buf + 4);
-        __m128 v_u0f = _mm_load_ps(buf + 8);
-        __m128 v_u1f = _mm_load_ps(buf + 12);
-
-        v_l0f = _mm_add_ps(_mm_mul_ps(v_l0f, v_coeffs), v_res);
-        v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeffs), v_res);
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
-        v_res = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_res), 0x92));
-        v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeffs), v_res);
-        v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
-        v_res = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_res), 0x92));
-        v_l1f = _mm_add_ps(_mm_mul_ps(v_l1f, v_coeffs), v_res);
-
-        __m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
-        __m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
-        __m128i v_l0 = _mm_packus_epi16(v_l, v_u);
-
-        _mm_storeu_si128((__m128i *)(dst), v_l0);
-    }
-    #endif
-
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         if(useInterpolation)
         {
             icvt(src, dst, n);
@@ -3567,92 +3267,90 @@ struct RGB2Luv_b
         }
 
         int i, j, scn = srccn;
-        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
+#if CV_SIMD
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
+#else
+        float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
+#endif
 
-        #if CV_SSE2
-        __m128 v_coeffs = _mm_set_ps(f255/softfloat(100), f255/vRange, f255/uRange, f255/softfloat(100));
-        __m128 v_res = _mm_set_ps(0.f, -vLow*f255/vRange, -uLow*f255/uRange, 0.f);
-        #endif
+        static const softfloat fL = f255/softfloat(100);
+        static const softfloat fu = f255/uRange;
+        static const softfloat fv = f255/vRange;
+        static const softfloat su = -uLow*f255/uRange;
+        static const softfloat sv = -vLow*f255/vRange;
+#if CV_SIMD
+        const int fsize = v_float32::nlanes;
+        v_float32 ml = vx_setall_f32((float)fL), al = vx_setzero_f32();
+        v_float32 mu = vx_setall_f32((float)fu), au = vx_setall_f32((float)su);
+        v_float32 mv = vx_setall_f32((float)fv), av = vx_setall_f32((float)sv);
+        // TODO: build mluv/aluv directly once v_interleave is available (drops the store/reload via interTmpM/interTmpA)
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
+        v_store_interleave(interTmpM, ml, mu, mv);
+        v_store_interleave(interTmpA, al, au, av);
+        v_float32 mluv[3], aluv[3];
+        for(int k = 0; k < 3; k++)
+        {
+            mluv[k] = vx_load_aligned(interTmpM + k*fsize);
+            aluv[k] = vx_load_aligned(interTmpA + k*fsize);
+        }
+#endif
 
-        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
+        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*bufChannels )
         {
             int dn = std::min(n - i, (int)BLOCK_SIZE);
             j = 0;
 
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
-            {
-                uint16x8_t v_t0, v_t1, v_t2;
-
-                if (scn == 3)
-                {
-                    uint8x8x3_t v_src = vld3_u8(src);
-                    v_t0 = vmovl_u8(v_src.val[0]);
-                    v_t1 = vmovl_u8(v_src.val[1]);
-                    v_t2 = vmovl_u8(v_src.val[2]);
-                }
-                else
-                {
-                    uint8x8x4_t v_src = vld4_u8(src);
-                    v_t0 = vmovl_u8(v_src.val[0]);
-                    v_t1 = vmovl_u8(v_src.val[1]);
-                    v_t2 = vmovl_u8(v_src.val[2]);
-                }
-
-                float32x4x3_t v_dst;
-                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
-                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
-                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
-                vst3q_f32(buf + j, v_dst);
-
-                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
-                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
-                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
-                vst3q_f32(buf + j + 12, v_dst);
-            }
-            #elif CV_SSE2
-            if (scn == 3 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
-                {
-                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
-
-                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
-                    _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
-                    _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
-
-                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
-                    _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
-                    _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
-                }
-
-                int jr = j % 3;
-                if (jr)
-                    src -= jr, j -= jr;
-            }
-            else if (scn == 4 && haveSIMD)
-            {
-                for ( ; j <= (dn * 3 - 12); j += 12, src += 16)
-                {
-                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
-
-                    __m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero);
-                    __m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero);
-                    _mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv));
-                    _mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv));
-                    _mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv));
-                    float tmp = buf[j + 8];
-                    _mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv));
-                    buf[j + 8] = tmp;
-                }
-
-                int jr = j % 3;
-                if (jr)
-                    src -= jr, j -= jr;
-            }
-            #endif
             static const softfloat f255inv = softfloat::one()/f255;
-            for( ; j < dn*3; j += 3, src += scn )
+#if CV_SIMD
+            v_float32 v255inv = vx_setall_f32((float)f255inv);
+            if(scn == 4)
+            {
+                static const int nBlock = fsize*4;
+                for( ; j <= dn*bufChannels - nBlock*3;
+                     j += nBlock*3, src += nBlock*4)
+                {
+                    v_uint8 rgb[3], dummy;
+                    v_load_deinterleave(src, rgb[0], rgb[1], rgb[2], dummy);
+
+                    v_uint16 d[3*2];
+                    for(int k = 0; k < 3; k++)
+                    {
+                        v_expand(rgb[k], d[k*2+0], d[k*2+1]);
+                    }
+                    v_int32 q[3*4];
+                    for(int k = 0; k < 3*2; k++)
+                    {
+                        v_expand(v_reinterpret_as_s16(d[k]), q[k*2+0], q[k*2+1]);
+                    }
+
+                    v_float32 f[3*4];
+                    for(int k = 0; k < 3*4; k++)
+                    {
+                        f[k] = v_cvt_f32(q[k])*v255inv;
+                    }
+
+                    for(int k = 0; k < 4; k++)
+                    {
+                        v_store_interleave(buf + j + k*3*fsize, f[0*4+k], f[1*4+k], f[2*4+k]);
+                    }
+                }
+            }
+            else // scn == 3: source bytes are already packed, scale them by v255inv without deinterleaving
+            {
+                static const int nBlock = fsize*2;
+                for( ; j <= dn*bufChannels - nBlock; j += nBlock, src += nBlock)
+                {
+                    v_uint16 d = vx_load_expand(src);
+                    v_int32 q0, q1;
+                    v_expand(v_reinterpret_as_s16(d), q0, q1);
+                    v_store_aligned(buf + j + 0*fsize, v_cvt_f32(q0)*v255inv);
+                    v_store_aligned(buf + j + 1*fsize, v_cvt_f32(q1)*v255inv);
+                }
+                // nBlock is not a multiple of 3: step back to a pixel boundary so the scalar tail never reads past src
+                { const int jr = j % bufChannels; j -= jr; src -= jr; }
+            }
+#endif
+            for( ; j < dn*bufChannels; j += bufChannels, src += scn )
             {
                 buf[j  ] = (float)(src[0]*((float)f255inv));
                 buf[j+1] = (float)(src[1]*((float)f255inv));
@@ -3661,43 +3359,34 @@ struct RGB2Luv_b
             fcvt(buf, buf, dn);
 
             j = 0;
-            #if CV_NEON
-            for ( ; j <= (dn - 8) * 3; j += 24)
-            {
-                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
 
-                uint8x8x3_t v_dst;
-                v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
-                                                       vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
-                v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))),
-                                                       vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2)))));
-                v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))),
-                                                       vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4)))));
-
-                vst3_u8(dst + j, v_dst);
-            }
-            #elif CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD
+            for( ; j <= dn*3 - fsize*3*4; j += fsize*3*4)
             {
-                for ( ; j <= (dn - 16) * 3; j += 48)
+                v_float32 f[3*4];
+                for(int k = 0; k < 3*4; k++)
+                    f[k] = vx_load_aligned(buf + j + k*fsize);
+
+                for(int k = 0; k < 4; k++)
                 {
-                    process(buf + j,
-                            v_coeffs, v_res, dst + j);
+                    f[k*3+0] = v_fma(f[k*3+0], mluv[0], aluv[0]);
+                    f[k*3+1] = v_fma(f[k*3+1], mluv[1], aluv[1]);
+                    f[k*3+2] = v_fma(f[k*3+2], mluv[2], aluv[2]);
+                }
 
-                    process(buf + j + 16,
-                            v_coeffs, v_res, dst + j + 16);
+                v_int32 q[3*4];
+                for(int k = 0; k < 3*4; k++)
+                {
+                    q[k] = v_round(f[k]);
+                }
 
-                    process(buf + j + 32,
-                            v_coeffs, v_res, dst + j + 32);
+                for(int k = 0; k < 3; k++)
+                {
+                    v_store(dst + j + k*fsize*4, v_pack_u(v_pack(q[k*4+0], q[k*4+1]),
+                                                          v_pack(q[k*4+2], q[k*4+3])));
                 }
             }
-            #endif
-
-            static const softfloat fL = f255/softfloat(100);
-            static const softfloat fu = f255/uRange;
-            static const softfloat fv = f255/vRange;
-            static const softfloat su = -uLow*f255/uRange;
-            static const softfloat sv = -vLow*f255/vRange;
+#endif
             for( ; j < dn*3; j += 3 )
             {
                 dst[j] = saturate_cast<uchar>(buf[j]*(float)fL);
@@ -3711,14 +3400,6 @@ struct RGB2Luv_b
     RGB2Luvfloat fcvt;
     RGB2Luvinterpolate icvt;
 
-    #if CV_NEON
-    float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
-    uint8x8_t v_alpha;
-    #elif CV_SSE2
-    __m128 v_scale_inv;
-    __m128i v_zero;
-    bool haveSIMD;
-    #endif
     bool useInterpolation;
 };
 
@@ -3734,7 +3415,7 @@ struct Luv2RGBinteger
     // whitept is fixed for int calculations
     Luv2RGBinteger( int _dstcn, int blueIdx, const float* _coeffs,
                     const float* /*_whitept*/, bool _srgb )
-    : dstcn(_dstcn)
+    : dstcn(_dstcn), issRGB(_srgb)
     {
         initLabTabs();
 
@@ -3752,8 +3433,6 @@ struct Luv2RGBinteger
             coeffs[i+3]             = cvRound(lshift*c[1]);
             coeffs[i+(blueIdx^2)*3] = cvRound(lshift*c[2]);
         }
-
-        tab = _srgb ? sRGBInvGammaTab_b : linearInvGammaTab_b;
     }
 
     // L, u, v should be in their natural range
@@ -3766,8 +3445,8 @@ struct Luv2RGBinteger
         // vp: +/- 0.25*BASE*1024
         int up = LUVLUT.LuToUp_b[LL*256+uu];
         int vp = LUVLUT.LvToVp_b[LL*256+vv];
-        //X = y*3.f* up/((float)BASE/1024) *vp/((float)BASE*1024);
-        //Z = y*(((12.f*13.f)*((float)LL)*100.f/255.f - up/((float)BASE))*vp/((float)BASE*1024) - 5.f);
+        // X = y*3.f* up/((float)BASE/1024) *vp/((float)BASE*1024);
+        // Z = y*(((12.f*13.f)*((float)LL)*100.f/255.f - up/((float)BASE))*vp/((float)BASE*1024) - 5.f);
 
         long long int xv = ((int)up)*(long long)vp;
         int x = (int)(xv/BASE);
@@ -3795,116 +3474,269 @@ struct Luv2RGBinteger
         go = max(0, min((int)INV_GAMMA_TAB_SIZE-1, go));
         bo = max(0, min((int)INV_GAMMA_TAB_SIZE-1, bo));
 
-        ro = tab[ro];
-        go = tab[go];
-        bo = tab[bo];
+        if(issRGB)
+        {
+            ushort* tab = sRGBInvGammaTab_b;
+            ro = tab[ro];
+            go = tab[go];
+            bo = tab[bo];
+        }
+        else
+        {
+            // rgb = (rgb*255) >> inv_gamma_shift
+            ro = ((ro << 8) - ro) >> inv_gamma_shift;
+            go = ((go << 8) - go) >> inv_gamma_shift;
+            bo = ((bo << 8) - bo) >> inv_gamma_shift;
+        }
     }
 
-    inline void processLuvToXYZ(const v_uint8x16& lv, const v_uint8x16& uv, const v_uint8x16& vv,
-                                int32_t* xyz) const
+    inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
+                                v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
     {
-        uint8_t CV_DECL_ALIGNED(16) lvstore[16], uvstore[16], vvstore[16];
-        v_store_aligned(lvstore, lv); v_store_aligned(uvstore, uv); v_store_aligned(vvstore, vv);
+        const int vsize = v_uint8::nlanes;
 
-        for(int i = 0; i < 16; i++)
+        v_uint16 lv0, lv1;
+        v_expand(lv, lv0, lv1);
+        v_uint32 lq[4];
+        v_expand(lv0, lq[0], lq[1]);
+        v_expand(lv1, lq[2], lq[3]);
+
+        // y = LabToYF_b[LL*2];
+        // gather 32 bits at int index LL (shorts LL*2 and LL*2+1), then mask off the upper 16; NOTE(review): low half is LabToYF_b[LL*2] only on little-endian
+        v_int32 mask16 = vx_setall_s32(0xFFFF);
+        for(int k = 0; k < 4; k++)
         {
-            int LL = lvstore[i];
-            int u = uvstore[i];
-            int v = vvstore[i];
-            int y = LabToYF_b[LL*2];
+            y[k] = v_lut((const int*)LabToYF_b, v_reinterpret_as_s32(lq[k])) & mask16;
+        }
 
-            int up = LUVLUT.LuToUp_b[LL*256+u];
-            int vp = LUVLUT.LvToVp_b[LL*256+v];
+        v_int32 up[4], vp[4];
+        // int up = LUVLUT.LuToUp_b[LL*256+u];
+        // int vp = LUVLUT.LvToVp_b[LL*256+v];
+        v_uint16 uv0, uv1, vv0, vv1;
+        v_expand(uv, uv0, uv1);
+        v_expand(vv, vv0, vv1);
+        // LL*256
+        v_uint16 ll0, ll1;
+        ll0 = lv0 << 8; ll1 = lv1 << 8;
+        v_uint16 upidx0, upidx1, vpidx0, vpidx1;
+        upidx0 = ll0 + uv0; upidx1 = ll1 + uv1;
+        vpidx0 = ll0 + vv0; vpidx1 = ll1 + vv1;
+        v_uint32 upidx[4], vpidx[4];
+        v_expand(upidx0, upidx[0], upidx[1]); v_expand(upidx1, upidx[2], upidx[3]);
+        v_expand(vpidx0, vpidx[0], vpidx[1]); v_expand(vpidx1, vpidx[2], vpidx[3]);
+        for(int k = 0; k < 4; k++)
+        {
+            up[k] = v_lut(LUVLUT.LuToUp_b, v_reinterpret_as_s32(upidx[k]));
+            vp[k] = v_lut(LUVLUT.LvToVp_b, v_reinterpret_as_s32(vpidx[k]));
+        }
 
-            long long int xv = up*(long long int)vp;
-            long long int vpl = LUVLUT.LvToVpl_b[LL*256+v];
-            long long int zp = vpl - xv*(255/3);
+        // long long int vpl = LUVLUT.LvToVpl_b[LL*256+v];
+        v_int64 vpl[8];
+        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize];
+        for(int k = 0; k < 4; k++)
+        {
+            v_store_aligned(vpidxstore + k*vsize/4, v_reinterpret_as_s32(vpidx[k]));
+        }
+        for(int k = 0; k < 8; k++)
+        {
+            vpl[k] = vx_lut((const int64_t*)LUVLUT.LvToVpl_b, vpidxstore + k*vsize/8);
+        }
+
+        // not all 64-bit arithmetic is available in univ. intrinsics
+        // need to handle it with scalar code
+        int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize];
+        for(int k = 0; k < 8; k++)
+        {
+            v_store_aligned(vvpl + k*vsize/8, vpl[k]);
+        }
+        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize], vvp[vsize], vx[vsize], vy[vsize], vzm[vsize];
+        for(int k = 0; k < 4; k++)
+        {
+            v_store_aligned(vup + k*vsize/4, up[k]);
+            v_store_aligned(vvp + k*vsize/4, vp[k]);
+            v_store_aligned(vy + k*vsize/4, y[k]);
+        }
+        for(int i = 0; i < vsize; i++)
+        {
+            int32_t y_ = vy[i];
+            int32_t up_ = vup[i];
+            int32_t vp_ = vvp[i];
+
+            int64_t vpl_ = vvpl[i];
+            int64_t xv = up_*(int64_t)vp_;
+
+            int64_t zp = vpl_ - xv*(255/3);
             zp = zp >> base_shift;
-            long long int zq = zp - (5*255*BASE);
-            int zm = (int)((y*zq) >> base_shift);
+            int64_t zq = zp - (5*255*BASE);
+            int32_t zm = (int32_t)((y_*zq) >> base_shift);
+            vzm[i] = zm;
 
-            int x = (int)(xv >> base_shift);
-            x = (y*x) >> base_shift;
+            vx[i] = (int32_t)(xv >> base_shift);
+        }
+        v_int32 zm[4];
+        for(int k = 0; k < 4; k++)
+        {
+            x[k] = vx_load_aligned(vx + k*vsize/4);
+            zm[k] = vx_load_aligned(vzm + k*vsize/4);
+        }
 
-            int z = zm/256 + zm/65536;
-            x = max(0, min(2*BASE, x)); z = max(0, min(2*BASE, z));
+        for(int k = 0; k < 4; k++)
+        {
+            x[k] = (y[k]*x[k]) >> base_shift;
+        }
 
-            xyz[i] = x; xyz[i + 16] = y; xyz[i + 32] = z;
+        // z = zm/256 + zm/65536;
+        for (int k = 0; k < 4; k++)
+        {
+            z[k] = (zm[k] >> 8) + (zm[k] >> 16);
+        }
+
+        // (x, z) = clip((x, z), min=0, max=2*BASE)
+        v_int32 zero = vx_setzero_s32(), base2 = vx_setall_s32(2*BASE);
+        for(int k = 0; k < 4; k++)
+        {
+            x[k] = v_max(zero, v_min(base2, x[k]));
+            z[k] = v_max(zero, v_min(base2, z[k]));
         }
     }
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         int i, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
 
         i = 0;
-#if CV_SIMD128
+
+#if CV_SIMD
         if(enablePackedLuv2RGB)
         {
-            static const int nPixels = 16;
-            for (; i < n*3-3*nPixels; i += 3*nPixels, dst += dcn*nPixels)
+            ushort* tab = sRGBInvGammaTab_b;
+            bool srgb = issRGB;
+            static const int vsize = v_uint8::nlanes;
+            const int descaleShift = 1 << (shift-1);
+            v_int16 vdescale = vx_setall_s16(descaleShift);
+            v_int16 vc[9];
+            for(int k = 0; k < 9; k++)
             {
-                v_uint8x16 u8l, u8u, u8v;
-                v_load_deinterleave(src + i, u8l, u8u, u8v);
+                vc[k] = vx_setall_s16((short)coeffs[k]);
+            }
+            v_int16 one = vx_setall_s16(1);
+            v_int16 cbxy, cbz1, cgxy, cgz1, crxy, crz1;
+            v_int16 dummy;
+            v_zip(vc[0], vc[1], crxy, dummy);
+            v_zip(vc[2],   one, crz1, dummy);
+            v_zip(vc[3], vc[4], cgxy, dummy);
+            v_zip(vc[5],   one, cgz1, dummy);
+            v_zip(vc[6], vc[7], cbxy, dummy);
+            v_zip(vc[8],   one, cbz1, dummy);
+            // v_dotprod multiplies signed 16-bit lanes, so x/y/z are biased by -2^(base_shift-1) before it;
+            // fm[] = (sum of the row's coeffs) * 2^(base_shift-1) is the term that adds this bias back
+            v_int32 dummy32, fm[3];
+            v_expand(vc[0]+vc[1]+vc[2], fm[0], dummy32);
+            v_expand(vc[3]+vc[4]+vc[5], fm[1], dummy32);
+            v_expand(vc[6]+vc[7]+vc[8], fm[2], dummy32);
+            fm[0] = fm[0] << (base_shift-1);
+            fm[1] = fm[1] << (base_shift-1);
+            fm[2] = fm[2] << (base_shift-1);
 
-                int32_t CV_DECL_ALIGNED(16) xyz[48];
-                processLuvToXYZ(u8l, u8u, u8v, xyz);
+            for (; i <= n-vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
+            {
+                v_uint8 u8l, u8u, u8v;
+                v_load_deinterleave(src, u8l, u8u, u8v);
 
-                v_int32x4 xiv[4], yiv[4], ziv[4];
-                for(int k = 0; k < 4; k++)
+                v_int32 xiv[4], yiv[4], ziv[4];
+
+                processLuvToXYZ(u8l, u8u, u8v, xiv, yiv, ziv);
+
+                // [xxyyzz]
+                v_uint16 xyz[6];
+                xyz[0] = v_pack_u(xiv[0], xiv[1]); xyz[1] = v_pack_u(xiv[2], xiv[3]);
+                xyz[2] = v_pack_u(yiv[0], yiv[1]); xyz[3] = v_pack_u(yiv[2], yiv[3]);
+                xyz[4] = v_pack_u(ziv[0], ziv[1]); xyz[5] = v_pack_u(ziv[2], ziv[3]);
+
+                // ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift);
+                // go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift);
+                // bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift);
+
+                // fix 16bit multiplication: c_i*v = c_i*(v-fixmul) + c_i*fixmul
+                v_uint16 fixmul = vx_setall_u16(1 << (base_shift-1));
+                v_int16 sxyz[6];
+                for(int k = 0; k < 6; k++)
                 {
-                    xiv[k] = v_load_aligned(xyz + 4*k);
-                    yiv[k] = v_load_aligned(xyz + 4*k + 16);
-                    ziv[k] = v_load_aligned(xyz + 4*k + 32);
+                    sxyz[k] = v_reinterpret_as_s16(v_sub_wrap(xyz[k], fixmul));
                 }
 
-                /*
-                        ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift);
-                        go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift);
-                        bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift);
-                */
-                v_int32x4 C0 = v_setall_s32(coeffs[0]), C1 = v_setall_s32(coeffs[1]), C2 = v_setall_s32(coeffs[2]);
-                v_int32x4 C3 = v_setall_s32(coeffs[3]), C4 = v_setall_s32(coeffs[4]), C5 = v_setall_s32(coeffs[5]);
-                v_int32x4 C6 = v_setall_s32(coeffs[6]), C7 = v_setall_s32(coeffs[7]), C8 = v_setall_s32(coeffs[8]);
-                v_int32x4 descaleShift = v_setall_s32(1 << (shift-1));
-                v_int32x4 tabsz = v_setall_s32((int)INV_GAMMA_TAB_SIZE-1);
-                v_uint32x4 r_vecs[4], g_vecs[4], b_vecs[4];
+                v_int16 xy[4], zd[4];
+                v_zip(sxyz[0], sxyz[2], xy[0], xy[1]);
+                v_zip(sxyz[4], vdescale, zd[0], zd[1]);
+                v_zip(sxyz[1], sxyz[3], xy[2], xy[3]);
+                v_zip(sxyz[5], vdescale, zd[2], zd[3]);
+
+                // [rrrrggggbbbb]
+                v_int32 i_rgb[4*3];
+                // a bit faster than one loop for all
                 for(int k = 0; k < 4; k++)
                 {
-                    v_int32x4 i_r, i_g, i_b;
-                    i_r = (xiv[k]*C0 + yiv[k]*C1 + ziv[k]*C2 + descaleShift) >> shift;
-                    i_g = (xiv[k]*C3 + yiv[k]*C4 + ziv[k]*C5 + descaleShift) >> shift;
-                    i_b = (xiv[k]*C6 + yiv[k]*C7 + ziv[k]*C8 + descaleShift) >> shift;
-
-                    //limit indices in table and then substitute
-                    //ro = tab[ro]; go = tab[go]; bo = tab[bo];
-                    int32_t CV_DECL_ALIGNED(16) rshifts[4], gshifts[4], bshifts[4];
-                    v_int32x4 rs = v_max(v_setzero_s32(), v_min(tabsz, i_r));
-                    v_int32x4 gs = v_max(v_setzero_s32(), v_min(tabsz, i_g));
-                    v_int32x4 bs = v_max(v_setzero_s32(), v_min(tabsz, i_b));
-
-                    v_store_aligned(rshifts, rs);
-                    v_store_aligned(gshifts, gs);
-                    v_store_aligned(bshifts, bs);
-
-                    r_vecs[k] = v_uint32x4(tab[rshifts[0]], tab[rshifts[1]], tab[rshifts[2]], tab[rshifts[3]]);
-                    g_vecs[k] = v_uint32x4(tab[gshifts[0]], tab[gshifts[1]], tab[gshifts[2]], tab[gshifts[3]]);
-                    b_vecs[k] = v_uint32x4(tab[bshifts[0]], tab[bshifts[1]], tab[bshifts[2]], tab[bshifts[3]]);
+                    i_rgb[k+4*0] = (v_dotprod(xy[k], crxy) + v_dotprod(zd[k], crz1) + fm[0]) >> shift;
+                }
+                for(int k = 0; k < 4; k++)
+                {
+                    i_rgb[k+4*1] = (v_dotprod(xy[k], cgxy) + v_dotprod(zd[k], cgz1) + fm[1]) >> shift;
+                }
+                for(int k = 0; k < 4; k++)
+                {
+                    i_rgb[k+4*2] = (v_dotprod(xy[k], cbxy) + v_dotprod(zd[k], cbz1) + fm[2]) >> shift;
                 }
 
-                v_uint16x8 u_rvec0 = v_pack(r_vecs[0], r_vecs[1]), u_rvec1 = v_pack(r_vecs[2], r_vecs[3]);
-                v_uint16x8 u_gvec0 = v_pack(g_vecs[0], g_vecs[1]), u_gvec1 = v_pack(g_vecs[2], g_vecs[3]);
-                v_uint16x8 u_bvec0 = v_pack(b_vecs[0], b_vecs[1]), u_bvec1 = v_pack(b_vecs[2], b_vecs[3]);
+                // [rrggbb]
+                v_uint16 u_rgbvec[6];
 
-                v_uint8x16 u8_b, u8_g, u8_r;
-                u8_b = v_pack(u_bvec0, u_bvec1);
-                u8_g = v_pack(u_gvec0, u_gvec1);
-                u8_r = v_pack(u_rvec0, u_rvec1);
+                // limit indices in table and then substitute
+                v_int32 z32 = vx_setzero_s32();
+                v_int32 tabsz = vx_setall_s32((int)INV_GAMMA_TAB_SIZE-1);
+                for(int k = 0; k < 12; k++)
+                {
+                    i_rgb[k] = v_max(z32, v_min(tabsz, i_rgb[k]));
+                }
+
+                // ro = tab[ro]; go = tab[go]; bo = tab[bo];
+                if(srgb)
+                {
+                    // [rr.., gg.., bb..]
+                    int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*vsize];
+                    for(int k = 0; k < 12; k++)
+                    {
+                        v_store_aligned(rgbshifts + k*vsize/4, i_rgb[k]);
+                    }
+                    for(int k = 0; k < 6; k++)
+                    {
+                        u_rgbvec[k] = vx_lut(tab, rgbshifts + k*vsize/2);
+                    }
+                }
+                else
+                {
+                    // rgb = (rgb*255) >> inv_gamma_shift
+                    for(int k = 0; k < 12; k++)
+                    {
+                        i_rgb[k] = ((i_rgb[k] << 8) - i_rgb[k]) >> inv_gamma_shift;
+                    }
+
+                    for(int k = 0; k < 6; k++)
+                    {
+                        u_rgbvec[k] = v_reinterpret_as_u16(v_pack(i_rgb[k*2+0], i_rgb[k*2+1]));
+                    }
+                }
+
+                v_uint8 u8_b, u8_g, u8_r;
+                u8_r = v_pack(u_rgbvec[0], u_rgbvec[1]);
+                u8_g = v_pack(u_rgbvec[2], u_rgbvec[3]);
+                u8_b = v_pack(u_rgbvec[4], u_rgbvec[5]);
 
                 if(dcn == 4)
                 {
-                    v_store_interleave(dst, u8_b, u8_g, u8_r, v_setall_u8(alpha));
+                    v_store_interleave(dst, u8_b, u8_g, u8_r, vx_setall_u8(alpha));
                 }
                 else
                 {
@@ -3914,10 +3746,10 @@ struct Luv2RGBinteger
         }
 #endif
 
-        for (; i < n*3; i += 3, dst += dcn)
+        for (; i < n; i++, src += 3, dst += dcn)
         {
             int ro, go, bo;
-            process(src[i + 0], src[i + 1], src[i + 2], ro, go, bo);
+            process(src[0], src[1], src[2], ro, go, bo);
 
             dst[0] = saturate_cast<uchar>(bo);
             dst[1] = saturate_cast<uchar>(go);
@@ -3930,7 +3762,7 @@ struct Luv2RGBinteger
 
     int dstcn;
     int coeffs[9];
-    ushort* tab;
+    bool issRGB;
 };
 
 
@@ -3941,7 +3773,7 @@ struct Luv2RGB_b
     Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
                const float* _whitept, bool _srgb )
     : dstcn(_dstcn),
-      fcvt(_dstcn, blueIdx, _coeffs, _whitept, _srgb),
+      fcvt(3, blueIdx, _coeffs, _whitept, _srgb),
       icvt(_dstcn, blueIdx, _coeffs, _whitept, _srgb)
     {
         // whitept is fixed for int calculations
@@ -3950,6 +3782,8 @@ struct Luv2RGB_b
 
     void operator()(const uchar* src, uchar* dst, int n) const
     {
+        CV_INSTRUMENT_REGION();
+
         if(useBitExactness)
         {
             icvt(src, dst, n);
@@ -3958,49 +3792,65 @@ struct Luv2RGB_b
 
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
+#if CV_SIMD
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[3*BLOCK_SIZE];
+#else
         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
+#endif
 
         static const softfloat fl = softfloat(100)/f255;
         static const softfloat fu = uRange/f255;
         static const softfloat fv = vRange/f255;
 
-        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
+#if CV_SIMD
+        const int fsize = v_float32::nlanes;
+        v_float32 vl = vx_setall_f32((float)fl);
+        v_float32 vu = vx_setall_f32((float)fu);
+        v_float32 vv = vx_setall_f32((float)fv);
+        v_float32 vuLow = vx_setall_f32((float)uLow), vvLow = vx_setall_f32((float)vLow);
+        //TODO: fix that when v_interleave is available
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
+        v_store_interleave(interTmpM, vl, vu, vv);
+        v_store_interleave(interTmpA, vx_setzero_f32(), vuLow, vvLow);
+        v_float32 mluv[3], aluv[3];
+        for(int k = 0; k < 3; k++)
+        {
+            mluv[k] = vx_load_aligned(interTmpM + k*fsize);
+            aluv[k] = vx_load_aligned(interTmpA + k*fsize);
+        }
+#endif
+
+        i = 0;
+        for( ; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
         {
             int dn = std::min(n - i, (int)BLOCK_SIZE);
             j = 0;
 
-            v_float32x4 luvlm(fl, fu, fv, fl), uvlum(fu, fv, fl, fu), vluvm(fv, fl, fu, fv);
-            v_float32x4 luvla(0, uLow, vLow, 0), uvlua(uLow, vLow, 0, uLow), vluva(vLow, 0, uLow, vLow);
-
-            static const int nPixBlock = 16;
-            for( ; j < (dn-nPixBlock)*3; j += nPixBlock*3)
+#if CV_SIMD
+            const int vsize = v_uint8::nlanes;
+            for( ; j <= (dn - vsize)*3; j += 3*vsize )
             {
-                v_uint8x16 src8;
-                v_uint16x8 src16_0, src16_1;
-                v_int32x4 src32_00, src32_01, src32_10, src32_11;
-                v_float32x4 m00, m01, m10, m11, a00, a01, a10, a11;
+                v_uint8 s0, s1, s2;
+                s0 = vx_load(src + j + 0*vsize);
+                s1 = vx_load(src + j + 1*vsize);
+                s2 = vx_load(src + j + 2*vsize);
 
-                int bufp = 0, srcp = 0;
+                v_uint16 ss[6];
+                v_expand(s0, ss[0], ss[1]);
+                v_expand(s1, ss[2], ss[3]);
+                v_expand(s2, ss[4], ss[5]);
+                v_int32 vs[12];
+                for(int k = 0; k < 6; k++)
+                {
+                    v_expand(v_reinterpret_as_s16(ss[k]), vs[k*2+0], vs[k*2+1]);
+                }
 
-                #define CVTSTORE(n) v_store_aligned(buf + j + (bufp++)*4, v_muladd(v_cvt_f32(src32_##n), m##n, a##n))
-                #define LOADSTORE(seq1, seq2, seq3, seq4) \
-                do{\
-                    m00 = seq1##m, m01 = seq2##m, m10 = seq3##m, m11 = seq4##m;\
-                    a00 = seq1##a, a01 = seq2##a, a10 = seq3##a, a11 = seq4##a;\
-                    src8 = v_load(src + j + (srcp++)*16);\
-                    v_expand(src8, src16_0, src16_1);\
-                    v_expand(v_reinterpret_as_s16(src16_0), src32_00, src32_01);\
-                    v_expand(v_reinterpret_as_s16(src16_1), src32_10, src32_11);\
-                    CVTSTORE(00); CVTSTORE(01); CVTSTORE(10); CVTSTORE(11);\
-                }while(0)
-
-                LOADSTORE(luvl, uvlu, vluv, luvl);
-                LOADSTORE(uvlu, vluv, luvl, uvlu);
-                LOADSTORE(vluv, luvl, uvlu, vluv);
-
-                #undef CVTSTORE
-                #undef LOADSTORE
+                for(int bufp = 0; bufp < 12; bufp++)
+                {   // vs[bufp] holds fsize lanes, so vector #bufp belongs at float offset bufp*fsize (keeps the store aligned, no overlap)
+                    v_store_aligned(buf + j + bufp*fsize, v_muladd(v_cvt_f32(vs[bufp]), mluv[bufp%3], aluv[bufp%3]));
+                }
             }
+#endif
             for( ; j < dn*3; j += 3 )
             {
                 buf[j] = src[j]*((float)fl);
@@ -4012,20 +3862,52 @@ struct Luv2RGB_b
 
             j = 0;
 
-            //assume that fcvt returns 1.f as alpha value in case of 4 channels
-            static const int nBlock = 16;
-            v_float32x4 m255(255.f, 255.f, 255.f, 255.f);
-            v_float32x4 f00, f01, f10, f11;
-            v_int32x4 i00, i01, i10, i11;
-            for(; j < dn*3 - nBlock; j += nBlock, dst += nBlock)
+#if CV_SIMD
+            static const int nBlock = 4*fsize;
+            v_float32 v255 = vx_setall_f32(255.f);
+            if(dcn == 4)
             {
-                f00 = v_load_aligned(buf + j + 0); f01 = v_load_aligned(buf + j +  4);
-                f10 = v_load_aligned(buf + j + 8); f11 = v_load_aligned(buf + j + 12);
-                i00 = v_round(f00*m255); i01 = v_round(f01*m255);
-                i10 = v_round(f10*m255); i11 = v_round(f11*m255);
-                v_store(dst, v_pack(v_reinterpret_as_u16(v_pack(i00, i01)),
-                                    v_reinterpret_as_u16(v_pack(i10, i11))));
+                v_uint8 valpha = vx_setall_u8(alpha); // constant alpha lane appended to every output pixel
+                for( ; j <= (dn-nBlock)*3;
+                     j += nBlock*3, dst += nBlock*dcn) // nBlock pixels per iteration, dcn (== 4) bytes written per pixel
+                {
+                    v_float32 vf[4*3];
+                    for(int k = 0; k < 4; k++)
+                    {   // each deinterleaving load consumes fsize pixels == 3*fsize floats, so step the source per k
+                        v_load_deinterleave(buf + j + k*3*fsize, vf[k*3+0], vf[k*3+1], vf[k*3+2]);
+                    }
+                    // scale every channel vector by 255 and round to integers
+                    v_int32 vi[4*3];
+                    for(int k = 0; k < 4*3; k++)
+                    {
+                        vi[k] = v_round(vf[k]*v255);
+                    }
+                    // gather channel k of the four pixel groups into one saturated 8-bit vector
+                    v_uint8 rgb[3];
+                    for(int k = 0; k < 3; k++)
+                    {
+                        rgb[k] = v_pack_u(v_pack(vi[0*3+k], vi[1*3+k]),
+                                          v_pack(vi[2*3+k], vi[3*3+k]));
+                    }
+                    // writes 4 interleaved channels for nBlock pixels, i.e. 4*nBlock bytes
+                    v_store_interleave(dst, rgb[0], rgb[1], rgb[2], valpha);
+                }
             }
+            else // dcn == 3
+            {
+                for(; j < dn*3 - nBlock; j += nBlock, dst += nBlock)
+                {
+                    v_float32 vf[4];
+                    v_int32 vi[4];
+                    for(int k = 0; k < 4; k++)
+                    {
+                        vf[k] = vx_load_aligned(buf + j + k*fsize);
+                        vi[k] = v_round(vf[k]*v255);
+                    }
+                    v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
+                }
+            }
+#endif
 
             for( ; j < dn*3; j += 3, dst += dcn )
             {
diff --git a/modules/imgproc/src/geometry.cpp b/modules/imgproc/src/geometry.cpp
index 9e21caf796..332cc5d477 100644
--- a/modules/imgproc/src/geometry.cpp
+++ b/modules/imgproc/src/geometry.cpp
@@ -544,21 +544,41 @@ float cv::intersectConvexConvex( InputArray _p1, InputArray _p2, OutputArray _p1
             return 0.f;
         }
 
-        if( pointPolygonTest(_InputArray(fp1, n), fp2[0], false) >= 0 )
+        bool intersected = false;
+
+        // Count how many of fp2's vertices lie inside or on the edge of fp1.
+        int nVertices = 0;
+        for (int i=0; i<m; ++i)
+            nVertices += pointPolygonTest(_InputArray(fp1, n), fp2[i], false) >= 0;
+
+        // All of fp2's vertices are inside/on the edge of fp1: the result is fp2 itself.
+        if (nVertices == m)
         {
+            intersected = true;
             result = fp2;
             nr = m;
         }
-        else if( pointPolygonTest(_InputArray(fp2, m), fp1[0], false) >= 0 )
+        else // otherwise check whether fp1 is entirely inside fp2.
         {
-            result = fp1;
-            nr = n;
+            nVertices = 0;
+            for (int i=0; i<n; ++i)
+                nVertices += pointPolygonTest(_InputArray(fp2, m), fp1[i], false) >= 0;
+
+            // All of fp1's vertices are inside/on the edge of fp2: the result is fp1 itself.
+            if (nVertices == n)
+            {
+                intersected = true;
+                result = fp1;
+                nr = n;
+            }
         }
-        else
+
+        if (!intersected)
         {
             _p12.release();
             return 0.f;
         }
+
         area = (float)contourArea(_InputArray(result, nr), false);
     }
 
diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp
index 6ad51ad512..e1fb21bd40 100644
--- a/modules/imgproc/test/test_color.cpp
+++ b/modules/imgproc/test/test_color.cpp
@@ -2687,9 +2687,9 @@ TEST(Imgproc_ColorLab_Full, bitExactness)
                                    << "Iteration: " << iter << endl
                                    << "Hash vs Correct hash: " << h << ", " << goodHash << endl
                                    << "Error in: (" << x << ", " << y << ")" << endl
-                                   << "Reference value: " << gx[0] << " " << gx[1] << " " << gx[2] << endl
-                                   << "Actual value: "    << rx[0] << " " << rx[1] << " " << rx[2] << endl
-                                   << "Src value: " << px[0] << " " << px[1] << " " << px[2] << endl
+                                   << "Reference value: " << int(gx[0]) << " " << int(gx[1]) << " " << int(gx[2]) << endl
+                                   << "Actual value: "    << int(rx[0]) << " " << int(rx[1]) << " " << int(rx[2]) << endl
+                                   << "Src value: " << int(px[0]) << " " << int(px[1]) << " " << int(px[2]) << endl
                                    << "Size: (" << probe.rows << ", " << probe.cols << ")" << endl;
 
                             break;
@@ -2780,9 +2780,9 @@ TEST(Imgproc_ColorLuv_Full, bitExactness)
                                    << "Iteration: " << iter << endl
                                    << "Hash vs Correct hash: " << h << ", " << goodHash << endl
                                    << "Error in: (" << x << ", " << y << ")" << endl
-                                   << "Reference value: " << gx[0] << " " << gx[1] << " " << gx[2] << endl
-                                   << "Actual value: "    << rx[0] << " " << rx[1] << " " << rx[2] << endl
-                                   << "Src value: " << px[0] << " " << px[1] << " " << px[2] << endl
+                                   << "Reference value: " << int(gx[0]) << " " << int(gx[1]) << " " << int(gx[2]) << endl
+                                   << "Actual value: "    << int(rx[0]) << " " << int(rx[1]) << " " << int(rx[2]) << endl
+                                   << "Src value: " << int(px[0]) << " " << int(px[1]) << " " << int(px[2]) << endl
                                    << "Size: (" << probe.rows << ", " << probe.cols << ")" << endl;
 
                             break;
diff --git a/modules/imgproc/test/test_intersectconvexconvex.cpp b/modules/imgproc/test/test_intersectconvexconvex.cpp
new file mode 100644
index 0000000000..fa25f3d531
--- /dev/null
+++ b/modules/imgproc/test/test_intersectconvexconvex.cpp
@@ -0,0 +1,260 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+
+TEST(Imgproc_IntersectConvexConvex, no_intersection)
+{ // Two polygons whose coordinate ranges do not overlap: the intersection must be empty.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(290, 126));
+    convex1.push_back(cv::Point(284, 132));
+    convex1.push_back(cv::Point(281, 133));
+    convex1.push_back(cv::Point(256, 124));
+    convex1.push_back(cv::Point(249, 116));
+    convex1.push_back(cv::Point(234, 91));
+    convex1.push_back(cv::Point(232, 86));
+    convex1.push_back(cv::Point(232, 79));
+    convex1.push_back(cv::Point(251, 69));
+    convex1.push_back(cv::Point(257, 68));
+    convex1.push_back(cv::Point(297, 85));
+    convex1.push_back(cv::Point(299, 87));
+    // convex1 spans x in [232, 299], y in [68, 133]; convex2 below spans x in [131, 192], y in [213, 271].
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(192, 236));
+    convex2.push_back(cv::Point(190, 245));
+    convex2.push_back(cv::Point(177, 260));
+    convex2.push_back(cv::Point(154, 271));
+    convex2.push_back(cv::Point(142, 270));
+    convex2.push_back(cv::Point(135, 263));
+    convex2.push_back(cv::Point(131, 254));
+    convex2.push_back(cv::Point(132, 240));
+    convex2.push_back(cv::Point(172, 213));
+    convex2.push_back(cv::Point(176, 216));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Disjoint inputs: no output vertices and a zero area are expected.
+    EXPECT_TRUE(intersection.empty());
+    EXPECT_NEAR(area, 0, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, no_intersection_with_1_vertex_on_edge_1)
+{ // convex2 only touches the rectangle convex1 at a single vertex on its left edge (x = 0).
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(0,0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(0, 540));
+    // (0, 210) lies on the edge x = 0; the three other vertices have x < 0, i.e. are outside convex1.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(0, 210));
+    convex2.push_back(cv::Point(-30, 210));
+    convex2.push_back(cv::Point(-37, 170));
+    convex2.push_back(cv::Point(-7, 172));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // A single touching vertex must not be reported as an intersection.
+    EXPECT_TRUE(intersection.empty());
+    EXPECT_NEAR(area, 0, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, no_intersection_with_1_vertex_on_edge_2)
+{ // Same scenario as the previous case, but touching the right edge (x = 740) of convex1.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(0,0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(0, 540));
+    // (740, 210) lies on the edge x = 740; the three other vertices have x > 740, i.e. are outside convex1.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(740, 210));
+    convex2.push_back(cv::Point(750, 100));
+    convex2.push_back(cv::Point(790, 250));
+    convex2.push_back(cv::Point(800, 500));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // A single touching vertex must not be reported as an intersection.
+    EXPECT_TRUE(intersection.empty());
+    EXPECT_NEAR(area, 0, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, intersection_with_1_vertex_on_edge)
+{ // convex2 is contained in the rectangle convex1, with one vertex lying on convex1's left edge.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(0,0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(0, 540));
+    // (0, 210) is on the edge x = 0; the three other vertices are strictly inside convex1.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(30, 210));
+    convex2.push_back(cv::Point(0,210));
+    convex2.push_back(cv::Point(7, 172));
+    convex2.push_back(cv::Point(37, 170));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Expected result: convex2's own vertices, listed starting from (0, 210).
+    std::vector<cv::Point> expected_intersection;
+    expected_intersection.push_back(cv::Point(0, 210));
+    expected_intersection.push_back(cv::Point(7, 172));
+    expected_intersection.push_back(cv::Point(37, 170));
+    expected_intersection.push_back(cv::Point(30, 210));
+    // 1163 is the shoelace area of the expected quadrilateral.
+    EXPECT_EQ(intersection, expected_intersection);
+    EXPECT_NEAR(area, 1163, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, intersection_with_2_vertices_on_edge)
+{ // convex2 has two vertices on convex1's left edge and none outside convex1.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(0,0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(0, 540));
+    // (0, 210) and (0, 300) are on the edge x = 0; the two other vertices are strictly inside convex1.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(30, 210));
+    convex2.push_back(cv::Point(37, 170));
+    convex2.push_back(cv::Point(0,210));
+    convex2.push_back(cv::Point(0, 300));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Expected result: the same four points as convex2, in the order listed below.
+    std::vector<cv::Point> expected_intersection;
+    expected_intersection.push_back(cv::Point(0, 300));
+    expected_intersection.push_back(cv::Point(0, 210));
+    expected_intersection.push_back(cv::Point(37, 170));
+    expected_intersection.push_back(cv::Point(30, 210));
+    // 1950 is the shoelace area of the expected quadrilateral.
+    EXPECT_EQ(intersection, expected_intersection);
+    EXPECT_NEAR(area, 1950, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, intersection_1)
+{ // convex2 lies strictly inside the rectangle convex1, so the intersection is convex2 itself.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(0,0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(0, 540));
+    // All four vertices satisfy 0 < x < 740 and 0 < y < 540.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(20,210));
+    convex2.push_back(cv::Point(30, 210));
+    convex2.push_back(cv::Point(37, 170));
+    convex2.push_back(cv::Point(7, 172));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Expected result: convex2's own vertices, listed starting from (7, 172).
+    std::vector<cv::Point> expected_intersection;
+    expected_intersection.push_back(cv::Point(7, 172));
+    expected_intersection.push_back(cv::Point(37, 170));
+    expected_intersection.push_back(cv::Point(30, 210));
+    expected_intersection.push_back(cv::Point(20, 210));
+    // 783 is the shoelace area of the expected quadrilateral.
+    EXPECT_EQ(intersection, expected_intersection);
+    EXPECT_NEAR(area, 783, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, intersection_2)
+{ // convex2 partially overlaps convex1: it sticks out past convex1's left edge (x = 0).
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(0,0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(0, 540));
+    // (-2, 210) and (-5, 300) have x < 0 (outside convex1); (37, 150) and (7, 172) are inside.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(-2,210));
+    convex2.push_back(cv::Point(-5, 300));
+    convex2.push_back(cv::Point(37, 150));
+    convex2.push_back(cv::Point(7, 172));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Expected result: the two inside vertices plus two points on the edge x = 0.
+    std::vector<cv::Point> expected_intersection;
+    expected_intersection.push_back(cv::Point(0, 202));
+    expected_intersection.push_back(cv::Point(7, 172));
+    expected_intersection.push_back(cv::Point(37, 150));
+    expected_intersection.push_back(cv::Point(0, 282));
+    // NOTE(review): the integer polygon above has area 1853, so the expected area presumably comes from unrounded coordinates - confirm.
+    EXPECT_EQ(intersection, expected_intersection);
+    EXPECT_NEAR(area, 1857.19836425781, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, intersection_3)
+{ // convex2 partially overlaps convex1, whose left edge is at x = 15 in this case.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(15, 0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(15, 540));
+    // (0, 210) and (7, 172) have x < 15 (outside convex1); (30, 210) and (37, 170) are inside.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(0,210));
+    convex2.push_back(cv::Point(30, 210));
+    convex2.push_back(cv::Point(37, 170));
+    convex2.push_back(cv::Point(7, 172));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Expected result: the two inside vertices plus two points on the edge x = 15.
+    std::vector<cv::Point> expected_intersection;
+    expected_intersection.push_back(cv::Point(15, 171));
+    expected_intersection.push_back(cv::Point(37, 170));
+    expected_intersection.push_back(cv::Point(30, 210));
+    expected_intersection.push_back(cv::Point(15, 210));
+
+    EXPECT_EQ(intersection, expected_intersection);
+    // NOTE(review): the integer polygon above has area 729, so the expected area presumably comes from unrounded coordinates - confirm.
+    EXPECT_NEAR(area, 723.866760253906, std::numeric_limits<float>::epsilon());
+}
+
+
+TEST(Imgproc_IntersectConvexConvex, intersection_4)
+{ // Both inputs are the same rectangle: the intersection is that rectangle.
+    std::vector<cv::Point> convex1;
+    convex1.push_back(cv::Point(15, 0));
+    convex1.push_back(cv::Point(740, 0));
+    convex1.push_back(cv::Point(740, 540));
+    convex1.push_back(cv::Point(15, 540));
+    // convex2 repeats convex1's vertices in the same order.
+    std::vector<cv::Point> convex2;
+    convex2.push_back(cv::Point(15, 0));
+    convex2.push_back(cv::Point(740, 0));
+    convex2.push_back(cv::Point(740, 540));
+    convex2.push_back(cv::Point(15, 540));
+
+    std::vector<cv::Point> intersection;
+    float area = cv::intersectConvexConvex(convex1, convex2, intersection);
+    // Expected result: the shared rectangle, vertices in input order.
+    std::vector<cv::Point> expected_intersection;
+    expected_intersection.push_back(cv::Point(15, 0));
+    expected_intersection.push_back(cv::Point(740, 0));
+    expected_intersection.push_back(cv::Point(740, 540));
+    expected_intersection.push_back(cv::Point(15, 540));
+    // 391500 == (740 - 15) * 540, the rectangle's area.
+    EXPECT_EQ(intersection, expected_intersection);
+    EXPECT_NEAR(area, 391500, std::numeric_limits<float>::epsilon());
+}
+
+
+} // namespace
+} // opencv_test
diff --git a/modules/java/generator/android/java/org/opencv/android/Utils.java b/modules/java/generator/android/java/org/opencv/android/Utils.java
index 404c986da8..eef4c45622 100644
--- a/modules/java/generator/android/java/org/opencv/android/Utils.java
+++ b/modules/java/generator/android/java/org/opencv/android/Utils.java
@@ -87,9 +87,9 @@ public class Utils {
      */
     public static void bitmapToMat(Bitmap bmp, Mat mat, boolean unPremultiplyAlpha) {
         if (bmp == null)
-            throw new java.lang.IllegalArgumentException("bmp == null");
+            throw new IllegalArgumentException("bmp == null");
         if (mat == null)
-            throw new java.lang.IllegalArgumentException("mat == null");
+            throw new IllegalArgumentException("mat == null");
         nBitmapToMat2(bmp, mat.nativeObj, unPremultiplyAlpha);
     }
 
@@ -117,9 +117,9 @@ public class Utils {
      */
     public static void matToBitmap(Mat mat, Bitmap bmp, boolean premultiplyAlpha) {
         if (mat == null)
-            throw new java.lang.IllegalArgumentException("mat == null");
+            throw new IllegalArgumentException("mat == null");
         if (bmp == null)
-            throw new java.lang.IllegalArgumentException("bmp == null");
+            throw new IllegalArgumentException("bmp == null");
         nMatToBitmap2(mat.nativeObj, bmp, premultiplyAlpha);
     }
 
diff --git a/modules/java/generator/src/java/org/opencv/utils/Converters.java b/modules/java/generator/src/java/org/opencv/utils/Converters.java
index 9faf2ecee9..94675da183 100644
--- a/modules/java/generator/src/java/org/opencv/utils/Converters.java
+++ b/modules/java/generator/src/java/org/opencv/utils/Converters.java
@@ -159,11 +159,11 @@ public class Converters {
 
     public static void Mat_to_vector_Point(Mat m, List<Point> pts) {
         if (pts == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
         int count = m.rows();
         int type = m.type();
         if (m.cols() != 1)
-            throw new java.lang.IllegalArgumentException("Input Mat should have one column\n" + m);
+            throw new IllegalArgumentException("Input Mat should have one column\n" + m);
 
         pts.clear();
         if (type == CvType.CV_32SC2) {
@@ -185,7 +185,7 @@ public class Converters {
                 pts.add(new Point(buff[i * 2], buff[i * 2 + 1]));
             }
         } else {
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "Input Mat should be of CV_32SC2, CV_32FC2 or CV_64FC2 type\n" + m);
         }
     }
@@ -204,11 +204,11 @@ public class Converters {
 
     public static void Mat_to_vector_Point3(Mat m, List<Point3> pts) {
         if (pts == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
         int count = m.rows();
         int type = m.type();
         if (m.cols() != 1)
-            throw new java.lang.IllegalArgumentException("Input Mat should have one column\n" + m);
+            throw new IllegalArgumentException("Input Mat should have one column\n" + m);
 
         pts.clear();
         if (type == CvType.CV_32SC3) {
@@ -230,7 +230,7 @@ public class Converters {
                 pts.add(new Point3(buff[i * 3], buff[i * 3 + 1], buff[i * 3 + 2]));
             }
         } else {
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "Input Mat should be of CV_32SC3, CV_32FC3 or CV_64FC3 type\n" + m);
         }
     }
@@ -255,10 +255,10 @@ public class Converters {
 
     public static void Mat_to_vector_Mat(Mat m, List<Mat> mats) {
         if (mats == null)
-            throw new java.lang.IllegalArgumentException("mats == null");
+            throw new IllegalArgumentException("mats == null");
         int count = m.rows();
         if (CvType.CV_32SC2 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_32SC2 != m.type() ||  m.cols()!=1\n" + m);
 
         mats.clear();
@@ -289,10 +289,10 @@ public class Converters {
 
     public static void Mat_to_vector_float(Mat m, List<Float> fs) {
         if (fs == null)
-            throw new java.lang.IllegalArgumentException("fs == null");
+            throw new IllegalArgumentException("fs == null");
         int count = m.rows();
         if (CvType.CV_32FC1 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_32FC1 != m.type() ||  m.cols()!=1\n" + m);
 
         fs.clear();
@@ -322,10 +322,10 @@ public class Converters {
 
     public static void Mat_to_vector_uchar(Mat m, List<Byte> us) {
         if (us == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
         int count = m.rows();
         if (CvType.CV_8UC1 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_8UC1 != m.type() ||  m.cols()!=1\n" + m);
 
         us.clear();
@@ -372,10 +372,10 @@ public class Converters {
 
     public static void Mat_to_vector_int(Mat m, List<Integer> is) {
         if (is == null)
-            throw new java.lang.IllegalArgumentException("is == null");
+            throw new IllegalArgumentException("is == null");
         int count = m.rows();
         if (CvType.CV_32SC1 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_32SC1 != m.type() ||  m.cols()!=1\n" + m);
 
         is.clear();
@@ -388,10 +388,10 @@ public class Converters {
 
     public static void Mat_to_vector_char(Mat m, List<Byte> bs) {
         if (bs == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
         int count = m.rows();
         if (CvType.CV_8SC1 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_8SC1 != m.type() ||  m.cols()!=1\n" + m);
 
         bs.clear();
@@ -424,10 +424,10 @@ public class Converters {
 
     public static void Mat_to_vector_Rect(Mat m, List<Rect> rs) {
         if (rs == null)
-            throw new java.lang.IllegalArgumentException("rs == null");
+            throw new IllegalArgumentException("rs == null");
         int count = m.rows();
         if (CvType.CV_32SC4 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_32SC4 != m.type() ||  m.rows()!=1\n" + m);
 
         rs.clear();
@@ -460,10 +460,10 @@ public class Converters {
 
     public static void Mat_to_vector_Rect2d(Mat m, List<Rect2d> rs) {
         if (rs == null)
-            throw new java.lang.IllegalArgumentException("rs == null");
+            throw new IllegalArgumentException("rs == null");
         int count = m.rows();
         if (CvType.CV_64FC4 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                                                          "CvType.CV_64FC4 != m.type() ||  m.rows()!=1\n" + m);
 
         rs.clear();
@@ -499,10 +499,10 @@ public class Converters {
 
     public static void Mat_to_vector_KeyPoint(Mat m, List<KeyPoint> kps) {
         if (kps == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
         int count = m.rows();
         if (CvType.CV_64FC(7) != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_64FC(7) != m.type() ||  m.cols()!=1\n" + m);
 
         kps.clear();
@@ -530,10 +530,10 @@ public class Converters {
 
     public static void Mat_to_vector_vector_Point(Mat m, List<MatOfPoint> pts) {
         if (pts == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
 
         if (m == null)
-            throw new java.lang.IllegalArgumentException("Input Mat can't be null");
+            throw new IllegalArgumentException("Input Mat can't be null");
 
         List<Mat> mats = new ArrayList<Mat>(m.rows());
         Mat_to_vector_Mat(m, mats);
@@ -548,10 +548,10 @@ public class Converters {
     // vector_vector_Point2f
     public static void Mat_to_vector_vector_Point2f(Mat m, List<MatOfPoint2f> pts) {
         if (pts == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
 
         if (m == null)
-            throw new java.lang.IllegalArgumentException("Input Mat can't be null");
+            throw new IllegalArgumentException("Input Mat can't be null");
 
         List<Mat> mats = new ArrayList<Mat>(m.rows());
         Mat_to_vector_Mat(m, mats);
@@ -580,10 +580,10 @@ public class Converters {
     // vector_vector_Point3f
     public static void Mat_to_vector_vector_Point3f(Mat m, List<MatOfPoint3f> pts) {
         if (pts == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
 
         if (m == null)
-            throw new java.lang.IllegalArgumentException("Input Mat can't be null");
+            throw new IllegalArgumentException("Input Mat can't be null");
 
         List<Mat> mats = new ArrayList<Mat>(m.rows());
         Mat_to_vector_Mat(m, mats);
@@ -625,10 +625,10 @@ public class Converters {
 
     public static void Mat_to_vector_vector_KeyPoint(Mat m, List<MatOfKeyPoint> kps) {
         if (kps == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
 
         if (m == null)
-            throw new java.lang.IllegalArgumentException("Input Mat can't be null");
+            throw new IllegalArgumentException("Input Mat can't be null");
 
         List<Mat> mats = new ArrayList<Mat>(m.rows());
         Mat_to_vector_Mat(m, mats);
@@ -659,10 +659,10 @@ public class Converters {
 
     public static void Mat_to_vector_double(Mat m, List<Double> ds) {
         if (ds == null)
-            throw new java.lang.IllegalArgumentException("ds == null");
+            throw new IllegalArgumentException("ds == null");
         int count = m.rows();
         if (CvType.CV_64FC1 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_64FC1 != m.type() ||  m.cols()!=1\n" + m);
 
         ds.clear();
@@ -695,10 +695,10 @@ public class Converters {
 
     public static void Mat_to_vector_DMatch(Mat m, List<DMatch> matches) {
         if (matches == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
         int count = m.rows();
         if (CvType.CV_64FC4 != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_64FC4 != m.type() ||  m.cols()!=1\n" + m);
 
         matches.clear();
@@ -725,10 +725,10 @@ public class Converters {
 
     public static void Mat_to_vector_vector_DMatch(Mat m, List<MatOfDMatch> lvdm) {
         if (lvdm == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
 
         if (m == null)
-            throw new java.lang.IllegalArgumentException("Input Mat can't be null");
+            throw new IllegalArgumentException("Input Mat can't be null");
 
         List<Mat> mats = new ArrayList<Mat>(m.rows());
         Mat_to_vector_Mat(m, mats);
@@ -757,10 +757,10 @@ public class Converters {
 
     public static void Mat_to_vector_vector_char(Mat m, List<List<Byte>> llb) {
         if (llb == null)
-            throw new java.lang.IllegalArgumentException("Output List can't be null");
+            throw new IllegalArgumentException("Output List can't be null");
 
         if (m == null)
-            throw new java.lang.IllegalArgumentException("Input Mat can't be null");
+            throw new IllegalArgumentException("Input Mat can't be null");
 
         List<Mat> mats = new ArrayList<Mat>(m.rows());
         Mat_to_vector_Mat(m, mats);
@@ -796,10 +796,10 @@ public class Converters {
 
     public static void Mat_to_vector_RotatedRect(Mat m, List<RotatedRect> rs) {
         if (rs == null)
-            throw new java.lang.IllegalArgumentException("rs == null");
+            throw new IllegalArgumentException("rs == null");
         int count = m.rows();
         if (CvType.CV_32FC(5) != m.type() || m.cols() != 1)
-            throw new java.lang.IllegalArgumentException(
+            throw new IllegalArgumentException(
                     "CvType.CV_32FC5 != m.type() ||  m.rows()!=1\n" + m);
 
         rs.clear();
diff --git a/samples/dnn/tf_text_graph_faster_rcnn.py b/samples/dnn/tf_text_graph_faster_rcnn.py
index e1dfba9fee..8a88c7328a 100644
--- a/samples/dnn/tf_text_graph_faster_rcnn.py
+++ b/samples/dnn/tf_text_graph_faster_rcnn.py
@@ -31,7 +31,13 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
     aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
     width_stride = float(grid_anchor_generator['width_stride'][0])
     height_stride = float(grid_anchor_generator['height_stride'][0])
-    features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
+
+    feature_extractor = config['feature_extractor'][0]
+    if 'type' in feature_extractor and feature_extractor['type'][0] == 'faster_rcnn_nas':
+        features_stride = 16.0
+    else:
+        features_stride = float(feature_extractor['first_stage_features_stride'][0])
+
     first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
     first_stage_max_proposals = int(config['first_stage_max_proposals'][0])