diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 04a1f48134..03afccd935 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -143,6 +143,8 @@ elseif(MSVC)
     set(OpenCV_RUNTIME vc14)
   elseif(MSVC_VERSION MATCHES "^191[0-9]$")
     set(OpenCV_RUNTIME vc15)
+  elseif(MSVC_VERSION MATCHES "^192[0-9]$")
+    set(OpenCV_RUNTIME vc16)
   else()
     message(WARNING "OpenCV does not recognize MSVC_VERSION \"${MSVC_VERSION}\". Cannot set OpenCV_RUNTIME")
   endif()
diff --git a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
index 687298acbc..76027bf92e 100644
--- a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
+++ b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
@@ -113,6 +113,16 @@ elseif(MSVC)
     if(NOT has_VS2017)
       set(OpenCV_RUNTIME vc14) # selecting previous compatible runtime version
     endif()
+  elseif(MSVC_VERSION MATCHES "^192[0-9]$")
+    set(OpenCV_RUNTIME vc16)
+    check_one_config(has_VS2019)
+    if(NOT has_VS2019)
+      set(OpenCV_RUNTIME vc15) # selecting previous compatible runtime version
+      check_one_config(has_VS2017)
+      if(NOT has_VS2017)
+        set(OpenCV_RUNTIME vc14) # selecting previous compatible runtime version
+      endif()
+    endif()
   endif()
 elseif(MINGW)
   set(OpenCV_RUNTIME mingw)
diff --git a/doc/opencv.bib b/doc/opencv.bib
index aa2bac5062..e2af456532 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -17,6 +17,21 @@
   number = {7},
   url = {http://www.bmva.org/bmvc/2013/Papers/paper0013/paper0013.pdf}
 }
+@inproceedings{Andreff99,
+  author = {Andreff, Nicolas and Horaud, Radu and Espiau, Bernard},
+  title = {On-line Hand-eye Calibration},
+  booktitle = {Proceedings of the 2Nd International Conference on 3-D Digital Imaging and Modeling},
+  series = {3DIM'99},
+  year = {1999},
+  isbn = {0-7695-0062-5},
+  location = {Ottawa, Canada},
+  pages = {430--436},
+  numpages = {7},
+  url = {http://dl.acm.org/citation.cfm?id=1889712.1889775},
+  acmid = {1889775},
+  publisher = {IEEE Computer Society},
+  address = {Washington, DC, USA},
+}
 @inproceedings{Arandjelovic:2012:TTE:2354409.2355123,
   author = {Arandjelovic, Relja},
   title = {Three Things Everyone Should Know to Improve Object Retrieval},
@@ -180,6 +195,14 @@
   volume = {9},
   publisher = {Walter de Gruyter}
 }
+@article{Daniilidis98,
+  author = {Konstantinos Daniilidis},
+  title = {Hand-Eye Calibration Using Dual Quaternions},
+  journal = {International Journal of Robotics Research},
+  year = {1998},
+  volume = {18},
+  pages = {286--298}
+}
 @inproceedings{DM03,
   author = {Drago, Fr{\'e}d{\'e}ric and Myszkowski, Karol and Annen, Thomas and Chiba, Norishige},
   title = {Adaptive logarithmic mapping for displaying high contrast scenes},
@@ -431,6 +454,24 @@
   publisher = {Cambridge university press},
   url = {http://cds.cern.ch/record/1598612/files/0521540518_TOC.pdf}
 }
+@article{Horaud95,
+  author = {Horaud, Radu and Dornaika, Fadi},
+  title = {Hand-eye Calibration},
+  journal = {Int. J. Rob. Res.},
+  issue_date = {June 1995},
+  volume = {14},
+  number = {3},
+  month = jun,
+  year = {1995},
+  issn = {0278-3649},
+  pages = {195--210},
+  numpages = {16},
+  url = {http://dx.doi.org/10.1177/027836499501400301},
+  doi = {10.1177/027836499501400301},
+  acmid = {208622},
+  publisher = {Sage Publications, Inc.},
+  address = {Thousand Oaks, CA, USA}
+}
 @article{Horn81,
   author = {Horn, Berthold KP and Schunck, Brian G},
   title = {Determining Optical Flow},
@@ -667,6 +708,18 @@
   number = {2},
   publisher = {Elsevier}
 }
+@article{Park94,
+  author = {F. C. Park and B. J. Martin},
+  journal = {IEEE Transactions on Robotics and Automation},
+  title = {Robot sensor calibration: solving AX=XB on the Euclidean group},
+  year = {1994},
+  volume = {10},
+  number = {5},
+  pages = {717-721},
+  doi = {10.1109/70.326576},
+  ISSN = {1042-296X},
+  month = {Oct}
+}
 @inproceedings{PM03,
   author = {P{\'e}rez, Patrick and Gangnet, Michel and Blake, Andrew},
   title = {Poisson image editing},
@@ -841,6 +894,18 @@
   number = {1},
   publisher = {Taylor \& Francis}
 }
+@article{Tsai89,
+  author = {R. Y. Tsai and R. K. Lenz},
+  journal = {IEEE Transactions on Robotics and Automation},
+  title = {A new technique for fully autonomous and efficient 3D robotics hand/eye calibration},
+  year = {1989},
+  volume = {5},
+  number = {3},
+  pages = {345-358},
+  doi = {10.1109/70.34770},
+  ISSN = {1042-296X},
+  month = {June}
+}
 @inproceedings{UES01,
   author = {Uyttendaele, Matthew and Eden, Ashley and Skeliski, R},
   title = {Eliminating ghosting and exposure artifacts in image mosaics},
diff --git a/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown b/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown
index 0f26dafaa0..0379a3e382 100644
--- a/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown
+++ b/doc/tutorials/introduction/documenting_opencv/documentation_tutorial.markdown
@@ -32,11 +32,11 @@ Generate documentation {#tutorial_documentation_generate}
 -   Create build directory near the sources folder(s) and go into it
 -   Run cmake (assuming you put sources to _opencv_ folder):
     @code{.sh}
-    cmake ../opencv
+    cmake -DBUILD_DOCS=ON ../opencv
     @endcode
     Or if you get contrib sources too:
     @code{.sh}
-    cmake -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules ../opencv
+    cmake -DBUILD_DOCS=ON -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules ../opencv
     @endcode
 -   Run make:
     @code{.sh}
diff --git a/modules/calib3d/doc/pics/hand-eye_figure.png b/modules/calib3d/doc/pics/hand-eye_figure.png
new file mode 100644
index 0000000000..ff41d33f8c
Binary files /dev/null and b/modules/calib3d/doc/pics/hand-eye_figure.png differ
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 06fa957e4b..149484ebc2 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -280,7 +280,7 @@ enum { CALIB_NINTRINSIC          = 18,
        // for stereo rectification
        CALIB_ZERO_DISPARITY      = 0x00400,
        CALIB_USE_LU              = (1 << 17), //!< use LU instead of SVD decomposition for solving. much faster but potentially less precise
-       CALIB_USE_EXTRINSIC_GUESS = (1 << 22), //!< for stereoCalibrate
+       CALIB_USE_EXTRINSIC_GUESS = (1 << 22)  //!< for stereoCalibrate
      };
 
 //! the algorithm for finding fundamental matrix
@@ -290,6 +290,14 @@ enum { FM_7POINT = 1, //!< 7-point algorithm
        FM_RANSAC = 8  //!< RANSAC algorithm. It needs at least 15 points. 7-point algorithm is used.
      };
 
+enum HandEyeCalibrationMethod
+{
+    CALIB_HAND_EYE_TSAI         = 0, //!< A New Technique for Fully Autonomous and Efficient 3D Robotics Hand/Eye Calibration @cite Tsai89
+    CALIB_HAND_EYE_PARK         = 1, //!< Robot Sensor Calibration: Solving AX = XB on the Euclidean Group @cite Park94
+    CALIB_HAND_EYE_HORAUD       = 2, //!< Hand-eye Calibration @cite Horaud95
+    CALIB_HAND_EYE_ANDREFF      = 3, //!< On-line Hand-Eye Calibration @cite Andreff99
+    CALIB_HAND_EYE_DANIILIDIS   = 4  //!< Hand-Eye Calibration Using Dual Quaternions @cite Daniilidis98
+};
 
 
 /** @brief Converts a rotation matrix to a rotation vector or vice versa.
@@ -1569,6 +1577,139 @@ CV_EXPORTS_W Mat getOptimalNewCameraMatrix( InputArray cameraMatrix, InputArray
                                             CV_OUT Rect* validPixROI = 0,
                                             bool centerPrincipalPoint = false);
 
+/** @brief Computes Hand-Eye calibration: \f$_{}^{g}\textrm{T}_c\f$
+
+@param[in] R_gripper2base Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the gripper frame to the robot base frame (\f$_{}^{b}\textrm{T}_g\f$).
+This is a vector (`vector<Mat>`) that contains the rotation matrices for all the transformations
+from gripper frame to robot base frame.
+@param[in] t_gripper2base Translation part extracted from the homogeneous matrix that transforms a point
+expressed in the gripper frame to the robot base frame (\f$_{}^{b}\textrm{T}_g\f$).
+This is a vector (`vector<Mat>`) that contains the translation vectors for all the transformations
+from gripper frame to robot base frame.
+@param[in] R_target2cam Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the target frame to the camera frame (\f$_{}^{c}\textrm{T}_t\f$).
+This is a vector (`vector<Mat>`) that contains the rotation matrices for all the transformations
+from calibration target frame to camera frame.
+@param[in] t_target2cam Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the target frame to the camera frame (\f$_{}^{c}\textrm{T}_t\f$).
+This is a vector (`vector<Mat>`) that contains the translation vectors for all the transformations
+from calibration target frame to camera frame.
+@param[out] R_cam2gripper Estimated rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the camera frame to the gripper frame (\f$_{}^{g}\textrm{T}_c\f$).
+@param[out] t_cam2gripper Estimated translation part extracted from the homogeneous matrix that transforms a point
+expressed in the camera frame to the gripper frame (\f$_{}^{g}\textrm{T}_c\f$).
+@param[in] method One of the implemented Hand-Eye calibration method, see cv::HandEyeCalibrationMethod
+
+The function performs the Hand-Eye calibration using various methods. One approach consists in estimating the
+rotation then the translation (separable solutions) and the following methods are implemented:
+  - R. Tsai, R. Lenz A New Technique for Fully Autonomous and Efficient 3D Robotics Hand/EyeCalibration \cite Tsai89
+  - F. Park, B. Martin Robot Sensor Calibration: Solving AX = XB on the Euclidean Group \cite Park94
+  - R. Horaud, F. Dornaika Hand-Eye Calibration \cite Horaud95
+
+Another approach consists in estimating simultaneously the rotation and the translation (simultaneous solutions),
+with the following implemented method:
+  - N. Andreff, R. Horaud, B. Espiau On-line Hand-Eye Calibration \cite Andreff99
+  - K. Daniilidis Hand-Eye Calibration Using Dual Quaternions \cite Daniilidis98
+
+The following picture describes the Hand-Eye calibration problem where the transformation between a camera ("eye")
+mounted on a robot gripper ("hand") has to be estimated.
+
+![](pics/hand-eye_figure.png)
+
+The calibration procedure is the following:
+  - a static calibration pattern is used to estimate the transformation between the target frame
+  and the camera frame
+  - the robot gripper is moved in order to acquire several poses
+  - for each pose, the homogeneous transformation between the gripper frame and the robot base frame is recorded using for
+  instance the robot kinematics
+\f[
+    \begin{bmatrix}
+    X_b\\
+    Y_b\\
+    Z_b\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{b}\textrm{R}_g & _{}^{b}\textrm{t}_g \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_g\\
+    Y_g\\
+    Z_g\\
+    1
+    \end{bmatrix}
+\f]
+  - for each pose, the homogeneous transformation between the calibration target frame and the camera frame is recorded using
+  for instance a pose estimation method (PnP) from 2D-3D point correspondences
+\f[
+    \begin{bmatrix}
+    X_c\\
+    Y_c\\
+    Z_c\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{c}\textrm{R}_t & _{}^{c}\textrm{t}_t \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_t\\
+    Y_t\\
+    Z_t\\
+    1
+    \end{bmatrix}
+\f]
+
+The Hand-Eye calibration procedure returns the following homogeneous transformation
+\f[
+    \begin{bmatrix}
+    X_g\\
+    Y_g\\
+    Z_g\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{g}\textrm{R}_c & _{}^{g}\textrm{t}_c \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_c\\
+    Y_c\\
+    Z_c\\
+    1
+    \end{bmatrix}
+\f]
+
+This problem is also known as solving the \f$\mathbf{A}\mathbf{X}=\mathbf{X}\mathbf{B}\f$ equation:
+\f[
+    \begin{align*}
+    ^{b}{\textrm{T}_g}^{(1)} \hspace{0.2em} ^{g}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(1)} &=
+    \hspace{0.1em} ^{b}{\textrm{T}_g}^{(2)} \hspace{0.2em} ^{g}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(2)} \\
+
+    (^{b}{\textrm{T}_g}^{(2)})^{-1} \hspace{0.2em} ^{b}{\textrm{T}_g}^{(1)} \hspace{0.2em} ^{g}\textrm{T}_c &=
+    \hspace{0.1em} ^{g}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(2)} (^{c}{\textrm{T}_t}^{(1)})^{-1} \\
+
+    \textrm{A}_i \textrm{X} &= \textrm{X} \textrm{B}_i \\
+    \end{align*}
+\f]
+
+\note
+Additional information can be found on this [website](http://campar.in.tum.de/Chair/HandEyeCalibration).
+\note
+A minimum of 2 motions with non parallel rotation axes are necessary to determine the hand-eye transformation.
+So at least 3 different poses are required, but it is strongly recommended to use many more poses.
+
+ */
+CV_EXPORTS_W void calibrateHandEye( InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gripper2base,
+                                    InputArrayOfArrays R_target2cam, InputArrayOfArrays t_target2cam,
+                                    OutputArray R_cam2gripper, OutputArray t_cam2gripper,
+                                    HandEyeCalibrationMethod method=CALIB_HAND_EYE_TSAI );
+
 /** @brief Converts points from Euclidean to homogeneous space.
 
 @param src Input vector of N-dimensional points.
diff --git a/modules/calib3d/src/calibration_handeye.cpp b/modules/calib3d/src/calibration_handeye.cpp
new file mode 100644
index 0000000000..18561c77fe
--- /dev/null
+++ b/modules/calib3d/src/calibration_handeye.cpp
@@ -0,0 +1,770 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include "opencv2/calib3d.hpp"
+
+namespace cv {
+
+static Mat homogeneousInverse(const Mat& T)
+{
+    CV_Assert(T.rows == 4 && T.cols == 4);
+
+    Mat R = T(Rect(0, 0, 3, 3));
+    Mat t = T(Rect(3, 0, 1, 3));
+    Mat Rt = R.t();
+    Mat tinv = -Rt * t;
+    Mat Tinv = Mat::eye(4, 4, T.type());
+    Rt.copyTo(Tinv(Rect(0, 0, 3, 3)));
+    tinv.copyTo(Tinv(Rect(3, 0, 1, 3)));
+
+    return Tinv;
+}
+
+// q = rot2quatMinimal(R)
+//
+// R - 3x3 rotation matrix, or 4x4 homogeneous matrix
+// q - 3x1 unit quaternion <qx, qy, qz>
+// q = sin(theta/2) * v
+// theta - rotation angle
+// v     - unit rotation axis, |v| = 1
+static Mat rot2quatMinimal(const Mat& R)
+{
+    CV_Assert(R.type() == CV_64FC1 && R.rows >= 3 && R.cols >= 3);
+
+    double m00 = R.at<double>(0,0), m01 = R.at<double>(0,1), m02 = R.at<double>(0,2);
+    double m10 = R.at<double>(1,0), m11 = R.at<double>(1,1), m12 = R.at<double>(1,2);
+    double m20 = R.at<double>(2,0), m21 = R.at<double>(2,1), m22 = R.at<double>(2,2);
+    double trace = m00 + m11 + m22;
+
+    double qx, qy, qz;
+    if (trace > 0) {
+        double S = sqrt(trace + 1.0) * 2; // S=4*qw
+        qx = (m21 - m12) / S;
+        qy = (m02 - m20) / S;
+        qz = (m10 - m01) / S;
+    } else if ((m00 > m11)&(m00 > m22)) {
+        double S = sqrt(1.0 + m00 - m11 - m22) * 2; // S=4*qx
+        qx = 0.25 * S;
+        qy = (m01 + m10) / S;
+        qz = (m02 + m20) / S;
+    } else if (m11 > m22) {
+        double S = sqrt(1.0 + m11 - m00 - m22) * 2; // S=4*qy
+        qx = (m01 + m10) / S;
+        qy = 0.25 * S;
+        qz = (m12 + m21) / S;
+    } else {
+        double S = sqrt(1.0 + m22 - m00 - m11) * 2; // S=4*qz
+        qx = (m02 + m20) / S;
+        qy = (m12 + m21) / S;
+        qz = 0.25 * S;
+    }
+
+    return (Mat_<double>(3,1) << qx, qy, qz);
+}
+
+static Mat skew(const Mat& v)
+{
+    CV_Assert(v.type() == CV_64FC1 && v.rows == 3 && v.cols == 1);
+
+    double vx = v.at<double>(0,0);
+    double vy = v.at<double>(1,0);
+    double vz = v.at<double>(2,0);
+    return (Mat_<double>(3,3) << 0, -vz, vy,
+                                vz, 0, -vx,
+                                -vy, vx, 0);
+}
+
+// R = quatMinimal2rot(q)
+//
+// q - 3x1 unit quaternion <qx, qy, qz>
+// R - 3x3 rotation matrix
+// q = sin(theta/2) * v
+// theta - rotation angle
+// v     - unit rotation axis, |v| = 1
+static Mat quatMinimal2rot(const Mat& q)
+{
+    CV_Assert(q.type() == CV_64FC1 && q.rows == 3 && q.cols == 1);
+
+    Mat p = q.t()*q;
+    double w = sqrt(1 - p.at<double>(0,0));
+
+    Mat diag_p = Mat::eye(3,3,CV_64FC1)*p.at<double>(0,0);
+    return 2*q*q.t() + 2*w*skew(q) + Mat::eye(3,3,CV_64FC1) - 2*diag_p;
+}
+
+// q = rot2quat(R)
+//
+// q - 4x1 unit quaternion <qw, qx, qy, qz>
+// R - 3x3 rotation matrix
+static Mat rot2quat(const Mat& R)
+{
+    CV_Assert(R.type() == CV_64FC1 && R.rows >= 3 && R.cols >= 3);
+
+    double m00 = R.at<double>(0,0), m01 = R.at<double>(0,1), m02 = R.at<double>(0,2);
+    double m10 = R.at<double>(1,0), m11 = R.at<double>(1,1), m12 = R.at<double>(1,2);
+    double m20 = R.at<double>(2,0), m21 = R.at<double>(2,1), m22 = R.at<double>(2,2);
+    double trace = m00 + m11 + m22;
+
+    double qw, qx, qy, qz;
+    if (trace > 0) {
+        double S = sqrt(trace + 1.0) * 2; // S=4*qw
+        qw = 0.25 * S;
+        qx = (m21 - m12) / S;
+        qy = (m02 - m20) / S;
+        qz = (m10 - m01) / S;
+    } else if ((m00 > m11)&(m00 > m22)) {
+        double S = sqrt(1.0 + m00 - m11 - m22) * 2; // S=4*qx
+        qw = (m21 - m12) / S;
+        qx = 0.25 * S;
+        qy = (m01 + m10) / S;
+        qz = (m02 + m20) / S;
+    } else if (m11 > m22) {
+        double S = sqrt(1.0 + m11 - m00 - m22) * 2; // S=4*qy
+        qw = (m02 - m20) / S;
+        qx = (m01 + m10) / S;
+        qy = 0.25 * S;
+        qz = (m12 + m21) / S;
+    } else {
+        double S = sqrt(1.0 + m22 - m00 - m11) * 2; // S=4*qz
+        qw = (m10 - m01) / S;
+        qx = (m02 + m20) / S;
+        qy = (m12 + m21) / S;
+        qz = 0.25 * S;
+    }
+
+    return (Mat_<double>(4,1) << qw, qx, qy, qz);
+}
+
+// R = quat2rot(q)
+//
+// q - 4x1 unit quaternion <qw, qx, qy, qz>
+// R - 3x3 rotation matrix
+static Mat quat2rot(const Mat& q)
+{
+    CV_Assert(q.type() == CV_64FC1 && q.rows == 4 && q.cols == 1);
+
+    double qw = q.at<double>(0,0);
+    double qx = q.at<double>(1,0);
+    double qy = q.at<double>(2,0);
+    double qz = q.at<double>(3,0);
+
+    Mat R(3, 3, CV_64FC1);
+    R.at<double>(0, 0) = 1 - 2*qy*qy - 2*qz*qz;
+    R.at<double>(0, 1) = 2*qx*qy - 2*qz*qw;
+    R.at<double>(0, 2) = 2*qx*qz + 2*qy*qw;
+
+    R.at<double>(1, 0) = 2*qx*qy + 2*qz*qw;
+    R.at<double>(1, 1) = 1 - 2*qx*qx - 2*qz*qz;
+    R.at<double>(1, 2) = 2*qy*qz - 2*qx*qw;
+
+    R.at<double>(2, 0) = 2*qx*qz - 2*qy*qw;
+    R.at<double>(2, 1) = 2*qy*qz + 2*qx*qw;
+    R.at<double>(2, 2) = 1 - 2*qx*qx - 2*qy*qy;
+
+    return R;
+}
+
+// Kronecker product or tensor product
+// https://stackoverflow.com/a/36552682
+static Mat kron(const Mat& A, const Mat& B)
+{
+    CV_Assert(A.channels() == 1 && B.channels() == 1);
+
+    Mat1d Ad, Bd;
+    A.convertTo(Ad, CV_64F);
+    B.convertTo(Bd, CV_64F);
+
+    Mat1d Kd(Ad.rows * Bd.rows, Ad.cols * Bd.cols, 0.0);
+    for (int ra = 0; ra < Ad.rows; ra++)
+    {
+        for (int ca = 0; ca < Ad.cols; ca++)
+        {
+            Kd(Range(ra*Bd.rows, (ra + 1)*Bd.rows), Range(ca*Bd.cols, (ca + 1)*Bd.cols)) = Bd.mul(Ad(ra, ca));
+        }
+    }
+
+    Mat K;
+    Kd.convertTo(K, A.type());
+    return K;
+}
+
+// quaternion multiplication
+static Mat qmult(const Mat& s, const Mat& t)
+{
+    CV_Assert(s.type() == CV_64FC1 && t.type() == CV_64FC1);
+    CV_Assert(s.rows == 4 && s.cols == 1);
+    CV_Assert(t.rows == 4 && t.cols == 1);
+
+    double s0 = s.at<double>(0,0);
+    double s1 = s.at<double>(1,0);
+    double s2 = s.at<double>(2,0);
+    double s3 = s.at<double>(3,0);
+
+    double t0 = t.at<double>(0,0);
+    double t1 = t.at<double>(1,0);
+    double t2 = t.at<double>(2,0);
+    double t3 = t.at<double>(3,0);
+
+    Mat q(4, 1, CV_64FC1);
+    q.at<double>(0,0) = s0*t0 - s1*t1 - s2*t2 - s3*t3;
+    q.at<double>(1,0) = s0*t1 + s1*t0 + s2*t3 - s3*t2;
+    q.at<double>(2,0) = s0*t2 - s1*t3 + s2*t0 + s3*t1;
+    q.at<double>(3,0) = s0*t3 + s1*t2 - s2*t1 + s3*t0;
+
+    return q;
+}
+
+// dq = homogeneous2dualQuaternion(H)
+//
+// H  - 4x4 homogeneous transformation: [R | t; 0 0 0 | 1]
+// dq - 8x1 dual quaternion: <q (rotation part), qprime (translation part)>
+static Mat homogeneous2dualQuaternion(const Mat& H)
+{
+    CV_Assert(H.type() == CV_64FC1 && H.rows == 4 && H.cols == 4);
+
+    Mat dualq(8, 1, CV_64FC1);
+    Mat R = H(Rect(0, 0, 3, 3));
+    Mat t = H(Rect(3, 0, 1, 3));
+
+    Mat q = rot2quat(R);
+    Mat qt = Mat::zeros(4, 1, CV_64FC1);
+    t.copyTo(qt(Rect(0, 1, 1, 3)));
+    Mat qprime = 0.5 * qmult(qt, q);
+
+    q.copyTo(dualq(Rect(0, 0, 1, 4)));
+    qprime.copyTo(dualq(Rect(0, 4, 1, 4)));
+
+    return dualq;
+}
+
+// H = dualQuaternion2homogeneous(dq)
+//
+// H  - 4x4 homogeneous transformation: [R | t; 0 0 0 | 1]
+// dq - 8x1 dual quaternion: <q (rotation part), qprime (translation part)>
+static Mat dualQuaternion2homogeneous(const Mat& dualq)
+{
+    CV_Assert(dualq.type() == CV_64FC1 && dualq.rows == 8 && dualq.cols == 1);
+
+    Mat q = dualq(Rect(0, 0, 1, 4));
+    Mat qprime = dualq(Rect(0, 4, 1, 4));
+
+    Mat R = quat2rot(q);
+    q.at<double>(1,0) = -q.at<double>(1,0);
+    q.at<double>(2,0) = -q.at<double>(2,0);
+    q.at<double>(3,0) = -q.at<double>(3,0);
+
+    Mat qt = 2*qmult(qprime, q);
+    Mat t = qt(Rect(0, 1, 1, 3));
+
+    Mat H = Mat::eye(4, 4, CV_64FC1);
+    R.copyTo(H(Rect(0, 0, 3, 3)));
+    t.copyTo(H(Rect(3, 0, 1, 3)));
+
+    return H;
+}
+
+//Reference:
+//R. Y. Tsai and R. K. Lenz, "A new technique for fully autonomous and efficient 3D robotics hand/eye calibration."
+//In IEEE Transactions on Robotics and Automation, vol. 5, no. 3, pp. 345-358, June 1989.
+//C++ code converted from Zoran Lazarevic's Matlab code:
+//http://lazax.com/www.cs.columbia.edu/~laza/html/Stewart/matlab/handEye.m
+static void calibrateHandEyeTsai(const std::vector<Mat>& Hg, const std::vector<Mat>& Hc,
+                                 Mat& R_cam2gripper, Mat& t_cam2gripper)
+{
+    //Number of unique camera position pairs
+    int K = static_cast<int>((Hg.size()*Hg.size() - Hg.size()) / 2.0);
+    //Will store: skew(Pgij+Pcij)
+    Mat A(3*K, 3, CV_64FC1);
+    //Will store: Pcij - Pgij
+    Mat B(3*K, 1, CV_64FC1);
+
+    std::vector<Mat> vec_Hgij, vec_Hcij;
+    vec_Hgij.reserve(static_cast<size_t>(K));
+    vec_Hcij.reserve(static_cast<size_t>(K));
+
+    int idx = 0;
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        {
+            //Defines coordinate transformation from Gi to Gj
+            //Hgi is from Gi (gripper) to RW (robot base)
+            //Hgj is from Gj (gripper) to RW (robot base)
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i]; //eq 6
+            vec_Hgij.push_back(Hgij);
+            //Rotation axis for Rgij which is the 3D rotation from gripper coordinate frame Gi to Gj
+            Mat Pgij = 2*rot2quatMinimal(Hgij);
+
+            //Defines coordinate transformation from Ci to Cj
+            //Hci is from CW (calibration target) to Ci (camera)
+            //Hcj is from CW (calibration target) to Cj (camera)
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]); //eq 7
+            vec_Hcij.push_back(Hcij);
+            //Rotation axis for Rcij
+            Mat Pcij = 2*rot2quatMinimal(Hcij);
+
+            //Left-hand side: skew(Pgij+Pcij)
+            skew(Pgij+Pcij).copyTo(A(Rect(0, idx*3, 3, 3)));
+            //Right-hand side: Pcij - Pgij
+            Mat diff = Pcij - Pgij;
+            diff.copyTo(B(Rect(0, idx*3, 1, 3)));
+        }
+    }
+
+    Mat Pcg_;
+    //Rotation from camera to gripper is obtained from the set of equations:
+    //    skew(Pgij+Pcij) * Pcg_ = Pcij - Pgij    (eq 12)
+    solve(A, B, Pcg_, DECOMP_SVD);
+
+    Mat Pcg_norm = Pcg_.t() * Pcg_;
+    //Obtained non-unit quaternion is scaled back to unit value that
+    //designates camera-gripper rotation
+    Mat Pcg = 2 * Pcg_ / sqrt(1 + Pcg_norm.at<double>(0,0)); //eq 14
+
+    Mat Rcg = quatMinimal2rot(Pcg/2.0);
+
+    idx = 0;
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        {
+            //Defines coordinate transformation from Gi to Gj
+            //Hgi is from Gi (gripper) to RW (robot base)
+            //Hgj is from Gj (gripper) to RW (robot base)
+            Mat Hgij = vec_Hgij[static_cast<size_t>(idx)];
+            //Defines coordinate transformation from Ci to Cj
+            //Hci is from CW (calibration target) to Ci (camera)
+            //Hcj is from CW (calibration target) to Cj (camera)
+            Mat Hcij = vec_Hcij[static_cast<size_t>(idx)];
+
+            //Left-hand side: (Rgij - I)
+            Mat diff = Hgij(Rect(0,0,3,3)) - Mat::eye(3,3,CV_64FC1);
+            diff.copyTo(A(Rect(0, idx*3, 3, 3)));
+
+            //Right-hand side: Rcg*Tcij - Tgij
+            diff = Rcg*Hcij(Rect(3, 0, 1, 3)) - Hgij(Rect(3, 0, 1, 3));
+            diff.copyTo(B(Rect(0, idx*3, 1, 3)));
+        }
+    }
+
+    Mat Tcg;
+    //Translation from camera to gripper is obtained from the set of equations:
+    //    (Rgij - I) * Tcg = Rcg*Tcij - Tgij    (eq 15)
+    solve(A, B, Tcg, DECOMP_SVD);
+
+    R_cam2gripper = Rcg;
+    t_cam2gripper = Tcg;
+}
+
+//Reference:
+//F. Park, B. Martin, "Robot Sensor Calibration: Solving AX = XB on the Euclidean Group."
+//In IEEE Transactions on Robotics and Automation, 10(5): 717-721, 1994.
+//Matlab code: http://math.loyola.edu/~mili/Calibration/
+static void calibrateHandEyePark(const std::vector<Mat>& Hg, const std::vector<Mat>& Hc,
+                                 Mat& R_cam2gripper, Mat& t_cam2gripper)
+{
+    Mat M = Mat::zeros(3, 3, CV_64FC1);
+
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++)
+        {
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i];
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
+
+            Mat Rgij = Hgij(Rect(0, 0, 3, 3));
+            Mat Rcij = Hcij(Rect(0, 0, 3, 3));
+
+            Mat a, b;
+            Rodrigues(Rgij, a);
+            Rodrigues(Rcij, b);
+
+            M += b * a.t();
+        }
+    }
+
+    Mat eigenvalues, eigenvectors;
+    eigen(M.t()*M, eigenvalues, eigenvectors);
+
+    Mat v = Mat::zeros(3, 3, CV_64FC1);
+    for (int i = 0; i < 3; i++) {
+        v.at<double>(i,i) = 1.0 / sqrt(eigenvalues.at<double>(i,0));
+    }
+
+    Mat R = eigenvectors.t() * v * eigenvectors * M.t();
+    R_cam2gripper = R;
+
+    int K = static_cast<int>((Hg.size()*Hg.size() - Hg.size()) / 2.0);
+    Mat C(3*K, 3, CV_64FC1);
+    Mat d(3*K, 1, CV_64FC1);
+    Mat I3 = Mat::eye(3, 3, CV_64FC1);
+
+    int idx = 0;
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        {
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i];
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
+
+            Mat Rgij = Hgij(Rect(0, 0, 3, 3));
+
+            Mat tgij = Hgij(Rect(3, 0, 1, 3));
+            Mat tcij = Hcij(Rect(3, 0, 1, 3));
+
+            Mat I_tgij = I3 - Rgij;
+            I_tgij.copyTo(C(Rect(0, 3*idx, 3, 3)));
+
+            Mat A_RB = tgij - R*tcij;
+            A_RB.copyTo(d(Rect(0, 3*idx, 1, 3)));
+        }
+    }
+
+    Mat t;
+    solve(C, d, t, DECOMP_SVD);
+    t_cam2gripper = t;
+}
+
+//Reference:
+//R. Horaud, F. Dornaika, "Hand-Eye Calibration"
+//In International Journal of Robotics Research, 14(3): 195-210, 1995.
+//Matlab code: http://math.loyola.edu/~mili/Calibration/
+static void calibrateHandEyeHoraud(const std::vector<Mat>& Hg, const std::vector<Mat>& Hc,
+                                   Mat& R_cam2gripper, Mat& t_cam2gripper)
+{
+    Mat A = Mat::zeros(4, 4, CV_64FC1);
+
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++)
+        {
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i];
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
+
+            Mat Rgij = Hgij(Rect(0, 0, 3, 3));
+            Mat Rcij = Hcij(Rect(0, 0, 3, 3));
+
+            Mat qgij = rot2quat(Rgij);
+            double r0 = qgij.at<double>(0,0);
+            double rx = qgij.at<double>(1,0);
+            double ry = qgij.at<double>(2,0);
+            double rz = qgij.at<double>(3,0);
+
+            // Q(r) Appendix A
+            Matx44d Qvi(r0, -rx, -ry, -rz,
+                        rx,  r0, -rz,  ry,
+                        ry,  rz,  r0, -rx,
+                        rz, -ry,  rx,  r0);
+
+            Mat qcij = rot2quat(Rcij);
+            r0 = qcij.at<double>(0,0);
+            rx = qcij.at<double>(1,0);
+            ry = qcij.at<double>(2,0);
+            rz = qcij.at<double>(3,0);
+
+            // W(r) Appendix A
+            Matx44d Wvi(r0, -rx, -ry, -rz,
+                        rx,  r0,  rz, -ry,
+                        ry, -rz,  r0,  rx,
+                        rz,  ry, -rx,  r0);
+
+            // Ai = (Q(vi') - W(vi))^T (Q(vi') - W(vi))
+            A += (Qvi - Wvi).t() * (Qvi - Wvi);
+        }
+    }
+
+    Mat eigenvalues, eigenvectors;
+    eigen(A, eigenvalues, eigenvectors);
+
+    Mat R = quat2rot(eigenvectors.row(3).t());
+    R_cam2gripper = R;
+
+    int K = static_cast<int>((Hg.size()*Hg.size() - Hg.size()) / 2.0);
+    Mat C(3*K, 3, CV_64FC1);
+    Mat d(3*K, 1, CV_64FC1);
+    Mat I3 = Mat::eye(3, 3, CV_64FC1);
+
+    int idx = 0;
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        {
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i];
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
+
+            Mat Rgij = Hgij(Rect(0, 0, 3, 3));
+
+            Mat tgij = Hgij(Rect(3, 0, 1, 3));
+            Mat tcij = Hcij(Rect(3, 0, 1, 3));
+
+            Mat I_tgij = I3 - Rgij;
+            I_tgij.copyTo(C(Rect(0, 3*idx, 3, 3)));
+
+            Mat A_RB = tgij - R*tcij;
+            A_RB.copyTo(d(Rect(0, 3*idx, 1, 3)));
+        }
+    }
+
+    Mat t;
+    solve(C, d, t, DECOMP_SVD);
+    t_cam2gripper = t;
+}
+
+// sign function, return -1 if negative values, +1 otherwise
+static int sign_double(double val)
+{
+    return (0 < val) - (val < 0);
+}
+
+//Reference:
+//N. Andreff, R. Horaud, B. Espiau, "On-line Hand-Eye Calibration."
+//In Second International Conference on 3-D Digital Imaging and Modeling (3DIM'99), pages 430-436, 1999.
+//Matlab code: http://math.loyola.edu/~mili/Calibration/
+static void calibrateHandEyeAndreff(const std::vector<Mat>& Hg, const std::vector<Mat>& Hc,
+                                    Mat& R_cam2gripper, Mat& t_cam2gripper)
+{
+    int K = static_cast<int>((Hg.size()*Hg.size() - Hg.size()) / 2.0);
+    Mat A(12*K, 12, CV_64FC1);
+    Mat B(12*K, 1, CV_64FC1);
+
+    Mat I9 = Mat::eye(9, 9, CV_64FC1);
+    Mat I3 = Mat::eye(3, 3, CV_64FC1);
+    Mat O9x3 = Mat::zeros(9, 3, CV_64FC1);
+    Mat O9x1 = Mat::zeros(9, 1, CV_64FC1);
+
+    int idx = 0;
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        {
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i];
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
+
+            Mat Rgij = Hgij(Rect(0, 0, 3, 3));
+            Mat Rcij = Hcij(Rect(0, 0, 3, 3));
+
+            Mat tgij = Hgij(Rect(3, 0, 1, 3));
+            Mat tcij = Hcij(Rect(3, 0, 1, 3));
+
+            //Eq 10
+            Mat a00 = I9 - kron(Rgij, Rcij);
+            Mat a01 = O9x3;
+            Mat a10 = kron(I3, tcij.t());
+            Mat a11 = I3 - Rgij;
+
+            a00.copyTo(A(Rect(0, idx*12, 9, 9)));
+            a01.copyTo(A(Rect(9, idx*12, 3, 9)));
+            a10.copyTo(A(Rect(0, idx*12 + 9, 9, 3)));
+            a11.copyTo(A(Rect(9, idx*12 + 9, 3, 3)));
+
+            O9x1.copyTo(B(Rect(0, idx*12, 1, 9)));
+            tgij.copyTo(B(Rect(0, idx*12 + 9, 1, 3)));
+        }
+    }
+
+    Mat X;
+    solve(A, B, X, DECOMP_SVD);
+
+    Mat R = X(Rect(0, 0, 1, 9));
+    int newSize[] = {3, 3};
+    R = R.reshape(1, 2, newSize);
+    //Eq 15
+    double det = determinant(R);
+    R = pow(sign_double(det) / abs(det), 1.0/3.0) * R;
+
+    Mat w, u, vt;
+    SVDecomp(R, w, u, vt);
+    R = u*vt;
+
+    if (determinant(R) < 0)
+    {
+        Mat diag = (Mat_<double>(3,3) << 1.0, 0.0, 0.0,
+                                         0.0, 1.0, 0.0,
+                                         0.0, 0.0, -1.0);
+        R = u*diag*vt;
+    }
+
+    R_cam2gripper = R;
+
+    Mat t = X(Rect(0, 9, 1, 3));
+    t_cam2gripper = t;
+}
+
+//Reference:
+//K. Daniilidis, "Hand-Eye Calibration Using Dual Quaternions."
+//In The International Journal of Robotics Research,18(3): 286-298, 1998.
+//Matlab code: http://math.loyola.edu/~mili/Calibration/
+static void calibrateHandEyeDaniilidis(const std::vector<Mat>& Hg, const std::vector<Mat>& Hc,
+                                       Mat& R_cam2gripper, Mat& t_cam2gripper)
+{
+    int K = static_cast<int>((Hg.size()*Hg.size() - Hg.size()) / 2.0);
+    Mat T = Mat::zeros(6*K, 8, CV_64FC1);
+
+    int idx = 0;
+    for (size_t i = 0; i < Hg.size(); i++)
+    {
+        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        {
+            Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i];
+            Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
+
+            Mat dualqa = homogeneous2dualQuaternion(Hgij);
+            Mat dualqb = homogeneous2dualQuaternion(Hcij);
+
+            Mat a = dualqa(Rect(0, 1, 1, 3));
+            Mat b = dualqb(Rect(0, 1, 1, 3));
+
+            Mat aprime = dualqa(Rect(0, 5, 1, 3));
+            Mat bprime = dualqb(Rect(0, 5, 1, 3));
+
+            //Eq 31
+            Mat s00 = a - b;
+            Mat s01 = skew(a + b);
+            Mat s10 = aprime - bprime;
+            Mat s11 = skew(aprime + bprime);
+            Mat s12 = a - b;
+            Mat s13 = skew(a + b);
+
+            s00.copyTo(T(Rect(0, idx*6, 1, 3)));
+            s01.copyTo(T(Rect(1, idx*6, 3, 3)));
+            s10.copyTo(T(Rect(0, idx*6 + 3, 1, 3)));
+            s11.copyTo(T(Rect(1, idx*6 + 3, 3, 3)));
+            s12.copyTo(T(Rect(4, idx*6 + 3, 1, 3)));
+            s13.copyTo(T(Rect(5, idx*6 + 3, 3, 3)));
+        }
+    }
+
+    Mat w, u, vt;
+    SVDecomp(T, w, u, vt);
+    Mat v = vt.t();
+
+    Mat u1 = v(Rect(6, 0, 1, 4));
+    Mat v1 = v(Rect(6, 4, 1, 4));
+    Mat u2 = v(Rect(7, 0, 1, 4));
+    Mat v2 = v(Rect(7, 4, 1, 4));
+
+    //Solves Eq 34, Eq 35
+    Mat ma = u1.t()*v1;
+    Mat mb = u1.t()*v2 + u2.t()*v1;
+    Mat mc = u2.t()*v2;
+
+    double a = ma.at<double>(0,0);
+    double b = mb.at<double>(0,0);
+    double c = mc.at<double>(0,0);
+
+    double s1 = (-b + sqrt(b*b - 4*a*c)) / (2*a);
+    double s2 = (-b - sqrt(b*b - 4*a*c)) / (2*a);
+
+    Mat sol1 = s1*s1*u1.t()*u1 + 2*s1*u1.t()*u2 + u2.t()*u2;
+    Mat sol2 = s2*s2*u1.t()*u1 + 2*s2*u1.t()*u2 + u2.t()*u2;
+    double s, val;
+    if (sol1.at<double>(0,0) > sol2.at<double>(0,0))
+    {
+        s = s1;
+        val = sol1.at<double>(0,0);
+    }
+    else
+    {
+        s = s2;
+        val = sol2.at<double>(0,0);
+    }
+
+    double lambda2 = sqrt(1.0 / val);
+    double lambda1 = s * lambda2;
+
+    Mat dualq = lambda1 * v(Rect(6, 0, 1, 8)) + lambda2*v(Rect(7, 0, 1, 8));
+    Mat X = dualQuaternion2homogeneous(dualq);
+
+    Mat R = X(Rect(0, 0, 3, 3));
+    Mat t = X(Rect(3, 0, 1, 3));
+    R_cam2gripper = R;
+    t_cam2gripper = t;
+}
+
+void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gripper2base,
+                      InputArrayOfArrays R_target2cam, InputArrayOfArrays t_target2cam,
+                      OutputArray R_cam2gripper, OutputArray t_cam2gripper,
+                      HandEyeCalibrationMethod method)
+{
+    CV_Assert(R_gripper2base.isMatVector() && t_gripper2base.isMatVector() &&
+              R_target2cam.isMatVector() && t_target2cam.isMatVector());
+
+    std::vector<Mat> R_gripper2base_, t_gripper2base_;
+    R_gripper2base.getMatVector(R_gripper2base_);
+    t_gripper2base.getMatVector(t_gripper2base_);
+
+    std::vector<Mat> R_target2cam_, t_target2cam_;
+    R_target2cam.getMatVector(R_target2cam_);
+    t_target2cam.getMatVector(t_target2cam_);
+
+    CV_Assert(R_gripper2base_.size() == t_gripper2base_.size() &&
+              R_target2cam_.size() == t_target2cam_.size() &&
+              R_gripper2base_.size() == R_target2cam_.size());
+    CV_Assert(R_gripper2base_.size() >= 3);
+
+    //Notation used in Tsai paper
+    //Defines coordinate transformation from G (gripper) to RW (robot base)
+    std::vector<Mat> Hg;
+    Hg.reserve(R_gripper2base_.size());
+    for (size_t i = 0; i < R_gripper2base_.size(); i++)
+    {
+        Mat m = Mat::eye(4, 4, CV_64FC1);
+        Mat R = m(Rect(0, 0, 3, 3));
+        R_gripper2base_[i].convertTo(R, CV_64F);
+
+        Mat t = m(Rect(3, 0, 1, 3));
+        t_gripper2base_[i].convertTo(t, CV_64F);
+
+        Hg.push_back(m);
+    }
+
+    //Defines coordinate transformation from CW (calibration target) to C (camera)
+    std::vector<Mat> Hc;
+    Hc.reserve(R_target2cam_.size());
+    for (size_t i = 0; i < R_target2cam_.size(); i++)
+    {
+        Mat m = Mat::eye(4, 4, CV_64FC1);
+        Mat R = m(Rect(0, 0, 3, 3));
+        R_target2cam_[i].convertTo(R, CV_64F);
+
+        Mat t = m(Rect(3, 0, 1, 3));
+        t_target2cam_[i].convertTo(t, CV_64F);
+
+        Hc.push_back(m);
+    }
+
+    Mat Rcg = Mat::eye(3, 3, CV_64FC1);
+    Mat Tcg = Mat::zeros(3, 1, CV_64FC1);
+
+    switch (method)
+    {
+    case CALIB_HAND_EYE_TSAI:
+        calibrateHandEyeTsai(Hg, Hc, Rcg, Tcg);
+        break;
+
+    case CALIB_HAND_EYE_PARK:
+        calibrateHandEyePark(Hg, Hc, Rcg, Tcg);
+        break;
+
+    case CALIB_HAND_EYE_HORAUD:
+        calibrateHandEyeHoraud(Hg, Hc, Rcg, Tcg);
+        break;
+
+    case CALIB_HAND_EYE_ANDREFF:
+        calibrateHandEyeAndreff(Hg, Hc, Rcg, Tcg);
+        break;
+
+    case CALIB_HAND_EYE_DANIILIDIS:
+        calibrateHandEyeDaniilidis(Hg, Hc, Rcg, Tcg);
+        break;
+
+    default:
+        break;
+    }
+
+    Rcg.copyTo(R_cam2gripper);
+    Tcg.copyTo(t_cam2gripper);
+}
+}
diff --git a/modules/calib3d/test/test_calibration_hand_eye.cpp b/modules/calib3d/test/test_calibration_hand_eye.cpp
new file mode 100644
index 0000000000..d2cef969b3
--- /dev/null
+++ b/modules/calib3d/test/test_calibration_hand_eye.cpp
@@ -0,0 +1,381 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include "opencv2/calib3d.hpp"
+
+namespace opencv_test { namespace {
+
+class CV_CalibrateHandEyeTest : public cvtest::BaseTest
+{
+public:
+    CV_CalibrateHandEyeTest() {
+        eps_rvec[CALIB_HAND_EYE_TSAI] = 1.0e-8;
+        eps_rvec[CALIB_HAND_EYE_PARK] = 1.0e-8;
+        eps_rvec[CALIB_HAND_EYE_HORAUD] = 1.0e-8;
+        eps_rvec[CALIB_HAND_EYE_ANDREFF] = 1.0e-8;
+        eps_rvec[CALIB_HAND_EYE_DANIILIDIS] = 1.0e-8;
+
+        eps_tvec[CALIB_HAND_EYE_TSAI] = 1.0e-8;
+        eps_tvec[CALIB_HAND_EYE_PARK] = 1.0e-8;
+        eps_tvec[CALIB_HAND_EYE_HORAUD] = 1.0e-8;
+        eps_tvec[CALIB_HAND_EYE_ANDREFF] = 1.0e-8;
+        eps_tvec[CALIB_HAND_EYE_DANIILIDIS] = 1.0e-8;
+
+        eps_rvec_noise[CALIB_HAND_EYE_TSAI] = 2.0e-2;
+        eps_rvec_noise[CALIB_HAND_EYE_PARK] = 2.0e-2;
+        eps_rvec_noise[CALIB_HAND_EYE_HORAUD] = 2.0e-2;
+        eps_rvec_noise[CALIB_HAND_EYE_ANDREFF] = 1.0e-2;
+        eps_rvec_noise[CALIB_HAND_EYE_DANIILIDIS] = 1.0e-2;
+
+        eps_tvec_noise[CALIB_HAND_EYE_TSAI] = 5.0e-2;
+        eps_tvec_noise[CALIB_HAND_EYE_PARK] = 5.0e-2;
+        eps_tvec_noise[CALIB_HAND_EYE_HORAUD] = 5.0e-2;
+        eps_tvec_noise[CALIB_HAND_EYE_ANDREFF] = 5.0e-2;
+        eps_tvec_noise[CALIB_HAND_EYE_DANIILIDIS] = 5.0e-2;
+    }
+protected:
+    virtual void run(int);
+    void generatePose(RNG& rng, double min_theta, double max_theta,
+                      double min_tx, double max_tx,
+                      double min_ty, double max_ty,
+                      double min_tz, double max_tz,
+                      Mat& R, Mat& tvec,
+                      bool randSign=false);
+    void simulateData(RNG& rng, int nPoses,
+                      std::vector<Mat> &R_gripper2base, std::vector<Mat> &t_gripper2base,
+                      std::vector<Mat> &R_target2cam, std::vector<Mat> &t_target2cam,
+                      bool noise, Mat& R_cam2gripper, Mat& t_cam2gripper);
+    Mat homogeneousInverse(const Mat& T);
+    std::string getMethodName(HandEyeCalibrationMethod method);
+    double sign_double(double val);
+
+    double eps_rvec[5];
+    double eps_tvec[5];
+    double eps_rvec_noise[5];
+    double eps_tvec_noise[5];
+};
+
+void CV_CalibrateHandEyeTest::run(int)
+{
+    ts->set_failed_test_info(cvtest::TS::OK);
+
+    RNG& rng = ts->get_rng();
+
+    std::vector<std::vector<double> > vec_rvec_diff(5);
+    std::vector<std::vector<double> > vec_tvec_diff(5);
+    std::vector<std::vector<double> > vec_rvec_diff_noise(5);
+    std::vector<std::vector<double> > vec_tvec_diff_noise(5);
+
+    std::vector<HandEyeCalibrationMethod> methods;
+    methods.push_back(CALIB_HAND_EYE_TSAI);
+    methods.push_back(CALIB_HAND_EYE_PARK);
+    methods.push_back(CALIB_HAND_EYE_HORAUD);
+    methods.push_back(CALIB_HAND_EYE_ANDREFF);
+    methods.push_back(CALIB_HAND_EYE_DANIILIDIS);
+
+    const int nTests = 100;
+    for (int i = 0; i < nTests; i++)
+    {
+        const int nPoses = 10;
+        {
+            //No noise
+            std::vector<Mat> R_gripper2base, t_gripper2base;
+            std::vector<Mat> R_target2cam, t_target2cam;
+            Mat R_cam2gripper_true, t_cam2gripper_true;
+
+            const bool noise = false;
+            simulateData(rng, nPoses, R_gripper2base, t_gripper2base, R_target2cam, t_target2cam, noise, R_cam2gripper_true, t_cam2gripper_true);
+
+            for (size_t idx = 0; idx < methods.size(); idx++)
+            {
+                Mat rvec_cam2gripper_true;
+                cv::Rodrigues(R_cam2gripper_true, rvec_cam2gripper_true);
+
+                Mat R_cam2gripper_est, t_cam2gripper_est;
+                calibrateHandEye(R_gripper2base, t_gripper2base, R_target2cam, t_target2cam, R_cam2gripper_est, t_cam2gripper_est, methods[idx]);
+
+                Mat rvec_cam2gripper_est;
+                cv::Rodrigues(R_cam2gripper_est, rvec_cam2gripper_est);
+
+                double rvecDiff = cvtest::norm(rvec_cam2gripper_true, rvec_cam2gripper_est, NORM_L2);
+                double tvecDiff = cvtest::norm(t_cam2gripper_true, t_cam2gripper_est, NORM_L2);
+
+                vec_rvec_diff[idx].push_back(rvecDiff);
+                vec_tvec_diff[idx].push_back(tvecDiff);
+
+                const double epsilon_rvec = eps_rvec[idx];
+                const double epsilon_tvec = eps_tvec[idx];
+
+                //Maybe a better accuracy test would be to compare the mean and std errors with some thresholds?
+                if (rvecDiff > epsilon_rvec || tvecDiff > epsilon_tvec)
+                {
+                    ts->printf(cvtest::TS::LOG, "Invalid accuracy (no noise) for method: %s, rvecDiff: %f, epsilon_rvec: %f, tvecDiff: %f, epsilon_tvec: %f\n",
+                               getMethodName(methods[idx]).c_str(), rvecDiff, epsilon_rvec, tvecDiff, epsilon_tvec);
+                    ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
+                }
+            }
+        }
+
+        {
+            //Gaussian noise on transformations between calibration target frame and camera frame and between gripper and robot base frames
+            std::vector<Mat> R_gripper2base, t_gripper2base;
+            std::vector<Mat> R_target2cam, t_target2cam;
+            Mat R_cam2gripper_true, t_cam2gripper_true;
+
+            const bool noise = true;
+            simulateData(rng, nPoses, R_gripper2base, t_gripper2base, R_target2cam, t_target2cam, noise, R_cam2gripper_true, t_cam2gripper_true);
+
+            for (size_t idx = 0; idx < methods.size(); idx++)
+            {
+                Mat rvec_cam2gripper_true;
+                cv::Rodrigues(R_cam2gripper_true, rvec_cam2gripper_true);
+
+                Mat R_cam2gripper_est, t_cam2gripper_est;
+                calibrateHandEye(R_gripper2base, t_gripper2base, R_target2cam, t_target2cam, R_cam2gripper_est, t_cam2gripper_est, methods[idx]);
+
+                Mat rvec_cam2gripper_est;
+                cv::Rodrigues(R_cam2gripper_est, rvec_cam2gripper_est);
+
+                double rvecDiff = cvtest::norm(rvec_cam2gripper_true, rvec_cam2gripper_est, NORM_L2);
+                double tvecDiff = cvtest::norm(t_cam2gripper_true, t_cam2gripper_est, NORM_L2);
+
+                vec_rvec_diff_noise[idx].push_back(rvecDiff);
+                vec_tvec_diff_noise[idx].push_back(tvecDiff);
+
+                const double epsilon_rvec = eps_rvec_noise[idx];
+                const double epsilon_tvec = eps_tvec_noise[idx];
+
+                //Maybe a better accuracy test would be to compare the mean and std errors with some thresholds?
+                if (rvecDiff > epsilon_rvec || tvecDiff > epsilon_tvec)
+                {
+                    ts->printf(cvtest::TS::LOG, "Invalid accuracy (noise) for method: %s, rvecDiff: %f, epsilon_rvec: %f, tvecDiff: %f, epsilon_tvec: %f\n",
+                               getMethodName(methods[idx]).c_str(), rvecDiff, epsilon_rvec, tvecDiff, epsilon_tvec);
+                    ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
+                }
+            }
+        }
+    }
+
+    for (size_t idx = 0; idx < methods.size(); idx++)
+    {
+        {
+            double max_rvec_diff = *std::max_element(vec_rvec_diff[idx].begin(), vec_rvec_diff[idx].end());
+            double mean_rvec_diff = std::accumulate(vec_rvec_diff[idx].begin(),
+                                                    vec_rvec_diff[idx].end(), 0.0) / vec_rvec_diff[idx].size();
+            double sq_sum_rvec_diff = std::inner_product(vec_rvec_diff[idx].begin(), vec_rvec_diff[idx].end(),
+                                                         vec_rvec_diff[idx].begin(), 0.0);
+            double std_rvec_diff = std::sqrt(sq_sum_rvec_diff / vec_rvec_diff[idx].size() - mean_rvec_diff * mean_rvec_diff);
+
+            double max_tvec_diff = *std::max_element(vec_tvec_diff[idx].begin(), vec_tvec_diff[idx].end());
+            double mean_tvec_diff = std::accumulate(vec_tvec_diff[idx].begin(),
+                                                    vec_tvec_diff[idx].end(), 0.0) / vec_tvec_diff[idx].size();
+            double sq_sum_tvec_diff = std::inner_product(vec_tvec_diff[idx].begin(), vec_tvec_diff[idx].end(),
+                                                         vec_tvec_diff[idx].begin(), 0.0);
+            double std_tvec_diff = std::sqrt(sq_sum_tvec_diff / vec_tvec_diff[idx].size() - mean_tvec_diff * mean_tvec_diff);
+
+            std::cout << "\nMethod " << getMethodName(methods[idx]) << ":\n"
+                      << "Max rvec error: " << max_rvec_diff << ", Mean rvec error: " << mean_rvec_diff
+                      << ", Std rvec error: " << std_rvec_diff << "\n"
+                      << "Max tvec error: " << max_tvec_diff << ", Mean tvec error: " << mean_tvec_diff
+                      << ", Std tvec error: " << std_tvec_diff << std::endl;
+        }
+        {
+            double max_rvec_diff = *std::max_element(vec_rvec_diff_noise[idx].begin(), vec_rvec_diff_noise[idx].end());
+            double mean_rvec_diff = std::accumulate(vec_rvec_diff_noise[idx].begin(),
+                                                    vec_rvec_diff_noise[idx].end(), 0.0) / vec_rvec_diff_noise[idx].size();
+            double sq_sum_rvec_diff = std::inner_product(vec_rvec_diff_noise[idx].begin(), vec_rvec_diff_noise[idx].end(),
+                                                         vec_rvec_diff_noise[idx].begin(), 0.0);
+            double std_rvec_diff = std::sqrt(sq_sum_rvec_diff / vec_rvec_diff_noise[idx].size() - mean_rvec_diff * mean_rvec_diff);
+
+            double max_tvec_diff = *std::max_element(vec_tvec_diff_noise[idx].begin(), vec_tvec_diff_noise[idx].end());
+            double mean_tvec_diff = std::accumulate(vec_tvec_diff_noise[idx].begin(),
+                                                    vec_tvec_diff_noise[idx].end(), 0.0) / vec_tvec_diff_noise[idx].size();
+            double sq_sum_tvec_diff = std::inner_product(vec_tvec_diff_noise[idx].begin(), vec_tvec_diff_noise[idx].end(),
+                                                         vec_tvec_diff_noise[idx].begin(), 0.0);
+            double std_tvec_diff = std::sqrt(sq_sum_tvec_diff / vec_tvec_diff_noise[idx].size() - mean_tvec_diff * mean_tvec_diff);
+
+            std::cout << "Method (noise) " << getMethodName(methods[idx]) << ":\n"
+                      << "Max rvec error: " << max_rvec_diff << ", Mean rvec error: " << mean_rvec_diff
+                      << ", Std rvec error: " << std_rvec_diff << "\n"
+                      << "Max tvec error: " << max_tvec_diff << ", Mean tvec error: " << mean_tvec_diff
+                      << ", Std tvec error: " << std_tvec_diff << std::endl;
+        }
+    }
+}
+
+void CV_CalibrateHandEyeTest::generatePose(RNG& rng, double min_theta, double max_theta,
+                                               double min_tx, double max_tx,
+                                               double min_ty, double max_ty,
+                                               double min_tz, double max_tz,
+                                               Mat& R, Mat& tvec,
+                                               bool random_sign)
+{
+    Mat axis(3, 1, CV_64FC1);
+    for (int i = 0; i < 3; i++)
+    {
+        axis.at<double>(i,0) = rng.uniform(-1.0, 1.0);
+    }
+    double theta = rng.uniform(min_theta, max_theta);
+    if (random_sign)
+    {
+        theta *= sign_double(rng.uniform(-1.0, 1.0));
+    }
+
+    Mat rvec(3, 1, CV_64FC1);
+    rvec.at<double>(0,0) = theta*axis.at<double>(0,0);
+    rvec.at<double>(1,0) = theta*axis.at<double>(1,0);
+    rvec.at<double>(2,0) = theta*axis.at<double>(2,0);
+
+    tvec.create(3, 1, CV_64FC1);
+    tvec.at<double>(0,0) = rng.uniform(min_tx, max_tx);
+    tvec.at<double>(1,0) = rng.uniform(min_ty, max_ty);
+    tvec.at<double>(2,0) = rng.uniform(min_tz, max_tz);
+
+    if (random_sign)
+    {
+        tvec.at<double>(0,0) *= sign_double(rng.uniform(-1.0, 1.0));
+        tvec.at<double>(1,0) *= sign_double(rng.uniform(-1.0, 1.0));
+        tvec.at<double>(2,0) *= sign_double(rng.uniform(-1.0, 1.0));
+    }
+
+    cv::Rodrigues(rvec, R);
+}
+
+void CV_CalibrateHandEyeTest::simulateData(RNG& rng, int nPoses,
+                                               std::vector<Mat> &R_gripper2base, std::vector<Mat> &t_gripper2base,
+                                               std::vector<Mat> &R_target2cam, std::vector<Mat> &t_target2cam,
+                                               bool noise, Mat& R_cam2gripper, Mat& t_cam2gripper)
+{
+    //to avoid generating values close to zero,
+    //we use positive range values and randomize the sign
+    const bool random_sign = true;
+    generatePose(rng, 10.0*CV_PI/180.0, 50.0*CV_PI/180.0,
+                 0.05, 0.5, 0.05, 0.5, 0.05, 0.5,
+                 R_cam2gripper, t_cam2gripper, random_sign);
+
+    Mat R_target2base, t_target2base;
+    generatePose(rng, 5.0*CV_PI/180.0, 85.0*CV_PI/180.0,
+                 0.5, 3.5, 0.5, 3.5, 0.5, 3.5,
+                 R_target2base, t_target2base, random_sign);
+
+    for (int i = 0; i < nPoses; i++)
+    {
+        Mat R_gripper2base_, t_gripper2base_;
+        generatePose(rng, 5.0*CV_PI/180.0, 45.0*CV_PI/180.0,
+                     0.5, 1.5, 0.5, 1.5, 0.5, 1.5,
+                     R_gripper2base_, t_gripper2base_, random_sign);
+
+        R_gripper2base.push_back(R_gripper2base_);
+        t_gripper2base.push_back(t_gripper2base_);
+
+        Mat T_cam2gripper = Mat::eye(4, 4, CV_64FC1);
+        R_cam2gripper.copyTo(T_cam2gripper(Rect(0, 0, 3, 3)));
+        t_cam2gripper.copyTo(T_cam2gripper(Rect(3, 0, 1, 3)));
+
+        Mat T_gripper2base = Mat::eye(4, 4, CV_64FC1);
+        R_gripper2base_.copyTo(T_gripper2base(Rect(0, 0, 3, 3)));
+        t_gripper2base_.copyTo(T_gripper2base(Rect(3, 0, 1, 3)));
+
+        Mat T_base2cam = homogeneousInverse(T_cam2gripper) * homogeneousInverse(T_gripper2base);
+        Mat T_target2base = Mat::eye(4, 4, CV_64FC1);
+        R_target2base.copyTo(T_target2base(Rect(0, 0, 3, 3)));
+        t_target2base.copyTo(T_target2base(Rect(3, 0, 1, 3)));
+        Mat T_target2cam = T_base2cam * T_target2base;
+
+        if (noise)
+        {
+            //Add some noise for the transformation between the target and the camera
+            Mat R_target2cam_noise = T_target2cam(Rect(0, 0, 3, 3));
+            Mat rvec_target2cam_noise;
+            cv::Rodrigues(R_target2cam_noise, rvec_target2cam_noise);
+            rvec_target2cam_noise.at<double>(0,0) += rng.gaussian(0.002);
+            rvec_target2cam_noise.at<double>(1,0) += rng.gaussian(0.002);
+            rvec_target2cam_noise.at<double>(2,0) += rng.gaussian(0.002);
+
+            cv::Rodrigues(rvec_target2cam_noise, R_target2cam_noise);
+
+            Mat t_target2cam_noise = T_target2cam(Rect(3, 0, 1, 3));
+            t_target2cam_noise.at<double>(0,0) += rng.gaussian(0.005);
+            t_target2cam_noise.at<double>(1,0) += rng.gaussian(0.005);
+            t_target2cam_noise.at<double>(2,0) += rng.gaussian(0.005);
+
+            //Add some noise for the transformation between the gripper and the robot base
+            Mat R_gripper2base_noise = T_gripper2base(Rect(0, 0, 3, 3));
+            Mat rvec_gripper2base_noise;
+            cv::Rodrigues(R_gripper2base_noise, rvec_gripper2base_noise);
+            rvec_gripper2base_noise.at<double>(0,0) += rng.gaussian(0.001);
+            rvec_gripper2base_noise.at<double>(1,0) += rng.gaussian(0.001);
+            rvec_gripper2base_noise.at<double>(2,0) += rng.gaussian(0.001);
+
+            cv::Rodrigues(rvec_gripper2base_noise, R_gripper2base_noise);
+
+            Mat t_gripper2base_noise = T_gripper2base(Rect(3, 0, 1, 3));
+            t_gripper2base_noise.at<double>(0,0) += rng.gaussian(0.001);
+            t_gripper2base_noise.at<double>(1,0) += rng.gaussian(0.001);
+            t_gripper2base_noise.at<double>(2,0) += rng.gaussian(0.001);
+        }
+
+        R_target2cam.push_back(T_target2cam(Rect(0, 0, 3, 3)));
+        t_target2cam.push_back(T_target2cam(Rect(3, 0, 1, 3)));
+    }
+}
+
+Mat CV_CalibrateHandEyeTest::homogeneousInverse(const Mat& T)
+{
+    CV_Assert( T.rows == 4 && T.cols == 4 );
+
+    Mat R = T(Rect(0, 0, 3, 3));
+    Mat t = T(Rect(3, 0, 1, 3));
+    Mat Rt = R.t();
+    Mat tinv = -Rt * t;
+    Mat Tinv = Mat::eye(4, 4, T.type());
+    Rt.copyTo(Tinv(Rect(0, 0, 3, 3)));
+    tinv.copyTo(Tinv(Rect(3, 0, 1, 3)));
+
+    return Tinv;
+}
+
+std::string CV_CalibrateHandEyeTest::getMethodName(HandEyeCalibrationMethod method)
+{
+    std::string method_name = "";
+    switch (method)
+    {
+    case CALIB_HAND_EYE_TSAI:
+        method_name = "Tsai";
+        break;
+
+    case CALIB_HAND_EYE_PARK:
+        method_name = "Park";
+        break;
+
+    case CALIB_HAND_EYE_HORAUD:
+        method_name = "Horaud";
+        break;
+
+    case CALIB_HAND_EYE_ANDREFF:
+        method_name = "Andreff";
+        break;
+
+    case CALIB_HAND_EYE_DANIILIDIS:
+        method_name = "Daniilidis";
+        break;
+
+    default:
+        break;
+    }
+
+    return method_name;
+}
+
+double CV_CalibrateHandEyeTest::sign_double(double val)
+{
+    return (0 < val) - (val < 0);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(Calib3d_CalibrateHandEye, regression) { CV_CalibrateHandEyeTest test; test.safe_run(); }
+
+}} // namespace
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 913c915270..a6725c82de 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1610,6 +1610,16 @@ inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
 }
 inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
+{
+    return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
+inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
+{
+    return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+
 ////////// Matrix operations /////////
 
 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 5cfaea7220..757c67b314 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1908,7 +1908,6 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_re
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
 {
     v_reg<float, n> c;
-    int j = 0;
     for (int i = 0; i < n/4; i++)
     {
         c.s[3*i  ] = vec.s[4*i  ];
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index f67479171d..e131909845 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1597,29 +1597,49 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
 }
 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
 {
-    short CV_DECL_ALIGNED(32) elems[8] =
+    schar CV_DECL_ALIGNED(32) elems[16] =
     {
-        *(short*)(tab+idx[0]),
-        *(short*)(tab+idx[1]),
-        *(short*)(tab+idx[2]),
-        *(short*)(tab+idx[3]),
-        *(short*)(tab+idx[4]),
-        *(short*)(tab+idx[5]),
-        *(short*)(tab+idx[6]),
-        *(short*)(tab+idx[7])
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
     };
-    return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems)));
+    return v_int8x16(vld1q_s8(elems));
 }
 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    schar CV_DECL_ALIGNED(32) elems[16] =
     {
-        *(int*)(tab + idx[0]),
-        *(int*)(tab + idx[1]),
-        *(int*)(tab + idx[2]),
-        *(int*)(tab + idx[3])
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
     };
-    return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems)));
+    return v_int8x16(vld1q_s8(elems));
 }
 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
@@ -1642,23 +1662,22 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
 }
 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    short CV_DECL_ALIGNED(32) elems[8] =
     {
-        *(int*)(tab + idx[0]),
-        *(int*)(tab + idx[1]),
-        *(int*)(tab + idx[2]),
-        *(int*)(tab + idx[3])
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1]
     };
-    return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems)));
+    return v_int16x8(vld1q_s16(elems));
 }
 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
 {
-    int64 CV_DECL_ALIGNED(32) elems[2] =
-    {
-        *(int64*)(tab + idx[0]),
-        *(int64*)(tab + idx[1])
-    };
-    return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems)));
+    return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
 }
 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
@@ -1677,12 +1696,7 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
 }
 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 {
-    int64 CV_DECL_ALIGNED(32) elems[2] =
-    {
-        *(int64*)(tab + idx[0]),
-        *(int64*)(tab + idx[1])
-    };
-    return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems)));
+    return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
@@ -1800,7 +1814,8 @@ inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
 {
-    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)))));
+    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
+    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
 }
 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
 
@@ -1824,6 +1839,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 }
 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
 #if CV_SIMD128_64F
 inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index dcfae9a3a8..a5adad04c5 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -2789,7 +2789,7 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
-    return v_int32x4(_mm_load_si128((const __m128i*)(tab + idx[0])));
+    return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
 }
 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
@@ -2801,7 +2801,7 @@ inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
 }
 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
 {
-    return v_int64x2(_mm_load_si128((const __m128i*)(tab + idx[0])));
+    return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
 }
 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
@@ -2817,7 +2817,7 @@ inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
     return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
 }
-inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_load_si128((const __m128i*)(tab + idx[0])))); }
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
 
 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
 {
@@ -2932,7 +2932,7 @@ inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
     return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
 #else
     __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
-    __m128i a = _mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8))));
+    __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
     return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
 #endif
 }
@@ -2948,6 +2948,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 }
 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
 ////////////// FP16 support ///////////////////////////
 
 inline v_float32x4 v_load_expand(const float16_t* ptr)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index ddda1d10d0..4e0c75ff93 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -1160,6 +1160,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 }
 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
 /////// FP16 support ////////
 
 // [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index e33269b4b8..703c27858b 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -3589,9 +3589,9 @@ CV_EXPORTS MatExpr operator * (const MatExpr& e, double s);
 CV_EXPORTS MatExpr operator * (double s, const MatExpr& e);
 CV_EXPORTS MatExpr operator * (const MatExpr& e1, const MatExpr& e2);
 template<typename _Tp, int m, int n> static inline
-MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a + Mat(b); }
+MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a * Mat(b); }
 template<typename _Tp, int m, int n> static inline
-MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) + b; }
+MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) * b; }
 
 CV_EXPORTS MatExpr operator / (const Mat& a, const Mat& b);
 CV_EXPORTS MatExpr operator / (const Mat& a, double s);
diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp
index b4e3f30562..da5b25625c 100644
--- a/modules/core/include/opencv2/core/vsx_utils.hpp
+++ b/modules/core/include/opencv2/core/vsx_utils.hpp
@@ -22,37 +22,37 @@
 
 typedef __vector unsigned char vec_uchar16;
 #define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
-#define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, c))
+#define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, (unsigned char)c))
 #define vec_uchar16_c(v)     ((vec_uchar16)(v))
 #define vec_uchar16_z        vec_uchar16_sp(0)
 
 typedef __vector signed char vec_char16;
 #define vec_char16_set(...) (vec_char16){__VA_ARGS__}
-#define vec_char16_sp(c)    (__VSX_S16__(vec_char16, c))
+#define vec_char16_sp(c)    (__VSX_S16__(vec_char16, (signed char)c))
 #define vec_char16_c(v)     ((vec_char16)(v))
 #define vec_char16_z        vec_char16_sp(0)
 
 typedef __vector unsigned short vec_ushort8;
 #define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
-#define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, c))
+#define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, (unsigned short)c))
 #define vec_ushort8_c(v)     ((vec_ushort8)(v))
 #define vec_ushort8_z        vec_ushort8_sp(0)
 
 typedef __vector signed short vec_short8;
 #define vec_short8_set(...) (vec_short8){__VA_ARGS__}
-#define vec_short8_sp(c)    (__VSX_S8__(vec_short8, c))
+#define vec_short8_sp(c)    (__VSX_S8__(vec_short8, (signed short)c))
 #define vec_short8_c(v)     ((vec_short8)(v))
 #define vec_short8_z        vec_short8_sp(0)
 
 typedef __vector unsigned int vec_uint4;
 #define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
-#define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, c))
+#define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, (unsigned int)c))
 #define vec_uint4_c(v)     ((vec_uint4)(v))
 #define vec_uint4_z        vec_uint4_sp(0)
 
 typedef __vector signed int vec_int4;
 #define vec_int4_set(...)  (vec_int4){__VA_ARGS__}
-#define vec_int4_sp(c)     (__VSX_S4__(vec_int4, c))
+#define vec_int4_sp(c)     (__VSX_S4__(vec_int4, (signed int)c))
 #define vec_int4_c(v)      ((vec_int4)(v))
 #define vec_int4_z         vec_int4_sp(0)
 
@@ -64,13 +64,13 @@ typedef __vector float vec_float4;
 
 typedef __vector unsigned long long vec_udword2;
 #define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
-#define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, c))
+#define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, (unsigned long long)c))
 #define vec_udword2_c(v)     ((vec_udword2)(v))
 #define vec_udword2_z        vec_udword2_sp(0)
 
 typedef __vector signed long long vec_dword2;
 #define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
-#define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, c))
+#define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, (signed long long)c))
 #define vec_dword2_c(v)     ((vec_dword2)(v))
 #define vec_dword2_z        vec_dword2_sp(0)
 
diff --git a/modules/core/perf/perf_io_base64.cpp b/modules/core/perf/perf_io_base64.cpp
index ebc9ee1463..e6f7a6807f 100644
--- a/modules/core/perf/perf_io_base64.cpp
+++ b/modules/core/perf/perf_io_base64.cpp
@@ -12,7 +12,7 @@ typedef TestBaseWithParam<Size_MatType_Str_t> Size_Mat_StrType;
 #define FILE_EXTENSION String(".xml"), String(".yml"), String(".json")
 
 
-PERF_TEST_P(Size_Mat_StrType, fs_text,
+PERF_TEST_P(Size_Mat_StrType, DISABLED_fs_text,
             testing::Combine(testing::Values(MAT_SIZES),
                              testing::Values(MAT_TYPES),
                              testing::Values(FILE_EXTENSION))
@@ -48,7 +48,7 @@ PERF_TEST_P(Size_Mat_StrType, fs_text,
     SANITY_CHECK_NOTHING();
 }
 
-PERF_TEST_P(Size_Mat_StrType, fs_base64,
+PERF_TEST_P(Size_Mat_StrType, DISABLED_fs_base64,
             testing::Combine(testing::Values(MAT_SIZES),
                              testing::Values(MAT_TYPES),
                              testing::Values(FILE_EXTENSION))
diff --git a/modules/core/src/opengl.cpp b/modules/core/src/opengl.cpp
index b04c7eafe9..b8decd508a 100644
--- a/modules/core/src/opengl.cpp
+++ b/modules/core/src/opengl.cpp
@@ -1726,7 +1726,7 @@ void convertToGLTexture2D(InputArray src, Texture2D& texture)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireGLObjects failed");
     size_t offset = 0; // TODO
     size_t dst_origin[3] = {0, 0, 0};
-    size_t region[3] = {u.cols, u.rows, 1};
+    size_t region[3] = { (size_t)u.cols, (size_t)u.rows, 1};
     status = clEnqueueCopyBufferToImage(q, clBuffer, clImage, offset, dst_origin, region, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyBufferToImage failed");
@@ -1786,7 +1786,7 @@ void convertFromGLTexture2D(const Texture2D& texture, OutputArray dst)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireGLObjects failed");
     size_t offset = 0; // TODO
     size_t src_origin[3] = {0, 0, 0};
-    size_t region[3] = {u.cols, u.rows, 1};
+    size_t region[3] = { (size_t)u.cols, (size_t)u.rows, 1};
     status = clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyImageToBuffer failed");
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 2d8fdf7d17..885ff0e43f 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -1991,6 +1991,22 @@ TEST(Core_Vectors, issue_13078_workaround)
     ASSERT_EQ(7, ints[3]);
 }
 
+TEST(Core_MatExpr, issue_13926)
+{
+    Mat M1 = (Mat_<double>(4,4,CV_64FC1) << 1, 2, 3, 4,
+                                           5, 6, 7, 8,
+                                           9, 10, 11, 12,
+                                           13, 14, 15, 16);
+
+    Matx44d M2(1, 2, 3, 4,
+               5, 6, 7, 8,
+               9, 10, 11, 12,
+               13, 14, 15, 16);
+
+    EXPECT_GE(1e-6, cvtest::norm(M1*M2, M1*M1, NORM_INF)) << Mat(M1*M2) << std::endl << Mat(M1*M1);
+    EXPECT_GE(1e-6, cvtest::norm(M2*M1, M2*M2, NORM_INF)) << Mat(M2*M1) << std::endl << Mat(M2*M2);
+}
+
 
 #ifdef HAVE_EIGEN
 TEST(Core_Eigen, eigen2cv_check_Mat_type)
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 2b253d067e..0e995bce2b 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -1826,7 +1826,7 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                 }
                 else
                 {
-                    CV_LOG_VERBOSE(NULL, "Kernel " << config->kernelName << " pass verification");
+                    CV_LOG_VERBOSE(NULL, 0, "Kernel " << config->kernelName << " pass verification");
                 }
             }
             catch (...)
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index f2af4230c5..6a7e800e03 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -103,7 +103,7 @@ CV_IMPL CvFont cvFontQt(const char* nameFont, int pointSize,CvScalar color,int w
     float       dx;//spacing letter in Qt (0 default) in pixel
     int         line_type;//<- pointSize in Qt
     */
-    CvFont f = {nameFont,color,style,NULL,NULL,NULL,0,0,0,weight,spacing,pointSize};
+    CvFont f = {nameFont,color,style,NULL,NULL,NULL,0,0,0,weight, (float)spacing, pointSize};
     return f;
 }
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 3ef7eb1ff6..a6e75cb20c 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1248,14 +1248,12 @@ protected:
 //! @addtogroup imgproc_feature
 //! @{
 
-/** @example samples/cpp/lsd_lines.cpp
-An example using the LineSegmentDetector
-\image html building_lsd.png "Sample output image" width=434 height=300
-*/
-
 /** @brief Line segment detector class
 
 following the algorithm described at @cite Rafael12 .
+
+@note Implementation has been removed due original code license conflict
+
 */
 class CV_EXPORTS_W LineSegmentDetector : public Algorithm
 {
@@ -1319,6 +1317,8 @@ to edit those, as to tailor it for their own application.
 is chosen.
 @param _density_th Minimal density of aligned region points in the enclosing rectangle.
 @param _n_bins Number of bins in pseudo-ordering of gradient modulus.
+
+@note Implementation has been removed due original code license conflict
  */
 CV_EXPORTS_W Ptr<LineSegmentDetector> createLineSegmentDetector(
     int _refine = LSD_REFINE_STD, double _scale = 0.8,
diff --git a/modules/imgproc/src/lsd.cpp b/modules/imgproc/src/lsd.cpp
index d06759c2bb..af022811d1 100644
--- a/modules/imgproc/src/lsd.cpp
+++ b/modules/imgproc/src/lsd.cpp
@@ -42,123 +42,11 @@
 #include "precomp.hpp"
 #include <vector>
 
-/////////////////////////////////////////////////////////////////////////////////////////
-// Default LSD parameters
-// SIGMA_SCALE 0.6    - Sigma for Gaussian filter is computed as sigma = sigma_scale/scale.
-// QUANT       2.0    - Bound to the quantization error on the gradient norm.
-// ANG_TH      22.5   - Gradient angle tolerance in degrees.
-// LOG_EPS     0.0    - Detection threshold: -log10(NFA) > log_eps
-// DENSITY_TH  0.7    - Minimal density of region points in rectangle.
-// N_BINS      1024   - Number of bins in pseudo-ordering of gradient modulus.
-
-#define M_3_2_PI    (3 * CV_PI) / 2   // 3/2 pi
-#define M_2__PI     (2 * CV_PI)         // 2 pi
-
-#ifndef M_LN10
-#define M_LN10      2.30258509299404568402
+#if defined(_MSC_VER)
+#   pragma warning(disable:4702)  // unreachable code
 #endif
 
-#define NOTDEF      double(-1024.0) // Label for pixels with undefined gradient.
-
-#define NOTUSED     0   // Label for pixels not used in yet.
-#define USED        1   // Label for pixels already used in detection.
-
-#define RELATIVE_ERROR_FACTOR 100.0
-
-const double DEG_TO_RADS = CV_PI / 180;
-
-#define log_gamma(x) ((x)>15.0?log_gamma_windschitl(x):log_gamma_lanczos(x))
-
-struct edge
-{
-    cv::Point p;
-    bool taken;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////
-
-inline double distSq(const double x1, const double y1,
-                     const double x2, const double y2)
-{
-    return (x2 - x1)*(x2 - x1) + (y2 - y1)*(y2 - y1);
-}
-
-inline double dist(const double x1, const double y1,
-                   const double x2, const double y2)
-{
-    return sqrt(distSq(x1, y1, x2, y2));
-}
-
-// Signed angle difference
-inline double angle_diff_signed(const double& a, const double& b)
-{
-    double diff = a - b;
-    while(diff <= -CV_PI) diff += M_2__PI;
-    while(diff >   CV_PI) diff -= M_2__PI;
-    return diff;
-}
-
-// Absolute value angle difference
-inline double angle_diff(const double& a, const double& b)
-{
-    return std::fabs(angle_diff_signed(a, b));
-}
-
-// Compare doubles by relative error.
-inline bool double_equal(const double& a, const double& b)
-{
-    // trivial case
-    if(a == b) return true;
-
-    double abs_diff = fabs(a - b);
-    double aa = fabs(a);
-    double bb = fabs(b);
-    double abs_max = (aa > bb)? aa : bb;
-
-    if(abs_max < DBL_MIN) abs_max = DBL_MIN;
-
-    return (abs_diff / abs_max) <= (RELATIVE_ERROR_FACTOR * DBL_EPSILON);
-}
-
-inline bool AsmallerB_XoverY(const edge& a, const edge& b)
-{
-    if (a.p.x == b.p.x) return a.p.y < b.p.y;
-    else return a.p.x < b.p.x;
-}
-
-/**
- *   Computes the natural logarithm of the absolute value of
- *   the gamma function of x using Windschitl method.
- *   See http://www.rskey.org/gamma.htm
- */
-inline double log_gamma_windschitl(const double& x)
-{
-    return 0.918938533204673 + (x-0.5)*log(x) - x
-         + 0.5*x*log(x*sinh(1/x) + 1/(810.0*pow(x, 6.0)));
-}
-
-/**
- *   Computes the natural logarithm of the absolute value of
- *   the gamma function of x using the Lanczos approximation.
- *   See http://www.rskey.org/gamma.htm
- */
-inline double log_gamma_lanczos(const double& x)
-{
-    static double q[7] = { 75122.6331530, 80916.6278952, 36308.2951477,
-                         8687.24529705, 1168.92649479, 83.8676043424,
-                         2.50662827511 };
-    double a = (x + 0.5) * log(x + 5.5) - (x + 5.5);
-    double b = 0;
-    for(int n = 0; n < 7; ++n)
-    {
-        a -= log(x + double(n));
-        b += q[n] * pow(x, double(n));
-    }
-    return a + log(b);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cv{
+namespace cv {
 
 class LineSegmentDetectorImpl CV_FINAL : public LineSegmentDetector
 {
@@ -229,164 +117,7 @@ public:
     int compareSegments(const Size& size, InputArray lines1, InputArray lines2, InputOutputArray _image = noArray()) CV_OVERRIDE;
 
 private:
-    Mat image;
-    Mat scaled_image;
-    Mat_<double> angles;     // in rads
-    Mat_<double> modgrad;
-    Mat_<uchar> used;
-
-    int img_width;
-    int img_height;
-    double LOG_NT;
-
-    bool w_needed;
-    bool p_needed;
-    bool n_needed;
-
-    const double SCALE;
-    const int doRefine;
-    const double SIGMA_SCALE;
-    const double QUANT;
-    const double ANG_TH;
-    const double LOG_EPS;
-    const double DENSITY_TH;
-    const int N_BINS;
-
-    struct RegionPoint {
-        int x;
-        int y;
-        uchar* used;
-        double angle;
-        double modgrad;
-    };
-
-    struct normPoint
-    {
-        Point2i p;
-        int norm;
-    };
-
-    std::vector<normPoint> ordered_points;
-
-    struct rect
-    {
-        double x1, y1, x2, y2;    // first and second point of the line segment
-        double width;             // rectangle width
-        double x, y;              // center of the rectangle
-        double theta;             // angle
-        double dx,dy;             // (dx,dy) is vector oriented as the line segment
-        double prec;              // tolerance angle
-        double p;                 // probability of a point with angle within 'prec'
-    };
-
     LineSegmentDetectorImpl& operator= (const LineSegmentDetectorImpl&); // to quiet MSVC
-
-/**
- * Detect lines in the whole input image.
- *
- * @param lines         Return: A vector of Vec4f elements specifying the beginning and ending point of a line.
- *                              Where Vec4f is (x1, y1, x2, y2), point 1 is the start, point 2 - end.
- *                              Returned lines are strictly oriented depending on the gradient.
- * @param widths        Return: Vector of widths of the regions, where the lines are found. E.g. Width of line.
- * @param precisions    Return: Vector of precisions with which the lines are found.
- * @param nfas          Return: Vector containing number of false alarms in the line region, with precision of 10%.
- *                              The bigger the value, logarithmically better the detection.
- *                                  * -1 corresponds to 10 mean false alarms
- *                                  * 0 corresponds to 1 mean false alarm
- *                                  * 1 corresponds to 0.1 mean false alarms
- */
-    void flsd(std::vector<Vec4f>& lines,
-              std::vector<double>& widths, std::vector<double>& precisions,
-              std::vector<double>& nfas);
-
-/**
- * Finds the angles and the gradients of the image. Generates a list of pseudo ordered points.
- *
- * @param threshold      The minimum value of the angle that is considered defined, otherwise NOTDEF
- * @param n_bins         The number of bins with which gradients are ordered by, using bucket sort.
- * @param ordered_points Return: Vector of coordinate points that are pseudo ordered by magnitude.
- *                       Pixels would be ordered by norm value, up to a precision given by max_grad/n_bins.
- */
-    void ll_angle(const double& threshold, const unsigned int& n_bins);
-
-/**
- * Grow a region starting from point s with a defined precision,
- * returning the containing points size and the angle of the gradients.
- *
- * @param s         Starting point for the region.
- * @param reg       Return: Vector of points, that are part of the region
- * @param reg_angle Return: The mean angle of the region.
- * @param prec      The precision by which each region angle should be aligned to the mean.
- */
-    void region_grow(const Point2i& s, std::vector<RegionPoint>& reg,
-                     double& reg_angle, const double& prec);
-
-/**
- * Finds the bounding rotated rectangle of a region.
- *
- * @param reg       The region of points, from which the rectangle to be constructed from.
- * @param reg_angle The mean angle of the region.
- * @param prec      The precision by which points were found.
- * @param p         Probability of a point with angle within 'prec'.
- * @param rec       Return: The generated rectangle.
- */
-    void region2rect(const std::vector<RegionPoint>& reg, const double reg_angle,
-                     const double prec, const double p, rect& rec) const;
-
-/**
- * Compute region's angle as the principal inertia axis of the region.
- * @return          Regions angle.
- */
-    double get_theta(const std::vector<RegionPoint>& reg, const double& x,
-                     const double& y, const double& reg_angle, const double& prec) const;
-
-/**
- * An estimation of the angle tolerance is performed by the standard deviation of the angle at points
- * near the region's starting point. Then, a new region is grown starting from the same point, but using the
- * estimated angle tolerance. If this fails to produce a rectangle with the right density of region points,
- * 'reduce_region_radius' is called to try to satisfy this condition.
- */
-    bool refine(std::vector<RegionPoint>& reg, double reg_angle,
-                const double prec, double p, rect& rec, const double& density_th);
-
-/**
- * Reduce the region size, by elimination the points far from the starting point, until that leads to
- * rectangle with the right density of region points or to discard the region if too small.
- */
-    bool reduce_region_radius(std::vector<RegionPoint>& reg, double reg_angle,
-                const double prec, double p, rect& rec, double density, const double& density_th);
-
-/**
- * Try some rectangles variations to improve NFA value. Only if the rectangle is not meaningful (i.e., log_nfa <= log_eps).
- * @return      The new NFA value.
- */
-    double rect_improve(rect& rec) const;
-
-/**
- * Calculates the number of correctly aligned points within the rectangle.
- * @return      The new NFA value.
- */
-    double rect_nfa(const rect& rec) const;
-
-/**
- * Computes the NFA values based on the total number of points, points that agree.
- * n, k, p are the binomial parameters.
- * @return      The new NFA value.
- */
-    double nfa(const int& n, const int& k, const double& p) const;
-
-/**
- * Is the point at place 'address' aligned to angle theta, up to precision 'prec'?
- * @return      Whether the point is aligned.
- */
-    bool isAligned(int x, int y, const double& theta, const double& prec) const;
-
-public:
-    // Compare norm
-    static inline bool compare_norm( const normPoint& n1, const normPoint& n2 )
-    {
-        return (n1.norm > n2.norm);
-    }
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -404,13 +135,12 @@ CV_EXPORTS Ptr<LineSegmentDetector> createLineSegmentDetector(
 
 LineSegmentDetectorImpl::LineSegmentDetectorImpl(int _refine, double _scale, double _sigma_scale, double _quant,
         double _ang_th, double _log_eps, double _density_th, int _n_bins)
-        : img_width(0), img_height(0), LOG_NT(0), w_needed(false), p_needed(false), n_needed(false),
-          SCALE(_scale), doRefine(_refine), SIGMA_SCALE(_sigma_scale), QUANT(_quant),
-          ANG_TH(_ang_th), LOG_EPS(_log_eps), DENSITY_TH(_density_th), N_BINS(_n_bins)
 {
     CV_Assert(_scale > 0 && _sigma_scale > 0 && _quant >= 0 &&
               _ang_th > 0 && _ang_th < 180 && _density_th >= 0 && _density_th < 1 &&
               _n_bins > 0);
+    CV_UNUSED(_refine); CV_UNUSED(_log_eps);
+    CV_Error(Error::StsNotImplemented, "Implementation has been removed due original code license issues");
 }
 
 void LineSegmentDetectorImpl::detect(InputArray _image, OutputArray _lines,
@@ -418,708 +148,11 @@ void LineSegmentDetectorImpl::detect(InputArray _image, OutputArray _lines,
 {
     CV_INSTRUMENT_REGION();
 
-    image = _image.getMat();
-    CV_Assert(!image.empty() && image.type() == CV_8UC1);
-
-    std::vector<Vec4f> lines;
-    std::vector<double> w, p, n;
-    w_needed = _width.needed();
-    p_needed = _prec.needed();
-    if (doRefine < LSD_REFINE_ADV)
-        n_needed = false;
-    else
-        n_needed = _nfa.needed();
-
-    flsd(lines, w, p, n);
-
-    Mat(lines).copyTo(_lines);
-    if(w_needed) Mat(w).copyTo(_width);
-    if(p_needed) Mat(p).copyTo(_prec);
-    if(n_needed) Mat(n).copyTo(_nfa);
-
-    // Clear used structures
-    ordered_points.clear();
+    CV_UNUSED(_image); CV_UNUSED(_lines);
+    CV_UNUSED(_width); CV_UNUSED(_prec); CV_UNUSED(_nfa);
+    CV_Error(Error::StsNotImplemented, "Implementation has been removed due original code license issues");
 }
 
-void LineSegmentDetectorImpl::flsd(std::vector<Vec4f>& lines,
-    std::vector<double>& widths, std::vector<double>& precisions,
-    std::vector<double>& nfas)
-{
-    // Angle tolerance
-    const double prec = CV_PI * ANG_TH / 180;
-    const double p = ANG_TH / 180;
-    const double rho = QUANT / sin(prec);    // gradient magnitude threshold
-
-    if(SCALE != 1)
-    {
-        Mat gaussian_img;
-        const double sigma = (SCALE < 1)?(SIGMA_SCALE / SCALE):(SIGMA_SCALE);
-        const double sprec = 3;
-        const unsigned int h =  (unsigned int)(ceil(sigma * sqrt(2 * sprec * log(10.0))));
-        Size ksize(1 + 2 * h, 1 + 2 * h); // kernel size
-        GaussianBlur(image, gaussian_img, ksize, sigma);
-        // Scale image to needed size
-        resize(gaussian_img, scaled_image, Size(), SCALE, SCALE, INTER_LINEAR_EXACT);
-        ll_angle(rho, N_BINS);
-    }
-    else
-    {
-        scaled_image = image;
-        ll_angle(rho, N_BINS);
-    }
-
-    LOG_NT = 5 * (log10(double(img_width)) + log10(double(img_height))) / 2 + log10(11.0);
-    const size_t min_reg_size = size_t(-LOG_NT/log10(p)); // minimal number of points in region that can give a meaningful event
-
-    // // Initialize region only when needed
-    // Mat region = Mat::zeros(scaled_image.size(), CV_8UC1);
-    used = Mat_<uchar>::zeros(scaled_image.size()); // zeros = NOTUSED
-    std::vector<RegionPoint> reg;
-
-    // Search for line segments
-    for(size_t i = 0, points_size = ordered_points.size(); i < points_size; ++i)
-    {
-        const Point2i& point = ordered_points[i].p;
-        if((used.at<uchar>(point) == NOTUSED) && (angles.at<double>(point) != NOTDEF))
-        {
-            double reg_angle;
-            region_grow(ordered_points[i].p, reg, reg_angle, prec);
-
-            // Ignore small regions
-            if(reg.size() < min_reg_size) { continue; }
-
-            // Construct rectangular approximation for the region
-            rect rec;
-            region2rect(reg, reg_angle, prec, p, rec);
-
-            double log_nfa = -1;
-            if(doRefine > LSD_REFINE_NONE)
-            {
-                // At least REFINE_STANDARD lvl.
-                if(!refine(reg, reg_angle, prec, p, rec, DENSITY_TH)) { continue; }
-
-                if(doRefine >= LSD_REFINE_ADV)
-                {
-                    // Compute NFA
-                    log_nfa = rect_improve(rec);
-                    if(log_nfa <= LOG_EPS) { continue; }
-                }
-            }
-            // Found new line
-
-            // Add the offset
-            rec.x1 += 0.5; rec.y1 += 0.5;
-            rec.x2 += 0.5; rec.y2 += 0.5;
-
-            // scale the result values if a sub-sampling was performed
-            if(SCALE != 1)
-            {
-                rec.x1 /= SCALE; rec.y1 /= SCALE;
-                rec.x2 /= SCALE; rec.y2 /= SCALE;
-                rec.width /= SCALE;
-            }
-
-            //Store the relevant data
-            lines.push_back(Vec4f(float(rec.x1), float(rec.y1), float(rec.x2), float(rec.y2)));
-            if(w_needed) widths.push_back(rec.width);
-            if(p_needed) precisions.push_back(rec.p);
-            if(n_needed && doRefine >= LSD_REFINE_ADV) nfas.push_back(log_nfa);
-        }
-    }
-}
-
-void LineSegmentDetectorImpl::ll_angle(const double& threshold,
-                                   const unsigned int& n_bins)
-{
-    //Initialize data
-    angles = Mat_<double>(scaled_image.size());
-    modgrad = Mat_<double>(scaled_image.size());
-
-    img_width = scaled_image.cols;
-    img_height = scaled_image.rows;
-
-    // Undefined the down and right boundaries
-    angles.row(img_height - 1).setTo(NOTDEF);
-    angles.col(img_width - 1).setTo(NOTDEF);
-
-    // Computing gradient for remaining pixels
-    double max_grad = -1;
-    for(int y = 0; y < img_height - 1; ++y)
-    {
-        const uchar* scaled_image_row = scaled_image.ptr<uchar>(y);
-        const uchar* next_scaled_image_row = scaled_image.ptr<uchar>(y+1);
-        double* angles_row = angles.ptr<double>(y);
-        double* modgrad_row = modgrad.ptr<double>(y);
-        for(int x = 0; x < img_width-1; ++x)
-        {
-            int DA = next_scaled_image_row[x + 1] - scaled_image_row[x];
-            int BC = scaled_image_row[x + 1] - next_scaled_image_row[x];
-            int gx = DA + BC;    // gradient x component
-            int gy = DA - BC;    // gradient y component
-            double norm = std::sqrt((gx * gx + gy * gy) / 4.0); // gradient norm
-
-            modgrad_row[x] = norm;    // store gradient
-
-            if (norm <= threshold)  // norm too small, gradient no defined
-            {
-                angles_row[x] = NOTDEF;
-            }
-            else
-            {
-                angles_row[x] = fastAtan2(float(gx), float(-gy)) * DEG_TO_RADS;  // gradient angle computation
-                if (norm > max_grad) { max_grad = norm; }
-            }
-
-        }
-    }
-
-    // Compute histogram of gradient values
-    double bin_coef = (max_grad > 0) ? double(n_bins - 1) / max_grad : 0; // If all image is smooth, max_grad <= 0
-    for(int y = 0; y < img_height - 1; ++y)
-    {
-        const double* modgrad_row = modgrad.ptr<double>(y);
-        for(int x = 0; x < img_width - 1; ++x)
-        {
-            normPoint _point;
-            int i = int(modgrad_row[x] * bin_coef);
-            _point.p = Point(x, y);
-            _point.norm = i;
-            ordered_points.push_back(_point);
-        }
-    }
-
-    // Sort
-    std::sort(ordered_points.begin(), ordered_points.end(), compare_norm);
-}
-
-void LineSegmentDetectorImpl::region_grow(const Point2i& s, std::vector<RegionPoint>& reg,
-                                      double& reg_angle, const double& prec)
-{
-    reg.clear();
-
-    // Point to this region
-    RegionPoint seed;
-    seed.x = s.x;
-    seed.y = s.y;
-    seed.used = &used.at<uchar>(s);
-    reg_angle = angles.at<double>(s);
-    seed.angle = reg_angle;
-    seed.modgrad = modgrad.at<double>(s);
-    reg.push_back(seed);
-
-    float sumdx = float(std::cos(reg_angle));
-    float sumdy = float(std::sin(reg_angle));
-    *seed.used = USED;
-
-    //Try neighboring regions
-    for (size_t i = 0;i<reg.size();i++)
-    {
-        const RegionPoint& rpoint = reg[i];
-        int xx_min = std::max(rpoint.x - 1, 0), xx_max = std::min(rpoint.x + 1, img_width - 1);
-        int yy_min = std::max(rpoint.y - 1, 0), yy_max = std::min(rpoint.y + 1, img_height - 1);
-        for(int yy = yy_min; yy <= yy_max; ++yy)
-        {
-            uchar* used_row = used.ptr<uchar>(yy);
-            const double* angles_row = angles.ptr<double>(yy);
-            const double* modgrad_row = modgrad.ptr<double>(yy);
-            for(int xx = xx_min; xx <= xx_max; ++xx)
-            {
-                uchar& is_used = used_row[xx];
-                if(is_used != USED &&
-                   (isAligned(xx, yy, reg_angle, prec)))
-                {
-                    const double& angle = angles_row[xx];
-                    // Add point
-                    is_used = USED;
-                    RegionPoint region_point;
-                    region_point.x = xx;
-                    region_point.y = yy;
-                    region_point.used = &is_used;
-                    region_point.modgrad = modgrad_row[xx];
-                    region_point.angle = angle;
-                    reg.push_back(region_point);
-
-                    // Update region's angle
-                    sumdx += cos(float(angle));
-                    sumdy += sin(float(angle));
-                    // reg_angle is used in the isAligned, so it needs to be updates?
-                    reg_angle = fastAtan2(sumdy, sumdx) * DEG_TO_RADS;
-                }
-            }
-        }
-    }
-}
-
-void LineSegmentDetectorImpl::region2rect(const std::vector<RegionPoint>& reg,
-                                      const double reg_angle, const double prec, const double p, rect& rec) const
-{
-    double x = 0, y = 0, sum = 0;
-    for(size_t i = 0; i < reg.size(); ++i)
-    {
-        const RegionPoint& pnt = reg[i];
-        const double& weight = pnt.modgrad;
-        x += double(pnt.x) * weight;
-        y += double(pnt.y) * weight;
-        sum += weight;
-    }
-
-    // Weighted sum must differ from 0
-    CV_Assert(sum > 0);
-
-    x /= sum;
-    y /= sum;
-
-    double theta = get_theta(reg, x, y, reg_angle, prec);
-
-    // Find length and width
-    double dx = cos(theta);
-    double dy = sin(theta);
-    double l_min = 0, l_max = 0, w_min = 0, w_max = 0;
-
-    for(size_t i = 0; i < reg.size(); ++i)
-    {
-        double regdx = double(reg[i].x) - x;
-        double regdy = double(reg[i].y) - y;
-
-        double l = regdx * dx + regdy * dy;
-        double w = -regdx * dy + regdy * dx;
-
-        if(l > l_max) l_max = l;
-        else if(l < l_min) l_min = l;
-        if(w > w_max) w_max = w;
-        else if(w < w_min) w_min = w;
-    }
-
-    // Store values
-    rec.x1 = x + l_min * dx;
-    rec.y1 = y + l_min * dy;
-    rec.x2 = x + l_max * dx;
-    rec.y2 = y + l_max * dy;
-    rec.width = w_max - w_min;
-    rec.x = x;
-    rec.y = y;
-    rec.theta = theta;
-    rec.dx = dx;
-    rec.dy = dy;
-    rec.prec = prec;
-    rec.p = p;
-
-    // Min width of 1 pixel
-    if(rec.width < 1.0) rec.width = 1.0;
-}
-
-double LineSegmentDetectorImpl::get_theta(const std::vector<RegionPoint>& reg, const double& x,
-                                      const double& y, const double& reg_angle, const double& prec) const
-{
-    double Ixx = 0.0;
-    double Iyy = 0.0;
-    double Ixy = 0.0;
-
-    // Compute inertia matrix
-    for(size_t i = 0; i < reg.size(); ++i)
-    {
-        const double& regx = reg[i].x;
-        const double& regy = reg[i].y;
-        const double& weight = reg[i].modgrad;
-        double dx = regx - x;
-        double dy = regy - y;
-        Ixx += dy * dy * weight;
-        Iyy += dx * dx * weight;
-        Ixy -= dx * dy * weight;
-    }
-
-    // Check if inertia matrix is null
-    CV_Assert(!(double_equal(Ixx, 0) && double_equal(Iyy, 0) && double_equal(Ixy, 0)));
-
-    // Compute smallest eigenvalue
-    double lambda = 0.5 * (Ixx + Iyy - sqrt((Ixx - Iyy) * (Ixx - Iyy) + 4.0 * Ixy * Ixy));
-
-    // Compute angle
-    double theta = (fabs(Ixx)>fabs(Iyy))?
-                    double(fastAtan2(float(lambda - Ixx), float(Ixy))):
-                    double(fastAtan2(float(Ixy), float(lambda - Iyy))); // in degs
-    theta *= DEG_TO_RADS;
-
-    // Correct angle by 180 deg if necessary
-    if(angle_diff(theta, reg_angle) > prec) { theta += CV_PI; }
-
-    return theta;
-}
-
-bool LineSegmentDetectorImpl::refine(std::vector<RegionPoint>& reg, double reg_angle,
-                                 const double prec, double p, rect& rec, const double& density_th)
-{
-    double density = double(reg.size()) / (dist(rec.x1, rec.y1, rec.x2, rec.y2) * rec.width);
-
-    if (density >= density_th) { return true; }
-
-    // Try to reduce angle tolerance
-    double xc = double(reg[0].x);
-    double yc = double(reg[0].y);
-    const double& ang_c = reg[0].angle;
-    double sum = 0, s_sum = 0;
-    int n = 0;
-
-    for (size_t i = 0; i < reg.size(); ++i)
-    {
-        *(reg[i].used) = NOTUSED;
-        if (dist(xc, yc, reg[i].x, reg[i].y) < rec.width)
-        {
-            const double& angle = reg[i].angle;
-            double ang_d = angle_diff_signed(angle, ang_c);
-            sum += ang_d;
-            s_sum += ang_d * ang_d;
-            ++n;
-        }
-    }
-    CV_Assert(n > 0);
-    double mean_angle = sum / double(n);
-    // 2 * standard deviation
-    double tau = 2.0 * sqrt((s_sum - 2.0 * mean_angle * sum) / double(n) + mean_angle * mean_angle);
-
-    // Try new region
-    region_grow(Point(reg[0].x, reg[0].y), reg, reg_angle, tau);
-
-    if (reg.size() < 2) { return false; }
-
-    region2rect(reg, reg_angle, prec, p, rec);
-    density = double(reg.size()) / (dist(rec.x1, rec.y1, rec.x2, rec.y2) * rec.width);
-
-    if (density < density_th)
-    {
-        return reduce_region_radius(reg, reg_angle, prec, p, rec, density, density_th);
-    }
-    else
-    {
-        return true;
-    }
-}
-
-bool LineSegmentDetectorImpl::reduce_region_radius(std::vector<RegionPoint>& reg, double reg_angle,
-                const double prec, double p, rect& rec, double density, const double& density_th)
-{
-    // Compute region's radius
-    double xc = double(reg[0].x);
-    double yc = double(reg[0].y);
-    double radSq1 = distSq(xc, yc, rec.x1, rec.y1);
-    double radSq2 = distSq(xc, yc, rec.x2, rec.y2);
-    double radSq = radSq1 > radSq2 ? radSq1 : radSq2;
-
-    while(density < density_th)
-    {
-        radSq *= 0.75*0.75; // Reduce region's radius to 75% of its value
-        // Remove points from the region and update 'used' map
-        for (size_t i = 0; i < reg.size(); ++i)
-        {
-            if(distSq(xc, yc, double(reg[i].x), double(reg[i].y)) > radSq)
-            {
-                // Remove point from the region
-                *(reg[i].used) = NOTUSED;
-                std::swap(reg[i], reg[reg.size() - 1]);
-                reg.pop_back();
-                --i; // To avoid skipping one point
-            }
-        }
-
-        if(reg.size() < 2) { return false; }
-
-        // Re-compute rectangle
-        region2rect(reg ,reg_angle, prec, p, rec);
-
-        // Re-compute region points density
-        density = double(reg.size()) /
-                  (dist(rec.x1, rec.y1, rec.x2, rec.y2) * rec.width);
-    }
-
-    return true;
-}
-
-double LineSegmentDetectorImpl::rect_improve(rect& rec) const
-{
-    double delta = 0.5;
-    double delta_2 = delta / 2.0;
-
-    double log_nfa = rect_nfa(rec);
-
-    if(log_nfa > LOG_EPS) return log_nfa; // Good rectangle
-
-    // Try to improve
-    // Finer precision
-    rect r = rect(rec); // Copy
-    for(int n = 0; n < 5; ++n)
-    {
-        r.p /= 2;
-        r.prec = r.p * CV_PI;
-        double log_nfa_new = rect_nfa(r);
-        if(log_nfa_new > log_nfa)
-        {
-            log_nfa = log_nfa_new;
-            rec = rect(r);
-        }
-    }
-    if(log_nfa > LOG_EPS) return log_nfa;
-
-    // Try to reduce width
-    r = rect(rec);
-    for(unsigned int n = 0; n < 5; ++n)
-    {
-        if((r.width - delta) >= 0.5)
-        {
-            r.width -= delta;
-            double log_nfa_new = rect_nfa(r);
-            if(log_nfa_new > log_nfa)
-            {
-                rec = rect(r);
-                log_nfa = log_nfa_new;
-            }
-        }
-    }
-    if(log_nfa > LOG_EPS) return log_nfa;
-
-    // Try to reduce one side of rectangle
-    r = rect(rec);
-    for(unsigned int n = 0; n < 5; ++n)
-    {
-        if((r.width - delta) >= 0.5)
-        {
-            r.x1 += -r.dy * delta_2;
-            r.y1 +=  r.dx * delta_2;
-            r.x2 += -r.dy * delta_2;
-            r.y2 +=  r.dx * delta_2;
-            r.width -= delta;
-            double log_nfa_new = rect_nfa(r);
-            if(log_nfa_new > log_nfa)
-            {
-                rec = rect(r);
-                log_nfa = log_nfa_new;
-            }
-        }
-    }
-    if(log_nfa > LOG_EPS) return log_nfa;
-
-    // Try to reduce other side of rectangle
-    r = rect(rec);
-    for(unsigned int n = 0; n < 5; ++n)
-    {
-        if((r.width - delta) >= 0.5)
-        {
-            r.x1 -= -r.dy * delta_2;
-            r.y1 -=  r.dx * delta_2;
-            r.x2 -= -r.dy * delta_2;
-            r.y2 -=  r.dx * delta_2;
-            r.width -= delta;
-            double log_nfa_new = rect_nfa(r);
-            if(log_nfa_new > log_nfa)
-            {
-                rec = rect(r);
-                log_nfa = log_nfa_new;
-            }
-        }
-    }
-    if(log_nfa > LOG_EPS) return log_nfa;
-
-    // Try finer precision
-    r = rect(rec);
-    for(unsigned int n = 0; n < 5; ++n)
-    {
-        if((r.width - delta) >= 0.5)
-        {
-            r.p /= 2;
-            r.prec = r.p * CV_PI;
-            double log_nfa_new = rect_nfa(r);
-            if(log_nfa_new > log_nfa)
-            {
-                rec = rect(r);
-                log_nfa = log_nfa_new;
-            }
-        }
-    }
-
-    return log_nfa;
-}
-
-double LineSegmentDetectorImpl::rect_nfa(const rect& rec) const
-{
-    int total_pts = 0, alg_pts = 0;
-    double half_width = rec.width / 2.0;
-    double dyhw = rec.dy * half_width;
-    double dxhw = rec.dx * half_width;
-
-    edge ordered_x[4];
-    edge* min_y = &ordered_x[0];
-    edge* max_y = &ordered_x[0]; // Will be used for loop range
-
-    ordered_x[0].p.x = int(rec.x1 - dyhw); ordered_x[0].p.y = int(rec.y1 + dxhw); ordered_x[0].taken = false;
-    ordered_x[1].p.x = int(rec.x2 - dyhw); ordered_x[1].p.y = int(rec.y2 + dxhw); ordered_x[1].taken = false;
-    ordered_x[2].p.x = int(rec.x2 + dyhw); ordered_x[2].p.y = int(rec.y2 - dxhw); ordered_x[2].taken = false;
-    ordered_x[3].p.x = int(rec.x1 + dyhw); ordered_x[3].p.y = int(rec.y1 - dxhw); ordered_x[3].taken = false;
-
-    std::sort(ordered_x, ordered_x + 4, AsmallerB_XoverY);
-
-    // Find min y. And mark as taken. find max y.
-    for(unsigned int i = 1; i < 4; ++i)
-    {
-        if(min_y->p.y > ordered_x[i].p.y) {min_y = &ordered_x[i]; }
-        if(max_y->p.y < ordered_x[i].p.y) {max_y = &ordered_x[i]; }
-    }
-    min_y->taken = true;
-
-    // Find leftmost untaken point;
-    edge* leftmost = 0;
-    for(unsigned int i = 0; i < 4; ++i)
-    {
-        if(!ordered_x[i].taken)
-        {
-            if(!leftmost) // if uninitialized
-            {
-                leftmost = &ordered_x[i];
-            }
-            else if (leftmost->p.x > ordered_x[i].p.x)
-            {
-                leftmost = &ordered_x[i];
-            }
-        }
-    }
-    CV_Assert(leftmost != NULL);
-    leftmost->taken = true;
-
-    // Find rightmost untaken point;
-    edge* rightmost = 0;
-    for(unsigned int i = 0; i < 4; ++i)
-    {
-        if(!ordered_x[i].taken)
-        {
-            if(!rightmost) // if uninitialized
-            {
-                rightmost = &ordered_x[i];
-            }
-            else if (rightmost->p.x < ordered_x[i].p.x)
-            {
-                rightmost = &ordered_x[i];
-            }
-        }
-    }
-    CV_Assert(rightmost != NULL);
-    rightmost->taken = true;
-
-    // Find last untaken point;
-    edge* tailp = 0;
-    for(unsigned int i = 0; i < 4; ++i)
-    {
-        if(!ordered_x[i].taken)
-        {
-            if(!tailp) // if uninitialized
-            {
-                tailp = &ordered_x[i];
-            }
-            else if (tailp->p.x > ordered_x[i].p.x)
-            {
-                tailp = &ordered_x[i];
-            }
-        }
-    }
-    CV_Assert(tailp != NULL);
-    tailp->taken = true;
-
-    double flstep = (min_y->p.y != leftmost->p.y) ?
-                    (min_y->p.x - leftmost->p.x) / (min_y->p.y - leftmost->p.y) : 0; //first left step
-    double slstep = (leftmost->p.y != tailp->p.x) ?
-                    (leftmost->p.x - tailp->p.x) / (leftmost->p.y - tailp->p.x) : 0; //second left step
-
-    double frstep = (min_y->p.y != rightmost->p.y) ?
-                    (min_y->p.x - rightmost->p.x) / (min_y->p.y - rightmost->p.y) : 0; //first right step
-    double srstep = (rightmost->p.y != tailp->p.x) ?
-                    (rightmost->p.x - tailp->p.x) / (rightmost->p.y - tailp->p.x) : 0; //second right step
-
-    double lstep = flstep, rstep = frstep;
-
-    double left_x = min_y->p.x, right_x = min_y->p.x;
-
-    // Loop around all points in the region and count those that are aligned.
-    int min_iter = min_y->p.y;
-    int max_iter = max_y->p.y;
-    for(int y = min_iter; y <= max_iter; ++y)
-    {
-        if (y < 0 || y >= img_height) continue;
-
-        for(int x = int(left_x); x <= int(right_x); ++x)
-        {
-            if (x < 0 || x >= img_width) continue;
-
-            ++total_pts;
-            if(isAligned(x, y, rec.theta, rec.prec))
-            {
-                ++alg_pts;
-            }
-        }
-
-        if(y >= leftmost->p.y) { lstep = slstep; }
-        if(y >= rightmost->p.y) { rstep = srstep; }
-
-        left_x += lstep;
-        right_x += rstep;
-    }
-
-    return nfa(total_pts, alg_pts, rec.p);
-}
-
-double LineSegmentDetectorImpl::nfa(const int& n, const int& k, const double& p) const
-{
-    // Trivial cases
-    if(n == 0 || k == 0) { return -LOG_NT; }
-    if(n == k) { return -LOG_NT - double(n) * log10(p); }
-
-    double p_term = p / (1 - p);
-
-    double log1term = (double(n) + 1) - log_gamma(double(k) + 1)
-                - log_gamma(double(n-k) + 1)
-                + double(k) * log(p) + double(n-k) * log(1.0 - p);
-    double term = exp(log1term);
-
-    if(double_equal(term, 0))
-    {
-        if(k > n * p) return -log1term / M_LN10 - LOG_NT;
-        else return -LOG_NT;
-    }
-
-    // Compute more terms if needed
-    double bin_tail = term;
-    double tolerance = 0.1; // an error of 10% in the result is accepted
-    for(int i = k + 1; i <= n; ++i)
-    {
-        double bin_term = double(n - i + 1) / double(i);
-        double mult_term = bin_term * p_term;
-        term *= mult_term;
-        bin_tail += term;
-        if(bin_term < 1)
-        {
-            double err = term * ((1 - pow(mult_term, double(n-i+1))) / (1 - mult_term) - 1);
-            if(err < tolerance * fabs(-log10(bin_tail) - LOG_NT) * bin_tail) break;
-        }
-
-    }
-    return -log10(bin_tail) - LOG_NT;
-}
-
-inline bool LineSegmentDetectorImpl::isAligned(int x, int y, const double& theta, const double& prec) const
-{
-    if(x < 0 || y < 0 || x >= angles.cols || y >= angles.rows) { return false; }
-    const double& a = angles.at<double>(y, x);
-    if(a == NOTDEF) { return false; }
-
-    // It is assumed that 'theta' and 'a' are in the range [-pi,pi]
-    double n_theta = theta - a;
-    if(n_theta < 0) { n_theta = -n_theta; }
-    if(n_theta > M_3_2_PI)
-    {
-        n_theta -= M_2__PI;
-        if(n_theta < 0) n_theta = -n_theta;
-    }
-
-    return n_theta <= prec;
-}
-
-
 void LineSegmentDetectorImpl::drawSegments(InputOutputArray _image, InputArray lines)
 {
     CV_INSTRUMENT_REGION();
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index 6aa9d0279f..ecee8b57ce 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -64,333 +64,662 @@ template<typename T, int shift> struct FltCast
     rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
 };
 
-template<typename T1, typename T2> struct PyrDownNoVec
+template<typename T1, typename T2, int cn> int PyrDownVecH(const T1*, T2*, int)
 {
-    int operator()(T1**, T2*, int, int) const { return 0; }
-};
+    //   row[x       ] = src[x * 2 + 2*cn  ] * 6 + (src[x * 2 +   cn  ] + src[x * 2 + 3*cn  ]) * 4 + src[x * 2       ] + src[x * 2 + 4*cn  ];
+    //   row[x +    1] = src[x * 2 + 2*cn+1] * 6 + (src[x * 2 +   cn+1] + src[x * 2 + 3*cn+1]) * 4 + src[x * 2 +    1] + src[x * 2 + 4*cn+1];
+    //   ....
+    //   row[x + cn-1] = src[x * 2 + 3*cn-1] * 6 + (src[x * 2 + 2*cn-1] + src[x * 2 + 4*cn-1]) * 4 + src[x * 2 + cn-1] + src[x * 2 + 5*cn-1];
+    return 0;
+}
 
-template<typename T1, typename T2> struct PyrUpNoVec
+template<typename T1, typename T2, int cn> int PyrUpVecH(const T1*, T2*, int)
 {
-    int operator()(T1**, T2**, int, int) const { return 0; }
-};
+    return 0;
+}
+
+template<typename T1, typename T2> int PyrDownVecV(T1**, T2*, int) { return 0; }
+
+template<typename T1, typename T2> int PyrUpVecV(T1**, T2**, int) { return 0; }
 
 #if CV_SIMD
 
-struct PyrDownVec_32s8u
+template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 {
-    int operator()(int** src, uchar* dst, int, int width) const
-    {
-        int x = 0;
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+    int x = 0;
+    const uchar *src0 = src, *src2 = src + 2, *src4 = src + 3;
 
-        for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes )
-        {
-            v_uint16 r0, r1, r2, r3, r4, t0, t1;
-            r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
-            r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
-            r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
-            r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
-            r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
-            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
-            r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes)));
-            r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes)));
-            r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes)));
-            r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes)));
-            r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes)));
-            t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
-            v_store(dst + x, v_rshr_pack<8>(t0, t1));
-        }
-        if (x <= width - v_int16::nlanes)
-        {
-            v_uint16 r0, r1, r2, r3, r4, t0;
-            r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
-            r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
-            r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
-            r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
-            r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
-            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
-            v_rshr_pack_store<8>(dst + x, t0);
-            x += v_uint16::nlanes;
-        }
-        typedef int CV_DECL_ALIGNED(1) unaligned_int;
-        for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
-        {
-            v_int32x4 r0, r1, r2, r3, r4, t0;
-            r0 = v_load(row0 + x);
-            r1 = v_load(row1 + x);
-            r2 = v_load(row2 + x);
-            r3 = v_load(row3 + x);
-            r4 = v_load(row4 + x);
-            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src0)), v_1_4) +
+                     v_dotprod(v_reinterpret_as_s16(vx_load_expand(src2)), v_6_4) +
+                     (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16));
+    vx_cleanup();
 
-            *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
-        }
-
-        return x;
-    }
-};
-
-struct PyrDownVec_32f
+    return x;
+}
+template<> int PyrDownVecH<uchar, int, 2>(const uchar* src, int* row, int width)
 {
-    int operator()(float** src, float* dst, int, int width) const
-    {
-        int x = 0;
-        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+    int x = 0;
+    const uchar *src0 = src, *src2 = src + 4, *src4 = src + 6;
 
-        v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256);
-        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
-        {
-            v_float32 r0, r1, r2, r3, r4;
-            r0 = vx_load(row0 + x);
-            r1 = vx_load(row1 + x);
-            r2 = vx_load(row2 + x);
-            r3 = vx_load(row3 + x);
-            r4 = vx_load(row4 + x);
-            v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale);
-        }
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) +
+                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) +
+                     (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16));
+    vx_cleanup();
 
-        return x;
-    }
-};
-
-#if CV_SSE4_1 || CV_NEON || CV_VSX
-
-struct PyrDownVec_32s16u
+    return x;
+}
+template<> int PyrDownVecH<uchar, int, 3>(const uchar* src, int* row, int width)
 {
-    int operator()(int** src, ushort* dst, int, int width) const
+    int idx[v_int8::nlanes/2 + 4];
+    for (int i = 0; i < v_int8::nlanes/4 + 2; i++)
     {
-        int x = 0;
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-
-        for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
-        {
-            v_int32 r00 = vx_load(row0 + x),
-                    r01 = vx_load(row0 + x + v_int32::nlanes),
-                    r10 = vx_load(row1 + x),
-                    r11 = vx_load(row1 + x + v_int32::nlanes),
-                    r20 = vx_load(row2 + x),
-                    r21 = vx_load(row2 + x + v_int32::nlanes),
-                    r30 = vx_load(row3 + x),
-                    r31 = vx_load(row3 + x + v_int32::nlanes),
-                    r40 = vx_load(row4 + x),
-                    r41 = vx_load(row4 + x + v_int32::nlanes);
-            v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
-                                              r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
-        }
-        if (x <= width - v_int32::nlanes)
-        {
-            v_int32 r00 = vx_load(row0 + x),
-                    r10 = vx_load(row1 + x),
-                    r20 = vx_load(row2 + x),
-                    r30 = vx_load(row3 + x),
-                    r40 = vx_load(row4 + x);
-            v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
-            x += v_int32::nlanes;
-        }
-
-        return x;
+        idx[i] = 6*i;
+        idx[i + v_int8::nlanes/4 + 2] = 6*i + 3;
     }
-};
 
-#else
+    int x = 0;
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int8::nlanes; x += 3*v_int8::nlanes/4, src += 6*v_int8::nlanes/4, row += 3*v_int8::nlanes/4)
+    {
+        v_uint16 r0l, r0h, r1l, r1h, r2l, r2h, r3l, r3h, r4l, r4h;
+        v_expand(vx_lut_quads(src, idx                       ), r0l, r0h);
+        v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 2), r1l, r1h);
+        v_expand(vx_lut_quads(src, idx + 1                   ), r2l, r2h);
+        v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 3), r3l, r3h);
+        v_expand(vx_lut_quads(src, idx + 2                   ), r4l, r4h);
 
-typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
+        v_zip(r2l, r1l + r3l, r1l, r3l);
+        v_zip(r2h, r1h + r3h, r1h, r3h);
+        r0l += r4l; r0h += r4h;
 
+        v_store(row                      , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0l))));
+        v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0l))));
+        v_store(row + 6*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0h))));
+        v_store(row + 9*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0h))));
+    }
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<uchar, int, 4>(const uchar* src, int* row, int width)
+{
+    int x = 0;
+    const uchar *src0 = src, *src2 = src + 8, *src4 = src + 12;
+
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src0))), v_1_4) +
+                     v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src2))), v_6_4) +
+                     (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16));
+    vx_cleanup();
+
+    return x;
+}
+
+template<> int PyrDownVecH<short, int, 1>(const short* src, int* row, int width)
+{
+    int x = 0;
+    const short *src0 = src, *src2 = src + 2, *src4 = src + 3;
+
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(vx_load(src0), v_1_4) +
+                     v_dotprod(vx_load(src2), v_6_4) +
+                     (v_reinterpret_as_s32(vx_load(src4)) >> 16));
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<short, int, 2>(const short* src, int* row, int width)
+{
+    int x = 0;
+    const short *src0 = src, *src2 = src + 4, *src4 = src + 6;
+
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(vx_load(src0)), v_1_4) +
+                     v_dotprod(v_interleave_pairs(vx_load(src2)), v_6_4) +
+                     (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16));
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<short, int, 3>(const short* src, int* row, int width)
+{
+    int idx[v_int16::nlanes/2 + 4];
+    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    {
+        idx[i] = 6*i;
+        idx[i + v_int16::nlanes/4 + 2] = 6*i + 3;
+    }
+
+    int x = 0;
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4)
+    {
+        v_int16 r0, r1, r2, r3, r4;
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        r4 = vx_lut_quads(src, idx + 2);
+        v_store(row, v_pack_triplets(v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4)));
+        v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4)));
+    }
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<short, int, 4>(const short* src, int* row, int width)
+{
+    int idx[v_int16::nlanes/2 + 4];
+    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    {
+        idx[i] = 8*i;
+        idx[i + v_int16::nlanes/4 + 2] = 8*i + 4;
+    }
+
+    int x = 0;
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes)
+    {
+        v_int16 r0, r1, r2, r3, r4;
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        r4 = vx_lut_quads(src, idx + 2);
+        v_store(row, v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4));
+        v_store(row + v_int32::nlanes, v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4));
+    }
+    vx_cleanup();
+
+    return x;
+}
+
+template<> int PyrDownVecH<ushort, int, 1>(const ushort* src, int* row, int width)
+{
+    int x = 0;
+    const ushort *src0 = src, *src2 = src + 2, *src4 = src + 3;
+
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    v_uint16 v_half = vx_setall_u16(0x8000);
+    v_int32 v_half15 = vx_setall_s32(0x00078000);
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half)), v_1_4) +
+                     v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half)), v_6_4) +
+                     v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15);
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<ushort, int, 2>(const ushort* src, int* row, int width)
+{
+    int x = 0;
+    const ushort *src0 = src, *src2 = src + 4, *src4 = src + 6;
+
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    v_uint16 v_half = vx_setall_u16(0x8000);
+    v_int32 v_half15 = vx_setall_s32(0x00078000);
+    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src0 += v_int16::nlanes, src2 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
+        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src0), v_half))), v_1_4) +
+                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src2), v_half))), v_6_4) +
+                     v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15);
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<ushort, int, 3>(const ushort* src, int* row, int width)
+{
+    int idx[v_int16::nlanes/2 + 4];
+    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    {
+        idx[i] = 6*i;
+        idx[i + v_int16::nlanes/4 + 2] = 6*i + 3;
+    }
+
+    int x = 0;
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    v_uint16 v_half = vx_setall_u16(0x8000);
+    v_int32 v_half15 = vx_setall_s32(0x00078000);
+    for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4)
+    {
+        v_uint16 r0, r1, r2, r3, r4;
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        r4 = vx_lut_quads(src, idx + 2);
+        v_store(row                      , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) +
+                                                           v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) +
+                                                           v_reinterpret_as_s32(v_expand_low(r4)) + v_half15));
+        v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) +
+                                                           v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) +
+                                                           v_reinterpret_as_s32(v_expand_high(r4)) + v_half15));
+    }
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<ushort, int, 4>(const ushort* src, int* row, int width)
+{
+    int idx[v_int16::nlanes/2 + 4];
+    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    {
+        idx[i] = 8*i;
+        idx[i + v_int16::nlanes/4 + 2] = 8*i + 4;
+    }
+
+    int x = 0;
+    v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
+    v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
+    v_uint16 v_half = vx_setall_u16(0x8000);
+    v_int32 v_half15 = vx_setall_s32(0x00078000);
+    for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes)
+    {
+        v_uint16 r0, r1, r2, r3, r4;
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        r4 = vx_lut_quads(src, idx + 2);
+        v_store(row                  , v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) +
+                                       v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) +
+                                       v_reinterpret_as_s32(v_expand_low(r4)) + v_half15);
+        v_store(row + v_int32::nlanes, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) +
+                                       v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) +
+                                       v_reinterpret_as_s32(v_expand_high(r4)) + v_half15);
+    }
+    vx_cleanup();
+
+    return x;
+}
+
+template<> int PyrDownVecH<float, float, 1>(const float* src, float* row, int width)
+{
+    int x = 0;
+    const float *src0 = src, *src2 = src + 2, *src4 = src + 4;
+
+    v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
+    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src0 += 2*v_float32::nlanes, src2 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes)
+    {
+        v_float32 r0, r1, r2, r3, r4, rtmp;
+        v_load_deinterleave(src0, r0, r1);
+        v_load_deinterleave(src2, r2, r3);
+        v_load_deinterleave(src4, r4, rtmp);
+        v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
+    }
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<float, float, 2>(const float* src, float* row, int width)
+{
+    int x = 0;
+    const float *src0 = src, *src2 = src + 4, *src4 = src + 6;
+
+    v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
+    for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src0 += 4*v_float32::nlanes, src2 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes)
+    {
+        v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb;
+        v_load_deinterleave(src0, r0a, r0b, r1a, r1b);
+        v_load_deinterleave(src2, r2a, r2b, r3a, r3b);
+        v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b);
+        v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b)));
+    }
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<float, float, 3>(const float* src, float* row, int width)
+{
+    int idx[v_float32::nlanes/2 + 4];
+    for (int i = 0; i < v_float32::nlanes/4 + 2; i++)
+    {
+        idx[i] = 6*i;
+        idx[i + v_float32::nlanes/4 + 2] = 6*i + 3;
+    }
+
+    int x = 0;
+    v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
+    for (; x <= width - v_float32::nlanes; x += 3*v_float32::nlanes/4, src += 6*v_float32::nlanes/4, row += 3*v_float32::nlanes/4)
+    {
+        v_float32 r0 = vx_lut_quads(src, idx);
+        v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2);
+        v_float32 r2 = vx_lut_quads(src, idx + 1);
+        v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3);
+        v_float32 r4 = vx_lut_quads(src, idx + 2);
+        v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))));
+    }
+    vx_cleanup();
+
+    return x;
+}
+template<> int PyrDownVecH<float, float, 4>(const float* src, float* row, int width)
+{
+    int idx[v_float32::nlanes/2 + 4];
+    for (int i = 0; i < v_float32::nlanes/4 + 2; i++)
+    {
+        idx[i] = 8*i;
+        idx[i + v_float32::nlanes/4 + 2] = 8*i + 4;
+    }
+
+    int x = 0;
+    v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
+    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src += 2*v_float32::nlanes, row += v_float32::nlanes)
+    {
+        v_float32 r0 = vx_lut_quads(src, idx);
+        v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2);
+        v_float32 r2 = vx_lut_quads(src, idx + 1);
+        v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3);
+        v_float32 r4 = vx_lut_quads(src, idx + 2);
+        v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
+    }
+    vx_cleanup();
+
+    return x;
+}
+
+#if CV_SIMD_64F
+template<> int PyrDownVecH<double, double, 1>(const double* src, double* row, int width)
+{
+    int x = 0;
+    const double *src0 = src, *src2 = src + 2, *src4 = src + 4;
+
+    v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f);
+    for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src0 += 2*v_float64::nlanes, src2 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes)
+    {
+        v_float64 r0, r1, r2, r3, r4, rtmp;
+        v_load_deinterleave(src0, r0, r1);
+        v_load_deinterleave(src2, r2, r3);
+        v_load_deinterleave(src4, r4, rtmp);
+        v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
+    }
+    vx_cleanup();
+
+    return x;
+}
 #endif
 
-struct PyrDownVec_32s16s
+template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
 {
-    int operator()(int** src, short* dst, int, int width) const
-    {
-        int x = 0;
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+    int x = 0;
+    const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
 
-        for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
-        {
-            v_int32 r00 = vx_load(row0 + x),
-                    r01 = vx_load(row0 + x + v_int32::nlanes),
-                    r10 = vx_load(row1 + x),
-                    r11 = vx_load(row1 + x + v_int32::nlanes),
-                    r20 = vx_load(row2 + x),
-                    r21 = vx_load(row2 + x + v_int32::nlanes),
-                    r30 = vx_load(row3 + x),
-                    r31 = vx_load(row3 + x + v_int32::nlanes),
-                    r40 = vx_load(row4 + x),
-                    r41 = vx_load(row4 + x + v_int32::nlanes);
-            v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
+    for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes )
+    {
+        v_uint16 r0, r1, r2, r3, r4, t0, t1;
+        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
+        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
+        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
+        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
+        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
+        t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes)));
+        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes)));
+        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes)));
+        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes)));
+        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes)));
+        t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        v_store(dst + x, v_rshr_pack<8>(t0, t1));
+    }
+    if (x <= width - v_int16::nlanes)
+    {
+        v_uint16 r0, r1, r2, r3, r4, t0;
+        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
+        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
+        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
+        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
+        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
+        t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        v_rshr_pack_store<8>(dst + x, t0);
+        x += v_uint16::nlanes;
+    }
+    typedef int CV_DECL_ALIGNED(1) unaligned_int;
+    for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    {
+        v_int32x4 r0, r1, r2, r3, r4, t0;
+        r0 = v_load(row0 + x);
+        r1 = v_load(row1 + x);
+        r2 = v_load(row2 + x);
+        r3 = v_load(row3 + x);
+        r4 = v_load(row4 + x);
+        t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+
+        *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+    }
+    vx_cleanup();
+
+    return x;
+}
+
+template <>
+int PyrDownVecV<float, float>(float** src, float* dst, int width)
+{
+    int x = 0;
+    const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+
+    v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256);
+    for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+    {
+        v_float32 r0, r1, r2, r3, r4;
+        r0 = vx_load(row0 + x);
+        r1 = vx_load(row1 + x);
+        r2 = vx_load(row2 + x);
+        r3 = vx_load(row3 + x);
+        r4 = vx_load(row4 + x);
+        v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale);
+    }
+    vx_cleanup();
+
+    return x;
+}
+
+template <> int PyrDownVecV<int, ushort>(int** src, ushort* dst, int width)
+{
+    int x = 0;
+    const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+
+    for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
+    {
+        v_int32 r00 = vx_load(row0 + x),
+                r01 = vx_load(row0 + x + v_int32::nlanes),
+                r10 = vx_load(row1 + x),
+                r11 = vx_load(row1 + x + v_int32::nlanes),
+                r20 = vx_load(row2 + x),
+                r21 = vx_load(row2 + x + v_int32::nlanes),
+                r30 = vx_load(row3 + x),
+                r31 = vx_load(row3 + x + v_int32::nlanes),
+                r40 = vx_load(row4 + x),
+                r41 = vx_load(row4 + x + v_int32::nlanes);
+        v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
                                             r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
-        }
-        if (x <= width - v_int32::nlanes)
-        {
-            v_int32 r00 = vx_load(row0 + x),
+    }
+    if (x <= width - v_int32::nlanes)
+    {
+        v_int32 r00 = vx_load(row0 + x),
                 r10 = vx_load(row1 + x),
                 r20 = vx_load(row2 + x),
                 r30 = vx_load(row3 + x),
                 r40 = vx_load(row4 + x);
-            v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
-            x += v_int32::nlanes;
-        }
-
-        return x;
+        v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
+        x += v_int32::nlanes;
     }
-};
+    vx_cleanup();
 
-struct PyrUpVec_32s8u
+    return x;
+}
+
+template <> int PyrDownVecV<int, short>(int** src, short* dst, int width)
 {
-    int operator()(int** src, uchar** dst, int, int width) const
+    int x = 0;
+    const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+
+    for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
     {
-        int x = 0;
-        uchar *dst0 = dst[0], *dst1 = dst[1];
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
-
-        for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
-        {
-            v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
-                    v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)),
-                    v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
-                    v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)),
-                    v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)),
-                    v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes));
-            v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11);
-            v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11)));
-            v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
-        }
-        if(x <= width - v_uint16::nlanes)
-        {
-            v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
-                    v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
-                    v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes));
-            v_int16 v_2r10 = v_r10 + v_r10;
-            v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10));
-            v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
-            x += v_uint16::nlanes;
-        }
-        for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
-        {
-            v_int32 v_r00 = vx_load(row0 + x),
-                    v_r10 = vx_load(row1 + x),
-                    v_r20 = vx_load(row2 + x);
-            v_int32 v_2r10 = v_r10 + v_r10;
-            v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2);
-            *(int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
-            *(int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
-        }
-
-        return x;
+        v_int32 r00 = vx_load(row0 + x),
+                r01 = vx_load(row0 + x + v_int32::nlanes),
+                r10 = vx_load(row1 + x),
+                r11 = vx_load(row1 + x + v_int32::nlanes),
+                r20 = vx_load(row2 + x),
+                r21 = vx_load(row2 + x + v_int32::nlanes),
+                r30 = vx_load(row3 + x),
+                r31 = vx_load(row3 + x + v_int32::nlanes),
+                r40 = vx_load(row4 + x),
+                r41 = vx_load(row4 + x + v_int32::nlanes);
+        v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
+                                        r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
     }
-};
+    if (x <= width - v_int32::nlanes)
+    {
+        v_int32 r00 = vx_load(row0 + x),
+            r10 = vx_load(row1 + x),
+            r20 = vx_load(row2 + x),
+            r30 = vx_load(row3 + x),
+            r40 = vx_load(row4 + x);
+        v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
+        x += v_int32::nlanes;
+    }
+    vx_cleanup();
 
-struct PyrUpVec_32s16s
+    return x;
+}
+
+template <> int PyrUpVecV<int, uchar>(int** src, uchar** dst, int width)
 {
-    int operator()(int** src, short** dst, int, int width) const
+    int x = 0;
+    uchar *dst0 = dst[0], *dst1 = dst[1];
+    const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
+
+    for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
     {
-        int x = 0;
-        short *dst0 = dst[0], *dst1 = dst[1];
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
-
-        for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
-        {
-            v_int32 v_r00 = vx_load(row0 + x),
-                    v_r01 = vx_load(row0 + x + v_int32::nlanes),
-                    v_r10 = vx_load(row1 + x),
-                    v_r11 = vx_load(row1 + x + v_int32::nlanes),
-                    v_r20 = vx_load(row2 + x),
-                    v_r21 = vx_load(row2 + x + v_int32::nlanes);
-            v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
-            v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
-        }
-        if(x <= width - v_int32::nlanes)
-        {
-            v_int32 v_r00 = vx_load(row0 + x),
-                    v_r10 = vx_load(row1 + x),
-                    v_r20 = vx_load(row2 + x);
-            v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
-            v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
-            x += v_int32::nlanes;
-        }
-
-        return x;
+        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
+                v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)),
+                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
+                v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)),
+                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)),
+                v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes));
+        v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11);
+        v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11)));
+        v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
     }
-};
+    if(x <= width - v_uint16::nlanes)
+    {
+        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
+                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
+                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes));
+        v_int16 v_2r10 = v_r10 + v_r10;
+        v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10));
+        v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
+        x += v_uint16::nlanes;
+    }
+    typedef int CV_DECL_ALIGNED(1) unaligned_int;
+    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    {
+        v_int32 v_r00 = vx_load(row0 + x),
+                v_r10 = vx_load(row1 + x),
+                v_r20 = vx_load(row2 + x);
+        v_int32 v_2r10 = v_r10 + v_r10;
+        v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2);
+        *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
+        *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
+    }
+    vx_cleanup();
 
-#if CV_SSE4_1 || CV_NEON || CV_VSX
+    return x;
+}
 
-struct PyrUpVec_32s16u
+template <> int PyrUpVecV<int, short>(int** src, short** dst, int width)
 {
-    int operator()(int** src, ushort** dst, int, int width) const
+    int x = 0;
+    short *dst0 = dst[0], *dst1 = dst[1];
+    const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
+
+    for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
     {
-        int x = 0;
-        ushort *dst0 = dst[0], *dst1 = dst[1];
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
-
-        for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
-        {
-            v_int32 v_r00 = vx_load(row0 + x),
-                    v_r01 = vx_load(row0 + x + v_int32::nlanes),
-                    v_r10 = vx_load(row1 + x),
-                    v_r11 = vx_load(row1 + x + v_int32::nlanes),
-                    v_r20 = vx_load(row2 + x),
-                    v_r21 = vx_load(row2 + x + v_int32::nlanes);
-            v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
-            v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
-        }
-        if(x <= width - v_int32::nlanes)
-        {
-            v_int32 v_r00 = vx_load(row0 + x),
-                    v_r10 = vx_load(row1 + x),
-                    v_r20 = vx_load(row2 + x);
-            v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
-            v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
-            x += v_int32::nlanes;
-        }
-
-        return x;
+        v_int32 v_r00 = vx_load(row0 + x),
+                v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                v_r10 = vx_load(row1 + x),
+                v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                v_r20 = vx_load(row2 + x),
+                v_r21 = vx_load(row2 + x + v_int32::nlanes);
+        v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
+        v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
     }
-};
+    if(x <= width - v_int32::nlanes)
+    {
+        v_int32 v_r00 = vx_load(row0 + x),
+                v_r10 = vx_load(row1 + x),
+                v_r20 = vx_load(row2 + x);
+        v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
+        v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
+        x += v_int32::nlanes;
+    }
+    vx_cleanup();
 
-#else
+    return x;
+}
 
-typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
-
-#endif // CV_SSE4_1
-
-struct PyrUpVec_32f
+template <> int PyrUpVecV<int, ushort>(int** src, ushort** dst, int width)
 {
-    int operator()(float** src, float** dst, int, int width) const
+    int x = 0;
+    ushort *dst0 = dst[0], *dst1 = dst[1];
+    const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
+
+    for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
     {
-        int x = 0;
-        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
-        float *dst0 = dst[0], *dst1 = dst[1];
-
-        v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f);
-        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
-        {
-            v_float32 v_r0 = vx_load(row0 + x),
-                      v_r1 = vx_load(row1 + x),
-                      v_r2 = vx_load(row2 + x);
-            v_store(dst1 + x, v_scale4 * (v_r1 + v_r2));
-            v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2));
-        }
-
-        return x;
+        v_int32 v_r00 = vx_load(row0 + x),
+                v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                v_r10 = vx_load(row1 + x),
+                v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                v_r20 = vx_load(row2 + x),
+                v_r21 = vx_load(row2 + x + v_int32::nlanes);
+        v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
+        v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
     }
-};
+    if(x <= width - v_int32::nlanes)
+    {
+        v_int32 v_r00 = vx_load(row0 + x),
+                v_r10 = vx_load(row1 + x),
+                v_r20 = vx_load(row2 + x);
+        v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
+        v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
+        x += v_int32::nlanes;
+    }
+    vx_cleanup();
 
-#else
+    return x;
+}
 
-typedef PyrDownNoVec<int, uchar> PyrDownVec_32s8u;
-typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
-typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
-typedef PyrDownNoVec<float, float> PyrDownVec_32f;
+template <> int PyrUpVecV<float, float>(float** src, float** dst, int width)
+{
+    int x = 0;
+    const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
+    float *dst0 = dst[0], *dst1 = dst[1];
 
-typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
-typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
-typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
-typedef PyrUpNoVec<float, float> PyrUpVec_32f;
+    v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f);
+    for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+    {
+        v_float32 v_r0 = vx_load(row0 + x),
+                  v_r1 = vx_load(row1 + x),
+                  v_r2 = vx_load(row2 + x);
+        v_store(dst1 + x, v_scale4 * (v_r1 + v_r2));
+        v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2));
+    }
+    vx_cleanup();
+
+    return x;
+}
 
 #endif
 
-template<class CastOp, class VecOp> void
+template<class CastOp> void
 pyrDown_( const Mat& _src, Mat& _dst, int borderType )
 {
     const int PD_SZ = 5;
@@ -408,7 +737,6 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType )
     int* tabM = _tabM.data();
     WT* rows[PD_SZ];
     CastOp castOp;
-    VecOp vecOp;
 
     CV_Assert( ssize.width > 0 && ssize.height > 0 &&
                std::abs(dsize.width*2 - ssize.width) <= 2 &&
@@ -460,12 +788,25 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType )
 
                 if( cn == 1 )
                 {
+                    x += PyrDownVecH<T, WT, 1>(src + x * 2 - 2, row + x, width0 - x);
                     for( ; x < width0; x++ )
                         row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
                             src[x*2 - 2] + src[x*2 + 2];
                 }
+                else if( cn == 2 )
+                {
+                    x += PyrDownVecH<T, WT, 2>(src + x * 2 - 4, row + x, width0 - x);
+                    for( ; x < width0; x += 2 )
+                    {
+                        const T* s = src + x*2;
+                        WT t0 = s[0] * 6 + (s[-2] + s[2]) * 4 + s[-4] + s[4];
+                        WT t1 = s[1] * 6 + (s[-1] + s[3]) * 4 + s[-3] + s[5];
+                        row[x] = t0; row[x + 1] = t1;
+                    }
+                }
                 else if( cn == 3 )
                 {
+                    x += PyrDownVecH<T, WT, 3>(src + x * 2 - 6, row + x, width0 - x);
                     for( ; x < width0; x += 3 )
                     {
                         const T* s = src + x*2;
@@ -477,6 +818,7 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType )
                 }
                 else if( cn == 4 )
                 {
+                    x += PyrDownVecH<T, WT, 4>(src + x * 2 - 8, row + x, width0 - x);
                     for( ; x < width0; x += 4 )
                     {
                         const T* s = src + x*2;
@@ -508,14 +850,14 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType )
             rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
         row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];
 
-        x = vecOp(rows, dst, (int)_dst.step, dsize.width);
+        x = PyrDownVecV<WT, T>(rows, dst, dsize.width);
         for( ; x < dsize.width; x++ )
             dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
     }
 }
 
 
-template<class CastOp, class VecOp> void
+template<class CastOp> void
 pyrUp_( const Mat& _src, Mat& _dst, int)
 {
     const int PU_SZ = 3;
@@ -532,7 +874,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int)
     WT* rows[PU_SZ];
     T* dsts[2];
     CastOp castOp;
-    VecOp vecOp;
+    //PyrUpVecH<T, WT> vecOpH;
 
     CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
                std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
@@ -598,7 +940,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int)
         row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
         dsts[0] = dst0; dsts[1] = dst1;
 
-        x = vecOp(rows, dsts, (int)_dst.step, dsize.width);
+        x = PyrUpVecV<WT, T>(rows, dsts, dsize.width);
         for( ; x < dsize.width; x++ )
         {
             T t1 = castOp((row1[x] + row2[x])*4);
@@ -907,15 +1249,15 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde
 
     PyrFunc func = 0;
     if( depth == CV_8U )
-        func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
+        func = pyrDown_< FixPtCast<uchar, 8> >;
     else if( depth == CV_16S )
-        func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
+        func = pyrDown_< FixPtCast<short, 8> >;
     else if( depth == CV_16U )
-        func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
+        func = pyrDown_< FixPtCast<ushort, 8> >;
     else if( depth == CV_32F )
-        func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
+        func = pyrDown_< FltCast<float, 8> >;
     else if( depth == CV_64F )
-        func = pyrDown_<FltCast<double, 8>, PyrDownNoVec<double, double> >;
+        func = pyrDown_< FltCast<double, 8> >;
     else
         CV_Error( CV_StsUnsupportedFormat, "" );
 
@@ -1010,15 +1352,15 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT
 
     PyrFunc func = 0;
     if( depth == CV_8U )
-        func = pyrUp_<FixPtCast<uchar, 6>, PyrUpVec_32s8u >;
+        func = pyrUp_< FixPtCast<uchar, 6> >;
     else if( depth == CV_16S )
-        func = pyrUp_<FixPtCast<short, 6>, PyrUpVec_32s16s >;
+        func = pyrUp_< FixPtCast<short, 6> >;
     else if( depth == CV_16U )
-        func = pyrUp_<FixPtCast<ushort, 6>, PyrUpVec_32s16u >;
+        func = pyrUp_< FixPtCast<ushort, 6> >;
     else if( depth == CV_32F )
-        func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
+        func = pyrUp_< FltCast<float, 6> >;
     else if( depth == CV_64F )
-        func = pyrUp_<FltCast<double, 6>, PyrUpNoVec<double, double> >;
+        func = pyrUp_< FltCast<double, 6> >;
     else
         CV_Error( CV_StsUnsupportedFormat, "" );
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 996f6977b1..2882f26341 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -485,7 +485,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *o
         v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3));
         v_uint8 v_src01, v_src23;
         v_uint16 v_src0, v_src1, v_src2, v_src3;
-        v_zip(vx_lut_quads(src, ofst3), vx_lut_quads(src+3, ofst3), v_src01, v_src23);
+        v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)) >> 8), v_src01, v_src23);
         v_expand(v_src01, v_src0, v_src1);
         v_expand(v_src23, v_src2, v_src3);
 
diff --git a/modules/imgproc/test/test_lsd.cpp b/modules/imgproc/test/test_lsd.cpp
index 43d00b4928..e162a3c6f4 100644
--- a/modules/imgproc/test/test_lsd.cpp
+++ b/modules/imgproc/test/test_lsd.cpp
@@ -5,6 +5,8 @@
 
 namespace opencv_test { namespace {
 
+#if 0  // LSD implementation has been removed due original code license issues
+
 const Size img_size(640, 480);
 const int LSD_TEST_SEED = 0x134679;
 const int EPOCHS = 20;
@@ -402,4 +404,6 @@ TEST_F(Imgproc_LSD_Common, compareSegmentsVec4i)
     ASSERT_EQ(result2, 11);
 }
 
+#endif
+
 }} // namespace
diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py
index b1ea133edf..31b947a957 100755
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@@ -953,11 +953,19 @@ JNIEXPORT $rtype JNICALL Java_org_opencv_${module}_${clazz}_$fname
     def gen_class(self, ci):
         logging.info("%s", ci)
         # constants
+        consts_map = {c.name: c for c in ci.private_consts}
+        consts_map.update({c.name: c for c in ci.consts})
+        def const_value(v):
+            if v in consts_map:
+                target = consts_map[v]
+                assert target.value != v
+                return const_value(target.value)
+            return v
         if ci.private_consts:
             logging.info("%s", ci.private_consts)
             ci.j_code.write("""
     private static final int
-            %s;\n\n""" % (",\n"+" "*12).join(["%s = %s" % (c.name, c.value) for c in ci.private_consts])
+            %s;\n\n""" % (",\n"+" "*12).join(["%s = %s" % (c.name, const_value(c.value)) for c in ci.private_consts])
             )
         if ci.consts:
             enumTypes = set(map(lambda c: c.enumType, ci.consts))
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 39eddc5402..08c133feaf 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -205,11 +205,14 @@ public:
         for( j = 0; j < vcount; j++ )
         {
             Qfloat t = results[j];
-            Qfloat e = std::exp(std::abs(t));
-            if( t > 0 )
-                results[j] = (Qfloat)((e - 1.)/(e + 1.));
-            else
-                results[j] = (Qfloat)((1. - e)/(1. + e));
+            Qfloat e = std::exp(std::abs(t));          // Inf value is possible here
+            Qfloat r = (Qfloat)((e - 1.) / (e + 1.));  // NaN value is possible here (Inf/Inf or similar)
+            if (cvIsNaN(r))
+                r = std::numeric_limits<Qfloat>::infinity();
+            if (t < 0)
+                r = -r;
+            CV_DbgAssert(!cvIsNaN(r));
+            results[j] = r;
         }
     }
 
@@ -327,7 +330,7 @@ public:
         const Qfloat max_val = (Qfloat)(FLT_MAX*1e-3);
         for( int j = 0; j < vcount; j++ )
         {
-            if( results[j] > max_val )
+            if (!(results[j] <= max_val))  // handle NaNs too
                 results[j] = max_val;
         }
     }
@@ -1949,6 +1952,7 @@ public:
                             const DecisionFunc& df = svm->decision_func[dfi];
                             sum = -df.rho;
                             int sv_count = svm->getSVCount(dfi);
+                            CV_DbgAssert(sv_count > 0);
                             const double* alpha = &svm->df_alpha[df.ofs];
                             const int* sv_index = &svm->df_index[df.ofs];
                             for( k = 0; k < sv_count; k++ )
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index 52ff70d3bc..b3cbff5b42 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -92,7 +92,7 @@ bool pyopencv_to(PyObject* dst, TYPE& src, const char* name)
 {                                                                                                     \
     if (!dst || dst == Py_None)                                                                       \
         return true;                                                                                  \
-    std::underlying_type<TYPE>::type underlying = 0;                                                  \
+    int underlying = 0;                                                  \
                                                                                                       \
     if (!pyopencv_to(dst, underlying, name)) return false;                                            \
     src = static_cast<TYPE>(underlying);                                                              \
@@ -103,7 +103,7 @@ bool pyopencv_to(PyObject* dst, TYPE& src, const char* name)
 template<>                                                                                            \
 PyObject* pyopencv_from(const TYPE& src)                                                              \
 {                                                                                                     \
-    return pyopencv_from(static_cast<std::underlying_type<TYPE>::type>(src));                         \
+    return pyopencv_from(static_cast<int>(src));                         \
 }
 
 #include "pyopencv_generated_include.h"
@@ -847,6 +847,40 @@ bool pyopencv_to(PyObject* obj, Range& r, const char* name)
     CV_UNUSED(name);
     if(!obj || obj == Py_None)
         return true;
+    while (PySequence_Check(obj))
+    {
+        PyObject *fi = PySequence_Fast(obj, name);
+        if (fi == NULL)
+            break;
+        if (2 != PySequence_Fast_GET_SIZE(fi))
+        {
+            failmsg("Range value for argument '%s' is longer than 2", name);
+            Py_DECREF(fi);
+            return false;
+        }
+        {
+            PyObject *item = PySequence_Fast_GET_ITEM(fi, 0);
+            if (PyInt_Check(item)) {
+                r.start = (int)PyInt_AsLong(item);
+            } else {
+                failmsg("Range.start value for argument '%s' is not integer", name);
+                Py_DECREF(fi);
+                break;
+            }
+        }
+        {
+            PyObject *item = PySequence_Fast_GET_ITEM(fi, 1);
+            if (PyInt_Check(item)) {
+                r.end = (int)PyInt_AsLong(item);
+            } else {
+                failmsg("Range.end value for argument '%s' is not integer", name);
+                Py_DECREF(fi);
+                break;
+            }
+        }
+        Py_DECREF(fi);
+        return true;
+    }
     if(PyObject_Size(obj) == 0)
     {
         r = Range::all();
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index 2fb5dbc5c1..3e69f3c138 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -243,6 +243,14 @@ make & enjoy!
 #define V4L2_CID_ISO_SENSITIVITY (V4L2_CID_CAMERA_CLASS_BASE+23)
 #endif
 
+// https://github.com/opencv/opencv/issues/13929
+#ifndef V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_HEIGHT
+#define V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_HEIGHT (V4L2_CID_MPEG_BASE+364)
+#endif
+#ifndef V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_WIDTH
+#define V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_WIDTH (V4L2_CID_MPEG_BASE+365)
+#endif
+
 /* Defaults - If your board can do better, set it here.  Set for the most common type inputs. */
 #define DEFAULT_V4L_WIDTH  640
 #define DEFAULT_V4L_HEIGHT 480
diff --git a/platforms/android/build_sdk.py b/platforms/android/build_sdk.py
index 34b93824e2..678e48bdb7 100755
--- a/platforms/android/build_sdk.py
+++ b/platforms/android/build_sdk.py
@@ -50,12 +50,12 @@ def check_dir(d, create=False, clean=False):
 
 def check_executable(cmd):
     try:
-        FNULL = open(os.devnull, 'w')
-        retcode = subprocess.call(cmd, stdout=FNULL, stderr=subprocess.STDOUT)
-        if retcode < 0:
-            return False
+        log.debug("Executing: %s" % cmd)
+        result = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+        log.debug("Result: %s" % (result+'\n').split('\n')[0])
         return True
-    except:
+    except Exception as e:
+        log.debug('Failed: %s' % e)
         return False
 
 def determine_opencv_version(version_hpp_path):
@@ -140,9 +140,11 @@ class Builder:
         self.extra_packs = []
         self.opencv_version = determine_opencv_version(os.path.join(self.opencvdir, "modules", "core", "include", "opencv2", "core", "version.hpp"))
         self.use_ccache = False if config.no_ccache else True
+        self.cmake_path = self.get_cmake()
+        self.ninja_path = self.get_ninja()
 
     def get_cmake(self):
-        if check_executable(['cmake', '--version']):
+        if not self.config.use_android_buildtools and check_executable(['cmake', '--version']):
             log.info("Using cmake from PATH")
             return 'cmake'
         # look to see if Android SDK's cmake is installed
@@ -157,7 +159,7 @@ class Builder:
         raise Fail("Can't find cmake")
 
     def get_ninja(self):
-        if check_executable(['ninja', '--version']):
+        if not self.config.use_android_buildtools and check_executable(['ninja', '--version']):
             log.info("Using ninja from PATH")
             return 'ninja'
         # Android SDK's cmake includes a copy of ninja - look to see if its there
@@ -195,7 +197,7 @@ class Builder:
             rm_one(d)
 
     def build_library(self, abi, do_install):
-        cmd = [self.get_cmake(), "-GNinja"]
+        cmd = [self.cmake_path, "-GNinja"]
         cmake_vars = dict(
             CMAKE_TOOLCHAIN_FILE=self.get_toolchain_file(),
             INSTALL_CREATE_DISTRIB="ON",
@@ -208,8 +210,9 @@ class Builder:
             BUILD_DOCS="OFF",
             BUILD_ANDROID_EXAMPLES="ON",
             INSTALL_ANDROID_EXAMPLES="ON",
-            CMAKE_MAKE_PROGRAM=self.get_ninja(),
         )
+        if self.ninja_path != 'ninja':
+            cmake_vars['CMAKE_MAKE_PROGRAM'] = self.ninja_path
 
         if self.config.extra_modules_path is not None:
             cmd.append("-DOPENCV_EXTRA_MODULES_PATH='%s'" % self.config.extra_modules_path)
@@ -223,7 +226,7 @@ class Builder:
         cmd += [ "-D%s='%s'" % (k, v) for (k, v) in cmake_vars.items() if v is not None]
         cmd.append(self.opencvdir)
         execute(cmd)
-        execute([self.get_ninja(), "install/strip"])
+        execute([self.ninja_path, "install/strip"])
 
     def build_javadoc(self):
         classpaths = []
@@ -273,6 +276,7 @@ if __name__ == "__main__":
     parser.add_argument('--config', default='ndk-18.config.py', type=str, help="Package build configuration", )
     parser.add_argument('--ndk_path', help="Path to Android NDK to use for build")
     parser.add_argument('--sdk_path', help="Path to Android SDK to use for build")
+    parser.add_argument('--use_android_buildtools', action="store_true", help='Use cmake/ninja build tools from Android SDK')
     parser.add_argument("--extra_modules_path", help="Path to extra modules to use for build")
     parser.add_argument('--sign_with', help="Certificate to sign the Manager apk")
     parser.add_argument('--build_doc', action="store_true", help="Build javadoc")
@@ -303,7 +307,7 @@ if __name__ == "__main__":
         raise Fail("NDK location not set. Either pass --ndk_path or set ANDROID_NDK environment variable")
 
     if not check_executable(['ccache', '--version']):
-        log.info("ccache not found - disabling ccache supprt")
+        log.info("ccache not found - disabling ccache support")
         args.no_ccache = True
 
     if os.path.realpath(args.work_dir) == os.path.realpath(SCRIPT_DIR):
diff --git a/samples/cpp/lsd_lines.cpp b/samples/cpp/lsd_lines.cpp
deleted file mode 100644
index 3feed9cbc2..0000000000
--- a/samples/cpp/lsd_lines.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-#include "opencv2/imgproc.hpp"
-#include "opencv2/imgcodecs.hpp"
-#include "opencv2/highgui.hpp"
-#include <iostream>
-
-using namespace std;
-using namespace cv;
-
-int main(int argc, char** argv)
-{
-    cv::CommandLineParser parser(argc, argv,
-                                 "{input   i|building.jpg|input image}"
-                                 "{refine  r|false|if true use LSD_REFINE_STD method, if false use LSD_REFINE_NONE method}"
-                                 "{canny   c|false|use Canny edge detector}"
-                                 "{overlay o|false|show result on input image}"
-                                 "{help    h|false|show help message}");
-
-    if (parser.get<bool>("help"))
-    {
-        parser.printMessage();
-        return 0;
-    }
-
-    parser.printMessage();
-
-    String filename = samples::findFile(parser.get<String>("input"));
-    bool useRefine = parser.get<bool>("refine");
-    bool useCanny = parser.get<bool>("canny");
-    bool overlay = parser.get<bool>("overlay");
-
-    Mat image = imread(filename, IMREAD_GRAYSCALE);
-
-    if( image.empty() )
-    {
-        cout << "Unable to load " << filename;
-        return 1;
-    }
-
-    imshow("Source Image", image);
-
-    if (useCanny)
-    {
-        Canny(image, image, 50, 200, 3); // Apply Canny edge detector
-    }
-
-    // Create and LSD detector with standard or no refinement.
-    Ptr<LineSegmentDetector> ls = useRefine ? createLineSegmentDetector(LSD_REFINE_STD) : createLineSegmentDetector(LSD_REFINE_NONE);
-
-    double start = double(getTickCount());
-    vector<Vec4f> lines_std;
-
-    // Detect the lines
-    ls->detect(image, lines_std);
-
-    double duration_ms = (double(getTickCount()) - start) * 1000 / getTickFrequency();
-    std::cout << "It took " << duration_ms << " ms." << std::endl;
-
-    // Show found lines
-    if (!overlay || useCanny)
-    {
-        image = Scalar(0, 0, 0);
-    }
-
-    ls->drawSegments(image, lines_std);
-
-    String window_name = useRefine ? "Result - standard refinement" : "Result - no refinement";
-    window_name += useCanny ? " - Canny edge detector used" : "";
-
-    imshow(window_name, image);
-
-    waitKey();
-    return 0;
-}
diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py
index 5a8e62495d..b46b1d492c 100644
--- a/samples/dnn/tf_text_graph_common.py
+++ b/samples/dnn/tf_text_graph_common.py
@@ -324,6 +324,6 @@ def writeTextGraph(modelPath, outputPath, outNodes):
             for node in graph_def.node:
                 if node.op == 'Const':
                     if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
-                        node.attr['value'].tensor.tensor_content = ''
+                        node.attr['value'].tensor.tensor_content = b''
 
         tf.train.write_graph(graph_def, "", outputPath, as_text=True)