Merge remote-tracking branch 'upstream/3.4' into merge-3.4

Alexander Alekhin 2020-08-26 15:51:25 +00:00
commit ba147d2be2
13 changed files with 771 additions and 174 deletions

View File

@ -9,6 +9,9 @@ MathJax.Hub.Config(
forkfour: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ #5 & \\mbox{#6}\\\\ #7 & \\mbox{#8}\\\\ \\end{array} \\right.", 8],
vecthree: ["\\begin{bmatrix} #1\\\\ #2\\\\ #3 \\end{bmatrix}", 3],
vecthreethree: ["\\begin{bmatrix} #1 & #2 & #3\\\\ #4 & #5 & #6\\\\ #7 & #8 & #9 \\end{bmatrix}", 9],
cameramatrix: ["#1 = \\begin{bmatrix} f_x & 0 & c_x\\\\ 0 & f_y & c_y\\\\ 0 & 0 & 1 \\end{bmatrix}", 1],
distcoeffs: ["(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \\tau_x, \\tau_y]]]]) \\text{ of 4, 5, 8, 12 or 14 elements}"],
distcoeffsfisheye: ["(k_1, k_2, k_3, k_4)"],
hdotsfor: ["\\dots", 1],
mathbbm: ["\\mathbb{#1}", 1],
bordermatrix: ["\\matrix{#1}", 1]

View File

@ -51,3 +51,20 @@
#7 & #8 & #9
\end{bmatrix}
}
\newcommand{\cameramatrix}[1]{
#1 =
\begin{bmatrix}
f_x & 0 & c_x\\
0 & f_y & c_y\\
0 & 0 & 1
\end{bmatrix}
}
\newcommand{\distcoeffs}{
(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]]) \text{ of 4, 5, 8, 12 or 14 elements}
}
\newcommand{\distcoeffsfisheye}{
(k_1, k_2, k_3, k_4)
}

View File

@ -36,18 +36,27 @@ class PatternMaker:
def make_circles_pattern(self):
spacing = self.square_size
r = spacing / self.radius_rate
for x in range(1, self.cols + 1):
for y in range(1, self.rows + 1):
dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black", stroke="none")
pattern_width = ((self.cols - 1.0) * spacing) + (2.0 * r)
pattern_height = ((self.rows - 1.0) * spacing) + (2.0 * r)
x_spacing = (self.width - pattern_width) / 2.0
y_spacing = (self.height - pattern_height) / 2.0
for x in range(0, self.cols):
for y in range(0, self.rows):
dot = SVG("circle", cx=(x * spacing) + x_spacing + r,
cy=(y * spacing) + y_spacing + r, r=r, fill="black", stroke="none")
self.g.append(dot)
def make_acircles_pattern(self):
spacing = self.square_size
r = spacing / self.radius_rate
for i in range(0, self.rows):
for j in range(0, self.cols):
dot = SVG("circle", cx=((j * 2 + i % 2) * spacing) + spacing, cy=self.height - (i * spacing + spacing),
r=r, fill="black", stroke="none")
pattern_width = ((self.cols-1.0) * 2 * spacing) + spacing + (2.0 * r)
pattern_height = ((self.rows-1.0) * spacing) + (2.0 * r)
x_spacing = (self.width - pattern_width) / 2.0
y_spacing = (self.height - pattern_height) / 2.0
for x in range(0, self.cols):
for y in range(0, self.rows):
dot = SVG("circle", cx=(2 * x * spacing) + (y % 2)*spacing + x_spacing + r,
cy=(y * spacing) + y_spacing + r, r=r, fill="black", stroke="none")
self.g.append(dot)
def make_checkerboard_pattern(self):
@ -84,9 +93,9 @@ def main():
parser.add_argument("-R", "--radius_rate", help="circles_radius = square_size/radius_rate", default="5.0",
action="store", dest="radius_rate", type=float)
parser.add_argument("-w", "--page_width", help="page width in units", default="216", action="store",
dest="page_width", type=int)
dest="page_width", type=float)
parser.add_argument("-h", "--page_height", help="page height in units", default="279", action="store",
dest="page_width", type=int)
dest="page_width", type=float)
parser.add_argument("-a", "--page_size", help="page size, supersedes -h -w arguments", default="A4", action="store",
dest="page_size", choices=["A0", "A1", "A2", "A3", "A4", "A5"])
args = parser.parse_args()
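As a quick cross-check of the centering math introduced above, a minimal standalone Python sketch (grid and page sizes are made-up values, not the script's defaults):

# Sketch of the circle-grid centering used in make_circles_pattern (illustrative only).
square_size, radius_rate = 20.0, 5.0
cols, rows = 4, 3
page_width, page_height = 216.0, 279.0

spacing = square_size
r = spacing / radius_rate
pattern_width = (cols - 1.0) * spacing + 2.0 * r
pattern_height = (rows - 1.0) * spacing + 2.0 * r
x_spacing = (page_width - pattern_width) / 2.0    # left margin that centers the grid
y_spacing = (page_height - pattern_height) / 2.0  # top margin that centers the grid

centers = [(x * spacing + x_spacing + r, y * spacing + y_spacing + r)
           for x in range(cols) for y in range(rows)]
xs, ys = [c[0] for c in centers], [c[1] for c in centers]
# The bounding box of the dots ends up centered on the page.
assert abs((min(xs) - r) - (page_width - (max(xs) + r))) < 1e-9
assert abs((min(ys) - r) - (page_height - (max(ys) + r))) < 1e-9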

View File

@ -64,17 +64,17 @@ The distortion-free projective transformation given by a pinhole camera model i
\f[s \; p = A \begin{bmatrix} R|t \end{bmatrix} P_w,\f]
where \f$P_w\f$ is a 3D point expressed with respect to the world coordinate system,
\f$p\f$ is a 2D pixel in the image plane, \f$A\f$ is the intrinsic camera matrix,
\f$p\f$ is a 2D pixel in the image plane, \f$A\f$ is the camera intrinsic matrix,
\f$R\f$ and \f$t\f$ are the rotation and translation that describe the change of coordinates from
world to camera coordinate systems (or camera frame) and \f$s\f$ is the projective transformation's
arbitrary scaling and not part of the camera model.
The intrinsic camera matrix \f$A\f$ (notation used as in @cite Zhang2000 and also generally notated
The camera intrinsic matrix \f$A\f$ (notation used as in @cite Zhang2000 and also generally notated
as \f$K\f$) projects 3D points given in the camera coordinate system to 2D pixel coordinates, i.e.
\f[p = A P_c.\f]
The camera matrix \f$A\f$ is composed of the focal lengths \f$f_x\f$ and \f$f_y\f$, which are
The camera intrinsic matrix \f$A\f$ is composed of the focal lengths \f$f_x\f$ and \f$f_y\f$, which are
expressed in pixel units, and the principal point \f$(c_x, c_y)\f$, that is usually close to the
image center:
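As a quick numeric illustration of this projection model, a standalone Python sketch with made-up intrinsics:

import numpy as np

# Hypothetical intrinsics: focal lengths in pixels, principal point near the image center.
fx, fy, cx, cy = 800.0, 800.0, 320.0, 240.0
A = np.array([[fx, 0, cx],
              [0, fy, cy],
              [0,  0,  1]])                     # camera intrinsic matrix

# A 3D point already expressed in the camera coordinate system (Z pointing forward).
P_c = np.array([0.1, -0.05, 2.0])

p_h = A @ P_c                                   # p = A P_c (homogeneous)
u, v = p_h[:2] / p_h[2]                         # divide by the projective scale s
assert np.isclose(u, fx * P_c[0] / P_c[2] + cx)
assert np.isclose(v, fy * P_c[1] / P_c[2] + cy)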
@ -382,9 +382,9 @@ R & t \\
\end{bmatrix} P_{h_0}.\f]
@note
- Many functions in this module take a camera matrix as an input parameter. Although all
- Many functions in this module take a camera intrinsic matrix as an input parameter. Although all
functions assume the same structure of this parameter, they may name it differently. The
parameter's description, however, will be clear in that a camera matrix with the structure
parameter's description, however, will be clear in that a camera intrinsic matrix with the structure
shown above is required.
- A calibration sample for 3 cameras in a horizontal position can be found at
opencv_source_code/samples/cpp/3calibration.cpp
@ -457,8 +457,10 @@ enum SolvePnPMethod {
SOLVEPNP_ITERATIVE = 0,
SOLVEPNP_EPNP = 1, //!< EPnP: Efficient Perspective-n-Point Camera Pose Estimation @cite lepetit2009epnp
SOLVEPNP_P3P = 2, //!< Complete Solution Classification for the Perspective-Three-Point Problem @cite gao2003complete
SOLVEPNP_DLS = 3, //!< A Direct Least-Squares (DLS) Method for PnP @cite hesch2011direct
SOLVEPNP_UPNP = 4, //!< Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation @cite penate2013exhaustive
SOLVEPNP_DLS = 3, //!< **Broken implementation. Using this flag will fallback to EPnP.** \n
//!< A Direct Least-Squares (DLS) Method for PnP @cite hesch2011direct
SOLVEPNP_UPNP = 4, //!< **Broken implementation. Using this flag will fallback to EPnP.** \n
//!< Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation @cite penate2013exhaustive
SOLVEPNP_AP3P = 5, //!< An Efficient Algebraic Solution to the Perspective-Three-Point Problem @cite Ke17
SOLVEPNP_IPPE = 6, //!< Infinitesimal Plane-Based Pose Estimation @cite Collins14 \n
//!< Object points must be coplanar.
@ -752,10 +754,10 @@ CV_EXPORTS_W Vec3d RQDecomp3x3( InputArray src, OutputArray mtxR, OutputArray mt
OutputArray Qy = noArray(),
OutputArray Qz = noArray());
/** @brief Decomposes a projection matrix into a rotation matrix and a camera matrix.
/** @brief Decomposes a projection matrix into a rotation matrix and a camera intrinsic matrix.
@param projMatrix 3x4 input projection matrix P.
@param cameraMatrix Output 3x3 camera matrix K.
@param cameraMatrix Output 3x3 camera intrinsic matrix \f$\cameramatrix{A}\f$.
@param rotMatrix Output 3x3 external rotation matrix R.
@param transVect Output 4x1 translation vector T.
@param rotMatrixX Optional 3x3 rotation matrix around x-axis.
@ -840,10 +842,9 @@ CV_EXPORTS_W void composeRT( InputArray rvec1, InputArray tvec1,
@param rvec The rotation vector (@ref Rodrigues) that, together with tvec, performs a change of
basis from world to camera coordinate system, see @ref calibrateCamera for details.
@param tvec The translation vector, see parameter description above.
@param cameraMatrix Camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$ .
@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is empty, the zero distortion coefficients are assumed.
\f$\distcoeffs\f$ . If the vector is empty, the zero distortion coefficients are assumed.
@param imagePoints Output array of image points, 1xN/Nx1 2-channel, or
vector\<Point2f\> .
@param jacobian Optional output 2Nx(10+\<numDistCoeffs\>) jacobian matrix of derivatives of image
@ -897,10 +898,9 @@ Number of input points must be 4. Object points must be defined in the following
1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
where N is the number of points. vector\<Point2d\> can be also passed here.
@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
the model coordinate system to the camera coordinate system.
@ -912,7 +912,7 @@ vectors, respectively, and further optimizes them.
- **SOLVEPNP_ITERATIVE** Iterative method is based on a Levenberg-Marquardt optimization. In
this case the function finds such a pose that minimizes reprojection error, that is the sum
of squared distances between the observed projections imagePoints and the projected (using
projectPoints ) objectPoints .
@ref projectPoints ) objectPoints .
- **SOLVEPNP_P3P** Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang
"Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete).
In this case the function requires exactly four object and image points.
@ -921,9 +921,11 @@ In this case the function requires exactly four object and image points.
In this case the function requires exactly four object and image points.
- **SOLVEPNP_EPNP** Method has been introduced by F. Moreno-Noguer, V. Lepetit and P. Fua in the
paper "EPnP: Efficient Perspective-n-Point Camera Pose Estimation" (@cite lepetit2009epnp).
- **SOLVEPNP_DLS** Method is based on the paper of J. Hesch and S. Roumeliotis.
- **SOLVEPNP_DLS** **Broken implementation. Using this flag will fallback to EPnP.** \n
Method is based on the paper of J. Hesch and S. Roumeliotis.
"A Direct Least-Squares (DLS) Method for PnP" (@cite hesch2011direct).
- **SOLVEPNP_UPNP** Method is based on the paper of A. Penate-Sanchez, J. Andrade-Cetto,
- **SOLVEPNP_UPNP** **Broken implementation. Using this flag will fallback to EPnP.** \n
Method is based on the paper of A. Penate-Sanchez, J. Andrade-Cetto,
F. Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length
Estimation" (@cite penate2013exhaustive). In this case the function also estimates the parameters \f$f_x\f$ and \f$f_y\f$
assuming that both have the same value. Then the cameraMatrix is updated with the estimated
@ -939,7 +941,7 @@ It requires 4 coplanar object points defined in the following order:
- point 3: [-squareLength / 2, -squareLength / 2, 0]
The function estimates the object pose given a set of object points, their corresponding image
projections, as well as the camera matrix and the distortion coefficients, see the figure below
projections, as well as the camera intrinsic matrix and the distortion coefficients, see the figure below
(more precisely, the X-axis of the camera frame is pointing to the right, the Y-axis downward
and the Z-axis forward).
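A minimal Python sketch of a typical solvePnP call on synthetic data (points, intrinsics and pose below are made up for illustration):

import numpy as np
import cv2 as cv

object_points = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0],
                          [0.5, 0.5, 0.5], [1, 0, 1]], dtype=np.float64)
camera_matrix = np.array([[800.0, 0, 320], [0, 800.0, 240], [0, 0, 1]])
dist_coeffs = np.zeros(5)

# Ground-truth pose, used here only to generate consistent image points.
rvec_gt = np.array([0.1, -0.2, 0.05])
tvec_gt = np.array([0.3, -0.1, 5.0])
image_points, _ = cv.projectPoints(object_points, rvec_gt, tvec_gt, camera_matrix, dist_coeffs)

ok, rvec, tvec = cv.solvePnP(object_points, image_points, camera_matrix, dist_coeffs,
                             flags=cv.SOLVEPNP_ITERATIVE)
assert ok
assert np.allclose(rvec.ravel(), rvec_gt, atol=1e-4)
assert np.allclose(tvec.ravel(), tvec_gt, atol=1e-4)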
@ -1072,10 +1074,9 @@ CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints,
1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
where N is the number of points. vector\<Point2d\> can be also passed here.
@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ .
@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
the model coordinate system to the camera coordinate system.
@ -1092,7 +1093,7 @@ an inlier.
@param flags Method for solving a PnP problem (see @ref solvePnP ).
The function estimates an object pose given a set of object points, their corresponding image
projections, as well as the camera matrix and the distortion coefficients. This function finds such
projections, as well as the camera intrinsic matrix and the distortion coefficients. This function finds such
a pose that minimizes reprojection error, that is, the sum of squared distances between the observed
projections imagePoints and the projected (using @ref projectPoints ) objectPoints. The use of RANSAC
makes the function resistant to outliers.
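A hedged sketch of the RANSAC variant on synthetic data with one deliberately corrupted correspondence (illustration only):

import numpy as np
import cv2 as cv

rng = np.random.default_rng(0)
object_points = rng.uniform(-1.0, 1.0, (20, 3))
object_points[:, 2] += 4.0                         # keep the points in front of the camera
camera_matrix = np.array([[800.0, 0, 320], [0, 800.0, 240], [0, 0, 1]])
dist_coeffs = np.zeros(5)
rvec_gt = np.array([0.05, 0.1, -0.03])
tvec_gt = np.array([0.2, -0.1, 1.0])

image_points, _ = cv.projectPoints(object_points, rvec_gt, tvec_gt, camera_matrix, dist_coeffs)
image_points = image_points.reshape(-1, 2)
image_points[0] += 50.0                            # inject a gross outlier

ok, rvec, tvec, inliers = cv.solvePnPRansac(object_points, image_points,
                                            camera_matrix, dist_coeffs,
                                            reprojectionError=2.0)
assert ok
assert 0 not in inliers.ravel()                    # the corrupted correspondence is rejected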
@ -1131,10 +1132,9 @@ CV_EXPORTS_W bool solvePnPRansac( InputArray objectPoints, InputArray imagePoint
1x3/3x1 3-channel. vector\<Point3f\> can be also passed here.
@param imagePoints Array of corresponding image points, 3x2 1-channel or 1x3/3x1 2-channel.
vector\<Point2f\> can be also passed here.
@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param rvecs Output rotation vectors (see @ref Rodrigues ) that, together with tvecs, brings points from
the model coordinate system to the camera coordinate system. A P3P problem has up to 4 solutions.
@ -1146,7 +1146,7 @@ the model coordinate system to the camera coordinate system. A P3P problem has u
"An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).
The function estimates the object pose given 3 object points, their corresponding image
projections, as well as the camera matrix and the distortion coefficients.
projections, as well as the camera intrinsic matrix and the distortion coefficients.
@note
The solutions are sorted by reprojection errors (lowest to highest).
@ -1163,10 +1163,9 @@ to the camera coordinate frame) from a 3D-2D point correspondences and starting
where N is the number of points. vector\<Point3d\> can also be passed here.
@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
where N is the number of points. vector\<Point2d\> can also be passed here.
@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
@ -1175,7 +1174,7 @@ the model coordinate system to the camera coordinate system. Input values are us
The function refines the object pose given at least 3 object points, their corresponding image
projections, an initial solution for the rotation and translation vector,
as well as the camera matrix and the distortion coefficients.
as well as the camera intrinsic matrix and the distortion coefficients.
The function minimizes the projection error with respect to the rotation and the translation vectors, according
to a Levenberg-Marquardt iterative minimization @cite Madsen04 @cite Eade13 process.
*/
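A small sketch of how the refinement is typically used, starting from a perturbed initial pose (synthetic values for illustration):

import numpy as np
import cv2 as cv

object_points = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0],
                          [0.5, 0.5, 1.0]], dtype=np.float64)
camera_matrix = np.array([[800.0, 0, 320], [0, 800.0, 240], [0, 0, 1]])
dist_coeffs = np.zeros(5)
rvec_gt = np.array([[0.1], [-0.2], [0.05]])
tvec_gt = np.array([[0.3], [-0.1], [5.0]])
image_points, _ = cv.projectPoints(object_points, rvec_gt, tvec_gt, camera_matrix, dist_coeffs)

# Start from a slightly perturbed pose and let the Levenberg-Marquardt step pull it back.
rvec0 = rvec_gt + 0.05
tvec0 = tvec_gt + 0.1
rvec, tvec = cv.solvePnPRefineLM(object_points, image_points, camera_matrix, dist_coeffs,
                                 rvec0, tvec0)
assert np.allclose(rvec, rvec_gt, atol=1e-3)
assert np.allclose(tvec, tvec_gt, atol=1e-3)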
@ -1191,10 +1190,9 @@ to the camera coordinate frame) from a 3D-2D point correspondences and starting
where N is the number of points. vector\<Point3d\> can also be passed here.
@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
where N is the number of points. vector\<Point2d\> can also be passed here.
@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
@ -1205,7 +1203,7 @@ gain in the Damped Gauss-Newton formulation.
The function refines the object pose given at least 3 object points, their corresponding image
projections, an initial solution for the rotation and translation vector,
as well as the camera matrix and the distortion coefficients.
as well as the camera intrinsic matrix and the distortion coefficients.
The function minimizes the projection error with respect to the rotation and the translation vectors, using a
virtual visual servoing (VVS) @cite Chaumette06 @cite Marchand16 scheme.
*/
@ -1233,10 +1231,9 @@ Only 1 solution is returned.
1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
where N is the number of points. vector\<Point2d\> can be also passed here.
@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param rvecs Vector of output rotation vectors (see @ref Rodrigues ) that, together with tvecs, brings points from
the model coordinate system to the camera coordinate system.
@ -1257,9 +1254,11 @@ In this case the function requires exactly four object and image points.
In this case the function requires exactly four object and image points.
- **SOLVEPNP_EPNP** Method has been introduced by F.Moreno-Noguer, V.Lepetit and P.Fua in the
paper "EPnP: Efficient Perspective-n-Point Camera Pose Estimation" (@cite lepetit2009epnp).
- **SOLVEPNP_DLS** Method is based on the paper of Joel A. Hesch and Stergios I. Roumeliotis.
- **SOLVEPNP_DLS** **Broken implementation. Using this flag will fallback to EPnP.** \n
Method is based on the paper of Joel A. Hesch and Stergios I. Roumeliotis.
"A Direct Least-Squares (DLS) Method for PnP" (@cite hesch2011direct).
- **SOLVEPNP_UPNP** Method is based on the paper of A.Penate-Sanchez, J.Andrade-Cetto,
- **SOLVEPNP_UPNP** **Broken implementation. Using this flag will fallback to EPnP.** \n
Method is based on the paper of A.Penate-Sanchez, J.Andrade-Cetto,
F.Moreno-Noguer. "Exhaustive Linearization for Robust Camera Pose and Focal Length
Estimation" (@cite penate2013exhaustive). In this case the function also estimates the parameters \f$f_x\f$ and \f$f_y\f$
assuming that both have the same value. Then the cameraMatrix is updated with the estimated
@ -1282,7 +1281,7 @@ and useExtrinsicGuess is set to true.
and the 3D object points projected with the estimated pose.
The function estimates the object pose given a set of object points, their corresponding image
projections, as well as the camera matrix and the distortion coefficients, see the figure below
projections, as well as the camera intrinsic matrix and the distortion coefficients, see the figure below
(more precisely, the X-axis of the camera frame is pointing to the right, the Y-axis downward
and the Z-axis forward).
@ -1411,7 +1410,7 @@ CV_EXPORTS_W int solvePnPGeneric( InputArray objectPoints, InputArray imagePoint
InputArray rvec = noArray(), InputArray tvec = noArray(),
OutputArray reprojectionError = noArray() );
/** @brief Finds an initial camera matrix from 3D-2D point correspondences.
/** @brief Finds an initial camera intrinsic matrix from 3D-2D point correspondences.
@param objectPoints Vector of vectors of the calibration pattern points in the calibration pattern
coordinate space. In the old interface all the per-view vectors are concatenated. See
@ -1422,7 +1421,7 @@ old interface all the per-view vectors are concatenated.
@param aspectRatio If it is zero or negative, both \f$f_x\f$ and \f$f_y\f$ are estimated independently.
Otherwise, \f$f_x = f_y * \texttt{aspectRatio}\f$ .
The function estimates and returns an initial camera matrix for the camera calibration process.
The function estimates and returns an initial camera intrinsic matrix for the camera calibration process.
Currently, the function only supports planar calibration patterns, which are patterns where each
object point has z-coordinate =0.
*/
@ -1601,10 +1600,9 @@ CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSiz
@param image Input/output image. It must have 1 or 3 channels. The number of channels is not altered.
@param cameraMatrix Input 3x3 floating-point matrix of camera intrinsic parameters.
\f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$
\f$\cameramatrix{A}\f$
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is empty, the zero distortion coefficients are assumed.
\f$\distcoeffs\f$. If the vector is empty, the zero distortion coefficients are assumed.
@param rvec Rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
the model coordinate system to the camera coordinate system.
@param tvec Translation vector.
@ -1707,14 +1705,13 @@ pattern points (e.g. std::vector<std::vector<cv::Vec2f>>). imagePoints.size() an
objectPoints.size(), and imagePoints[i].size() and objectPoints[i].size() for each i, must be equal,
respectively. In the old interface all the vectors of object points from different views are
concatenated together.
@param imageSize Size of the image used only to initialize the intrinsic camera matrix.
@param cameraMatrix Input/output 3x3 floating-point camera matrix
\f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . If CV\_CALIB\_USE\_INTRINSIC\_GUESS
@param imageSize Size of the image used only to initialize the camera intrinsic matrix.
@param cameraMatrix Input/output 3x3 floating-point camera intrinsic matrix
\f$\cameramatrix{A}\f$ . If CV\_CALIB\_USE\_INTRINSIC\_GUESS
and/or CALIB_FIX_ASPECT_RATIO are specified, some or all of fx, fy, cx, cy must be
initialized before calling the function.
@param distCoeffs Input/output vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements.
\f$\distcoeffs\f$.
@param rvecs Output vector of rotation vectors (@ref Rodrigues ) estimated for each pattern view
(e.g. std::vector<cv::Mat>>). That is, each i-th rotation vector together with the corresponding
i-th translation vector (see the next output parameter description) brings the calibration pattern
@ -1904,9 +1901,9 @@ CV_EXPORTS_W double calibrateCameraRO( InputArrayOfArrays objectPoints,
int flags = 0, TermCriteria criteria = TermCriteria(
TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
/** @brief Computes useful camera characteristics from the camera matrix.
/** @brief Computes useful camera characteristics from the camera intrinsic matrix.
@param cameraMatrix Input camera matrix that can be estimated by calibrateCamera or
@param cameraMatrix Input camera intrinsic matrix that can be estimated by calibrateCamera or
stereoCalibrate .
@param imageSize Input image size in pixels.
@param apertureWidth Physical width in mm of the sensor.
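As a usage sketch, calling it with a made-up intrinsic matrix and sensor size (illustrative values only):

import math
import numpy as np
import cv2 as cv

camera_matrix = np.array([[800.0, 0, 320], [0, 800.0, 240], [0, 0, 1]])
image_size = (640, 480)              # pixels
aperture_w, aperture_h = 6.4, 4.8    # hypothetical sensor dimensions in mm

fovx, fovy, focal_mm, principal_pt, aspect = cv.calibrationMatrixValues(
    camera_matrix, image_size, aperture_w, aperture_h)

# With the principal point at the image center, the horizontal FOV reduces to 2*atan(w / (2*fx)).
assert math.isclose(fovx, math.degrees(2 * math.atan(640 / (2 * 800.0))), rel_tol=1e-6)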
@ -1942,15 +1939,15 @@ be equal for each i.
observed by the first camera. The same structure as in @ref calibrateCamera.
@param imagePoints2 Vector of vectors of the projections of the calibration pattern points,
observed by the second camera. The same structure as in @ref calibrateCamera.
@param cameraMatrix1 Input/output camera matrix for the first camera, the same as in
@param cameraMatrix1 Input/output camera intrinsic matrix for the first camera, the same as in
@ref calibrateCamera. Furthermore, for the stereo case, additional flags may be used, see below.
@param distCoeffs1 Input/output vector of distortion coefficients, the same as in
@ref calibrateCamera.
@param cameraMatrix2 Input/output second camera matrix for the second camera. See description for
@param cameraMatrix2 Input/output second camera intrinsic matrix for the second camera. See description for
cameraMatrix1.
@param distCoeffs2 Input/output lens distortion coefficients for the second camera. See
description for distCoeffs1.
@param imageSize Size of the image used only to initialize the intrinsic camera matrices.
@param imageSize Size of the image used only to initialize the camera intrinsic matrices.
@param R Output rotation matrix. Together with the translation vector T, this matrix brings
points given in the first camera's coordinate system to points in the second camera's
coordinate system. In more technical terms, the tuple of R and T performs a change of basis
@ -2071,9 +2068,9 @@ CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints,
/** @brief Computes rectification transforms for each head of a calibrated stereo camera.
@param cameraMatrix1 First camera matrix.
@param cameraMatrix1 First camera intrinsic matrix.
@param distCoeffs1 First camera distortion parameters.
@param cameraMatrix2 Second camera matrix.
@param cameraMatrix2 Second camera intrinsic matrix.
@param distCoeffs2 Second camera distortion parameters.
@param imageSize Size of the image used for stereo calibration.
@param R Rotation matrix from the coordinate system of the first camera to the second camera,
@ -2229,12 +2226,11 @@ CV_EXPORTS_W float rectify3Collinear( InputArray cameraMatrix1, InputArray distC
OutputArray Q, double alpha, Size newImgSize,
CV_OUT Rect* roi1, CV_OUT Rect* roi2, int flags );
/** @brief Returns the new camera matrix based on the free scaling parameter.
/** @brief Returns the new camera intrinsic matrix based on the free scaling parameter.
@param cameraMatrix Input camera matrix.
@param cameraMatrix Input camera intrinsic matrix.
@param distCoeffs Input vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of
4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are
\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
assumed.
@param imageSize Original image size.
@param alpha Free scaling parameter between 0 (when all the pixels in the undistorted image are
@ -2243,17 +2239,17 @@ stereoRectify for details.
@param newImgSize Image size after rectification. By default, it is set to imageSize .
@param validPixROI Optional output rectangle that outlines all-good-pixels region in the
undistorted image. See roi1, roi2 description in stereoRectify .
@param centerPrincipalPoint Optional flag that indicates whether in the new camera matrix the
@param centerPrincipalPoint Optional flag that indicates whether in the new camera intrinsic matrix the
principal point should be at the image center or not. By default, the principal point is chosen to
best fit a subset of the source image (determined by alpha) to the corrected image.
@return new_camera_matrix Output new camera matrix.
@return new_camera_matrix Output new camera intrinsic matrix.
The function computes and returns the optimal new camera matrix based on the free scaling parameter.
The function computes and returns the optimal new camera intrinsic matrix based on the free scaling parameter.
By varying this parameter, you may retrieve only sensible pixels alpha=0 , keep all the original
image pixels if there is valuable information in the corners alpha=1 , or get something in between.
When alpha\>0 , the undistorted result is likely to have some black pixels corresponding to
"virtual" pixels outside of the captured distorted image. The original camera matrix, distortion
coefficients, the computed new camera matrix, and newImageSize should be passed to
"virtual" pixels outside of the captured distorted image. The original camera intrinsic matrix, distortion
coefficients, the computed new camera intrinsic matrix, and newImageSize should be passed to
initUndistortRectifyMap to produce the maps for remap .
*/
CV_EXPORTS_W Mat getOptimalNewCameraMatrix( InputArray cameraMatrix, InputArray distCoeffs,
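The paragraph above describes the usual undistortion pipeline end to end; a minimal Python sketch of it (intrinsics, distortion coefficients and the input frame are made up):

import numpy as np
import cv2 as cv

img = np.full((480, 640, 3), 127, np.uint8)            # stand-in for a captured frame
K = np.array([[800.0, 0, 320], [0, 800.0, 240], [0, 0, 1]])
dist = np.array([-0.3, 0.1, 0.001, 0.001, 0.0])        # hypothetical distortion coefficients

h, w = img.shape[:2]
new_K, roi = cv.getOptimalNewCameraMatrix(K, dist, (w, h), alpha=1, newImgSize=(w, h))
map1, map2 = cv.initUndistortRectifyMap(K, dist, None, new_K, (w, h), cv.CV_16SC2)
undistorted = cv.remap(img, map1, map2, cv.INTER_LINEAR)

x, y, rw, rh = roi                                     # crop away the "virtual" black border
undistorted = undistorted[y:y + rh, x:x + rw]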
@ -2265,23 +2261,23 @@ CV_EXPORTS_W Mat getOptimalNewCameraMatrix( InputArray cameraMatrix, InputArray
@param[in] R_gripper2base Rotation part extracted from the homogeneous matrix that transforms a point
expressed in the gripper frame to the robot base frame (\f$_{}^{b}\textrm{T}_g\f$).
This is a vector (`vector<Mat>`) that contains the rotation matrices for all the transformations
from gripper frame to robot base frame.
This is a vector (`vector<Mat>`) that contains the rotations, as `(3x3)` rotation matrices or `(3x1)` rotation vectors,
for all the transformations from gripper frame to robot base frame.
@param[in] t_gripper2base Translation part extracted from the homogeneous matrix that transforms a point
expressed in the gripper frame to the robot base frame (\f$_{}^{b}\textrm{T}_g\f$).
This is a vector (`vector<Mat>`) that contains the translation vectors for all the transformations
This is a vector (`vector<Mat>`) that contains the `(3x1)` translation vectors for all the transformations
from gripper frame to robot base frame.
@param[in] R_target2cam Rotation part extracted from the homogeneous matrix that transforms a point
expressed in the target frame to the camera frame (\f$_{}^{c}\textrm{T}_t\f$).
This is a vector (`vector<Mat>`) that contains the rotation matrices for all the transformations
from calibration target frame to camera frame.
This is a vector (`vector<Mat>`) that contains the rotations, as `(3x3)` rotation matrices or `(3x1)` rotation vectors,
for all the transformations from calibration target frame to camera frame.
@param[in] t_target2cam Translation part extracted from the homogeneous matrix that transforms a point
expressed in the target frame to the camera frame (\f$_{}^{c}\textrm{T}_t\f$).
This is a vector (`vector<Mat>`) that contains the translation vectors for all the transformations
This is a vector (`vector<Mat>`) that contains the `(3x1)` translation vectors for all the transformations
from calibration target frame to camera frame.
@param[out] R_cam2gripper Estimated rotation part extracted from the homogeneous matrix that transforms a point
@param[out] R_cam2gripper Estimated `(3x3)` rotation part extracted from the homogeneous matrix that transforms a point
expressed in the camera frame to the gripper frame (\f$_{}^{g}\textrm{T}_c\f$).
@param[out] t_cam2gripper Estimated translation part extracted from the homogeneous matrix that transforms a point
@param[out] t_cam2gripper Estimated `(3x1)` translation part extracted from the homogeneous matrix that transforms a point
expressed in the camera frame to the gripper frame (\f$_{}^{g}\textrm{T}_c\f$).
@param[in] method One of the implemented Hand-Eye calibration methods, see cv::HandEyeCalibrationMethod
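A hedged Python sketch of the eye-in-hand geometry these parameters describe, on synthetic poses (all transforms below are invented so the estimate can be checked against them):

import numpy as np
import cv2 as cv

def to_h(rvec, t):
    # Build a 4x4 homogeneous transform from a rotation vector and a translation.
    T = np.eye(4)
    T[:3, :3] = cv.Rodrigues(np.asarray(rvec, dtype=np.float64))[0]
    T[:3, 3] = t
    return T

rng = np.random.default_rng(1)
X = to_h([0.2, -0.1, 0.3], [0.05, 0.02, 0.10])           # unknown cam->gripper transform to recover
T_target2base = to_h([0.0, 0.5, 0.0], [1.0, 0.0, 0.5])   # fixed target pose in the robot base frame

R_g2b, t_g2b, R_t2c, t_t2c = [], [], [], []
for _ in range(10):
    T_g2b = to_h(rng.uniform(-0.5, 0.5, 3), rng.uniform(-0.5, 0.5, 3))
    # target->cam = inv(cam->gripper) * inv(gripper->base) * target->base
    T_t2c = np.linalg.inv(X) @ np.linalg.inv(T_g2b) @ T_target2base
    R_g2b.append(T_g2b[:3, :3]); t_g2b.append(T_g2b[:3, 3].reshape(3, 1))
    R_t2c.append(T_t2c[:3, :3]); t_t2c.append(T_t2c[:3, 3].reshape(3, 1))

R_est, t_est = cv.calibrateHandEye(R_g2b, t_g2b, R_t2c, t_t2c, method=cv.CALIB_HAND_EYE_TSAI)
assert np.allclose(R_est, X[:3, :3], atol=1e-6)
assert np.allclose(t_est.ravel(), X[:3, 3], atol=1e-6)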
@ -2502,11 +2498,11 @@ CV_EXPORTS_W Mat findFundamentalMat( InputArray points1, InputArray points2,
@param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should
be floating-point (single or double precision).
@param points2 Array of the second image points of the same size and format as points1 .
@param cameraMatrix Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
Note that this function assumes that points1 and points2 are feature points from cameras with the
same camera matrix. If this assumption does not hold for your use case, use
same camera intrinsic matrix. If this assumption does not hold for your use case, use
`undistortPoints()` with `P = cv::NoArray()` for both cameras to transform image points
to normalized image coordinates, which are valid for the identity camera matrix. When
to normalized image coordinates, which are valid for the identity camera intrinsic matrix. When
passing these coordinates, pass the identity matrix for this parameter.
@param method Method for computing an essential matrix.
- **RANSAC** for the RANSAC algorithm.
@ -2553,10 +2549,10 @@ confidence (probability) that the estimated matrix is correct.
@param mask Output array of N elements, every element of which is set to 0 for outliers and to 1
for the other points. The array is computed only in the RANSAC and LMedS methods.
This function differs from the one above that it computes camera matrix from focal length and
This function differs from the one above in that it computes the camera intrinsic matrix from the focal length and
principal point:
\f[K =
\f[A =
\begin{bmatrix}
f & 0 & x_{pp} \\
0 & f & y_{pp} \\
@ -2653,9 +2649,9 @@ inliers that pass the check.
@param points1 Array of N 2D points from the first image. The point coordinates should be
floating-point (single or double precision).
@param points2 Array of the second image points of the same size and format as points1 .
@param cameraMatrix Camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
Note that this function assumes that points1 and points2 are feature points from cameras with the
same camera matrix.
same camera intrinsic matrix.
@param R Output rotation matrix. Together with the translation vector, this matrix makes up a tuple
that performs a change of basis from the first camera's coordinate system to the second camera's
coordinate system. Note that, in general, t can not be used for this tuple, see the parameter
@ -2718,7 +2714,7 @@ are feature points from cameras with same focal length and principal point.
inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
recover pose. In the output mask, only inliers which pass the cheirality check are marked.
This function differs from the one above that it computes camera matrix from focal length and
This function differs from the one above in that it computes the camera intrinsic matrix from the focal length and
principal point:
\f[A =
@ -2738,9 +2734,9 @@ CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray point
@param points1 Array of N 2D points from the first image. The point coordinates should be
floating-point (single or double precision).
@param points2 Array of the second image points of the same size and format as points1.
@param cameraMatrix Camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
Note that this function assumes that points1 and points2 are feature points from cameras with the
same camera matrix.
same camera intrinsic matrix.
@param R Output rotation matrix. Together with the translation vector, this matrix makes up a tuple
that performs a change of basis from the first camera's coordinate system to the second camera's
coordinate system. Note that, in general, t can not be used for this tuple, see the parameter
@ -3150,7 +3146,7 @@ Check @ref tutorial_homography "the corresponding tutorial" for more details.
/** @brief Decompose a homography matrix to rotation(s), translation(s) and plane normal(s).
@param H The input homography matrix between two images.
@param K The input intrinsic camera calibration matrix.
@param K The input camera intrinsic matrix.
@param rotations Array of rotation matrices.
@param translations Array of translation matrices.
@param normals Array of plane normal matrices.
@ -3611,8 +3607,8 @@ namespace fisheye
@param imagePoints Output array of image points, 2xN/Nx2 1-channel or 1xN/Nx1 2-channel, or
vector\<Point2f\>.
@param affine
@param K Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$.
@param D Input vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param alpha The skew coefficient.
@param jacobian Optional output 2Nx15 jacobian matrix of derivatives of image points with respect
to components of the focal lengths, coordinates of the principal point, distortion coefficients,
@ -3635,12 +3631,12 @@ namespace fisheye
@param undistorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is
the number of points in the view.
@param K Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$.
@param D Input vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param alpha The skew coefficient.
@param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
Note that the function assumes the camera matrix of the undistorted points to be identity.
Note that the function assumes the camera intrinsic matrix of the undistorted points to be identity.
This means if you want to transform back points undistorted with undistortPoints() you have to
multiply them with \f$P^{-1}\f$.
*/
@ -3650,11 +3646,11 @@ namespace fisheye
@param distorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is the
number of points in the view.
@param K Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$.
@param D Input vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3
1-channel or 1x1 3-channel
@param P New camera matrix (3x3) or new projection matrix (3x4)
@param P New camera intrinsic matrix (3x3) or new projection matrix (3x4)
@param undistorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
*/
CV_EXPORTS_W void undistortPoints(InputArray distorted, OutputArray undistorted,
@ -3663,11 +3659,11 @@ namespace fisheye
/** @brief Computes undistortion and rectification maps for image transform by cv::remap(). If D is empty zero
distortion is used; if R or P is empty, identity matrices are used.
@param K Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$.
@param D Input vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3
1-channel or 1x1 3-channel
@param P New camera matrix (3x3) or new projection matrix (3x4)
@param P New camera intrinsic matrix (3x3) or new projection matrix (3x4)
@param size Undistorted image size.
@param m1type Type of the first output map that can be CV_32FC1 or CV_16SC2 . See convertMaps()
for details.
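A hedged sketch of the map-building pipeline these fisheye functions are used for (K, D and the input frame are made up):

import numpy as np
import cv2 as cv

img = np.zeros((480, 640, 3), np.uint8)                 # stand-in for a fisheye frame
K = np.array([[300.0, 0, 320], [0, 300.0, 240], [0, 0, 1]])
D = np.array([0.05, 0.01, -0.01, 0.002])                # hypothetical (k1, k2, k3, k4)

size = (640, 480)
new_K = cv.fisheye.estimateNewCameraMatrixForUndistortRectify(K, D, size, np.eye(3), balance=0.5)
map1, map2 = cv.fisheye.initUndistortRectifyMap(K, D, np.eye(3), new_K, size, cv.CV_16SC2)
undistorted = cv.remap(img, map1, map2, cv.INTER_LINEAR)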
@ -3681,9 +3677,9 @@ namespace fisheye
@param distorted image with fisheye lens distortion.
@param undistorted Output image with compensated fisheye lens distortion.
@param K Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$.
@param D Input vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param Knew Camera matrix of the distorted image. By default, it is the identity matrix but you
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param Knew Camera intrinsic matrix of the distorted image. By default, it is the identity matrix but you
may additionally scale and shift the result by using a different matrix.
@param new_size the new size
@ -3708,14 +3704,14 @@ namespace fisheye
CV_EXPORTS_W void undistortImage(InputArray distorted, OutputArray undistorted,
InputArray K, InputArray D, InputArray Knew = cv::noArray(), const Size& new_size = Size());
/** @brief Estimates new camera matrix for undistortion or rectification.
/** @brief Estimates new camera intrinsic matrix for undistortion or rectification.
@param K Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{_1}\f$.
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param image_size Size of the image
@param D Input vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3
1-channel or 1x1 3-channel
@param P New camera matrix (3x3) or new projection matrix (3x4)
@param P New camera intrinsic matrix (3x3) or new projection matrix (3x4)
@param balance Sets the new focal length in range between the min focal length and the max focal
length. Balance is in range of [0, 1].
@param new_size the new size
@ -3731,12 +3727,12 @@ namespace fisheye
@param imagePoints vector of vectors of the projections of calibration pattern points.
imagePoints.size() and objectPoints.size() and imagePoints[i].size() must be equal to
objectPoints[i].size() for each i.
@param image_size Size of the image used only to initialize the intrinsic camera matrix.
@param K Output 3x3 floating-point camera matrix
\f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . If
@param image_size Size of the image used only to initialize the camera intrinsic matrix.
@param K Output 3x3 floating-point camera intrinsic matrix
\f$\cameramatrix{A}\f$ . If
fisheye::CALIB_USE_INTRINSIC_GUESS is specified, some or all of fx, fy, cx, cy must be
initialized before calling the function.
@param D Output vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$.
@param D Output vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view.
That is, each k-th rotation vector together with the corresponding k-th translation vector (see
the next output parameter description) brings the calibration pattern from the model coordinate
@ -3763,9 +3759,9 @@ optimization. It stays at the center or at a different location specified when C
/** @brief Stereo rectification for fisheye camera model
@param K1 First camera matrix.
@param K1 First camera intrinsic matrix.
@param D1 First camera distortion parameters.
@param K2 Second camera matrix.
@param K2 Second camera intrinsic matrix.
@param D2 Second camera distortion parameters.
@param imageSize Size of the image used for stereo calibration.
@param R Rotation matrix between the coordinate systems of the first and the second
@ -3802,15 +3798,15 @@ optimization. It stays at the center or at a different location specified when C
observed by the first camera.
@param imagePoints2 Vector of vectors of the projections of the calibration pattern points,
observed by the second camera.
@param K1 Input/output first camera matrix:
@param K1 Input/output first camera intrinsic matrix:
\f$\vecthreethree{f_x^{(j)}}{0}{c_x^{(j)}}{0}{f_y^{(j)}}{c_y^{(j)}}{0}{0}{1}\f$ , \f$j = 0,\, 1\f$ . If
any of fisheye::CALIB_USE_INTRINSIC_GUESS , fisheye::CALIB_FIX_INTRINSIC are specified,
some or all of the matrix components must be initialized.
@param D1 Input/output vector of distortion coefficients \f$(k_1, k_2, k_3, k_4)\f$ of 4 elements.
@param K2 Input/output second camera matrix. The parameter is similar to K1 .
@param D1 Input/output vector of distortion coefficients \f$\distcoeffsfisheye\f$ of 4 elements.
@param K2 Input/output second camera intrinsic matrix. The parameter is similar to K1 .
@param D2 Input/output lens distortion coefficients for the second camera. The parameter is
similar to D1 .
@param imageSize Size of the image used only to initialize intrinsic camera matrix.
@param imageSize Size of the image used only to initialize camera intrinsic matrix.
@param R Output rotation matrix between the 1st and the 2nd camera coordinate systems.
@param T Output translation vector between the coordinate systems of the cameras.
@param flags Different flags that may be zero or a combination of the following values:

View File

@ -29,6 +29,7 @@ static Mat homogeneousInverse(const Mat& T)
// q = sin(theta/2) * v
// theta - rotation angle
// v - unit rotation axis, |v| = 1
// Reference: http://www.euclideanspace.com/maths/geometry/rotations/conversions/matrixToQuaternion/
static Mat rot2quatMinimal(const Mat& R)
{
CV_Assert(R.type() == CV_64FC1 && R.rows >= 3 && R.cols >= 3);
@ -44,7 +45,7 @@ static Mat rot2quatMinimal(const Mat& R)
qx = (m21 - m12) / S;
qy = (m02 - m20) / S;
qz = (m10 - m01) / S;
} else if ((m00 > m11)&(m00 > m22)) {
} else if (m00 > m11 && m00 > m22) {
double S = sqrt(1.0 + m00 - m11 - m22) * 2; // S=4*qx
qx = 0.25 * S;
qy = (m01 + m10) / S;
@ -98,6 +99,7 @@ static Mat quatMinimal2rot(const Mat& q)
//
// q - 4x1 unit quaternion <qw, qx, qy, qz>
// R - 3x3 rotation matrix
// Reference: http://www.euclideanspace.com/maths/geometry/rotations/conversions/matrixToQuaternion/
static Mat rot2quat(const Mat& R)
{
CV_Assert(R.type() == CV_64FC1 && R.rows >= 3 && R.cols >= 3);
@ -114,7 +116,7 @@ static Mat rot2quat(const Mat& R)
qx = (m21 - m12) / S;
qy = (m02 - m20) / S;
qz = (m10 - m01) / S;
} else if ((m00 > m11)&(m00 > m22)) {
} else if (m00 > m11 && m00 > m22) {
double S = sqrt(1.0 + m00 - m11 - m22) * 2; // S=4*qx
qw = (m21 - m12) / S;
qx = 0.25 * S;
@ -572,7 +574,11 @@ static void calibrateHandEyeAndreff(const std::vector<Mat>& Hg, const std::vecto
R = R.reshape(1, 2, newSize);
//Eq 15
double det = determinant(R);
R = pow(sign_double(det) / abs(det), 1.0/3.0) * R;
if (std::fabs(det) < FLT_EPSILON)
{
CV_Error(Error::StsNoConv, "calibrateHandEye() with CALIB_HAND_EYE_ANDREFF method: determinant(R) is null");
}
R = cubeRoot(static_cast<float>(sign_double(det) / abs(det))) * R;
Mat w, u, vt;
SVDecomp(R, w, u, vt);
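The matrix-to-quaternion branch added above follows the referenced euclideanspace.com formulas; a small Python cross-check of the trace > 0 branch against cv::Rodrigues (illustrative only):

import numpy as np
import cv2 as cv

rvec = np.array([0.3, -0.2, 0.1])          # axis-angle; |rvec| is the rotation angle
R, _ = cv.Rodrigues(rvec)

tr = np.trace(R)
assert tr > 0                              # only the first branch is sketched here
S = np.sqrt(tr + 1.0) * 2                  # S = 4 * qw
qw = 0.25 * S
qx = (R[2, 1] - R[1, 2]) / S
qy = (R[0, 2] - R[2, 0]) / S
qz = (R[1, 0] - R[0, 1]) / S

theta = np.linalg.norm(rvec)
assert np.isclose(qw, np.cos(theta / 2))
assert np.allclose([qx, qy, qz], np.sin(theta / 2) * rvec / theta)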

View File

@ -51,6 +51,8 @@
#include "usac.hpp"
#include <opencv2/core/utils/logger.hpp>
namespace cv
{
#if defined _DEBUG || defined CV_STATIC_ANALYSIS
@ -809,6 +811,15 @@ int solvePnPGeneric( InputArray _opoints, InputArray _ipoints,
vector<Mat> vec_rvecs, vec_tvecs;
if (flags == SOLVEPNP_EPNP || flags == SOLVEPNP_DLS || flags == SOLVEPNP_UPNP)
{
if (flags == SOLVEPNP_DLS)
{
CV_LOG_DEBUG(NULL, "Broken implementation for SOLVEPNP_DLS. Fallback to EPnP.");
}
else if (flags == SOLVEPNP_UPNP)
{
CV_LOG_DEBUG(NULL, "Broken implementation for SOLVEPNP_UPNP. Fallback to EPnP.");
}
Mat undistortedPoints;
undistortPoints(ipoints, undistortedPoints, cameraMatrix, distCoeffs);
epnp PnP(cameraMatrix, opoints, undistortedPoints);

View File

@ -7,6 +7,38 @@
namespace opencv_test { namespace {
static std::string getMethodName(HandEyeCalibrationMethod method)
{
std::string method_name = "";
switch (method)
{
case CALIB_HAND_EYE_TSAI:
method_name = "Tsai";
break;
case CALIB_HAND_EYE_PARK:
method_name = "Park";
break;
case CALIB_HAND_EYE_HORAUD:
method_name = "Horaud";
break;
case CALIB_HAND_EYE_ANDREFF:
method_name = "Andreff";
break;
case CALIB_HAND_EYE_DANIILIDIS:
method_name = "Daniilidis";
break;
default:
break;
}
return method_name;
}
class CV_CalibrateHandEyeTest : public cvtest::BaseTest
{
public:
@ -48,7 +80,6 @@ protected:
std::vector<Mat> &R_target2cam, std::vector<Mat> &t_target2cam,
bool noise, Mat& R_cam2gripper, Mat& t_cam2gripper);
Mat homogeneousInverse(const Mat& T);
std::string getMethodName(HandEyeCalibrationMethod method);
double sign_double(double val);
double eps_rvec[5];
@ -340,38 +371,6 @@ Mat CV_CalibrateHandEyeTest::homogeneousInverse(const Mat& T)
return Tinv;
}
std::string CV_CalibrateHandEyeTest::getMethodName(HandEyeCalibrationMethod method)
{
std::string method_name = "";
switch (method)
{
case CALIB_HAND_EYE_TSAI:
method_name = "Tsai";
break;
case CALIB_HAND_EYE_PARK:
method_name = "Park";
break;
case CALIB_HAND_EYE_HORAUD:
method_name = "Horaud";
break;
case CALIB_HAND_EYE_ANDREFF:
method_name = "Andreff";
break;
case CALIB_HAND_EYE_DANIILIDIS:
method_name = "Daniilidis";
break;
default:
break;
}
return method_name;
}
double CV_CalibrateHandEyeTest::sign_double(double val)
{
return (0 < val) - (val < 0);
@ -381,4 +380,86 @@ double CV_CalibrateHandEyeTest::sign_double(double val)
TEST(Calib3d_CalibrateHandEye, regression) { CV_CalibrateHandEyeTest test; test.safe_run(); }
TEST(Calib3d_CalibrateHandEye, regression_17986)
{
const std::string camera_poses_filename = findDataFile("cv/hand_eye_calibration/cali.txt");
const std::string end_effector_poses = findDataFile("cv/hand_eye_calibration/robot_cali.txt");
std::vector<Mat> R_target2cam;
std::vector<Mat> t_target2cam;
// Parse camera poses
{
std::ifstream file(camera_poses_filename.c_str());
ASSERT_TRUE(file.is_open());
int ndata = 0;
file >> ndata;
R_target2cam.reserve(ndata);
t_target2cam.reserve(ndata);
std::string image_name;
Matx33d cameraMatrix;
Matx33d R;
Matx31d t;
Matx16d distCoeffs;
Matx13d distCoeffs2;
while (file >> image_name >>
cameraMatrix(0,0) >> cameraMatrix(0,1) >> cameraMatrix(0,2) >>
cameraMatrix(1,0) >> cameraMatrix(1,1) >> cameraMatrix(1,2) >>
cameraMatrix(2,0) >> cameraMatrix(2,1) >> cameraMatrix(2,2) >>
R(0,0) >> R(0,1) >> R(0,2) >>
R(1,0) >> R(1,1) >> R(1,2) >>
R(2,0) >> R(2,1) >> R(2,2) >>
t(0) >> t(1) >> t(2) >>
distCoeffs(0) >> distCoeffs(1) >> distCoeffs(2) >> distCoeffs(3) >> distCoeffs(4) >>
distCoeffs2(0) >> distCoeffs2(1) >> distCoeffs2(2)) {
R_target2cam.push_back(Mat(R));
t_target2cam.push_back(Mat(t));
}
}
std::vector<Mat> R_gripper2base;
std::vector<Mat> t_gripper2base;
// Parse end-effector poses
{
std::ifstream file(end_effector_poses.c_str());
ASSERT_TRUE(file.is_open());
int ndata = 0;
file >> ndata;
R_gripper2base.reserve(ndata);
t_gripper2base.reserve(ndata);
Matx33d R;
Matx31d t;
Matx14d last_row;
while (file >>
R(0,0) >> R(0,1) >> R(0,2) >> t(0) >>
R(1,0) >> R(1,1) >> R(1,2) >> t(1) >>
R(2,0) >> R(2,1) >> R(2,2) >> t(2) >>
last_row(0) >> last_row(1) >> last_row(2) >> last_row(3)) {
R_gripper2base.push_back(Mat(R));
t_gripper2base.push_back(Mat(t));
}
}
std::vector<HandEyeCalibrationMethod> methods;
methods.push_back(CALIB_HAND_EYE_TSAI);
methods.push_back(CALIB_HAND_EYE_PARK);
methods.push_back(CALIB_HAND_EYE_HORAUD);
methods.push_back(CALIB_HAND_EYE_ANDREFF);
methods.push_back(CALIB_HAND_EYE_DANIILIDIS);
for (size_t idx = 0; idx < methods.size(); idx++) {
SCOPED_TRACE(cv::format("method=%s", getMethodName(methods[idx]).c_str()));
Matx33d R_cam2gripper_est;
Matx31d t_cam2gripper_est;
calibrateHandEye(R_gripper2base, t_gripper2base, R_target2cam, t_target2cam, R_cam2gripper_est, t_cam2gripper_est, methods[idx]);
EXPECT_TRUE(checkRange(R_cam2gripper_est));
EXPECT_TRUE(checkRange(t_cam2gripper_est));
}
}
}} // namespace

View File

@ -16,6 +16,7 @@ Implementation of Scale layer.
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
@ -389,7 +390,7 @@ public:
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 1, blobs.size() == 3);
CV_Assert_N(blobs[0].total() == 1, blobs[1].total() == total(inputs[0], 1),
CV_Assert_N(blobs[0].total() == 1,
blobs[2].total() == inputs[0][1]);
outputs.assign(1, inputs[0]);
@ -412,15 +413,20 @@ public:
float* outData = outputs[0].ptr<float>();
Mat data_mean_cpu = blobs[1].clone();
Mat mean_resize = Mat(inputs[0].size[3], inputs[0].size[2], CV_32FC3);
Mat mean_3d = Mat(data_mean_cpu.size[3], data_mean_cpu.size[2], CV_32FC3, data_mean_cpu.ptr<float>(0));
resize(mean_3d, mean_resize, Size(inputs[0].size[3], inputs[0].size[2]));
int new_size[] = {1, mean_resize.channels(), mean_resize.cols, mean_resize.rows};
Mat data_mean_cpu_resize = mean_resize.reshape(1, *new_size);
Mat data_mean_per_channel_cpu = blobs[2].clone();
const int numWeights = data_mean_cpu.total();
const int numWeights = data_mean_cpu_resize.total();
CV_Assert(numWeights != 0);
++num_iter;
if (num_iter <= recompute_mean)
{
data_mean_cpu *= (num_iter - 1);
data_mean_cpu_resize *= (num_iter - 1);
const int batch = inputs[0].size[0];
float alpha = 1.0 / batch;
@ -429,15 +435,15 @@ public:
Mat inpSlice(1, numWeights, CV_32F, inpData);
inpSlice = alpha * inpSlice;
add(data_mean_cpu.reshape(1, 1), inpSlice, data_mean_cpu.reshape(1, 1));
add(data_mean_cpu_resize.reshape(1, 1), inpSlice, data_mean_cpu_resize.reshape(1, 1));
inpData += numWeights;
}
data_mean_cpu *= (1.0 / num_iter);
data_mean_cpu_resize *= (1.0 / num_iter);
int newsize[] = {blobs[1].size[1], (int)blobs[1].total(2)};
reduce(data_mean_cpu.reshape(1, 2, &newsize[0]), data_mean_per_channel_cpu, 1, REDUCE_SUM, CV_32F);
int newsize[] = {inputs[0].size[1], (int)inputs[0].total(2)};
reduce(data_mean_cpu_resize.reshape(1, 2, &newsize[0]), data_mean_per_channel_cpu, 1, REDUCE_SUM, CV_32F);
int area = blobs[1].total(2);
int area = inputs[0].total(2);
data_mean_per_channel_cpu *= (1.0 / area);
}
@ -452,7 +458,7 @@ public:
Mat inpSlice(1, numWeights, CV_32F, inpData);
Mat outSlice(1, numWeights, CV_32F, outData);
add(inpSlice, (-1) * data_mean_cpu, outSlice);
add(inpSlice, (-1) * data_mean_cpu_resize, outSlice);
inpData += numWeights;
outData += numWeights;
}
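In NumPy terms, the mean handling that the diff above switches to is roughly the following (shapes and variable names are illustrative, not the layer's actual buffers):

import numpy as np
import cv2 as cv

inp = np.random.rand(2, 3, 8, 8).astype(np.float32)         # N x C x H x W input batch
data_mean = np.random.rand(1, 3, 4, 4).astype(np.float32)   # stored mean with its own H x W

# Resize the stored mean to the input's spatial size (the layer uses cv::resize on an HWC view).
mean_hwc = data_mean[0].transpose(1, 2, 0)                   # C,H,W -> H,W,C
mean_resized = cv.resize(mean_hwc, (inp.shape[3], inp.shape[2])).transpose(2, 0, 1)

# Running-mean update, as in the num_iter <= recompute_mean branch.
num_iter = 1
mean_resized = (mean_resized * (num_iter - 1) + inp.mean(axis=0)) / num_iter

# Per-channel mean: reduce over the spatial dimensions and divide by their area.
mean_per_channel = mean_resized.reshape(inp.shape[1], -1).sum(axis=1) / (inp.shape[2] * inp.shape[3])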

View File

@ -712,6 +712,19 @@ void ONNXImporter::populateNet(Net dstNet)
layerParams.set("bias_term", true);
}
}
else if (layer_type == "Pow")
{
if (layer_id.find(node_proto.input(1)) != layer_id.end())
CV_Error(Error::StsNotImplemented, "Unsupported Pow op with variable power");
Mat blob = getBlob(node_proto, constBlobs, 1);
if (blob.total() != 1)
CV_Error(Error::StsNotImplemented, "Pow op supports only scalar power");
blob.convertTo(blob, CV_32F);
layerParams.type = "Power";
layerParams.set("power", blob.at<float>(0));
}
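A hedged end-to-end sketch of the new Pow path (requires the `onnx` Python package; the scalar exponent is stored as an initializer, which is the constant form this handler expects, and the file name and shapes are made up):

import numpy as np
import onnx
from onnx import helper, TensorProto
import cv2 as cv

# Y = X ** 2 with the exponent as a constant scalar initializer.
exponent = helper.make_tensor("exponent", TensorProto.FLOAT, dims=[], vals=[2.0])
node = helper.make_node("Pow", inputs=["X", "exponent"], outputs=["Y"])
graph = helper.make_graph(
    [node], "pow2",
    [helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 3, 4, 4])],
    [helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 3, 4, 4])],
    initializer=[exponent])
onnx.save(helper.make_model(graph), "pow2.onnx")

net = cv.dnn.readNetFromONNX("pow2.onnx")                   # imported as a Power layer
x = np.random.rand(1, 3, 4, 4).astype(np.float32)
net.setInput(x)
assert np.allclose(net.forward(), x ** 2, atol=1e-5)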
else if (layer_type == "Max")
{
layerParams.type = "Eltwise";
@ -933,6 +946,19 @@ void ONNXImporter::populateNet(Net dstNet)
Mat bias = getBlob(node_proto, constBlobs, 2);
layerParams.blobs.push_back(bias);
}
if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
{
Mat inputBuf = getBlob(node_proto, constBlobs, 0);
LayerParams constParams;
constParams.name = node_proto.input(0);
constParams.type = "Const";
constParams.blobs.push_back(inputBuf);
opencv_onnx::NodeProto proto;
proto.add_output(constParams.name);
addLayer(dstNet, constParams, proto, layer_id, outShapes);
}
layerParams.set("num_output", layerParams.blobs[0].size[ind_num_out]);
layerParams.set("bias_term", node_proto.input_size() == 3);

View File

@ -657,6 +657,8 @@ TEST_P(Test_Caffe_layers, DataAugmentation)
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
testLayerUsingCaffeModels("data_augmentation", true, false);
testLayerUsingCaffeModels("data_augmentation_2x1", true, false);
testLayerUsingCaffeModels("data_augmentation_8x6", true, false);
}
TEST_P(Test_Caffe_layers, Resample)

View File

@ -301,6 +301,11 @@ TEST_P(Test_ONNX_layers, Cast)
testONNXModels("cast");
}
TEST_P(Test_ONNX_layers, Power)
{
testONNXModels("pow2", npy, 0, 0, false, false);
}
TEST_P(Test_ONNX_layers, Concatenation)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
@ -639,6 +644,26 @@ TEST_P(Test_ONNX_layers, Pad2d_Unfused)
testONNXModels("ZeroPad2d");
}
TEST_P(Test_ONNX_layers, LinearWithConstant)
{
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2020040000)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE);
#endif
testONNXModels("lin_with_constant");
}
TEST_P(Test_ONNX_layers, MatmulWithTwoInputs)
{
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2020040000)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE);
#endif
testONNXModels("matmul_with_two_inputs");
}
INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
class Test_ONNX_nets : public Test_ONNX_layers

View File

@ -5,7 +5,7 @@ Original paper: https://arxiv.org/abs/1612.01925.
Original repo: https://github.com/lmb-freiburg/flownet2.
Download the converted .caffemodel model from https://drive.google.com/open?id=16qvE9VNmU39NttpZwZs81Ga8VYQJDaWZ
and .prototxt from https://drive.google.com/open?id=19bo6SWU2p8ZKvjXqMKiCPdK8mghwDy9b.
and .prototxt from https://drive.google.com/file/d/1RyNIUsan1ZOh2hpYIH36A-jofAvJlT6a/view?usp=sharing.
Otherwise download original model from https://lmb.informatik.uni-freiburg.de/resources/binaries/flownet2/flownet2-models.tar.gz,
convert .h5 model to .caffemodel and modify original .prototxt using .prototxt from link above.
'''
@ -18,7 +18,7 @@ import cv2 as cv
class OpticalFlow(object):
def __init__(self, proto, model, height, width):
self.net = cv.dnn.readNet(proto, model)
self.net = cv.dnn.readNetFromCaffe(proto, model)
self.net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
self.height = height
self.width = width
@ -62,9 +62,9 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Use this script to calculate optical flow using FlowNetv2',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-input', '-i', required=True, help='Path to input video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--height', default=320, help='Input height')
parser.add_argument('--width', default=448, help='Input width')
parser.add_argument('--proto', '-p', default='FlowNet2_deploy.prototxt', help='Path to prototxt.')
parser.add_argument('--height', default=320, type=int, help='Input height')
parser.add_argument('--width', default=448, type=int, help='Input width')
parser.add_argument('--proto', '-p', default='FlowNet2_deploy_anysize.prototxt', help='Path to prototxt.')
parser.add_argument('--model', '-m', default='FlowNet2_weights.caffemodel', help='Path to caffemodel.')
args, _ = parser.parse_known_args()
@ -75,7 +75,25 @@ if __name__ == '__main__':
cv.namedWindow(winName, cv.WINDOW_NORMAL)
cap = cv.VideoCapture(args.input if args.input else 0)
hasFrame, first_frame = cap.read()
opt_flow = OpticalFlow(args.proto, args.model, args.height, args.width)
divisor = 64.
var = {}
var['ADAPTED_WIDTH'] = int(np.ceil(args.width/divisor) * divisor)
var['ADAPTED_HEIGHT'] = int(np.ceil(args.height/divisor) * divisor)
var['SCALE_WIDTH'] = args.width / float(var['ADAPTED_WIDTH'])
var['SCALE_HEIGHT'] = args.height / float(var['ADAPTED_HEIGHT'])
config = ''
proto = open(args.proto).readlines()
for line in proto:
for key, value in var.items():
tag = "$%s$" % key
line = line.replace(tag, str(value))
config += line
caffemodel = open(args.model, 'rb').read()
opt_flow = OpticalFlow(bytearray(config.encode()), caffemodel, var['ADAPTED_HEIGHT'], var['ADAPTED_WIDTH'])
while cv.waitKey(1) < 0:
hasFrame, second_frame = cap.read()
if not hasFrame:

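For reference, the placeholders substituted above ($ADAPTED_WIDTH$, $ADAPTED_HEIGHT$, $SCALE_WIDTH$, $SCALE_HEIGHT$) round the requested size up to the nearest multiple of 64, which FlowNet2 requires, and record the factor for scaling the flow back. A quick worked example of that arithmetic (standalone; the values only illustrate the script's defaults):

import numpy as np

divisor = 64.
width, height = 448, 320                                   # the script's default input size
adapted_width = int(np.ceil(width / divisor) * divisor)    # 448, already a multiple of 64
adapted_height = int(np.ceil(height / divisor) * divisor)  # 320
scale_width = width / float(adapted_width)                 # 1.0
# a non-multiple such as width=500 would give adapted_width=512 and scale_width=500/512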
397 samples/dnn/siamrpnpp.py Normal file
View File

@ -0,0 +1,397 @@
import argparse
import cv2 as cv
import numpy as np
import os
"""
Link to original paper: https://arxiv.org/abs/1812.11703
Link to original repo: https://github.com/STVIR/pysot
You can download the pre-trained weights of the Tracker Model from https://drive.google.com/file/d/11bwgPFVkps9AH2NOD1zBDdpF_tQghAB-/view?usp=sharing
You can download the target net (target branch of SiamRPN++) from https://drive.google.com/file/d/1dw_Ne3UMcCnFsaD6xkZepwE4GEpqq7U_/view?usp=sharing
You can download the search net (search branch of SiamRPN++) from https://drive.google.com/file/d/1Lt4oE43ZSucJvze3Y-Z87CVDreO-Afwl/view?usp=sharing
You can download the head model (RPN Head) from https://drive.google.com/file/d/1zT1yu12mtj3JQEkkfKFJWiZ71fJ-dQTi/view?usp=sharing
"""
class ModelBuilder():
""" This class generates the SiamRPN++ Tracker Model by using Imported ONNX Nets
"""
def __init__(self, target_net, search_net, rpn_head):
super(ModelBuilder, self).__init__()
# Build the target branch
self.target_net = target_net
# Build the search branch
self.search_net = search_net
# Build RPN_Head
self.rpn_head = rpn_head
def template(self, z):
""" Takes the template of size (1, 1, 127, 127) as an input to generate kernel
"""
self.target_net.setInput(z)
outNames = self.target_net.getUnconnectedOutLayersNames()
self.zfs_1, self.zfs_2, self.zfs_3 = self.target_net.forward(outNames)
def track(self, x):
""" Takes the search of size (1, 1, 255, 255) as an input to generate classification score and bounding box regression
"""
self.search_net.setInput(x)
outNames = self.search_net.getUnconnectedOutLayersNames()
xfs_1, xfs_2, xfs_3 = self.search_net.forward(outNames)
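# stack the three feature maps from each branch so the RPN head receives a single blob per input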
self.rpn_head.setInput(np.stack([self.zfs_1, self.zfs_2, self.zfs_3]), 'input_1')
self.rpn_head.setInput(np.stack([xfs_1, xfs_2, xfs_3]), 'input_2')
outNames = self.rpn_head.getUnconnectedOutLayersNames()
cls, loc = self.rpn_head.forward(outNames)
return {'cls': cls, 'loc': loc}
class Anchors:
""" This class generate anchors.
"""
def __init__(self, stride, ratios, scales, image_center=0, size=0):
self.stride = stride
self.ratios = ratios
self.scales = scales
self.image_center = image_center
self.size = size
self.anchor_num = len(self.scales) * len(self.ratios)
self.anchors = self.generate_anchors()
def generate_anchors(self):
"""
generate anchors based on predefined configuration
"""
anchors = np.zeros((self.anchor_num, 4), dtype=np.float32)
size = self.stride**2
count = 0
for r in self.ratios:
ws = int(np.sqrt(size * 1. / r))
hs = int(ws * r)
for s in self.scales:
w = ws * s
h = hs * s
anchors[count][:] = [-w * 0.5, -h * 0.5, w * 0.5, h * 0.5][:]
count += 1
return anchors
class SiamRPNTracker:
def __init__(self, model):
super(SiamRPNTracker, self).__init__()
self.anchor_stride = 8
self.anchor_ratios = [0.33, 0.5, 1, 2, 3]
self.anchor_scales = [8]
self.track_base_size = 8
self.track_context_amount = 0.5
self.track_exemplar_size = 127
self.track_instance_size = 255
self.track_lr = 0.4
self.track_penalty_k = 0.04
self.track_window_influence = 0.44
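# side of the RPN score map: (instance_size - exemplar_size) // stride + 1 + base_size = (255 - 127) // 8 + 1 + 8 = 25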
self.score_size = (self.track_instance_size - self.track_exemplar_size) // \
self.anchor_stride + 1 + self.track_base_size
self.anchor_num = len(self.anchor_ratios) * len(self.anchor_scales)
hanning = np.hanning(self.score_size)
window = np.outer(hanning, hanning)
self.window = np.tile(window.flatten(), self.anchor_num)
self.anchors = self.generate_anchor(self.score_size)
self.model = model
def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
"""
Args:
im: bgr based input image frame
pos: position of the center of the frame
model_sz: exemplar / target image size
original_sz: original / search image size
avg_chans: channel average
Return:
im_patch: sub_windows for the given image input
"""
if isinstance(pos, float):
pos = [pos, pos]
sz = original_sz
im_h, im_w, im_d = im.shape
c = (original_sz + 1) / 2
cx, cy = pos
context_xmin = np.floor(cx - c + 0.5)
context_xmax = context_xmin + sz - 1
context_ymin = np.floor(cy - c + 0.5)
context_ymax = context_ymin + sz - 1
left_pad = int(max(0., -context_xmin))
top_pad = int(max(0., -context_ymin))
right_pad = int(max(0., context_xmax - im_w + 1))
bottom_pad = int(max(0., context_ymax - im_h + 1))
context_xmin += left_pad
context_xmax += left_pad
context_ymin += top_pad
context_ymax += top_pad
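# if the crop extends beyond the frame, paste the image into a larger canvas padded with the channel average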
if any([top_pad, bottom_pad, left_pad, right_pad]):
size = (im_h + top_pad + bottom_pad, im_w + left_pad + right_pad, im_d)
te_im = np.zeros(size, np.uint8)
te_im[top_pad:top_pad + im_h, left_pad:left_pad + im_w, :] = im
if top_pad:
te_im[0:top_pad, left_pad:left_pad + im_w, :] = avg_chans
if bottom_pad:
te_im[im_h + top_pad:, left_pad:left_pad + im_w, :] = avg_chans
if left_pad:
te_im[:, 0:left_pad, :] = avg_chans
if right_pad:
te_im[:, im_w + left_pad:, :] = avg_chans
im_patch = te_im[int(context_ymin):int(context_ymax + 1),
int(context_xmin):int(context_xmax + 1), :]
else:
im_patch = im[int(context_ymin):int(context_ymax + 1),
int(context_xmin):int(context_xmax + 1), :]
if not np.array_equal(model_sz, original_sz):
im_patch = cv.resize(im_patch, (model_sz, model_sz))
im_patch = im_patch.transpose(2, 0, 1)
im_patch = im_patch[np.newaxis, :, :, :]
im_patch = im_patch.astype(np.float32)
return im_patch
def generate_anchor(self, score_size):
"""
Args:
score_size: number of anchor positions per side of the score map
Return:
anchor: anchors for pre-determined values of stride, ratio, and scale
"""
anchors = Anchors(self.anchor_stride, self.anchor_ratios, self.anchor_scales)
anchor = anchors.anchors
x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
anchor = np.stack([(x1 + x2) * 0.5, (y1 + y2) * 0.5, x2 - x1, y2 - y1], 1)
total_stride = anchors.stride
anchor_num = anchors.anchor_num
anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
ori = - (score_size // 2) * total_stride
xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
[ori + total_stride * dy for dy in range(score_size)])
xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
np.tile(yy.flatten(), (anchor_num, 1)).flatten()
anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
return anchor
def _convert_bbox(self, delta, anchor):
"""
Args:
delta: raw localisation regression output of the RPN head
anchor: anchors of pre-determined stride, ratio, and scale
Return:
delta: decoded bounding boxes as (cx, cy, w, h) for every anchor
"""
delta_transpose = np.transpose(delta, (1, 2, 3, 0))
delta_contig = np.ascontiguousarray(delta_transpose)
delta = delta_contig.reshape(4, -1)
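# decode the (dx, dy, dw, dh) regression offsets against the anchors into absolute (cx, cy, w, h)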
delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
return delta
def _softmax(self, x):
"""
Softmax in the direction of the depth of the layer
"""
x = x.astype(dtype=np.float32)
x_max = x.max(axis=1)[:, np.newaxis]
e_x = np.exp(x-x_max)
div = np.sum(e_x, axis=1)[:, np.newaxis]
y = e_x / div
return y
def _convert_score(self, score):
"""
Args:
score: raw classification scores from the RPN head
Return:
score: softmax probability of the foreground class for each anchor
"""
score_transpose = np.transpose(score, (1, 2, 3, 0))
score_con = np.ascontiguousarray(score_transpose)
score_view = score_con.reshape(2, -1)
score = np.transpose(score_view, (1, 0))
score = self._softmax(score)
return score[:,1]
def _bbox_clip(self, cx, cy, width, height, boundary):
"""
Clip the bounding box center to the image and clamp its width and height to the frame (minimum 10 px)
"""
bbox_h, bbox_w = boundary
cx = max(0, min(cx, bbox_w))
cy = max(0, min(cy, bbox_h))
width = max(10, min(width, bbox_w))
height = max(10, min(height, bbox_h))
return cx, cy, width, height
def init(self, img, bbox):
"""
Args:
img(np.ndarray): bgr based input image frame
bbox: (x,y,w,h): bounding box
"""
x,y,h,w = bbox
self.center_pos = np.array([x + (h - 1) / 2, y + (w - 1) / 2])
self.h = h
self.w = w
w_z = self.w + self.track_context_amount * np.add(h, w)
h_z = self.h + self.track_context_amount * np.add(h, w)
s_z = round(np.sqrt(w_z * h_z))
self.channel_average = np.mean(img, axis=(0, 1))
z_crop = self.get_subwindow(img, self.center_pos, self.track_exemplar_size, s_z, self.channel_average)
self.model.template(z_crop)
def track(self, img):
"""
Args:
img(np.ndarray): BGR image
Return:
bbox(list):[x, y, width, height]
"""
w_z = self.w + self.track_context_amount * np.add(self.w, self.h)
h_z = self.h + self.track_context_amount * np.add(self.w, self.h)
s_z = np.sqrt(w_z * h_z)
scale_z = self.track_exemplar_size / s_z
s_x = s_z * (self.track_instance_size / self.track_exemplar_size)
x_crop = self.get_subwindow(img, self.center_pos, self.track_instance_size, round(s_x), self.channel_average)
outputs = self.model.track(x_crop)
score = self._convert_score(outputs['cls'])
pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)
def change(r):
return np.maximum(r, 1. / r)
def sz(w, h):
pad = (w + h) * 0.5
return np.sqrt((w + pad) * (h + pad))
# scale penalty
s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
(sz(self.w * scale_z, self.h * scale_z)))
# aspect ratio penalty
r_c = change((self.w / self.h) /
(pred_bbox[2, :] / pred_bbox[3, :]))
penalty = np.exp(-(r_c * s_c - 1) * self.track_penalty_k)
pscore = penalty * score
# window penalty
pscore = pscore * (1 - self.track_window_influence) + \
self.window * self.track_window_influence
best_idx = np.argmax(pscore)
bbox = pred_bbox[:, best_idx] / scale_z
lr = penalty[best_idx] * score[best_idx] * self.track_lr
cpx, cpy = self.center_pos
x,y,w,h = bbox
cx = x + cpx
cy = y + cpy
# smooth bbox
width = self.w * (1 - lr) + w * lr
height = self.h * (1 - lr) + h * lr
# clip boundary
cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2])
# update state
self.center_pos = np.array([cx, cy])
self.w = width
self.h = height
bbox = [cx - width / 2, cy - height / 2, width, height]
best_score = score[best_idx]
return {'bbox': bbox, 'best_score': best_score}
def get_frames(video_name):
"""
Args:
video_name: path to the input video, or an empty value to capture frames from a camera
Return:
frame: frames read from the video / camera, yielded one at a time
"""
cap = cv.VideoCapture(video_name if video_name else 0)
while True:
ret, frame = cap.read()
if ret:
yield frame
else:
break
def main():
""" Sample SiamRPN Tracker
"""
# Computation backends supported by layers
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
# Target Devices for computation
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)
parser = argparse.ArgumentParser(description='Use this script to run SiamRPN++ Visual Tracker',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input_video', type=str, help='Path to input video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--target_net', type=str, default='target_net.onnx', help='Path to part of SiamRPN++ ran on target frame.')
parser.add_argument('--search_net', type=str, default='search_net.onnx', help='Path to part of SiamRPN++ ran on search frame.')
parser.add_argument('--rpn_head', type=str, default='rpn_head.onnx', help='Path to RPN Head ONNX model.')
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
help='Select a computation backend: '
"%d: automatically (by default) "
"%d: Halide"
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit)"
"%d: OpenCV Implementation" % backends)
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
help='Select a target device: '
"%d: CPU target (by default)"
"%d: OpenCL"
"%d: OpenCL FP16"
"%d: Myriad" % targets)
args, _ = parser.parse_known_args()
if args.input_video and not os.path.isfile(args.input_video):
raise OSError("Input video file does not exist")
if not os.path.isfile(args.target_net):
raise OSError("Target Net does not exist")
if not os.path.isfile(args.search_net):
raise OSError("Search Net does not exist")
if not os.path.isfile(args.rpn_head):
raise OSError("RPN Head Net does not exist")
#Load the Networks
target_net = cv.dnn.readNetFromONNX(args.target_net)
target_net.setPreferableBackend(args.backend)
target_net.setPreferableTarget(args.target)
search_net = cv.dnn.readNetFromONNX(args.search_net)
search_net.setPreferableBackend(args.backend)
search_net.setPreferableTarget(args.target)
rpn_head = cv.dnn.readNetFromONNX(args.rpn_head)
rpn_head.setPreferableBackend(args.backend)
rpn_head.setPreferableTarget(args.target)
model = ModelBuilder(target_net, search_net, rpn_head)
tracker = SiamRPNTracker(model)
first_frame = True
cv.namedWindow('SiamRPN++ Tracker', cv.WINDOW_AUTOSIZE)
for frame in get_frames(args.input_video):
if first_frame:
try:
init_rect = cv.selectROI('SiamRPN++ Tracker', frame, False, False)
except:
exit()
tracker.init(frame, init_rect)
first_frame = False
else:
outputs = tracker.track(frame)
bbox = list(map(int, outputs['bbox']))
x,y,w,h = bbox
cv.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 3)
cv.imshow('SiamRPN++ Tracker', frame)
key = cv.waitKey(1)
if key == ord("q"):
break
if __name__ == '__main__':
main()