diff --git a/doc/opencv.bib b/doc/opencv.bib index e861e5b756..0135418d4a 100644 --- a/doc/opencv.bib +++ b/doc/opencv.bib @@ -102,6 +102,14 @@ publisher = {Elsevier}, url = {https://www.cs.bgu.ac.il/~icbv161/wiki.files/Readings/1981-Ballard-Generalizing_the_Hough_Transform_to_Detect_Arbitrary_Shapes.pdf} } +@techreport{blanco2010tutorial, + title = {A tutorial on SE(3) transformation parameterizations and on-manifold optimization}, + author = {Blanco, Jose-Luis}, + institution = {University of Malaga}, + number = {012010}, + year = {2010}, + url = {http://ingmec.ual.es/~jlblanco/papers/jlblanco2010geometry3D_techrep.pdf} +} @article{Borgefors86, author = {Borgefors, Gunilla}, title = {Distance transformations in digital images}, @@ -277,6 +285,12 @@ year = {2013}, url = {http://ethaneade.com/optimization.pdf} } +@misc{Eade17, + author = {Eade, Ethan}, + title = {Lie Groups for 2D and 3D Transformation}, + year = {2017}, + url = {http://www.ethaneade.com/lie.pdf} +} @inproceedings{EM11, author = {Gastal, Eduardo SL and Oliveira, Manuel M}, title = {Domain transform for edge-aware image and video processing}, @@ -397,6 +411,14 @@ year = {1999}, url = {https://pdfs.semanticscholar.org/090d/25f94cb021bdd3400a2f547f989a6a5e07ec.pdf} } +@article{Gallego2014ACF, + title = {A Compact Formula for the Derivative of a 3-D Rotation in Exponential Coordinates}, + author = {Guillermo Gallego and Anthony J. Yezzi}, + journal = {Journal of Mathematical Imaging and Vision}, + year = {2014}, + volume = {51}, + pages = {378-384} +} @article{taubin1991, abstract = {The author addresses the problem of parametric representation and estimation of complex planar curves in 2-D surfaces in 3-D, and nonplanar space curves in 3-D. Curves and surfaces can be defined either parametrically or implicitly, with the latter representation used here. A planar curve is the set of zeros of a smooth function of two variables x-y, a surface is the set of zeros of a smooth function of three variables x-y-z, and a space curve is the intersection of two surfaces, which are the set of zeros of two linearly independent smooth functions of three variables x-y-z For example, the surface of a complex object in 3-D can be represented as a subset of a single implicit surface, with similar results for planar and space curves. 
It is shown how this unified representation can be used for object recognition, object position estimation, and segmentation of objects into meaningful subobjects, that is, the detection of `interest regions' that are more complex than high curvature regions and, hence, more useful as features for object recognition}, author = {Taubin, Gabriel}, @@ -915,6 +937,13 @@ journal = {Retrieved on August}, volume = {6} } +@article{Sol2018AML, + title = {A micro Lie theory for state estimation in robotics}, + author = {Joan Sol{\`a} and J{\'e}r{\'e}mie Deray and Dinesh Atchuthan}, + journal = {ArXiv}, + year = {2018}, + volume = {abs/1812.01537} +} @misc{SteweniusCFS, author = {Stewenius, Henrik}, title = {Calibrated Fivepoint solver}, diff --git a/modules/calib3d/doc/pics/pinhole_camera_model.png b/modules/calib3d/doc/pics/pinhole_camera_model.png index 3594de9684..111112b006 100644 Binary files a/modules/calib3d/doc/pics/pinhole_camera_model.png and b/modules/calib3d/doc/pics/pinhole_camera_model.png differ diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index f57af877e4..ced32fee9b 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -60,24 +60,24 @@ or \f[s \vecthree{u}{v}{1} = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1} \begin{bmatrix} -r_{11} & r_{12} & r_{13} & t_1 \\ -r_{21} & r_{22} & r_{23} & t_2 \\ -r_{31} & r_{32} & r_{33} & t_3 +r_{11} & r_{12} & r_{13} & t_x \\ +r_{21} & r_{22} & r_{23} & t_y \\ +r_{31} & r_{32} & r_{33} & t_z \end{bmatrix} \begin{bmatrix} -X \\ -Y \\ -Z \\ +X_w \\ +Y_w \\ +Z_w \\ 1 \end{bmatrix}\f] where: -- \f$(X, Y, Z)\f$ are the coordinates of a 3D point in the world coordinate space +- \f$(X_w, Y_w, Z_w)\f$ are the coordinates of a 3D point in the world coordinate space - \f$(u, v)\f$ are the coordinates of the projection point in pixels - \f$A\f$ is a camera matrix, or a matrix of intrinsic parameters -- \f$(cx, cy)\f$ is a principal point that is usually at the image center -- \f$fx, fy\f$ are the focal lengths expressed in pixel units. +- \f$(c_x, c_y)\f$ is a principal point that is usually at the image center +- \f$f_x, f_y\f$ are the focal lengths expressed in pixel units. Thus, if an image from the camera is scaled by a factor, all of these parameters should be scaled (multiplied/divided, respectively) by the same factor. The matrix of intrinsic parameters does not @@ -85,15 +85,15 @@ depend on the scene viewed. So, once estimated, it can be re-used as long as the fixed (in case of zoom lens). The joint rotation-translation matrix \f$[R|t]\f$ is called a matrix of extrinsic parameters. It is used to describe the camera motion around a static scene, or vice versa, rigid motion of an object in front of a still camera. That is, \f$[R|t]\f$ translates coordinates of a -point \f$(X, Y, Z)\f$ to a coordinate system, fixed with respect to the camera. The transformation above -is equivalent to the following (when \f$z \ne 0\f$ ): +world point \f$(X_w, Y_w, Z_w)\f$ to a coordinate system, fixed with respect to the camera. +The transformation above is equivalent to the following (when \f$Z_c \ne 0\f$): \f[\begin{array}{l} -\vecthree{x}{y}{z} = R \vecthree{X}{Y}{Z} + t \\ -x' = x/z \\ -y' = y/z \\ -u = f_x*x' + c_x \\ -v = f_y*y' + c_y +\vecthree{X_c}{Y_c}{Z_c} = R \vecthree{X_w}{Y_w}{Z_w} + t \\ +x' = X_c/Z_c \\ +y' = Y_c/Z_c \\ +u = f_x \times x' + c_x \\ +v = f_y \times y' + c_y \end{array}\f] The following figure illustrates the pinhole camera model.
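The projection chain documented above (world frame to camera frame, perspective division, then intrinsics) is easy to verify numerically. A minimal Python sketch, using made-up intrinsics and pose rather than anything from this patch, reproduces the u, v formulas and checks them against cv.projectPoints:

import numpy as np
import cv2 as cv

# Illustrative intrinsics and pose (arbitrary values, not from this patch)
fx, fy, cx, cy = 800.0, 800.0, 320.0, 240.0
A = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64)
rvec = np.array([0.0, np.pi / 8, 0.0])   # rotation as a Rodrigues vector
t = np.array([0.1, -0.2, 2.0])
R, _ = cv.Rodrigues(rvec)

Pw = np.array([0.5, 0.25, 1.0])          # world point (X_w, Y_w, Z_w)
Xc, Yc, Zc = R @ Pw + t                  # camera frame: (X_c, Y_c, Z_c)
xp, yp = Xc / Zc, Yc / Zc                # x' = X_c/Z_c, y' = Y_c/Z_c
u, v = fx * xp + cx, fy * yp + cy        # pixel coordinates

# cv.projectPoints implements the same model (zero distortion here)
uv, _ = cv.projectPoints(Pw.reshape(1, 1, 3), rvec, t, A, np.zeros(5))
assert np.allclose(uv.ravel(), [u, v])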
@@ -104,14 +104,14 @@ Real lenses usually have some distortion, mostly radial distortion and slight ta So, the above model is extended as: \f[\begin{array}{l} -\vecthree{x}{y}{z} = R \vecthree{X}{Y}{Z} + t \\ -x' = x/z \\ -y' = y/z \\ +\vecthree{X_c}{Y_c}{Z_c} = R \vecthree{X_w}{Y_w}{Z_w} + t \\ +x' = X_c/Z_c \\ +y' = Y_c/Z_c \\ x'' = x' \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6} + 2 p_1 x' y' + p_2(r^2 + 2 x'^2) + s_1 r^2 + s_2 r^4 \\ y'' = y' \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6} + p_1 (r^2 + 2 y'^2) + 2 p_2 x' y' + s_3 r^2 + s_4 r^4 \\ \text{where} \quad r^2 = x'^2 + y'^2 \\ -u = f_x*x'' + c_x \\ -v = f_y*y'' + c_y +u = f_x \times x'' + c_x \\ +v = f_y \times y'' + c_y \end{array}\f] \f$k_1\f$, \f$k_2\f$, \f$k_3\f$, \f$k_4\f$, \f$k_5\f$, and \f$k_6\f$ are radial distortion coefficients. \f$p_1\f$ and \f$p_2\f$ are @@ -133,8 +133,8 @@ s\vecthree{x'''}{y'''}{1} = \vecthreethree{R_{33}(\tau_x, \tau_y)}{0}{-R_{13}(\tau_x, \tau_y)} {0}{R_{33}(\tau_x, \tau_y)}{-R_{23}(\tau_x, \tau_y)} {0}{0}{1} R(\tau_x, \tau_y) \vecthree{x''}{y''}{1}\\ -u = f_x*x''' + c_x \\ -v = f_y*y''' + c_y +u = f_x \times x''' + c_x \\ +v = f_y \times y''' + c_y \end{array}\f] where the matrix \f$R(\tau_x, \tau_y)\f$ is defined by two rotations with angular parameter \f$\tau_x\f$ @@ -317,7 +317,7 @@ enum HandEyeCalibrationMethod @param jacobian Optional output Jacobian matrix, 3x9 or 9x3, which is a matrix of partial derivatives of the output array components with respect to the input array components. -\f[\begin{array}{l} \theta \leftarrow norm(r) \\ r \leftarrow r/ \theta \\ R = \cos{\theta} I + (1- \cos{\theta} ) r r^T + \sin{\theta} \vecthreethree{0}{-r_z}{r_y}{r_z}{0}{-r_x}{-r_y}{r_x}{0} \end{array}\f] +\f[\begin{array}{l} \theta \leftarrow norm(r) \\ r \leftarrow r/ \theta \\ R = \cos(\theta) I + (1 - \cos(\theta)) r r^T + \sin(\theta) \vecthreethree{0}{-r_z}{r_y}{r_z}{0}{-r_x}{-r_y}{r_x}{0} \end{array}\f] Inverse transformation can be also done easily, since @@ -325,7 +325,16 @@ Inverse transformation can be also done easily, since A rotation vector is a convenient and most compact representation of a rotation matrix (since any rotation matrix has just 3 degrees of freedom). The representation is used in the global 3D geometry -optimization procedures like calibrateCamera, stereoCalibrate, or solvePnP . +optimization procedures like @ref calibrateCamera, @ref stereoCalibrate, or @ref solvePnP . + +@note More information about the computation of the derivative of a 3D rotation matrix with respect to its exponential coordinate +can be found in: - A Compact Formula for the Derivative of a 3-D Rotation in Exponential Coordinates, Guillermo Gallego, Anthony J. Yezzi @cite Gallego2014ACF + +@note Useful information on SE(3) and Lie Groups can be found in: - A tutorial on SE(3) transformation parameterizations and on-manifold optimization, Jose-Luis Blanco @cite blanco2010tutorial - Lie Groups for 2D and 3D Transformation, Ethan Eade @cite Eade17 - A micro Lie theory for state estimation in robotics, Joan Solà, Jérémie Deray, Dinesh Atchuthan @cite Sol2018AML */ CV_EXPORTS_W void Rodrigues( InputArray src, OutputArray dst, OutputArray jacobian = noArray() ); @@ -634,10 +643,10 @@ Number of input points must be 4. Object points must be defined in the following - for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
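The Rodrigues formula shown in the hunk above converts a rotation vector to a rotation matrix. A small numpy sketch (the rotation vector is arbitrary) mirrors the formula term by term and checks the result against cv.Rodrigues:

import numpy as np
import cv2 as cv

r = np.array([0.2, -0.1, 0.3])                 # arbitrary rotation vector
theta = np.linalg.norm(r)                      # theta <- norm(r)
n = r / theta                                  # r <- r/theta (unit axis)
K = np.array([[0, -n[2], n[1]],
              [n[2], 0, -n[0]],
              [-n[1], n[0], 0]])               # the skew-symmetric matrix above
R = np.cos(theta) * np.eye(3) \
    + (1 - np.cos(theta)) * np.outer(n, n) \
    + np.sin(theta) * K
R_cv, jacobian = cv.Rodrigues(r)               # jacobian: the 3x9/9x3 partial derivatives
assert np.allclose(R, R_cv)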
@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or -1xN/Nx1 3-channel, where N is the number of points. vector\<Point3f\> can be also passed here. +1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here. @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel, -where N is the number of points. vector\<Point2f\> can be also passed here. -@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ . +where N is the number of points. vector\<Point2d\> can be also passed here. +@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . @param distCoeffs Input vector of distortion coefficients \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are @@ -695,7 +704,7 @@ using the perspective projection model \f$ \Pi \f$ and the camera intrinsic para v \\ 1 \end{bmatrix} &= - \bf{A} \hspace{0.1em} \Pi \hspace{0.2em} ^{c}\bf{M}_w + \bf{A} \hspace{0.1em} \Pi \hspace{0.2em} ^{c}\bf{T}_w \begin{bmatrix} X_{w} \\ Y_{w} \\ @@ -743,7 +752,7 @@ a 3D point expressed in the world frame into the camera frame: Z_c \\ 1 \end{bmatrix} &= - \hspace{0.2em} ^{c}\bf{M}_w + \hspace{0.2em} ^{c}\bf{T}_w \begin{bmatrix} X_{w} \\ Y_{w} \\ @@ -809,9 +818,9 @@ CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints, /** @brief Finds an object pose from 3D-2D point correspondences using the RANSAC scheme. @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or -1xN/Nx1 3-channel, where N is the number of points. vector\<Point3f\> can be also passed here. +1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here. @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel, -where N is the number of points. vector\<Point2f\> can be also passed here. +where N is the number of points. vector\<Point2d\> can be also passed here. @param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ . @param distCoeffs Input vector of distortion coefficients \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of @@ -861,7 +870,7 @@ CV_EXPORTS_W bool solvePnPRansac( InputArray objectPoints, InputArray imagePoint 1x3/3x1 3-channel. vector\<Point3f\> can be also passed here. @param imagePoints Array of corresponding image points, 3x2 1-channel or 1x3/3x1 2-channel. vector\<Point2f\> can be also passed here. -@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ . +@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . @param distCoeffs Input vector of distortion coefficients \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are @@ -890,10 +899,10 @@ CV_EXPORTS_W int solveP3P( InputArray objectPoints, InputArray imagePoints, to the camera coordinate frame) from 3D-2D point correspondences and starting from an initial solution. @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or 1xN/Nx1 3-channel, -where N is the number of points. vector\<Point3f\> can also be passed here. +where N is the number of points. vector\<Point3d\> can also be passed here.
@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel, -where N is the number of points. vector\<Point2f\> can also be passed here. -@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ . +where N is the number of points. vector\<Point2d\> can also be passed here. +@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . @param distCoeffs Input vector of distortion coefficients \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are @@ -918,10 +927,10 @@ CV_EXPORTS_W void solvePnPRefineLM( InputArray objectPoints, InputArray imagePoi to the camera coordinate frame) from 3D-2D point correspondences and starting from an initial solution. @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or 1xN/Nx1 3-channel, -where N is the number of points. vector\<Point3f\> can also be passed here. +where N is the number of points. vector\<Point3d\> can also be passed here. @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel, -where N is the number of points. vector\<Point2f\> can also be passed here. -@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ . +where N is the number of points. vector\<Point2d\> can also be passed here. +@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . @param distCoeffs Input vector of distortion coefficients \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are @@ -960,10 +969,10 @@ Number of input points must be 4 and 2 solutions are returned. Object points mus Only 1 solution is returned. @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or -1xN/Nx1 3-channel, where N is the number of points. vector\<Point3f\> can be also passed here. +1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here. @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel, -where N is the number of points. vector\<Point2f\> can be also passed here. +where N is the number of points. vector\<Point2d\> can be also passed here. -@param cameraMatrix Input camera matrix \f$A = \vecthreethree{fx}{0}{cx}{0}{fy}{cy}{0}{0}{1}\f$ . +@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . @param distCoeffs Input vector of distortion coefficients \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements.
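As a usage note for the refinement functions documented in these hunks: solvePnPRefineLM and solvePnPRefineVVS both start from an initial pose, for example one produced by solvePnP. A hedged Python sketch, with a made-up planar square and intrinsics (not values from this patch):

import numpy as np
import cv2 as cv

# Made-up planar square, observed corners, and intrinsics (illustration only)
obj = np.array([[0, 0, 0], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float64)
img = np.array([[700, 400], [700, 600], [900, 600], [900, 400]], dtype=np.float64)
A = np.array([[800, 0, 640], [0, 800, 480], [0, 0, 1]], dtype=np.float64)
dist = np.zeros(5)

ok, rvec, tvec = cv.solvePnP(obj, img, A, dist, flags=cv.SOLVEPNP_IPPE)
# Levenberg-Marquardt refinement starts from, and overwrites, the initial pose
rvec, tvec = cv.solvePnPRefineLM(obj, img, A, dist, rvec, tvec)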
If the vector is NULL/empty, the zero distortion coefficients are @@ -1028,7 +1037,7 @@ using the perspective projection model \f$ \Pi \f$ and the camera intrinsic para v \\ 1 \end{bmatrix} &= - \bf{A} \hspace{0.1em} \Pi \hspace{0.2em} ^{c}\bf{M}_w + \bf{A} \hspace{0.1em} \Pi \hspace{0.2em} ^{c}\bf{T}_w \begin{bmatrix} X_{w} \\ Y_{w} \\ @@ -1076,7 +1085,7 @@ a 3D point expressed in the world frame into the camera frame: Z_c \\ 1 \end{bmatrix} &= - \hspace{0.2em} ^{c}\bf{M}_w + \hspace{0.2em} ^{c}\bf{T}_w \begin{bmatrix} X_{w} \\ Y_{w} \\ diff --git a/modules/calib3d/misc/java/test/Calib3dTest.java b/modules/calib3d/misc/java/test/Calib3dTest.java index ce9c762406..28d1c8b1ad 100644 --- a/modules/calib3d/misc/java/test/Calib3dTest.java +++ b/modules/calib3d/misc/java/test/Calib3dTest.java @@ -1,5 +1,7 @@ package org.opencv.test.calib3d; +import java.util.ArrayList; + import org.opencv.calib3d.Calib3d; import org.opencv.core.Core; import org.opencv.core.CvType; @@ -650,6 +652,47 @@ public class Calib3dTest extends OpenCVTestCase { assertEquals((1 << 22), Calib3d.CALIB_USE_EXTRINSIC_GUESS); } + public void testSolvePnPGeneric_regression_16040() { + Mat intrinsics = Mat.eye(3, 3, CvType.CV_64F); + intrinsics.put(0, 0, 400); + intrinsics.put(1, 1, 400); + intrinsics.put(0, 2, 640 / 2); + intrinsics.put(1, 2, 480 / 2); + + final int minPnpPointsNum = 4; + + MatOfPoint3f points3d = new MatOfPoint3f(); + points3d.alloc(minPnpPointsNum); + MatOfPoint2f points2d = new MatOfPoint2f(); + points2d.alloc(minPnpPointsNum); + + for (int i = 0; i < minPnpPointsNum; i++) { + double x = Math.random() * 100 - 50; + double y = Math.random() * 100 - 50; + points2d.put(i, 0, x, y); //add(new Point(x, y)); + points3d.put(i, 0, 0, y, x); // add(new Point3(0, y, x)); + } + + ArrayList<Mat> rvecs = new ArrayList<Mat>(); + ArrayList<Mat> tvecs = new ArrayList<Mat>(); + + Mat rvec = new Mat(); + Mat tvec = new Mat(); + + Mat reprojectionError = new Mat(2, 1, CvType.CV_64FC1); + + Calib3d.solvePnPGeneric(points3d, points2d, intrinsics, new MatOfDouble(), rvecs, tvecs, false, Calib3d.SOLVEPNP_IPPE, rvec, tvec, reprojectionError); + + Mat truth_rvec = new Mat(3, 1, CvType.CV_64F); + truth_rvec.put(0, 0, 0, Math.PI / 2, 0); + + Mat truth_tvec = new Mat(3, 1, CvType.CV_64F); + truth_tvec.put(0, 0, -320, -240, 400); + + assertMatEqual(truth_rvec, rvecs.get(0), 10 * EPS); + assertMatEqual(truth_tvec, tvecs.get(0), 1000 * EPS); + } + public void testGetDefaultNewCameraMatrixMat() { Mat mtx = Calib3d.getDefaultNewCameraMatrix(gray0); @@ -775,5 +818,4 @@ public class Calib3dTest extends OpenCVTestCase { assertTrue(src.toList().get(i).equals(dst.toList().get(i))); } } - } diff --git a/modules/calib3d/misc/python/test/test_solvepnp.py b/modules/calib3d/misc/python/test/test_solvepnp.py new file mode 100644 index 0000000000..b751e15c74 --- /dev/null +++ b/modules/calib3d/misc/python/test/test_solvepnp.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# Python 2/3 compatibility +from __future__ import print_function + +import numpy as np +import cv2 as cv + +from tests_common import NewOpenCVTests + +class solvepnp_test(NewOpenCVTests): + + def test_regression_16040(self): + obj_points = np.array([[0, 0, 0], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) + img_points = np.array( + [[700, 400], [700, 600], [900, 600], [900, 400]], dtype=np.float32 + ) + + cameraMatrix = np.array( + [[712.0634, 0, 800], [0, 712.540, 500], [0, 0, 1]], dtype=np.float32 + ) + distCoeffs = np.array([[0, 0, 0, 0]], dtype=np.float32) + r = np.array([], dtype=np.float32) + x,
r, t, e = cv.solvePnPGeneric( + obj_points, img_points, cameraMatrix, distCoeffs, reprojectionError=r + ) + + def test_regression_16040_2(self): + obj_points = np.array([[0, 0, 0], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) + img_points = np.array( + [[[700, 400], [700, 600], [900, 600], [900, 400]]], dtype=np.float32 + ) + + cameraMatrix = np.array( + [[712.0634, 0, 800], [0, 712.540, 500], [0, 0, 1]], dtype=np.float32 + ) + distCoeffs = np.array([[0, 0, 0, 0]], dtype=np.float32) + r = np.array([], dtype=np.float32) + x, r, t, e = cv.solvePnPGeneric( + obj_points, img_points, cameraMatrix, distCoeffs, reprojectionError=r + ) + + +if __name__ == '__main__': + NewOpenCVTests.bootstrap() diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp index b663a63544..77ecc45418 100644 --- a/modules/calib3d/src/calibration.cpp +++ b/modules/calib3d/src/calibration.cpp @@ -1287,8 +1287,8 @@ CV_IMPL void cvInitIntrinsicParams2D( const CvMat* objectPoints, matA.reset(cvCreateMat( 2*nimages, 2, CV_64F )); _b.reset(cvCreateMat( 2*nimages, 1, CV_64F )); - a[2] = (!imageSize.width) ? 0.5 : (imageSize.width)*0.5; - a[5] = (!imageSize.height) ? 0.5 : (imageSize.height)*0.5; + a[2] = (!imageSize.width) ? 0.5 : (imageSize.width - 1)*0.5; + a[5] = (!imageSize.height) ? 0.5 : (imageSize.height - 1)*0.5; _allH.reset(cvCreateMat( nimages, 9, CV_64F )); // extract vanishing points in order to obtain initial value for the focal length @@ -2567,8 +2567,8 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, for( i = 0; i < 4; i++ ) { int j = (i<2) ? 0 : 1; - _pts[i].x = (float)((i % 2)*(nx)); - _pts[i].y = (float)(j*(ny)); + _pts[i].x = (float)((i % 2)*(nx-1)); + _pts[i].y = (float)(j*(ny-1)); } cvUndistortPoints( &pts, &pts, A, Dk, 0, 0 ); cvConvertPointsHomogeneous( &pts, &pts_3 ); @@ -2582,8 +2582,8 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, _a_tmp[1][2]=0.0; cvProjectPoints2( &pts_3, k == 0 ? 
_R1 : _R2, &Z, &A_tmp, 0, &pts ); CvScalar avg = cvAvg(&pts); - cc_new[k].x = (nx)/2 - avg.val[0]; - cc_new[k].y = (ny)/2 - avg.val[1]; + cc_new[k].x = (nx-1)/2 - avg.val[0]; + cc_new[k].y = (ny-1)/2 - avg.val[1]; } // vertical focal length must be the same for both images to keep the epipolar constraint @@ -2720,8 +2720,8 @@ void cvGetOptimalNewCameraMatrix( const CvMat* cameraMatrix, const CvMat* distCo { double cx0 = M[0][2]; double cy0 = M[1][2]; - double cx = (newImgSize.width)*0.5; - double cy = (newImgSize.height)*0.5; + double cx = (newImgSize.width-1)*0.5; + double cy = (newImgSize.height-1)*0.5; icvGetRectangles( cameraMatrix, distCoeffs, 0, cameraMatrix, imgSize, inner, outer ); double s0 = std::max(std::max(std::max((double)cx/(cx0 - inner.x), (double)cy/(cy0 - inner.y)), @@ -2755,14 +2755,14 @@ void cvGetOptimalNewCameraMatrix( const CvMat* cameraMatrix, const CvMat* distCo icvGetRectangles( cameraMatrix, distCoeffs, 0, 0, imgSize, inner, outer ); // Projection mapping inner rectangle to viewport - double fx0 = (newImgSize.width) / inner.width; - double fy0 = (newImgSize.height) / inner.height; + double fx0 = (newImgSize.width - 1) / inner.width; + double fy0 = (newImgSize.height - 1) / inner.height; double cx0 = -fx0 * inner.x; double cy0 = -fy0 * inner.y; // Projection mapping outer rectangle to viewport - double fx1 = (newImgSize.width) / outer.width; - double fy1 = (newImgSize.height) / outer.height; + double fx1 = (newImgSize.width - 1) / outer.width; + double fy1 = (newImgSize.height - 1) / outer.height; double cx1 = -fx1 * outer.x; double cy1 = -fy1 * outer.y; @@ -2826,8 +2826,8 @@ CV_IMPL int cvStereoRectifyUncalibrated( cvGEMM( &U, &W, 1, 0, 0, &W, CV_GEMM_A_T ); cvMatMul( &W, &V, &F ); - cx = cvRound( (imgSize.width)*0.5 ); - cy = cvRound( (imgSize.height)*0.5 ); + cx = cvRound( (imgSize.width-1)*0.5 ); + cy = cvRound( (imgSize.height-1)*0.5 ); cvZero( _H1 ); cvZero( _H2 ); diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp index e724b89c44..1b76ce82e2 100644 --- a/modules/calib3d/src/solvepnp.cpp +++ b/modules/calib3d/src/solvepnp.cpp @@ -753,10 +753,8 @@ int solvePnPGeneric( InputArray _opoints, InputArray _ipoints, CV_Assert( ( (npoints >= 4) || (npoints == 3 && flags == SOLVEPNP_ITERATIVE && useExtrinsicGuess) ) && npoints == std::max(ipoints.checkVector(2, CV_32F), ipoints.checkVector(2, CV_64F)) ); - if (opoints.cols == 3) - opoints = opoints.reshape(3); - if (ipoints.cols == 2) - ipoints = ipoints.reshape(2); + opoints = opoints.reshape(3, npoints); + ipoints = ipoints.reshape(2, npoints); if( flags != SOLVEPNP_ITERATIVE ) useExtrinsicGuess = false; @@ -796,7 +794,7 @@ int solvePnPGeneric( InputArray _opoints, InputArray _ipoints, else if (flags == SOLVEPNP_P3P || flags == SOLVEPNP_AP3P) { vector<Mat> rvecs, tvecs; - solveP3P(_opoints, _ipoints, _cameraMatrix, _distCoeffs, rvecs, tvecs, flags); + solveP3P(opoints, ipoints, _cameraMatrix, _distCoeffs, rvecs, tvecs, flags); vec_rvecs.insert(vec_rvecs.end(), rvecs.begin(), rvecs.end()); vec_tvecs.insert(vec_tvecs.end(), tvecs.begin(), tvecs.end()); } @@ -1017,37 +1015,37 @@ int solvePnPGeneric( InputArray _opoints, InputArray _ipoints, "Type of reprojectionError must be CV_32FC1 or CV_64FC1!"); Mat objectPoints, imagePoints; - if (_opoints.depth() == CV_32F) + if (opoints.depth() == CV_32F) { - _opoints.getMat().convertTo(objectPoints, CV_64F); + opoints.convertTo(objectPoints, CV_64F); } else { - objectPoints = _opoints.getMat(); + objectPoints = opoints; } - if (_ipoints.depth() ==
CV_32F) + if (ipoints.depth() == CV_32F) { - _ipoints.getMat().convertTo(imagePoints, CV_64F); + ipoints.convertTo(imagePoints, CV_64F); } else { - imagePoints = _ipoints.getMat(); + imagePoints = ipoints; } for (size_t i = 0; i < vec_rvecs.size(); i++) { vector<Point2d> projectedPoints; projectPoints(objectPoints, vec_rvecs[i], vec_tvecs[i], cameraMatrix, distCoeffs, projectedPoints); - double rmse = norm(projectedPoints, imagePoints, NORM_L2) / sqrt(2*projectedPoints.size()); + double rmse = norm(Mat(projectedPoints, false), imagePoints, NORM_L2) / sqrt(2*projectedPoints.size()); Mat err = reprojectionError.getMat(); if (type == CV_32F) { - err.at<float>(0,static_cast<int>(i)) = static_cast<float>(rmse); + err.at<float>(static_cast<int>(i)) = static_cast<float>(rmse); } else { - err.at<double>(0,static_cast<int>(i)) = rmse; + err.at<double>(static_cast<int>(i)) = rmse; } } } diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp index 5c5c4841e6..dbad5c1801 100644 --- a/modules/calib3d/test/test_cameracalibration.cpp +++ b/modules/calib3d/test/test_cameracalibration.cpp @@ -1239,108 +1239,6 @@ void CV_ProjectPointsTest_CPP::project( const Mat& objectPoints, const Mat& rvec ///////////////////////////////// Stereo Calibration ///////////////////////////////////// -class CV_StereoCalibrationCornerTest : public cvtest::BaseTest -{ -public: - CV_StereoCalibrationCornerTest(); - ~CV_StereoCalibrationCornerTest(); - void clear(); -protected: - void run(int); -}; -CV_StereoCalibrationCornerTest::CV_StereoCalibrationCornerTest() -{ -} - -CV_StereoCalibrationCornerTest::~CV_StereoCalibrationCornerTest() -{ - clear(); -} -void CV_StereoCalibrationCornerTest::clear() -{ - cvtest::BaseTest::clear(); -} -static bool resizeCameraMatrix(const Mat &in_cm, Mat &dst_cm, double scale) -{ - if (in_cm.empty() || in_cm.cols != 3 || in_cm.rows != 3 || in_cm.type() != CV_64FC1) - return false; - dst_cm = in_cm * scale; - dst_cm.at<double>(2, 2) = 1.0; - return true; -} -// see https://github.com/opencv/opencv/pull/6836 for details -void CV_StereoCalibrationCornerTest::run(int) -{ - const Matx33d M1(906.7857732303256, 0.0, 1026.456125870669, - 0.0, 906.7857732303256, 540.0531577669913, - 0.0, 0.0, 1.0); - const Matx33d M2(906.782205162265, 0.0, 1014.619997352785, - 0.0, 906.782205162265, 561.9990018887295, - 0.0, 0.0, 1.0); - const Matx<double, 5, 1> D1(0.0064836857220181504, 0.033880363848984636, 0.0, 0.0, -0.042996356352306114); - const Matx<double, 5, 1> D2(0.023754068600491646, -0.02364619610835259, 0.0, 0.0, 0.0015014971456262652); - const Size imageSize(2048, 1088); - const double scale = 0.25; - const Matx33d Rot(0.999788461750194, -0.015696495349844446, -0.013291041528534329, - 0.015233019205877604, 0.999296086451901, -0.034282455101525826, - 0.01381980018141639, 0.03407274036010432, 0.9993238021218641); - const Matx31d T(-1.552005597952028, 0.0019508251875105093, -0.023335501616116062); - - // generate camera matrices for resized image rectification.
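The per-solution reprojection error written into reprojectionError above is the L2 norm of the stacked x/y residuals divided by sqrt(2N), i.e. an RMSE over all coordinates. The same bookkeeping, as an illustrative Python helper (the function name is mine, not part of the API):

import numpy as np
import cv2 as cv

def reprojection_rmse(obj_points, img_points, rvec, tvec, camera_matrix, dist_coeffs):
    # rmse = ||projected - observed||_2 / sqrt(2*N), matching solvePnPGeneric above
    proj, _ = cv.projectPoints(obj_points, rvec, tvec, camera_matrix, dist_coeffs)
    residuals = proj.reshape(-1, 2) - img_points.reshape(-1, 2)
    return np.linalg.norm(residuals) / np.sqrt(2 * len(residuals))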
- Mat srcM1(M1), srcM2(M2); - Mat rszM1, rszM2; - resizeCameraMatrix(srcM1, rszM1, scale); - resizeCameraMatrix(srcM2, rszM2, scale); - Size rszImageSize(cvRound(scale * imageSize.width), cvRound(scale * imageSize.height)); - Size srcImageSize = imageSize; - // apply stereoRectify - Mat srcR[2], srcP[2], srcQ; - Mat rszR[2], rszP[2], rszQ; - stereoRectify(srcM1, D1, srcM2, D2, srcImageSize, Rot, T, - srcR[0], srcR[1], srcP[0], srcP[1], srcQ, - CALIB_ZERO_DISPARITY, 0); - stereoRectify(rszM1, D1, rszM2, D2, rszImageSize, Rot, T, - rszR[0], rszR[1], rszP[0], rszP[1], rszQ, - CALIB_ZERO_DISPARITY, 0); - // generate remap maps - Mat srcRmap[2], rszRmap[2]; - initUndistortRectifyMap(srcM1, D1, srcR[0], srcP[0], srcImageSize, CV_32FC2, srcRmap[0], srcRmap[1]); - initUndistortRectifyMap(rszM1, D1, rszR[0], rszP[0], rszImageSize, CV_32FC2, rszRmap[0], rszRmap[1]); - - // generate source image - // it's an artificial pattern with white rect in the center - Mat image(imageSize, CV_8UC3); - image.setTo(0); - image(cv::Rect(imageSize.width / 3, imageSize.height / 3, imageSize.width / 3, imageSize.height / 3)).setTo(255); - - // perform remap-resize - Mat src_result; - remap(image, src_result, srcRmap[0], srcRmap[1], INTER_LINEAR); - resize(src_result, src_result, Size(), scale, scale, INTER_LINEAR_EXACT); - // perform resize-remap - Mat rsz_result; - resize(image, rsz_result, Size(), scale, scale, INTER_LINEAR_EXACT); - remap(rsz_result, rsz_result, rszRmap[0], rszRmap[1], INTER_LINEAR); - - // modifying the camera matrix with resizeCameraMatrix must yield the same - // result as calibrating the downscaled images - int cnz = countNonZero((cv::Mat(src_result - rsz_result) != 0)( - cv::Rect(src_result.cols / 3, src_result.rows / 3, - (int)(src_result.cols / 3.1), int(src_result.rows / 3.1)))); - if (cnz) - { - ts->printf( cvtest::TS::LOG, "The camera matrix is wrong for downscaled image\n"); - ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY ); - } -} - class CV_StereoCalibrationTest : public cvtest::BaseTest { public: @@ -2007,7 +1905,7 @@ TEST(Calib3d_ProjectPoints_CPP, outputShape) } TEST(Calib3d_StereoCalibrate_CPP, regression) { CV_StereoCalibrationTest_CPP test; test.safe_run(); } -TEST(Calib3d_StereoCalibrateCorner, regression) { CV_StereoCalibrationCornerTest test; test.safe_run(); } + TEST(Calib3d_StereoCalibrate_CPP, extended) { diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index a095bc3856..97aeab375f 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -44,8 +44,10 @@ #ifndef OPENCV_CORE_TYPES_H #define OPENCV_CORE_TYPES_H -#if !defined(__OPENCV_BUILD) && !defined(CV__DISABLE_C_API_CTORS) -#define CV__ENABLE_C_API_CTORS // enable C API ctors (must be removed) +#ifdef CV__ENABLE_C_API_CTORS // invalid C API ctors (must be removed) +#if defined(_WIN32) && !defined(CV__SKIP_MESSAGE_MALFORMED_C_API_CTORS) +#error "C API ctors don't work on Win32: https://github.com/opencv/opencv/issues/15990" +#endif #endif //#define CV__VALIDATE_UNUNITIALIZED_VARS 1 // C++11 & GCC only diff --git a/modules/core/misc/java/src/cpp/core_manual.hpp b/modules/core/misc/java/src/cpp/core_manual.hpp index 82d460fa53..4ca87e836a 100644 --- a/modules/core/misc/java/src/cpp/core_manual.hpp +++ b/modules/core/misc/java/src/cpp/core_manual.hpp @@ -9,7 +9,7 @@ CV_EXPORTS_W void setErrorVerbosity(bool verbose); } -#if 0 +#ifdef OPENCV_BINDINGS_PARSER namespace cv { @@ -30,4 +30,4 @@ CV_EXPORTS_W void 
min(InputArray src1, Scalar src2, OutputArray dst); CV_EXPORTS_W void max(InputArray src1, Scalar src2, OutputArray dst); } -#endif //0 +#endif diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp index 0e06bff4d1..e25874e975 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.cpp @@ -1078,7 +1078,8 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m { CV_INSTRUMENT_REGION(); - CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() ); + CV_CheckTypeEQ(_src1.type(), _src2.type(), "Input type mismatch"); + CV_Assert(_src1.sameSize(_src2)); #if defined HAVE_OPENCL || defined HAVE_IPP double _result = 0; diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 0794c15005..b415bde4c3 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -1345,16 +1345,44 @@ namespace details { #endif #endif +template <class T> +class DisposedSingletonMark +{ +private: + static bool mark; +protected: + DisposedSingletonMark() {} + ~DisposedSingletonMark() + { + mark = true; + } +public: + static bool isDisposed() { return mark; } +}; + // TLS platform abstraction layer -class TlsAbstraction +class TlsAbstraction : public DisposedSingletonMark<TlsAbstraction> { public: TlsAbstraction(); ~TlsAbstraction(); - void* GetData() const; - void SetData(void *pData); + void* getData() const + { + if (isDisposed()) // guard: static initialization order fiasco + return NULL; + return getData_(); + } + void setData(void *pData) + { + if (isDisposed()) // guard: static initialization order fiasco + return; + return setData_(pData); + } private: + void* getData_() const; + void setData_(void *pData); + #ifdef _WIN32 #ifndef WINRT DWORD tlsKey; @@ -1364,16 +1392,30 @@ private: #endif }; +template<> bool DisposedSingletonMark<TlsAbstraction>::mark = false; + +static TlsAbstraction& getTlsAbstraction_() +{ + static TlsAbstraction g_tls; // disposed in atexit() handlers (required for unregistering our callbacks) + return g_tls; +} +static TlsAbstraction* getTlsAbstraction() +{ + static TlsAbstraction* instance = &getTlsAbstraction_(); + return DisposedSingletonMark<TlsAbstraction>::isDisposed() ?
NULL : instance; +} + + #ifdef _WIN32 #ifdef WINRT static __declspec( thread ) void* tlsData = NULL; // using C++11 thread attribute for local thread data TlsAbstraction::TlsAbstraction() {} TlsAbstraction::~TlsAbstraction() {} -void* TlsAbstraction::GetData() const +void* TlsAbstraction::getData_() const { return tlsData; } -void TlsAbstraction::SetData(void *pData) +void TlsAbstraction::setData_(void *pData) { tlsData = pData; } @@ -1397,8 +1439,9 @@ TlsAbstraction::~TlsAbstraction() #else // CV_USE_FLS FlsFree(tlsKey); #endif // CV_USE_FLS + tlsKey = TLS_OUT_OF_INDEXES; } -void* TlsAbstraction::GetData() const +void* TlsAbstraction::getData_() const { #ifndef CV_USE_FLS return TlsGetValue(tlsKey); @@ -1406,7 +1449,7 @@ void* TlsAbstraction::GetData() const return FlsGetValue(tlsKey); #endif // CV_USE_FLS } -void TlsAbstraction::SetData(void *pData) +void TlsAbstraction::setData_(void *pData) { #ifndef CV_USE_FLS CV_Assert(TlsSetValue(tlsKey, pData) == TRUE); @@ -1423,13 +1466,18 @@ TlsAbstraction::TlsAbstraction() } TlsAbstraction::~TlsAbstraction() { - CV_Assert(pthread_key_delete(tlsKey) == 0); + if (pthread_key_delete(tlsKey) != 0) + { + // Don't use logging here + fprintf(stderr, "OpenCV ERROR: TlsAbstraction::~TlsAbstraction(): pthread_key_delete() call failed\n"); + fflush(stderr); + } } -void* TlsAbstraction::GetData() const +void* TlsAbstraction::getData_() const { return pthread_getspecific(tlsKey); } -void TlsAbstraction::SetData(void *pData) +void TlsAbstraction::setData_(void *pData) { CV_Assert(pthread_setspecific(tlsKey, pData) == 0); } @@ -1462,12 +1510,17 @@ public: { // TlsStorage object should not be released // There is no reliable way to avoid problems caused by static initialization order fiasco - CV_LOG_FATAL(NULL, "TlsStorage::~TlsStorage() call is not expected"); + // Don't use logging here + fprintf(stderr, "OpenCV FATAL: TlsStorage::~TlsStorage() call is not expected\n"); + fflush(stderr); } void releaseThread(void* tlsValue = NULL) { - ThreadData *pTD = tlsValue == NULL ? (ThreadData*)tls.GetData() : (ThreadData*)tlsValue; + TlsAbstraction* tls = getTlsAbstraction(); + if (NULL == tls) + return; // TLS signleton is not available (terminated) + ThreadData *pTD = tlsValue == NULL ? (ThreadData*)tls->getData() : (ThreadData*)tlsValue; if (pTD == NULL) return; // no OpenCV TLS data for this thread AutoLock guard(mtxGlobalAccess); @@ -1477,7 +1530,7 @@ public: { threads[i] = NULL; if (tlsValue == NULL) - tls.SetData(0); + tls->setData(0); std::vector& thread_slots = pTD->slots; for (size_t slotIdx = 0; slotIdx < thread_slots.size(); slotIdx++) { @@ -1489,13 +1542,16 @@ public: if (container) container->deleteDataInstance(pData); else - CV_LOG_ERROR(NULL, "TLS: container for slotIdx=" << slotIdx << " is NULL. Can't release thread data"); + { + fprintf(stderr, "OpenCV ERROR: TLS: container for slotIdx=%d is NULL. 
Can't release thread data\n", (int)slotIdx); + fflush(stderr); + } } delete pTD; return; } } - CV_LOG_WARNING(NULL, "TLS: Can't release thread TLS data (unknown pointer or data race): " << (void*)pTD); + fprintf(stderr, "OpenCV WARNING: TLS: Can't release thread TLS data (unknown pointer or data race): %p\n", (void*)pTD); fflush(stderr); } // Reserve TLS storage index @@ -1552,7 +1608,11 @@ public: CV_Assert(tlsSlotsSize > slotIdx); #endif - ThreadData* threadData = (ThreadData*)tls.GetData(); + TlsAbstraction* tls = getTlsAbstraction(); + if (NULL == tls) + return NULL; // TLS singleton is not available (terminated) + + ThreadData* threadData = (ThreadData*)tls->getData(); if(threadData && threadData->slots.size() > slotIdx) return threadData->slots[slotIdx]; @@ -1584,11 +1644,15 @@ public: CV_Assert(tlsSlotsSize > slotIdx); #endif - ThreadData* threadData = (ThreadData*)tls.GetData(); + TlsAbstraction* tls = getTlsAbstraction(); + if (NULL == tls) + return; // TLS singleton is not available (terminated) + + ThreadData* threadData = (ThreadData*)tls->getData(); if(!threadData) { threadData = new ThreadData; - tls.SetData((void*)threadData); + tls->setData((void*)threadData); { AutoLock guard(mtxGlobalAccess); @@ -1623,8 +1687,6 @@ public: } private: - TlsAbstraction tls; // TLS abstraction layer instance - Mutex mtxGlobalAccess; // Shared objects operation guard size_t tlsSlotsSize; // equal to tlsSlots.size() in synchronized sections // without synchronization this counter doesn't decrease - it is used for slotIdx sanity checks diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index c0628dba96..d4c962e741 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -108,7 +108,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h) PERF_TEST_P_(DNNTestNetwork, ENet) { - if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) || + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU) || (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); processNet("dnn/Enet-model-best.net", "", "enet.yml", diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index dae834d8d0..8a01efc0c0 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1734,7 +1734,7 @@ struct Net::Impl Ptr<Layer> layer = ld.layerInstance; if (!fused && !layer->supportBackend(preferableBackend)) { - bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 && + bool customizable = ld.id != 0 && INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2) && supportsCPUFallback; // TODO: there is a bug in Myriad plugin with custom layers shape infer.
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index aac6e3191b..19d74baa43 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -278,11 +278,28 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input { const auto& inp = inpWrappers[i]; const std::string& inpName = inp->dataPtr->getName(); + + std::string inpLayerName = inpName; + size_t inpPortId = inpName.rfind('.'); + if (inpPortId != std::string::npos) + { + std::string portIdStr = inpName.substr(inpPortId + 1); + if (std::all_of(portIdStr.begin(), portIdStr.end(), ::isdigit)) + { + inpLayerName = inpName.substr(0, inpPortId); + inpPortId = atoi(portIdStr.c_str()); + } + else + inpPortId = 0; + } + else + inpPortId = 0; + int inpId; - it = layers.find(inpName); + it = layers.find(inpLayerName); if (it == layers.end()) { - InferenceEngine::Builder::InputLayer inpLayer(!inpName.empty() ? inpName : kDefaultInpLayerName); + InferenceEngine::Builder::InputLayer inpLayer(!inpLayerName.empty() ? inpLayerName : kDefaultInpLayerName); std::vector<size_t> shape(inp->blob->getTensorDesc().getDims()); inpLayer.setPort(InferenceEngine::Port(shape)); inpId = netBuilder.addLayer(inpLayer); @@ -292,24 +309,28 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input else inpId = it->second; - netBuilder.connect((size_t)inpId, {(size_t)layerId, i}); - unconnectedLayersIds.erase(inpId); + netBuilder.connect({(size_t)inpId, inpPortId}, {(size_t)layerId, i}); + unconnectedPorts.erase({inpId, inpPortId}); } CV_Assert(!outputs.empty()); - InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]); + for (int i = 0; i < outputs.size(); ++i) + { + InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[i]); + std::string outputName = outputs.size() > 1 ? (layerName + "." + std::to_string(i)) : layerName; #if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1) - dataPtr->name = layerName; + dataPtr->name = outputName; #else - dataPtr->setName(layerName); + dataPtr->setName(outputName); #endif + } } void InfEngineBackendNet::init(Target targetId) { if (!hasNetOwner) { - CV_Assert(!unconnectedLayersIds.empty()); - for (int id : unconnectedLayersIds) + CV_Assert(!unconnectedPorts.empty()); + for (const auto& port : unconnectedPorts) { InferenceEngine::Builder::OutputLayer outLayer("myconv1"); #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) @@ -320,7 +341,7 @@ void InfEngineBackendNet::init(Target targetId) InferenceEngine::Precision::FP32; outLayer.setPort(InferenceEngine::Port({}, p)); #endif - netBuilder.addLayer({InferenceEngine::PortInfo(id)}, outLayer); + netBuilder.addLayer({InferenceEngine::PortInfo(port.first, port.second)}, outLayer); } netBuilder.getContext().addShapeInferImpl(kOpenCVLayersType, std::make_shared()); @@ -409,8 +430,10 @@ void InfEngineBackendNet::addLayer(InferenceEngine::Builder::Layer& layer) int id = netBuilder.addLayer(layer); const std::string& layerName = layer.getName(); + CV_Assert(layers.insert({layerName, id}).second); - unconnectedLayersIds.insert(id); + for (int i = 0; i < layer.getOutputPorts().size(); ++i) + unconnectedPorts.insert({id, i}); #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) // By default, all the weights are connected to last ports ids.
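The suffix parsing added in InfEngineBackendNet::connect splits an input name like "conv1.2" into a layer name and an output port id, falling back to port 0 when the text after the last dot is not purely numeric. The same rule, restated as a short Python sketch for clarity (function name is mine):

def split_port(inp_name):
    # "conv1.2" -> ("conv1", 2); "my.layer" -> ("my.layer", 0); "relu" -> ("relu", 0)
    layer_name, port_id = inp_name, 0
    dot = inp_name.rfind('.')
    if dot != -1:
        suffix = inp_name[dot + 1:]
        if all(ch.isdigit() for ch in suffix):  # std::all_of semantics: empty suffix passes
            layer_name = inp_name[:dot]
            port_id = int(suffix) if suffix else 0  # atoi("") == 0
    return layer_name, port_id

assert split_port("conv1.2") == ("conv1", 2)
assert split_port("my.layer") == ("my.layer", 0)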
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index e644e2ebce..0b1398dd09 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -132,7 +132,7 @@ private: std::map<std::string, int> layers; std::vector<std::string> requestedOutputs; - std::set<int> unconnectedLayersIds; + std::set<std::pair<int, int> > unconnectedPorts; }; class InfEngineBackendNode : public BackendNode diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index d5b4e997e3..0607c7d68f 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -182,6 +182,17 @@ TEST_P(Reproducibility_AlexNet, Accuracy) ASSERT_FALSE(net.empty()); } + // Test input layer size + std::vector<MatShape> inLayerShapes; + std::vector<MatShape> outLayerShapes; + net.getLayerShapes(MatShape(), 0, inLayerShapes, outLayerShapes); + ASSERT_FALSE(inLayerShapes.empty()); + ASSERT_EQ(inLayerShapes[0].size(), 4); + ASSERT_EQ(inLayerShapes[0][0], 1); + ASSERT_EQ(inLayerShapes[0][1], 3); + ASSERT_EQ(inLayerShapes[0][2], 227); + ASSERT_EQ(inLayerShapes[0][3], 227); + const float l1 = 1e-5; const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-3 : 1e-4; diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 8d3eb88107..c49ed51937 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -737,9 +737,8 @@ TEST_P(Test_TensorFlow_layers, lstm) TEST_P(Test_TensorFlow_layers, split) { - if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD && - getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_2, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); runTensorFlowNet("split"); } diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index ab8615fb21..8d7f8707c2 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -793,6 +793,7 @@ class CppHeaderParser(object): COMMENT = 1 # inside a multi-line comment DIRECTIVE = 2 # inside a multi-line preprocessor directive DOCSTRING = 3 # inside a multi-line docstring + DIRECTIVE_IF_0 = 4 # inside a '#if 0' directive state = SCAN @@ -802,6 +803,8 @@ class CppHeaderParser(object): self.lineno = 0 self.wrap_mode = wmode + depth_if_0 = 0 + for l0 in linelist: self.lineno += 1 #print(state, self.lineno, l0) @@ -813,8 +816,28 @@ class CppHeaderParser(object): # fall through to the if state == DIRECTIVE check if state == DIRECTIVE: - if not l.endswith("\\"): - state = SCAN + if l.endswith("\\"): + continue + state = SCAN + l = re.sub(r'//(.+)?', '', l).strip() # drop // comment + if l == '#if 0' or l == '#if defined(__OPENCV_BUILD)' or l == '#ifdef __OPENCV_BUILD': + state = DIRECTIVE_IF_0 + depth_if_0 = 1 + continue + + if state == DIRECTIVE_IF_0: + if l.startswith('#'): + l = l[1:].strip() + if l.startswith("if"): + depth_if_0 += 1 + continue + if l.startswith("endif"): + depth_if_0 -= 1 + if depth_if_0 == 0: + state = SCAN + else: + # print('---- {:30s}:{:5d}: {}'.format(hname[-30:], self.lineno, l)) + pass continue if state == COMMENT: