diff --git a/apps/traincascade/boost.cpp b/apps/traincascade/boost.cpp index 993d6885fd..f05229be5c 100644 --- a/apps/traincascade/boost.cpp +++ b/apps/traincascade/boost.cpp @@ -543,7 +543,7 @@ void CvCascadeBoostTrainData::setData( const CvFeatureEvaluator* _featureEvaluat featureEvaluator = _featureEvaluator; max_c_count = MAX( 2, featureEvaluator->getMaxCatCount() ); - _resp = featureEvaluator->getCls(); + _resp = cvMat(featureEvaluator->getCls()); responses = &_resp; // TODO: check responses: elements must be 0 or 1 diff --git a/apps/traincascade/old_ml_boost.cpp b/apps/traincascade/old_ml_boost.cpp index c887c77b92..a937ea670c 100644 --- a/apps/traincascade/old_ml_boost.cpp +++ b/apps/traincascade/old_ml_boost.cpp @@ -2122,12 +2122,12 @@ CvBoost::train( const Mat& _train_data, int _tflag, const Mat& _missing_mask, CvBoostParams _params, bool _update ) { - train_data_hdr = _train_data; + train_data_hdr = cvMat(_train_data); train_data_mat = _train_data; - responses_hdr = _responses; + responses_hdr = cvMat(_responses); responses_mat = _responses; - CvMat vidx = _var_idx, sidx = _sample_idx, vtype = _var_type, mmask = _missing_mask; + CvMat vidx = cvMat(_var_idx), sidx = cvMat(_sample_idx), vtype = cvMat(_var_type), mmask = cvMat(_missing_mask); return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0, sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? 
&vtype : 0, @@ -2138,7 +2138,7 @@ float CvBoost::predict( const Mat& _sample, const Mat& _missing, const Range& slice, bool raw_mode, bool return_sum ) const { - CvMat sample = _sample, mmask = _missing; + CvMat sample = cvMat(_sample), mmask = cvMat(_missing); /*if( weak_responses ) { int weak_count = cvSliceLength( slice, weak ); diff --git a/apps/traincascade/old_ml_tree.cpp b/apps/traincascade/old_ml_tree.cpp index ed6b6eed45..d4826b814f 100644 --- a/apps/traincascade/old_ml_tree.cpp +++ b/apps/traincascade/old_ml_tree.cpp @@ -1592,12 +1592,12 @@ bool CvDTree::train( const Mat& _train_data, int _tflag, const Mat& _sample_idx, const Mat& _var_type, const Mat& _missing_mask, CvDTreeParams _params ) { - train_data_hdr = _train_data; + train_data_hdr = cvMat(_train_data); train_data_mat = _train_data; - responses_hdr = _responses; + responses_hdr = cvMat(_responses); responses_mat = _responses; - CvMat vidx=_var_idx, sidx=_sample_idx, vtype=_var_type, mmask=_missing_mask; + CvMat vidx=cvMat(_var_idx), sidx=cvMat(_sample_idx), vtype=cvMat(_var_type), mmask=cvMat(_missing_mask); return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0, sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0, mmask.data.ptr ? &mmask : 0, _params); @@ -3734,7 +3734,7 @@ CvDTreeNode* CvDTree::predict( const CvMat* _sample, CvDTreeNode* CvDTree::predict( const Mat& _sample, const Mat& _missing, bool preprocessed_input ) const { - CvMat sample = _sample, mmask = _missing; + CvMat sample = cvMat(_sample), mmask = cvMat(_missing); return predict(&sample, mmask.data.ptr ? 
&mmask : 0, preprocessed_input); } diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 30e4a00a3f..082debf195 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -125,8 +125,8 @@ if(CV_GCC OR CV_CLANG) ) add_extra_compiler_option(-Wimplicit-fallthrough=3) endif() - if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 7.2.0) - add_extra_compiler_option(-Wno-strict-overflow) # Issue is fixed in GCC 7.2.1 + if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + add_extra_compiler_option(-Wno-strict-overflow) # Issue appears when compiling surf.cpp from opencv_contrib/modules/xfeatures2d endif() endif() add_extra_compiler_option(-fdiagnostics-show-option) diff --git a/modules/calib3d/perf/perf_pnp.cpp b/modules/calib3d/perf/perf_pnp.cpp index d3d6076252..7c7254a0df 100644 --- a/modules/calib3d/perf/perf_pnp.cpp +++ b/modules/calib3d/perf/perf_pnp.cpp @@ -52,8 +52,8 @@ PERF_TEST_P(PointsNum_Algo, solvePnP, cv::solvePnP(points3d, points2d, intrinsics, distortion, rvec, tvec, false, algo); } - SANITY_CHECK(rvec, 1e-6); - SANITY_CHECK(tvec, 1e-6); + SANITY_CHECK(rvec, 1e-4); + SANITY_CHECK(tvec, 1e-4); } PERF_TEST_P(PointsNum_Algo, solvePnPSmallPoints, diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp index 022eb1ff06..ac5fd419a5 100644 --- a/modules/calib3d/src/calibration.cpp +++ b/modules/calib3d/src/calibration.cpp @@ -977,7 +977,7 @@ CV_IMPL void cvFindExtrinsicCameraParams2( const CvMat* objectPoints, int i, count; double a[9], ar[9]={1,0,0,0,1,0,0,0,1}, R[9]; double MM[9], U[9], V[9], W[3]; - CvScalar Mc; + cv::Scalar Mc; double param[6]; CvMat matA = cvMat( 3, 3, CV_64F, a ); CvMat _Ar = cvMat( 3, 3, CV_64F, ar ); @@ -1478,7 +1478,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints, CV_Error( CV_StsOutOfRange, "The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" ); } - CvMat _matM(matM), 
m(_m); + CvMat _matM = cvMat(matM), m = cvMat(_m); cvInitIntrinsicParams2D( &_matM, &m, npoints, imageSize, &matA, aspectRatio ); } @@ -1550,8 +1550,8 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints, cvGetRows( solver.param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 ); cvGetRows( solver.param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 ); - CvMat _Mi(matM.colRange(pos, pos + ni)); - CvMat _mi(_m.colRange(pos, pos + ni)); + CvMat _Mi = cvMat(matM.colRange(pos, pos + ni)); + CvMat _mi = cvMat(_m.colRange(pos, pos + ni)); cvFindExtrinsicCameraParams2( &_Mi, &_mi, &matA, &_k, &_ri, &_ti ); } @@ -1590,17 +1590,17 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints, cvGetRows( solver.param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 ); cvGetRows( solver.param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 ); - CvMat _Mi(matM.colRange(pos, pos + ni)); - CvMat _mi(_m.colRange(pos, pos + ni)); - CvMat _me(allErrors.colRange(pos, pos + ni)); + CvMat _Mi = cvMat(matM.colRange(pos, pos + ni)); + CvMat _mi = cvMat(_m.colRange(pos, pos + ni)); + CvMat _me = cvMat(allErrors.colRange(pos, pos + ni)); _Je.resize(ni*2); _Ji.resize(ni*2); _err.resize(ni*2); - CvMat _dpdr(_Je.colRange(0, 3)); - CvMat _dpdt(_Je.colRange(3, 6)); - CvMat _dpdf(_Ji.colRange(0, 2)); - CvMat _dpdc(_Ji.colRange(2, 4)); - CvMat _dpdk(_Ji.colRange(4, NINTRINSIC)); - CvMat _mp(_err.reshape(2, 1)); + CvMat _dpdr = cvMat(_Je.colRange(0, 3)); + CvMat _dpdt = cvMat(_Je.colRange(3, 6)); + CvMat _dpdf = cvMat(_Ji.colRange(0, 2)); + CvMat _dpdc = cvMat(_Ji.colRange(2, 4)); + CvMat _dpdk = cvMat(_Ji.colRange(4, NINTRINSIC)); + CvMat _mp = cvMat(_err.reshape(2, 1)); if( calcJ ) { @@ -2081,7 +2081,7 @@ static double cvStereoCalibrateImpl( const CvMat* _objectPoints, const CvMat* _i for( i = ofs = 0; i < nimages; ofs += ni, i++ ) { ni = npoints->data.i[i]; - CvMat objpt_i, _part; + CvMat objpt_i; om[0] = cvMat(3,1,CV_64F,solver.param->data.db+(i+1)*6); T[0] = 
cvMat(3,1,CV_64F,solver.param->data.db+(i+1)*6+3); @@ -2095,12 +2095,12 @@ static double cvStereoCalibrateImpl( const CvMat* _objectPoints, const CvMat* _i objpt_i = cvMat(1, ni, CV_64FC3, objectPoints->data.db + ofs*3); err.resize(ni*2); Je.resize(ni*2); J_LR.resize(ni*2); Ji.resize(ni*2); - CvMat tmpimagePoints(err.reshape(2, 1)); - CvMat dpdf(Ji.colRange(0, 2)); - CvMat dpdc(Ji.colRange(2, 4)); - CvMat dpdk(Ji.colRange(4, NINTRINSIC)); - CvMat dpdrot(Je.colRange(0, 3)); - CvMat dpdt(Je.colRange(3, 6)); + CvMat tmpimagePoints = cvMat(err.reshape(2, 1)); + CvMat dpdf = cvMat(Ji.colRange(0, 2)); + CvMat dpdc = cvMat(Ji.colRange(2, 4)); + CvMat dpdk = cvMat(Ji.colRange(4, NINTRINSIC)); + CvMat dpdrot = cvMat(Je.colRange(0, 3)); + CvMat dpdt = cvMat(Je.colRange(3, 6)); for( k = 0; k < 2; k++ ) { @@ -2363,7 +2363,7 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, // calculate projection/camera matrices // these contain the relevant rectified image internal params (fx, fy=fx, cx, cy) double fc_new = DBL_MAX; - CvPoint2D64f cc_new[2] = {{0,0}, {0,0}}; + CvPoint2D64f cc_new[2] = {}; newImgSize = newImgSize.width * newImgSize.height != 0 ? newImgSize : imageSize; const double ratio_x = (double)newImgSize.width / imageSize.width / 2; @@ -2375,8 +2375,8 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, { const CvMat* A = k == 0 ? _cameraMatrix1 : _cameraMatrix2; const CvMat* Dk = k == 0 ? 
_distCoeffs1 : _distCoeffs2; - CvPoint2D32f _pts[4]; - CvPoint3D32f _pts_3[4]; + CvPoint2D32f _pts[4] = {}; + CvPoint3D32f _pts_3[4] = {}; CvMat pts = cvMat(1, 4, CV_32FC2, _pts); CvMat pts_3 = cvMat(1, 4, CV_32FC3, _pts_3); @@ -2485,18 +2485,22 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2, if(roi1) { - *roi1 = cv::Rect(cvCeil((inner1.x - cx1_0)*s + cx1), + *roi1 = cvRect( + cv::Rect(cvCeil((inner1.x - cx1_0)*s + cx1), cvCeil((inner1.y - cy1_0)*s + cy1), cvFloor(inner1.width*s), cvFloor(inner1.height*s)) - & cv::Rect(0, 0, newImgSize.width, newImgSize.height); + & cv::Rect(0, 0, newImgSize.width, newImgSize.height) + ); } if(roi2) { - *roi2 = cv::Rect(cvCeil((inner2.x - cx2_0)*s + cx2), + *roi2 = cvRect( + cv::Rect(cvCeil((inner2.x - cx2_0)*s + cx2), cvCeil((inner2.y - cy2_0)*s + cy2), cvFloor(inner2.width*s), cvFloor(inner2.height*s)) - & cv::Rect(0, 0, newImgSize.width, newImgSize.height); + & cv::Rect(0, 0, newImgSize.width, newImgSize.height) + ); } } @@ -2557,7 +2561,7 @@ void cvGetOptimalNewCameraMatrix( const CvMat* cameraMatrix, const CvMat* distCo (float)(inner.height*s)); cv::Rect r(cvCeil(inner.x), cvCeil(inner.y), cvFloor(inner.width), cvFloor(inner.height)); r &= cv::Rect(0, 0, newImgSize.width, newImgSize.height); - *validPixROI = r; + *validPixROI = cvRect(r); } } else @@ -2589,7 +2593,7 @@ void cvGetOptimalNewCameraMatrix( const CvMat* cameraMatrix, const CvMat* distCo icvGetRectangles( cameraMatrix, distCoeffs, 0, &matM, imgSize, inner, outer ); cv::Rect r = inner; r &= cv::Rect(0, 0, newImgSize.width, newImgSize.height); - *validPixROI = r; + *validPixROI = cvRect(r); } } @@ -3162,30 +3166,29 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints, Point3f* objPtData = objPtMat.ptr(); Point2f* imgPtData1 = imgPtMat1.ptr(); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif for( i = 0; i < nimages; i++, j += ni ) { Mat 
objpt = objectPoints.getMat(i); Mat imgpt1 = imagePoints1.getMat(i); ni = objpt.checkVector(3, CV_32F); npoints.at(i) = ni; - memcpy( objPtData + j, objpt.ptr(), ni*sizeof(objPtData[0]) ); - memcpy( imgPtData1 + j, imgpt1.ptr(), ni*sizeof(imgPtData1[0]) ); + for (int n = 0; n < ni; ++n) + { + objPtData[j + n] = objpt.ptr()[n]; + imgPtData1[j + n] = imgpt1.ptr()[n]; + } if( imgPtData2 ) { Mat imgpt2 = imagePoints2.getMat(i); int ni2 = imgpt2.checkVector(2, CV_32F); CV_Assert( ni == ni2 ); - memcpy( imgPtData2 + j, imgpt2.ptr(), ni*sizeof(imgPtData2[0]) ); + for (int n = 0; n < ni2; ++n) + { + imgPtData2[j + n] = imgpt2.ptr()[n]; + } } } -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop -#endif } static Mat prepareCameraMatrix(Mat& cameraMatrix0, int rtype) @@ -3228,11 +3231,11 @@ void cv::Rodrigues(InputArray _src, OutputArray _dst, OutputArray _jacobian) bool v2m = src.cols == 1 || src.rows == 1; _dst.create(3, v2m ? 3 : 1, src.depth()); Mat dst = _dst.getMat(); - CvMat _csrc = src, _cdst = dst, _cjacobian; + CvMat _csrc = cvMat(src), _cdst = cvMat(dst), _cjacobian; if( _jacobian.needed() ) { _jacobian.create(v2m ? Size(9, 3) : Size(3, 9), src.depth()); - _cjacobian = _jacobian.getMat(); + _cjacobian = cvMat(_jacobian.getMat()); } bool ok = cvRodrigues2(&_csrc, &_cdst, _jacobian.needed() ? 
&_cjacobian : 0) > 0; if( !ok ) @@ -3247,7 +3250,8 @@ void cv::matMulDeriv( InputArray _Amat, InputArray _Bmat, Mat A = _Amat.getMat(), B = _Bmat.getMat(); _dABdA.create(A.rows*B.cols, A.rows*A.cols, A.type()); _dABdB.create(A.rows*B.cols, B.rows*B.cols, A.type()); - CvMat matA = A, matB = B, c_dABdA = _dABdA.getMat(), c_dABdB = _dABdB.getMat(); + Mat dABdA = _dABdA.getMat(), dABdB = _dABdB.getMat(); + CvMat matA = cvMat(A), matB = cvMat(B), c_dABdA = cvMat(dABdA), c_dABdB = cvMat(dABdB); cvCalcMatMulDeriv(&matA, &matB, &c_dABdA, &c_dABdB); } @@ -3267,8 +3271,8 @@ void cv::composeRT( InputArray _rvec1, InputArray _tvec1, _tvec3.create(tvec1.size(), rtype); Mat rvec3 = _rvec3.getMat(), tvec3 = _tvec3.getMat(); - CvMat c_rvec1 = rvec1, c_tvec1 = tvec1, c_rvec2 = rvec2, - c_tvec2 = tvec2, c_rvec3 = rvec3, c_tvec3 = tvec3; + CvMat c_rvec1 = cvMat(rvec1), c_tvec1 = cvMat(tvec1), c_rvec2 = cvMat(rvec2), + c_tvec2 = cvMat(tvec2), c_rvec3 = cvMat(rvec3), c_tvec3 = cvMat(tvec3); CvMat c_dr3dr1, c_dr3dt1, c_dr3dr2, c_dr3dt2, c_dt3dr1, c_dt3dt1, c_dt3dr2, c_dt3dt2; CvMat *p_dr3dr1=0, *p_dr3dt1=0, *p_dr3dr2=0, *p_dr3dt2=0, *p_dt3dr1=0, *p_dt3dt1=0, *p_dt3dr2=0, *p_dt3dt2=0; #define CV_COMPOSE_RT_PARAM(name) \ @@ -3277,7 +3281,7 @@ void cv::composeRT( InputArray _rvec1, InputArray _tvec1, { \ _ ## name.create(3, 3, rtype); \ name = _ ## name.getMat(); \ - p_ ## name = &(c_ ## name = name); \ + p_ ## name = &(c_ ## name = cvMat(name)); \ } CV_COMPOSE_RT_PARAM(dr3dr1); CV_COMPOSE_RT_PARAM(dr3dt1); @@ -3310,31 +3314,32 @@ void cv::projectPoints( InputArray _opoints, _ipoints.create(npoints, 1, CV_MAKETYPE(depth, 2), -1, true); Mat imagePoints = _ipoints.getMat(); - CvMat c_imagePoints(imagePoints); - CvMat c_objectPoints = opoints; + CvMat c_imagePoints = cvMat(imagePoints); + CvMat c_objectPoints = cvMat(opoints); Mat cameraMatrix = _cameraMatrix.getMat(); Mat rvec = _rvec.getMat(), tvec = _tvec.getMat(); - CvMat c_cameraMatrix = cameraMatrix; - CvMat c_rvec = rvec, c_tvec = 
tvec; + CvMat c_cameraMatrix = cvMat(cameraMatrix); + CvMat c_rvec = cvMat(rvec), c_tvec = cvMat(tvec); double dc0buf[5]={0}; Mat dc0(5,1,CV_64F,dc0buf); Mat distCoeffs = _distCoeffs.getMat(); if( distCoeffs.empty() ) distCoeffs = dc0; - CvMat c_distCoeffs = distCoeffs; + CvMat c_distCoeffs = cvMat(distCoeffs); int ndistCoeffs = distCoeffs.rows + distCoeffs.cols - 1; + Mat jacobian; if( _jacobian.needed() ) { _jacobian.create(npoints*2, 3+3+2+2+ndistCoeffs, CV_64F); - Mat jacobian = _jacobian.getMat(); - pdpdrot = &(dpdrot = jacobian.colRange(0, 3)); - pdpdt = &(dpdt = jacobian.colRange(3, 6)); - pdpdf = &(dpdf = jacobian.colRange(6, 8)); - pdpdc = &(dpdc = jacobian.colRange(8, 10)); - pdpddist = &(dpddist = jacobian.colRange(10, 10+ndistCoeffs)); + jacobian = _jacobian.getMat(); + pdpdrot = &(dpdrot = cvMat(jacobian.colRange(0, 3))); + pdpdt = &(dpdt = cvMat(jacobian.colRange(3, 6))); + pdpdf = &(dpdf = cvMat(jacobian.colRange(6, 8))); + pdpdc = &(dpdc = cvMat(jacobian.colRange(8, 10))); + pdpddist = &(dpddist = cvMat(jacobian.colRange(10, 10+ndistCoeffs))); } cvProjectPoints2( &c_objectPoints, &c_rvec, &c_tvec, &c_cameraMatrix, &c_distCoeffs, @@ -3350,9 +3355,9 @@ cv::Mat cv::initCameraMatrix2D( InputArrayOfArrays objectPoints, Mat objPt, imgPt, npoints, cameraMatrix(3, 3, CV_64F); collectCalibrationData( objectPoints, imagePoints, noArray(), objPt, imgPt, 0, npoints ); - CvMat _objPt = objPt, _imgPt = imgPt, _npoints = npoints, _cameraMatrix = cameraMatrix; + CvMat _objPt = cvMat(objPt), _imgPt = cvMat(imgPt), _npoints = cvMat(npoints), _cameraMatrix = cvMat(cameraMatrix); cvInitIntrinsicParams2D( &_objPt, &_imgPt, &_npoints, - imageSize, &_cameraMatrix, aspectRatio ); + cvSize(imageSize), &_cameraMatrix, aspectRatio ); return cameraMatrix; } @@ -3434,16 +3439,16 @@ double cv::calibrateCamera(InputArrayOfArrays _objectPoints, collectCalibrationData( _objectPoints, _imagePoints, noArray(), objPt, imgPt, 0, npoints ); - CvMat c_objPt = objPt, c_imgPt = imgPt, 
c_npoints = npoints; - CvMat c_cameraMatrix = cameraMatrix, c_distCoeffs = distCoeffs; - CvMat c_rvecM = rvecM, c_tvecM = tvecM, c_stdDev = stdDeviationsM, c_errors = errorsM; + CvMat c_objPt = cvMat(objPt), c_imgPt = cvMat(imgPt), c_npoints = cvMat(npoints); + CvMat c_cameraMatrix = cvMat(cameraMatrix), c_distCoeffs = cvMat(distCoeffs); + CvMat c_rvecM = cvMat(rvecM), c_tvecM = cvMat(tvecM), c_stdDev = cvMat(stdDeviationsM), c_errors = cvMat(errorsM); - double reprojErr = cvCalibrateCamera2Internal(&c_objPt, &c_imgPt, &c_npoints, imageSize, + double reprojErr = cvCalibrateCamera2Internal(&c_objPt, &c_imgPt, &c_npoints, cvSize(imageSize), &c_cameraMatrix, &c_distCoeffs, rvecs_needed ? &c_rvecM : NULL, tvecs_needed ? &c_tvecM : NULL, stddev_needed ? &c_stdDev : NULL, - errors_needed ? &c_errors : NULL, flags, criteria ); + errors_needed ? &c_errors : NULL, flags, cvTermCriteria(criteria)); if( stddev_needed ) { @@ -3582,35 +3587,40 @@ double cv::stereoCalibrate( InputArrayOfArrays _objectPoints, collectCalibrationData( _objectPoints, _imagePoints1, _imagePoints2, objPt, imgPt, &imgPt2, npoints ); - CvMat c_objPt = objPt, c_imgPt = imgPt, c_imgPt2 = imgPt2, c_npoints = npoints; - CvMat c_cameraMatrix1 = cameraMatrix1, c_distCoeffs1 = distCoeffs1; - CvMat c_cameraMatrix2 = cameraMatrix2, c_distCoeffs2 = distCoeffs2; - CvMat c_matR = _Rmat.getMat(), c_matT = _Tmat.getMat(), c_matE, c_matF, c_matErr; + CvMat c_objPt = cvMat(objPt), c_imgPt = cvMat(imgPt), c_imgPt2 = cvMat(imgPt2), c_npoints = cvMat(npoints); + CvMat c_cameraMatrix1 = cvMat(cameraMatrix1), c_distCoeffs1 = cvMat(distCoeffs1); + CvMat c_cameraMatrix2 = cvMat(cameraMatrix2), c_distCoeffs2 = cvMat(distCoeffs2); + Mat matR_ = _Rmat.getMat(), matT_ = _Tmat.getMat(); + CvMat c_matR = cvMat(matR_), c_matT = cvMat(matT_), c_matE, c_matF, c_matErr; bool E_needed = _Emat.needed(), F_needed = _Fmat.needed(), errors_needed = _perViewErrors.needed(); + Mat matE_, matF_, matErr_; if( E_needed ) { _Emat.create(3, 3, 
rtype); - c_matE = _Emat.getMat(); + matE_ = _Emat.getMat(); + c_matE = cvMat(matE_); } if( F_needed ) { _Fmat.create(3, 3, rtype); - c_matF = _Fmat.getMat(); + matF_ = _Fmat.getMat(); + c_matF = cvMat(matF_); } if( errors_needed ) { int nimages = int(_objectPoints.total()); _perViewErrors.create(nimages, 2, CV_64F); - c_matErr = _perViewErrors.getMat(); + matErr_ = _perViewErrors.getMat(); + c_matErr = cvMat(matErr_); } double err = cvStereoCalibrateImpl(&c_objPt, &c_imgPt, &c_imgPt2, &c_npoints, &c_cameraMatrix1, - &c_distCoeffs1, &c_cameraMatrix2, &c_distCoeffs2, imageSize, &c_matR, + &c_distCoeffs1, &c_cameraMatrix2, &c_distCoeffs2, cvSize(imageSize), &c_matR, &c_matT, E_needed ? &c_matE : NULL, F_needed ? &c_matF : NULL, - errors_needed ? &c_matErr : NULL, flags, criteria); + errors_needed ? &c_matErr : NULL, flags, cvTermCriteria(criteria)); cameraMatrix1.copyTo(_cameraMatrix1); cameraMatrix2.copyTo(_cameraMatrix2); @@ -3633,31 +3643,32 @@ void cv::stereoRectify( InputArray _cameraMatrix1, InputArray _distCoeffs1, Mat cameraMatrix1 = _cameraMatrix1.getMat(), cameraMatrix2 = _cameraMatrix2.getMat(); Mat distCoeffs1 = _distCoeffs1.getMat(), distCoeffs2 = _distCoeffs2.getMat(); Mat Rmat = _Rmat.getMat(), Tmat = _Tmat.getMat(); - CvMat c_cameraMatrix1 = cameraMatrix1; - CvMat c_cameraMatrix2 = cameraMatrix2; - CvMat c_distCoeffs1 = distCoeffs1; - CvMat c_distCoeffs2 = distCoeffs2; - CvMat c_R = Rmat, c_T = Tmat; + CvMat c_cameraMatrix1 = cvMat(cameraMatrix1); + CvMat c_cameraMatrix2 = cvMat(cameraMatrix2); + CvMat c_distCoeffs1 = cvMat(distCoeffs1); + CvMat c_distCoeffs2 = cvMat(distCoeffs2); + CvMat c_R = cvMat(Rmat), c_T = cvMat(Tmat); int rtype = CV_64F; _Rmat1.create(3, 3, rtype); _Rmat2.create(3, 3, rtype); _Pmat1.create(3, 4, rtype); _Pmat2.create(3, 4, rtype); - CvMat c_R1 = _Rmat1.getMat(), c_R2 = _Rmat2.getMat(), c_P1 = _Pmat1.getMat(), c_P2 = _Pmat2.getMat(); + Mat R1 = _Rmat1.getMat(), R2 = _Rmat2.getMat(), P1 = _Pmat1.getMat(), P2 = _Pmat2.getMat(), 
Q; + CvMat c_R1 = cvMat(R1), c_R2 = cvMat(R2), c_P1 = cvMat(P1), c_P2 = cvMat(P2); CvMat c_Q, *p_Q = 0; if( _Qmat.needed() ) { _Qmat.create(4, 4, rtype); - p_Q = &(c_Q = _Qmat.getMat()); + p_Q = &(c_Q = cvMat(Q = _Qmat.getMat())); } CvMat *p_distCoeffs1 = distCoeffs1.empty() ? NULL : &c_distCoeffs1; CvMat *p_distCoeffs2 = distCoeffs2.empty() ? NULL : &c_distCoeffs2; cvStereoRectify( &c_cameraMatrix1, &c_cameraMatrix2, p_distCoeffs1, p_distCoeffs2, - imageSize, &c_R, &c_T, &c_R1, &c_R2, &c_P1, &c_P2, p_Q, flags, alpha, - newImageSize, (CvRect*)validPixROI1, (CvRect*)validPixROI2); + cvSize(imageSize), &c_R, &c_T, &c_R1, &c_R2, &c_P1, &c_P2, p_Q, flags, alpha, + cvSize(newImageSize), (CvRect*)validPixROI1, (CvRect*)validPixROI2); } bool cv::stereoRectifyUncalibrated( InputArray _points1, InputArray _points2, @@ -3671,11 +3682,12 @@ bool cv::stereoRectifyUncalibrated( InputArray _points1, InputArray _points2, _Hmat2.create(3, 3, rtype); Mat F = _Fmat.getMat(); Mat points1 = _points1.getMat(), points2 = _points2.getMat(); - CvMat c_pt1 = points1, c_pt2 = points2; - CvMat c_F, *p_F=0, c_H1 = _Hmat1.getMat(), c_H2 = _Hmat2.getMat(); + CvMat c_pt1 = cvMat(points1), c_pt2 = cvMat(points2); + Mat H1 = _Hmat1.getMat(), H2 = _Hmat2.getMat(); + CvMat c_F, *p_F=0, c_H1 = cvMat(H1), c_H2 = cvMat(H2); if( F.size() == Size(3, 3) ) - p_F = &(c_F = F); - return cvStereoRectifyUncalibrated(&c_pt1, &c_pt2, p_F, imgSize, &c_H1, &c_H2, threshold) > 0; + p_F = &(c_F = cvMat(F)); + return cvStereoRectifyUncalibrated(&c_pt1, &c_pt2, p_F, cvSize(imgSize), &c_H1, &c_H2, threshold) > 0; } cv::Mat cv::getOptimalNewCameraMatrix( InputArray _cameraMatrix, @@ -3686,14 +3698,14 @@ cv::Mat cv::getOptimalNewCameraMatrix( InputArray _cameraMatrix, CV_INSTRUMENT_REGION() Mat cameraMatrix = _cameraMatrix.getMat(), distCoeffs = _distCoeffs.getMat(); - CvMat c_cameraMatrix = cameraMatrix, c_distCoeffs = distCoeffs; + CvMat c_cameraMatrix = cvMat(cameraMatrix), c_distCoeffs = cvMat(distCoeffs); Mat 
newCameraMatrix(3, 3, CV_MAT_TYPE(c_cameraMatrix.type)); - CvMat c_newCameraMatrix = newCameraMatrix; + CvMat c_newCameraMatrix = cvMat(newCameraMatrix); - cvGetOptimalNewCameraMatrix(&c_cameraMatrix, &c_distCoeffs, imgSize, + cvGetOptimalNewCameraMatrix(&c_cameraMatrix, &c_distCoeffs, cvSize(imgSize), alpha, &c_newCameraMatrix, - newImgSize, (CvRect*)validPixROI, (int)centerPrincipalPoint); + cvSize(newImgSize), (CvRect*)validPixROI, (int)centerPrincipalPoint); return newCameraMatrix; } @@ -3714,7 +3726,7 @@ cv::Vec3d cv::RQDecomp3x3( InputArray _Mmat, Mat Qmat = _Qmat.getMat(); Vec3d eulerAngles; - CvMat matM = M, matR = Rmat, matQ = Qmat; + CvMat matM = cvMat(M), matR = cvMat(Rmat), matQ = cvMat(Qmat); #define CV_RQDecomp3x3_PARAM(name) \ Mat name; \ CvMat c_ ## name, *p ## name = NULL; \ @@ -3722,7 +3734,7 @@ cv::Vec3d cv::RQDecomp3x3( InputArray _Mmat, { \ _ ## name.create(3, 3, M.type()); \ name = _ ## name.getMat(); \ - c_ ## name = name; p ## name = &c_ ## name; \ + c_ ## name = cvMat(name); p ## name = &c_ ## name; \ } CV_RQDecomp3x3_PARAM(Qx); @@ -3749,8 +3761,8 @@ void cv::decomposeProjectionMatrix( InputArray _projMatrix, OutputArray _cameraM Mat cameraMatrix = _cameraMatrix.getMat(); Mat rotMatrix = _rotMatrix.getMat(); Mat transVect = _transVect.getMat(); - CvMat c_projMatrix = projMatrix, c_cameraMatrix = cameraMatrix; - CvMat c_rotMatrix = rotMatrix, c_transVect = transVect; + CvMat c_projMatrix = cvMat(projMatrix), c_cameraMatrix = cvMat(cameraMatrix); + CvMat c_rotMatrix = cvMat(rotMatrix), c_transVect = cvMat(transVect); CvPoint3D64f *p_eulerAngles = 0; #define CV_decomposeProjectionMatrix_PARAM(name) \ @@ -3760,7 +3772,7 @@ void cv::decomposeProjectionMatrix( InputArray _projMatrix, OutputArray _cameraM { \ _ ## name.create(3, 3, type); \ name = _ ## name.getMat(); \ - c_ ## name = name; p_ ## name = &c_ ## name; \ + c_ ## name = cvMat(name); p_ ## name = &c_ ## name; \ } CV_decomposeProjectionMatrix_PARAM(rotMatrixX); diff --git 
a/modules/calib3d/src/compat_stereo.cpp b/modules/calib3d/src/compat_stereo.cpp index c00adeeb87..69e22a2d71 100644 --- a/modules/calib3d/src/compat_stereo.cpp +++ b/modules/calib3d/src/compat_stereo.cpp @@ -111,8 +111,8 @@ void cvFindStereoCorrespondenceBM( const CvArr* leftarr, const CvArr* rightarr, CvRect cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity, int numberOfDisparities, int SADWindowSize ) { - return (CvRect)cv::getValidDisparityROI( roi1, roi2, minDisparity, - numberOfDisparities, SADWindowSize ); + return cvRect(cv::getValidDisparityROI( roi1, roi2, minDisparity, + numberOfDisparities, SADWindowSize)); } void cvValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity, diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp index 78426210b9..e783d58c31 100644 --- a/modules/calib3d/src/solvepnp.cpp +++ b/modules/calib3d/src/solvepnp.cpp @@ -134,9 +134,9 @@ bool solvePnP( InputArray _opoints, InputArray _ipoints, } else if (flags == SOLVEPNP_ITERATIVE) { - CvMat c_objectPoints = opoints, c_imagePoints = ipoints; - CvMat c_cameraMatrix = cameraMatrix, c_distCoeffs = distCoeffs; - CvMat c_rvec = rvec, c_tvec = tvec; + CvMat c_objectPoints = cvMat(opoints), c_imagePoints = cvMat(ipoints); + CvMat c_cameraMatrix = cvMat(cameraMatrix), c_distCoeffs = cvMat(distCoeffs); + CvMat c_rvec = cvMat(rvec), c_tvec = cvMat(tvec); cvFindExtrinsicCameraParams2(&c_objectPoints, &c_imagePoints, &c_cameraMatrix, (c_distCoeffs.rows && c_distCoeffs.cols) ? 
&c_distCoeffs : 0, &c_rvec, &c_tvec, useExtrinsicGuess ); diff --git a/modules/calib3d/src/triangulate.cpp b/modules/calib3d/src/triangulate.cpp index 959c244e9d..7af40aa582 100644 --- a/modules/calib3d/src/triangulate.cpp +++ b/modules/calib3d/src/triangulate.cpp @@ -358,11 +358,12 @@ void cv::triangulatePoints( InputArray _projMatr1, InputArray _projMatr2, if((points2.rows == 1 || points2.cols == 1) && points2.channels() == 2) points2 = points2.reshape(1, static_cast(points2.total())).t(); - CvMat cvMatr1 = matr1, cvMatr2 = matr2; - CvMat cvPoints1 = points1, cvPoints2 = points2; + CvMat cvMatr1 = cvMat(matr1), cvMatr2 = cvMat(matr2); + CvMat cvPoints1 = cvMat(points1), cvPoints2 = cvMat(points2); _points4D.create(4, points1.cols, points1.type()); - CvMat cvPoints4D = _points4D.getMat(); + Mat cvPoints4D_ = _points4D.getMat(); + CvMat cvPoints4D = cvMat(cvPoints4D_); cvTriangulatePoints(&cvMatr1, &cvMatr2, &cvPoints1, &cvPoints2, &cvPoints4D); } @@ -375,12 +376,13 @@ void cv::correctMatches( InputArray _F, InputArray _points1, InputArray _points2 Mat F = _F.getMat(); Mat points1 = _points1.getMat(), points2 = _points2.getMat(); - CvMat cvPoints1 = points1, cvPoints2 = points2; - CvMat cvF = F; + CvMat cvPoints1 = cvMat(points1), cvPoints2 = cvMat(points2); + CvMat cvF = cvMat(F); _newPoints1.create(points1.size(), points1.type()); _newPoints2.create(points2.size(), points2.type()); - CvMat cvNewPoints1 = _newPoints1.getMat(), cvNewPoints2 = _newPoints2.getMat(); + Mat cvNewPoints1_ = _newPoints1.getMat(), cvNewPoints2_ = _newPoints2.getMat(); + CvMat cvNewPoints1 = cvMat(cvNewPoints1_), cvNewPoints2 = cvMat(cvNewPoints2_); cvCorrectMatches(&cvF, &cvPoints1, &cvPoints2, &cvNewPoints1, &cvNewPoints2); } diff --git a/modules/calib3d/test/test_affine3.cpp b/modules/calib3d/test/test_affine3.cpp index b0efecaa9f..a69978b1e5 100644 --- a/modules/calib3d/test/test_affine3.cpp +++ b/modules/calib3d/test/test_affine3.cpp @@ -47,16 +47,15 @@ namespace opencv_test { 
namespace { TEST(Calib3d_Affine3f, accuracy) { + const double eps = 1e-5; cv::Vec3d rvec(0.2, 0.5, 0.3); cv::Affine3d affine(rvec); cv::Mat expected; cv::Rodrigues(rvec, expected); - - ASSERT_EQ(0, cvtest::norm(cv::Mat(affine.matrix, false).colRange(0, 3).rowRange(0, 3) != expected, cv::NORM_L2)); - ASSERT_EQ(0, cvtest::norm(cv::Mat(affine.linear()) != expected, cv::NORM_L2)); - + ASSERT_LE(cvtest::norm(cv::Mat(affine.matrix, false).colRange(0, 3).rowRange(0, 3), expected, cv::NORM_L2), eps); + ASSERT_LE(cvtest::norm(cv::Mat(affine.linear()), expected, cv::NORM_L2), eps); cv::Matx33d R = cv::Matx33d::eye(); diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp index d8f6cd2e03..f20edfea27 100644 --- a/modules/calib3d/test/test_cameracalibration.cpp +++ b/modules/calib3d/test/test_cameracalibration.cpp @@ -290,8 +290,8 @@ void CV_CameraCalibrationTest::run( int start_from ) cv::String filepath; cv::String filename; - CvSize imageSize; - CvSize etalonSize; + Size imageSize; + Size etalonSize; int numImages; CvPoint2D64f* imagePoints; @@ -531,7 +531,7 @@ void CV_CameraCalibrationTest::run( int start_from ) /* Now we can calibrate camera */ calibrate( numImages, numbers, - imageSize, + cvSize(imageSize), imagePoints, objectPoints, distortion, @@ -1009,9 +1009,9 @@ void CV_CalibrationMatrixValuesTest_C::calibMatrixValues( const Mat& _cameraMatr double& fovx, double& fovy, double& focalLength, Point2d& principalPoint, double& aspectRatio ) { - CvMat cameraMatrix = _cameraMatrix; - CvPoint2D64f pp; - cvCalibrationMatrixValues( &cameraMatrix, imageSize, apertureWidth, apertureHeight, + CvMat cameraMatrix = cvMat(_cameraMatrix); + CvPoint2D64f pp = {0, 0}; + cvCalibrationMatrixValues( &cameraMatrix, cvSize(imageSize), apertureWidth, apertureHeight, &fovx, &fovy, &focalLength, &pp, &aspectRatio ); principalPoint.x = pp.x; principalPoint.y = pp.y; @@ -1305,9 +1305,9 @@ void CV_ProjectPointsTest_C::project( const Mat& 
opoints, const Mat& rvec, const dpdc.create(npoints*2, 2, CV_64F); dpddist.create(npoints*2, distCoeffs.rows + distCoeffs.cols - 1, CV_64F); Mat imagePoints(ipoints); - CvMat _objectPoints = opoints, _imagePoints = imagePoints; - CvMat _rvec = rvec, _tvec = tvec, _cameraMatrix = cameraMatrix, _distCoeffs = distCoeffs; - CvMat _dpdrot = dpdrot, _dpdt = dpdt, _dpdf = dpdf, _dpdc = dpdc, _dpddist = dpddist; + CvMat _objectPoints = cvMat(opoints), _imagePoints = cvMat(imagePoints); + CvMat _rvec = cvMat(rvec), _tvec = cvMat(tvec), _cameraMatrix = cvMat(cameraMatrix), _distCoeffs = cvMat(distCoeffs); + CvMat _dpdrot = cvMat(dpdrot), _dpdt = cvMat(dpdt), _dpdf = cvMat(dpdf), _dpdc = cvMat(dpdc), _dpddist = cvMat(dpddist); cvProjectPoints2( &_objectPoints, &_rvec, &_tvec, &_cameraMatrix, &_distCoeffs, &_imagePoints, &_dpdrot, &_dpdt, &_dpdf, &_dpdc, &_dpddist, aspectRatio ); @@ -1925,14 +1925,14 @@ double CV_StereoCalibrationTest_C::calibrateStereoCamera( const vector 0; + pF = &(matF = cvMat(F)); + return cvStereoRectifyUncalibrated(&_pt1, &_pt2, pF, cvSize(imgSize), &_H1, &_H2, threshold) > 0; } void CV_StereoCalibrationTest_C::triangulate( const Mat& P1, const Mat& P2, const Mat &points1, const Mat &points2, Mat &points4D ) { - CvMat _P1 = P1, _P2 = P2, _points1 = points1, _points2 = points2; + CvMat _P1 = cvMat(P1), _P2 = cvMat(P2), _points1 = cvMat(points1), _points2 = cvMat(points2); points4D.create(4, points1.cols, points1.type()); - CvMat _points4D = points4D; + CvMat _points4D = cvMat(points4D); cvTriangulatePoints(&_P1, &_P2, &_points1, &_points2, &_points4D); } @@ -1981,10 +1981,10 @@ void CV_StereoCalibrationTest_C::correct( const Mat& F, const Mat &points1, const Mat &points2, Mat &newPoints1, Mat &newPoints2 ) { - CvMat _F = F, _points1 = points1, _points2 = points2; + CvMat _F = cvMat(F), _points1 = cvMat(points1), _points2 = cvMat(points2); newPoints1.create(1, points1.cols, points1.type()); newPoints2.create(1, points2.cols, points2.type()); - CvMat 
_newPoints1 = newPoints1, _newPoints2 = newPoints2; + CvMat _newPoints1 = cvMat(newPoints1), _newPoints2 = cvMat(newPoints2); cvCorrectMatches(&_F, &_points1, &_points2, &_newPoints1, &_newPoints2); } diff --git a/modules/calib3d/test/test_cameracalibration_badarg.cpp b/modules/calib3d/test/test_cameracalibration_badarg.cpp index b63d4b4cfd..c367432f40 100644 --- a/modules/calib3d/test/test_cameracalibration_badarg.cpp +++ b/modules/calib3d/test/test_cameracalibration_badarg.cpp @@ -75,7 +75,7 @@ protected: void operator()() const { - cvCalibrateCamera2(objPts, imgPts, npoints, imageSize, + cvCalibrateCamera2(objPts, imgPts, npoints, cvSize(imageSize), cameraMatrix, distCoeffs, rvecs, tvecs, flags ); } }; @@ -137,13 +137,13 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) //CV_CALIB_FIX_PRINCIPAL_POINT //CV_CALIB_ZERO_TANGENT_DIST //CV_CALIB_FIX_FOCAL_LENGTH //CV_CALIB_FIX_K1 //CV_CALIB_FIX_K2 //CV_CALIB_FIX_K3 - objPts = objPts_cpp; - imgPts = imgPts_cpp; - npoints = npoints_cpp; - cameraMatrix = cameraMatrix_cpp; - distCoeffs = distCoeffs_cpp; - rvecs = rvecs_cpp; - tvecs = tvecs_cpp; + objPts = cvMat(objPts_cpp); + imgPts = cvMat(imgPts_cpp); + npoints = cvMat(npoints_cpp); + cameraMatrix = cvMat(cameraMatrix_cpp); + distCoeffs = cvMat(distCoeffs_cpp); + rvecs = cvMat(rvecs_cpp); + tvecs = cvMat(tvecs_cpp); /* /*//*/ */ int errors = 0; @@ -178,8 +178,8 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) Mat bad_nts_cpp1 = Mat_(M, 1, 1.f); Mat bad_nts_cpp2 = Mat_(3, 3, corSize.width * corSize.height); - CvMat bad_npts_c1 = bad_nts_cpp1; - CvMat bad_npts_c2 = bad_nts_cpp2; + CvMat bad_npts_c1 = cvMat(bad_nts_cpp1); + CvMat bad_npts_c2 = cvMat(bad_nts_cpp2); bad_caller = caller; bad_caller.npoints = &bad_npts_c1; @@ -197,13 +197,13 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) bad_caller.tvecs = (CvMat*)zeros.ptr(); errors += run_test_case( CV_StsBadArg, "Bad tvecs header", bad_caller ); - Mat 
bad_rvecs_cpp1(M+1, 1, CV_32FC3); CvMat bad_rvecs_c1 = bad_rvecs_cpp1; - Mat bad_tvecs_cpp1(M+1, 1, CV_32FC3); CvMat bad_tvecs_c1 = bad_tvecs_cpp1; + Mat bad_rvecs_cpp1(M+1, 1, CV_32FC3); CvMat bad_rvecs_c1 = cvMat(bad_rvecs_cpp1); + Mat bad_tvecs_cpp1(M+1, 1, CV_32FC3); CvMat bad_tvecs_c1 = cvMat(bad_tvecs_cpp1); - Mat bad_rvecs_cpp2(M, 2, CV_32FC3); CvMat bad_rvecs_c2 = bad_rvecs_cpp2; - Mat bad_tvecs_cpp2(M, 2, CV_32FC3); CvMat bad_tvecs_c2 = bad_tvecs_cpp2; + Mat bad_rvecs_cpp2(M, 2, CV_32FC3); CvMat bad_rvecs_c2 = cvMat(bad_rvecs_cpp2); + Mat bad_tvecs_cpp2(M, 2, CV_32FC3); CvMat bad_tvecs_c2 = cvMat(bad_tvecs_cpp2); bad_caller = caller; bad_caller.rvecs = &bad_rvecs_c1; @@ -221,9 +221,9 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) bad_caller.tvecs = &bad_tvecs_c2; errors += run_test_case( CV_StsBadArg, "Bad tvecs header", bad_caller ); - Mat bad_cameraMatrix_cpp1(3, 3, CV_32S); CvMat bad_cameraMatrix_c1 = bad_cameraMatrix_cpp1; - Mat bad_cameraMatrix_cpp2(2, 3, CV_32F); CvMat bad_cameraMatrix_c2 = bad_cameraMatrix_cpp2; - Mat bad_cameraMatrix_cpp3(3, 2, CV_64F); CvMat bad_cameraMatrix_c3 = bad_cameraMatrix_cpp3; + Mat bad_cameraMatrix_cpp1(3, 3, CV_32S); CvMat bad_cameraMatrix_c1 = cvMat(bad_cameraMatrix_cpp1); + Mat bad_cameraMatrix_cpp2(2, 3, CV_32F); CvMat bad_cameraMatrix_c2 = cvMat(bad_cameraMatrix_cpp2); + Mat bad_cameraMatrix_cpp3(3, 2, CV_64F); CvMat bad_cameraMatrix_c3 = cvMat(bad_cameraMatrix_cpp3); @@ -239,9 +239,9 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) bad_caller.cameraMatrix = &bad_cameraMatrix_c3; errors += run_test_case( CV_StsBadArg, "Bad camearaMatrix header", bad_caller ); - Mat bad_distCoeffs_cpp1(1, 5, CV_32S); CvMat bad_distCoeffs_c1 = bad_distCoeffs_cpp1; - Mat bad_distCoeffs_cpp2(2, 2, CV_64F); CvMat bad_distCoeffs_c2 = bad_distCoeffs_cpp2; - Mat bad_distCoeffs_cpp3(1, 6, CV_64F); CvMat bad_distCoeffs_c3 = bad_distCoeffs_cpp3; + Mat bad_distCoeffs_cpp1(1, 5, CV_32S); CvMat 
bad_distCoeffs_c1 = cvMat(bad_distCoeffs_cpp1); + Mat bad_distCoeffs_cpp2(2, 2, CV_64F); CvMat bad_distCoeffs_c2 = cvMat(bad_distCoeffs_cpp2); + Mat bad_distCoeffs_cpp3(1, 6, CV_64F); CvMat bad_distCoeffs_c3 = cvMat(bad_distCoeffs_cpp3); @@ -259,7 +259,7 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) errors += run_test_case( CV_StsBadArg, "Bad distCoeffs header", bad_caller ); double CM[] = {0, 0, 0, /**/0, 0, 0, /**/0, 0, 0}; - Mat bad_cameraMatrix_cpp4(3, 3, CV_64F, CM); CvMat bad_cameraMatrix_c4 = bad_cameraMatrix_cpp4; + Mat bad_cameraMatrix_cpp4(3, 3, CV_64F, CM); CvMat bad_cameraMatrix_c4 = cvMat(bad_cameraMatrix_cpp4); bad_caller = caller; bad_caller.flags |= CV_CALIB_USE_INTRINSIC_GUESS; @@ -302,7 +302,7 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ ) ///////////////////////////////////////////////////////////////////////////////////// bad_caller = caller; - Mat bad_objPts_cpp5 = objPts_cpp.clone(); CvMat bad_objPts_c5 = bad_objPts_cpp5; + Mat bad_objPts_cpp5 = objPts_cpp.clone(); CvMat bad_objPts_c5 = cvMat(bad_objPts_cpp5); bad_caller.objPts = &bad_objPts_c5; cv::RNG& rng = theRNG(); @@ -347,9 +347,9 @@ protected: Mat zeros(1, sizeof(CvMat), CV_8U, Scalar(0)); CvMat src_c, dst_c, jacobian_c; - Mat src_cpp(3, 1, CV_32F); src_c = src_cpp; - Mat dst_cpp(3, 3, CV_32F); dst_c = dst_cpp; - Mat jacobian_cpp(3, 9, CV_32F); jacobian_c = jacobian_cpp; + Mat src_cpp(3, 1, CV_32F); src_c = cvMat(src_cpp); + Mat dst_cpp(3, 3, CV_32F); dst_c = cvMat(dst_cpp); + Mat jacobian_cpp(3, 9, CV_32F); jacobian_c = cvMat(jacobian_cpp); C_Caller caller, bad_caller; caller.src = &src_c; @@ -373,11 +373,11 @@ protected: bad_caller.dst = 0; errors += run_test_case( CV_StsNullPtr, "Dst is zero pointer", bad_caller ); - Mat bad_src_cpp1(3, 1, CV_8U); CvMat bad_src_c1 = bad_src_cpp1; - Mat bad_dst_cpp1(3, 1, CV_8U); CvMat bad_dst_c1 = bad_dst_cpp1; - Mat bad_jac_cpp1(3, 1, CV_8U); CvMat bad_jac_c1 = bad_jac_cpp1; - Mat bad_jac_cpp2(3, 1, 
CV_32FC2); CvMat bad_jac_c2 = bad_jac_cpp2; - Mat bad_jac_cpp3(3, 1, CV_32F); CvMat bad_jac_c3 = bad_jac_cpp3; + Mat bad_src_cpp1(3, 1, CV_8U); CvMat bad_src_c1 = cvMat(bad_src_cpp1); + Mat bad_dst_cpp1(3, 1, CV_8U); CvMat bad_dst_c1 = cvMat(bad_dst_cpp1); + Mat bad_jac_cpp1(3, 1, CV_8U); CvMat bad_jac_c1 = cvMat(bad_jac_cpp1); + Mat bad_jac_cpp2(3, 1, CV_32FC2); CvMat bad_jac_c2 = cvMat(bad_jac_cpp2); + Mat bad_jac_cpp3(3, 1, CV_32F); CvMat bad_jac_c3 = cvMat(bad_jac_cpp3); bad_caller = caller; bad_caller.src = &bad_src_c1; @@ -403,15 +403,15 @@ protected: bad_caller.jacobian = &bad_jac_c3; errors += run_test_case( CV_StsBadSize, "Bad jacobian format", bad_caller ); - Mat bad_src_cpp2(1, 1, CV_32F); CvMat bad_src_c2 = bad_src_cpp2; + Mat bad_src_cpp2(1, 1, CV_32F); CvMat bad_src_c2 = cvMat(bad_src_cpp2); bad_caller = caller; bad_caller.src = &bad_src_c2; errors += run_test_case( CV_StsBadSize, "Bad src format", bad_caller ); - Mat bad_dst_cpp2(2, 1, CV_32F); CvMat bad_dst_c2 = bad_dst_cpp2; - Mat bad_dst_cpp3(3, 2, CV_32F); CvMat bad_dst_c3 = bad_dst_cpp3; - Mat bad_dst_cpp4(3, 3, CV_32FC2); CvMat bad_dst_c4 = bad_dst_cpp4; + Mat bad_dst_cpp2(2, 1, CV_32F); CvMat bad_dst_c2 = cvMat(bad_dst_cpp2); + Mat bad_dst_cpp3(3, 2, CV_32F); CvMat bad_dst_c3 = cvMat(bad_dst_cpp3); + Mat bad_dst_cpp4(3, 3, CV_32FC2); CvMat bad_dst_c4 = cvMat(bad_dst_cpp4); bad_caller = caller; bad_caller.dst = &bad_dst_c2; @@ -427,11 +427,11 @@ protected: /********/ - src_cpp.create(3, 3, CV_32F); src_c = src_cpp; - dst_cpp.create(3, 1, CV_32F); dst_c = dst_cpp; + src_cpp.create(3, 3, CV_32F); src_c = cvMat(src_cpp); + dst_cpp.create(3, 1, CV_32F); dst_c = cvMat(dst_cpp); - Mat bad_dst_cpp5(5, 5, CV_32F); CvMat bad_dst_c5 = bad_dst_cpp5; + Mat bad_dst_cpp5(5, 5, CV_32F); CvMat bad_dst_c5 = cvMat(bad_dst_cpp5); bad_caller = caller; bad_caller.dst = &bad_dst_c5; @@ -488,15 +488,7 @@ protected: void run(int /* start_from */ ) { - CvMat zeros; -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC 
diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif - memset(&zeros, 0, sizeof(zeros)); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop -#endif + CvMat zeros = CvMat(); C_Caller caller, bad_caller; CvMat objectPoints_c, r_vec_c, t_vec_c, A_c, distCoeffs_c, imagePoints_c, @@ -504,24 +496,24 @@ protected: const int n = 10; - Mat imagePoints_cpp(1, n, CV_32FC2); imagePoints_c = imagePoints_cpp; + Mat imagePoints_cpp(1, n, CV_32FC2); imagePoints_c = cvMat(imagePoints_cpp); Mat objectPoints_cpp(1, n, CV_32FC3); randu(objectPoints_cpp, Scalar::all(1), Scalar::all(10)); - objectPoints_c = objectPoints_cpp; + objectPoints_c = cvMat(objectPoints_cpp); - Mat t_vec_cpp(Mat::zeros(1, 3, CV_32F)); t_vec_c = t_vec_cpp; + Mat t_vec_cpp(Mat::zeros(1, 3, CV_32F)); t_vec_c = cvMat(t_vec_cpp); Mat r_vec_cpp(3, 1, CV_32F); - cvtest::Rodrigues(Mat::eye(3, 3, CV_32F), r_vec_cpp); r_vec_c = r_vec_cpp; + cvtest::Rodrigues(Mat::eye(3, 3, CV_32F), r_vec_cpp); r_vec_c = cvMat(r_vec_cpp); - Mat A_cpp = camMat.clone(); A_c = A_cpp; - Mat distCoeffs_cpp = distCoeffs.clone(); distCoeffs_c = distCoeffs_cpp; + Mat A_cpp = camMat.clone(); A_c = cvMat(A_cpp); + Mat distCoeffs_cpp = distCoeffs.clone(); distCoeffs_c = cvMat(distCoeffs_cpp); - Mat dpdr_cpp(2*n, 3, CV_32F); dpdr_c = dpdr_cpp; - Mat dpdt_cpp(2*n, 3, CV_32F); dpdt_c = dpdt_cpp; - Mat dpdf_cpp(2*n, 2, CV_32F); dpdf_c = dpdf_cpp; - Mat dpdc_cpp(2*n, 2, CV_32F); dpdc_c = dpdc_cpp; - Mat dpdk_cpp(2*n, 4, CV_32F); dpdk_c = dpdk_cpp; + Mat dpdr_cpp(2*n, 3, CV_32F); dpdr_c = cvMat(dpdr_cpp); + Mat dpdt_cpp(2*n, 3, CV_32F); dpdt_c = cvMat(dpdt_cpp); + Mat dpdf_cpp(2*n, 2, CV_32F); dpdf_c = cvMat(dpdf_cpp); + Mat dpdc_cpp(2*n, 2, CV_32F); dpdc_c = cvMat(dpdc_cpp); + Mat dpdk_cpp(2*n, 4, CV_32F); dpdk_c = cvMat(dpdk_cpp); caller.aspectRatio = 1.0; caller.objectPoints = &objectPoints_c; @@ -561,9 +553,9 @@ protected: errors += run_test_case( CV_StsBadArg, "Zero imagePoints", bad_caller ); 
/****************************/ - Mat bad_r_vec_cpp1(r_vec_cpp.size(), CV_32S); CvMat bad_r_vec_c1 = bad_r_vec_cpp1; - Mat bad_r_vec_cpp2(2, 2, CV_32F); CvMat bad_r_vec_c2 = bad_r_vec_cpp2; - Mat bad_r_vec_cpp3(r_vec_cpp.size(), CV_32FC2); CvMat bad_r_vec_c3 = bad_r_vec_cpp3; + Mat bad_r_vec_cpp1(r_vec_cpp.size(), CV_32S); CvMat bad_r_vec_c1 = cvMat(bad_r_vec_cpp1); + Mat bad_r_vec_cpp2(2, 2, CV_32F); CvMat bad_r_vec_c2 = cvMat(bad_r_vec_cpp2); + Mat bad_r_vec_cpp3(r_vec_cpp.size(), CV_32FC2); CvMat bad_r_vec_c3 = cvMat(bad_r_vec_cpp3); bad_caller = caller; bad_caller.r_vec = &bad_r_vec_c1; @@ -578,9 +570,9 @@ protected: errors += run_test_case( CV_StsBadArg, "Bad rvec format", bad_caller ); /****************************/ - Mat bad_t_vec_cpp1(t_vec_cpp.size(), CV_32S); CvMat bad_t_vec_c1 = bad_t_vec_cpp1; - Mat bad_t_vec_cpp2(2, 2, CV_32F); CvMat bad_t_vec_c2 = bad_t_vec_cpp2; - Mat bad_t_vec_cpp3(1, 1, CV_32FC2); CvMat bad_t_vec_c3 = bad_t_vec_cpp3; + Mat bad_t_vec_cpp1(t_vec_cpp.size(), CV_32S); CvMat bad_t_vec_c1 = cvMat(bad_t_vec_cpp1); + Mat bad_t_vec_cpp2(2, 2, CV_32F); CvMat bad_t_vec_c2 = cvMat(bad_t_vec_cpp2); + Mat bad_t_vec_cpp3(1, 1, CV_32FC2); CvMat bad_t_vec_c3 = cvMat(bad_t_vec_cpp3); bad_caller = caller; bad_caller.t_vec = &bad_t_vec_c1; @@ -595,8 +587,8 @@ protected: errors += run_test_case( CV_StsBadArg, "Bad tvec format", bad_caller ); /****************************/ - Mat bad_A_cpp1(A_cpp.size(), CV_32S); CvMat bad_A_c1 = bad_A_cpp1; - Mat bad_A_cpp2(2, 2, CV_32F); CvMat bad_A_c2 = bad_A_cpp2; + Mat bad_A_cpp1(A_cpp.size(), CV_32S); CvMat bad_A_c1 = cvMat(bad_A_cpp1); + Mat bad_A_cpp2(2, 2, CV_32F); CvMat bad_A_c2 = cvMat(bad_A_cpp2); bad_caller = caller; bad_caller.A = &bad_A_c1; @@ -607,9 +599,9 @@ protected: errors += run_test_case( CV_StsBadArg, "Bad A format", bad_caller ); /****************************/ - Mat bad_distCoeffs_cpp1(distCoeffs_cpp.size(), CV_32S); CvMat bad_distCoeffs_c1 = bad_distCoeffs_cpp1; - Mat bad_distCoeffs_cpp2(2, 2, 
CV_32F); CvMat bad_distCoeffs_c2 = bad_distCoeffs_cpp2; - Mat bad_distCoeffs_cpp3(1, 7, CV_32F); CvMat bad_distCoeffs_c3 = bad_distCoeffs_cpp3; + Mat bad_distCoeffs_cpp1(distCoeffs_cpp.size(), CV_32S); CvMat bad_distCoeffs_c1 = cvMat(bad_distCoeffs_cpp1); + Mat bad_distCoeffs_cpp2(2, 2, CV_32F); CvMat bad_distCoeffs_c2 = cvMat(bad_distCoeffs_cpp2); + Mat bad_distCoeffs_cpp3(1, 7, CV_32F); CvMat bad_distCoeffs_c3 = cvMat(bad_distCoeffs_cpp3); bad_caller = caller; bad_caller.distCoeffs = &zeros; @@ -629,9 +621,9 @@ protected: /****************************/ - Mat bad_dpdr_cpp1(dpdr_cpp.size(), CV_32S); CvMat bad_dpdr_c1 = bad_dpdr_cpp1; - Mat bad_dpdr_cpp2(dpdr_cpp.cols+1, 3, CV_32F); CvMat bad_dpdr_c2 = bad_dpdr_cpp2; - Mat bad_dpdr_cpp3(dpdr_cpp.cols, 7, CV_32F); CvMat bad_dpdr_c3 = bad_dpdr_cpp3; + Mat bad_dpdr_cpp1(dpdr_cpp.size(), CV_32S); CvMat bad_dpdr_c1 = cvMat(bad_dpdr_cpp1); + Mat bad_dpdr_cpp2(dpdr_cpp.cols+1, 3, CV_32F); CvMat bad_dpdr_c2 = cvMat(bad_dpdr_cpp2); + Mat bad_dpdr_cpp3(dpdr_cpp.cols, 7, CV_32F); CvMat bad_dpdr_c3 = cvMat(bad_dpdr_cpp3); bad_caller = caller; bad_caller.dpdr = &zeros; @@ -669,7 +661,7 @@ protected: /****************************/ - Mat bad_dpdf_cpp2(dpdr_cpp.cols+1, 2, CV_32F); CvMat bad_dpdf_c2 = bad_dpdf_cpp2; + Mat bad_dpdf_cpp2(dpdr_cpp.cols+1, 2, CV_32F); CvMat bad_dpdf_c2 = cvMat(bad_dpdf_cpp2); bad_caller = caller; bad_caller.dpdf = &zeros; diff --git a/modules/calib3d/test/test_chesscorners_badarg.cpp b/modules/calib3d/test/test_chesscorners_badarg.cpp index a12bd53e84..422e364d46 100644 --- a/modules/calib3d/test/test_chesscorners_badarg.cpp +++ b/modules/calib3d/test/test_chesscorners_badarg.cpp @@ -78,9 +78,9 @@ protected: findChessboardCorners(img, pattern_size, corners, flags); else if (!drawCorners) - cvFindChessboardCorners( &arr, pattern_size, out_corners, out_corner_count, flags ); + cvFindChessboardCorners( &arr, cvSize(pattern_size), out_corners, out_corner_count, flags ); else - cvDrawChessboardCorners( 
&drawCorImg, pattern_size, + cvDrawChessboardCorners( &drawCorImg, cvSize(pattern_size), (CvPoint2D32f*)(corners.empty() ? 0 : &corners[0]), (int)corners.size(), was_found); } @@ -128,14 +128,14 @@ void CV_ChessboardDetectorBadArgTest::run( int /*start_from */) drawCorners = false; img = cb.clone(); - arr = img; + arr = cvMat(img); out_corner_count = 0; out_corners = 0; errors += run_test_case( CV_StsNullPtr, "Null pointer to corners" ); drawCorners = true; Mat cvdrawCornImg(img.size(), CV_8UC2); - drawCorImg = cvdrawCornImg; + drawCorImg = cvMat(cvdrawCornImg); was_found = true; errors += run_test_case( CV_StsUnsupportedFormat, "2 channel image" ); diff --git a/modules/calib3d/test/test_chesscorners_timing.cpp b/modules/calib3d/test/test_chesscorners_timing.cpp index b89d2e0147..1e10d9682c 100644 --- a/modules/calib3d/test/test_chesscorners_timing.cpp +++ b/modules/calib3d/test/test_chesscorners_timing.cpp @@ -96,7 +96,7 @@ void CV_ChessboardDetectorTimingTest::run( int start_from ) { int count0 = -1; int count = 0; - CvSize pattern_size; + Size pattern_size; int result, result1 = 0; const char* imgname = cvReadString((CvFileNode*)cvGetSeqElem(board_list->data.seq,idx*4), "dummy.txt"); @@ -110,7 +110,7 @@ void CV_ChessboardDetectorTimingTest::run( int start_from ) filename = cv::format("%s%s", filepath.c_str(), imgname ); cv::Mat img2 = cv::imread( filename ); - img = img2; + img = cvIplImage(img2); if( img2.empty() ) { @@ -135,11 +135,11 @@ void CV_ChessboardDetectorTimingTest::run( int start_from ) v = (CvPoint2D32f*)_v->data.fl; int64 _time0 = cvGetTickCount(); - result = cvCheckChessboard(gray, pattern_size); + result = cvCheckChessboard(gray, cvSize(pattern_size)); int64 _time01 = cvGetTickCount(); OPENCV_CALL( result1 = cvFindChessboardCorners( - gray, pattern_size, v, &count, 15 )); + gray, cvSize(pattern_size), v, &count, 15 )); int64 _time1 = cvGetTickCount(); if( result != is_chessboard ) diff --git a/modules/calib3d/test/test_cornerssubpix.cpp 
b/modules/calib3d/test/test_cornerssubpix.cpp index 2332fa88a0..05b75c5cbc 100644 --- a/modules/calib3d/test/test_cornerssubpix.cpp +++ b/modules/calib3d/test/test_cornerssubpix.cpp @@ -180,7 +180,7 @@ void CV_ChessboardSubpixelTest::run( int ) break; } - IplImage chessboard_image_header = chessboard_image; + IplImage chessboard_image_header = cvIplImage(chessboard_image); cvFindCornerSubPix(&chessboard_image_header, (CvPoint2D32f*)&test_corners[0], (int)test_corners.size(), cvSize(3, 3), cvSize(1, 1), cvTermCriteria(CV_TERMCRIT_EPS|CV_TERMCRIT_ITER,300,0.1)); find4QuadCornerSubpix(chessboard_image, test_corners, Size(5, 5)); diff --git a/modules/calib3d/test/test_fundam.cpp b/modules/calib3d/test/test_fundam.cpp index 47e5d53982..236db6ec4d 100644 --- a/modules/calib3d/test/test_fundam.cpp +++ b/modules/calib3d/test/test_fundam.cpp @@ -351,9 +351,9 @@ static int cvTsRodrigues( const CvMat* src, CvMat* dst, CvMat* jacobian ) { CV_Assert(src.data != dst.data && "Inplace is not supported"); CV_Assert(!dst.empty() && "'dst' must be allocated"); - CvMat _src = src, _dst = dst, _jac; + CvMat _src = cvMat(src), _dst = cvMat(dst), _jac; if( jac ) - _jac = *jac; + _jac = cvMat(*jac); cvTsRodrigues(&_src, &_dst, jac ? &_jac : 0); } @@ -667,13 +667,13 @@ void CV_RodriguesTest::run_func() if( calc_jacobians ) { - v2m_jac = test_mat[OUTPUT][1]; - m2v_jac = test_mat[OUTPUT][3]; + v2m_jac = cvMat(test_mat[OUTPUT][1]); + m2v_jac = cvMat(test_mat[OUTPUT][3]); } if( !test_cpp ) { - CvMat _input = test_mat[INPUT][0], _output = test_mat[OUTPUT][0], _output2 = test_mat[OUTPUT][2]; + CvMat _input = cvMat(test_mat[INPUT][0]), _output = cvMat(test_mat[OUTPUT][0]), _output2 = cvMat(test_mat[OUTPUT][2]); cvRodrigues2( &_input, &_output, calc_jacobians ? &v2m_jac : 0 ); cvRodrigues2( &_output, &_output2, calc_jacobians ? 
&m2v_jac : 0 ); } @@ -980,8 +980,8 @@ int CV_FundamentalMatTest::prepare_test_case( int test_case_idx ) void CV_FundamentalMatTest::run_func() { // cvFindFundamentalMat calls cv::findFundamentalMat - CvMat _input0 = test_mat[INPUT][0], _input1 = test_mat[INPUT][1]; - CvMat F = test_mat[TEMP][0], mask = test_mat[TEMP][1]; + CvMat _input0 = cvMat(test_mat[INPUT][0]), _input1 = cvMat(test_mat[INPUT][1]); + CvMat F = cvMat(test_mat[TEMP][0]), mask = cvMat(test_mat[TEMP][1]); f_result = cvFindFundamentalMat( &_input0, &_input1, &F, method, MAX(sigma*3, 0.01), 0, &mask ); } @@ -1543,7 +1543,7 @@ void CV_ConvertHomogeneousTest::fill_array( int /*test_case_idx*/, int /*i*/, in void CV_ConvertHomogeneousTest::run_func() { - CvMat _input = test_mat[INPUT][0], _output = test_mat[OUTPUT][0]; + CvMat _input = cvMat(test_mat[INPUT][0]), _output = cvMat(test_mat[OUTPUT][0]); cvConvertPointsHomogeneous( &_input, &_output ); } @@ -1678,7 +1678,7 @@ void CV_ComputeEpilinesTest::fill_array( int test_case_idx, int i, int j, Mat& a void CV_ComputeEpilinesTest::run_func() { - CvMat _points = test_mat[INPUT][0], _F = test_mat[INPUT][1], _lines = test_mat[OUTPUT][0]; + CvMat _points = cvMat(test_mat[INPUT][0]), _F = cvMat(test_mat[INPUT][1]), _lines = cvMat(test_mat[OUTPUT][0]); cvComputeCorrespondEpilines( &_points, which_image, &_F, &_lines ); } diff --git a/modules/calib3d/test/test_reproject_image_to_3d.cpp b/modules/calib3d/test/test_reproject_image_to_3d.cpp index 7254803b24..eaaefd90ce 100644 --- a/modules/calib3d/test/test_reproject_image_to_3d.cpp +++ b/modules/calib3d/test/test_reproject_image_to_3d.cpp @@ -124,7 +124,7 @@ protected: Mat_ _3dImg(disp.size()); - CvMat cvdisp = disp; CvMat cv_3dImg = _3dImg; CvMat cvQ = Q; + CvMat cvdisp = cvMat(disp); CvMat cv_3dImg = cvMat(_3dImg); CvMat cvQ = cvMat(Q); cvReprojectImageTo3D( &cvdisp, &cv_3dImg, &cvQ, handleMissingValues ); if (std::numeric_limits::max() == std::numeric_limits::max()) diff --git 
a/modules/calib3d/test/test_undistort.cpp b/modules/calib3d/test/test_undistort.cpp index 6f344a12e9..e4fe4fe1f3 100644 --- a/modules/calib3d/test/test_undistort.cpp +++ b/modules/calib3d/test/test_undistort.cpp @@ -410,7 +410,7 @@ void CV_UndistortPointsTest::prepare_to_validation(int /*test_case_idx*/) { if (useDstMat) { - CvMat temp = dst_points_mat; + CvMat temp = cvMat(dst_points_mat); for (int i=0;i()[2*i]; @@ -886,7 +886,7 @@ void CV_InitUndistortRectifyMapTest::prepare_to_validation(int/* test_case_idx*/ cvUndistortPoints(&_points,&ref_points,&_camera, zero_distortion ? 0 : &_distort, zero_R ? 0 : &_rot, zero_new_cam ? &_camera : &_new_cam); //cvTsDistortPoints(&_points,&ref_points,&_camera,&_distort,&_rot,&_new_cam); - CvMat dst = test_mat[REF_OUTPUT][0]; + CvMat dst = cvMat(test_mat[REF_OUTPUT][0]); cvtest::convert(cvarrToMat(&ref_points), cvarrToMat(&dst), -1); cvtest::copy(test_mat[INPUT][0],test_mat[OUTPUT][0]); @@ -912,13 +912,13 @@ void CV_InitUndistortRectifyMapTest::run_func() } else { - CvMat input1 = test_mat[INPUT][1], input2, input3, input4; + CvMat input1 = cvMat(test_mat[INPUT][1]), input2, input3, input4; if( !zero_distortion ) - input2 = test_mat[INPUT][2]; + input2 = cvMat(test_mat[INPUT][2]); if( !zero_R ) - input3 = test_mat[INPUT][3]; + input3 = cvMat(test_mat[INPUT][3]); if( !zero_new_cam ) - input4 = test_mat[INPUT][4]; + input4 = cvMat(test_mat[INPUT][4]); cvInitUndistortRectifyMap(&input1, zero_distortion ? 0 : &input2, zero_R ? 0 : &input3, diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h index ce28fa1bef..e5fe516d26 100644 --- a/modules/core/include/opencv2/core/core_c.h +++ b/modules/core/include/opencv2/core/core_c.h @@ -3064,7 +3064,7 @@ template inline void Seq<_Tp>::copyTo(std::vector<_Tp>& vec, const size_t len = !seq ? 0 : range == Range::all() ? 
seq->total : range.end - range.start; vec.resize(len); if( seq && len ) - cvCvtSeqToArray(seq, &vec[0], range); + cvCvtSeqToArray(seq, &vec[0], cvSlice(range)); } template inline Seq<_Tp>::operator std::vector<_Tp>() const diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 659f59bbf9..5c8b9f9b5c 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -219,15 +219,10 @@ enum CpuFeatures { typedef union Cv16suf { short i; + ushort u; #if CV_FP16_TYPE __fp16 h; #endif - struct _fp16Format - { - unsigned int significand : 10; - unsigned int exponent : 5; - unsigned int sign : 1; - } fmt; } Cv16suf; @@ -236,12 +231,6 @@ typedef union Cv32suf int i; unsigned u; float f; - struct _fp32Format - { - unsigned int significand : 23; - unsigned int exponent : 8; - unsigned int sign : 1; - } fmt; } Cv32suf; @@ -515,6 +504,115 @@ typedef ::uint64_t uint64_t; #include #endif +#ifdef __cplusplus +namespace cv +{ + +class float16_t +{ +public: +#if CV_FP16_TYPE + + float16_t() {} + explicit float16_t(float x) { h = (__fp16)x; } + operator float() const { return (float)h; } + static float16_t fromBits(ushort w) + { + Cv16suf u; + u.u = w; + float16_t result; + result.h = u.h; + return result; + } + static float16_t zero() + { + float16_t result; + result.h = (__fp16)0; + return result; + } + ushort bits() const + { + Cv16suf u; + u.h = h; + return u.u; + } +protected: + __fp16 h; + +#else + float16_t() {} + explicit float16_t(float x) + { + #if CV_AVX2 + __m128 v = _mm_load_ss(&x); + w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0)); + #else + Cv32suf in; + in.f = x; + unsigned sign = in.u & 0x80000000; + in.u ^= sign; + + if( in.u >= 0x47800000 ) + w = (ushort)(in.u > 0x7f800000 ? 
0x7e00 : 0x7c00); + else + { + if (in.u < 0x38800000) + { + in.f += 0.5f; + w = (ushort)(in.u - 0x3f000000); + } + else + { + unsigned t = in.u + 0xc8000fff; + w = (ushort)((t + ((in.u >> 13) & 1)) >> 13); + } + } + + w = (ushort)(w | (sign >> 16)); + #endif + } + + operator float() const + { + #if CV_AVX2 + float f; + _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w))); + return f; + #else + Cv32suf out; + + unsigned t = ((w & 0x7fff) << 13) + 0x38000000; + unsigned sign = (w & 0x8000) << 16; + unsigned e = w & 0x7c00; + + out.u = t + (1 << 23); + out.u = (e >= 0x7c00 ? t + 0x38000000 : + e == 0 ? (out.f -= 6.103515625e-05f, out.u) : t) | sign; + return out.f; + #endif + } + + static float16_t fromBits(ushort b) + { + float16_t result; + result.w = b; + return result; + } + static float16_t zero() + { + float16_t result; + result.w = (ushort)0; + return result; + } + ushort bits() const { return w; } +protected: + ushort w; + +#endif +}; + +} +#endif //! @} diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 6505f255cb..a321627081 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -252,7 +252,8 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ - CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) + CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix) template struct V_RegTraits { @@ -286,9 +287,6 @@ template struct V_RegTraits #if CV_SIMD128_64F CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4); #endif -#if CV_SIMD128_FP16 - CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float16x8, void, void, v_int16x8, 
v_int16x8); -#endif #endif #if CV_SIMD256 @@ -302,9 +300,6 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void); CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void); CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); -#if CV_SIMD256_FP16 - CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float16x16, void, void, v_int16x16, void); -#endif #endif #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) @@ -335,14 +330,6 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD256_64F typedef v_float64x4 v_float64; #endif - #if CV_FP16 - #define vx_load_fp16_f32 v256_load_fp16_f32 - #define vx_store_fp16 v_store_fp16 - #endif - #if CV_SIMD256_FP16 - typedef v_float16x16 v_float16; - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16) - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) inline void vx_cleanup() { v256_cleanup(); } @@ -353,7 +340,6 @@ using namespace CV__SIMD_NAMESPACE; namespace CV__SIMD_NAMESPACE { #define CV_SIMD CV_SIMD128 #define CV_SIMD_64F CV_SIMD128_64F - #define CV_SIMD_FP16 CV_SIMD128_FP16 #define CV_SIMD_WIDTH 16 typedef v_uint8x16 v_uint8; typedef v_int8x16 v_int8; @@ -367,14 +353,6 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD128_64F typedef v_float64x2 v_float64; #endif - #if CV_FP16 - #define vx_load_fp16_f32 v128_load_fp16_f32 - #define vx_store_fp16 v_store_fp16 - #endif - #if CV_SIMD128_FP16 - typedef v_float16x8 v_float16; - CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16) - #endif CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) #if CV_SIMD128_64F CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 36c7c0f1a1..a38c25e385 100644 --- 
a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -234,7 +234,15 @@ struct v_uint64x4 { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); } v_uint64x4() : val(_mm256_setzero_si256()) {} uint64 get0() const - { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } + { + #if defined __x86_64__ || defined _M_X64 + return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val)); + int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32))); + return (unsigned)a | ((uint64)(unsigned)b << 32); + #endif + } }; struct v_int64x4 @@ -247,7 +255,17 @@ struct v_int64x4 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) { val = _mm256_setr_epi64x(v0, v1, v2, v3); } v_int64x4() : val(_mm256_setzero_si256()) {} - int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } + + int64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val)); + int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32))); + return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #endif + } }; struct v_float64x4 @@ -1396,10 +1414,17 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) -{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } +{ + __m256i t = _mm256_set1_epi16(255); + __m256i a1 = _mm256_min_epu16(a.val, t); + __m256i b1 = _mm256_min_epu16(b.val, t); + return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1))); +} inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) -{ return v_pack(v_reinterpret_as_u16(a), 
v_reinterpret_as_u16(b)); } +{ + return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); +} inline void v_pack_store(schar* ptr, const v_int16x16& a) { v_store_low(ptr, v_pack(a, a)); } @@ -2372,6 +2397,18 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) +// FP16 +inline v_float32x8 v256_load_expand(const float16_t* ptr) +{ + return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x8& a) +{ + __m128i ah = _mm256_cvtps_ph(a.val, 0); + _mm_storeu_si128((__m128i*)ptr, ah); +} + inline void v256_cleanup() { _mm256_zeroupper(); } //! @name Check SIMD256 support diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index ccd317682d..64a457a530 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -2062,6 +2062,28 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); } +////// FP16 suport /////// + +inline v_reg::nlanes128> +v_load_expand(const float16_t* ptr) +{ + v_reg::nlanes128> v; + for( int i = 0; i < v.nlanes; i++ ) + { + v.s[i] = ptr[i]; + } + return v; +} + +inline void +v_pack_store(float16_t* ptr, v_reg::nlanes128>& v) +{ + for( int i = 0; i < v.nlanes; i++ ) + { + ptr[i] = float16_t(v.s[i]); + } +} + inline void v_cleanup() {} //! 
@} diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index c017b075f1..d87b4e2ba0 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -62,15 +62,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128_64F 0 #endif -#ifndef CV_SIMD128_FP16 -# if CV_FP16 && (defined(__GNUC__) && __GNUC__ >= 5) // #12027: float16x8_t is missing in GCC 4.8.2 -# define CV_SIMD128_FP16 1 -# endif -#endif -#ifndef CV_SIMD128_FP16 -# define CV_SIMD128_FP16 0 -#endif - #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \ template static inline \ @@ -329,53 +320,6 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a) } #endif - -#if CV_SIMD128_FP16 -// Workaround for old compilers -static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; } -static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; } - -static inline float16x8_t cv_vld1q_f16(const void* ptr) -{ -#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro - return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr)); -#else - return vld1q_f16((const __fp16*)ptr); -#endif -} -static inline void cv_vst1q_f16(void* ptr, float16x8_t a) -{ -#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro - vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a)); -#else - vst1q_f16((__fp16*)ptr, a); -#endif -} - -struct v_float16x8 -{ - typedef short lane_type; - enum { nlanes = 8 }; - - v_float16x8() {} - explicit v_float16x8(float16x8_t v) : val(v) {} - v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = cv_vld1q_f16(v); - } - short get0() const - { - return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0); - } - float16x8_t val; -}; - -inline v_float16x8 v_setzero_f16() { return 
v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } -inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } - -#endif // CV_SIMD128_FP16 - #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ @@ -934,24 +878,6 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) #endif -#if CV_SIMD128_FP16 -// Workaround for old comiplers -inline v_float16x8 v_load_f16(const short* ptr) -{ return v_float16x8(cv_vld1q_f16(ptr)); } -inline v_float16x8 v_load_f16_aligned(const short* ptr) -{ return v_float16x8(cv_vld1q_f16(ptr)); } - -inline v_float16x8 v_load_f16_low(const short* ptr) -{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); } -inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) -{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); } - -inline void v_store(short* ptr, const v_float16x8& a) -{ cv_vst1q_f16(ptr, a.val); } -inline void v_store_aligned(short* ptr, const v_float16x8& a) -{ cv_vst1q_f16(ptr, a.val); } -#endif - #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ { \ @@ -1507,22 +1433,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) } #endif -#if CV_SIMD128_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x8& a) -{ - return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); -} -inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) -{ - return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); -} - -inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) -{ - return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); -} -#endif - ////////////// Lookup table 
access //////////////////// inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) @@ -1588,6 +1498,47 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo } #endif +////// FP16 support /////// +#if CV_FP16 +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + float16x4_t v = + #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro + (float16x4_t)vld1_s16((const short*)ptr); + #else + vld1_f16((const __fp16*)ptr); + #endif + return v_float32x4(vcvt_f32_f16(v)); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float16x4_t hv = vcvt_f16_f32(v.val); + + #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro + vst1_s16((short*)ptr, (int16x4_t)hv); + #else + vst1_f16((__fp16*)ptr, hv); + #endif +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const int N = 4; + float buf[N]; + for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i]; + return v_load(buf); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + const int N = 4; + float buf[N]; + v_store(buf, v); + for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]); +} +#endif + inline void v_cleanup() {} //! 
@name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 159ef356b5..29c4f646ec 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -404,7 +404,7 @@ void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } -inline void v_pack_store(schar* ptr, v_int16x8& a) +inline void v_pack_store(schar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } template inline @@ -2655,6 +2655,50 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); } + +////////////// FP16 support /////////////////////////// + +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000); + const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000); + const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000)); + __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16 + __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta + __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf)); + + t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e))); + __m128i zmask = _mm_cmpeq_epi32(e, z); + __m128i ft = v_select_si128(zmask, zt, t); + return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + const __m128i signmask = _mm_set1_epi32(0x80000000); + const __m128i rval = _mm_set1_epi32(0x3f000000); 
+ + __m128i t = _mm_castps_si128(v.val); + __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16); + t = _mm_andnot_si128(signmask, t); + + __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t); + __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000)); + __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00)); + __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t); + __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval))); + tt = _mm_sub_epi32(tt, rval); + __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1)); + __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff)); + nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13); + t = v_select_si128(tinymask, tt, nt); + t = v_select_si128(finitemask, t, naninf); + t = _mm_or_si128(t, sign); + t = _mm_packs_epi32(t, t); + _mm_storel_epi64((__m128i*)ptr, t); +} + inline void v_cleanup() {} //! @name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index a45e7a875f..fb81986f6c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -916,6 +916,24 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } +/////// FP16 support //////// + +// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adapt) +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float CV_DECL_ALIGNED(32) f[4]; + v_store_aligned(f, v); + ptr[0] = float16_t(f[0]); + ptr[1] = float16_t(f[1]); + ptr[2] = float16_t(f[2]); + ptr[3] = float16_t(f[3]); +} + inline void v_cleanup() {} diff --git 
a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index 81e986fcd1..9ee8c9d2b6 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -44,6 +44,29 @@ #ifndef OPENCV_CORE_TYPES_H #define OPENCV_CORE_TYPES_H +#if !defined(__OPENCV_BUILD) && !defined(CV__DISABLE_C_API_CTORS) +#define CV__ENABLE_C_API_CTORS // enable C API ctors (must be removed) +#endif + +//#define CV__VALIDATE_UNUNITIALIZED_VARS 1 // C++11 & GCC only + +#ifdef __cplusplus + +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#define CV_STRUCT_INITIALIZER {0,} +#else +#if defined(__GNUC__) && __GNUC__ == 4 // GCC 4.x warns on "= {}" initialization, fixed in GCC 5.0 +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#endif +#define CV_STRUCT_INITIALIZER {} +#endif + +#else +#define CV_STRUCT_INITIALIZER {0} +#endif + + #ifdef HAVE_IPL # ifndef __IPL_H__ # if defined _WIN32 @@ -285,6 +308,11 @@ CV_INLINE double cvRandReal( CvRNG* rng ) #define IPL_BORDER_REFLECT 2 #define IPL_BORDER_WRAP 3 +#ifdef __cplusplus +typedef struct _IplImage IplImage; +CV_EXPORTS _IplImage cvIplImage(const cv::Mat& m); +#endif + /** The IplImage is taken from the Intel Image Processing Library, in which the format is native. OpenCV only supports a subset of possible IplImage formats, as outlined in the parameter list above. @@ -294,9 +322,6 @@ hand, the Intel Image Processing Library processes the area of intersection betw destination images (or ROIs), allowing them to vary independently. 
*/ typedef struct -#ifdef __cplusplus - CV_EXPORTS -#endif _IplImage { int nSize; /**< sizeof(IplImage) */ @@ -330,13 +355,22 @@ _IplImage (not necessarily aligned) - needed for correct deallocation */ -#ifdef __cplusplus +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) _IplImage() {} - _IplImage(const cv::Mat& m); + _IplImage(const cv::Mat& m) { *this = cvIplImage(m); } #endif } IplImage; +CV_INLINE IplImage cvIplImage() +{ +#if !defined(CV__ENABLE_C_API_CTORS) + IplImage self = CV_STRUCT_INITIALIZER; self.nSize = sizeof(IplImage); return self; +#else + return _IplImage(); +#endif +} + typedef struct _IplTileInfo IplTileInfo; typedef struct _IplROI @@ -460,13 +494,10 @@ typedef struct CvMat int cols; #endif - -#ifdef __cplusplus +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvMat() {} - CvMat(const CvMat& m) { memcpy(this, &m, sizeof(CvMat));} - CvMat(const cv::Mat& m); + CvMat(const cv::Mat& m) { *this = cvMat(m); } #endif - } CvMat; @@ -529,15 +560,8 @@ CV_INLINE CvMat cvMat( int rows, int cols, int type, void* data CV_DEFAULT(NULL) } #ifdef __cplusplus -inline CvMat::CvMat(const cv::Mat& m) -{ - CV_DbgAssert(m.dims <= 2); - *this = cvMat(m.rows, m.dims == 1 ? 
1 : m.cols, m.type(), m.data); - step = (int)m.step[0]; - type = (type & ~cv::Mat::CONTINUOUS_FLAG) | (m.flags & cv::Mat::CONTINUOUS_FLAG); -} -inline CvMat cvMat(const cv::Mat& m) +CV_INLINE CvMat cvMat(const cv::Mat& m) { CvMat self; CV_DbgAssert(m.dims <= 2); @@ -546,7 +570,24 @@ inline CvMat cvMat(const cv::Mat& m) self.type = (self.type & ~cv::Mat::CONTINUOUS_FLAG) | (m.flags & cv::Mat::CONTINUOUS_FLAG); return self; } +CV_INLINE CvMat cvMat() +{ +#if !defined(CV__ENABLE_C_API_CTORS) + CvMat self = CV_STRUCT_INITIALIZER; return self; +#else + return CvMat(); #endif +} +CV_INLINE CvMat cvMat(const CvMat& m) +{ +#if !defined(CV__ENABLE_C_API_CTORS) + CvMat self = CV_STRUCT_INITIALIZER; memcpy(&self, &m, sizeof(self)); return self; +#else + return CvMat(m); +#endif +} + +#endif // __cplusplus #define CV_MAT_ELEM_PTR_FAST( mat, row, col, pix_size ) \ @@ -630,13 +671,15 @@ CV_INLINE int cvIplDepth( int type ) #define CV_MAX_DIM 32 +#ifdef __cplusplus +typedef struct CvMatND CvMatND; +CV_EXPORTS CvMatND cvMatND(const cv::Mat& m); +#endif + /** @deprecated consider using cv::Mat instead */ typedef struct -#ifdef __cplusplus - CV_EXPORTS -#endif CvMatND { int type; @@ -661,13 +704,23 @@ CvMatND } dim[CV_MAX_DIM]; -#ifdef __cplusplus +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvMatND() {} - CvMatND(const cv::Mat& m); + CvMatND(const cv::Mat& m) { *this = cvMatND(m); } #endif } CvMatND; + +CV_INLINE CvMatND cvMatND() +{ +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvMatND self = CV_STRUCT_INITIALIZER; return self; +#else + return CvMatND(); +#endif +} + #define CV_IS_MATND_HDR(mat) \ ((mat) != NULL && (((const CvMatND*)(mat))->type & CV_MAGIC_MASK) == CV_MATND_MAGIC_VAL) @@ -684,11 +737,7 @@ CvMatND; struct CvSet; -typedef struct -#ifdef __cplusplus - CV_EXPORTS -#endif -CvSparseMat +typedef struct CvSparseMat { int type; int dims; @@ -703,13 +752,13 @@ CvSparseMat int size[CV_MAX_DIM]; #ifdef __cplusplus - void 
copyToSparseMat(cv::SparseMat& m) const; + CV_EXPORTS void copyToSparseMat(cv::SparseMat& m) const; #endif } CvSparseMat; #ifdef __cplusplus - CV_EXPORTS CvSparseMat* cvCreateSparseMat(const cv::SparseMat& m); +CV_EXPORTS CvSparseMat* cvCreateSparseMat(const cv::SparseMat& m); #endif #define CV_IS_SPARSE_MAT_HDR(mat) \ @@ -796,10 +845,23 @@ typedef struct CvRect int width; int height; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvRect() __attribute__(( warning("Non-initialized variable") )) {}; + template CvRect(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 4); + x = y = width = height = 0; + if (list.size() == 4) + { + x = list.begin()[0]; y = list.begin()[1]; width = list.begin()[2]; height = list.begin()[3]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvRect(int _x = 0, int _y = 0, int w = 0, int h = 0): x(_x), y(_y), width(w), height(h) {} template CvRect(const cv::Rect_<_Tp>& r): x(cv::saturate_cast(r.x)), y(cv::saturate_cast(r.y)), width(cv::saturate_cast(r.width)), height(cv::saturate_cast(r.height)) {} +#endif +#ifdef __cplusplus template operator cv::Rect_<_Tp>() const { return cv::Rect_<_Tp>((_Tp)x, (_Tp)y, (_Tp)width, (_Tp)height); } #endif @@ -809,16 +871,16 @@ CvRect; /** constructs CvRect structure. 
*/ CV_INLINE CvRect cvRect( int x, int y, int width, int height ) { - CvRect r; - - r.x = x; - r.y = y; - r.width = width; - r.height = height; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvRect r = {x, y, width, height}; +#else + CvRect r(x, y , width, height); +#endif return r; } - +#ifdef __cplusplus +CV_INLINE CvRect cvRect(const cv::Rect& rc) { return cvRect(rc.x, rc.y, rc.width, rc.height); } +#endif CV_INLINE IplROI cvRectToROI( CvRect rect, int coi ) { @@ -853,26 +915,28 @@ typedef struct CvTermCriteria CV_TERMCRIT_EPS */ int max_iter; double epsilon; - -#ifdef __cplusplus +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvTermCriteria(int _type = 0, int _iter = 0, double _eps = 0) : type(_type), max_iter(_iter), epsilon(_eps) {} CvTermCriteria(const cv::TermCriteria& t) : type(t.type), max_iter(t.maxCount), epsilon(t.epsilon) {} +#endif +#ifdef __cplusplus operator cv::TermCriteria() const { return cv::TermCriteria(type, max_iter, epsilon); } #endif - } CvTermCriteria; CV_INLINE CvTermCriteria cvTermCriteria( int type, int max_iter, double epsilon ) { - CvTermCriteria t; - - t.type = type; - t.max_iter = max_iter; - t.epsilon = (float)epsilon; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvTermCriteria t = { type, max_iter, (float)epsilon}; +#else + CvTermCriteria t(type, max_iter, epsilon); +#endif return t; } +#ifdef __cplusplus +CV_INLINE CvTermCriteria cvTermCriteria(const cv::TermCriteria& t) { return cvTermCriteria(t.type, t.maxCount, t.epsilon); } +#endif /******************************* CvPoint and variants ***********************************/ @@ -882,10 +946,23 @@ typedef struct CvPoint int x; int y; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvPoint() __attribute__(( warning("Non-initialized variable") )) {} + template CvPoint(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 2); + x = y = 0; + if (list.size() == 2) + { + x = 
list.begin()[0]; y = list.begin()[1]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvPoint(int _x = 0, int _y = 0): x(_x), y(_y) {} template CvPoint(const cv::Point_<_Tp>& pt): x((int)pt.x), y((int)pt.y) {} +#endif +#ifdef __cplusplus template operator cv::Point_<_Tp>() const { return cv::Point_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y)); } #endif @@ -895,24 +972,39 @@ CvPoint; /** constructs CvPoint structure. */ CV_INLINE CvPoint cvPoint( int x, int y ) { - CvPoint p; - - p.x = x; - p.y = y; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvPoint p = {x, y}; +#else + CvPoint p(x, y); +#endif return p; } - +#ifdef __cplusplus +CV_INLINE CvPoint cvPoint(const cv::Point& pt) { return cvPoint(pt.x, pt.y); } +#endif typedef struct CvPoint2D32f { float x; float y; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvPoint2D32f() __attribute__(( warning("Non-initialized variable") )) {} + template CvPoint2D32f(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 2); + x = y = 0; + if (list.size() == 2) + { + x = list.begin()[0]; y = list.begin()[1]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvPoint2D32f(float _x = 0, float _y = 0): x(_x), y(_y) {} template CvPoint2D32f(const cv::Point_<_Tp>& pt): x((float)pt.x), y((float)pt.y) {} +#endif +#ifdef __cplusplus template operator cv::Point_<_Tp>() const { return cv::Point_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y)); } #endif @@ -922,11 +1014,11 @@ CvPoint2D32f; /** constructs CvPoint2D32f structure. 
*/ CV_INLINE CvPoint2D32f cvPoint2D32f( double x, double y ) { - CvPoint2D32f p; - - p.x = (float)x; - p.y = (float)y; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvPoint2D32f p = { (float)x, (float)y }; +#else + CvPoint2D32f p((float)x, (float)y); +#endif return p; } @@ -934,7 +1026,11 @@ CV_INLINE CvPoint2D32f cvPoint2D32f( double x, double y ) template CvPoint2D32f cvPoint2D32f(const cv::Point_<_Tp>& pt) { +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvPoint2D32f p = { (float)pt.x, (float)pt.y }; +#else CvPoint2D32f p((float)pt.x, (float)pt.y); +#endif return p; } #endif @@ -948,10 +1044,11 @@ CV_INLINE CvPoint2D32f cvPointTo32f( CvPoint point ) /** converts CvPoint2D32f to CvPoint. */ CV_INLINE CvPoint cvPointFrom32f( CvPoint2D32f point ) { - CvPoint ipt; - ipt.x = cvRound(point.x); - ipt.y = cvRound(point.y); - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvPoint ipt = { cvRound(point.x), cvRound(point.y) }; +#else + CvPoint ipt(cvRound(point.x), cvRound(point.y)); +#endif return ipt; } @@ -962,10 +1059,23 @@ typedef struct CvPoint3D32f float y; float z; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvPoint3D32f() __attribute__(( warning("Non-initialized variable") )) {} + template CvPoint3D32f(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 3); + x = y = z = 0; + if (list.size() == 3) + { + x = list.begin()[0]; y = list.begin()[1]; z = list.begin()[2]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvPoint3D32f(float _x = 0, float _y = 0, float _z = 0): x(_x), y(_y), z(_z) {} template CvPoint3D32f(const cv::Point3_<_Tp>& pt): x((float)pt.x), y((float)pt.y), z((float)pt.z) {} +#endif +#ifdef __cplusplus template operator cv::Point3_<_Tp>() const { return cv::Point3_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y), cv::saturate_cast<_Tp>(z)); } #endif @@ -975,31 +1085,51 @@ CvPoint3D32f; /** constructs 
CvPoint3D32f structure. */ CV_INLINE CvPoint3D32f cvPoint3D32f( double x, double y, double z ) { - CvPoint3D32f p; - - p.x = (float)x; - p.y = (float)y; - p.z = (float)z; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvPoint3D32f p = { (float)x, (float)y, (float)z }; +#else + CvPoint3D32f p((float)x, (float)y, (float)z); +#endif return p; } +#ifdef __cplusplus +template +CvPoint3D32f cvPoint3D32f(const cv::Point3_<_Tp>& pt) +{ +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvPoint3D32f p = { (float)pt.x, (float)pt.y, (float)pt.z }; +#else + CvPoint3D32f p((float)pt.x, (float)pt.y, (float)pt.z); +#endif + return p; +} +#endif + typedef struct CvPoint2D64f { double x; double y; +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvPoint2D64f() __attribute__(( warning("Non-initialized variable") )) {} + template CvPoint2D64f(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 2); + x = y = 0; + if (list.size() == 2) + { + x = list.begin()[0]; y = list.begin()[1]; + } + }; +#endif } CvPoint2D64f; /** constructs CvPoint2D64f structure.*/ CV_INLINE CvPoint2D64f cvPoint2D64f( double x, double y ) { - CvPoint2D64f p; - - p.x = x; - p.y = y; - + CvPoint2D64f p = { x, y }; return p; } @@ -1009,18 +1139,25 @@ typedef struct CvPoint3D64f double x; double y; double z; +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvPoint3D64f() __attribute__(( warning("Non-initialized variable") )) {} + template CvPoint3D64f(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 3); + x = y = z = 0; + if (list.size() == 3) + { + x = list.begin()[0]; y = list.begin()[1]; z = list.begin()[2]; + } + }; +#endif } CvPoint3D64f; /** constructs CvPoint3D64f structure. 
*/ CV_INLINE CvPoint3D64f cvPoint3D64f( double x, double y, double z ) { - CvPoint3D64f p; - - p.x = x; - p.y = y; - p.z = z; - + CvPoint3D64f p = { x, y, z }; return p; } @@ -1032,10 +1169,23 @@ typedef struct CvSize int width; int height; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvSize() __attribute__(( warning("Non-initialized variable") )) {} + template CvSize(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 2); + width = 0; height = 0; + if (list.size() == 2) + { + width = list.begin()[0]; height = list.begin()[1]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvSize(int w = 0, int h = 0): width(w), height(h) {} template CvSize(const cv::Size_<_Tp>& sz): width(cv::saturate_cast(sz.width)), height(cv::saturate_cast(sz.height)) {} +#endif +#ifdef __cplusplus template operator cv::Size_<_Tp>() const { return cv::Size_<_Tp>(cv::saturate_cast<_Tp>(width), cv::saturate_cast<_Tp>(height)); } #endif @@ -1045,23 +1195,48 @@ CvSize; /** constructs CvSize structure. 
*/ CV_INLINE CvSize cvSize( int width, int height ) { - CvSize s; - - s.width = width; - s.height = height; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvSize s = { width, height }; +#else + CvSize s(width, height); +#endif return s; } +#ifdef __cplusplus +CV_INLINE CvSize cvSize(const cv::Size& sz) +{ +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvSize s = { sz.width, sz.height }; +#else + CvSize s(sz.width, sz.height); +#endif + return s; +} +#endif + typedef struct CvSize2D32f { float width; float height; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvSize2D32f() __attribute__(( warning("Non-initialized variable") )) {} + template CvSize2D32f(const std::initializer_list<_Tp> list) + { + CV_Assert(list.size() == 0 || list.size() == 2); + width = 0; height = 0; + if (list.size() == 2) + { + width = list.begin()[0]; height = list.begin()[1]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvSize2D32f(float w = 0, float h = 0): width(w), height(h) {} template CvSize2D32f(const cv::Size_<_Tp>& sz): width(cv::saturate_cast(sz.width)), height(cv::saturate_cast(sz.height)) {} +#endif +#ifdef __cplusplus template operator cv::Size_<_Tp>() const { return cv::Size_<_Tp>(cv::saturate_cast<_Tp>(width), cv::saturate_cast<_Tp>(height)); } #endif @@ -1071,13 +1246,25 @@ CvSize2D32f; /** constructs CvSize2D32f structure. 
*/ CV_INLINE CvSize2D32f cvSize2D32f( double width, double height ) { - CvSize2D32f s; - - s.width = (float)width; - s.height = (float)height; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvSize2D32f s = { (float)width, (float)height }; +#else + CvSize2D32f s((float)width, (float)height); +#endif return s; } +#ifdef __cplusplus +template +CvSize2D32f cvSize2D32f(const cv::Size_<_Tp>& sz) +{ +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvSize2D32f s = { (float)sz.width, (float)sz.height }; +#else + CvSize2D32f s((float)sz.width, (float)sz.height); +#endif + return s; +} +#endif /** @sa RotatedRect */ @@ -1088,15 +1275,37 @@ typedef struct CvBox2D float angle; /**< Angle between the horizontal axis */ /**< and the first side (i.e. length) in degrees */ -#ifdef __cplusplus +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvBox2D(CvPoint2D32f c = CvPoint2D32f(), CvSize2D32f s = CvSize2D32f(), float a = 0) : center(c), size(s), angle(a) {} CvBox2D(const cv::RotatedRect& rr) : center(rr.center), size(rr.size), angle(rr.angle) {} +#endif +#ifdef __cplusplus operator cv::RotatedRect() const { return cv::RotatedRect(center, size, angle); } #endif } CvBox2D; +#ifdef __cplusplus +CV_INLINE CvBox2D cvBox2D(CvPoint2D32f c = CvPoint2D32f(), CvSize2D32f s = CvSize2D32f(), float a = 0) +{ + CvBox2D self; + self.center = c; + self.size = s; + self.angle = a; + return self; +} +CV_INLINE CvBox2D cvBox2D(const cv::RotatedRect& rr) +{ + CvBox2D self; + self.center = cvPoint2D32f(rr.center); + self.size = cvSize2D32f(rr.size); + self.angle = rr.angle; + return self; +} +#endif + + /** Line iterator state: */ typedef struct CvLineIterator { @@ -1122,7 +1331,19 @@ typedef struct CvSlice { int start_index, end_index; -#if defined(__cplusplus) && !defined(__CUDACC__) +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvSlice() __attribute__(( warning("Non-initialized variable") )) {} + template CvSlice(const std::initializer_list<_Tp> list) + 
{ + CV_Assert(list.size() == 0 || list.size() == 2); + start_index = end_index = 0; + if (list.size() == 2) + { + start_index = list.begin()[0]; end_index = list.begin()[1]; + } + }; +#endif +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) && !defined(__CUDACC__) CvSlice(int start = 0, int end = 0) : start_index(start), end_index(end) {} CvSlice(const cv::Range& r) { *this = (r.start != INT_MIN && r.end != INT_MAX) ? CvSlice(r.start, r.end) : CvSlice(0, CV_WHOLE_SEQ_END_INDEX); } operator cv::Range() const { return (start_index == 0 && end_index == CV_WHOLE_SEQ_END_INDEX ) ? cv::Range::all() : cv::Range(start_index, end_index); } @@ -1132,13 +1353,21 @@ CvSlice; CV_INLINE CvSlice cvSlice( int start, int end ) { - CvSlice slice; - slice.start_index = start; - slice.end_index = end; - +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvSlice slice = { start, end }; +#else + CvSlice slice(start, end); +#endif return slice; } +#if defined(__cplusplus) +CV_INLINE CvSlice cvSlice(const cv::Range& r) +{ + CvSlice slice = (r.start != INT_MIN && r.end != INT_MAX) ? 
cvSlice(r.start, r.end) : cvSlice(0, CV_WHOLE_SEQ_END_INDEX); + return slice; +} +#endif /************************************* CvScalar *****************************************/ @@ -1148,13 +1377,22 @@ typedef struct CvScalar { double val[4]; -#ifdef __cplusplus +#ifdef CV__VALIDATE_UNUNITIALIZED_VARS + CvScalar() __attribute__(( warning("Non-initialized variable") )) {} + CvScalar(const std::initializer_list list) + { + CV_Assert(list.size() == 0 || list.size() == 4); + val[0] = val[1] = val[2] = val[3] = 0; + if (list.size() == 4) + { + val[0] = list.begin()[0]; val[1] = list.begin()[1]; val[2] = list.begin()[2]; val[3] = list.begin()[3]; + } + }; +#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvScalar() {} CvScalar(double d0, double d1 = 0, double d2 = 0, double d3 = 0) { val[0] = d0; val[1] = d1; val[2] = d2; val[3] = d3; } template CvScalar(const cv::Scalar_<_Tp>& s) { val[0] = s.val[0]; val[1] = s.val[1]; val[2] = s.val[2]; val[3] = s.val[3]; } - template - operator cv::Scalar_<_Tp>() const { return cv::Scalar_<_Tp>(cv::saturate_cast<_Tp>(val[0]), cv::saturate_cast<_Tp>(val[1]), cv::saturate_cast<_Tp>(val[2]), cv::saturate_cast<_Tp>(val[3])); } template CvScalar(const cv::Vec<_Tp, cn>& v) { @@ -1163,22 +1401,59 @@ typedef struct CvScalar for( ; i < 4; i++ ) val[i] = 0; } #endif +#ifdef __cplusplus + template + operator cv::Scalar_<_Tp>() const { return cv::Scalar_<_Tp>(cv::saturate_cast<_Tp>(val[0]), cv::saturate_cast<_Tp>(val[1]), cv::saturate_cast<_Tp>(val[2]), cv::saturate_cast<_Tp>(val[3])); } +#endif } CvScalar; CV_INLINE CvScalar cvScalar( double val0, double val1 CV_DEFAULT(0), double val2 CV_DEFAULT(0), double val3 CV_DEFAULT(0)) { +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvScalar scalar = CV_STRUCT_INITIALIZER; +#else CvScalar scalar; +#endif scalar.val[0] = val0; scalar.val[1] = val1; scalar.val[2] = val2; scalar.val[3] = val3; return scalar; } +#ifdef __cplusplus +CV_INLINE CvScalar cvScalar() +{ +#if 
!(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvScalar scalar = CV_STRUCT_INITIALIZER; +#else + CvScalar scalar; +#endif + scalar.val[0] = scalar.val[1] = scalar.val[2] = scalar.val[3] = 0; + return scalar; +} +CV_INLINE CvScalar cvScalar(const cv::Scalar& s) +{ +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvScalar scalar = CV_STRUCT_INITIALIZER; +#else + CvScalar scalar; +#endif + scalar.val[0] = s.val[0]; + scalar.val[1] = s.val[1]; + scalar.val[2] = s.val[2]; + scalar.val[3] = s.val[3]; + return scalar; +} +#endif CV_INLINE CvScalar cvRealScalar( double val0 ) { +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvScalar scalar = CV_STRUCT_INITIALIZER; +#else CvScalar scalar; +#endif scalar.val[0] = val0; scalar.val[1] = scalar.val[2] = scalar.val[3] = 0; return scalar; @@ -1186,7 +1461,11 @@ CV_INLINE CvScalar cvRealScalar( double val0 ) CV_INLINE CvScalar cvScalarAll( double val0123 ) { +#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)) + CvScalar scalar = CV_STRUCT_INITIALIZER; +#else CvScalar scalar; +#endif scalar.val[0] = val0123; scalar.val[1] = val0123; scalar.val[2] = val0123; @@ -1239,7 +1518,7 @@ typedef struct CvSeqBlock { struct CvSeqBlock* prev; /**< Previous sequence block. */ struct CvSeqBlock* next; /**< Next sequence block. */ - int start_index; /**< Index of the first element in the block + */ + int start_index; /**< Index of the first element in the block + */ /**< sequence->first->start_index. */ int count; /**< Number of elements in the block. */ schar* data; /**< Pointer to the first element of the block. 
*/ diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index 7556041c94..9f5f6e9e77 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -117,7 +117,7 @@ OCL_PERF_TEST_P(LogFixture, Log, ::testing::Combine( OCL_TEST_CYCLE() cv::log(src, dst); if (CV_MAT_DEPTH(type) >= CV_32F) - SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE); + SANITY_CHECK(dst, 2e-4, ERROR_RELATIVE); else SANITY_CHECK(dst, 1); } diff --git a/modules/core/perf/perf_addWeighted.cpp b/modules/core/perf/perf_addWeighted.cpp index 15daced72e..2822bc61e7 100644 --- a/modules/core/perf/perf_addWeighted.cpp +++ b/modules/core/perf/perf_addWeighted.cpp @@ -11,6 +11,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) { Size size = get<0>(GetParam()); int type = get<1>(GetParam()); + int depth = CV_MAT_DEPTH(type); Mat src1(size, type); Mat src2(size, type); double alpha = 3.75; @@ -21,7 +22,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) declare.in(src1, src2, dst, WARMUP_RNG).out(dst); - if (CV_MAT_DEPTH(type) == CV_32S) + if (depth == CV_32S) { // there might be not enough precision for integers src1 /= 2048; @@ -30,7 +31,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED) TEST_CYCLE() cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() ); - SANITY_CHECK(dst, 1); + SANITY_CHECK(dst, depth == CV_32S ? 4 : 1); } } // namespace diff --git a/modules/core/perf/perf_convertTo.cpp b/modules/core/perf/perf_convertTo.cpp index c6c157e704..344d81cb8a 100644 --- a/modules/core/perf/perf_convertTo.cpp +++ b/modules/core/perf/perf_convertTo.cpp @@ -33,7 +33,7 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo, int runs = (sz.width <= 640) ? 8 : 1; TEST_CYCLE_MULTIRUN(runs) src.convertTo(dst, depthDst, alpha); - double eps = depthSrc <= CV_32S ? 
1e-12 : (FLT_EPSILON * maxValue); + double eps = depthSrc <= CV_32S && (depthDst <= CV_32S || depthDst == CV_64F) ? 1e-12 : (FLT_EPSILON * maxValue); eps = eps * std::max(1.0, fabs(alpha)); SANITY_CHECK(dst, eps); } diff --git a/modules/core/perf/perf_split.cpp b/modules/core/perf/perf_split.cpp index d1d66a10bb..2cbc0b289c 100644 --- a/modules/core/perf/perf_split.cpp +++ b/modules/core/perf/perf_split.cpp @@ -27,11 +27,7 @@ PERF_TEST_P( Size_Depth_Channels, split, int runs = (sz.width <= 640) ? 8 : 1; TEST_CYCLE_MULTIRUN(runs) split(m, (vector&)mv); -#if defined (__aarch64__) SANITY_CHECK(mv, 2e-5); -#else - SANITY_CHECK(mv, 1e-12); -#endif } } // namespace diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index be368ce761..4929ebefc0 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -617,7 +617,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || (_dst.fixedType() && _dst.type() == type1)) && - ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) + (src1Scalar == src2Scalar) ) { _dst.createSameSize(*psrc1, type1); CV_OCL_RUN(use_opencl, @@ -1204,7 +1204,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) compare(_src2, _src1, _dst, op); return; } - else if( (is_src1_scalar && is_src2_scalar) || (!is_src1_scalar && !is_src2_scalar) ) + else if(is_src1_scalar == is_src2_scalar) CV_Error( CV_StsUnmatchedSizes, "The operation is neither 'array op array' (where arrays have the same size and the same type), " "nor 'array op scalar', nor 'scalar op array'" ); diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp index 45e6ee81d6..dde8b2606f 100644 --- a/modules/core/src/array.cpp +++ b/modules/core/src/array.cpp @@ -1017,7 +1017,7 @@ cvGetRawData( const CvArr* 
arr, uchar** data, int* step, CvSize* roi_size ) *data = mat->data.ptr; if( roi_size ) - *roi_size = cvGetMatSize( mat ); + *roi_size = cvSize(cvGetMatSize( mat )); } else if( CV_IS_IMAGE( arr )) { @@ -1218,7 +1218,7 @@ cvGetDimSize( const CvArr* arr, int index ) CV_IMPL CvSize cvGetSize( const CvArr* arr ) { - CvSize size; + CvSize size = {0, 0}; if( CV_IS_MAT_HDR_Z( arr )) { @@ -1918,7 +1918,7 @@ cvPtrND( const CvArr* arr, const int* idx, int* _type, CV_IMPL CvScalar cvGet1D( const CvArr* arr, int idx ) { - CvScalar scalar(0); + CvScalar scalar = cvScalar(); int type = 0; uchar* ptr; @@ -1953,7 +1953,7 @@ cvGet1D( const CvArr* arr, int idx ) CV_IMPL CvScalar cvGet2D( const CvArr* arr, int y, int x ) { - CvScalar scalar(0); + CvScalar scalar = cvScalar(); int type = 0; uchar* ptr; @@ -1987,7 +1987,7 @@ cvGet2D( const CvArr* arr, int y, int x ) CV_IMPL CvScalar cvGet3D( const CvArr* arr, int z, int y, int x ) { - CvScalar scalar(0); + CvScalar scalar = cvScalar(); int type = 0; uchar* ptr; @@ -2009,7 +2009,7 @@ cvGet3D( const CvArr* arr, int z, int y, int x ) CV_IMPL CvScalar cvGetND( const CvArr* arr, const int* idx ) { - CvScalar scalar(0); + CvScalar scalar = cvScalar(); int type = 0; uchar* ptr; @@ -2916,15 +2916,7 @@ cvInitImageHeader( IplImage * image, CvSize size, int depth, if( !image ) CV_Error( CV_HeaderIsNull, "null pointer to header" ); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif - memset( image, 0, sizeof( *image )); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop -#endif - image->nSize = sizeof( *image ); + *image = cvIplImage(); icvGetColorModel( channels, &colorModel, &channelSeq ); for (int i = 0; i < 4; i++) @@ -3081,7 +3073,7 @@ cvResetImageROI( IplImage* image ) CV_IMPL CvRect cvGetImageROI( const IplImage* img ) { - CvRect rect; + CvRect rect = {0, 0, 0, 0}; if( !img ) CV_Error( CV_StsNullPtr, "Null pointer to image" ); diff --git 
a/modules/core/src/batch_distance.cpp b/modules/core/src/batch_distance.cpp index 4c90db7ec4..a5aeefc348 100644 --- a/modules/core/src/batch_distance.cpp +++ b/modules/core/src/batch_distance.cpp @@ -5,6 +5,7 @@ #include "precomp.hpp" #include "stat.hpp" +#include namespace cv { @@ -45,6 +46,24 @@ void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2, } } +template<> +void batchDistL2Sqr_(const float* src1, const float* src2, size_t step2, + int nvecs, int len, float* dist, const uchar* mask) +{ + step2 /= sizeof(src2[0]); + if( !mask ) + { + for( int i = 0; i < nvecs; i++ ) + dist[i] = hal::normL2Sqr_(src1, src2 + step2*i, len); + } + else + { + float val0 = std::numeric_limits::max(); + for( int i = 0; i < nvecs; i++ ) + dist[i] = mask[i] ? hal::normL2Sqr_(src1, src2 + step2*i, len) : val0; + } +} + template void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2, int nvecs, int len, _Rt* dist, const uchar* mask) @@ -63,6 +82,24 @@ void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2, } } +template<> +void batchDistL2_(const float* src1, const float* src2, size_t step2, + int nvecs, int len, float* dist, const uchar* mask) +{ + step2 /= sizeof(src2[0]); + if( !mask ) + { + for( int i = 0; i < nvecs; i++ ) + dist[i] = std::sqrt(hal::normL2Sqr_(src1, src2 + step2*i, len)); + } + else + { + float val0 = std::numeric_limits::max(); + for( int i = 0; i < nvecs; i++ ) + dist[i] = mask[i] ? std::sqrt(hal::normL2Sqr_(src1, src2 + step2*i, len)) : val0; + } +} + static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, int nvecs, int len, int* dist, const uchar* mask) { diff --git a/modules/core/src/convert.avx2.cpp b/modules/core/src/convert.avx2.cpp deleted file mode 100644 index b724cbbf1e..0000000000 --- a/modules/core/src/convert.avx2.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of OpenCV project. 
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_AVX2 -{ - -void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width) -{ - int x = 0; - - __m256 scale256 = _mm256_set1_ps(scale); - __m256 shift256 = _mm256_set1_ps(shift); - const int shuffle = 0xD8; - - for (; x <= width - 16; x += 16) - { - __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); - v_src = _mm256_permute4x64_epi64(v_src, shuffle); - __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); - __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); - __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); - __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); - _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); - _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); - } - - for (; x < width; x++) - dst[x] = saturate_cast(src[x] * scale + shift); -} - -} -} // cv:: -/* End of file. 
*/ diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 75b4967194..a54f4c1bcd 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -2,1093 +2,242 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - #include "precomp.hpp" #include "opencl_kernels_core.hpp" #include "convert.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" namespace cv { -template -struct Cvt_SIMD -{ - int operator() (const T *, DT *, int) const - { - return 0; - } -}; +/*namespace hal { -#if CV_SIMD128 -// from uchar - -template <> -struct Cvt_SIMD +void cvt16f32f( const float16_t* src, float* dst, int len ) { - int operator() (const uchar * src, schar * dst, int width) const + int j = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; j < len; j += VECSZ ) { - int x = 0; - if (hasSIMD128()) + if( j > len - VECSZ ) { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); - v_store_low(dst + x, v_pack(v_src, v_src)); - } + if( j == 0 ) + break; + j = len - VECSZ; } - return x; + v_store(dst + j, vx_load_expand(src + j)); } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load_expand(src + x)); - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); - v_store(dst + x, v_src); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, int * dst, int 
width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_reinterpret_as_s32(v_src1)); - v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - -// from schar - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_pack_u_store(dst + x, v_load_expand(src + x)); - } - - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load_expand(src + x)); - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, int * dst, int width) 
const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_src1)); - v_store(dst + x + cWidth, v_cvt_f32(v_src2)); - } - } - return x; - } -}; - -// from ushort - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_uint32x4 v_dst10, v_dst11, v_dst20, v_dst21; - v_expand(v_src1, v_dst10, v_dst11); - v_expand(v_src2, v_dst20, v_dst21); - - v_store(dst + x, v_pack( - v_pack(v_reinterpret_as_s32(v_dst10), v_reinterpret_as_s32(v_dst11)), - v_pack(v_reinterpret_as_s32(v_dst20), v_reinterpret_as_s32(v_dst21)))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = 
v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_dst0, v_dst1; - v_expand(v_src, v_dst0, v_dst1); - v_store(dst + x, v_pack(v_reinterpret_as_s32(v_dst0), v_reinterpret_as_s32(v_dst1))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_reinterpret_as_s32(v_src1)); - v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - - -// from short - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } 
- } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_pack_u(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_dst1); - v_store(dst + x + cWidth, v_dst2); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_dst1, v_dst2; - v_expand(v_src, v_dst1, v_dst2); - v_store(dst + x, v_cvt_f32(v_dst1)); - v_store(dst + x + cWidth, v_cvt_f32(v_dst2)); - } - } - return x; - } -}; - -// from int - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack_u(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, schar * dst, int width) const - { - int x = 0; - if 
(hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const int * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_cvt_f32(v_load(src + x))); - } - return x; - } -}; - -// from float - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, uchar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); - v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); - v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); 
- v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); - v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); - v_int16x8 v_dst1 = v_pack(v_src1, v_src2); - v_int16x8 v_dst2 = v_pack(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, ushort * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_store(dst + x, v_pack_u(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, short * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_round(v_load(src + x)); - v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); - v_store(dst + x, v_pack(v_src1, v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_round(v_load(src + x))); - } - return x; - } -}; -#if CV_SIMD128_64F -// from double - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, uchar * dst, int width) const - { - int 
x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, schar * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, ushort * dst, int width) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); #endif - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + for( ; j < len; j++ ) + dst[j] = (float)src[j]; +} - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, 
v_src3); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src0), v_round(v_src1)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD +void cvt32f16f( const float* src, float16_t* dst, int len ) { - int operator() (const double * src, short * dst, int width) const + int j = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; j < len; j += VECSZ ) { - int x = 0; - if (hasSIMD128()) + if( j > len - VECSZ ) { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); - v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); - - v_src0 = v_combine_low(v_src0, v_src1); - v_src1 = v_combine_low(v_src2, v_src3); - - v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); - v_store(dst + x, v_dst); - } + if( j == 0 ) + break; + j = len - VECSZ; } - return x; + v_pack_store(dst + j, vx_load(src + j)); } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, int * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src0 = v_round(v_load(src + x)); - v_int32x4 v_src1 = v_round(v_load(src + x + cWidth)); - - v_store(dst + x, v_combine_low(v_src0, v_src1)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, float * dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); - v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); - - v_store(dst + x, v_combine_low(v_src0, v_src1)); - } - } - return x; - } -}; - -// to double - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar* src, 
double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint16x8 v_src = v_load_expand(src + x); - v_uint32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src1))); - v_store(dst + x + cWidth * 2, v_cvt_f64(v_reinterpret_as_s32(v_src2))); - v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_reinterpret_as_s32(v_src2))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const schar* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int16x8 v_src = v_load_expand(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_store(dst + x, v_cvt_f64(v_src1)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src1)); - v_store(dst + x + cWidth * 2, v_cvt_f64(v_src2)); - v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_src2)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const ushort* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_uint32x4 v_src = v_load_expand(src + x); - - v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src))); - v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src))); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const short* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src = v_load_expand(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; - 
-template <> -struct Cvt_SIMD -{ - int operator() (const int* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src = v_load(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; - -template <> -struct Cvt_SIMD -{ - int operator() (const float* src, double* dst, int width) const - { - int x = 0; - if (hasSIMD128()) - { - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src = v_load(src + x); - - v_store(dst + x, v_cvt_f64(v_src)); - v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); - } - } - return x; - } -}; -#endif // CV_SIMD128_64F -#endif // CV_SIMD128 - - -#ifdef HAVE_OPENVX - -template -static bool _openvx_cvt(const T* src, size_t sstep, - DT* dst, size_t dstep, Size continuousSize) -{ - using namespace ivx; - - if(!(continuousSize.width > 0 && continuousSize.height > 0)) - { - return true; - } - - //.height is for number of continuous pieces - //.width is for length of one piece - Size imgSize = continuousSize; - if(continuousSize.height == 1) - { - if(sstep / sizeof(T) == dstep / sizeof(DT) && sstep / sizeof(T) > 0 && - continuousSize.width % (sstep / sizeof(T)) == 0) - { - //continuous n-lines image - imgSize.width = sstep / sizeof(T); - imgSize.height = continuousSize.width / (sstep / sizeof(T)); - } - else - { - //1-row image with possibly incorrect step - sstep = continuousSize.width * sizeof(T); - dstep = continuousSize.width * sizeof(DT); - } - } - - int srcType = DataType::type, dstType = DataType
::type; - - if (ovx::skipSmallImages(imgSize.width, imgSize.height)) - return false; - - try - { - Context context = ovx::getOpenVXContext(); - - // Other conversions are marked as "experimental" - if(context.vendorID() == VX_ID_KHRONOS && - !(srcType == CV_8U && dstType == CV_16S) && - !(srcType == CV_16S && dstType == CV_8U)) - { - return false; - } - - Image srcImage = Image::createFromHandle(context, Image::matTypeToFormat(srcType), - Image::createAddressing(imgSize.width, imgSize.height, - (vx_uint32)sizeof(T), (vx_uint32)sstep), - (void*)src); - Image dstImage = Image::createFromHandle(context, Image::matTypeToFormat(dstType), - Image::createAddressing(imgSize.width, imgSize.height, - (vx_uint32)sizeof(DT), (vx_uint32)dstep), - (void*)dst); - - IVX_CHECK_STATUS(vxuConvertDepth(context, srcImage, dstImage, VX_CONVERT_POLICY_SATURATE, 0)); - -#ifdef VX_VERSION_1_1 - //we should take user memory back before release - //(it's not done automatically according to standard) - srcImage.swapHandle(); dstImage.swapHandle(); #endif - } - catch (RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; + for( ; j < len; j++ ) + dst[j] = float16_t(src[j]); } -template -static bool openvx_cvt(const T* src, size_t sstep, - DT* dst, size_t dstep, Size size) +/*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ) { - (void)src; (void)sstep; (void)dst; (void)dstep; (void)size; - return false; + // the loop is simple enough, so we let the compiler to vectorize it + for( int i = 0; i < len; i++ ) + arr[i] = scaleBiasPairs[i*2 + 1]; } -#define DEFINE_OVX_CVT_SPECIALIZATION(T, DT) \ -template<> \ -bool openvx_cvt(const T *src, size_t sstep, DT *dst, size_t dstep, Size size) \ -{ \ - return _openvx_cvt(src, sstep, dst, dstep, size); \ -} - -DEFINE_OVX_CVT_SPECIALIZATION(uchar, ushort) -DEFINE_OVX_CVT_SPECIALIZATION(uchar, short) -DEFINE_OVX_CVT_SPECIALIZATION(uchar, int) 
-DEFINE_OVX_CVT_SPECIALIZATION(ushort, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(ushort, int) -DEFINE_OVX_CVT_SPECIALIZATION(short, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(short, int) -DEFINE_OVX_CVT_SPECIALIZATION(int, uchar) -DEFINE_OVX_CVT_SPECIALIZATION(int, ushort) -DEFINE_OVX_CVT_SPECIALIZATION(int, short) - -#endif - -template static void -cvt_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size ) +void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len ) { - CV_OVX_RUN( - true, - openvx_cvt(src, sstep, dst, dstep, size) - ) - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - Cvt_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
(src[x]); - t1 = saturate_cast
(src[x+1]); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
(src[x+2]); - t1 = saturate_cast
(src[x+3]); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast
(src[x]); - } + // the loop is simple enough, so we let the compiler to vectorize it + for( int i = 0; i < len; i++ ) + arr[i] = scaleBiasPairs[i*2 + 1]; } -template static void -cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) +}*/ + +template inline void +cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - for( ; size.height--; src += sstep, dst += dstep ) - memcpy(dst, src, size.width*sizeof(src[0])); + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + const int VECSZ = _Twvec::nlanes*2; + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + _Twvec v0, v1; + vx_load_pair_as(src + j, v0, v1); + v_store_pair_as(dst + j, v0, v1); + } +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]); + } } +// in order to reduce the code size, for (16f <-> ...) 
conversions +// we add a conversion function without loop unrolling +template inline void +cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + const int VECSZ = _Twvec::nlanes; + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + _Twvec v; + vx_load_as(src + j, v); + v_store_as(dst + j, v); + } + vx_cleanup(); +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]); + } +} + +static void cvtCopy( const uchar* src, size_t sstep, + uchar* dst, size_t dstep, Size size, size_t elemsize) +{ + size_t len = size.width*elemsize; + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + memcpy( dst, src, len ); + } +} + +#define DEF_CVT_FUNC(suffix, cvtfunc, _Ts, _Td, _Twvec) \ +static void cvt##suffix(const _Ts* src, size_t sstep, uchar*, size_t, \ + _Td* dst, size_t dstep, Size size, void*) \ +{ cvtfunc<_Ts, _Td, _Twvec>(src, sstep, dst, dstep, size); } + +////////////////////// 8u -> ... //////////////////////// + +DEF_CVT_FUNC(8u8s, cvt_, uchar, schar, v_int16) +DEF_CVT_FUNC(8u16u, cvt_, uchar, ushort, v_uint16) +DEF_CVT_FUNC(8u16s, cvt_, uchar, short, v_int16) +DEF_CVT_FUNC(8u32s, cvt_, uchar, int, v_int32) +DEF_CVT_FUNC(8u32f, cvt_, uchar, float, v_float32) +DEF_CVT_FUNC(8u64f, cvt_, uchar, double, v_int32) +//DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32) + +////////////////////// 8s -> ... 
//////////////////////// + +DEF_CVT_FUNC(8s8u, cvt_, schar, uchar, v_int16) +DEF_CVT_FUNC(8s16u, cvt_, schar, ushort, v_uint16) +DEF_CVT_FUNC(8s16s, cvt_, schar, short, v_int16) +DEF_CVT_FUNC(8s32s, cvt_, schar, int, v_int32) +DEF_CVT_FUNC(8s32f, cvt_, schar, float, v_float32) +DEF_CVT_FUNC(8s64f, cvt_, schar, double, v_int32) +//DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32) + +////////////////////// 16u -> ... //////////////////////// + +DEF_CVT_FUNC(16u8u, cvt_, ushort, uchar, v_uint16) +DEF_CVT_FUNC(16u8s, cvt_, ushort, schar, v_uint16) +DEF_CVT_FUNC(16u16s, cvt_, ushort, short, v_int32) +DEF_CVT_FUNC(16u32s, cvt_, ushort, int, v_int32) +DEF_CVT_FUNC(16u32f, cvt_, ushort, float, v_float32) +DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32) +//DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32) + +////////////////////// 16s -> ... //////////////////////// + +DEF_CVT_FUNC(16s8u, cvt_, short, uchar, v_int16) +DEF_CVT_FUNC(16s8s, cvt_, short, schar, v_int16) +DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32) +DEF_CVT_FUNC(16s32s, cvt_, short, int, v_int32) +DEF_CVT_FUNC(16s32f, cvt_, short, float, v_float32) +DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32) +//DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32) + +////////////////////// 32s -> ... //////////////////////// + +DEF_CVT_FUNC(32s8u, cvt_, int, uchar, v_int32) +DEF_CVT_FUNC(32s8s, cvt_, int, schar, v_int32) +DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32) +DEF_CVT_FUNC(32s16s, cvt_, int, short, v_int32) +DEF_CVT_FUNC(32s32f, cvt_, int, float, v_float32) +DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32) +//DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32) + +////////////////////// 32f -> ... 
//////////////////////// + +DEF_CVT_FUNC(32f8u, cvt_, float, uchar, v_float32) +DEF_CVT_FUNC(32f8s, cvt_, float, schar, v_float32) +DEF_CVT_FUNC(32f16u, cvt_, float, ushort, v_float32) +DEF_CVT_FUNC(32f16s, cvt_, float, short, v_float32) +DEF_CVT_FUNC(32f32s, cvt_, float, int, v_float32) +DEF_CVT_FUNC(32f64f, cvt_, float, double, v_float32) +DEF_CVT_FUNC(32f16f, cvt1_,float, float16_t, v_float32) + +////////////////////// 64f -> ... //////////////////////// + +DEF_CVT_FUNC(64f8u, cvt_, double, uchar, v_int32) +DEF_CVT_FUNC(64f8s, cvt_, double, schar, v_int32) +DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32) +DEF_CVT_FUNC(64f16s, cvt_, double, short, v_int32) +DEF_CVT_FUNC(64f32s, cvt_, double, int, v_int32) +DEF_CVT_FUNC(64f32f, cvt_, double, float, v_float32) +//DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32) + +////////////////////// 16f -> ... //////////////////////// + +//DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32) +//DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32) +//DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32) +//DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32) +//DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32) +DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float, v_float32) +//DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32) + +///////////// "conversion" w/o conversion /////////////// + +static void cvt8u(const uchar* src, size_t sstep, uchar*, size_t, uchar* dst, size_t dstep, Size size, void*) +{ cvtCopy(src, sstep, dst, dstep, size, 1); } + +static void cvt16u(const ushort* src, size_t sstep, uchar*, size_t, ushort* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 2); } + +static void cvt32s(const int* src, size_t sstep, uchar*, size_t, int* dst, size_t dstep, Size size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 4); } + +static void cvt64s(const int64* src, size_t sstep, uchar*, size_t, int64* dst, size_t dstep, Size 
size, void*) +{ cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 8); } + + +/* [TODO] Recover IPP calls #if defined(HAVE_IPP) #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \ static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ @@ -1129,7 +278,6 @@ static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ cpy_(src, sstep, dst, dstep, size); \ } - DEF_CPY_FUNC(8u, uchar) DEF_CVT_FUNC_F(8s8u, schar, uchar, 8s8u_C1Rs) DEF_CVT_FUNC_F(16u8u, ushort, uchar, 16u8u_C1R) @@ -1182,7 +330,7 @@ DEF_CVT_FUNC(16s64f, short, double) DEF_CVT_FUNC(32s64f, int, double) DEF_CVT_FUNC(32f64f, float, double) DEF_CPY_FUNC(64s, int64) - +*/ BinaryFunc getConvertFunc(int sdepth, int ddepth) { @@ -1191,114 +339,78 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth) { (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u), (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u), - (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s), (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s), - (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u), - (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s), - 
(BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s), (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s), - (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f), (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s, - (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 + (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f), (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f), - (BinaryFunc)(cvt64s), 0 + (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f) }, { 0, 0, 0, 0, 0, 0, 0, 0 + //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f), + //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u) } }; - return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; } -} // cv:: - -#ifdef HAVE_IPP -namespace cv +#ifdef HAVE_OPENCL +static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int ddepth ) { -static bool ipp_convertTo(Mat &src, Mat &dst, double alpha, double beta) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP() + int type = _src.type(), cn = CV_MAT_CN(type); - IppDataType srcDepth = ippiGetDataType(src.depth()); - IppDataType dstDepth = ippiGetDataType(dst.depth()); - int channels = src.channels(); - - if(src.dims == 0) + _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) ); + int kercn = 1; + int rowsPerWI = 1; + String build_opt = format("-D HALF_SUPPORT -D 
srcT=%s -D dstT=%s -D rowsPerWI=%d%s", + sdepth == CV_32F ? "float" : "half", + sdepth == CV_32F ? "half" : "float", + rowsPerWI, + sdepth == CV_32F ? " -D FLOAT_TO_HALF " : ""); + ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); + if (k.empty()) return false; - ::ipp::IwiImage iwSrc; - ::ipp::IwiImage iwDst; + UMat src = _src.getUMat(); + UMat dst = _dst.getUMat(); - try - { - IppHintAlgorithm mode = ippAlgHintFast; - if(dstDepth == ipp64f || - (dstDepth == ipp32f && (srcDepth == ipp32s || srcDepth == ipp64f)) || - (dstDepth == ipp32s && (srcDepth == ipp32s || srcDepth == ipp64f))) - mode = ippAlgHintAccurate; + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), + dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - if(src.dims <= 2) - { - Size sz = getContinuousSize(src, dst, channels); + k.args(srcarg, dstarg); - iwSrc.Init(ippiSize(sz), srcDepth, 1, NULL, (void*)src.ptr(), src.step); - iwDst.Init(ippiSize(sz), dstDepth, 1, NULL, (void*)dst.ptr(), dst.step); - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiScale, iwSrc, iwDst, alpha, beta, ::ipp::IwiScaleParams(mode)); - } - else - { - const Mat *arrays[] = {&src, &dst, NULL}; - uchar *ptrs[2] = {NULL}; - NAryMatIterator it(arrays, ptrs); - - iwSrc.Init(ippiSize(it.size, 1), srcDepth, channels); - iwDst.Init(ippiSize(it.size, 1), dstDepth, channels); - - for(size_t i = 0; i < it.nplanes; i++, ++it) - { - iwSrc.m_ptr = ptrs[0]; - iwDst.m_ptr = ptrs[1]; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiScale, iwSrc, iwDst, alpha, beta, ::ipp::IwiScaleParams(mode)); - } - } - } - catch (::ipp::IwException) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(alpha); CV_UNUSED(beta); - return false; -#endif + size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); } -} // cv:: #endif +} // cv:: void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const { 
@@ -1331,7 +443,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) _dst.create( dims, size, _type ); Mat dst = _dst.getMat(); - CV_IPP_RUN_FAST(ipp_convertTo(src, dst, alpha, beta )); BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth); double scale[] = {alpha, beta}; @@ -1341,7 +452,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) if( dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale ); } else @@ -1358,118 +468,30 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) //================================================================================================== -namespace cv { - -// template for FP16 HW conversion function -template static void -cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); - -template<> void -cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t dstep, Size size ) -{ - CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size)); - -#if !CV_CPU_FORCE_FP16 - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int x = 0; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -#endif -} - -template<> void -cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t dstep, Size size ) -{ - CV_CPU_CALL_FP16_(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size)); - -#if !CV_CPU_FORCE_FP16 - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - for ( int x = 0; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -#endif -} - -#define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype) \ -static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \ -dtype* dst, size_t dstep, Size size, void*) \ -{ \ - cvtScaleHalf_(src, 
sstep, dst, dstep, size); \ -} - -DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short) -DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float) - -static UnaryFunc getConvertFuncFp16(int ddepth) -{ - static UnaryFunc cvtTab[] = - { - 0, 0, 0, - (UnaryFunc)(cvtScaleHalf32f16f), 0, (UnaryFunc)(cvtScaleHalf16f32f), - 0, 0, - }; - return cvtTab[CV_MAT_DEPTH(ddepth)]; -} - - -#ifdef HAVE_OPENCL - -static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int ddepth ) -{ - int type = _src.type(), cn = CV_MAT_CN(type); - - _dst.createSameSize( _src, CV_MAKETYPE(ddepth, cn) ); - int kercn = 1; - int rowsPerWI = 1; - String build_opt = format("-D HALF_SUPPORT -D dstT=%s -D srcT=%s -D rowsPerWI=%d%s", - ddepth == CV_16S ? "half" : "float", - ddepth == CV_16S ? "float" : "half", - rowsPerWI, - ddepth == CV_16S ? " -D FLOAT_TO_HALF " : ""); - ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - UMat dst = _dst.getUMat(); - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), - dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - - k.args(srcarg, dstarg); - - size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -#endif - -} //cv:: - -void cv::convertFp16( InputArray _src, OutputArray _dst) +void cv::convertFp16( InputArray _src, OutputArray _dst ) { CV_INSTRUMENT_REGION() - int ddepth = 0; - switch( _src.depth() ) + int sdepth = _src.depth(), ddepth = 0; + BinaryFunc func = 0; + + switch( sdepth ) { case CV_32F: - ddepth = CV_16S; + if(_dst.fixedType()) + { + ddepth = _dst.depth(); + CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/); + CV_Assert(_dst.channels() == _src.channels()); + } + else + ddepth = CV_16S; + func = (BinaryFunc)cvt32f16f; break; case CV_16S: + //case CV_16F: ddepth = CV_32F; + func = (BinaryFunc)cvt16f32f; break; default: CV_Error(Error::StsUnsupportedFormat, "Unsupported 
input depth"); @@ -1477,21 +499,21 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) } CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_convertFp16(_src, _dst, ddepth)) + ocl_convertFp16(_src, _dst, sdepth, ddepth)) Mat src = _src.getMat(); int type = CV_MAKETYPE(ddepth, src.channels()); _dst.create( src.dims, src.size, type ); Mat dst = _dst.getMat(); - UnaryFunc func = getConvertFuncFp16(ddepth); int cn = src.channels(); + CV_Assert( func != 0 ); if( src.dims <= 2 ) { Size sz = getContinuousSize(src, dst, cn); - func( src.data, src.step, dst.data, dst.step, sz, 0); + func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0); } else { @@ -1501,6 +523,6 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) Size sz((int)(it.size*cn), 1); for( size_t i = 0; i < it.nplanes; i++, ++it ) - func(ptrs[0], 1, ptrs[1], 1, sz, 0); + func(ptrs[0], 0, 0, 0, ptrs[1], 0, sz, 0); } } diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp deleted file mode 100644 index 7168e8d643..0000000000 --- a/modules/core/src/convert.fp16.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// This file is part of OpenCV project. 
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_FP16 -{ -#if !defined(CV_NEON) || !CV_NEON -const static int cVectorWidth = 8; - -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - __m256 v_src = _mm256_loadu_ps(src + x); - - // round to nearest even - __m128i v_dst = _mm256_cvtps_ph(v_src, 0); - - _mm_storeu_si128((__m128i*)(dst + x), v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} - -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - __m128i v_src = _mm_loadu_si128((__m128i*)(src + x)); - - __m256 v_dst = _mm256_cvtph_ps(v_src); - - _mm256_storeu_ps(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} -#elif CV_NEON -const static int cVectorWidth = 4; - -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) - { - float32x4_t v_src = vld1q_f32(src + x); - float16x4_t v_dst = vcvt_f16_f32(v_src); - - cv_vst1_f16(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - 
dst[x] = convertFp16SW(src[x]); - } - } -} - -void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size ) -{ - CV_INSTRUMENT_REGION() - - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth ) - { - float16x4_t v_src = cv_vld1_f16((__fp16*)src + x); - - float32x4_t v_dst = vcvt_f32_f16(v_src); - - vst1q_f32(dst + x, v_dst); - } - - for ( ; x < size.width; x++ ) - { - dst[x] = convertFp16SW(src[x]); - } - } -} -#else -#error "Unsupported build configuration" -#endif -} - -} // cv:: diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index 580076367e..0d0aa3a770 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -8,192 +8,402 @@ #include "opencv2/core/types.hpp" -namespace -{ -float convertFp16SW(short fp16); -short convertFp16SW(float fp32); - -#if !CV_FP16_TYPE -// const numbers for floating points format -const unsigned int kShiftSignificand = 13; -const unsigned int kMaskFp16Significand = 0x3ff; -const unsigned int kBiasFp16Exponent = 15; -const unsigned int kBiasFp32Exponent = 127; -#endif - -#if CV_FP16_TYPE -inline float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf a; - a.i = fp16; - return (float)a.h; -} -#else -inline float convertFp16SW(short fp16) -{ - // Fp16 -> Fp32 - Cv16suf b; - b.i = fp16; - int exponent = b.fmt.exponent - kBiasFp16Exponent; - int significand = b.fmt.significand; - - Cv32suf a; - a.i = 0; - a.fmt.sign = b.fmt.sign; // sign bit - if( exponent == 16 ) - { - // Inf or NaN - a.i = a.i | 0x7F800000; - if( significand != 0 ) - { - // NaN -#if defined(__x86_64__) || defined(_M_X64) - // 64bit - a.i = a.i | 0x7FC00000; -#endif - a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); - } - return a.f; - } - else if ( exponent == -(int)kBiasFp16Exponent ) - { - // subnormal in Fp16 - if( 
significand == 0 ) - { - // zero - return a.f; - } - else - { - int shift = -1; - while( ( significand & 0x400 ) == 0 ) - { - significand = significand << 1; - shift++; - } - significand = significand & kMaskFp16Significand; - exponent -= shift; - } - } - - a.fmt.exponent = (exponent+kBiasFp32Exponent); - a.fmt.significand = significand << kShiftSignificand; - return a.f; -} -#endif - -#if CV_FP16_TYPE -inline short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv16suf a; - a.h = (__fp16)fp32; - return a.i; -} -#else -inline short convertFp16SW(float fp32) -{ - // Fp32 -> Fp16 - Cv32suf a; - a.f = fp32; - int exponent = a.fmt.exponent - kBiasFp32Exponent; - int significand = a.fmt.significand; - - Cv16suf result; - result.i = 0; - unsigned int absolute = a.i & 0x7fffffff; - if( 0x477ff000 <= absolute ) - { - // Inf in Fp16 - result.i = result.i | 0x7C00; - if( exponent == 128 && significand != 0 ) - { - // NaN - result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) ); - } - } - else if ( absolute < 0x33000001 ) - { - // too small for fp16 - result.i = 0; - } - else if ( absolute < 0x387fe000 ) - { - // subnormal in Fp16 - int fp16Significand = significand | 0x800000; - int bitShift = (-exponent) - 1; - fp16Significand = fp16Significand >> bitShift; - - // special cases to round up - bitShift = exponent + 24; - int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) ); - if( absolute == 0x33c00000 ) - { - result.i = 2; - } - else - { - if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) ) - { - fp16Significand++; - } - result.i = (short)fp16Significand; - } - } - else - { - // usual situation - // exponent - result.fmt.exponent = ( exponent + kBiasFp16Exponent ); - - // significand; - short fp16Significand = (short)(significand >> kShiftSignificand); - result.fmt.significand = fp16Significand; - - // special cases to round up - short lsb10bitsFp32 = 
(significand & 0x1fff); - short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 ); - if( threshold <= lsb10bitsFp32 ) - { - result.i++; - } - else if ( fp16Significand == kMaskFp16Significand && exponent == -15) - { - result.i++; - } - } - - // sign bit - result.fmt.sign = a.fmt.sign; - return result.i; -} -#endif - -} - namespace cv { -namespace opt_FP16 + +#if CV_SIMD + +static inline void vx_load_as(const uchar* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); } + +static inline void vx_load_as(const schar* ptr, v_float32& a) +{ a = v_cvt_f32(vx_load_expand_q(ptr)); } + +static inline void vx_load_as(const ushort* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); } + +static inline void vx_load_as(const short* ptr, v_float32& a) +{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); } + +static inline void vx_load_as(const int* ptr, v_float32& a) +{ a = v_cvt_f32(vx_load(ptr)); } + +static inline void vx_load_as(const float* ptr, v_float32& a) +{ a = vx_load(ptr); } + +static inline void vx_load_as(const float16_t* ptr, v_float32& a) +{ a = vx_load_expand(ptr); } + +static inline void v_store_as(ushort* ptr, const v_float32& a) +{ v_pack_u_store(ptr, v_round(a)); } + +static inline void v_store_as(short* ptr, const v_float32& a) +{ v_pack_store(ptr, v_round(a)); } + +static inline void v_store_as(int* ptr, const v_float32& a) +{ v_store(ptr, v_round(a)); } + +static inline void v_store_as(float* ptr, const v_float32& a) +{ v_store(ptr, a); } + +static inline void v_store_as(float16_t* ptr, const v_float32& a) +{ v_pack_store(ptr, a); } + +static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b) +{ v_expand(vx_load(ptr), a, b); } + +static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b) { -void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size ); -void cvtScaleHalf_SIMD16f32f( const short* src, 
size_t sstep, float* dst, size_t dstep, cv::Size size ); + const v_int8 z = vx_setzero_s8(); + v_int16 sa, sb; + v_expand(v_max(vx_load(ptr), z), sa, sb); + a = v_reinterpret_as_u16(sa); + b = v_reinterpret_as_u16(sb); } -namespace opt_AVX2 + +static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } + +static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b) { -void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width); + v_uint16 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_reinterpret_as_s16(ua); + b = v_reinterpret_as_s16(ub); } -namespace opt_SSE4_1 + +static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b) +{ v_expand(vx_load(ptr), a, b); } + +static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } + +static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b) { - int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift); - int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift); - int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width); + v_uint32 ua, ub; + v_expand(vx_load_expand(ptr), ua, ub); + a = v_reinterpret_as_s32(ua); + b = v_reinterpret_as_s32(ub); } + 
+static inline void vx_load_pair_as(const schar* ptr, v_int32& a, v_int32& b) +{ v_expand(vx_load_expand(ptr), a, b); } + +static inline void vx_load_pair_as(const ushort* ptr, v_int32& a, v_int32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_reinterpret_as_s32(ua); + b = v_reinterpret_as_s32(ub); +} + +static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b) +{ + v_expand(vx_load(ptr), a, b); +} + +static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b) +{ + a = vx_load(ptr); + b = vx_load(ptr + v_int32::nlanes); +} + +static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load_expand(ptr), ua, ub); + a = v_cvt_f32(v_reinterpret_as_s32(ua)); + b = v_cvt_f32(v_reinterpret_as_s32(ub)); +} + +static inline void vx_load_pair_as(const schar* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia, ib; + v_expand(vx_load_expand(ptr), ia, ib); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const ushort* ptr, v_float32& a, v_float32& b) +{ + v_uint32 ua, ub; + v_expand(vx_load(ptr), ua, ub); + a = v_cvt_f32(v_reinterpret_as_s32(ua)); + b = v_cvt_f32(v_reinterpret_as_s32(ub)); +} + +static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia, ib; + v_expand(vx_load(ptr), ia, ib); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b) +{ + v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes); + a = v_cvt_f32(ia); + b = v_cvt_f32(ib); +} + +static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b) +{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); } + +//static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b) +//{ +// a = vx_load_expand(ptr); +// b = vx_load_expand(ptr + v_float32::nlanes); +//} + + +static inline void v_store_pair_as(uchar* ptr, 
const v_uint16& a, const v_uint16& b) +{ + v_store(ptr, v_pack(a, b)); +} + +static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16& b) +{ + const v_uint8 maxval = vx_setall_u8((uchar)std::numeric_limits::max()); + v_uint8 v = v_pack(a, b); + v_store(ptr, v_reinterpret_as_s8(v_min(v, maxval))); +} + +static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b) +{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); } + +static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, v_pack_u(a, b)); } + +static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b) +{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); } + +static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b) +{ v_pack_u_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(schar* ptr, const v_int32& a, const v_int32& b) +{ v_pack_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(ushort* ptr, const v_int32& a, const v_int32& b) +{ v_store(ptr, v_pack_u(a, b)); } + +static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32& b) +{ v_store(ptr, v_pack(a, b)); } + +static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b) +{ + v_store(ptr, a); + v_store(ptr + v_int32::nlanes, b); +} + +static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b) +{ v_pack_u_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(schar* ptr, const v_float32& a, const v_float32& b) +{ v_pack_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(ushort* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, v_pack_u(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(short* ptr, const v_float32& a, 
const v_float32& b) +{ v_store(ptr, v_pack(v_round(a), v_round(b))); } + +static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32& b) +{ + v_int32 ia = v_round(a), ib = v_round(b); + v_store(ptr, ia); + v_store(ptr + v_int32::nlanes, ib); +} + +static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b) +{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); } + +#if CV_SIMD_64F + +static inline void vx_load_as(const double* ptr, v_float32& a) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + a = v_cvt_f32(v0, v1); +} + +static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + v_int32 iv0 = v_round(v0), iv1 = v_round(v1); + v_int32 iv2 = v_round(v2), iv3 = v_round(v3); + a = v_combine_low(iv0, iv1); + b = v_combine_low(iv2, iv3); +} + +static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b) +{ + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + a = v_cvt_f32(v0, v1); + b = v_cvt_f32(v2, v3); +} + +static inline void vx_load_pair_as(const uchar* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = v_reinterpret_as_s32(vx_load_expand_q(ptr)); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const schar* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load_expand_q(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const ushort* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = v_reinterpret_as_s32(vx_load_expand(ptr)); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const short* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = 
vx_load_expand(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const int* ptr, v_float64& a, v_float64& b) +{ + v_int32 v0 = vx_load(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b) +{ + v_float32 v0 = vx_load(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} + +static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b) +{ + a = vx_load(ptr); + b = vx_load(ptr + v_float64::nlanes); +} + +//static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b) +//{ +// v_float32 v0 = vx_load_expand(ptr); +// a = v_cvt_f64(v0); +// b = v_cvt_f64_high(v0); +//} + +static inline void v_store_as(double* ptr, const v_float32& a) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); +} + +static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); + + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + v_float64::nlanes*2, fb0); + v_store(ptr + v_float64::nlanes*3, fb1); +} + +static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b) +{ + v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); + v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); + + v_store(ptr, fa0); + v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + v_float64::nlanes*2, fb0); + v_store(ptr + v_float64::nlanes*3, fb1); +} + +static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b) +{ + v_store(ptr, a); + v_store(ptr + v_float64::nlanes, b); +} + +static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b) +{ + v_int32 ia = v_round(a), ib = v_round(b); + v_store(ptr, v_combine_low(ia, ib)); +} + +static 
inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float64& b) +{ + v_float32 v = v_cvt_f32(a, b); + v_store(ptr, v); +} + +//static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b) +//{ +// v_float32 v = v_cvt_f32(a, b); +// v_pack_store(ptr, v); +//} + +#else + +static inline void vx_load_as(const double* ptr, v_float32& a) +{ + const int VECSZ = v_float32::nlanes; + float buf[VECSZ*2]; + + for( int i = 0; i < VECSZ; i++ ) + buf[i] = saturate_cast(ptr[i]); + a = vx_load(buf); +} + +template +static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b) +{ + const int VECSZ = _Tdvec::nlanes; + typename _Tdvec::lane_type buf[VECSZ*2]; + + for( int i = 0; i < VECSZ*2; i++ ) + buf[i] = saturate_cast(ptr[i]); + a = vx_load(buf); + b = vx_load(buf + VECSZ); +} + +static inline void v_store_as(double* ptr, const v_float32& a) +{ + const int VECSZ = v_float32::nlanes; + float buf[VECSZ]; + + v_store(buf, a); + for( int i = 0; i < VECSZ; i++ ) + ptr[i] = (double)buf[i]; +} + +template +static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b) +{ + const int VECSZ = _Tsvec::nlanes; + typename _Tsvec::lane_type buf[VECSZ*2]; + + v_store(buf, a); v_store(buf + VECSZ, b); + for( int i = 0; i < VECSZ*2; i++ ) + ptr[i] = (double)buf[i]; +} + +#endif /////////// CV_SIMD_64F + +#endif /////////// CV_SIMD + } #endif // SRC_CONVERT_HPP diff --git a/modules/core/src/convert.sse4_1.cpp b/modules/core/src/convert.sse4_1.cpp deleted file mode 100644 index 3c18063d1d..0000000000 --- a/modules/core/src/convert.sse4_1.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// This file is part of OpenCV project. 
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html - - -#include "precomp.hpp" -#include "convert.hpp" - -namespace cv -{ -namespace opt_SSE4_1 -{ - -int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, 
float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = 
_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift) -{ - int x = 0; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width) -{ - int x = 0; - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, 
v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); - } - - return x; -} - -} -} // cv:: - -/* End of file. */ diff --git a/modules/core/src/convert_scale.cpp b/modules/core/src/convert_scale.cpp index 25f5a963b7..0d4b5151a3 100644 --- a/modules/core/src/convert_scale.cpp +++ b/modules/core/src/convert_scale.cpp @@ -14,1623 +14,278 @@ namespace cv { -template -struct cvtScaleAbs_SIMD +template inline void +cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, float a, float b ) { - int operator () (const T *, DT *, int, WT, WT) const - { - return 0; - } -}; - -#if CV_SIMD128 - -static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_uint32x4 v_src0, v_src1; - v_expand(v_load_expand(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); - b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); -} - -static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_int32x4 v_src0, v_src1; - v_expand(v_load_expand(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_src0); - b = v_shift + v_scale * v_cvt_f32(v_src1); -} - -static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_uint32x4 v_src0, v_src1; - v_expand(v_load(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); - b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); -} - -static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - v_int32x4 v_src0, v_src1; 
- v_expand(v_load(src), v_src0, v_src1); - - a = v_shift + v_scale * v_cvt_f32(v_src0); - b = v_shift + v_scale * v_cvt_f32(v_src1); -} - -static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) -{ - a = v_shift + v_scale * v_cvt_f32(v_load(src)); - b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes)); -} - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); - v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); - v_dst_0 = v_abs(v_dst_0); - v_dst_1 = v_abs(v_dst_1); - v_dst_2 = v_abs(v_dst_2); - v_dst_3 = v_abs(v_dst_3); - - v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3)); - v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1)); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth*2; x += cWidth*2) - { - v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); - v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); - v_dst_0 = v_abs(v_dst_0); - v_dst_1 = v_abs(v_dst_1); - v_dst_2 = v_abs(v_dst_2); - v_dst_3 = v_abs(v_dst_3); - - 
v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1)); - v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3)); - v_store(dst + x, v_pack(v_dsti_0, v_dsti_1)); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst0, v_dst1; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); - v_dst0 = v_abs(v_dst0); - v_dst1 = v_abs(v_dst1); - - v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst0, v_dst1; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); - v_dst0 = v_abs(v_dst0); - v_dst1 = v_abs(v_dst1); - - v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - const int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale; - v_dst_0 = v_abs(v_dst_0 + v_shift); - - v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * 
v_scale; - v_dst_1 = v_abs(v_dst_1 + v_shift); - - v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_pack_u_store(dst + x, v_dst); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst_0 = v_load(src + x) * v_scale; - v_dst_0 = v_abs(v_dst_0 + v_shift); - - v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale; - v_dst_1 = v_abs(v_dst_1 + v_shift); - - v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); - v_pack_u_store(dst + x, v_dst); - } - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const double * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - - if (hasSIMD128()) - { - v_float32x4 v_scale = v_setall_f32(scale); - v_float32x4 v_shift = v_setall_f32(shift); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_src1, v_src2, v_dummy; - v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy); - v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy); - - v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift); - v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift); - - v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_u_store(dst + x, v_dst_i); - } - } - - return x; - } -}; -#endif // CV_SIMD128_64F - +#if CV_SIMD + v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); + const int VECSZ = v_float32::nlanes*2; #endif - -template static void -cvtScaleAbs_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ - sstep /= 
sizeof(src[0]); - dstep /= sizeof(dst[0]); - cvtScaleAbs_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
(std::abs(src[x]*scale + shift)); - t1 = saturate_cast
(std::abs(src[x+1]*scale + shift)); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
(std::abs(src[x+2]*scale + shift)); - t1 = saturate_cast
(std::abs(src[x+3]*scale + shift)); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast
(std::abs(src[x]*scale + shift)); - } -} - -template -struct cvtScale_SIMD -{ - int operator () (const T *, DT *, int, WT, WT) const - { - return 0; - } -}; - -#if CV_SIMD128 - -// from uchar - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - 
} - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from schar - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - 
cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = 
v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from ushort - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale 
= v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - 
v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_round(v_src1)); - v_store(dst + x + cWidth, v_round(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from short - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, ushort * dst, int width, float scale, float 
shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; - -// from int - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - 
cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_src1, v_src2; - v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - - v_int16x8 v_dst = 
v_pack(v_round(v_src1), v_round(v_src2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, int * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; - v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); - v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); - v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, float * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; - v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); - v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); - v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2))); - } - } - return x; - } -}; -#endif //CV_SIMD128_64F - -// from float - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - 
v_pack_u_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store_low(dst + x, v_pack(v_dst, v_dst)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); - v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - 
-template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift)); - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth; x += cWidth) - v_store(dst + x, v_load(src + x) * v_scale + v_shift); - } - return x; - } -}; - -#if CV_SIMD128_64F - -static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2) -{ - int cWidth = v_float64x2::nlanes; - v_float64x2 v_src1 = v_shift + v_scale * v_load(src); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth); - v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2); - v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3); - v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)); - v_dst2 = v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4)); -} - -static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2) -{ - v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); - v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); - v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); - v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - - v_store(dst, v_dst1); - v_store(dst + v_float64x2::nlanes, v_dst2); - v_store(dst + v_float64x2::nlanes * 2, v_dst3); - v_store(dst + 
v_float64x2::nlanes * 3, v_dst4); -} - -static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2) -{ - v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); - v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); - v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); - v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - - v_store(dst, v_dst1); - v_store(dst + v_float64x2::nlanes, v_dst2); - v_store(dst + v_float64x2::nlanes * 2, v_dst3); - v_store(dst + v_float64x2::nlanes * 3, v_dst4); -} - -// from double - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_pack_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; -#if CV_TRY_SSE4_1 - if (CV_CPU_HAS_SUPPORT_SSE4_1) - return 
opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift); -#endif - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_uint16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); - int cWidth = v_int16x8::nlanes; - for (; x <= width - cWidth; x += cWidth) - { - v_float32x4 v_dst1, v_dst2; - v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); - v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); - v_store(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, int * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - - v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, float * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = 
v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - v_float32x4 v_dst1 = v_cvt_f32(v_src1); - v_float32x4 v_dst2 = v_cvt_f32(v_src2); - - v_store(dst + x, v_combine_low(v_dst1, v_dst2)); - } - } - return x; - } -}; - -// to double - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint32x4 v_src1, v_src2; - v_expand(v_load_expand(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift - , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1, v_src2; - v_expand(v_load_expand(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_uint32x4 v_src1, v_src2; - v_expand(v_load(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift - , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); - } - } - return x; - } -}; - -template <> -struct 
cvtScale_SIMD -{ - int operator () (const short * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 4; x += cWidth * 4) - { - v_int32x4 v_src1, v_src2; - v_expand(v_load(src + x), v_src1, v_src2); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_int32x4 v_src1 = v_load(src + x); - v_int32x4 v_src2 = v_load(src + x + cWidth); - v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float32x4::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float32x4 v_src1 = v_load(src + x); - v_float32x4 v_src2 = v_load(src + x + cWidth); - v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); - } - } - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const double * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - if (hasSIMD128()) - { - v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); - int cWidth = v_float64x2::nlanes; - for (; x <= width - cWidth * 2; x += cWidth * 2) - { - v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); - v_float64x2 v_src2 
= v_shift + v_scale * v_load(src + x + cWidth); - v_store(dst + x, v_src1); - v_store(dst + x + cWidth, v_src2); - } - } - return x; - } -}; -#endif -#endif - -template static void -cvtScale_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - cvtScale_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) + int j = 0; +#if CV_SIMD + for( ; j < size.width; j += VECSZ ) { - DT t0, t1; - t0 = saturate_cast
(src[x]*scale + shift); - t1 = saturate_cast
(src[x+1]*scale + shift); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
(src[x+2]*scale + shift); - t1 = saturate_cast
(src[x+3]*scale + shift); - dst[x+2] = t0; dst[x+3] = t1; + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + v_float32 v0, v1; + vx_load_pair_as(src + j, v0, v1); + v0 = v_fma(v0, va, vb); + v1 = v_fma(v1, va, vb); + v_store_pair_as(dst + j, v_abs(v0), v_abs(v1)); } - #endif - - for( ; x < size.width; x++ ) - dst[x] = saturate_cast
(src[x]*scale + shift); +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(std::abs(src[j]*a + b)); } } -template<> void -cvtScale_( const short* src, size_t sstep, - int* dst, size_t dstep, Size size, - float scale, float shift ) +// variant for convrsions 16f <-> ... w/o unrolling +template inline void +cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, float a, float b ) { +#if CV_SIMD + v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); + const int VECSZ = v_float32::nlanes*2; +#endif sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - for( ; size.height--; src += sstep, dst += dstep ) + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { - int x = 0; - #if CV_TRY_AVX2 - if (CV_CPU_HAS_SUPPORT_AVX2) + int j = 0; +#if CV_SIMD + for( ; j < size.width; j += VECSZ ) { - opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width); - continue; - } - #endif - #if CV_SIMD128 - if (hasSIMD128()) - { - v_float32x4 v_shift = v_setall_f32(shift); - v_float32x4 v_scale = v_setall_f32(scale); - int cWidth = v_int32x4::nlanes; - for (; x <= size.width - cWidth * 2; x += cWidth * 2) + if( j > size.width - VECSZ ) { - v_int16x8 v_src = v_load(src + x); - v_int32x4 v_src1, v_src2; - v_expand(v_src, v_src1, v_src2); - v_float32x4 v_tmp1 = v_cvt_f32(v_src1); - v_float32x4 v_tmp2 = v_cvt_f32(v_src2); - - v_tmp1 = v_tmp1 * v_scale + v_shift; - v_tmp2 = v_tmp2 * v_scale + v_shift; - - v_store(dst + x, v_round(v_tmp1)); - v_store(dst + x + cWidth, v_round(v_tmp2)); + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; } + v_float32 v0; + vx_load_as(src + j, v0); + v0 = v_fma(v0, va, vb); + v_store_as(dst + j, v_abs(v0)); } - #endif - - for(; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]*scale + shift); +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]*a + b); } } +template inline void +cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size 
size, float a, float b ) +{ +#if CV_SIMD + v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); + const int VECSZ = v_float32::nlanes*2; +#endif + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + v_float32 v0, v1; + vx_load_pair_as(src + j, v0, v1); + v0 = v_fma(v0, va, vb); + v1 = v_fma(v1, va, vb); + v_store_pair_as(dst + j, v0, v1); + } +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]*a + b); + } +} + +// variant for convrsions 16f <-> ... w/o unrolling +template inline void +cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, float a, float b ) +{ +#if CV_SIMD + v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); + const int VECSZ = v_float32::nlanes; +#endif + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + v_float32 v0; + vx_load_as(src + j, v0); + v0 = v_fma(v0, va, vb); + v_store_as(dst + j, v0); + } +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]*a + b); + } +} + + +template inline void +cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, double a, double b ) +{ +#if CV_SIMD_64F + v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b); + const int VECSZ = v_float64::nlanes*2; +#endif + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; +#if CV_SIMD_64F + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == 
(_Ts*)dst ) + break; + j = size.width - VECSZ; + } + v_float64 v0, v1; + vx_load_pair_as(src + j, v0, v1); + v0 = v_fma(v0, va, vb); + v1 = v_fma(v1, va, vb); + v_store_pair_as(dst + j, v0, v1); + } +#endif + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td>(src[j]*a + b); + } +} //================================================================================================== -#define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \ +#define DEF_CVT_SCALE_ABS_FUNC(suffix, cvt, stype, dtype, wtype) \ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ - dtype* dst, size_t dstep, Size size, double* scale) \ + dtype* dst, size_t dstep, Size size, double* scale) \ { \ - tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ + cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ } -#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ +#define DEF_CVT_SCALE_FUNC(suffix, cvt, stype, dtype, wtype) \ static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ -dtype* dst, size_t dstep, Size size, double* scale) \ + dtype* dst, size_t dstep, Size size, double* scale) \ { \ - cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ + cvt(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ } -DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float) -DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float) -DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float) -DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float) -DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float) -DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float) -DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float) +DEF_CVT_SCALE_ABS_FUNC(8u, cvtabs_32f, uchar, uchar, float) +DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtabs_32f, schar, uchar, float) +DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtabs_32f, ushort, 
uchar, float) +DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtabs_32f, short, uchar, float) +DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtabs_32f, int, uchar, float) +DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtabs_32f, float, uchar, float) +DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtabs_32f, double, uchar, float) +DEF_CVT_SCALE_FUNC(8u, cvt_32f, uchar, uchar, float) +DEF_CVT_SCALE_FUNC(8s8u, cvt_32f, schar, uchar, float) +DEF_CVT_SCALE_FUNC(16u8u, cvt_32f, ushort, uchar, float) +DEF_CVT_SCALE_FUNC(16s8u, cvt_32f, short, uchar, float) +DEF_CVT_SCALE_FUNC(32s8u, cvt_32f, int, uchar, float) +DEF_CVT_SCALE_FUNC(32f8u, cvt_32f, float, uchar, float) +DEF_CVT_SCALE_FUNC(64f8u, cvt_32f, double, uchar, float) +//DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float) -DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) -DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) -DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) -DEF_CVT_SCALE_FUNC(16s8u, short, uchar, float) -DEF_CVT_SCALE_FUNC(32s8u, int, uchar, float) -DEF_CVT_SCALE_FUNC(32f8u, float, uchar, float) -DEF_CVT_SCALE_FUNC(64f8u, double, uchar, float) +DEF_CVT_SCALE_FUNC(8u8s, cvt_32f, uchar, schar, float) +DEF_CVT_SCALE_FUNC(8s, cvt_32f, schar, schar, float) +DEF_CVT_SCALE_FUNC(16u8s, cvt_32f, ushort, schar, float) +DEF_CVT_SCALE_FUNC(16s8s, cvt_32f, short, schar, float) +DEF_CVT_SCALE_FUNC(32s8s, cvt_32f, int, schar, float) +DEF_CVT_SCALE_FUNC(32f8s, cvt_32f, float, schar, float) +DEF_CVT_SCALE_FUNC(64f8s, cvt_32f, double, schar, float) +//DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float) -DEF_CVT_SCALE_FUNC(8u8s, uchar, schar, float) -DEF_CVT_SCALE_FUNC(8s, schar, schar, float) -DEF_CVT_SCALE_FUNC(16u8s, ushort, schar, float) -DEF_CVT_SCALE_FUNC(16s8s, short, schar, float) -DEF_CVT_SCALE_FUNC(32s8s, int, schar, float) -DEF_CVT_SCALE_FUNC(32f8s, float, schar, float) -DEF_CVT_SCALE_FUNC(64f8s, double, schar, float) +DEF_CVT_SCALE_FUNC(8u16u, cvt_32f, uchar, ushort, float) +DEF_CVT_SCALE_FUNC(8s16u, cvt_32f, schar, ushort, float) +DEF_CVT_SCALE_FUNC(16u, cvt_32f, 
ushort, ushort, float) +DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short, ushort, float) +DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int, ushort, float) +DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float, ushort, float) +DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float) +//DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float) -DEF_CVT_SCALE_FUNC(8u16u, uchar, ushort, float) -DEF_CVT_SCALE_FUNC(8s16u, schar, ushort, float) -DEF_CVT_SCALE_FUNC(16u, ushort, ushort, float) -DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float) -DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float) -DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float) -DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float) +DEF_CVT_SCALE_FUNC(8u16s, cvt_32f, uchar, short, float) +DEF_CVT_SCALE_FUNC(8s16s, cvt_32f, schar, short, float) +DEF_CVT_SCALE_FUNC(16u16s, cvt_32f, ushort, short, float) +DEF_CVT_SCALE_FUNC(16s, cvt_32f, short, short, float) +DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int, short, float) +DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float, short, float) +DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float) +//DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float) -DEF_CVT_SCALE_FUNC(8u16s, uchar, short, float) -DEF_CVT_SCALE_FUNC(8s16s, schar, short, float) -DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float) -DEF_CVT_SCALE_FUNC(16s, short, short, float) -DEF_CVT_SCALE_FUNC(32s16s, int, short, float) -DEF_CVT_SCALE_FUNC(32f16s, float, short, float) -DEF_CVT_SCALE_FUNC(64f16s, double, short, float) +DEF_CVT_SCALE_FUNC(8u32s, cvt_32f, uchar, int, float) +DEF_CVT_SCALE_FUNC(8s32s, cvt_32f, schar, int, float) +DEF_CVT_SCALE_FUNC(16u32s, cvt_32f, ushort, int, float) +DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short, int, float) +DEF_CVT_SCALE_FUNC(32s, cvt_64f, int, int, double) +DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float, int, float) +DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double) +//DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float) -DEF_CVT_SCALE_FUNC(8u32s, uchar, int, float) -DEF_CVT_SCALE_FUNC(8s32s, 
schar, int, float) -DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float) -DEF_CVT_SCALE_FUNC(16s32s, short, int, float) -DEF_CVT_SCALE_FUNC(32s, int, int, double) -DEF_CVT_SCALE_FUNC(32f32s, float, int, float) -DEF_CVT_SCALE_FUNC(64f32s, double, int, double) +DEF_CVT_SCALE_FUNC(8u32f, cvt_32f, uchar, float, float) +DEF_CVT_SCALE_FUNC(8s32f, cvt_32f, schar, float, float) +DEF_CVT_SCALE_FUNC(16u32f, cvt_32f, ushort, float, float) +DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short, float, float) +DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int, float, float) +DEF_CVT_SCALE_FUNC(32f, cvt_32f, float, float, float) +DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double) +//DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float) -DEF_CVT_SCALE_FUNC(8u32f, uchar, float, float) -DEF_CVT_SCALE_FUNC(8s32f, schar, float, float) -DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float) -DEF_CVT_SCALE_FUNC(16s32f, short, float, float) -DEF_CVT_SCALE_FUNC(32s32f, int, float, double) -DEF_CVT_SCALE_FUNC(32f, float, float, float) -DEF_CVT_SCALE_FUNC(64f32f, double, float, double) +DEF_CVT_SCALE_FUNC(8u64f, cvt_64f, uchar, double, double) +DEF_CVT_SCALE_FUNC(8s64f, cvt_64f, schar, double, double) +DEF_CVT_SCALE_FUNC(16u64f, cvt_64f, ushort, double, double) +DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short, double, double) +DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int, double, double) +DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float, double, double) +DEF_CVT_SCALE_FUNC(64f, cvt_64f, double, double, double) +//DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double) -DEF_CVT_SCALE_FUNC(8u64f, uchar, double, double) -DEF_CVT_SCALE_FUNC(8s64f, schar, double, double) -DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double) -DEF_CVT_SCALE_FUNC(16s64f, short, double, double) -DEF_CVT_SCALE_FUNC(32s64f, int, double, double) -DEF_CVT_SCALE_FUNC(32f64f, float, double, double) -DEF_CVT_SCALE_FUNC(64f, double, double, double) +/*DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float) +DEF_CVT_SCALE_FUNC(8s16f, cvt1_32f, 
schar, float16_t, float) +DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float) +DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short, float16_t, float) +DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int, float16_t, float) +DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float, float16_t, float) +DEF_CVT_SCALE_FUNC(64f16f, cvt_64f, double, float16_t, double) +DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)*/ static BinaryFunc getCvtScaleAbsFunc(int depth) { @@ -1651,41 +306,44 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) { (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), - (BinaryFunc)cvtScale64f8u, 0 + (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), - (BinaryFunc)cvtScale64f8s, 0 + (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), - (BinaryFunc)cvtScale64f16u, 0 + (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), - (BinaryFunc)cvtScale64f16s, 0 + (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), 
(BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), - (BinaryFunc)cvtScale64f32s, 0 + (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), - (BinaryFunc)cvtScale64f32f, 0 + (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f }, { (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, - (BinaryFunc)cvtScale64f, 0 + (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f }, { 0, 0, 0, 0, 0, 0, 0, 0 - } + /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f, + (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f, + (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/ + }, }; return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 8f93d4bb72..c3f75bb384 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -258,7 +258,7 @@ void Mat::copyTo( OutputArray _dst ) const UMat dst = _dst.getUMat(); CV_Assert(dst.u != NULL); size_t i, sz[CV_MAX_DIM] = {0}, dstofs[CV_MAX_DIM], esz = elemSize(); - CV_Assert(dims >= 0 && dims < CV_MAX_DIM); + CV_Assert(dims > 0 && dims < CV_MAX_DIM); for( i = 0; i < (size_t)dims; i++ ) sz[i] = size.p[i]; sz[dims-1] *= esz; diff --git a/modules/core/src/kmeans.cpp b/modules/core/src/kmeans.cpp index c34e254953..b94f354732 100644 --- a/modules/core/src/kmeans.cpp +++ b/modules/core/src/kmeans.cpp @@ -43,6 +43,7 @@ #include "precomp.hpp" #include +#include 
////////////////////////////////////////// kmeans //////////////////////////////////////////// @@ -74,7 +75,7 @@ public: for (int i = begin; i(i), data.ptr(ci), dims), dist[i]); + tdist2[i] = std::min(hal::normL2Sqr_(data.ptr(i), data.ptr(ci), dims), dist[i]); } } @@ -106,7 +107,7 @@ static void generateCentersPP(const Mat& data, Mat& _out_centers, for (int i = 0; i < N; i++) { - dist[i] = normL2Sqr(data.ptr(i), data.ptr(centers[0]), dims); + dist[i] = hal::normL2Sqr_(data.ptr(i), data.ptr(centers[0]), dims); sum0 += dist[i]; } @@ -185,7 +186,7 @@ public: if (onlyDistance) { const float* center = centers.ptr(labels[i]); - distances[i] = normL2Sqr(sample, center, dims); + distances[i] = hal::normL2Sqr_(sample, center, dims); continue; } else @@ -196,7 +197,7 @@ public: for (int k = 0; k < K; k++) { const float* center = centers.ptr(k); - const double dist = normL2Sqr(sample, center, dims); + const double dist = hal::normL2Sqr_(sample, center, dims); if (min_dist > dist) { @@ -379,7 +380,7 @@ double cv::kmeans( InputArray _data, int K, if (labels[i] != max_k) continue; const float* sample = data.ptr(i); - double dist = normL2Sqr(sample, _base_center, dims); + double dist = hal::normL2Sqr_(sample, _base_center, dims); if (max_dist <= dist) { diff --git a/modules/core/src/matrix_c.cpp b/modules/core/src/matrix_c.cpp index a4efd4ccf3..1c3e58857c 100644 --- a/modules/core/src/matrix_c.cpp +++ b/modules/core/src/matrix_c.cpp @@ -4,20 +4,24 @@ // glue -CvMatND::CvMatND(const cv::Mat& m) +CvMatND cvMatND(const cv::Mat& m) { - cvInitMatNDHeader(this, m.dims, m.size, m.type(), m.data ); + CvMatND self; + cvInitMatNDHeader(&self, m.dims, m.size, m.type(), m.data ); int i, d = m.dims; for( i = 0; i < d; i++ ) - dim[i].step = (int)m.step[i]; - type |= m.flags & cv::Mat::CONTINUOUS_FLAG; + self.dim[i].step = (int)m.step[i]; + self.type |= m.flags & cv::Mat::CONTINUOUS_FLAG; + return self; } -_IplImage::_IplImage(const cv::Mat& m) +_IplImage cvIplImage(const cv::Mat& m) { + 
_IplImage self; CV_Assert( m.dims <= 2 ); - cvInitImageHeader(this, m.size(), cvIplDepth(m.flags), m.channels()); - cvSetData(this, m.data, (int)m.step[0]); + cvInitImageHeader(&self, cvSize(m.size()), cvIplDepth(m.flags), m.channels()); + cvSetData(&self, m.data, (int)m.step[0]); + return self; } namespace cv { @@ -222,7 +226,7 @@ CV_IMPL void cvSetIdentity( CvArr* arr, CvScalar value ) CV_IMPL CvScalar cvTrace( const CvArr* arr ) { - return cv::trace(cv::cvarrToMat(arr)); + return cvScalar(cv::trace(cv::cvarrToMat(arr))); } diff --git a/modules/core/src/persistence_cpp.cpp b/modules/core/src/persistence_cpp.cpp index e584a130dd..7e4ae71946 100644 --- a/modules/core/src/persistence_cpp.cpp +++ b/modules/core/src/persistence_cpp.cpp @@ -457,12 +457,12 @@ void write( FileStorage& fs, const String& name, const Mat& value ) { if( value.dims <= 2 ) { - CvMat mat = value; + CvMat mat = cvMat(value); cvWrite( *fs, name.size() ? name.c_str() : 0, &mat ); } else { - CvMatND mat = value; + CvMatND mat = cvMatND(value); cvWrite( *fs, name.size() ? 
name.c_str() : 0, &mat ); } } diff --git a/modules/core/src/persistence_types.cpp b/modules/core/src/persistence_types.cpp index 7ef115b5e3..86a50aa0f3 100644 --- a/modules/core/src/persistence_types.cpp +++ b/modules/core/src/persistence_types.cpp @@ -31,7 +31,7 @@ static void icvWriteMat( CvFileStorage* fs, const char* name, const void* struct { const CvMat* mat = (const CvMat*)struct_ptr; char dt[16]; - CvSize size; + cv::Size size; int y; assert( CV_IS_MAT_HDR_Z(mat) ); @@ -380,7 +380,7 @@ static void icvWriteImage( CvFileStorage* fs, const char* name, const void* stru { const IplImage* image = (const IplImage*)struct_ptr; char dt_buf[16], *dt; - CvSize size; + cv::Size size; int y, depth; assert( CV_IS_IMAGE(image) ); @@ -435,7 +435,7 @@ static void* icvReadImage( CvFileStorage* fs, CvFileNode* node ) CvFileNode* data; CvFileNode* roi_node; CvSeqReader reader; - CvRect roi; + cv::Rect roi; int y, width, height, elem_type, coi, depth; const char* origin, *data_order; @@ -472,7 +472,7 @@ static void* icvReadImage( CvFileStorage* fs, CvFileNode* node ) roi.height = cvReadIntByName( fs, roi_node, "height", 0 ); coi = cvReadIntByName( fs, roi_node, "coi", 0 ); - cvSetImageROI( image, roi ); + cvSetImageROI( image, cvRect(roi) ); cvSetImageCOI( image, coi ); } diff --git a/modules/core/src/stat_c.cpp b/modules/core/src/stat_c.cpp index 504b453fcc..d7355b9f94 100644 --- a/modules/core/src/stat_c.cpp +++ b/modules/core/src/stat_c.cpp @@ -17,7 +17,7 @@ CV_IMPL CvScalar cvSum( const CvArr* srcarr ) sum = cv::Scalar(sum[coi-1]); } } - return sum; + return cvScalar(sum); } CV_IMPL int cvCountNonZero( const CvArr* imgarr ) @@ -43,7 +43,7 @@ cvAvg( const void* imgarr, const void* maskarr ) mean = cv::Scalar(mean[coi-1]); } } - return mean; + return cvScalar(mean); } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index a1409f0979..6666bc4253 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ 
b/modules/core/test/test_intrin_utils.hpp @@ -1123,7 +1123,6 @@ template struct TheTest return *this; } -#if CV_FP16 TheTest & test_loadstore_fp16_f32() { printf("test_loadstore_fp16_f32 ...\n"); @@ -1133,14 +1132,14 @@ template struct TheTest AlignedData data_f32; data_f32.a.clear(); AlignedData out; - R r1 = vx_load_fp16_f32((short*)data.a.d); + R r1 = vx_load_expand((const cv::float16_t*)data.a.d); R r2(r1); EXPECT_EQ(1.0f, r1.get0()); vx_store(data_f32.a.d, r2); EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]); out.a.clear(); - vx_store_fp16((short*)out.a.d, r2); + v_pack_store((cv::float16_t*)out.a.d, r2); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i; @@ -1148,9 +1147,8 @@ template struct TheTest return *this; } -#endif -#if CV_SIMD_FP16 +#if 0 TheTest & test_loadstore_fp16() { printf("test_loadstore_fp16 ...\n"); @@ -1165,7 +1163,7 @@ template struct TheTest // check some initialization methods R r1 = data.u; - R r2 = vx_load_f16(data.a.d); + R r2 = vx_load_expand((const float16_t*)data.a.d); R r3(r2); EXPECT_EQ(data.u[0], r1.get0()); EXPECT_EQ(data.a[0], r2.get0()); diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp index b4e3d10db6..c7473b9ded 100644 --- a/modules/core/test/test_io.cpp +++ b/modules/core/test/test_io.cpp @@ -214,7 +214,7 @@ protected: } CvMat* m = (CvMat*)fs["test_mat"].readObj(); - CvMat _test_mat = test_mat; + CvMat _test_mat = cvMat(test_mat); double max_diff = 0; CvMat stub1, _test_stub1; cvReshape(m, &stub1, 1, 0); @@ -234,7 +234,7 @@ protected: cvReleaseMat(&m); CvMatND* m_nd = (CvMatND*)fs["test_mat_nd"].readObj(); - CvMatND _test_mat_nd = test_mat_nd; + CvMatND _test_mat_nd = cvMatND(test_mat_nd); if( !m_nd || !CV_IS_MATND(m_nd) ) { @@ -263,7 +263,7 @@ protected: MatND mat_nd2; fs["test_mat_nd"] >> mat_nd2; - CvMatND m_nd2 = mat_nd2; + CvMatND m_nd2 = cvMatND(mat_nd2); cvGetMat(&m_nd2, &stub, 0, 1); cvReshape(&stub, &stub1, 1, 0); diff --git a/modules/core/test/test_mat.cpp 
b/modules/core/test/test_mat.cpp index e7ccd95c8c..dab8373d5e 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -415,15 +415,15 @@ TEST(Core_PCA, accuracy) #ifdef CHECK_C // 4. check C PCA & ROW - _points = rPoints; - _testPoints = rTestPoints; - _avg = avg; - _eval = eval; - _evec = evec; + _points = cvMat(rPoints); + _testPoints = cvMat(rTestPoints); + _avg = cvMat(avg); + _eval = cvMat(eval); + _evec = cvMat(evec); prjTestPoints.create(rTestPoints.rows, maxComponents, rTestPoints.type() ); backPrjTestPoints.create(rPoints.size(), rPoints.type() ); - _prjTestPoints = prjTestPoints; - _backPrjTestPoints = backPrjTestPoints; + _prjTestPoints = cvMat(prjTestPoints); + _backPrjTestPoints = cvMat(backPrjTestPoints); cvCalcPCA( &_points, &_avg, &_eval, &_evec, CV_PCA_DATA_AS_ROW ); cvProjectPCA( &_testPoints, &_avg, &_evec, &_prjTestPoints ); @@ -435,13 +435,13 @@ TEST(Core_PCA, accuracy) ASSERT_LE(err, diffBackPrjEps) << "bad accuracy of cvBackProjectPCA() (CV_PCA_DATA_AS_ROW)"; // 5. 
check C PCA & COL - _points = cPoints; - _testPoints = cTestPoints; - avg = avg.t(); _avg = avg; - eval = eval.t(); _eval = eval; - evec = evec.t(); _evec = evec; - prjTestPoints = prjTestPoints.t(); _prjTestPoints = prjTestPoints; - backPrjTestPoints = backPrjTestPoints.t(); _backPrjTestPoints = backPrjTestPoints; + _points = cvMat(cPoints); + _testPoints = cvMat(cTestPoints); + avg = avg.t(); _avg = cvMat(avg); + eval = eval.t(); _eval = cvMat(eval); + evec = evec.t(); _evec = cvMat(evec); + prjTestPoints = prjTestPoints.t(); _prjTestPoints = cvMat(prjTestPoints); + backPrjTestPoints = backPrjTestPoints.t(); _backPrjTestPoints = cvMat(backPrjTestPoints); cvCalcPCA( &_points, &_avg, &_eval, &_evec, CV_PCA_DATA_AS_COL ); cvProjectPCA( &_testPoints, &_avg, &_evec, &_prjTestPoints ); @@ -615,7 +615,7 @@ void Core_ArrayOpTest::run( int /* start_from */) { int sz3[] = {5, 10, 15}; MatND A(3, sz3, CV_32F), B(3, sz3, CV_16SC4); - CvMatND matA = A, matB = B; + CvMatND matA = cvMatND(A), matB = cvMatND(B); RNG rng; rng.fill(A, CV_RAND_UNI, Scalar::all(-10), Scalar::all(10)); rng.fill(B, CV_RAND_UNI, Scalar::all(-10), Scalar::all(10)); @@ -625,8 +625,8 @@ void Core_ArrayOpTest::run( int /* start_from */) Scalar val1(-1000, 30, 3, 8); cvSetRealND(&matA, idx0, val0); cvSetReal3D(&matA, idx1[0], idx1[1], idx1[2], -val0); - cvSetND(&matB, idx0, val1); - cvSet3D(&matB, idx1[0], idx1[1], idx1[2], -val1); + cvSetND(&matB, idx0, cvScalar(val1)); + cvSet3D(&matB, idx1[0], idx1[1], idx1[2], cvScalar(-val1)); Ptr matC(cvCloneMatND(&matB)); if( A.at(idx0[0], idx0[1], idx0[2]) != val0 || diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 68dfc3c969..c763ed80dc 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -526,7 +526,7 @@ void Core_CrossProductTest::get_test_array_types_and_sizes( int, RNG& rng = ts->get_rng(); int depth = cvtest::randInt(rng) % 2 + CV_32F; int cn = cvtest::randInt(rng) & 1 ? 
3 : 1, type = CV_MAKETYPE(depth, cn); - CvSize sz; + Size sz; types[INPUT][0] = types[INPUT][1] = types[OUTPUT][0] = types[REF_OUTPUT][0] = type; @@ -549,7 +549,7 @@ void Core_CrossProductTest::run_func() void Core_CrossProductTest::prepare_to_validation( int ) { - CvScalar a(0), b(0), c(0); + cv::Scalar a, b, c; if( test_mat[INPUT][0].rows > 1 ) { @@ -595,7 +595,7 @@ void Core_CrossProductTest::prepare_to_validation( int ) } else { - cvSet1D( test_array[REF_OUTPUT][0], 0, c ); + cvSet1D( test_array[REF_OUTPUT][0], 0, cvScalar(c) ); } } @@ -896,7 +896,7 @@ double Core_TransformTest::get_success_error_level( int test_case_idx, int i, in void Core_TransformTest::run_func() { - CvMat _m = test_mat[INPUT][1], _shift = test_mat[INPUT][2]; + CvMat _m = cvMat(test_mat[INPUT][1]), _shift = cvMat(test_mat[INPUT][2]); cvTransform( test_array[INPUT][0], test_array[OUTPUT][0], &_m, _shift.data.ptr ? &_shift : 0); } @@ -1010,7 +1010,7 @@ double Core_PerspectiveTransformTest::get_success_error_level( int test_case_idx void Core_PerspectiveTransformTest::run_func() { - CvMat _m = test_mat[INPUT][1]; + CvMat _m = cvMat(test_mat[INPUT][1]); cvPerspectiveTransform( test_array[INPUT][0], test_array[OUTPUT][0], &_m ); } @@ -1117,7 +1117,7 @@ static void cvTsPerspectiveTransform( const CvArr* _src, CvArr* _dst, const CvMa void Core_PerspectiveTransformTest::prepare_to_validation( int ) { - CvMat transmat = test_mat[INPUT][1]; + CvMat transmat = cvMat(test_mat[INPUT][1]); cvTsPerspectiveTransform( test_array[INPUT][0], test_array[REF_OUTPUT][0], &transmat ); } @@ -1287,9 +1287,9 @@ int Core_CovarMatrixTest::prepare_test_case( int test_case_idx ) if( single_matrix ) { if( !are_images ) - *((CvMat*)_hdr_data) = test_mat[INPUT][0]; + *((CvMat*)_hdr_data) = cvMat(test_mat[INPUT][0]); else - *((IplImage*)_hdr_data) = test_mat[INPUT][0]; + *((IplImage*)_hdr_data) = cvIplImage(test_mat[INPUT][0]); temp_hdrs[0] = _hdr_data; } else @@ -1304,9 +1304,9 @@ int 
Core_CovarMatrixTest::prepare_test_case( int test_case_idx ) part = test_mat[INPUT][0].col(i); if( !are_images ) - *((CvMat*)ptr) = part; + *((CvMat*)ptr) = cvMat(part); else - *((IplImage*)ptr) = part; + *((IplImage*)ptr) = cvIplImage(part); temp_hdrs[i] = ptr; } @@ -1539,7 +1539,7 @@ static double cvTsLU( CvMat* a, CvMat* b=NULL, CvMat* x=NULL, int* rank=0 ) void Core_DetTest::prepare_to_validation( int ) { test_mat[INPUT][0].convertTo(test_mat[TEMP][0], test_mat[TEMP][0].type()); - CvMat temp0 = test_mat[TEMP][0]; + CvMat temp0 = cvMat(test_mat[TEMP][0]); test_mat[REF_OUTPUT][0].at(0,0) = cvRealScalar(cvTsLU(&temp0, 0, 0)); } @@ -1676,7 +1676,7 @@ void Core_InvertTest::prepare_to_validation( int ) Mat& temp1 = test_mat[TEMP][1]; Mat& dst0 = test_mat[REF_OUTPUT][0]; Mat& dst = test_mat[OUTPUT][0]; - CvMat _input = input; + CvMat _input = cvMat(input); double ratio = 0, det = cvTsSVDet( &_input, &ratio ); double threshold = (input.depth() == CV_32F ? FLT_EPSILON : DBL_EPSILON)*1000; @@ -1733,7 +1733,7 @@ void Core_SolveTest::get_test_array_types_and_sizes( int test_case_idx, vectorget_rng(); int bits = cvtest::randInt(rng); Base::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - CvSize in_sz = sizes[INPUT][0]; + CvSize in_sz = cvSize(sizes[INPUT][0]); if( in_sz.width > in_sz.height ) in_sz = cvSize(in_sz.height, in_sz.width); Base::get_test_array_types_and_sizes( test_case_idx, sizes, types ); @@ -1813,14 +1813,14 @@ void Core_SolveTest::prepare_to_validation( int ) Mat& temp1 = test_mat[TEMP][1]; cvtest::convert(input, temp1, temp1.type()); dst = Scalar::all(0); - CvMat _temp1 = temp1; + CvMat _temp1 = cvMat(temp1); double det = cvTsLU( &_temp1, 0, 0 ); dst0 = Scalar::all(det != 0); return; } double threshold = (input.type() == CV_32F ? 
FLT_EPSILON : DBL_EPSILON)*1000; - CvMat _input = input; + CvMat _input = cvMat(input); double ratio = 0, det = cvTsSVDet( &_input, &ratio ); if( det < threshold || ratio < threshold ) { @@ -2105,7 +2105,7 @@ void Core_SVBkSbTest::get_test_array_types_and_sizes( int test_case_idx, vector< int bits = cvtest::randInt(rng); Base::get_test_array_types_and_sizes( test_case_idx, sizes, types ); int min_size, i, m, n; - CvSize b_size; + cv::Size b_size; min_size = MIN( sizes[INPUT][0].width, sizes[INPUT][0].height ); @@ -2122,7 +2122,7 @@ void Core_SVBkSbTest::get_test_array_types_and_sizes( int test_case_idx, vector< n = sizes[INPUT][0].width; sizes[INPUT][1] = Size(0,0); - b_size = Size(m,m); + b_size = cvSize(m, m); if( have_b ) { sizes[INPUT][1].height = sizes[INPUT][0].height; @@ -2174,7 +2174,7 @@ int Core_SVBkSbTest::prepare_test_case( int test_case_idx ) cvtest::copy( temp, input ); } - CvMat _input = input; + CvMat _input = cvMat(input); cvSVD( &_input, test_array[TEMP][0], test_array[TEMP][1], test_array[TEMP][2], flags ); } @@ -2210,7 +2210,7 @@ void Core_SVBkSbTest::prepare_to_validation( int ) Size w_size = compact ? Size(min_size,min_size) : Size(m,n); Mat& w = test_mat[TEMP][0]; Mat wdb( w_size.height, w_size.width, CV_64FC1 ); - CvMat _w = w, _wdb = wdb; + CvMat _w = cvMat(w), _wdb = cvMat(wdb); // use exactly the same threshold as in icvSVD... , // so the changes in the library and here should be synchronized. double threshold = cv::sum(w)[0]*(DBL_EPSILON*2);//(is_float ? 
FLT_EPSILON*10 : DBL_EPSILON*2); @@ -3230,6 +3230,22 @@ softdouble naiveExp(softdouble x) } } +static float makeFP32(int sign, int exponent, int significand) +{ + Cv32suf x; + x.u = (unsigned)(((sign & 1) << 31) | ((exponent&255) << 23) | (significand & 0x7fffff)); + return x.f; +} + +static float makeRandomFP32(RNG& rng, int sign, int exprange) +{ + if( sign == -1 ) + sign = rng() % 2; + int exponent = rng() % exprange; + int significand = rng() % (1 << 23); + return makeFP32(sign, exponent, significand); +} + TEST(Core_SoftFloat, exp32) { //special cases @@ -3246,13 +3262,11 @@ TEST(Core_SoftFloat, exp32) inputs.push_back(softfloat::min()); for(int i = 0; i < 50000; i++) { - Cv32suf x; - x.fmt.sign = rng() % 2; - x.fmt.exponent = rng() % (10 + 127); //bigger exponent will produce inf - x.fmt.significand = rng() % (1 << 23); - if(softfloat(x.f) > ln_max) - x.f = rng.uniform(0.0f, (float)ln_max); - inputs.push_back(softfloat(x.f)); + float x = makeRandomFP32(rng, -1, 10+127 //bigger exponent will produce inf + ); + if(softfloat(x) > ln_max) + x = rng.uniform(0.0f, (float)ln_max); + inputs.push_back(softfloat(x)); } for(size_t i = 0; i < inputs.size(); i++) @@ -3323,11 +3337,7 @@ TEST(Core_SoftFloat, log32) EXPECT_TRUE(log(softfloat::nan()).isNaN()); for(int i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 1; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); + softfloat x32(makeRandomFP32(rng, 1, 255)); ASSERT_TRUE(log(x32).isNaN()); } EXPECT_TRUE(log(softfloat::zero()).isInf()); @@ -3340,11 +3350,7 @@ TEST(Core_SoftFloat, log32) inputs.push_back(softfloat::max()); for(int i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - inputs.push_back(softfloat(x.f)); + inputs.push_back(softfloat(makeRandomFP32(rng, 0, 255))); } for(size_t i = 0; i < inputs.size(); i++) @@ -3426,11 +3432,7 @@ TEST(Core_SoftFloat, cbrt32) 
inputs.push_back(softfloat::min()); for(int i = 0; i < 50000; i++) { - Cv32suf x; - x.fmt.sign = rng() % 2; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - inputs.push_back(softfloat(x.f)); + inputs.push_back(softfloat(makeRandomFP32(rng, -1, 255))); } for(size_t i = 0; i < inputs.size(); i++) @@ -3522,11 +3524,8 @@ TEST(Core_SoftFloat, pow32) // inf ** y == inf, if y > 0 for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32 = softfloat(x.f); + float x = makeRandomFP32(rng, 0, 255); + softfloat x32 = softfloat(x); ASSERT_TRUE(pow( inf, x32).isInf()); ASSERT_TRUE(pow(-inf, x32).isInf()); ASSERT_EQ(pow( inf, -x32), zero); @@ -3538,17 +3537,9 @@ TEST(Core_SoftFloat, pow32) // x ** y == nan, if x < 0 and y is not integer for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 1; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); - Cv32suf y; - y.fmt.sign = rng() % 2; - //bigger exponent produces integer numbers only - y.fmt.exponent = rng() % (23 + 127); - y.fmt.significand = rng() % (1 << 23); - softfloat y32(y.f); + softfloat x32(makeRandomFP32(rng, 1, 255)); + softfloat y32(makeRandomFP32(rng, -1, 23+127 //bigger exponent produces integer numbers only + )); int yi = cvRound(y32); if(y32 != softfloat(yi)) ASSERT_TRUE(pow(x32, y32).isNaN()); @@ -3565,11 +3556,7 @@ TEST(Core_SoftFloat, pow32) // 0 ** y == 0, if y > 0 for(size_t i = 0; i < nValues; i++) { - Cv32suf x; - x.fmt.sign = 0; - x.fmt.exponent = rng() % 255; - x.fmt.significand = rng() % (1 << 23); - softfloat x32(x.f); + softfloat x32(makeRandomFP32(rng, 0, 255)); ASSERT_TRUE(pow(zero, -x32).isInf()); if(x32 != one) { diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp index 1bdf516a16..e72400c7fa 100644 --- a/modules/core/test/test_operations.cpp +++ b/modules/core/test/test_operations.cpp 
@@ -970,7 +970,7 @@ bool CV_OperationsTest::operations1() Size sz(10, 20); if (sz.area() != 200) throw test_excep(); if (sz.width != 10 || sz.height != 20) throw test_excep(); - if (((CvSize)sz).width != 10 || ((CvSize)sz).height != 20) throw test_excep(); + if (cvSize(sz).width != 10 || cvSize(sz).height != 20) throw test_excep(); Vec v5d(1, 1, 1, 1, 1); Vec v6d(1, 1, 1, 1, 1, 1); diff --git a/modules/cudalegacy/src/fgd.cpp b/modules/cudalegacy/src/fgd.cpp index 237f1c05fa..7e5728a1c5 100644 --- a/modules/cudalegacy/src/fgd.cpp +++ b/modules/cudalegacy/src/fgd.cpp @@ -373,7 +373,7 @@ namespace // Discard under-size foreground regions: d_foreground.download(h_foreground); - IplImage ipl_foreground = h_foreground; + IplImage ipl_foreground = cvIplImage(h_foreground); CvSeq* first_seq = 0; cvFindContours(&ipl_foreground, storage, &first_seq, sizeof(CvContour), CV_RETR_LIST); diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index c322d47b82..3edd2ad8d0 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -158,8 +158,6 @@ CV__DNN_INLINE_NS_BEGIN }; class CV_EXPORTS ActivationLayer; - class CV_EXPORTS BatchNormLayer; - class CV_EXPORTS ScaleLayer; /** @brief This interface class allows to build new Layers - are building blocks of networks. * @@ -174,20 +172,31 @@ CV__DNN_INLINE_NS_BEGIN CV_PROP_RW std::vector blobs; /** @brief Computes and sets internal parameters according to inputs, outputs and blobs. + * @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead * @param[in] input vector of already allocated input blobs * @param[out] output vector of already allocated output blobs * * If this method is called after network has allocated all memory for input and output blobs * and before inferencing. 
*/ - virtual void finalize(const std::vector &input, std::vector &output); + CV_DEPRECATED virtual void finalize(const std::vector &input, std::vector &output); + + /** @brief Computes and sets internal parameters according to inputs, outputs and blobs. + * @param[in] inputs vector of already allocated input blobs + * @param[out] outputs vector of already allocated output blobs + * + * If this method is called after network has allocated all memory for input and output blobs + * and before inferencing. + */ + CV_WRAP virtual void finalize(InputArrayOfArrays inputs, OutputArrayOfArrays outputs); /** @brief Given the @p input blobs, computes the output @p blobs. + * @deprecated Use Layer::forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) instead * @param[in] input the input blobs. * @param[out] output allocated output blobs, which will store results of the computation. * @param[out] internals allocated internal blobs */ - virtual void forward(std::vector &input, std::vector &output, std::vector &internals) = 0; + CV_DEPRECATED virtual void forward(std::vector &input, std::vector &output, std::vector &internals); /** @brief Given the @p input blobs, computes the output @p blobs. * @param[in] inputs the input blobs. 
@@ -203,15 +212,23 @@ CV__DNN_INLINE_NS_BEGIN */ void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals); - /** @brief @overload */ - CV_WRAP void finalize(const std::vector &inputs, CV_OUT std::vector &outputs); + /** @brief + * @overload + * @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead + */ + CV_DEPRECATED void finalize(const std::vector &inputs, CV_OUT std::vector &outputs); - /** @brief @overload */ - CV_WRAP std::vector finalize(const std::vector &inputs); + /** @brief + * @overload + * @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead + */ + CV_DEPRECATED std::vector finalize(const std::vector &inputs); - /** @brief Allocates layer and computes output. */ - CV_WRAP void run(const std::vector &inputs, CV_OUT std::vector &outputs, - CV_IN_OUT std::vector &internals); + /** @brief Allocates layer and computes output. + * @deprecated This method will be removed in the future release. + */ + CV_DEPRECATED CV_WRAP void run(const std::vector &inputs, CV_OUT std::vector &outputs, + CV_IN_OUT std::vector &internals); /** @brief Returns index of input blob into the input array. * @param inputName label of input blob @@ -381,9 +398,6 @@ CV__DNN_INLINE_NS_BEGIN /** @brief Returns pointers to input layers of specific layer. */ std::vector > getLayerInputs(LayerId layerId); // FIXIT: CV_WRAP - /** @brief Delete layer for the network (not implemented yet) */ - CV_WRAP void deleteLayer(LayerId layer); - /** @brief Connects output of the first layer to input of the second layer. * @param outPin descriptor of the first layer output. * @param inpPin descriptor of the second layer input. 
diff --git a/modules/dnn/misc/python/pyopencv_dnn.hpp b/modules/dnn/misc/python/pyopencv_dnn.hpp index 670d70d59b..650becc736 100644 --- a/modules/dnn/misc/python/pyopencv_dnn.hpp +++ b/modules/dnn/misc/python/pyopencv_dnn.hpp @@ -146,16 +146,16 @@ public: return false; } - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &) CV_OVERRIDE + virtual void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays) CV_OVERRIDE { PyGILState_STATE gstate; gstate = PyGILState_Ensure(); - std::vector inps(inputs.size()); - for (size_t i = 0; i < inputs.size(); ++i) - inps[i] = *inputs[i]; + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - PyObject* args = pyopencv_from(inps); + PyObject* args = pyopencv_from(inputs); PyObject* res = PyObject_CallMethodObjArgs(o, PyString_FromString("forward"), args, NULL); Py_DECREF(args); PyGILState_Release(gstate); @@ -174,11 +174,6 @@ public: } } - virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE - { - CV_Error(Error::StsNotImplemented, ""); - } - private: // Map layers types to python classes. 
static std::map > pyLayers; diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 51cba618da..f8cc11c83e 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -430,19 +430,24 @@ struct DataLayer : public Layer backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1; } - void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), - forward_ocl(inputs, outputs, internals)); + forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs, outputs, internals); - } + if (outputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector outputs, internals; + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); - void forward(std::vector&, std::vector& outputs, std::vector &) CV_OVERRIDE - { // Supported modes: // | Input type | Output type | // | fp32 | fp32 | @@ -567,8 +572,11 @@ struct DataLayer : public Layer return false; } - void finalize(const std::vector&, std::vector& outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector outputs; + outputs_arr.getMatVector(outputs); + CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(), inputsData.size() == outputs.size()); skip = true; @@ -1414,6 +1422,7 @@ struct Net::Impl addInfEngineNetOutputs(ld); net = Ptr(); netBlobsWrappers.clear(); + layer->preferableTarget = DNN_TARGET_CPU; continue; } ld.skip = true; // Initially skip all Inference Engine supported layers. 
@@ -1622,7 +1631,12 @@ struct Net::Impl Ptr layerPtr = ld.getLayerInstance(); { - layerPtr->finalize(ld.inputBlobs, ld.outputBlobs); + std::vector inps(ld.inputBlobs.size()); + for (int i = 0; i < ld.inputBlobs.size(); ++i) + { + inps[i] = *ld.inputBlobs[i]; + } + layerPtr->finalize(inps, ld.outputBlobs); layerPtr->preferableTarget = preferableTarget; #if 0 std::cout << "\toutputs:"; @@ -2138,7 +2152,12 @@ struct Net::Impl ld.inputBlobsWrappers[i]->copyToHost(); } - layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals); + std::vector inps(ld.inputBlobs.size()); + for (int i = 0; i < ld.inputBlobs.size(); ++i) + { + inps[i] = *ld.inputBlobs[i]; + } + layer->forward(inps, ld.outputBlobs, ld.internals); if (DNN_CHECK_NAN_INF) { @@ -2712,11 +2731,6 @@ int Net::getLayerId(const String &layer) return impl->getLayerId(layer); } -void Net::deleteLayer(LayerId) -{ - CV_Error(Error::StsNotImplemented, ""); -} - Ptr Net::getLayer(LayerId layerId) { LayerData &ld = impl->getLayerData(layerId); @@ -3172,10 +3186,7 @@ static void vecToPVec(const std::vector &v, std::vector &pv) void Layer::finalize(const std::vector &inputs, std::vector &outputs) { CV_TRACE_FUNCTION(); - - std::vector inputsp; - vecToPVec(inputs, inputsp); - this->finalize(inputsp, outputs); + this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs); } void Layer::finalize(const std::vector &input, std::vector &output) @@ -3183,6 +3194,18 @@ void Layer::finalize(const std::vector &input, std::vector &output) (void)input;(void)output; } +void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) +{ + CV_TRACE_FUNCTION(); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + std::vector inputsp; + vecToPVec(inputs, inputsp); + this->finalize(inputsp, outputs); +} + std::vector Layer::finalize(const std::vector &inputs) { CV_TRACE_FUNCTION(); @@ -3192,12 +3215,17 @@ std::vector Layer::finalize(const std::vector 
&inputs) return outputs; } -void Layer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) +void Layer::forward(std::vector &input, std::vector &output, std::vector &internals) +{ + // We kept this method for compatibility. DNN calls it now only to support users' implementations. +} + +void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs, outputs, internals); + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); } void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) @@ -3241,7 +3269,6 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays internals_arr.assign(orig_internals); return; } - std::vector inpvec; std::vector outputs; std::vector internals; @@ -3265,10 +3292,8 @@ void Layer::run(const std::vector &inputs, std::vector &outputs, std:: { CV_TRACE_FUNCTION(); - std::vector inputsp; - vecToPVec(inputs, inputsp); - this->finalize(inputsp, outputs); - this->forward(inputsp, outputs, internals); + this->finalize(inputs, outputs); + this->forward(inputs, outputs, internals); } Layer::~Layer() {} diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index c3a54c127d..6dfa222d15 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -234,18 +234,20 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) 
CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); CV_Assert(blobs.size() >= 2); CV_Assert(inputs.size() == 1); - Mat &inpBlob = *inputs[0]; + Mat &inpBlob = inputs[0]; CV_Assert(inpBlob.dims == 2 || inpBlob.dims == 4); int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 4cf3e96bbd..8f8e66d761 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -99,17 +99,19 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); for (int i = 0, n = outputs.size(); i < n; ++i) - if (outputs[i].data != inputs[i]->data) - inputs[i]->copyTo(outputs[i]); + if (outputs[i].data != inputs[i].data) + inputs[i].copyTo(outputs[i]); } virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 145dc526fb..92e5421db9 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -111,12 +111,12 @@ public: class ChannelConcatInvoker : public ParallelLoopBody { public: - std::vector* inputs; + std::vector* inputs; Mat* output; int nstripes; std::vector chptrs; - static void run(std::vector& inputs, Mat& 
output, int nstripes) + static void run(std::vector& inputs, Mat& output, int nstripes) { ChannelConcatInvoker cc; cc.inputs = &inputs; @@ -127,7 +127,7 @@ public: int nchannels = 0, batchsz = output.size[0]; for( i = 0; i < ninputs; i++ ) { - Mat& inp = *inputs[i]; + Mat& inp = inputs[i]; CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) && inp.dims == 4 && inp.size[0] == output.size[0] && inp.size[2] == output.size[2] && @@ -142,7 +142,7 @@ public: int ofs = 0; for( i = 0; i < ninputs; i++) { - Mat& inp = *inputs[i]; + Mat& inp = inputs[i]; for( int j = 0; j < batchsz; j++ ) for( int k = 0; k < inp.size[1]; k++ ) { @@ -241,15 +241,17 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - int cAxis = clamp(axis, inputs[0]->dims); + int cAxis = clamp(axis, inputs[0].dims); Mat& outMat = outputs[0]; if (padding) @@ -267,14 +269,14 @@ public: ranges[cAxis].start = 0; for (size_t i = 0; i < inputs.size(); i++) { - ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis]; + ranges[cAxis].end = ranges[cAxis].start + inputs[i].size[cAxis]; for (int j = 0; j < outMat.dims; ++j) { if (j == cAxis) continue; - ranges[j].start = (outMat.size[j] - inputs[i]->size[j]) / 2; - ranges[j].end = ranges[j].start + inputs[i]->size[j]; + ranges[j].start = (outMat.size[j] - inputs[i].size[j]) / 2; + ranges[j].end = ranges[j].start + inputs[i].size[j]; } - inputs[i]->copyTo(outMat(&ranges[0])); + 
inputs[i].copyTo(outMat(&ranges[0])); ranges[cAxis].start = ranges[cAxis].end; } } diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 54b324538a..40719f3764 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -79,49 +79,24 @@ public: adjustPad.height < stride.height); } - virtual bool supportBackend(int backendId) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { -#ifdef HAVE_INF_ENGINE - if (backendId == DNN_BACKEND_INFERENCE_ENGINE) - { - if (type == "Convolution") - return preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height; - else - { - CV_Assert(type == "Deconvolution"); - const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout - const int group = numOutput / outGroupCn; - if (group != 1) - { -#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R3) - return preferableTarget == DNN_TARGET_CPU; -#endif - return false; - } - if (preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16) - return dilation.width == 1 && dilation.height == 1; - return true; - } - } - else -#endif // HAVE_INF_ENGINE - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE; - } + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE - { CV_Assert(inputs.size() > 0); CV_Assert(blobs.size() >= 1 && blobs.size() <= 2); CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height); - const Mat &input = *inputs[0]; + const Mat &input = inputs[0]; CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S)); for (size_t i = 0; i < inputs.size(); i++) { - CV_Assert(inputs[i]->type() == input.type()); - CV_Assert(inputs[i]->dims == 
4 && inputs[i]->size[1] == input.size[1]); - CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]); + CV_Assert(inputs[i].type() == input.type()); + CV_Assert(inputs[i].dims == 4 && inputs[i].size[1] == input.size[1]); + CV_Assert(inputs[i].size[2] == input.size[2] && inputs[i].size[3] == input.size[3]); } Size outSize = Size(outputs[0].size[3], outputs[0].size[2]); @@ -225,6 +200,14 @@ public: return shape(out.area(), ksize); } + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) + return preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height; + else + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE; + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -262,9 +245,9 @@ public: return false; } - virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { - BaseConvolutionLayerImpl::finalize(inputs, outputs); + BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr); CV_Assert(!blobs.empty()); const int outCn = blobs[0].size[0]; @@ -1007,22 +990,24 @@ public: CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); /*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n", - name.c_str(), 
inputs[0]->size[0], inputs[0]->size[1], inputs[0]->size[2], inputs[0]->size[3], + name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3], kernel.width, kernel.height, pad.width, pad.height, stride.width, stride.height, dilation.width, dilation.height);*/ - CV_Assert_N(inputs.size() == (size_t)1, inputs[0]->size[1] % blobs[0].size[1] == 0, - outputs.size() == 1, inputs[0]->data != outputs[0].data); + CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % blobs[0].size[1] == 0, + outputs.size() == 1, inputs[0].data != outputs[0].data); - int ngroups = inputs[0]->size[1]/blobs[0].size[1]; + int ngroups = inputs[0].size[1]/blobs[0].size[1]; CV_Assert(outputs[0].size[1] % ngroups == 0); int outCn = blobs[0].size[0]; @@ -1049,7 +1034,7 @@ public: int nstripes = std::max(getNumThreads(), 1); - ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope, + ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope, kernel, pad, stride, dilation, activ.get(), ngroups, nstripes); } @@ -1089,6 +1074,29 @@ public: return shape(ksize, inpH * inpW); } + virtual bool supportBackend(int backendId) CV_OVERRIDE + { +#ifdef HAVE_INF_ENGINE + if (backendId == DNN_BACKEND_INFERENCE_ENGINE) + { + const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout + const int group = numOutput / outGroupCn; + if (group != 1) + { +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R3) + return preferableTarget == DNN_TARGET_CPU; +#endif + return false; + } + if (preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16) + return dilation.width == 1 && dilation.height == 1; + return true; + } + else +#endif // HAVE_INF_ENGINE + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE; + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -1141,11 +1149,15 @@ public: return false; } - void finalize(const std::vector &inputs, 
std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { - BaseConvolutionLayerImpl::finalize(inputs, outputs); + BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); getConvPoolPaddings(Size(outputs[0].size[3], outputs[0].size[2]), - Size(inputs[0]->size[3], inputs[0]->size[2]), + Size(inputs[0].size[3], inputs[0].size[2]), kernel, stride, padMode, dilation, pad); } @@ -1494,18 +1506,21 @@ public: CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), - forward_ocl(inputs_arr, outputs_arr, internals_arr)) + forward_ocl(inputs_arr, outputs_arr, internals_arr)); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); int outCn = numOutput; - int inpCn = inputs[0]->size[1]; + int inpCn = inputs[0].size[1]; bool is1x1flag = is1x1(); int nstripes = getNumThreads(); @@ -1520,13 +1535,13 @@ public: int ngroups = outCn / blobs[0].size[1]; int inpGroupCn = inpCn / ngroups; int outGroupCn = blobs[0].size[1]; - const Mat& inp = *inputs[ii]; + const Mat& inp = inputs[ii]; Mat& out = outputs[ii]; int numImg = inp.size[0]; int inpH = inp.size[2], inpW = inp.size[3]; int outH = out.size[2], outW = out.size[3]; - Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn); + Mat convBlob = inputs[ii].reshape(1, numImg*inpCn); Mat decnBlob = out.reshape(1, numImg*outCn); for (int n = 0; n < numImg; n++) 
diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp index b79fb89a8a..4596b4721f 100644 --- a/modules/dnn/src/layers/crop_and_resize_layer.cpp +++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp @@ -40,17 +40,19 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - Mat& inp = *inputs[0]; + Mat& inp = inputs[0]; Mat& out = outputs[0]; - Mat boxes = inputs[1]->reshape(1, inputs[1]->total() / 7); + Mat boxes = inputs[1].reshape(1, inputs[1].total() / 7); const int numChannels = inp.size[1]; const int inpHeight = inp.size[2]; const int inpWidth = inp.size[3]; diff --git a/modules/dnn/src/layers/crop_layer.cpp b/modules/dnn/src/layers/crop_layer.cpp index 3572b88337..f1c41c4036 100644 --- a/modules/dnn/src/layers/crop_layer.cpp +++ b/modules/dnn/src/layers/crop_layer.cpp @@ -90,12 +90,14 @@ public: return false; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector inputs; + inputs_arr.getMatVector(inputs); CV_Assert(2 == inputs.size()); - const Mat &inpBlob = *inputs[0]; - const Mat &inpSzBlob = *inputs[1]; + const Mat &inpBlob = inputs[0]; + const Mat &inpSzBlob = inputs[1]; int dims = inpBlob.dims; int start_axis = clamp(startAxis, dims); @@ -135,18 +137,18 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + 
if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - Mat &input = *inputs[0]; - Mat &output = outputs[0]; - - input(&crop_ranges[0]).copyTo(output); + Mat &input = inputs[0]; + input(&crop_ranges[0]).copyTo(outputs[0]); } virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index bd926e49ce..d94cdc02a5 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -419,27 +419,28 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } - - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); std::vector allDecodedBBoxes; std::vector allConfidenceScores; - int num = inputs[0]->size[0]; + int num = inputs[0].size[0]; // extract predictions from input layers { - int numPriors = inputs[2]->size[2] / 4; + int numPriors = inputs[2].size[2] / 4; - const float* locationData = inputs[0]->ptr(); - const float* confidenceData = inputs[1]->ptr(); - const float* priorData = inputs[2]->ptr(); + const float* locationData = inputs[0].ptr(); + const float* confidenceData = inputs[1].ptr(); + const float* 
priorData = inputs[2].ptr(); // Retrieve all location predictions std::vector allLocationPredictions; @@ -465,9 +466,9 @@ public: else { // Input image sizes; - CV_Assert(inputs[3]->dims == 4); - clipBounds.xmax = inputs[3]->size[3] - 1; - clipBounds.ymax = inputs[3]->size[2] - 1; + CV_Assert(inputs[3].dims == 4); + clipBounds.xmax = inputs[3].size[3] - 1; + clipBounds.ymax = inputs[3].size[2] - 1; } } DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num, @@ -502,6 +503,8 @@ public: allIndices[i], _groupByClasses); } CV_Assert(count == numKept); + // Sync results back due changed output shape. + outputs_arr.assign(outputs); } size_t outputDetections_( diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 74c89e62de..c042f5fc55 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -187,16 +187,19 @@ public: CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget), func.applyOCL(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); for (size_t i = 0; i < inputs.size(); i++) { - const Mat &src = *inputs[i]; + const Mat &src = inputs[i]; Mat &dst = outputs[i]; CV_Assert(src.size == dst.size && src.type() == dst.type() && src.isContinuous() && dst.isContinuous() && src.type() == CV_32F); diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index b1a3493a90..4da92dffa6 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -123,7 +123,7 @@ public: 
class EltwiseInvoker : public ParallelLoopBody { public: - const Mat** srcs; + const Mat* srcs; int nsrcs; Mat* dst; const std::vector* coeffs; @@ -135,7 +135,7 @@ public: EltwiseInvoker() : srcs(0), nsrcs(0), dst(0), coeffs(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {} - static void run(const Mat** srcs, int nsrcs, Mat& dst, + static void run(const Mat* srcs, int nsrcs, Mat& dst, const std::vector& coeffs, EltwiseOp op, const ActivationLayer* activ, int nstripes) { @@ -144,9 +144,9 @@ public: for( int i = 0; i > nsrcs; i++ ) { - CV_Assert(srcs[i]->size == dst.size && - srcs[i]->type() == dst.type() && - srcs[i]->isContinuous()); + CV_Assert(srcs[i].size == dst.size && + srcs[i].type() == dst.type() && + srcs[i].isContinuous()); } EltwiseInvoker p; @@ -200,14 +200,14 @@ public: for( c = 0; c < channels; c++ ) { size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize; - const float* srcptr0 = srcs[0]->ptr() + globalDelta; + const float* srcptr0 = srcs[0].ptr() + globalDelta; float* dstptr = dstptr0 + globalDelta; if( op == PROD ) { for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k]->ptr() + globalDelta; + const float* srcptr1 = srcs[k].ptr() + globalDelta; for( j = 0; j < blockSize; j++ ) { dstptr[j] = srcptr0[j]*srcptr1[j]; @@ -219,7 +219,7 @@ public: { for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k]->ptr() + globalDelta; + const float* srcptr1 = srcs[k].ptr() + globalDelta; for( j = 0; j < blockSize; j++ ) { dstptr[j] = std::max(srcptr0[j], srcptr1[j]); @@ -231,7 +231,7 @@ public: { for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k]->ptr() + globalDelta; + const float* srcptr1 = srcs[k].ptr() + globalDelta; for( j = 0; j < blockSize; j++ ) { dstptr[j] = srcptr0[j] + srcptr1[j]; @@ -244,7 +244,7 @@ public: float c0 = coeffsptr[0]; for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k]->ptr() + globalDelta; + const float* srcptr1 = srcs[k].ptr() + globalDelta; float c1 = coeffsptr[k]; for( j = 0; j < 
blockSize; j++ ) { @@ -358,17 +358,19 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); CV_Assert(outputs.size() == 1); const int nstripes = getNumThreads(); - EltwiseInvoker::run((const Mat**)&inputs[0], (int)inputs.size(), outputs[0], + EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0], coeffs, op, activ.get(), nstripes); } diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 41ec8dfc53..bda9ba46a1 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -139,18 +139,23 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); for (size_t i = 0; i < inputs.size(); i++) { MatShape outShape = shape(outputs[i]); - outputs[i] = inputs[i]->reshape(1, (int)outShape.size(), &outShape[0]); + if (inputs[i].data != outputs[i].data) + { + inputs[i].reshape(1, (int)outShape.size(), &outShape[0]).copyTo(outputs[i]); + } } } diff 
--git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index e40f8c6bdd..1195c57e0a 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -273,7 +273,7 @@ public: }; #ifdef HAVE_OPENCL - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE { innerProductOp.release(); } @@ -393,20 +393,22 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &input, std::vector &output, std::vector &) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector input, output; + inputs_arr.getMatVector(input); + outputs_arr.getMatVector(output); - int axisCan = clamp(axis, input[0]->dims); - int outerSize = input[0]->total(0, axisCan); + int axisCan = clamp(axis, input[0].dims); + int outerSize = input[0].total(0, axisCan); for (size_t i = 0; i < input.size(); i++) { - Mat srcMat = input[i]->reshape(1, outerSize); + Mat srcMat = input[i].reshape(1, outerSize); Mat dstMat = output[i].reshape(1, outerSize); const int nstripes = getNumThreads(); diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index 8d9f28dbb3..eeca613f3c 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -96,7 +96,7 @@ public: } #ifdef HAVE_OPENCL - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE { lrnOp.release(); } @@ -152,21 +152,23 @@ public: 
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); CV_Assert(inputs.size() == outputs.size()); for (int i = 0; i < inputs.size(); i++) { - CV_Assert(inputs[i]->dims == 4); + CV_Assert(inputs[i].dims == 4); - Mat &src = *inputs[i]; + Mat &src = inputs[i]; Mat &dst = outputs[i]; switch (type) diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp index 98cb3595aa..0d9d62c44e 100644 --- a/modules/dnn/src/layers/max_unpooling_layer.cpp +++ b/modules/dnn/src/layers/max_unpooling_layer.cpp @@ -62,17 +62,19 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); CV_Assert(inputs.size() == 2); - Mat& input = *inputs[0]; - Mat& indices = *inputs[1]; + Mat& input = inputs[0]; + Mat& indices = inputs[1]; CV_Assert(input.total() == indices.total()); CV_Assert(input.size[0] == 1); diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 6a2c6f1dd9..2a369c71fa 100644 --- 
a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -96,13 +96,15 @@ public: return fuse_relu; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector inputs; + inputs_arr.getMatVector(inputs); int splitDim = (acrossChannels) ? 1 : 2; int i, newRows = 1; for( i = 0; i < splitDim; i++ ) - newRows *= inputs[0]->size[i]; - zeroDev = inputs[0]->total() == newRows; + newRows *= inputs[0].size[i]; + zeroDev = inputs[0].total() == newRows; } virtual bool supportBackend(int backendId) CV_OVERRIDE @@ -271,17 +273,20 @@ public: CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) { - Mat &inpBlob = *inputs[inpIdx]; + Mat &inpBlob = inputs[inpIdx]; Mat &outBlob = outputs[inpIdx]; int splitDim = (acrossChannels) ? 
1 : 2; diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index fbb29292c2..694d3d1039 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -89,12 +89,14 @@ public: return true; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector inputs; + inputs_arr.getMatVector(inputs); CV_Assert(inputs.size() == 1); - endAxis = endAxis == -1 ? (inputs[0]->dims - 1) : endAxis; - startAxis = startAxis == -1 ? (inputs[0]->dims - 1) : startAxis; - acrossSpatial = (startAxis == 1 && endAxis == inputs[0]->dims - 1); + endAxis = endAxis == -1 ? (inputs[0].dims - 1) : endAxis; + startAxis = startAxis == -1 ? (inputs[0].dims - 1) : startAxis; + acrossSpatial = (startAxis == 1 && endAxis == inputs[0].dims - 1); } #ifdef HAVE_OPENCL @@ -186,18 +188,21 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); CV_Assert(inputs.size() == 1 && outputs.size() == 1); - CV_Assert(inputs[0]->total() == outputs[0].total()); + CV_Assert(inputs[0].total() == outputs[0].total()); - const Mat& inp0 = *inputs[0]; + const Mat& inp0 = inputs[0]; Mat& buffer = internals[0]; startAxis = clamp(startAxis, inp0.dims); endAxis = clamp(endAxis, inp0.dims); diff --git 
a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp index af58c78f55..7aa12d7748 100644 --- a/modules/dnn/src/layers/padding_layer.cpp +++ b/modules/dnn/src/layers/padding_layer.cpp @@ -61,14 +61,17 @@ public: return false; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector inputs; + inputs_arr.getMatVector(inputs); + // Compute dstRanges. - const MatSize& inpShape = inputs[0]->size; + const MatSize& inpShape = inputs[0].size; dstRanges.resize(paddings.size()); int offset = 0; - if (inputDims != -1 && inputs[0]->dims != inputDims) + if (inputDims != -1 && inputs[0].dims != inputDims) { dstRanges.insert(dstRanges.begin(), Range::all()); offset = 1; @@ -81,7 +84,7 @@ public: } // Add the rest of dimensions. - for (int i = dstRanges.size(); i < inputs[0]->dims; ++i) + for (int i = dstRanges.size(); i < inputs[0].dims; ++i) dstRanges.push_back(Range::all()); } @@ -96,31 +99,33 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); if (paddingType == "constant") { outputs[0].setTo(paddingValue); - inputs[0]->copyTo(outputs[0](dstRanges)); + inputs[0].copyTo(outputs[0](dstRanges)); } else if (paddingType == "reflect") { CV_Assert(inputs.size() == 1); CV_Assert(outputs.size() == 1); - CV_Assert(inputs[0]->dims == 4); + CV_Assert(inputs[0].dims == 4); CV_Assert(outputs[0].dims == 4); - if (inputs[0]->size[0] != outputs[0].size[0] || 
inputs[0]->size[1] != outputs[0].size[1]) + if (inputs[0].size[0] != outputs[0].size[0] || inputs[0].size[1] != outputs[0].size[1]) CV_Error(Error::StsNotImplemented, "Only spatial reflection padding is supported."); - const int inpHeight = inputs[0]->size[2]; - const int inpWidth = inputs[0]->size[3]; + const int inpHeight = inputs[0].size[2]; + const int inpWidth = inputs[0].size[3]; const int outHeight = outputs[0].size[2]; const int outWidth = outputs[0].size[3]; const int padTop = dstRanges[2].start; @@ -130,11 +135,11 @@ public: CV_CheckLT(padTop, inpHeight, ""); CV_CheckLT(padBottom, inpHeight, ""); CV_CheckLT(padLeft, inpWidth, ""); CV_CheckLT(padRight, inpWidth, ""); - for (size_t n = 0; n < inputs[0]->size[0]; ++n) + for (size_t n = 0; n < inputs[0].size[0]; ++n) { - for (size_t ch = 0; ch < inputs[0]->size[1]; ++ch) + for (size_t ch = 0; ch < inputs[0].size[1]; ++ch) { - copyMakeBorder(getPlane(*inputs[0], n, ch), + copyMakeBorder(getPlane(inputs[0], n, ch), getPlane(outputs[0], n, ch), padTop, padBottom, padLeft, padRight, BORDER_REFLECT_101); diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 65d78517e6..a8fe9dd861 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -172,18 +172,21 @@ public: _count = _oldStride[0] * shapeBefore[0]; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { if(!_needsPermute) { return; } + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); CV_Assert(inputs.size() > 0); - const Mat& inp0 = *inputs[0]; + const Mat& inp0 = inputs[0]; CV_Assert((int)_numAxes == inp0.dims); - computeStrides(shape(*inputs[0]), shape(outputs[0])); + computeStrides(shape(inputs[0]), shape(outputs[0])); #ifdef HAVE_OPENCL if (uorder.empty()) @@ -319,22 +322,24 @@ public: 
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); size_t k, ninputs = inputs.size(); if(!_needsPermute) { for (k = 0; k < ninputs; k++) { - CV_Assert(outputs[k].total() == inputs[k]->total()); - if (outputs[k].data != inputs[k]->data) - inputs[k]->copyTo(outputs[k]); + CV_Assert(outputs[k].total() == inputs[k].total()); + if (outputs[k].data != inputs[k].data) + inputs[k].copyTo(outputs[k]); } } else @@ -346,10 +351,10 @@ public: for (k = 0; k < ninputs; k++) { - const Mat& inp = *inputs[k]; + const Mat& inp = inputs[k]; Mat& out = outputs[k]; - CV_Assert(inp.dims == numAxes && inp.size == inputs[0]->size); + CV_Assert(inp.dims == numAxes && inp.size == inputs[0].size); CV_Assert(out.dims == numAxes && out.size == outputs[0].size); CV_Assert(inp.isContinuous() && out.isContinuous()); diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 200751d9b2..40bcb5e672 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -114,11 +114,15 @@ public: Ptr > poolOp; #endif - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + CV_Assert(!inputs.empty()); - cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]), + cv::Size inp(inputs[0].size[3], 
inputs[0].size[2]), out(outputs[0].size[3], outputs[0].size[2]); if(globalPooling) @@ -204,28 +208,29 @@ public: CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } - - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); switch (type) { case MAX: CV_Assert_N(inputs.size() == 1, outputs.size() == 2); - maxPooling(*inputs[0], outputs[0], outputs[1]); + maxPooling(inputs[0], outputs[0], outputs[1]); break; case AVE: CV_Assert_N(inputs.size() == 1, outputs.size() == 1); - avePooling(*inputs[0], outputs[0]); + avePooling(inputs[0], outputs[0]); break; case ROI: case PSROI: CV_Assert_N(inputs.size() == 2, outputs.size() == 1); - roiPooling(*inputs[0], *inputs[1], outputs[0]); + roiPooling(inputs[0], inputs[1], outputs[0]); break; default: CV_Error(Error::StsNotImplemented, "Not implemented"); diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index d4ffbbaa97..28755120d9 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -297,15 +297,18 @@ public: return false; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { - CV_CheckGT(inputs.size(), (size_t)1, ""); - CV_CheckEQ(inputs[0]->dims, 4, ""); CV_CheckEQ(inputs[1]->dims, 4, ""); - int layerWidth = inputs[0]->size[3]; - int layerHeight = inputs[0]->size[2]; + std::vector inputs; + inputs_arr.getMatVector(inputs); - int imageWidth = inputs[1]->size[3]; - int 
imageHeight = inputs[1]->size[2]; + CV_CheckGT(inputs.size(), (size_t)1, ""); + CV_CheckEQ(inputs[0].dims, 4, ""); CV_CheckEQ(inputs[1].dims, 4, ""); + int layerWidth = inputs[0].size[3]; + int layerHeight = inputs[0].size[2]; + + int imageWidth = inputs[1].size[3]; + int imageHeight = inputs[1].size[2]; _stepY = _stepY == 0 ? (static_cast(imageHeight) / layerHeight) : _stepY; _stepX = _stepX == 0 ? (static_cast(imageWidth) / layerWidth) : _stepX; @@ -403,21 +406,23 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); CV_Assert(inputs.size() == 2); - int _layerWidth = inputs[0]->size[3]; - int _layerHeight = inputs[0]->size[2]; + int _layerWidth = inputs[0].size[3]; + int _layerHeight = inputs[0].size[2]; - int _imageWidth = inputs[1]->size[3]; - int _imageHeight = inputs[1]->size[2]; + int _imageWidth = inputs[1].size[3]; + int _imageHeight = inputs[1].size[2]; float* outputPtr = outputs[0].ptr(); float _boxWidth, _boxHeight; diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp index cdc5e2250a..ad9ea9a97e 100644 --- a/modules/dnn/src/layers/proposal_layer.cpp +++ b/modules/dnn/src/layers/proposal_layer.cpp @@ -137,24 +137,27 @@ public: return false; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { - std::vector layerInputs; + std::vector inputs; + inputs_arr.getMatVector(inputs); + + 
std::vector layerInputs; std::vector layerOutputs; // Scores permute layer. - Mat scores = getObjectScores(*inputs[0]); - layerInputs.assign(1, &scores); + Mat scores = getObjectScores(inputs[0]); + layerInputs.assign(1, scores); layerOutputs.assign(1, Mat(shape(scores.size[0], scores.size[2], scores.size[3], scores.size[1]), CV_32FC1)); scoresPermute->finalize(layerInputs, layerOutputs); // BBox predictions permute layer. - Mat* bboxDeltas = inputs[1]; - CV_Assert(bboxDeltas->dims == 4); + const Mat& bboxDeltas = inputs[1]; + CV_Assert(bboxDeltas.dims == 4); layerInputs.assign(1, bboxDeltas); - layerOutputs.assign(1, Mat(shape(bboxDeltas->size[0], bboxDeltas->size[2], - bboxDeltas->size[3], bboxDeltas->size[1]), CV_32FC1)); + layerOutputs.assign(1, Mat(shape(bboxDeltas.size[0], bboxDeltas.size[2], + bboxDeltas.size[3], bboxDeltas.size[1]), CV_32FC1)); deltasPermute->finalize(layerInputs, layerOutputs); } @@ -251,19 +254,22 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); CV_Assert(inputs.size() == 3); CV_Assert(internals.size() == 3); - const Mat& scores = *inputs[0]; - const Mat& bboxDeltas = *inputs[1]; - const Mat& imInfo = *inputs[2]; + const Mat& scores = inputs[0]; + const Mat& bboxDeltas = inputs[1]; + const Mat& imInfo = inputs[2]; Mat& priorBoxes = internals[0]; Mat& permuttedScores = internals[1]; Mat& permuttedDeltas = internals[2]; diff --git 
a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 15f791a0b0..6a6cf0ce81 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -216,11 +216,14 @@ public: return false; } - void finalize(const std::vector &input, std::vector &output) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector input; + inputs_arr.getMatVector(input); + CV_Assert(!usePeephole && blobs.size() == 3 || usePeephole && blobs.size() == 6); CV_Assert(input.size() == 1); - const Mat& inp0 = *input[0]; + const Mat& inp0 = input[0]; Mat &Wh = blobs[0], &Wx = blobs[1]; int numOut = Wh.size[1]; @@ -256,13 +259,16 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &input, std::vector &output, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector input, output, internals; + inputs_arr.getMatVector(input); + outputs_arr.getMatVector(output); + internals_arr.getMatVector(internals); const Mat &Wh = blobs[0]; const Mat &Wx = blobs[1]; @@ -277,7 +283,7 @@ public: dummyOnes.setTo(1.); int numSamplesTotal = numTimeStamps*numSamples; - Mat xTs = input[0]->reshape(1, numSamplesTotal); + Mat xTs = input[0].reshape(1, numSamplesTotal); Mat hOutTs = output[0].reshape(1, numSamplesTotal); Mat cOutTs = produceCellOutput ? 
output[1].reshape(1, numSamplesTotal) : Mat(); @@ -432,8 +438,11 @@ public: return false; } - void finalize(const std::vector &input, std::vector &output) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector input, outputs; + inputs_arr.getMatVector(input); + CV_Assert(input.size() >= 1 && input.size() <= 2); Wxh = blobs[0]; @@ -446,7 +455,7 @@ public: numX = Wxh.cols; numO = Who.rows; - const Mat& inp0 = *input[0]; + const Mat& inp0 = input[0]; CV_Assert(inp0.dims >= 2); CV_Assert(inp0.total(2) == numX); @@ -477,15 +486,18 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &input, std::vector &output, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector input, output, internals; + inputs_arr.getMatVector(input); + outputs_arr.getMatVector(output); + internals_arr.getMatVector(internals); - Mat xTs = input[0]->reshape(1, numSamplesTotal); + Mat xTs = input[0].reshape(1, numSamplesTotal); Mat oTs = output[0].reshape(1, numSamplesTotal); Mat hTs = produceH ? 
output[1].reshape(1, numSamplesTotal) : Mat(); Mat hCurr = internals[0]; diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp index 50e68b2fa5..2284430789 100644 --- a/modules/dnn/src/layers/region_layer.cpp +++ b/modules/dnn/src/layers/region_layer.cpp @@ -190,13 +190,16 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); CV_Assert(inputs.size() >= 1); CV_Assert(outputs.size() == 1); @@ -206,14 +209,14 @@ public: for (size_t ii = 0; ii < outputs.size(); ii++) { - Mat &inpBlob = *inputs[ii]; + Mat &inpBlob = inputs[ii]; Mat &outBlob = outputs[ii]; int rows = inpBlob.size[1]; int cols = inpBlob.size[2]; - CV_Assert(inputs.size() < 2 || inputs[1]->dims == 4); - int hNorm = inputs.size() > 1 ? inputs[1]->size[2] : rows; - int wNorm = inputs.size() > 1 ? inputs[1]->size[3] : cols; + CV_Assert(inputs.size() < 2 || inputs[1].dims == 4); + int hNorm = inputs.size() > 1 ? inputs[1].size[2] : rows; + int wNorm = inputs.size() > 1 ? 
inputs[1].size[3] : cols; const float *srcData = inpBlob.ptr(); float *dstData = outBlob.ptr(); diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index 89b6f1d85b..790428ddc9 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -139,17 +139,19 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); for (size_t i = 0; i < inputs.size(); i++) { - Mat srcBlob = *inputs[i]; + Mat srcBlob = inputs[i]; MatShape inputShape = shape(srcBlob), outShape = shape(outputs[i]); float *dstData = outputs[0].ptr(); const float *srcData = srcBlob.ptr(); diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 69814c0839..d56507e0f6 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -237,17 +237,18 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } - - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); for 
(size_t i = 0; i < outputs.size(); i++) { - Mat srcBlob = *inputs[i]; + Mat srcBlob = inputs[i]; if (outputs[i].data != srcBlob.data) srcBlob.reshape(1, shape(outputs[i])).copyTo(outputs[i]); } diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index dab62f12f7..c090ad82ff 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -57,22 +57,26 @@ public: return backendId == DNN_BACKEND_OPENCV; } - virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + if (!outWidth && !outHeight) { outHeight = outputs[0].size[2]; outWidth = outputs[0].size[3]; } if (alignCorners && outHeight > 1) - scaleHeight = static_cast(inputs[0]->size[2] - 1) / (outHeight - 1); + scaleHeight = static_cast(inputs[0].size[2] - 1) / (outHeight - 1); else - scaleHeight = static_cast(inputs[0]->size[2]) / outHeight; + scaleHeight = static_cast(inputs[0].size[2]) / outHeight; if (alignCorners && outWidth > 1) - scaleWidth = static_cast(inputs[0]->size[3] - 1) / (outWidth - 1); + scaleWidth = static_cast(inputs[0].size[3] - 1) / (outWidth - 1); else - scaleWidth = static_cast(inputs[0]->size[3]) / outWidth; + scaleWidth = static_cast(inputs[0].size[3]) / outWidth; } void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE @@ -80,24 +84,27 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); 
- CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); - if (outHeight == inputs[0]->size[2] && outWidth == inputs[0]->size[3]) + if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3]) return; - Mat& inp = *inputs[0]; + Mat& inp = inputs[0]; Mat& out = outputs[0]; if (interpolation == "nearest") { - for (size_t n = 0; n < inputs[0]->size[0]; ++n) + for (size_t n = 0; n < inputs[0].size[0]; ++n) { - for (size_t ch = 0; ch < inputs[0]->size[1]; ++ch) + for (size_t ch = 0; ch < inputs[0].size[1]; ++ch) { resize(getPlane(inp, n, ch), getPlane(out, n, ch), Size(outWidth, outHeight), 0, 0, INTER_NEAREST); @@ -203,15 +210,19 @@ public: return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE; } - virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + if (!outWidth && !outHeight) { outHeight = outputs[0].size[2]; outWidth = outputs[0].size[3]; } - int inpHeight = inputs[0]->size[2]; - int inpWidth = inputs[0]->size[3]; + int inpHeight = inputs[0].size[2]; + int inpWidth = inputs[0].size[3]; scaleHeight = (outHeight > 1) ? (static_cast(inpHeight - 1) / (outHeight - 1)) : 0.f; scaleWidth = (outWidth > 1) ? 
(static_cast(inpWidth - 1) / (outWidth - 1)) : 0.f; } diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 9ab005ce20..9c74bb0f85 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -40,8 +40,10 @@ public: return true; } - virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + std::vector inputs; + inputs_arr.getMatVector(inputs); hasWeights = blobs.size() == 2 || (blobs.size() == 1 && !hasBias); CV_Assert(inputs.size() == 2 && blobs.empty() || blobs.size() == (int)hasWeights + (int)hasBias); } @@ -57,20 +59,23 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_Assert_N(outputs.size() == 1, !blobs.empty() || inputs.size() == 2); - Mat &inpBlob = *inputs[0]; + Mat &inpBlob = inputs[0]; Mat &outBlob = outputs[0]; // There is a mode when we multiply a first blob by a second one // instead of trainable weights. - Mat weights = blobs.empty() ? *inputs[1] : (hasWeights ? blobs[0] : Mat()); + Mat weights = blobs.empty() ? inputs[1] : (hasWeights ? blobs[0] : Mat()); Mat bias = hasBias ? 
blobs.back().reshape(1, 1) : Mat(); if (!weights.empty()) weights = weights.reshape(1, 1); diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp index 19c6cfc88e..67fb489f84 100644 --- a/modules/dnn/src/layers/shuffle_channel_layer.cpp +++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp @@ -28,17 +28,21 @@ public: return group == 1; } - virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { if (group != 1) { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + LayerParams lp; float order[] = {0, 2, 1, 3}; lp.set("order", DictValue::arrayInt(&order[0], 4)); permute = PermuteLayer::create(lp); - Mat inp = *inputs[0]; - Mat out = outputs[0]; + const Mat& inp = inputs[0]; + const Mat& out = outputs[0]; permuteInpShape.resize(4); permuteInpShape[0] = inp.size[0]; @@ -52,11 +56,8 @@ public: permuteOutShape[2] = permuteInpShape[1]; permuteOutShape[3] = permuteInpShape[3]; - inp = inp.reshape(1, permuteInpShape); - out = out.reshape(1, permuteOutShape); - - std::vector permuteInputs(1, &inp); - std::vector permuteOutputs(1, out); + std::vector permuteInputs(1, inp.reshape(1, permuteInpShape)); + std::vector permuteOutputs(1, out.reshape(1, permuteOutShape)); permute->finalize(permuteInputs, permuteOutputs); } } @@ -66,15 +67,18 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + 
outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); - Mat inp = *inputs[0]; + Mat inp = inputs[0]; Mat out = outputs[0]; if (inp.data != out.data) { @@ -82,7 +86,7 @@ public: { inp = inp.reshape(1, permuteInpShape); out = out.reshape(1, permuteOutShape); - std::vector permuteInputs(1, &inp); + std::vector permuteInputs(1, inp); std::vector permuteOutputs(1, out); permute->forward(permuteInputs, permuteOutputs, internals); } diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 2b0685826f..e24842f9de 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -144,10 +144,14 @@ public: return false; } - void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + CV_Assert(inputs.size() == 1); - const MatSize& inpShape = inputs[0]->size; + const MatSize& inpShape = inputs[0].size; if (sliceRanges.empty()) { @@ -239,15 +243,17 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); - const Mat& inpMat = *inputs[0]; + const Mat& inpMat = inputs[0]; CV_Assert(outputs.size() == sliceRanges.size()); for (size_t i = 0; i < outputs.size(); i++) { diff --git a/modules/dnn/src/layers/softmax_layer.cpp 
b/modules/dnn/src/layers/softmax_layer.cpp index eefd321bb3..1eb5f0da2c 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -191,15 +191,18 @@ public: OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); - const Mat &src = *inputs[0]; + const Mat &src = inputs[0]; Mat &dst = outputs[0]; int axis = clamp(axisRaw, src.dims); diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp index f3ba67450b..2fe5df1509 100644 --- a/modules/dnn/src/layers/split_layer.cpp +++ b/modules/dnn/src/layers/split_layer.cpp @@ -83,18 +83,19 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); - } - - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + if (inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); for (size_t i = 0; i < outputs.size(); i++) { - CV_Assert(inputs[0]->total() == outputs[i].total()); - inputs[0]->copyTo(outputs[i]); + CV_Assert(inputs[0].total() == outputs[i].total()); + inputs[0].copyTo(outputs[i]); } } }; diff --git a/modules/dnn/src/op_inf_engine.cpp 
b/modules/dnn/src/op_inf_engine.cpp index 43d8d1eb2d..85f6690790 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -551,12 +551,6 @@ bool InfEngineBackendLayer::supportBackend(int backendId) backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine(); } -void InfEngineBackendLayer::forward(std::vector &input, std::vector &output, - std::vector &internals) -{ - CV_Error(Error::StsError, "Choose Inference Engine as a preferable backend."); -} - void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) { diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index f49a8e0445..f04a5c636c 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -196,9 +196,6 @@ public: std::vector &outputs, std::vector &internals) const CV_OVERRIDE; - virtual void forward(std::vector &input, std::vector &output, - std::vector &internals) CV_OVERRIDE; - virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE; diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 33f2fa1f22..85ff7ace21 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -391,7 +391,7 @@ TEST_P(Test_Caffe_nets, Colorization) Mat out = net.forward(); // Reference output values are in range [-29.1, 69.5] - const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.21 : 4e-4; + const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.25 : 4e-4; const double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
5.3 : 3e-3; normAssert(out, ref, "", l1, lInf); } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 28f1167dc5..14c6f55f40 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -61,16 +61,13 @@ static String _tf(TString filename) void runLayer(Ptr layer, std::vector &inpBlobs, std::vector &outBlobs) { size_t ninputs = inpBlobs.size(); - std::vector inp_(ninputs); - std::vector inp(ninputs); - std::vector outp, intp; + std::vector inp(ninputs), outp, intp; std::vector inputs, outputs, internals; for (size_t i = 0; i < ninputs; i++) { - inp_[i] = inpBlobs[i].clone(); - inp[i] = &inp_[i]; - inputs.push_back(shape(inp_[i])); + inp[i] = inpBlobs[i].clone(); + inputs.push_back(shape(inp[i])); } layer->getMemoryShapes(inputs, 0, outputs, internals); @@ -1052,8 +1049,6 @@ public: return backendId == DNN_BACKEND_OPENCV; } - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE {} - virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE {} }; @@ -1151,8 +1146,11 @@ public: return false; } - virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector outputs; + outputs_arr.getMatVector(outputs); + if (!outWidth && !outHeight) { outHeight = outputs[0].size[2]; @@ -1161,9 +1159,22 @@ public: } // Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector& internals) CV_OVERRIDE + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { - Mat& inp = *inputs[0]; + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + if 
(inputs_arr.depth() == CV_16S) + { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + Mat& inp = inputs[0]; Mat& out = outputs[0]; const float* inpData = (float*)inp.data; float* outData = (float*)out.data; diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index ae7c7d00a7..acf8dae427 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -6,7 +6,8 @@ // Third party copyrights are property of their respective owners. #include "test_precomp.hpp" - +#include +#include #include // CV_DNN_REGISTER_LAYER_CLASS namespace opencv_test { namespace { @@ -87,9 +88,13 @@ public: return Ptr(new FirstCustomLayer(params)); } - virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE {} - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector& internals) CV_OVERRIDE + void forward(InputArrayOfArrays, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector outputs; + outputs_arr.getMatVector(outputs); outputs[0].setTo(1); } }; @@ -104,9 +109,13 @@ public: return Ptr(new SecondCustomLayer(params)); } - virtual void forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE {} - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector& internals) CV_OVERRIDE + void forward(InputArrayOfArrays, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays) CV_OVERRIDE { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector outputs; + outputs_arr.getMatVector(outputs); outputs[0].setTo(2); } }; @@ -178,4 +187,125 @@ INSTANTIATE_TEST_CASE_P(/**/, setInput, Combine( dnnBackendsAndTargets() )); +class CustomLayerWithDeprecatedForward CV_FINAL : public Layer +{ +public: + 
CustomLayerWithDeprecatedForward(const LayerParams ¶ms) : Layer(params) {} + + static Ptr create(LayerParams& params) + { + return Ptr(new CustomLayerWithDeprecatedForward(params)); + } + + virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + { + CV_Assert_N(inputs[0]->depth() == CV_32F, outputs[0].depth() == CV_32F); + cv::add(*inputs[0], 0.5f, outputs[0]); + } +}; + +class CustomLayerWithDeprecatedForwardAndFallback CV_FINAL : public Layer +{ +public: + CustomLayerWithDeprecatedForwardAndFallback(const LayerParams ¶ms) : Layer(params) {} + + static Ptr create(LayerParams& params) + { + return Ptr(new CustomLayerWithDeprecatedForwardAndFallback(params)); + } + + void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + CV_OCL_RUN(preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16, + forward_ocl(inputs, outputs, internals)); + + Layer::forward_fallback(inputs, outputs, internals); + } + + virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + { + CV_Assert_N(inputs[0]->depth() == CV_32F, outputs[0].depth() == CV_32F); + cv::add(*inputs[0], 0.5f, outputs[0]); + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) + { + if (inputs_arr.depth() != CV_32F) + return false; + + std::vector inputs; + std::vector outputs; + inputs_arr.getUMatVector(inputs); + outputs_arr.getUMatVector(outputs); + cv::add(inputs[0], 0.5f, outputs[0]); + return true; + } +#endif +}; + +typedef testing::TestWithParam > DeprecatedForward; +TEST_P(DeprecatedForward, CustomLayer) +{ + const int backend = get<0>(GetParam()); + const int target = get<1>(GetParam()); + + Mat inp(5, 5, CV_32FC1); + randu(inp, -1.0f, 1.0f); + inp = blobFromImage(inp); + + 
CV_DNN_REGISTER_LAYER_CLASS(CustomType, CustomLayerWithDeprecatedForward); + try + { + LayerParams lp; + Net net; + net.addLayerToPrev("testLayer", "CustomType", lp); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + net.setInput(inp); + Mat out = net.forward(); + normAssert(out, inp + 0.5f, "", 2e-4, 7e-4); + } + catch (...) + { + LayerFactory::unregisterLayer("CustomType"); + throw; + } + LayerFactory::unregisterLayer("CustomType"); +} + +TEST_P(DeprecatedForward, CustomLayerWithFallback) +{ + const int backend = get<0>(GetParam()); + const int target = get<1>(GetParam()); + + Mat inp(5, 5, CV_32FC1); + randu(inp, -1.0f, 1.0f); + inp = blobFromImage(inp); + + CV_DNN_REGISTER_LAYER_CLASS(CustomType, CustomLayerWithDeprecatedForwardAndFallback); + try + { + LayerParams lp; + Net net; + net.addLayerToPrev("testLayer", "CustomType", lp); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + net.setInput(inp); + Mat out = net.forward(); + normAssert(out, inp + 0.5f, "", 2e-4, 7e-4); + } + catch (...) + { + LayerFactory::unregisterLayer("CustomType"); + throw; + } + LayerFactory::unregisterLayer("CustomType"); +} + +INSTANTIATE_TEST_CASE_P(/**/, DeprecatedForward, dnnBackendsAndTargets()); + }} // namespace diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index 88742c68cc..bd5f11249d 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -313,14 +313,14 @@ TEST_P(Test_Torch_nets, ENet_accuracy) // Due to numerical instability in Pooling-Unpooling layers (indexes jittering) // thresholds for ENet must be changed. Accuracy of results was checked on // Cityscapes dataset and difference in mIOU with Torch is 10E-4% - normAssert(ref, out, "", 0.00044, target == DNN_TARGET_CPU ? 0.453 : 0.44); + normAssert(ref, out, "", 0.00044, /*target == DNN_TARGET_CPU ? 
0.453 : */0.5); const int N = 3; for (int i = 0; i < N; i++) { net.setInput(inputBlob, ""); Mat out = net.forward(); - normAssert(ref, out, "", 0.00044, target == DNN_TARGET_CPU ? 0.453 : 0.44); + normAssert(ref, out, "", 0.00044, /*target == DNN_TARGET_CPU ? 0.453 : */0.5); } } @@ -411,15 +411,22 @@ public: return false; } - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays) CV_OVERRIDE { - Mat& inp = *inputs[0]; + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + Mat& inp = inputs[0]; Mat& out = outputs[0]; const int outHeight = out.size[2]; const int outWidth = out.size[3]; - for (size_t n = 0; n < inputs[0]->size[0]; ++n) + for (size_t n = 0; n < inp.size[0]; ++n) { - for (size_t ch = 0; ch < inputs[0]->size[1]; ++ch) + for (size_t ch = 0; ch < inp.size[1]; ++ch) { resize(getPlane(inp, n, ch), getPlane(out, n, ch), Size(outWidth, outHeight), 0, 0, INTER_NEAREST); diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp index 7bc9d432c3..4f7c2f7686 100644 --- a/modules/highgui/src/window.cpp +++ b/modules/highgui/src/window.cpp @@ -356,7 +356,7 @@ void cv::imshow( const String& winname, InputArray _img ) CV_Assert(size.width>0 && size.height>0); { Mat img = _img.getMat(); - CvMat c_img = img; + CvMat c_img = cvMat(img); cvShowImage(winname.c_str(), &c_img); } #else diff --git a/modules/highgui/src/window_gtk.cpp b/modules/highgui/src/window_gtk.cpp index 5ab7de86ab..bb6397faee 100644 --- a/modules/highgui/src/window_gtk.cpp +++ b/modules/highgui/src/window_gtk.cpp @@ -1755,8 +1755,8 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da { // TODO move this logic to CvImageWidget CvWindow* window = (CvWindow*)user_data; - CvPoint2D32f 
pt32f(-1., -1.); - CvPoint pt(-1,-1); + CvPoint2D32f pt32f = {-1., -1.}; + CvPoint pt = {-1,-1}; int cv_event = -1, state = 0, flags = 0; CvImageWidget * image_widget = CV_IMAGE_WIDGET( widget ); diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp index 8d89b412a0..77b0ff124a 100644 --- a/modules/imgcodecs/src/loadsave.cpp +++ b/modules/imgcodecs/src/loadsave.cpp @@ -491,7 +491,7 @@ imread_( const String& filename, int flags, int hdrtype, Mat* mat=0 ) } else { - image = cvCreateImage( size, cvIplDepth(type), CV_MAT_CN(type) ); + image = cvCreateImage(cvSize(size), cvIplDepth(type), CV_MAT_CN(type)); temp = cvarrToMat( image ); } @@ -838,7 +838,7 @@ imdecode_( const Mat& buf, int flags, int hdrtype, Mat* mat=0 ) } else { - image = cvCreateImage( size, cvIplDepth(type), CV_MAT_CN(type) ); + image = cvCreateImage(cvSize(size), cvIplDepth(type), CV_MAT_CN(type)); temp = cvarrToMat(image); } diff --git a/modules/imgcodecs/src/utils.cpp b/modules/imgcodecs/src/utils.cpp index 3273912289..6aeb631060 100644 --- a/modules/imgcodecs/src/utils.cpp +++ b/modules/imgcodecs/src/utils.cpp @@ -652,7 +652,7 @@ cvConvertImage( const CvArr* srcarr, CvArr* dstarr, int flags ) uchar *s = src->data.ptr, *d = dst->data.ptr; int s_step = src->step, d_step = dst->step; int code = src_cn*10 + dst_cn; - CvSize size(src->cols, src->rows); + CvSize size = {src->cols, src->rows}; if( CV_IS_MAT_CONT(src->type & dst->type) ) { diff --git a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h index 78032b7626..cec0f3653a 100644 --- a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h +++ b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h @@ -1036,9 +1036,10 @@ CV_INLINE void cvEllipseBox( CvArr* img, CvBox2D box, CvScalar color, int thickness CV_DEFAULT(1), int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) ) { - CvSize axes; - axes.width = cvRound(box.size.width*0.5); - axes.height = 
cvRound(box.size.height*0.5); + CvSize axes = cvSize( + cvRound(box.size.width*0.5), + cvRound(box.size.height*0.5) + ); cvEllipse( img, cvPointFrom32f( box.center ), axes, box.angle, 0, 360, color, thickness, line_type, shift ); diff --git a/modules/imgproc/include/opencv2/imgproc/types_c.h b/modules/imgproc/include/opencv2/imgproc/types_c.h index 13ffe1b1a3..d3e55f576f 100644 --- a/modules/imgproc/include/opencv2/imgproc/types_c.h +++ b/modules/imgproc/include/opencv2/imgproc/types_c.h @@ -410,7 +410,7 @@ typedef struct CvMoments double mu20, mu11, mu02, mu30, mu21, mu12, mu03; /**< central moments */ double inv_sqrt_m00; /**< m00 != 0 ? 1/sqrt(m00) : 0 */ -#ifdef __cplusplus +#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) CvMoments(){} CvMoments(const cv::Moments& m) { @@ -430,6 +430,36 @@ typedef struct CvMoments } CvMoments; +#ifdef __cplusplus +} // extern "C" + +CV_INLINE CvMoments cvMoments() +{ +#if !defined(CV__ENABLE_C_API_CTORS) + CvMoments self = CV_STRUCT_INITIALIZER; return self; +#else + return CvMoments(); +#endif +} + +CV_INLINE CvMoments cvMoments(const cv::Moments& m) +{ +#if !defined(CV__ENABLE_C_API_CTORS) + double am00 = std::abs(m.m00); + CvMoments self = { + m.m00, m.m10, m.m01, m.m20, m.m11, m.m02, m.m30, m.m21, m.m12, m.m03, + m.mu20, m.mu11, m.mu02, m.mu30, m.mu21, m.mu12, m.mu03, + am00 > DBL_EPSILON ? 
1./std::sqrt(am00) : 0 + }; + return self; +#else + return CvMoments(m); +#endif +} + +extern "C" { +#endif // __cplusplus + /** Hu invariants */ typedef struct CvHuMoments { diff --git a/modules/imgproc/src/approx.cpp b/modules/imgproc/src/approx.cpp index 0af613d666..954ebc26a7 100644 --- a/modules/imgproc/src/approx.cpp +++ b/modules/imgproc/src/approx.cpp @@ -135,7 +135,7 @@ CvSeq* icvApproximateChainTC89( CvChain* chain, int header_size, Determines support region for all the remained points */ do { - CvPoint pt0; + cv::Point2i pt0; int k, l = 0, d_num = 0; i = (int)(current - array); diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 456b3cc2bb..0ded70f793 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -49,7 +49,7 @@ (deltas)[6] = (step), (deltas)[7] = (step) + (nch)) static const CvPoint icvCodeDeltas[8] = - { CvPoint(1, 0), CvPoint(1, -1), CvPoint(0, -1), CvPoint(-1, -1), CvPoint(-1, 0), CvPoint(-1, 1), CvPoint(0, 1), CvPoint(1, 1) }; + { {1, 0}, {1, -1}, {0, -1}, {-1, -1}, {-1, 0}, {-1, 1}, {0, 1}, {1, 1} }; CV_IMPL void cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader ) @@ -77,19 +77,15 @@ cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader ) CV_IMPL CvPoint cvReadChainPoint( CvChainPtReader * reader ) { - schar *ptr; - int code; - CvPoint pt; - if( !reader ) CV_Error( CV_StsNullPtr, "" ); - pt = reader->pt; + cv::Point2i pt = reader->pt; - ptr = reader->ptr; - if( ptr ) + schar *ptr = reader->ptr; + if (ptr) { - code = *ptr++; + int code = *ptr++; if( ptr >= reader->block_max ) { @@ -104,7 +100,7 @@ cvReadChainPoint( CvChainPtReader * reader ) reader->pt.y = pt.y + icvCodeDeltas[code].y; } - return pt; + return cvPoint(pt); } @@ -209,14 +205,7 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage, CV_Error( CV_StsBadSize, "" ); CvContourScanner scanner = (CvContourScanner)cvAlloc( sizeof( *scanner )); -#if defined __GNUC__ && __GNUC__ >= 8 
-#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif memset( scanner, 0, sizeof(*scanner) ); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop -#endif scanner->storage1 = scanner->storage2 = storage; scanner->img0 = (schar *) img; @@ -700,7 +689,7 @@ icvFetchContourEx( schar* ptr, int deltas[MAX_SIZE]; CvSeqWriter writer; schar *i0 = ptr, *i1, *i3, *i4 = NULL; - CvRect rect; + cv::Rect rect; int prev_s = -1, s, s_end; int method = _method - 1; @@ -810,14 +799,14 @@ icvFetchContourEx( schar* ptr, cvEndWriteSeq( &writer ); if( _method != CV_CHAIN_CODE ) - ((CvContour*)contour)->rect = rect; + ((CvContour*)contour)->rect = cvRect(rect); CV_DbgAssert( (writer.seq->total == 0 && writer.seq->first == 0) || writer.seq->total > writer.seq->first->count || (writer.seq->first->prev == writer.seq->first && writer.seq->first->next == writer.seq->first) ); - if( _rect ) *_rect = rect; + if( _rect ) *_rect = cvRect(rect); } @@ -888,7 +877,7 @@ icvFetchContourEx_32s( int* ptr, int deltas[MAX_SIZE]; CvSeqWriter writer; int *i0 = ptr, *i1, *i3, *i4; - CvRect rect; + cv::Rect rect; int prev_s = -1, s, s_end; int method = _method - 1; const int right_flag = INT_MIN; @@ -1000,14 +989,14 @@ icvFetchContourEx_32s( int* ptr, cvEndWriteSeq( &writer ); if( _method != CV_CHAIN_CODE ) - ((CvContour*)contour)->rect = rect; + ((CvContour*)contour)->rect = cvRect(rect); CV_DbgAssert( (writer.seq->total == 0 && writer.seq->first == 0) || writer.seq->total > writer.seq->first->count || (writer.seq->first->prev == writer.seq->first && writer.seq->first->next == writer.seq->first) ); - if( _rect ) *_rect = rect; + if (_rect) *_rect = cvRect(rect); } @@ -1035,7 +1024,7 @@ cvFindNextContour( CvContourScanner scanner ) int width = scanner->img_size.width; int height = scanner->img_size.height; int mode = scanner->mode; - CvPoint lnbd = scanner->lnbd; + cv::Point2i lnbd = scanner->lnbd; int nbd = scanner->nbd; int prev = img[x - 1]; int new_mask = 
-2; @@ -1125,7 +1114,7 @@ cvFindNextContour( CvContourScanner scanner ) _CvContourInfo *par_info = 0; CvSeq *seq = 0; int is_hole = 0; - CvPoint origin; + cv::Point2i origin; /* if not external contour */ if( (!img_i && !(prev == 0 && p == 1)) || @@ -1259,7 +1248,7 @@ cvFindNextContour( CvContourScanner scanner ) l_cinfo->is_hole = is_hole; l_cinfo->contour = seq; - l_cinfo->origin = origin; + l_cinfo->origin = cvPoint(origin); l_cinfo->parent = par_info; if( scanner->approx_method1 != scanner->approx_method2 ) @@ -1292,7 +1281,7 @@ cvFindNextContour( CvContourScanner scanner ) scanner->l_cinfo = l_cinfo; scanner->pt.x = !img_i ? x + 1 : x + 1 - is_hole; scanner->pt.y = y; - scanner->lnbd = lnbd; + scanner->lnbd = cvPoint(lnbd); scanner->img = (schar *) img; scanner->nbd = nbd; return l_cinfo->contour; @@ -1480,7 +1469,7 @@ icvFindContoursInInterval( const CvArr* src, uchar* src_data = 0; int img_step = 0; - CvSize img_size; + cv::Size img_size; int connect_flag; int lower_total; @@ -1529,7 +1518,7 @@ icvFindContoursInInterval( const CvArr* src, CV_Error( CV_StsBadArg, "Input array must be 8uC1 or 8sC1" ); src_data = mat->data.ptr; img_step = mat->step; - img_size = cvGetMatSize( mat ); + img_size = cvGetMatSize(mat); // Create temporary sequences runs = cvCreateSeq(0, sizeof(CvSeq), sizeof(CvLinkedRunPoint), storage00 ); @@ -1550,7 +1539,7 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev = upper_line; for( j = 0; j < img_size.width; ) { - j = findStartContourPoint(src_data, img_size, j, haveSIMD); + j = findStartContourPoint(src_data, cvSize(img_size), j, haveSIMD); if( j == img_size.width ) break; @@ -1560,7 +1549,7 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - j = findEndContourPoint(src_data, img_size, j + 1, haveSIMD); + j = findEndContourPoint(src_data, cvSize(img_size), j + 1, haveSIMD); tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); @@ -1584,7 
+1573,7 @@ icvFindContoursInInterval( const CvArr* src, all_total = runs->total; for( j = 0; j < img_size.width; ) { - j = findStartContourPoint(src_data, img_size, j, haveSIMD); + j = findStartContourPoint(src_data, cvSize(img_size), j, haveSIMD); if( j == img_size.width ) break; @@ -1593,7 +1582,7 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - j = findEndContourPoint(src_data, img_size, j + 1, haveSIMD); + j = findEndContourPoint(src_data, cvSize(img_size), j + 1, haveSIMD); tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); @@ -1908,11 +1897,11 @@ void cv::findContours( InputOutputArray _image, OutputArrayOfArrays _contours, image = image0; } MemStorage storage(cvCreateMemStorage()); - CvMat _cimage = image; + CvMat _cimage = cvMat(image); CvSeq* _ccontours = 0; if( _hierarchy.needed() ) _hierarchy.clear(); - cvFindContours_Impl(&_cimage, storage, &_ccontours, sizeof(CvContour), mode, method, offset + offset0, 0); + cvFindContours_Impl(&_cimage, storage, &_ccontours, sizeof(CvContour), mode, method, cvPoint(offset0 + offset), 0); if( !_ccontours ) { _contours.clear(); diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 09fb6363e2..3d710da9a5 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -2478,7 +2478,7 @@ void cv::drawContours( InputOutputArray _image, InputArrayOfArrays _contours, CV_INSTRUMENT_REGION() Mat image = _image.getMat(), hierarchy = _hierarchy.getMat(); - CvMat _cimage = image; + CvMat _cimage = cvMat(image); size_t ncontours = _contours.total(); size_t i = 0, first = 0, last = ncontours; @@ -2547,8 +2547,8 @@ void cv::drawContours( InputOutputArray _image, InputArrayOfArrays _contours, } } - cvDrawContours( &_cimage, &seq[first], color, color, contourIdx >= 0 ? 
- -maxLevel : maxLevel, thickness, lineType, offset ); + cvDrawContours( &_cimage, &seq[first], cvScalar(color), cvScalar(color), contourIdx >= 0 ? + -maxLevel : maxLevel, thickness, lineType, cvPoint(offset) ); } @@ -2559,11 +2559,6 @@ static const int CodeDeltas[8][2] = #define CV_ADJUST_EDGE_COUNT( count, seq ) \ ((count) -= ((count) == (seq)->total && !CV_IS_SEQ_CLOSED(seq))) -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif - CV_IMPL void cvDrawContours( void* _img, CvSeq* contour, CvScalar _externalColor, CvScalar _holeColor, @@ -2657,14 +2652,14 @@ cvDrawContours( void* _img, CvSeq* contour, int shift = 0; count -= !CV_IS_SEQ_CLOSED(contour); - CV_READ_SEQ_ELEM( pt1, reader ); + { CvPoint pt_ = CV_STRUCT_INITIALIZER; CV_READ_SEQ_ELEM(pt_, reader); pt1 = pt_; } pt1 += offset; if( thickness < 0 ) pts.push_back(pt1); for( i = 0; i < count; i++ ) { - CV_READ_SEQ_ELEM( pt2, reader ); + { CvPoint pt_ = CV_STRUCT_INITIALIZER; CV_READ_SEQ_ELEM(pt_, reader); pt2 = pt_; } pt2 += offset; if( thickness >= 0 ) cv::ThickLine( img, pt1, pt2, clr, thickness, line_type, 2, shift ); @@ -2706,7 +2701,7 @@ cvEllipse2Poly( CvPoint center, CvSize axes, int angle, CV_IMPL CvScalar cvColorToScalar( double packed_color, int type ) { - CvScalar scalar; + cv::Scalar scalar; if( CV_MAT_DEPTH( type ) == CV_8U ) { @@ -2764,7 +2759,7 @@ cvColorToScalar( double packed_color, int type ) } } - return scalar; + return cvScalar(scalar); } CV_IMPL int @@ -2892,11 +2887,7 @@ cvGetTextSize( const char *text, const CvFont *_font, CvSize *_size, int *_base_ cv::Size size = cv::getTextSize( text, _font->font_face, (_font->hscale + _font->vscale)*0.5, _font->thickness, _base_line ); if( _size ) - *_size = size; + *_size = cvSize(size); } -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop // "-Wclass-memaccess" -#endif - /* End of file. 
*/ diff --git a/modules/imgproc/src/emd.cpp b/modules/imgproc/src/emd.cpp index 0b985a066e..ad74990ce9 100644 --- a/modules/imgproc/src/emd.cpp +++ b/modules/imgproc/src/emd.cpp @@ -1156,15 +1156,15 @@ float cv::EMD( InputArray _signature1, InputArray _signature2, Mat signature1 = _signature1.getMat(), signature2 = _signature2.getMat(); Mat cost = _cost.getMat(), flow; - CvMat _csignature1 = signature1; - CvMat _csignature2 = signature2; - CvMat _ccost = cost, _cflow; + CvMat _csignature1 = cvMat(signature1); + CvMat _csignature2 = cvMat(signature2); + CvMat _ccost = cvMat(cost), _cflow; if( _flow.needed() ) { _flow.create(signature1.rows, signature2.rows, CV_32F); flow = _flow.getMat(); flow = Scalar::all(0); - _cflow = flow; + _cflow = cvMat(flow); } return cvCalcEMD2( &_csignature1, &_csignature2, distType, 0, cost.empty() ? 0 : &_ccost, diff --git a/modules/imgproc/src/featureselect.cpp b/modules/imgproc/src/featureselect.cpp index c384a81148..3686310eaf 100644 --- a/modules/imgproc/src/featureselect.cpp +++ b/modules/imgproc/src/featureselect.cpp @@ -530,7 +530,7 @@ cvGoodFeaturesToTrack( const void* _image, void*, void*, size_t i, ncorners = corners.size(); for( i = 0; i < ncorners; i++ ) - _corners[i] = corners[i]; + _corners[i] = cvPoint2D32f(corners[i]); *_corner_count = (int)ncorners; } diff --git a/modules/imgproc/src/floodfill.cpp b/modules/imgproc/src/floodfill.cpp index 0509c61267..da69e29969 100644 --- a/modules/imgproc/src/floodfill.cpp +++ b/modules/imgproc/src/floodfill.cpp @@ -641,15 +641,8 @@ cvFloodFill( CvArr* arr, CvPoint seed_point, CvScalar newVal, CvScalar lo_diff, CvScalar up_diff, CvConnectedComp* comp, int flags, CvArr* maskarr ) { -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif if( comp ) memset( comp, 0, sizeof(*comp) ); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop -#endif cv::Mat img = cv::cvarrToMat(arr), mask = 
cv::cvarrToMat(maskarr); int area = cv::floodFill(img, mask, seed_point, newVal, diff --git a/modules/imgproc/src/geometry.cpp b/modules/imgproc/src/geometry.cpp index 421fa64ed3..2e602ed580 100644 --- a/modules/imgproc/src/geometry.cpp +++ b/modules/imgproc/src/geometry.cpp @@ -46,7 +46,7 @@ cvMaxRect( const CvRect* rect1, const CvRect* rect2 ) { if( rect1 && rect2 ) { - CvRect max_rect; + cv::Rect max_rect; int a, b; max_rect.x = a = rect1->x; @@ -72,7 +72,7 @@ cvMaxRect( const CvRect* rect1, const CvRect* rect2 ) if( max_rect.height < b ) max_rect.height = b; max_rect.height -= max_rect.y; - return max_rect; + return cvRect(max_rect); } else if( rect1 ) return *rect1; diff --git a/modules/imgproc/src/grabcut.cpp b/modules/imgproc/src/grabcut.cpp index ff3c601548..21dace9072 100644 --- a/modules/imgproc/src/grabcut.cpp +++ b/modules/imgproc/src/grabcut.cpp @@ -69,7 +69,7 @@ public: void endLearning(); private: - void calcInverseCovAndDeterm( int ci ); + void calcInverseCovAndDeterm(int ci, double singularFix); Mat model; double* coefs; double* mean; @@ -103,7 +103,7 @@ GMM::GMM( Mat& _model ) for( int ci = 0; ci < componentsCount; ci++ ) if( coefs[ci] > 0 ) - calcInverseCovAndDeterm( ci ); + calcInverseCovAndDeterm(ci, 0.0); totalSampleCount = 0; } @@ -175,7 +175,6 @@ void GMM::addSample( int ci, const Vec3d color ) void GMM::endLearning() { CV_Assert(totalSampleCount > 0); - const double variance = 0.01; for( int ci = 0; ci < componentsCount; ci++ ) { int n = sampleCounts[ci]; @@ -183,48 +182,49 @@ void GMM::endLearning() coefs[ci] = 0; else { + double inv_n = 1.0 / n; coefs[ci] = (double)n/totalSampleCount; double* m = mean + 3*ci; - m[0] = sums[ci][0]/n; m[1] = sums[ci][1]/n; m[2] = sums[ci][2]/n; + m[0] = sums[ci][0] * inv_n; m[1] = sums[ci][1] * inv_n; m[2] = sums[ci][2] * inv_n; double* c = cov + 9*ci; - c[0] = prods[ci][0][0]/n - m[0]*m[0]; c[1] = prods[ci][0][1]/n - m[0]*m[1]; c[2] = prods[ci][0][2]/n - m[0]*m[2]; - c[3] = prods[ci][1][0]/n - m[1]*m[0]; 
c[4] = prods[ci][1][1]/n - m[1]*m[1]; c[5] = prods[ci][1][2]/n - m[1]*m[2]; - c[6] = prods[ci][2][0]/n - m[2]*m[0]; c[7] = prods[ci][2][1]/n - m[2]*m[1]; c[8] = prods[ci][2][2]/n - m[2]*m[2]; + c[0] = prods[ci][0][0] * inv_n - m[0]*m[0]; c[1] = prods[ci][0][1] * inv_n - m[0]*m[1]; c[2] = prods[ci][0][2] * inv_n - m[0]*m[2]; + c[3] = prods[ci][1][0] * inv_n - m[1]*m[0]; c[4] = prods[ci][1][1] * inv_n - m[1]*m[1]; c[5] = prods[ci][1][2] * inv_n - m[1]*m[2]; + c[6] = prods[ci][2][0] * inv_n - m[2]*m[0]; c[7] = prods[ci][2][1] * inv_n - m[2]*m[1]; c[8] = prods[ci][2][2] * inv_n - m[2]*m[2]; - double dtrm = c[0]*(c[4]*c[8]-c[5]*c[7]) - c[1]*(c[3]*c[8]-c[5]*c[6]) + c[2]*(c[3]*c[7]-c[4]*c[6]); - if( dtrm <= std::numeric_limits::epsilon() ) - { - // Adds the white noise to avoid singular covariance matrix. - c[0] += variance; - c[4] += variance; - c[8] += variance; - } - - calcInverseCovAndDeterm(ci); + calcInverseCovAndDeterm(ci, 0.01); } } } -void GMM::calcInverseCovAndDeterm( int ci ) +void GMM::calcInverseCovAndDeterm(int ci, const double singularFix) { if( coefs[ci] > 0 ) { double *c = cov + 9*ci; - double dtrm = - covDeterms[ci] = c[0]*(c[4]*c[8]-c[5]*c[7]) - c[1]*(c[3]*c[8]-c[5]*c[6]) + c[2]*(c[3]*c[7]-c[4]*c[6]); + double dtrm = c[0]*(c[4]*c[8]-c[5]*c[7]) - c[1]*(c[3]*c[8]-c[5]*c[6]) + c[2]*(c[3]*c[7]-c[4]*c[6]); + if (dtrm <= 1e-6 && singularFix > 0) + { + // Adds the white noise to avoid singular covariance matrix. 
+ c[0] += singularFix; + c[4] += singularFix; + c[8] += singularFix; + dtrm = c[0] * (c[4] * c[8] - c[5] * c[7]) - c[1] * (c[3] * c[8] - c[5] * c[6]) + c[2] * (c[3] * c[7] - c[4] * c[6]); + } + covDeterms[ci] = dtrm; CV_Assert( dtrm > std::numeric_limits::epsilon() ); - inverseCovs[ci][0][0] = (c[4]*c[8] - c[5]*c[7]) / dtrm; - inverseCovs[ci][1][0] = -(c[3]*c[8] - c[5]*c[6]) / dtrm; - inverseCovs[ci][2][0] = (c[3]*c[7] - c[4]*c[6]) / dtrm; - inverseCovs[ci][0][1] = -(c[1]*c[8] - c[2]*c[7]) / dtrm; - inverseCovs[ci][1][1] = (c[0]*c[8] - c[2]*c[6]) / dtrm; - inverseCovs[ci][2][1] = -(c[0]*c[7] - c[1]*c[6]) / dtrm; - inverseCovs[ci][0][2] = (c[1]*c[5] - c[2]*c[4]) / dtrm; - inverseCovs[ci][1][2] = -(c[0]*c[5] - c[2]*c[3]) / dtrm; - inverseCovs[ci][2][2] = (c[0]*c[4] - c[1]*c[3]) / dtrm; + double inv_dtrm = 1.0 / dtrm; + inverseCovs[ci][0][0] = (c[4]*c[8] - c[5]*c[7]) * inv_dtrm; + inverseCovs[ci][1][0] = -(c[3]*c[8] - c[5]*c[6]) * inv_dtrm; + inverseCovs[ci][2][0] = (c[3]*c[7] - c[4]*c[6]) * inv_dtrm; + inverseCovs[ci][0][1] = -(c[1]*c[8] - c[2]*c[7]) * inv_dtrm; + inverseCovs[ci][1][1] = (c[0]*c[8] - c[2]*c[6]) * inv_dtrm; + inverseCovs[ci][2][1] = -(c[0]*c[7] - c[1]*c[6]) * inv_dtrm; + inverseCovs[ci][0][2] = (c[1]*c[5] - c[2]*c[4]) * inv_dtrm; + inverseCovs[ci][1][2] = -(c[0]*c[5] - c[2]*c[3]) * inv_dtrm; + inverseCovs[ci][2][2] = (c[0]*c[4] - c[1]*c[3]) * inv_dtrm; } } diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 6eb848068b..b420453405 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2445,7 +2445,7 @@ cvGetMinMaxHistValue( const CvHistogram* hist, if( !CV_IS_SPARSE_HIST(hist) ) { CvMat mat; - CvPoint minPt, maxPt; + CvPoint minPt = {0, 0}, maxPt = {0, 0}; cvGetMat( hist->bins, &mat, 0, 1 ); cvMinMaxLoc( &mat, &minVal, &maxVal, &minPt, &maxPt ); @@ -2969,7 +2969,7 @@ cvCalcArrBackProjectPatch( CvArr** arr, CvArr* dst, CvSize patch_size, CvHistogr CvMat dststub, *dstmat; int i, 
dims; int x, y; - CvSize size; + cv::Size size; if( !CV_IS_HIST(hist)) CV_Error( CV_StsBadArg, "Bad histogram pointer" ); diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index fc986aff0b..7b7ba145e9 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -815,7 +815,7 @@ CV_IMPL void cvMoments( const CvArr* arr, CvMoments* moments, int binary ) src = cv::cvarrToMat(arr); cv::Moments m = cv::moments(src, binary != 0); CV_Assert( moments != 0 ); - *moments = m; + *moments = cvMoments(m); } diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 1e058feb1f..a87c5ce0c2 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -1673,7 +1673,7 @@ cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate, CV_Error( CV_StsOutOfRange, "The number of extra layers must be non negative" ); int i, layer_step, elem_size = CV_ELEM_SIZE(src->type); - CvSize layer_size, size = cvGetMatSize(src); + cv::Size layer_size, size = cvGetMatSize(src); if( bufarr ) { diff --git a/modules/imgproc/src/rotcalipers.cpp b/modules/imgproc/src/rotcalipers.cpp index 487e1d6243..d121903626 100644 --- a/modules/imgproc/src/rotcalipers.cpp +++ b/modules/imgproc/src/rotcalipers.cpp @@ -401,7 +401,7 @@ cvMinAreaRect2( const CvArr* array, CvMemStorage* /*storage*/ ) cv::Mat points = cv::cvarrToMat(array, false, false, 0, &abuf); cv::RotatedRect rr = cv::minAreaRect(points); - return (CvBox2D)rr; + return cvBox2D(rr); } void cv::boxPoints(cv::RotatedRect box, OutputArray _pts) diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index 460a61f9f5..35e36401c6 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -969,7 +969,7 @@ cvMinEnclosingCircle( const void* array, CvPoint2D32f * _center, float *_radius cv::minEnclosingCircle(points, center, radius); if(_center) - *_center = center; + *_center = 
cvPoint2D32f(center); if(_radius) *_radius = radius; return 1; @@ -1009,8 +1009,8 @@ icvMemCopy( double **buf1, double **buf2, double **buf3, int *b_max ) /* area of a contour sector */ static double icvContourSecArea( CvSeq * contour, CvSlice slice ) { - CvPoint pt; /* pointer to points */ - CvPoint pt_s, pt_e; /* first and last points */ + cv::Point pt; /* pointer to points */ + cv::Point pt_s, pt_e; /* first and last points */ CvSeqReader reader; /* points reader of contour */ int p_max = 2, p_ind; @@ -1044,10 +1044,10 @@ static double icvContourSecArea( CvSeq * contour, CvSlice slice ) cvStartReadSeq( contour, &reader, 0 ); cvSetSeqReaderPos( &reader, slice.start_index ); - CV_READ_SEQ_ELEM( pt_s, reader ); + { CvPoint pt_s_ = CV_STRUCT_INITIALIZER; CV_READ_SEQ_ELEM(pt_s_, reader); pt_s = pt_s_; } p_ind = 0; cvSetSeqReaderPos( &reader, slice.end_index ); - CV_READ_SEQ_ELEM( pt_e, reader ); + { CvPoint pt_e_ = CV_STRUCT_INITIALIZER; CV_READ_SEQ_ELEM(pt_e_, reader); pt_e = pt_e_; } /* normal coefficients */ nx = pt_s.y - pt_e.y; @@ -1056,7 +1056,7 @@ static double icvContourSecArea( CvSeq * contour, CvSlice slice ) while( lpt-- > 0 ) { - CV_READ_SEQ_ELEM( pt, reader ); + { CvPoint pt_ = CV_STRUCT_INITIALIZER; CV_READ_SEQ_ELEM(pt_, reader); pt = pt_; } if( flag == 0 ) { @@ -1294,14 +1294,14 @@ cvFitEllipse2( const CvArr* array ) { cv::AutoBuffer abuf; cv::Mat points = cv::cvarrToMat(array, false, false, 0, &abuf); - return cv::fitEllipse(points); + return cvBox2D(cv::fitEllipse(points)); } /* Calculates bounding rectagnle of a point set or retrieves already calculated */ CV_IMPL CvRect cvBoundingRect( CvArr* array, int update ) { - CvRect rect; + cv::Rect rect; CvContour contour_header; CvSeq* ptseq = 0; CvSeqBlock block; @@ -1343,16 +1343,16 @@ cvBoundingRect( CvArr* array, int update ) if( mat ) { - rect = cv::maskBoundingRect(cv::cvarrToMat(mat)); + rect = cvRect(cv::maskBoundingRect(cv::cvarrToMat(mat))); } else if( ptseq->total ) { cv::AutoBuffer abuf; - rect 
= cv::pointSetBoundingRect(cv::cvarrToMat(ptseq, false, false, 0, &abuf)); + rect = cvRect(cv::pointSetBoundingRect(cv::cvarrToMat(ptseq, false, false, 0, &abuf))); } if( update ) - ((CvContour*)ptseq)->rect = rect; - return rect; + ((CvContour*)ptseq)->rect = cvRect(rect); + return cvRect(rect); } /* End of file. */ diff --git a/modules/imgproc/src/undistort.cpp b/modules/imgproc/src/undistort.cpp index dc71bc42eb..1c9399611f 100644 --- a/modules/imgproc/src/undistort.cpp +++ b/modules/imgproc/src/undistort.cpp @@ -567,14 +567,14 @@ void cv::undistortPoints( InputArray _src, OutputArray _dst, _dst.create(src.size(), src.type(), -1, true); Mat dst = _dst.getMat(); - CvMat _csrc = src, _cdst = dst, _ccameraMatrix = cameraMatrix; + CvMat _csrc = cvMat(src), _cdst = cvMat(dst), _ccameraMatrix = cvMat(cameraMatrix); CvMat matR, matP, _cdistCoeffs, *pR=0, *pP=0, *pD=0; if( !R.empty() ) - pR = &(matR = R); + pR = &(matR = cvMat(R)); if( !P.empty() ) - pP = &(matP = P); + pP = &(matP = cvMat(P)); if( !distCoeffs.empty() ) - pD = &(_cdistCoeffs = distCoeffs); + pD = &(_cdistCoeffs = cvMat(distCoeffs)); cvUndistortPointsInternal(&_csrc, &_cdst, &_ccameraMatrix, pD, pR, pP, criteria); } diff --git a/modules/imgproc/test/test_approxpoly.cpp b/modules/imgproc/test/test_approxpoly.cpp index fbb977a87a..69511a6f8c 100644 --- a/modules/imgproc/test/test_approxpoly.cpp +++ b/modules/imgproc/test/test_approxpoly.cpp @@ -126,10 +126,10 @@ bool CV_ApproxPolyTest::get_contour( int /*type*/, CvSeq** Seq, int* d, int i; CvSeq* seq; int total = cvtest::randInt(rng) % 1000 + 1; - CvPoint center; + Point center; int radius, angle; double deg_to_rad = CV_PI/180.; - CvPoint pt; + Point pt; center.x = cvtest::randInt( rng ) % 1000; center.y = cvtest::randInt( rng ) % 1000; @@ -166,7 +166,7 @@ int CV_ApproxPolyTest::check_slice( CvPoint StartPt, CvPoint EndPt, int* _j, int Count ) { /////////// - CvPoint Pt; + Point Pt; /////////// bool flag; double dy,dx; @@ -208,7 +208,7 @@ int 
CV_ApproxPolyTest::check_slice( CvPoint StartPt, CvPoint EndPt, /////// find start point and check distance //////// for( j = *_j; j < Count; j++ ) { - CV_READ_SEQ_ELEM( Pt, *SrcReader ); + { CvPoint pt_ = CV_STRUCT_INITIALIZER; CV_READ_SEQ_ELEM(pt_, *SrcReader); Pt = pt_; } if( StartPt.x == Pt.x && StartPt.y == Pt.y ) break; else { @@ -230,7 +230,7 @@ int CV_ApproxPolyTest::check( CvSeq* SrcSeq, CvSeq* DstSeq, float Eps ) ////////// CvSeqReader DstReader; CvSeqReader SrcReader; - CvPoint StartPt, EndPt; + CvPoint StartPt = {0, 0}, EndPt = {0, 0}; /////////// int TotalErrors = 0; /////////// diff --git a/modules/imgproc/test/test_contours.cpp b/modules/imgproc/test/test_contours.cpp index 1202be3840..a5c924829d 100644 --- a/modules/imgproc/test/test_contours.cpp +++ b/modules/imgproc/test/test_contours.cpp @@ -65,7 +65,7 @@ protected: int min_log_img_width, max_log_img_width; int min_log_img_height, max_log_img_height; - CvSize img_size; + Size img_size; int count, count2; IplImage* img[NUM_IMG]; @@ -170,9 +170,9 @@ cvTsGenerateBlobImage( IplImage* img, int min_blob_size, int max_blob_size, RNG& rng ) { int i; - CvSize size; + Size size; - assert( img->depth == IPL_DEPTH_8U && img->nChannels == 1 ); + CV_Assert(img->depth == IPL_DEPTH_8U && img->nChannels == 1); cvZero( img ); @@ -182,8 +182,8 @@ cvTsGenerateBlobImage( IplImage* img, int min_blob_size, int max_blob_size, for( i = 0; i < blob_count; i++ ) { - CvPoint center; - CvSize axes; + Point center; + Size axes; int angle = cvtest::randInt(rng) % 180; int brightness = cvtest::randInt(rng) % (max_brightness - min_brightness) + min_brightness; @@ -195,7 +195,7 @@ cvTsGenerateBlobImage( IplImage* img, int min_blob_size, int max_blob_size, axes.height = (cvtest::randInt(rng) % (max_blob_size - min_blob_size) + min_blob_size + 1)/2; - cvEllipse( img, center, axes, angle, 0, 360, cvScalar(brightness), CV_FILLED ); + cvEllipse( img, cvPoint(center), cvSize(axes), angle, 0, 360, cvScalar(brightness), CV_FILLED ); } 
cvResetImageROI( img ); @@ -246,7 +246,7 @@ int CV_FindContourTest::prepare_test_case( int test_case_idx ) storage = cvCreateMemStorage( 1 << 10 ); for( i = 0; i < NUM_IMG; i++ ) - img[i] = cvCreateImage( img_size, 8, 1 ); + img[i] = cvCreateImage( cvSize(img_size), 8, 1 ); cvTsGenerateBlobImage( img[0], min_blob_size, max_blob_size, blob_count, min_brightness, max_brightness, rng ); @@ -376,8 +376,8 @@ int CV_FindContourTest::validate_test_results( int /*test_case_idx*/ ) for(int i = 0; i < seq1->total; i++ ) { - CvPoint pt1; - CvPoint pt2; + CvPoint pt1 = {0, 0}; + CvPoint pt2 = {0, 0}; CV_READ_SEQ_ELEM( pt1, reader1 ); CV_READ_SEQ_ELEM( pt2, reader2 ); diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp index 7ad2f5fff7..3f82e04524 100644 --- a/modules/imgproc/test/test_convhull.cpp +++ b/modules/imgproc/test/test_convhull.cpp @@ -77,6 +77,13 @@ cvTsDist( CvPoint2D32f a, CvPoint2D32f b ) double dy = a.y - b.y; return sqrt(dx*dx + dy*dy); } +CV_INLINE double +cvTsDist( const Point2f& a, const Point2f& b ) +{ + double dx = a.x - b.x; + double dy = a.y - b.y; + return sqrt(dx*dx + dy*dy); +} CV_INLINE double cvTsPtLineDist( CvPoint2D32f pt, CvPoint2D32f a, CvPoint2D32f b ) @@ -95,7 +102,7 @@ static double cvTsPointPolygonTest( CvPoint2D32f pt, const CvPoint2D32f* vv, int n, int* _idx=0, int* _on_edge=0 ) { int i; - CvPoint2D32f v = vv[n-1], v0; + Point2f v = vv[n-1], v0; double min_dist_num = FLT_MAX, min_dist_denom = 1; int min_dist_idx = -1, min_on_edge = 0; int counter = 0; @@ -169,9 +176,9 @@ cvTsMiddlePoint(const cv::Point2f &a, const cv::Point2f &b) static bool cvTsIsPointOnLineSegment(const cv::Point2f &x, const cv::Point2f &a, const cv::Point2f &b) { - double d1 = cvTsDist(CvPoint2D32f(x.x, x.y), CvPoint2D32f(a.x, a.y)); - double d2 = cvTsDist(CvPoint2D32f(x.x, x.y), CvPoint2D32f(b.x, b.y)); - double d3 = cvTsDist(CvPoint2D32f(a.x, a.y), CvPoint2D32f(b.x, b.y)); + double d1 = cvTsDist(cvPoint2D32f(x.x, x.y), 
cvPoint2D32f(a.x, a.y)); + double d2 = cvTsDist(cvPoint2D32f(x.x, x.y), cvPoint2D32f(b.x, b.y)); + double d3 = cvTsDist(cvPoint2D32f(a.x, a.y), cvPoint2D32f(b.x, b.y)); return (abs(d1 + d2 - d3) <= (1E-5)); } @@ -207,7 +214,7 @@ protected: void* points; void* result; double low_high_range; - CvScalar low, high; + Scalar low, high; bool test_cpp; }; @@ -694,7 +701,7 @@ void CV_MinAreaRectTest::run_func() else { cv::RotatedRect r = cv::minAreaRect(cv::cvarrToMat(points)); - box = (CvBox2D)r; + box = cvBox2D(r); r.points((cv::Point2f*)box_pt); } } @@ -938,7 +945,7 @@ protected: void run_func(void); int validate_test_results( int test_case_idx ); - CvPoint2D32f center; + Point2f center; float radius; }; @@ -951,7 +958,11 @@ CV_MinCircleTest::CV_MinCircleTest() void CV_MinCircleTest::run_func() { if(!test_cpp) - cvMinEnclosingCircle( points, ¢er, &radius ); + { + CvPoint2D32f c_center = cvPoint2D32f(center); + cvMinEnclosingCircle( points, &c_center, &radius ); + center = c_center; + } else { cv::Point2f tmpcenter; @@ -966,8 +977,8 @@ int CV_MinCircleTest::validate_test_results( int test_case_idx ) double eps = 1.03; int code = CV_BaseShapeDescrTest::validate_test_results( test_case_idx ); int i, j = 0, point_count = points2->rows + points2->cols - 1; - CvPoint2D32f *p = (CvPoint2D32f*)(points2->data.ptr); - CvPoint2D32f v[3]; + Point2f *p = (Point2f*)(points2->data.ptr); + Point2f v[3]; #if 0 { @@ -989,7 +1000,7 @@ int CV_MinCircleTest::validate_test_results( int test_case_idx ) // remember at most 3 points that are close to the boundary for( i = 0; i < point_count; i++ ) { - double d = cvTsDist( p[i], center ); + double d = cvTsDist(p[i], center); if( d > radius ) { ts->printf( cvtest::TS::LOG, "The point #%d is outside of the circle\n", i ); @@ -1145,7 +1156,8 @@ int CV_PerimeterTest::validate_test_results( int test_case_idx ) int code = CV_BaseShapeDescrTest::validate_test_results( test_case_idx ); int i, len = slice.end_index - slice.start_index, total = 
points2->cols + points2->rows - 1; double result0 = 0; - CvPoint2D32f prev_pt, pt, *ptr; + Point2f prev_pt, pt; + CvPoint2D32f *ptr; if( len < 0 ) len += total; @@ -1195,7 +1207,7 @@ protected: void generate_point_set( void* points ); void run_func(void); int validate_test_results( int test_case_idx ); - CvBox2D box0, box; + RotatedRect box0, box; double min_ellipse_size, max_noise; }; @@ -1248,12 +1260,12 @@ void CV_FitEllipseTest::generate_point_set( void* pointsSet ) data = ptm->data.ptr; } - assert( point_type == CV_32SC2 || point_type == CV_32FC2 ); + CV_Assert(point_type == CV_32SC2 || point_type == CV_32FC2); for( i = 0; i < total; i++ ) { CvPoint* pp; - CvPoint2D32f p; + CvPoint2D32f p = {0, 0}; double angle = cvtest::randReal(rng)*CV_PI*2; double x = box0.size.height*0.5*(cos(angle) + (cvtest::randReal(rng)-0.5)*2*max_noise); double y = box0.size.width*0.5*(sin(angle) + (cvtest::randReal(rng)-0.5)*2*max_noise); @@ -1291,7 +1303,7 @@ void CV_FitEllipseTest::run_func() if(!test_cpp) box = cvFitEllipse2( points ); else - box = (CvBox2D)cv::fitEllipse(cv::cvarrToMat(points)); + box = cv::fitEllipse(cv::cvarrToMat(points)); } int CV_FitEllipseTest::validate_test_results( int test_case_idx ) @@ -1459,7 +1471,7 @@ void CV_FitEllipseParallelTest::generate_point_set( void* ) void CV_FitEllipseParallelTest::run_func() { - box = (CvBox2D)cv::fitEllipse(pointsMat); + box = cv::fitEllipse(pointsMat); } CV_FitEllipseParallelTest::~CV_FitEllipseParallelTest(){ @@ -1704,7 +1716,7 @@ cvTsGenerateTousledBlob( CvPoint2D32f center, CvSize2D32f axes, for( i = 0; i < total; i++ ) { CvPoint* pp; - CvPoint2D32f p; + Point2f p; double phi0 = 2*CV_PI*i/total; double phi = CV_PI*angle/180.; @@ -1730,7 +1742,7 @@ cvTsGenerateTousledBlob( CvPoint2D32f center, CvSize2D32f axes, pp->y = cvRound(p.y); } else - *(CvPoint2D32f*)pp = p; + *(CvPoint2D32f*)pp = cvPoint2D32f(p); } } @@ -1747,11 +1759,11 @@ protected: int validate_test_results( int test_case_idx ); CvMoments moments0, moments; 
double area0, area; - CvSize2D32f axes; - CvPoint2D32f center; + Size2f axes; + Point2f center; int max_max_r_scale; double max_r_scale, angle; - CvSize img_size; + Size img_size; }; @@ -1785,7 +1797,7 @@ void CV_ContourMomentsTest::generate_point_set( void* pointsSet ) max_r_scale = cvtest::randReal(rng)*max_max_r_scale*0.01; angle = cvtest::randReal(rng)*360; - cvTsGenerateTousledBlob( center, axes, max_r_scale, angle, pointsSet, rng ); + cvTsGenerateTousledBlob( cvPoint2D32f(center), cvSize2D32f(axes), max_r_scale, angle, pointsSet, rng ); if( points1 ) points1->flags = CV_SEQ_MAGIC_VAL + CV_SEQ_POLYGON; @@ -1811,7 +1823,7 @@ void CV_ContourMomentsTest::run_func() } else { - moments = (CvMoments)cv::moments(cv::cvarrToMat(points)); + moments = cvMoments(cv::moments(cv::cvarrToMat(points))); area = cv::contourArea(cv::cvarrToMat(points)); } } @@ -1904,13 +1916,13 @@ void CV_PerimeterAreaSliceTest::run( int ) cvClearMemStorage(storage); CvSeq* contour = cvCreateSeq(CV_SEQ_POLYGON, sizeof(CvSeq), sizeof(CvPoint), storage); double dphi = CV_PI*2/n; - CvPoint center; + Point center; center.x = rng.uniform(cvCeil(max_r), cvFloor(640-max_r)); center.y = rng.uniform(cvCeil(max_r), cvFloor(480-max_r)); for( int j = 0; j < n; j++ ) { - CvPoint pt; + CvPoint pt = CV_STRUCT_INITIALIZER; double r = rng.uniform(min_r, max_r); double phi = j*dphi; pt.x = cvRound(center.x + r*cos(phi)); @@ -1918,7 +1930,7 @@ void CV_PerimeterAreaSliceTest::run( int ) cvSeqPush(contour, &pt); } - CvSlice slice; + CvSlice slice = {0, 0}; for(;;) { slice.start_index = rng.uniform(-n/2, 3*n/2); diff --git a/modules/imgproc/test/test_distancetransform.cpp b/modules/imgproc/test/test_distancetransform.cpp index 75ec5ec8fa..652b5bfd24 100644 --- a/modules/imgproc/test/test_distancetransform.cpp +++ b/modules/imgproc/test/test_distancetransform.cpp @@ -275,7 +275,7 @@ cvTsDistTransform( const CvMat* _src, CvMat* _dst, int dist_type, void CV_DisTransTest::prepare_to_validation( int /*test_case_idx*/ ) { 
- CvMat _input = test_mat[INPUT][0], _output = test_mat[REF_OUTPUT][0]; + CvMat _input = cvMat(test_mat[INPUT][0]), _output = cvMat(test_mat[REF_OUTPUT][0]); cvTsDistTransform( &_input, &_output, dist_type, mask_size, mask, 0 ); } diff --git a/modules/imgproc/test/test_drawing.cpp b/modules/imgproc/test/test_drawing.cpp index f5fd6fc921..c42d7aa572 100644 --- a/modules/imgproc/test/test_drawing.cpp +++ b/modules/imgproc/test/test_drawing.cpp @@ -258,7 +258,7 @@ void CV_DrawingTest_C::draw( Mat& _img ) { CvSize imgSize = cvSize(600, 400); _img.create( imgSize, CV_8UC3 ); - CvMat img = _img; + CvMat img = cvMat(_img); vector polyline(4); polyline[0] = cvPoint(0, 0); @@ -282,7 +282,7 @@ void CV_DrawingTest_C::draw( Mat& _img ) if( cvClipLine(imgSize, &p1, &p2) ) cvCircle( &img, cvPoint(390,100), 10, cvScalar(0,0,255), 3 ); // not draw - p1 = Point(imgSize.width-1,1), p2 = Point(imgSize.width,3); + p1 = cvPoint(imgSize.width-1,1), p2 = cvPoint(imgSize.width,3); if( cvClipLine(imgSize, &p1, &p2) ) cvEllipse( &img, cvPoint(390,100), cvSize(20,30), 60, 0, 220.0, cvScalar(0,200,0), 4 ); //draw @@ -292,7 +292,7 @@ void CV_DrawingTest_C::draw( Mat& _img ) box.size.width = 200; box.size.height = 100; box.angle = 160; - cvEllipseBox( &img, box, Scalar(200,200,255), 5 ); + cvEllipseBox( &img, box, cvScalar(200,200,255), 5 ); polyline.resize(9); pts = &polyline[0]; @@ -311,7 +311,7 @@ void CV_DrawingTest_C::draw( Mat& _img ) n = (int)polyline.size(); actualSize = cvEllipse2Poly( cvPoint(500,300), cvSize(50,80), 0, 0, 180, &polyline[0], 10 ); CV_Assert(actualSize == n); - cvPolyLine( &img, &pts, &n, 1, true, Scalar(100,200,100), 20 ); + cvPolyLine( &img, &pts, &n, 1, true, cvScalar(100,200,100), 20 ); cvFillConvexPoly( &img, pts, n, cvScalar(0, 80, 0) ); polyline.resize(8); @@ -335,7 +335,7 @@ void CV_DrawingTest_C::draw( Mat& _img ) CvFont font; cvInitFont( &font, FONT_HERSHEY_SCRIPT_SIMPLEX, 2, 2, 0, 3 ); int baseline = 0; - CvSize textSize; + CvSize textSize = {0, 0}; 
cvGetTextSize( text1.c_str(), &font, &textSize, &baseline ); baseline += font.thickness; CvPoint textOrg = cvPoint((imgSize.width - textSize.width)/2, (imgSize.height + textSize.height)/2); @@ -398,7 +398,7 @@ void CV_DrawingTest_C::draw( Mat& _img ) int CV_DrawingTest_C::checkLineIterator( Mat& _img ) { CvLineIterator it; - CvMat img = _img; + CvMat img = cvMat(_img); int count = cvInitLineIterator( &img, cvPoint(0,300), cvPoint(1000, 300), &it ); for(int i = 0; i < count; i++ ) { diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp index e2da595ebe..b345994405 100644 --- a/modules/imgproc/test/test_filter.cpp +++ b/modules/imgproc/test/test_filter.cpp @@ -53,8 +53,8 @@ protected: int read_params( CvFileStorage* fs ); void get_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types ); void get_minmax_bounds( int i, int j, int type, Scalar& low, Scalar& high ); - CvSize aperture_size; - CvPoint anchor; + Size aperture_size; + Point anchor; int max_aperture_size; bool fp_kernel; bool inplace; @@ -70,8 +70,8 @@ CV_FilterBaseTest::CV_FilterBaseTest( bool _fp_kernel ) : fp_kernel(_fp_kernel) test_array[REF_OUTPUT].push_back(NULL); max_aperture_size = 13; inplace = false; - aperture_size = cvSize(0,0); - anchor = cvPoint(0,0); + aperture_size = Size(0,0); + anchor = Point(0,0); element_wise_relative_error = false; } @@ -420,9 +420,9 @@ double CV_FilterTest::get_success_error_level( int /*test_case_idx*/, int /*i*/, void CV_FilterTest::run_func() { - CvMat kernel = test_mat[INPUT][1]; + CvMat kernel = cvMat(test_mat[INPUT][1]); cvFilter2D( test_array[inplace ? 
OUTPUT : INPUT][0], - test_array[OUTPUT][0], &kernel, anchor ); + test_array[OUTPUT][0], &kernel, cvPoint(anchor)); } @@ -1119,7 +1119,7 @@ void CV_PyramidBaseTest::get_test_array_types_and_sizes( int test_case_idx, const int depthes[] = {CV_8U, CV_16S, CV_16U, CV_32F}; RNG& rng = ts->get_rng(); - CvSize sz; + CvSize sz = {0, 0}; CV_FilterBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); int depth = depthes[cvtest::randInt(rng) % (sizeof(depthes)/sizeof(depthes[0]))]; diff --git a/modules/imgproc/test/test_floodfill.cpp b/modules/imgproc/test/test_floodfill.cpp index 7c6c9e8bd4..b880c4ee37 100644 --- a/modules/imgproc/test/test_floodfill.cpp +++ b/modules/imgproc/test/test_floodfill.cpp @@ -60,9 +60,9 @@ protected: void get_timing_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types CvSize** whole_sizes, bool *are_images ); void print_timing_params( int test_case_idx, char* ptr, int params_left );*/ - CvPoint seed_pt; - CvScalar new_val; - CvScalar l_diff, u_diff; + Point seed_pt; + Scalar new_val; + Scalar l_diff, u_diff; int connectivity; bool use_mask, mask_only; int range_type; @@ -115,8 +115,8 @@ void CV_FloodFillTest::get_test_array_types_and_sizes( int test_case_idx, sizes[INPUT_OUTPUT][1] = sizes[REF_INPUT_OUTPUT][1] = cvSize(0,0); else { - CvSize sz = sizes[INPUT_OUTPUT][0]; - sizes[INPUT_OUTPUT][1] = sizes[REF_INPUT_OUTPUT][1] = cvSize(sz.width+2,sz.height+2); + Size sz = sizes[INPUT_OUTPUT][0]; + sizes[INPUT_OUTPUT][1] = sizes[REF_INPUT_OUTPUT][1] = Size(sz.width+2,sz.height+2); } seed_pt.x = cvtest::randInt(rng) % sizes[INPUT_OUTPUT][0].width; @@ -194,7 +194,7 @@ void CV_FloodFillTest::run_func() if(!test_cpp) { CvConnectedComp comp; - cvFloodFill( test_array[INPUT_OUTPUT][0], seed_pt, new_val, l_diff, u_diff, &comp, + cvFloodFill( test_array[INPUT_OUTPUT][0], cvPoint(seed_pt), cvScalar(new_val), cvScalar(l_diff), cvScalar(u_diff), &comp, flags, test_array[INPUT_OUTPUT][1] ); odata[0] = comp.area; 
odata[1] = comp.rect.x; @@ -269,7 +269,7 @@ cvTsFloodFill( CvMat* _img, CvPoint seed_pt, CvScalar new_val, { Mat m_mask = cvarrToMat(mask); cvtest::set( m_mask, Scalar::all(0), Mat() ); - cvRectangle( mask, cvPoint(0,0), cvPoint(mask->cols-1,mask->rows-1), Scalar::all(1.), 1, 8, 0 ); + cvRectangle( mask, cvPoint(0,0), cvPoint(mask->cols-1,mask->rows-1), cvScalar(Scalar::all(1.)), 1, 8, 0 ); } new_mask_val = (new_mask_val != 0 ? new_mask_val : 1) << 8; @@ -515,9 +515,9 @@ _exit_: void CV_FloodFillTest::prepare_to_validation( int /*test_case_idx*/ ) { double* comp = test_mat[REF_OUTPUT][0].ptr(); - CvMat _input = test_mat[REF_INPUT_OUTPUT][0]; - CvMat _mask = test_mat[REF_INPUT_OUTPUT][1]; - cvTsFloodFill( &_input, seed_pt, new_val, l_diff, u_diff, + CvMat _input = cvMat(test_mat[REF_INPUT_OUTPUT][0]); + CvMat _mask = cvMat(test_mat[REF_INPUT_OUTPUT][1]); + cvTsFloodFill( &_input, cvPoint(seed_pt), cvScalar(new_val), cvScalar(l_diff), cvScalar(u_diff), _mask.data.ptr ? &_mask : 0, comp, connectivity, range_type, new_mask_val, mask_only ); diff --git a/modules/imgproc/test/test_histograms.cpp b/modules/imgproc/test/test_histograms.cpp index 5386c29ac7..4214892739 100644 --- a/modules/imgproc/test/test_histograms.cpp +++ b/modules/imgproc/test/test_histograms.cpp @@ -1729,15 +1729,15 @@ int CV_CalcBackProjectPatchTest::prepare_test_case( int test_case_idx ) void CV_CalcBackProjectPatchTest::run_func(void) { - CvMat dst(images[CV_MAX_DIM]); + CvMat dst = cvMat(images[CV_MAX_DIM]); vector img(cdims); vector pimg(cdims); for(int i = 0; i < cdims; i++) { - img[i] = CvMat(images[i]); + img[i] = cvMat(images[i]); pimg[i] = &img[i]; } - cvCalcArrBackProjectPatch( (CvArr**)&pimg[0], &dst, patch_size, hist[0], method, factor ); + cvCalcArrBackProjectPatch( (CvArr**)&pimg[0], &dst, cvSize(patch_size), hist[0], method, factor ); } diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp index 741ec7736e..400426af15 100644 --- 
a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -222,7 +222,7 @@ void CV_ResizeTest::get_test_array_types_and_sizes( int test_case_idx, vectorget_rng(); CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - CvSize sz; + Size sz; sz.width = (cvtest::randInt(rng) % sizes[INPUT][0].width) + 1; sz.height = (cvtest::randInt(rng) % sizes[INPUT][0].height) + 1; @@ -272,7 +272,7 @@ double CV_ResizeTest::get_success_error_level( int /*test_case_idx*/, int /*i*/, void CV_ResizeTest::prepare_to_validation( int /*test_case_idx*/ ) { - CvMat _src = test_mat[INPUT][0], _dst = test_mat[REF_INPUT_OUTPUT][0]; + CvMat _src = cvMat(test_mat[INPUT][0]), _dst = cvMat(test_mat[REF_INPUT_OUTPUT][0]); CvMat *src = &_src, *dst = &_dst; int i, j, k; CvMat* x_idx = cvCreateMat( 1, dst->cols, CV_32SC1 ); @@ -504,17 +504,17 @@ CV_WarpAffineTest::CV_WarpAffineTest() : CV_ImgWarpBaseTest( true ) void CV_WarpAffineTest::get_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types ) { CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - CvSize sz = sizes[INPUT][0]; + Size sz = sizes[INPUT][0]; // run for the second time to get output of a different size CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); sizes[INPUT][0] = sz; - sizes[INPUT][1] = cvSize( 3, 2 ); + sizes[INPUT][1] = Size( 3, 2 ); } void CV_WarpAffineTest::run_func() { - CvMat mtx = test_mat[INPUT][1]; + CvMat mtx = cvMat(test_mat[INPUT][1]); cvWarpAffine( test_array[INPUT][0], test_array[INPUT_OUTPUT][0], &mtx, interpolation ); } @@ -533,7 +533,7 @@ int CV_WarpAffineTest::prepare_test_case( int test_case_idx ) const Mat& src = test_mat[INPUT][0]; const Mat& dst = test_mat[INPUT_OUTPUT][0]; Mat& mat = test_mat[INPUT][1]; - CvPoint2D32f center; + Point2f center; double scale, angle; if( code <= 0 ) @@ -615,17 +615,17 @@ CV_WarpPerspectiveTest::CV_WarpPerspectiveTest() : 
CV_ImgWarpBaseTest( true ) void CV_WarpPerspectiveTest::get_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types ) { CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - CvSize sz = sizes[INPUT][0]; + Size sz = sizes[INPUT][0]; // run for the second time to get output of a different size CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); sizes[INPUT][0] = sz; - sizes[INPUT][1] = cvSize( 3, 3 ); + sizes[INPUT][1] = Size( 3, 3 ); } void CV_WarpPerspectiveTest::run_func() { - CvMat mtx = test_mat[INPUT][1]; + CvMat mtx = cvMat(test_mat[INPUT][1]); cvWarpPerspective( test_array[INPUT][0], test_array[INPUT_OUTPUT][0], &mtx, interpolation ); } @@ -641,8 +641,8 @@ int CV_WarpPerspectiveTest::prepare_test_case( int test_case_idx ) { RNG& rng = ts->get_rng(); int code = CV_ImgWarpBaseTest::prepare_test_case( test_case_idx ); - const CvMat& src = test_mat[INPUT][0]; - const CvMat& dst = test_mat[INPUT_OUTPUT][0]; + const CvMat src = cvMat(test_mat[INPUT][0]); + const CvMat dst = cvMat(test_mat[INPUT_OUTPUT][0]); Mat& mat = test_mat[INPUT][1]; Point2f s[4], d[4]; int i; @@ -880,7 +880,7 @@ void CV_UndistortTest::run_func() { if (!useCPlus) { - CvMat a = test_mat[INPUT][1], k = test_mat[INPUT][2]; + CvMat a = cvMat(test_mat[INPUT][1]), k = cvMat(test_mat[INPUT][2]); cvUndistort2( test_array[INPUT][0], test_array[INPUT_OUTPUT][0], &a, &k); } else @@ -1024,7 +1024,7 @@ void CV_UndistortMapTest::get_test_array_types_and_sizes( int test_case_idx, vec cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); int depth = cvtest::randInt(rng)%2 ? 
CV_64F : CV_32F; - CvSize sz = sizes[OUTPUT][0]; + Size sz = sizes[OUTPUT][0]; types[INPUT][0] = types[INPUT][1] = depth; dualChannel = cvtest::randInt(rng)%2 == 0; types[OUTPUT][0] = types[OUTPUT][1] = @@ -1048,7 +1048,7 @@ void CV_UndistortMapTest::fill_array( int test_case_idx, int i, int j, Mat& arr void CV_UndistortMapTest::run_func() { - CvMat a = test_mat[INPUT][0], k = test_mat[INPUT][1]; + CvMat a = cvMat(test_mat[INPUT][0]), k = cvMat(test_mat[INPUT][1]); if (!dualChannel ) cvInitUndistortMap( &a, &k, test_array[OUTPUT][0], test_array[OUTPUT][1] ); @@ -1189,7 +1189,7 @@ void CV_GetRectSubPixTest::get_test_array_types_and_sizes( int test_case_idx, ve CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); int src_depth = cvtest::randInt(rng) % 2, dst_depth; int cn = cvtest::randInt(rng) % 2 ? 3 : 1; - CvSize src_size, dst_size; + Size src_size, dst_size; dst_depth = src_depth = src_depth == 0 ? CV_8U : CV_32F; if( src_depth < CV_32F && cvtest::randInt(rng) % 2 ) @@ -1293,7 +1293,7 @@ void CV_GetQuadSubPixTest::get_test_array_types_and_sizes( int test_case_idx, ve { int min_size = 4; CV_ImgWarpBaseTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); - CvSize sz = sizes[INPUT][0], dsz; + Size sz = sizes[INPUT][0], dsz; RNG& rng = ts->get_rng(); int msz, src_depth = cvtest::randInt(rng) % 2, dst_depth; int cn = cvtest::randInt(rng) % 2 ? 
3 : 1; @@ -1323,7 +1323,7 @@ void CV_GetQuadSubPixTest::get_test_array_types_and_sizes( int test_case_idx, ve void CV_GetQuadSubPixTest::run_func() { - CvMat mtx = test_mat[INPUT][1]; + CvMat mtx = cvMat(test_mat[INPUT][1]); cvGetQuadrangleSubPix( test_array[INPUT][0], test_array[INPUT_OUTPUT][0], &mtx ); } @@ -1343,7 +1343,7 @@ int CV_GetQuadSubPixTest::prepare_test_case( int test_case_idx ) int code = CV_ImgWarpBaseTest::prepare_test_case( test_case_idx ); const Mat& src = test_mat[INPUT][0]; Mat& mat = test_mat[INPUT][1]; - CvPoint2D32f center; + Point2f center; double scale, angle; if( code <= 0 ) diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index bb0716cca3..faaf4cd108 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -161,7 +161,7 @@ void CV_MomentsTest::run_func() ttime += (double)getTickCount() - t; ncalls++; printf("%g\n", ttime/ncalls/u.total())); - *m = new_m; + *m = cvMoments(new_m); } else cvMoments( test_array[INPUT][0], m, is_binary ); @@ -179,22 +179,13 @@ void CV_MomentsTest::run_func() void CV_MomentsTest::prepare_to_validation( int /*test_case_idx*/ ) { Mat& src = test_mat[INPUT][0]; - CvMoments m; + CvMoments m = cvMoments(); double* mdata = test_mat[REF_OUTPUT][0].ptr(); int depth = src.depth(); int cn = src.channels(); int i, y, x, cols = src.cols; double xc = 0., yc = 0.; -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif - memset( &m, 0, sizeof(m)); -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic pop -#endif - int coi = 0; for( y = 0; y < src.rows; y++ ) { diff --git a/modules/imgproc/test/test_templmatch.cpp b/modules/imgproc/test/test_templmatch.cpp index 085fabc575..d1389f4c79 100644 --- a/modules/imgproc/test/test_templmatch.cpp +++ b/modules/imgproc/test/test_templmatch.cpp @@ -151,7 +151,7 @@ static void cvTsMatchTemplate( const CvMat* img, const CvMat* 
templ, CvMat* resu int width_n = templ->cols*cn, height = templ->rows; int a_step = img->step / CV_ELEM_SIZE(img->type & CV_MAT_DEPTH_MASK); int b_step = templ->step / CV_ELEM_SIZE(templ->type & CV_MAT_DEPTH_MASK); - CvScalar b_mean, b_sdv; + CvScalar b_mean = CV_STRUCT_INITIALIZER, b_sdv = CV_STRUCT_INITIALIZER; double b_denom = 1., b_sum2 = 0; int area = templ->rows*templ->cols; @@ -191,8 +191,8 @@ static void cvTsMatchTemplate( const CvMat* img, const CvMat* templ, CvMat* resu { for( j = 0; j < result->cols; j++ ) { - CvScalar a_sum(0), a_sum2(0); - CvScalar ccorr(0); + Scalar a_sum(0), a_sum2(0); + Scalar ccorr(0); double value = 0.; if( depth == CV_8U ) @@ -308,8 +308,8 @@ static void cvTsMatchTemplate( const CvMat* img, const CvMat* templ, CvMat* resu void CV_TemplMatchTest::prepare_to_validation( int /*test_case_idx*/ ) { - CvMat _input = test_mat[INPUT][0], _templ = test_mat[INPUT][1]; - CvMat _output = test_mat[REF_OUTPUT][0]; + CvMat _input = cvMat(test_mat[INPUT][0]), _templ = cvMat(test_mat[INPUT][1]); + CvMat _output = cvMat(test_mat[REF_OUTPUT][0]); cvTsMatchTemplate( &_input, &_templ, &_output, method ); //if( ts->get_current_test_info()->test_case_idx == 0 ) diff --git a/modules/imgproc/test/test_watershed.cpp b/modules/imgproc/test/test_watershed.cpp index 3a2eb0068e..dbd3eae9a9 100644 --- a/modules/imgproc/test/test_watershed.cpp +++ b/modules/imgproc/test/test_watershed.cpp @@ -73,12 +73,12 @@ void CV_WatershedTest::run( int /* start_from */) Mat markers(orig.size(), CV_32SC1); markers = Scalar(0); - IplImage iplmrks = markers; + IplImage iplmrks = cvIplImage(markers); vector colors(1); for(int i = 0; cnts != 0; cnts = cnts->h_next, ++i ) { - cvDrawContours( &iplmrks, cnts, Scalar::all(i + 1), Scalar::all(i + 1), -1, CV_FILLED); + cvDrawContours( &iplmrks, cnts, cvScalar(Scalar::all(i + 1)), cvScalar(Scalar::all(i + 1)), -1, CV_FILLED); Point* p = (Point*)cvGetSeqElem(cnts, 0); //expected image was added with 1 in order to save to png diff --git 
a/modules/java/common.cmake b/modules/java/common.cmake index df9c2de77b..31f5528997 100644 --- a/modules/java/common.cmake +++ b/modules/java/common.cmake @@ -5,6 +5,12 @@ else() ocv_update(OPENCV_JAVA_LIB_NAME_SUFFIX "${OPENCV_VERSION_MAJOR}${OPENCV_VERSION_MINOR}${OPENCV_VERSION_PATCH}") endif() +if(MSVC) + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4996) +else() + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wdeprecated-declarations) +endif() + # get list of modules to wrap # message(STATUS "Wrapped in java:") set(OPENCV_JAVA_MODULES) diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp index 8f00c7bd99..d39d47d5e7 100644 --- a/modules/objdetect/src/cascadedetect.cpp +++ b/modules/objdetect/src/cascadedetect.cpp @@ -1227,9 +1227,9 @@ static void detectMultiScaleOldFormat( const Mat& image, Ptr(_objects).copyTo(vecAvgComp); objects.resize(vecAvgComp.size()); std::transform(vecAvgComp.begin(), vecAvgComp.end(), objects.begin(), getRect()); diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index 4d45d28d10..489728dd1f 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -67,11 +67,6 @@ # endif #endif -#if defined __GNUC__ && __GNUC__ >= 8 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wclass-memaccess" -#endif - /* these settings affect the quality of detection: change with care */ #define CV_ADJUST_FEATURES 1 #define CV_ADJUST_WEIGHTS 0 @@ -171,7 +166,7 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade ) char errorstr[1000]; CvHidHaarClassifier* haar_classifier_ptr; CvHidHaarTreeNode* haar_node_ptr; - CvSize orig_window_size; + cv::Size orig_window_size; bool has_tilted_features = false; int max_count = 0; @@ -409,7 +404,7 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, CvHidHaarClassifierCascade* cascade; int coi0 = 0, coi1 = 0; int i; - CvRect equRect; + cv::Rect equRect; double weight_scale; if( 
!CV_IS_HAAR_CLASSIFIER(_cascade) ) @@ -495,7 +490,7 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, CvHidHaarFeature* hidfeature = &cascade->stage_classifier[i].classifier[j].node[l].feature; double sum0 = 0, area0 = 0; - CvRect r[3]; + cv::Rect r[3]; int base_w = -1, base_h = -1; int new_base_w = 0, new_base_h = 0; @@ -539,7 +534,7 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, for( k = 0; k < nr; k++ ) { - CvRect tr; + cv::Rect tr; double correction_ratio; if( flagx ) @@ -1298,14 +1293,13 @@ cvHaarDetectObjectsForROC( const CvArr* _img, for( factor = 1; ; factor *= scaleFactor ) { - CvSize winSize(cvRound(winSize0.width*factor), - cvRound(winSize0.height*factor)); - CvSize sz(cvRound( img->cols/factor ), cvRound( img->rows/factor )); - CvSize sz1(sz.width - winSize0.width + 1, sz.height - winSize0.height + 1); + CvSize winSize = { cvRound(winSize0.width*factor), + cvRound(winSize0.height*factor) }; + CvSize sz = { cvRound(img->cols/factor), cvRound(img->rows/factor) }; + CvSize sz1 = { sz.width - winSize0.width + 1, sz.height - winSize0.height + 1 }; - CvRect equRect(icv_object_win_border, icv_object_win_border, - winSize0.width - icv_object_win_border*2, - winSize0.height - icv_object_win_border*2); + CvRect equRect = { icv_object_win_border, icv_object_win_border, + winSize0.width - icv_object_win_border*2, winSize0.height - icv_object_win_border*2 }; CvMat img1, sum1, sqsum1, norm1, tilted1, mask1; CvMat* _tilted = 0; @@ -1385,9 +1379,9 @@ cvHaarDetectObjectsForROC( const CvArr* _img, for( ; n_factors-- > 0; factor *= scaleFactor ) { const double ystep = std::max( 2., factor ); - CvSize winSize(cvRound( cascade->orig_window_size.width * factor ), - cvRound( cascade->orig_window_size.height * factor )); - CvRect equRect; + cv::Size winSize(cvRound(cascade->orig_window_size.width * factor), + cvRound(cascade->orig_window_size.height * factor)); + cv::Rect equRect; int *p[4] = {0,0,0,0}; int *pq[4] = 
{0,0,0,0}; int startX = 0, startY = 0; @@ -1504,14 +1498,14 @@ cvHaarDetectObjectsForROC( const CvArr* _img, if( findBiggestObject && rectList.size() ) { - CvAvgComp result_comp = {CvRect(),0}; + CvAvgComp result_comp = {{0, 0, 0, 0},0}; for( size_t i = 0; i < rectList.size(); i++ ) { cv::Rect r = rectList[i]; if( r.area() > cv::Rect(result_comp.rect).area() ) { - result_comp.rect = r; + result_comp.rect = cvRect(r); result_comp.neighbors = rweights[i]; } } @@ -1522,7 +1516,7 @@ cvHaarDetectObjectsForROC( const CvArr* _img, for( size_t i = 0; i < rectList.size(); i++ ) { CvAvgComp c; - c.rect = rectList[i]; + c.rect = cvRect(rectList[i]); c.neighbors = !rweights.empty() ? rweights[i] : 0; cvSeqPush( result_seq, &c ); } @@ -1601,13 +1595,13 @@ icvLoadCascadeCART( const char** input_cascade, int n, CvSize orig_window_size ) for( k = 0; k < rects; k++ ) { - CvRect r; + cv::Rect r; int band = 0; sscanf( stage, "%d%d%d%d%d%f%n", &r.x, &r.y, &r.width, &r.height, &band, &(classifier->haar_feature[l].rect[k].weight), &dl ); stage += dl; - classifier->haar_feature[l].rect[k].r = r; + classifier->haar_feature[l].rect[k].r = cvRect(r); } sscanf( stage, "%99s%n", str, &dl ); stage += dl; @@ -1905,7 +1899,7 @@ icvReadHaarClassifier( CvFileStorage* fs, CvFileNode* node ) for( l = 0; l < rects_fn->data.seq->total; ++l ) { CvFileNode* rect_fn; - CvRect r; + cv::Rect r; rect_fn = (CvFileNode*) rects_reader.ptr; if( !CV_NODE_IS_SEQ( rect_fn->tag ) || rect_fn->data.seq->total != 5 ) @@ -1960,7 +1954,7 @@ icvReadHaarClassifier( CvFileStorage* fs, CvFileNode* node ) } classifier->haar_feature[k].rect[l].weight = (float) fn->data.f; - classifier->haar_feature[k].rect[l].r = r; + classifier->haar_feature[k].rect[l].r = cvRect(r); CV_NEXT_SEQ_ELEM( sizeof( *rect_fn ), rects_reader ); } /* for each rect */ @@ -2295,8 +2289,4 @@ CvType haar_type( CV_TYPE_NAME_HAAR, icvIsHaarClassifier, icvReadHaarClassifier, icvWriteHaarClassifier, icvCloneHaarClassifier ); -#if defined __GNUC__ && __GNUC__ 
>= 8 -#pragma GCC diagnostic pop -#endif - /* End of file. */ diff --git a/modules/objdetect/test/test_cascadeandhog.cpp b/modules/objdetect/test/test_cascadeandhog.cpp index fffb23b396..106f3341b2 100644 --- a/modules/objdetect/test/test_cascadeandhog.cpp +++ b/modules/objdetect/test/test_cascadeandhog.cpp @@ -430,7 +430,7 @@ int CV_CascadeDetectorTest::detectMultiScale_C( const string& filename, cvtColor( img, grayImg, COLOR_BGR2GRAY ); equalizeHist( grayImg, grayImg ); - CvMat c_gray = grayImg; + CvMat c_gray = cvMat(grayImg); CvSeq* rs = cvHaarDetectObjects(&c_gray, c_cascade, storage, 1.1, 3, flags[di] ); objects.clear(); diff --git a/modules/photo/src/inpaint.cpp b/modules/photo/src/inpaint.cpp index ded962b16b..adab6a3c9e 100644 --- a/modules/photo/src/inpaint.cpp +++ b/modules/photo/src/inpaint.cpp @@ -201,12 +201,14 @@ public: } }; -inline float VectorScalMult(CvPoint2D32f v1,CvPoint2D32f v2) { +static inline float VectorScalMult(const cv::Point2f& v1, const cv::Point2f& v2) +{ return v1.x*v2.x+v1.y*v2.y; } -inline float VectorLength(CvPoint2D32f v1) { - return v1.x*v1.x+v1.y*v1.y; +static inline float VectorLength(const cv::Point2f& v1) +{ + return v1.x*v1.x+v1.y*v1.y; } /////////////////////////////////////////////////////////////////////////////////////////// @@ -307,7 +309,7 @@ icvTeleaInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQu CV_MAT_ELEM(*t,float,i,j) = dist; for (color=0; color<=2; color++) { - CvPoint2D32f gradI,gradT,r; + cv::Point2f gradI,gradT,r; float Ia=0,Jx=0,Jy=0,s=1.0e-20f,w,dst,lev,dir,sat; if (CV_MAT_ELEM(*f,uchar,i,j+1)!=INSIDE) { @@ -419,7 +421,7 @@ icvTeleaInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQu CV_MAT_ELEM(*t,float,i,j) = dist; for (color=0; color<=0; color++) { - CvPoint2D32f gradI,gradT,r; + cv::Point2f gradI,gradT,r; float Ia=0,Jx=0,Jy=0,s=1.0e-20f,w,dst,lev,dir,sat; if (CV_MAT_ELEM(*f,uchar,i,j+1)!=INSIDE) { @@ -539,7 +541,7 @@ icvNSInpaintFMM(const CvMat *f, CvMat *t, 
CvMat *out, int range, CvPriorityQueue CV_MAT_ELEM(*t,float,i,j) = dist; for (color=0; color<=2; color++) { - CvPoint2D32f gradI,r; + cv::Point2f gradI,r; float Ia=0,s=1.0e-20f,w,dst,dir; for (k=i-range; k<=i+range; k++) { @@ -627,7 +629,7 @@ icvNSInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQueue CV_MAT_ELEM(*t,float,i,j) = dist; { - CvPoint2D32f gradI,r; + cv::Point2f gradI,r; float Ia=0,s=1.0e-20f,w,dst,dir; for (k=i-range; k<=i+range; k++) { @@ -847,6 +849,6 @@ void cv::inpaint( InputArray _src, InputArray _mask, OutputArray _dst, Mat src = _src.getMat(), mask = _mask.getMat(); _dst.create( src.size(), src.type() ); Mat dst = _dst.getMat(); - CvMat c_src = src, c_mask = mask, c_dst = dst; + CvMat c_src = cvMat(src), c_mask = cvMat(mask), c_dst = cvMat(dst); cvInpaint( &c_src, &c_mask, &c_dst, inpaintRange, flags ); } diff --git a/modules/photo/test/test_hdr.cpp b/modules/photo/test/test_hdr.cpp index c4bf536278..fd4797fac7 100644 --- a/modules/photo/test/test_hdr.cpp +++ b/modules/photo/test/test_hdr.cpp @@ -213,11 +213,7 @@ TEST(Photo_MergeRobertson, regression) loadImage(test_path + "merge/robertson.hdr", expected); merge->process(images, result, times); -#if defined(__aarch64__) || defined(__PPC64__) const float eps = 6.f; -#else - const float eps = 5.f; -#endif checkEqual(expected, result, eps, "MergeRobertson"); } diff --git a/modules/stitching/src/motion_estimators.cpp b/modules/stitching/src/motion_estimators.cpp index 8d36538912..6a07842444 100644 --- a/modules/stitching/src/motion_estimators.cpp +++ b/modules/stitching/src/motion_estimators.cpp @@ -255,10 +255,10 @@ bool BundleAdjusterBase::estimate(const std::vector &features, CvLevMarq solver(num_images_ * num_params_per_cam_, total_num_matches_ * num_errs_per_measurement_, - term_criteria_); + cvTermCriteria(term_criteria_)); Mat err, jac; - CvMat matParams = cam_params_; + CvMat matParams = cvMat(cam_params_); cvCopy(&matParams, solver.param); int iter = 0; @@ -278,7 +278,7 
@@ bool BundleAdjusterBase::estimate(const std::vector &features, if (_jac) { calcJacobian(jac); - CvMat tmp = jac; + CvMat tmp = cvMat(jac); cvCopy(&tmp, _jac); } @@ -287,7 +287,7 @@ bool BundleAdjusterBase::estimate(const std::vector &features, calcError(err); LOG_CHAT("."); iter++; - CvMat tmp = err; + CvMat tmp = cvMat(err); cvCopy(&tmp, _err); } } diff --git a/modules/ts/src/ts_arrtest.cpp b/modules/ts/src/ts_arrtest.cpp index 8ba0b3786e..365cf1550e 100644 --- a/modules/ts/src/ts_arrtest.cpp +++ b/modules/ts/src/ts_arrtest.cpp @@ -158,8 +158,8 @@ int ArrayTest::prepare_test_case( int test_case_idx ) { unsigned t = randInt(rng); bool create_mask = true, use_roi = false; - CvSize size = sizes[i][j], whole_size = size; - CvRect roi; + CvSize size = cvSize(sizes[i][j]), whole_size = size; + CvRect roi = CV_STRUCT_INITIALIZER; is_image = !cvmat_allowed ? true : iplimage_allowed ? (t & 1) != 0 : false; create_mask = (t & 6) == 0; // ~ each of 3 tests will use mask diff --git a/modules/video/src/compat_video.cpp b/modules/video/src/compat_video.cpp index 9e01f925e6..da3996a012 100644 --- a/modules/video/src/compat_video.cpp +++ b/modules/video/src/compat_video.cpp @@ -56,7 +56,7 @@ cvMeanShift( const void* imgProb, CvRect windowIn, if( comp ) { - comp->rect = window; + comp->rect = cvRect(window); comp->area = cvRound(cv::sum(img(window))[0]); } @@ -76,13 +76,13 @@ cvCamShift( const void* imgProb, CvRect windowIn, if( comp ) { - comp->rect = window; + comp->rect = cvRect(window); cv::Rect roi = rr.boundingRect() & cv::Rect(0, 0, img.cols, img.rows); comp->area = cvRound(cv::sum(img(roi))[0]); } if( box ) - *box = rr; + *box = cvBox2D(rr); return rr.size.width*rr.size.height > 0.f ? 
1 : -1; } diff --git a/modules/video/test/test_camshift.cpp b/modules/video/test/test_camshift.cpp index f93d124482..4aa0fa6839 100644 --- a/modules/video/test/test_camshift.cpp +++ b/modules/video/test/test_camshift.cpp @@ -435,7 +435,7 @@ void CV_MeanShiftTest::run_func(void) int CV_MeanShiftTest::validate_test_results( int /*test_case_idx*/ ) { int code = cvtest::TS::OK; - CvPoint2D32f c; + Point2f c; double m = MAX(box0.size.width, box0.size.height), delta; if( cvIsNaN(comp.area) || cvIsInf(comp.area) || comp.area <= 0 ) diff --git a/modules/video/test/test_optflowpyrlk.cpp b/modules/video/test/test_optflowpyrlk.cpp index 0abb573a7f..1f08270ec7 100644 --- a/modules/video/test/test_optflowpyrlk.cpp +++ b/modules/video/test/test_optflowpyrlk.cpp @@ -120,7 +120,7 @@ void CV_OptFlowPyrLKTest::run( int ) /* read first image */ sprintf( filename, "%soptflow/%s", ts->get_data_path().c_str(), "rock_1.bmp" ); imgI2 = cv::imread( filename, cv::IMREAD_UNCHANGED ); - imgI = imgI2; + imgI = cvIplImage(imgI2); if( imgI2.empty() ) { @@ -132,7 +132,7 @@ void CV_OptFlowPyrLKTest::run( int ) /* read second image */ sprintf( filename, "%soptflow/%s", ts->get_data_path().c_str(), "rock_2.bmp" ); imgJ2 = cv::imread( filename, cv::IMREAD_UNCHANGED ); - imgJ = imgJ2; + imgJ = cvIplImage(imgJ2); if( imgJ2.empty() ) { diff --git a/modules/videoio/src/cap.cpp b/modules/videoio/src/cap.cpp index c1ee87a15a..9c14be018f 100644 --- a/modules/videoio/src/cap.cpp +++ b/modules/videoio/src/cap.cpp @@ -366,7 +366,7 @@ void VideoWriter::write(const Mat& image) iwriter->write(image); else { - IplImage _img = image; + IplImage _img = cvIplImage(image); cvWriteFrame(writer, &_img); } } diff --git a/modules/videoio/src/cap_openni.cpp b/modules/videoio/src/cap_openni.cpp index 7e826ff75b..e4dbea80d7 100644 --- a/modules/videoio/src/cap_openni.cpp +++ b/modules/videoio/src/cap_openni.cpp @@ -382,7 +382,7 @@ IplImage* CvCapture_OpenNI::OutputMap::getIplImagePtr() if( mat.empty() ) return 0; - iplHeader 
= IplImage(mat); + iplHeader = cvIplImage(mat); return &iplHeader; } diff --git a/modules/videoio/src/cap_openni2.cpp b/modules/videoio/src/cap_openni2.cpp index 9a67a417f6..b5c426809d 100644 --- a/modules/videoio/src/cap_openni2.cpp +++ b/modules/videoio/src/cap_openni2.cpp @@ -192,7 +192,7 @@ IplImage* CvCapture_OpenNI2::OutputMap::getIplImagePtr() if( mat.empty() ) return 0; - iplHeader = IplImage(mat); + iplHeader = cvIplImage(mat); return &iplHeader; } diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp index e0077c6845..5eaca3ae12 100644 --- a/modules/videoio/src/cap_v4l.cpp +++ b/modules/videoio/src/cap_v4l.cpp @@ -553,7 +553,7 @@ static int v4l2_num_channels(__u32 palette) { } static void v4l2_create_frame(CvCaptureCAM_V4L *capture) { - CvSize size(capture->form.fmt.pix.width, capture->form.fmt.pix.height); + CvSize size = {capture->form.fmt.pix.width, capture->form.fmt.pix.height}; int channels = 3; int depth = IPL_DEPTH_8U; @@ -563,7 +563,7 @@ static void v4l2_create_frame(CvCaptureCAM_V4L *capture) { switch(capture->palette) { case V4L2_PIX_FMT_MJPEG: case V4L2_PIX_FMT_JPEG: - size = CvSize(capture->buffers[capture->bufferIndex].length, 1); + size = cvSize(capture->buffers[capture->bufferIndex].length, 1); break; case V4L2_PIX_FMT_YVU420: case V4L2_PIX_FMT_YUV420: diff --git a/modules/videoio/src/videoio_registry.cpp b/modules/videoio/src/videoio_registry.cpp index e3d0bcab95..6be4c08e06 100644 --- a/modules/videoio/src/videoio_registry.cpp +++ b/modules/videoio/src/videoio_registry.cpp @@ -666,22 +666,22 @@ void VideoWriter_create(CvVideoWriter*& writer, Ptr& iwriter, Vide #endif #ifdef HAVE_VFW case CAP_VFW: - CREATE_WRITER_LEGACY(cvCreateVideoWriter_VFW(filename.c_str(), fourcc, fps, frameSize, isColor)) + CREATE_WRITER_LEGACY(cvCreateVideoWriter_VFW(filename.c_str(), fourcc, fps, cvSize(frameSize), isColor)) break; #endif #ifdef HAVE_AVFOUNDATION case CAP_AVFOUNDATION: - 
CREATE_WRITER_LEGACY(cvCreateVideoWriter_AVFoundation(filename.c_str(), fourcc, fps, frameSize, isColor)) + CREATE_WRITER_LEGACY(cvCreateVideoWriter_AVFoundation(filename.c_str(), fourcc, fps, cvSize(frameSize), isColor)) break; #endif #if defined(HAVE_QUICKTIME) || defined(HAVE_QTKIT) case(CAP_QT): - CREATE_WRITER_LEGACY(cvCreateVideoWriter_QT(filename.c_str(), fourcc, fps, frameSize, isColor)) + CREATE_WRITER_LEGACY(cvCreateVideoWriter_QT(filename.c_str(), fourcc, fps, cvSize(frameSize), isColor)) break; #endif #ifdef HAVE_GSTREAMER case CAP_GSTREAMER: - CREATE_WRITER_LEGACY(cvCreateVideoWriter_GStreamer (filename.c_str(), fourcc, fps, frameSize, isColor)) + CREATE_WRITER_LEGACY(cvCreateVideoWriter_GStreamer (filename.c_str(), fourcc, fps, cvSize(frameSize), isColor)) break; #endif case CAP_OPENCV_MJPEG: diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py old mode 100644 new mode 100755 index 32305f9a08..d624e08d90 --- a/platforms/ios/build_framework.py +++ b/platforms/ios/build_framework.py @@ -183,7 +183,7 @@ class Builder: cmakecmd = self.getCMakeArgs(arch, target) + \ (["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else []) if target.lower().startswith("iphoneos"): - cmakecmd.append("-DENABLE_NEON=ON") + cmakecmd.append("-DCPU_BASELINE=NEON;FP16") cmakecmd.append(self.opencv) cmakecmd.extend(cmakeargs) execute(cmakecmd, cwd = builddir) diff --git a/samples/dnn/custom_layers.hpp b/samples/dnn/custom_layers.hpp index 8a3d5d88c1..f471106aea 100644 --- a/samples/dnn/custom_layers.hpp +++ b/samples/dnn/custom_layers.hpp @@ -35,10 +35,23 @@ public: } // Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + virtual void forward(cv::InputArrayOfArrays inputs_arr, + cv::OutputArrayOfArrays outputs_arr, + cv::OutputArrayOfArrays 
internals_arr) CV_OVERRIDE { - CV_UNUSED(internals); - cv::Mat& inp = *inputs[0]; + if (inputs_arr.depth() == CV_16S) + { + // In case of DNN_TARGET_OPENCL_FP16 target the following method + // converts data from FP16 to FP32 and calls this forward again. + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + cv::Mat& inp = inputs[0]; cv::Mat& out = outputs[0]; const float* inpData = (float*)inp.data; float* outData = (float*)out.data; @@ -78,8 +91,6 @@ public: } } - virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {} - private: int outWidth, outHeight; }; @@ -134,8 +145,10 @@ public: return false; } - virtual void finalize(const std::vector&, std::vector &outputs) CV_OVERRIDE + virtual void finalize(cv::InputArrayOfArrays, cv::OutputArrayOfArrays outputs_arr) CV_OVERRIDE { + std::vector outputs; + outputs_arr.getMatVector(outputs); if (!outWidth && !outHeight) { outHeight = outputs[0].size[2]; @@ -145,9 +158,23 @@ public: // This implementation is based on a reference implementation from // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &) CV_OVERRIDE + virtual void forward(cv::InputArrayOfArrays inputs_arr, + cv::OutputArrayOfArrays outputs_arr, + cv::OutputArrayOfArrays internals_arr) CV_OVERRIDE { - cv::Mat& inp = *inputs[0]; + if (inputs_arr.depth() == CV_16S) + { + // In case of DNN_TARGET_OPENCL_FP16 target the following method + // converts data from FP16 to FP32 and calls this forward again. 
+ forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + cv::Mat& inp = inputs[0]; cv::Mat& out = outputs[0]; const float* inpData = (float*)inp.data; float* outData = (float*)out.data; @@ -185,8 +212,6 @@ public: } } - virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {} - private: static inline int offset(const cv::MatSize& size, int c, int x, int y, int b) { @@ -221,14 +246,15 @@ public: //! [MyLayer::getMemoryShapes] //! [MyLayer::forward] - virtual void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE; + virtual void forward(cv::InputArrayOfArrays inputs, + cv::OutputArrayOfArrays outputs, + cv::OutputArrayOfArrays internals) CV_OVERRIDE; //! [MyLayer::forward] //! [MyLayer::finalize] - virtual void finalize(const std::vector &inputs, std::vector &outputs) CV_OVERRIDE; + virtual void finalize(cv::InputArrayOfArrays inputs, + cv::OutputArrayOfArrays outputs) CV_OVERRIDE; //! [MyLayer::finalize] - - virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE; }; //! [A custom layer interface]