diff --git a/3rdparty/openjpeg/CMakeLists.txt b/3rdparty/openjpeg/CMakeLists.txt index d3db9e8c47..188381f1e2 100644 --- a/3rdparty/openjpeg/CMakeLists.txt +++ b/3rdparty/openjpeg/CMakeLists.txt @@ -16,6 +16,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-but-set-variable # clang15 -Wmissing-prototypes # clang, function opj_t1_ht_decode_cblk -Wmissing-declarations # gcc, function opj_t1_ht_decode_cblk + -Wdocumentation # clang ) #----------------------------------------------------------------------------- diff --git a/3rdparty/protobuf/CMakeLists.txt b/3rdparty/protobuf/CMakeLists.txt index 5e8e3a9ed2..7df035cac9 100644 --- a/3rdparty/protobuf/CMakeLists.txt +++ b/3rdparty/protobuf/CMakeLists.txt @@ -27,6 +27,8 @@ else() -Wimplicit-fallthrough -Warray-bounds # GCC 9+ -Wstringop-overflow -Wstringop-overread # GCC 11-12 + -Wextra-semi # clang + -Wcomma # clang ) endif() if(CV_ICC) diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake index 2c92e33eb6..d9bb5bb9a6 100644 --- a/cmake/OpenCVDetectPython.cmake +++ b/cmake/OpenCVDetectPython.cmake @@ -209,7 +209,7 @@ if(NOT ${found}) message(STATUS " PYTHON3_NUMPY_INCLUDE_DIRS") else() # Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy - execute_process(COMMAND "${_executable}" -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(os.pathsep.join(numpy.distutils.misc_util.get_numpy_include_dirs()))" + execute_process(COMMAND "${_executable}" -c "import numpy; print(numpy.get_include())" RESULT_VARIABLE _numpy_process OUTPUT_VARIABLE _numpy_include_dirs OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py index bec535baf6..f426bb11c5 100755 --- a/doc/pattern_tools/gen_pattern.py +++ b/doc/pattern_tools/gen_pattern.py @@ -186,6 +186,8 @@ class PatternMaker: yspacing = (self.height - self.rows * self.square_size) / 2.0 ch_ar_border = (self.square_size - self.aruco_marker_size)/2 + if ch_ar_border < side*0.7: + print("Marker border {} is less than 70% of ArUco pin size {}. 
Please increase --square_size or decrease --marker_size for stable board detection".format(ch_ar_border, int(side))) marker_id = 0 for y in range(0, self.rows): for x in range(0, self.cols): @@ -283,6 +285,9 @@ def main(): else: raise ValueError("The marker {},{} is outside the checkerboard".format(x, y)) + if p_type == "charuco_board" and aruco_marker_size >= square_size: + raise ValueError("ArUco markers size must be smaller than square size") + pm = PatternMaker(columns, rows, output, units, square_size, radius_rate, page_width, page_height, markers, aruco_marker_size, dict_file) # dict for easy lookup of pattern type mp = {"circles": pm.make_circles_pattern, "acircles": pm.make_acircles_pattern, diff --git a/modules/3d/src/fundam.cpp b/modules/3d/src/fundam.cpp index 358bc30ee3..87786d6511 100644 --- a/modules/3d/src/fundam.cpp +++ b/modules/3d/src/fundam.cpp @@ -112,7 +112,7 @@ public: * 2 columns 1 channel * @param _m2 destination points containing (x,y), depth is CV_32F with 1 column 2 channels or * 2 columns 1 channel - * @param _model, CV_64FC1, 3x3, normalized, i.e., the last element is 1 + * @param _model CV_64FC1, 3x3, normalized, i.e., the last element is 1 */ int runKernel( InputArray _m1, InputArray _m2, OutputArray _model ) const CV_OVERRIDE { @@ -187,7 +187,7 @@ public: * @param _m1 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column * @param _m2 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column * @param _model CV_64FC1, 3x3 - * @param _err, output, CV_32FC1, square of the L2 norm + * @param _err output, CV_32FC1, square of the L2 norm */ void computeError( InputArray _m1, InputArray _m2, InputArray _model, OutputArray _err ) const CV_OVERRIDE { diff --git a/modules/3d/src/ippe.hpp b/modules/3d/src/ippe.hpp index bcd3e58aef..4b3716d422 100644 --- a/modules/3d/src/ippe.hpp +++ b/modules/3d/src/ippe.hpp @@ -111,7 +111,7 @@ private: /** * @brief Computes the translation solution for a given rotation solution * @param objectPoints Array of corresponding object points, 1xN/Nx1 3-channel where N is the number of points - * @param normalizedImagePoints Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points + * @param normalizedImgPoints Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points * @param R Rotation solution (3x1 rotation vector) * @param t Translation solution (3x1 rotation vector) */ @@ -220,10 +220,10 @@ private: /** * @brief Computes the average depth of an object given its pose in camera coordinates - * @param objectPoints: Object points defined in 3D object space - * @param rvec: Rotation component of pose - * @param tvec: Translation component of pose - * @return: average depth of the object + * @param objectPoints Object points defined in 3D object space + * @param rvec Rotation component of pose + * @param tvec Translation component of pose + * @return average depth of the object */ double meanSceneDepth(InputArray objectPoints, InputArray rvec, InputArray tvec); diff --git a/modules/3d/src/p3p.cpp b/modules/3d/src/p3p.cpp index 9d9cd3787c..8e16697968 100644 --- a/modules/3d/src/p3p.cpp +++ b/modules/3d/src/p3p.cpp @@ -220,8 +220,8 @@ int p3p::solve(double R[4][3][3], double t[4][3], /// Only the solution to the main branch. /// Reference : X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem" /// IEEE Trans. on PAMI, vol. 25, No. 
8, August 2003 -/// \param lengths3D Lengths of line segments up to four solutions. -/// \param dist3D Distance between 3D points in pairs |BC|, |AC|, |AB|. +/// \param lengths Lengths of line segments up to four solutions. +/// \param distances Distance between 3D points in pairs |BC|, |AC|, |AB|. /// \param cosines Cosine of the angles /_BPC, /_APC, /_APB. /// \returns Number of solutions. /// WARNING: NOT ALL THE DEGENERATE CASES ARE IMPLEMENTED diff --git a/modules/3d/src/precomp.hpp b/modules/3d/src/precomp.hpp index 7d96a4becc..ef6bf01297 100755 --- a/modules/3d/src/precomp.hpp +++ b/modules/3d/src/precomp.hpp @@ -89,7 +89,7 @@ namespace cv { * @param ep outlier ratio * @param modelPoints number of model points required for estimation * @param maxIters maximum number of iterations - * @return + * @return The number of iterations according to the formula * \f[ * \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)} * \f] diff --git a/modules/3d/src/rho.cpp b/modules/3d/src/rho.cpp index d7b3cb6854..c6f4adb92d 100644 --- a/modules/3d/src/rho.cpp +++ b/modules/3d/src/rho.cpp @@ -486,7 +486,7 @@ void rhoSeed(Ptr p, uint64_t seed){ * Estimates the homography using the given context, matches and parameters to * PROSAC. * - * @param [in/out] p The context to use for homography estimation. Must + * @param [in,out] p The context to use for homography estimation. Must * be already initialized. Cannot be NULL. * @param [in] src The pointer to the source points of the matches. * Must be aligned to 4 bytes. Cannot be NULL. diff --git a/modules/3d/src/rho.h b/modules/3d/src/rho.h index f1ba2f67a4..2efceed128 100644 --- a/modules/3d/src/rho.h +++ b/modules/3d/src/rho.h @@ -206,7 +206,7 @@ void rhoSeed(Ptr p, uint64_t seed); * homography with at least the minimum required support, and 0 if it was not. * * - * @param [in/out] p The context to use for homography estimation. Must + * @param [in,out] p The context to use for homography estimation. Must * be already initialized. Cannot be NULL. * @param [in] src The pointer to the source points of the matches. * Must be aligned to 4 bytes. Cannot be NULL. 
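The @return fix in precomp.hpp above spells out RANSAC's standard termination bound: the number of random samples needed so that, with confidence p, at least one drawn sample of modelPoints correspondences is outlier-free given outlier ratio ep. A minimal self-contained C++ sketch of that formula (the helper name and the clamping details are illustrative here, not OpenCV's internal signature):

    #include <cmath>
    #include <cfloat>
    #include <algorithm>

    // Evaluates ln(1-p) / ln(1-(1-ep)^modelPoints), clamped to [0, maxIters].
    //   p           - desired confidence of drawing one all-inlier sample
    //   ep          - expected outlier ratio
    //   modelPoints - number of points in a minimal sample
    static int ransacNumIters(double p, double ep, int modelPoints, int maxIters)
    {
        p  = std::min(std::max(p,  0.), 1.);
        ep = std::min(std::max(ep, 0.), 1.);
        double num   = std::log(std::max(1. - p, DBL_MIN));
        double denom = 1. - std::pow(1. - ep, modelPoints);
        if (denom < DBL_MIN)
            return 0;                // ep == 0: the first sample is all-inlier
        denom = std::log(denom);
        // Both logs are non-positive; compare before dividing so the ratio
        // cannot overflow int when denom is close to zero.
        return (denom >= 0 || num <= maxIters * denom)
                   ? maxIters : (int)std::round(num / denom);
    }

For a homography (modelPoints = 4), p = 0.995 and ep = 0.5 yield about 82 iterations.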
diff --git a/modules/3d/src/undistort.simd.hpp b/modules/3d/src/undistort.simd.hpp index 7998a3b086..70bac44702 100644 --- a/modules/3d/src/undistort.simd.hpp +++ b/modules/3d/src/undistort.simd.hpp @@ -89,8 +89,8 @@ public: s2(_s2), s3(_s3), s4(_s4) { -#if CV_SIMD_64F - for (int i = 0; i < 2 * v_float64::nlanes; ++i) +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + for (int i = 0; i < 2 * VTraits::vlanes(); ++i) { s_x[i] = ir[0] * i; s_y[i] = ir[3] * i; @@ -123,26 +123,26 @@ public: else CV_Assert(m1 != NULL); -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) const v_float64 v_one = vx_setall_f64(1.0); - for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6]) + for (; j <= size.width - 2*VTraits::vlanes(); j += 2*VTraits::vlanes(), _x += 2*VTraits::vlanes() * ir[0], _y += 2*VTraits::vlanes() * ir[3], _w += 2*VTraits::vlanes() * ir[6]) { v_float64 m_0, m_1, m_2, m_3; - m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w)); - m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes)); + m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w))); + m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits::vlanes()))); m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y); - v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2; - v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3; - v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2; - v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3; + v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2); + v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits::vlanes())), m_3); + v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2); + v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits::vlanes())), m_3); - v_float64 xd_0 = x_0 * x_0; - v_float64 yd_0 = y_0 * y_0; - v_float64 xd_1 = x_1 * x_1; - v_float64 yd_1 = y_1 * y_1; + v_float64 xd_0 = v_mul(x_0, x_0); + v_float64 yd_0 = v_mul(y_0, y_0); + v_float64 xd_1 = v_mul(x_1, x_1); + v_float64 yd_1 = v_mul(y_1, y_1); - v_float64 r2_0 = xd_0 + yd_0; - v_float64 r2_1 = xd_1 + yd_1; + v_float64 r2_0 = v_add(xd_0, yd_0); + v_float64 r2_1 = v_add(xd_1, yd_1); m_1 = vx_setall_f64(k3); m_2 = vx_setall_f64(k2); @@ -151,18 +151,18 @@ public: m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one); m_3 = vx_setall_f64(k6); m_2 = vx_setall_f64(k5); - m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one); - m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one); + m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one)); + m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one)); m_3 = vx_setall_f64(2.0); xd_0 = v_muladd(m_3, xd_0, r2_0); yd_0 = v_muladd(m_3, yd_0, r2_0); xd_1 = v_muladd(m_3, xd_1, r2_1); yd_1 = v_muladd(m_3, yd_1, r2_1); - m_2 = x_0 * y_0 * m_3; - m_3 = x_1 * y_1 * m_3; + m_2 = v_mul(v_mul(x_0, y_0), m_3); + m_3 = v_mul(v_mul(x_1, y_1), m_3); - x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1; + x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1); m_0 = vx_setall_f64(p1); m_1 = vx_setall_f64(p2); @@ -176,8 +176,8 @@ public: xd_1 = v_muladd(m_0, m_3, xd_1); yd_1 = v_muladd(m_1, m_3, yd_1); - m_0 = r2_0 * r2_0; - m_1 = r2_1 * r2_1; + m_0 = v_mul(r2_0, r2_0); + m_1 = v_mul(r2_1, r2_1); m_2 = 
vx_setall_f64(s2); m_3 = vx_setall_f64(s1); xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0)); @@ -203,17 +203,17 @@ public: r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2)); r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2)); m_0 = vx_setzero_f64(); - r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0); - r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1); + r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0)); + r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1)); m_0 = vx_setall_f64(fx); m_1 = vx_setall_f64(u0); m_2 = vx_setall_f64(fy); m_3 = vx_setall_f64(v0); - x_0 = v_muladd(m_0 * r2_0, x_0, m_1); - y_0 = v_muladd(m_2 * r2_0, y_0, m_3); - x_1 = v_muladd(m_0 * r2_1, x_1, m_1); - y_1 = v_muladd(m_2 * r2_1, y_1, m_3); + x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1); + y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3); + x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1); + y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3); if (m1type == CV_32FC1) { @@ -225,20 +225,20 @@ public: v_float32 mf0, mf1; v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1); v_store(&m1f[j * 2], mf0); - v_store(&m1f[j * 2 + v_float32::nlanes], mf1); + v_store(&m1f[j * 2 + VTraits::vlanes()], mf1); } else // m1type == CV_16SC2 { m_0 = vx_setall_f64(INTER_TAB_SIZE); - x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0; + x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0); v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1); v_int32 iu = v_round(x_0, x_1); v_int32 iv = v_round(y_0, y_1); - v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE)); + v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE)))); v_int32 out0, out1; - v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1); + v_zip(v_shr(iu), v_shr(iv), out0, out1); v_store(&m1[j * 2], v_pack(out0, out1)); } } @@ -302,10 +302,10 @@ private: double s2; double s3; double s4; -#if CV_SIMD_64F - double s_x[2*v_float64::nlanes]; - double s_y[2*v_float64::nlanes]; - double s_w[2*v_float64::nlanes]; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + double s_x[2*VTraits::max_nlanes]; + double s_y[2*VTraits::max_nlanes]; + double s_w[2*VTraits::max_nlanes]; #endif }; } diff --git a/modules/calib/src/chessboard.hpp b/modules/calib/src/chessboard.hpp index f49b83572f..80519d15a5 100644 --- a/modules/calib/src/chessboard.hpp +++ b/modules/calib/src/chessboard.hpp @@ -203,12 +203,12 @@ class Chessboard: public cv::Feature2D * d12/d34 = d13/d24 * * point order on the line: - * pt1 --> pt2 --> pt3 --> pt4 + * p0 --> p1 --> p2 --> p3 * - * \param[in] pt1 First point coordinate - * \param[in] pt2 Second point coordinate - * \param[in] pt3 Third point coordinate - * \param[out] pt4 Forth point coordinate + * \param[in] p0 First point coordinate + * \param[in] p1 Second point coordinate + * \param[in] p2 Third point coordinate + * \param[out] p3 Forth point coordinate * */ static bool estimatePoint(const cv::Point2f &p0,const cv::Point2f &p1,const cv::Point2f &p2,cv::Point2f &p3); @@ -309,7 +309,7 @@ class Chessboard: public cv::Feature2D * \brief Draws the corners into the given image * * \param[in] m The image - * \param[out] m The resulting image + * \param[out] out The resulting image * \param[in] H optional homography to calculate search area * */ @@ -668,7 +668,7 @@ class Chessboard: public cv::Feature2D * \brief Calculates the average edge sharpness for the chessboard * * \param[in] image The image where the chessboard was detected - * \param[in] rise_distante Rise 
distance 0.8 means 10% ... 90% + * \param[in] rise_distance Rise distance 0.8 means 10% ... 90% * \param[in] vertical by default only edge response for horiontal lines are calculated * * \returns Scalar(sharpness, average min_val, average max_val) diff --git a/modules/calib/src/precomp.hpp b/modules/calib/src/precomp.hpp index f83ec63386..e179d79d12 100644 --- a/modules/calib/src/precomp.hpp +++ b/modules/calib/src/precomp.hpp @@ -66,7 +66,7 @@ namespace cv { * @param ep outlier ratio * @param modelPoints number of model points required for estimation * @param maxIters maximum number of iterations - * @return + * @return The number of iterations according to the formula * \f[ * \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)} * \f] diff --git a/modules/core/include/opencv2/core/dualquaternion.inl.hpp b/modules/core/include/opencv2/core/dualquaternion.inl.hpp index 6abb15924b..1a68f12d30 100644 --- a/modules/core/include/opencv2/core/dualquaternion.inl.hpp +++ b/modules/core/include/opencv2/core/dualquaternion.inl.hpp @@ -36,15 +36,15 @@ namespace cv { template -DualQuat::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}; +DualQuat::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){} template DualQuat::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z): - w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}; + w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){} template DualQuat::DualQuat(const Vec &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]), - w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}; + w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){} template DualQuat DualQuat::createFromQuat(const Quat &realPart, const Quat &dualPart) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 738ffb2d22..b607b8b3b0 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -987,6 +987,15 @@ namespace CV__SIMD_NAMESPACE { { \ return a op b; \ } + #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a == b; \ + } \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a != b; \ + } #define OPENCV_HAL_WRAP_CMP(_Tpvec) \ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \ @@ -999,11 +1008,11 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_CMP(v_uint8) OPENCV_HAL_WRAP_CMP(v_uint16) OPENCV_HAL_WRAP_CMP(v_uint32) - // OPENCV_HAL_WRAP_CMP(v_uint64) + OPENCV_HAL_WRAP_EQ_OP(v_uint64) OPENCV_HAL_WRAP_CMP(v_int8) OPENCV_HAL_WRAP_CMP(v_int16) OPENCV_HAL_WRAP_CMP(v_int32) - // OPENCV_HAL_WRAP_CMP(v_int64) + OPENCV_HAL_WRAP_EQ_OP(v_int64) OPENCV_HAL_WRAP_CMP(v_float32) #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64) @@ -1012,9 +1021,11 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_CMP(v_uint8x16) OPENCV_HAL_WRAP_CMP(v_uint16x8) OPENCV_HAL_WRAP_CMP(v_uint32x4) + OPENCV_HAL_WRAP_EQ_OP(v_uint64x2) OPENCV_HAL_WRAP_CMP(v_int8x16) OPENCV_HAL_WRAP_CMP(v_int16x8) OPENCV_HAL_WRAP_CMP(v_int32x4) + OPENCV_HAL_WRAP_EQ_OP(v_int64x2) OPENCV_HAL_WRAP_CMP(v_float32x4) #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64x2) @@ -1024,9 +1035,11 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_CMP(v_uint8x32) OPENCV_HAL_WRAP_CMP(v_uint16x16) OPENCV_HAL_WRAP_CMP(v_uint32x8) + OPENCV_HAL_WRAP_EQ_OP(v_uint64x4) OPENCV_HAL_WRAP_CMP(v_int8x32) OPENCV_HAL_WRAP_CMP(v_int16x16) OPENCV_HAL_WRAP_CMP(v_int32x8) + OPENCV_HAL_WRAP_EQ_OP(v_int64x4) 
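// Note on the OPENCV_HAL_WRAP_EQ_OP additions above: ordered comparisons
// (lt/gt/le/ge) are not guaranteed for 64-bit integer lanes on every SIMD
// backend (e.g. plain SSE2 has no 64-bit compare-greater), so the 64-bit
// types only receive the equality wrappers v_eq/v_ne rather than the full
// OPENCV_HAL_WRAP_CMP set. A hedged usage sketch (names are illustrative):
//   v_uint64 a = vx_setall_u64(1), b = vx_setall_u64(2);
//   v_uint64 mask = v_ne(a, b);   // per-lane all-ones where a != b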
OPENCV_HAL_WRAP_CMP(v_float32x8) #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64x4) diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp index 979f15a277..28f67cc9ef 100644 --- a/modules/core/include/opencv2/core/hal/intrin_forward.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -188,4 +188,4 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond -} // cv:: \ No newline at end of file +} // cv:: diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_011_compat.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_011_compat.hpp new file mode 100644 index 0000000000..da5e0fdd57 --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_011_compat.hpp @@ -0,0 +1,33 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// 0.11 -> 0.12 compatibility + +#ifndef _RVV_IMPLICIT_VXRM +#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU +#endif + +// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg) + +// masked +#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) +#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4) + +// unmasked +#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3) +#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index a45c90cf90..14988fc03c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -21,6 +21,10 @@ #include "intrin_rvv_010_compat_overloaded-non-policy.hpp" #endif +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999 +#include "intrin_rvv_011_compat.hpp" +#endif + #if defined(__GNUC__) && !defined(__clang__) // FIXIT: eliminate massive warnigs from templates // GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease) diff --git a/modules/core/include/opencv2/core/matx.hpp 
b/modules/core/include/opencv2/core/matx.hpp index 76c214b757..fae4526584 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -225,7 +225,7 @@ public: void copyTo(const _OutputArray& dst) const; void convertTo(const _OutputArray& dst, int type, double scale=1., double shift=0.) const; - _Tp val[m*n]; //< matrix elements + _Tp val[m*n]; ///< matrix elements }; typedef Matx Matx12f; diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index f09e1d2034..3c3a998788 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -774,7 +774,7 @@ public: void start(); void stop(); - uint64 durationNS() const; //< duration in nanoseconds + uint64 durationNS() const; ///< duration in nanoseconds protected: struct Impl; diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp index 0e0aa980e1..85ca9c2ae9 100644 --- a/modules/core/include/opencv2/core/types.hpp +++ b/modules/core/include/opencv2/core/types.hpp @@ -89,7 +89,7 @@ public: //! conjugation Complex conj() const; - _Tp re, im; //< the real and the imaginary parts + _Tp re, im; ///< the real and the imaginary parts }; typedef Complex Complexf; @@ -2028,8 +2028,8 @@ double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) { /** @brief Finds out if there is any intersection between two rectangles * * mainly useful for language bindings - * @param rect1 First rectangle - * @param rect2 Second rectangle + * @param a First rectangle + * @param b Second rectangle * @return the area of the intersection */ CV_EXPORTS_W inline double rectangleIntersectionArea(const Rect2d& a, const Rect2d& b) { return (a & b).area(); } diff --git a/modules/core/include/opencv2/core/utils/filesystem.private.hpp b/modules/core/include/opencv2/core/utils/filesystem.private.hpp index 70df64f0d4..c6bd5b316a 100644 --- a/modules/core/include/opencv2/core/utils/filesystem.private.hpp +++ b/modules/core/include/opencv2/core/utils/filesystem.private.hpp @@ -47,11 +47,11 @@ public: explicit FileLock(const char* fname); ~FileLock(); - void lock(); //< acquire exclusive (writer) lock - void unlock(); //< release exclusive (writer) lock + void lock(); ///< acquire exclusive (writer) lock + void unlock(); ///< release exclusive (writer) lock - void lock_shared(); //< acquire shareable (reader) lock - void unlock_shared(); //< release shareable (reader) lock + void lock_shared(); ///< acquire shareable (reader) lock + void unlock_shared(); ///< release shareable (reader) lock struct Impl; protected: diff --git a/modules/core/include/opencv2/core/utils/trace.hpp b/modules/core/include/opencv2/core/utils/trace.hpp index ef5d35b4f2..ea43bbeea1 100644 --- a/modules/core/include/opencv2/core/utils/trace.hpp +++ b/modules/core/include/opencv2/core/utils/trace.hpp @@ -70,11 +70,11 @@ public: struct LocationExtraData; struct LocationStaticStorage { - LocationExtraData** ppExtra; //< implementation specific data - const char* name; //< region name (function name or other custom name) - const char* filename; //< source code filename - int line; //< source code line - int flags; //< flags (implementation code path: Plain, IPP, OpenCL) + LocationExtraData** ppExtra; ///< implementation specific data + const char* name; ///< region name (function name or other custom name) + const char* filename; ///< source code filename + int line; ///< source code line + int flags; ///< flags (implementation code 
path: Plain, IPP, OpenCL) }; Region(const LocationStaticStorage& location); @@ -100,18 +100,18 @@ private: //! Specify region flags enum RegionLocationFlag { - REGION_FLAG_FUNCTION = (1 << 0), //< region is function (=1) / nested named region (=0) - REGION_FLAG_APP_CODE = (1 << 1), //< region is Application code (=1) / OpenCV library code (=0) - REGION_FLAG_SKIP_NESTED = (1 << 2), //< avoid processing of nested regions + REGION_FLAG_FUNCTION = (1 << 0), ///< region is function (=1) / nested named region (=0) + REGION_FLAG_APP_CODE = (1 << 1), ///< region is Application code (=1) / OpenCV library code (=0) + REGION_FLAG_SKIP_NESTED = (1 << 2), ///< avoid processing of nested regions - REGION_FLAG_IMPL_IPP = (1 << 16), //< region is part of IPP code path - REGION_FLAG_IMPL_OPENCL = (2 << 16), //< region is part of OpenCL code path - REGION_FLAG_IMPL_OPENVX = (3 << 16), //< region is part of OpenVX code path + REGION_FLAG_IMPL_IPP = (1 << 16), ///< region is part of IPP code path + REGION_FLAG_IMPL_OPENCL = (2 << 16), ///< region is part of OpenCL code path + REGION_FLAG_IMPL_OPENVX = (3 << 16), ///< region is part of OpenVX code path REGION_FLAG_IMPL_MASK = (15 << 16), REGION_FLAG_REGION_FORCE = (1 << 30), - REGION_FLAG_REGION_NEXT = (1 << 31), //< close previous region (see #CV_TRACE_REGION_NEXT macro) + REGION_FLAG_REGION_NEXT = (1 << 31), ///< close previous region (see #CV_TRACE_REGION_NEXT macro) ENUM_REGION_FLAG_FORCE_INT = INT_MAX }; diff --git a/modules/core/misc/java/test/CoreTest.java b/modules/core/misc/java/test/CoreTest.java index c63cb23fab..a236152ca4 100644 --- a/modules/core/misc/java/test/CoreTest.java +++ b/modules/core/misc/java/test/CoreTest.java @@ -962,9 +962,9 @@ public class CoreTest extends OpenCVTestCase { assertEquals(0.0, d); - d = Core.Mahalanobis(line1, line2, covar); - - assertTrue(d > 0.0); + // Bug: https://github.com/opencv/opencv/issues/24348 + // d = Core.Mahalanobis(line1, line2, covar); + // assertTrue(d > 0.0); } public void testMax() { diff --git a/modules/core/misc/python/pyopencv_async.hpp b/modules/core/misc/python/pyopencv_async.hpp index 6a8e73526e..625365ac50 100644 --- a/modules/core/misc/python/pyopencv_async.hpp +++ b/modules/core/misc/python/pyopencv_async.hpp @@ -2,7 +2,7 @@ #include "opencv2/core/async.hpp" -CV_PY_TO_CLASS(AsyncArray); -CV_PY_FROM_CLASS(AsyncArray); +CV_PY_TO_CLASS(AsyncArray) +CV_PY_FROM_CLASS(AsyncArray) #endif diff --git a/modules/core/misc/python/pyopencv_cuda.hpp b/modules/core/misc/python/pyopencv_cuda.hpp index 5be4977ca0..a424498f27 100644 --- a/modules/core/misc/python/pyopencv_cuda.hpp +++ b/modules/core/misc/python/pyopencv_cuda.hpp @@ -20,18 +20,18 @@ template<> struct pyopencvVecConverter } }; -CV_PY_TO_CLASS(cuda::GpuMat); -CV_PY_TO_CLASS(cuda::Stream); -CV_PY_TO_CLASS(cuda::Event); -CV_PY_TO_CLASS(cuda::HostMem); +CV_PY_TO_CLASS(cuda::GpuMat) +CV_PY_TO_CLASS(cuda::Stream) +CV_PY_TO_CLASS(cuda::Event) +CV_PY_TO_CLASS(cuda::HostMem) -CV_PY_TO_CLASS_PTR(cuda::GpuMat); -CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator); +CV_PY_TO_CLASS_PTR(cuda::GpuMat) +CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator) -CV_PY_FROM_CLASS(cuda::GpuMat); -CV_PY_FROM_CLASS(cuda::Stream); -CV_PY_FROM_CLASS(cuda::HostMem); +CV_PY_FROM_CLASS(cuda::GpuMat) +CV_PY_FROM_CLASS(cuda::Stream) +CV_PY_FROM_CLASS(cuda::HostMem) -CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator); +CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator) #endif diff --git a/modules/core/misc/python/pyopencv_umat.hpp b/modules/core/misc/python/pyopencv_umat.hpp index 697adaf202..63f002503b 
100644 --- a/modules/core/misc/python/pyopencv_umat.hpp +++ b/modules/core/misc/python/pyopencv_umat.hpp @@ -4,8 +4,8 @@ typedef std::vector vector_Range; -CV_PY_TO_CLASS(UMat); -CV_PY_FROM_CLASS(UMat); +CV_PY_TO_CLASS(UMat) +CV_PY_FROM_CLASS(UMat) static bool cv_mappable_to(const Ptr& src, Ptr& dst) { diff --git a/modules/core/perf/perf_allocation.cpp b/modules/core/perf/perf_allocation.cpp index 2f3bf3eaa7..237a8dc1a3 100755 --- a/modules/core/perf/perf_allocation.cpp +++ b/modules/core/perf/perf_allocation.cpp @@ -45,4 +45,4 @@ PERF_TEST_P(MatDepth_tb, DISABLED_Allocation_Aligned, SANITY_CHECK_NOTHING(); } -}; +} diff --git a/modules/core/src/alloc.cpp b/modules/core/src/alloc.cpp index a0def9db2e..cb2db71e2c 100644 --- a/modules/core/src/alloc.cpp +++ b/modules/core/src/alloc.cpp @@ -53,7 +53,6 @@ #undef CV__ALLOCATOR_STATS_LOG //#define OPENCV_ALLOC_ENABLE_STATISTICS -#define OPENCV_ALLOC_STATISTICS_LIMIT 4096 // don't track buffers less than N bytes #ifdef HAVE_POSIX_MEMALIGN @@ -63,6 +62,7 @@ #endif #ifdef OPENCV_ALLOC_ENABLE_STATISTICS +#define OPENCV_ALLOC_STATISTICS_LIMIT 4096 // don't track buffers less than N bytes #include #endif diff --git a/modules/core/src/arithm.dispatch.cpp b/modules/core/src/arithm.dispatch.cpp index 1cbceaee29..b6a854379d 100644 --- a/modules/core/src/arithm.dispatch.cpp +++ b/modules/core/src/arithm.dispatch.cpp @@ -8,4 +8,4 @@ #include "arithm.simd_declarations.hpp" #define ARITHM_DISPATCHING_ONLY -#include "arithm.simd.hpp" \ No newline at end of file +#include "arithm.simd.hpp" diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp index 7054b3e6b6..3f2f3f10a1 100644 --- a/modules/core/src/arithm.simd.hpp +++ b/modules/core/src/arithm.simd.hpp @@ -69,7 +69,7 @@ #define DEFINE_SIMD_F32(fun, ...) \ DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__) -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #define DEFINE_SIMD_F64(fun, ...) 
\ DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__) #else @@ -262,7 +262,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_int8 r(const v_int8& a, const v_int8& b) { return v_absdiffs(a, b); } #endif @@ -272,7 +272,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_int16 r(const v_int16& a, const v_int16& b) { return v_absdiffs(a, b); } #endif @@ -282,7 +282,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_int32 r(const v_int32& a, const v_int32& b) { return v_reinterpret_as_s32(v_absdiff(a, b)); } #endif @@ -327,7 +327,7 @@ struct op_not //////////////////////////// Loaders ///////////////////////////////// -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) template< template class OP, typename T1, typename Tvec> struct bin_loader @@ -392,7 +392,7 @@ template class OP, typename T1, typename Tv static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef bin_loader ldr; const int wide_step = VTraits::vlanes(); #if !CV_NEON && CV_SIMD_WIDTH == 16 @@ -410,7 +410,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) #if !CV_NEON && !CV_MSA if (is_aligned(src1, src2, dst)) { @@ -460,7 +460,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, vx_cleanup(); } -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template class OP, typename T1, typename Tvec> static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) { @@ -492,7 +492,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t #define BIN_LOOP64F bin_loop_nosimd #else #define BIN_LOOP64F bin_loop -#endif //!CV_SIMD_64F +#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #endif // ARITHM_DEFINITIONS_ONLY @@ -617,7 +617,7 @@ struct op_cmpne //////////////////////////// Loaders ///////////////////////////////// -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) // todo: add support for RW alignment & stream template class OP, typename T1, typename Tvec> struct cmp_loader_n @@ -697,7 +697,7 @@ template class OP, typename T1, typename Tv static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef cmp_loader_n ldr; const int wide_step = VTraits::vlanes() * sizeof(T1); #endif // CV_SIMD @@ -709,7 +709,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, dst + x); @@ -764,7 +764,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, } } -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template< template class OP, typename T1> static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, 
int width, int height) { @@ -818,7 +818,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2 break; } } -#endif // !CV_SIMD_64F +#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #endif // ARITHM_DEFINITIONS_ONLY @@ -876,7 +876,7 @@ DEFINE_SIMD_ALL(cmp) //////////////////////////// Loaders /////////////////////////////// -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) // todo: add support for RW alignment & stream template class OP, typename T1, typename T2, typename Tvec> struct scalar_loader_n @@ -1095,16 +1095,16 @@ struct scalar_loader_n }; #endif // CV_SIMD -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template class OP> struct scalar_loader_n { typedef OP op; typedef OP op64; - enum {step = v_int32::nlanes}; static inline void l(const int* src1, const int* src2, const double* scalar, int* dst) { + const int step = VTraits::vlanes(); v_int32 v_src1 = vx_load(src1); v_int32 v_src2 = vx_load(src2); v_int32 v_src1s = vx_load(src1 + step); @@ -1121,6 +1121,7 @@ struct scalar_loader_n } static inline void l(const int* src1, const double* scalar, int* dst) { + const int step = VTraits::vlanes(); v_int32 v_src1 = vx_load(src1); v_int32 v_src1s = vx_load(src1 + step); @@ -1165,10 +1166,10 @@ struct scalar_loader_n { typedef OP op; typedef OP op64; - enum {step = v_float32::nlanes}; static inline void l(const float* src1, const float* src2, const double* scalar, float* dst) { + const int step = VTraits::vlanes(); v_float32 v_src1 = vx_load(src1); v_float32 v_src2 = vx_load(src2); v_float32 v_src1s = vx_load(src1 + step); @@ -1182,6 +1183,7 @@ struct scalar_loader_n } static inline void l(const float* src1, const double* scalar, float* dst) { + const int step = VTraits::vlanes(); v_float32 v_src1 = vx_load(src1); v_float32 v_src1s = vx_load(src1 + step); @@ -1222,10 +1224,10 @@ template class OP> struct scalar_loader_n { typedef OP op; - enum {step = v_float64::nlanes}; static inline void l(const double* src1, const double* src2, const double* scalar, double* dst) { + const int step = VTraits::vlanes(); v_float64 v_src1 = vx_load(src1); v_float64 v_src2 = vx_load(src2); v_float64 v_src1s = vx_load(src1 + step); @@ -1239,6 +1241,7 @@ struct scalar_loader_n } static inline void l(const double* src1, const double* scalar, double* dst) { + const int step = VTraits::vlanes(); v_float64 v_src1 = vx_load(src1); v_float64 v_src1s = vx_load(src1 + step); @@ -1249,7 +1252,7 @@ struct scalar_loader_n v_store(dst + step, r1); } }; -#endif // CV_SIMD_64F +#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) //////////////////////////// Loops ///////////////////////////////// @@ -1259,7 +1262,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef scalar_loader_n ldr; const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits::vlanes() * 2 : sizeof(T1) == sizeof(uchar) ? 
VTraits::vlanes() / 2 : VTraits::vlanes(); @@ -1273,7 +1276,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, scalar, dst + x); @@ -1305,7 +1308,7 @@ template class OP, typename T1 static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef scalar_loader_n ldr; const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits::vlanes() * 2 : sizeof(T1) == sizeof(uchar) ? VTraits::vlanes() / 2 : VTraits::vlanes(); @@ -1318,7 +1321,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int { int x = 0; - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, scalar, dst + x); @@ -1345,7 +1348,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int vx_cleanup(); } -#if !CV_SIMD_64F +#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) // dual source template class OP, typename T1, typename T2, typename Tvec> static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, @@ -1409,7 +1412,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste #define SCALAR_LOOP64F scalar_loop_nosimd #else #define SCALAR_LOOP64F scalar_loop -#endif // !CV_SIMD_64F +#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) #endif // ARITHM_DEFINITIONS_ONLY @@ -1433,7 +1436,7 @@ struct op_mul template struct op_mul_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1449,7 +1452,7 @@ struct op_mul_scale template<> struct op_mul_scale { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); @@ -1574,7 +1577,7 @@ struct op_div_f template struct op_div_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1596,7 +1599,7 @@ struct op_div_scale template<> struct op_div_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1610,7 +1613,7 @@ struct op_div_scale template<> struct op_div_scale { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); @@ -1682,7 +1685,7 @@ DEFINE_SIMD_ALL(div, div_loop) template struct op_add_scale { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_alpha = vx_setall_f32(*scalar); @@ -1698,7 +1701,7 @@ struct op_add_scale template<> struct op_add_scale { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* 
scalar) { const v_float64 v_alpha = vx_setall_f64(*scalar); @@ -1715,7 +1718,7 @@ struct op_add_scale template struct op_add_weighted { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) { const v_float32 v_alpha = vx_setall_f32(scalars[0]); @@ -1733,7 +1736,7 @@ struct op_add_weighted template<> struct op_add_weighted { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars) { const v_float64 v_alpha = vx_setall_f64(scalars[0]); @@ -1832,7 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) template struct op_recip { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1854,7 +1857,7 @@ struct op_recip template<> struct op_recip { -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline v_float32 r(const v_float32& a, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1868,7 +1871,7 @@ struct op_recip template<> struct op_recip { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline v_float64 r(const v_float64& a, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); diff --git a/modules/core/src/arithm_ipp.hpp b/modules/core/src/arithm_ipp.hpp index 4aa7d006e4..ed722113a7 100644 --- a/modules/core/src/arithm_ipp.hpp +++ b/modules/core/src/arithm_ipp.hpp @@ -414,4 +414,4 @@ inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, #if !ARITHM_USE_IPP #define ARITHM_CALL_IPP(...) -#endif \ No newline at end of file +#endif diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 8377f5df1a..ce263a506f 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -64,8 +64,6 @@ namespace cv Discrete Fourier Transform \****************************************************************************************/ -#define CV_MAX_LOCAL_DFT_SIZE (1 << 15) - static unsigned char bitrevTab[] = { 0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0, diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index 25acff662c..1f2b259920 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -69,10 +69,14 @@ /** Add: _dst[i] = src1[i] + src2[i]_ @n Sub: _dst[i] = src1[i] - src2[i]_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images */ //! @addtogroup core_hal_interface_addsub Element-wise add and subtract //! 
@{ @@ -96,10 +100,14 @@ inline int hal_ni_sub64f(const double *src1_data, size_t src1_step, const double /** Minimum: _dst[i] = min(src1[i], src2[i])_ @n Maximum: _dst[i] = max(src1[i], src2[i])_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images */ //! @addtogroup core_hal_interface_minmax Element-wise minimum or maximum //! @{ @@ -122,11 +130,14 @@ inline int hal_ni_min64f(const double *src1_data, size_t src1_step, const double /** Absolute difference: _dst[i] = | src1[i] - src2[i] |_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images -@param scale additional multiplier +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images */ //! @addtogroup core_hal_interface_absdiff Element-wise absolute difference //! @{ @@ -144,10 +155,14 @@ Bitwise AND: _dst[i] = src1[i] & src2[i]_ @n Bitwise OR: _dst[i] = src1[i] | src2[i]_ @n Bitwise XOR: _dst[i] = src1[i] ^ src2[i]_ @n Bitwise NOT: _dst[i] = !src[i]_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images */ //! @addtogroup core_hal_interface_logical Bitwise logical operations //! @{ @@ -201,10 +216,14 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data, /** Compare: _dst[i] = src1[i] op src2[i]_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images @param operation one of (CV_HAL_CMP_EQ, CV_HAL_CMP_GT, ...) */ //! 
@addtogroup core_hal_interface_compare @{ */ inline int hal_ni_cmp64f(const double *src1_data, size_t src1_step, const double /** Multiply: _dst[i] = scale * src1[i] * src2[i]_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images @param scale additional multiplier */ //! @addtogroup core_hal_interface_multiply @{ */ inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double /** Divide: _dst[i] = scale * src1[i] / src2[i]_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images @param scale additional multiplier */ //! @addtogroup core_hal_interface_divide @{ */ inline int hal_ni_div64f(const double *src1_data, size_t src1_step, const double /** Computes reciprocial: _dst[i] = scale / src[i]_ -@param src_data,src_step source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src_data source image data +@param src_step source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images @param scale additional multiplier */ //! @addtogroup core_hal_interface_reciprocial @{ */ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_ /** Computes weighted sum of two arrays using formula: _dst[i] = a * src1[i] + b * src2[i] + c_ -@param src1_data,src1_step first source image data and step -@param src2_data,src2_step second source image data and step -@param dst_data,dst_step destination image data and step -@param width,height dimensions of the images +@param src1_data first source image data +@param src1_step first source image step +@param src2_data second source image data +@param src2_step second source image step +@param dst_data destination image data +@param dst_step destination image step +@param width width of the images +@param height height of the images @param scalars numbers _a_, _b_, and _c_ */ //!
@addtogroup core_hal_interface_addWeighted Element-wise weighted sum @@ -381,7 +415,8 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int /** -@param y,x source Y and X arrays +@param y source Y arrays +@param x source X arrays @param dst destination array @param len length of arrays @param angleInDegrees if set to true return angles in degrees, otherwise in radians @@ -399,7 +434,8 @@ inline int hal_ni_fastAtan64f(const double* y, const double* x, double* dst, int /** -@param x,y source X and Y arrays +@param x source X array +@param y source Y array @param dst destination array @param len length of arrays */ @@ -530,7 +566,8 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME /** @param context double pointer to context storing all necessary data -@param width,height image dimensions +@param width image width +@param height image height @param depth image type (CV_32F or CV_64F) @param src_channels number of channels in input image @param dst_channels number of channels in output image @@ -540,8 +577,10 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME inline int hal_ni_dftInit2D(cvhalDFT **context, int width, int height, int depth, int src_channels, int dst_channels, int flags, int nonzero_rows) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } /** @param context pointer to context storing all necessary data -@param src_data,src_step source image data and step -@param dst_data,dst_step destination image data and step +@param src_data source image data +@param src_step source image step +@param dst_data destination image data +@param dst_step destination image step */ inline int hal_ni_dft2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } /** @@ -557,15 +596,18 @@ inline int hal_ni_dftFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME /** @param context double pointer to context storing all necessary data -@param width,height image dimensions +@param width image width +@param height image height @param depth image type (CV_32F or CV_64F) @param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...) */ inline int hal_ni_dctInit2D(cvhalDFT **context, int width, int height, int depth, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } /** @param context pointer to context storing all necessary data -@param src_data,src_step source image data and step -@param dst_data,dst_step destination image data and step +@param src_data source image data +@param src_step source image step +@param dst_data destination image data +@param dst_step destination image step */ inline int hal_ni_dct2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } /** @@ -717,11 +759,15 @@ inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* s /** @brief Finds the global minimum and maximum in an array. - @param src_data,src_step Source image - @param width,height Source image dimensions + @param src_data Source image + @param src_step Source image + @param width Source image dimensions + @param height Source image dimensions @param depth Depth of source image - @param minVal,maxVal Pointer to the returned global minimum and maximum in an array. - @param minIdx,maxIdx Pointer to the returned minimum and maximum location. + @param minVal Pointer to the returned global minimum and maximum in an array. 
+ @param maxVal Pointer to the returned global minimum and maximum in an array. + @param minIdx Pointer to the returned minimum and maximum location. + @param maxIdx Pointer to the returned minimum and maximum location. @param mask Specified array region. */ inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal, @@ -731,6 +777,47 @@ inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, i #define cv_hal_minMaxIdx hal_ni_minMaxIdx //! @endcond +/** + @brief hal_flip + @param src_type source and destination image type + @param src_data source image data + @param src_step source image step + @param src_width source and destination image width + @param src_height source and destination image height + @param dst_data destination image data + @param dst_step destination image step + @param flip_mode 0 flips around x-axis, positive around y-axis, negative both + */ +inline int hal_ni_flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, + uchar* dst_data, size_t dst_step, int flip_mode) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } + +//! @cond IGNORED +#define cv_hal_flip hal_ni_flip +//! @endcond + + +/** + @brief rotate90 + @param src_type source and destination image type + @param src_data source image data + @param src_step source image step + @param src_width source image width + If angle has value [180] it is also destination image width + If angle has values [90, 270] it is also destination image height + @param src_height source and destination image height (destination image width for angles [90, 270]) + If angle has value [180] it is also destination image height + If angle has values [90, 270] it is also destination image width + @param dst_data destination image data + @param dst_step destination image step + @param angle clockwise angle for rotation in degrees from set [90, 180, 270] + */ +inline int hal_ni_rotate90(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, + uchar* dst_data, size_t dst_step, int angle) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } + +//! @cond IGNORED +#define cv_hal_rotate90 hal_ni_rotate90 +//! @endcond + //! 
@} diff --git a/modules/core/src/has_non_zero.simd.hpp b/modules/core/src/has_non_zero.simd.hpp index 6ea8bcd7d2..e9f9b683d6 100644 --- a/modules/core/src/has_non_zero.simd.hpp +++ b/modules/core/src/has_non_zero.simd.hpp @@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len ) { bool res = false; const uchar* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_uint8 v_type; const v_type v_zero = vx_setzero_u8(); constexpr const int unrollCount = 2; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const uchar* srcSimdEnd = src+len0; @@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; - res = v_check_any(((v0 | v1) != v_zero)); + src += VTraits::vlanes(); + res = v_check_any((v_ne(v_or(v0, v1), v_zero))); } v_cleanup(); @@ -114,11 +114,11 @@ static bool hasNonZero16u( const ushort* src, size_t len ) { bool res = false; const ushort* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_uint16 v_type; const v_type v_zero = vx_setzero_u16(); constexpr const int unrollCount = 4; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const ushort* srcSimdEnd = src+len0; @@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - res = v_check_any(((v0 | v2) != v_zero)); + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + res = v_check_any((v_ne(v_or(v0, v2), v_zero))); } v_cleanup(); @@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len ) { bool res = false; const int* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_int32 v_type; const v_type v_zero = vx_setzero_s32(); constexpr const int unrollCount = 8; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const int* srcSimdEnd = src+len0; @@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v4 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v5 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v6 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v7 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - v4 |= v5; - v6 |= v7; + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + v4 = v_or(v4, v5); + v6 = v_or(v6, v7); - v0 |= v2; - v4 |= v6; - res = v_check_any(((v0 | v4) != v_zero)); + v0 = v_or(v0, v2); + v4 = v_or(v4, v6); + res = v_check_any((v_ne(v_or(v0, v4), v_zero))); } v_cleanup(); @@ -193,11 +193,11 @@ static 
bool hasNonZero32f( const float* src, size_t len ) { bool res = false; const float* srcEnd = src+len; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) typedef v_float32 v_type; const v_type v_zero = vx_setzero_f32(); constexpr const int unrollCount = 8; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const float* srcSimdEnd = src+len0; @@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v4 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v5 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v6 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v7 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - v4 |= v5; - v6 |= v7; + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, v3); + v4 = v_or(v4, v5); + v6 = v_or(v6, v7); - v0 |= v2; - v4 |= v6; + v0 = v_or(v0, v2); + v4 = v_or(v4, v6); //res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ - res = !v_check_all(((v0 | v4) == v_zero)); + res = !v_check_all((v_eq(v_or(v0, v4), v_zero))); } v_cleanup(); @@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len ) { bool res = false; const double* srcEnd = src+len; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) typedef v_float64 v_type; const v_type v_zero = vx_setzero_f64(); constexpr const int unrollCount = 16; - int step = v_type::nlanes * unrollCount; + int step = VTraits::vlanes() * unrollCount; int len0 = len & -step; const double* srcSimdEnd = src+len0; @@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len ) while(!res && countSIMD--) { v_type v0 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v1 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v2 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v3 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v4 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v5 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v6 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v7 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v8 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v9 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v10 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v11 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v12 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v13 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v14 = vx_load(src); - src += v_type::nlanes; + src += VTraits::vlanes(); v_type v15 = vx_load(src); - src += v_type::nlanes; - v0 |= v1; - v2 |= v3; - v4 |= v5; - v6 |= v7; - v8 |= v9; - v10 |= v11; - v12 |= v13; - v14 |= v15; + src += VTraits::vlanes(); + v0 = v_or(v0, v1); + v2 = v_or(v2, 
v3); + v4 = v_or(v4, v5); + v6 = v_or(v6, v7); + v8 = v_or(v8, v9); + v10 = v_or(v10, v11); + v12 = v_or(v12, v13); + v14 = v_or(v14, v15); - v0 |= v2; - v4 |= v6; - v8 |= v10; - v12 |= v14; + v0 = v_or(v0, v2); + v4 = v_or(v4, v6); + v8 = v_or(v8, v10); + v12 = v_or(v12, v14); - v0 |= v4; - v8 |= v12; + v0 = v_or(v0, v4); + v8 = v_or(v8, v12); //res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ - res = !v_check_all(((v0 | v8) == v_zero)); + res = !v_check_all((v_eq(v_or(v0, v8), v_zero))); } v_cleanup(); diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 981e098bd5..138246406d 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -276,7 +276,7 @@ template struct VBLAS int givens(T*, T*, int, T, T) const { return 0; } }; -#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F +#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related template<> inline int VBLAS::dot(const float* a, const float* b, int n, float* result) const { if( n < 2*VTraits::vlanes() ) diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 4ac1e21bb6..54a04c5f97 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -2549,6 +2549,7 @@ double dotProd_16s(const short* src1, const short* src2, int len) double dotProd_32s(const int* src1, const int* src2, int len) { #if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F +// Test failed on RVV(QEMU): Too big difference (=1.20209e-08 > 1.11022e-12) double r = .0; int i = 0; const int step = VTraits::vlanes(); diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp index 43bf9be057..c4c7a73b4c 100644 --- a/modules/core/src/matrix_transform.cpp +++ b/modules/core/src/matrix_transform.cpp @@ -4,6 +4,7 @@ #include "precomp.hpp" #include "opencl_kernels_core.hpp" +#include "hal_replacement.hpp" #include "opencv2/core/detail/dispatch_helper.impl.hpp" #include // std::swap_ranges @@ -802,6 +803,9 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode ) _dst.create( size, type ); Mat dst = _dst.getMat(); + CALL_HAL(flip, cv_hal_flip, type, src.ptr(), src.step, src.cols, src.rows, + dst.ptr(), dst.step, flip_mode); + CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode)); size_t esz = CV_ELEM_SIZE(type); @@ -1075,10 +1079,8 @@ void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) { } } -void rotate(InputArray _src, OutputArray _dst, int rotateMode) +static void rotateImpl(InputArray _src, OutputArray _dst, int rotateMode) { - CV_Assert(_src.dims() <= 2); - switch (rotateMode) { case ROTATE_90_CLOCKWISE: @@ -1097,4 +1099,51 @@ void rotate(InputArray _src, OutputArray _dst, int rotateMode) } } +void rotate(InputArray _src, OutputArray _dst, int rotateMode) +{ + CV_Assert(_src.dims() <= 2); + int angle; + + if (_dst.isUMat()) + { + rotateImpl(_src, _dst, rotateMode); + return; + } + + Mat src = _src.getMat(); + int type = src.type(); + if( src.empty() ) + { + _dst.release(); + return; + } + + switch (rotateMode) + { + case ROTATE_90_CLOCKWISE: + _dst.create(src.cols, src.rows, type); + angle = 90; + break; + case ROTATE_180: + _dst.create(src.rows, src.cols, type); + angle = 180; + break; + case ROTATE_90_COUNTERCLOCKWISE: + _dst.create(src.cols, src.rows, type); + angle = 270; + break; + default: + _dst.create(src.rows, src.cols, type); + angle = 0; + break; + } + + Mat dst = _dst.getMat(); + CALL_HAL(rotate90, cv_hal_rotate90, type, 
src.ptr(), src.step, src.cols, src.rows, + dst.ptr(), dst.step, angle); + + // use src (Mat) since _src (InputArray) is updated by _dst.create() when in-place + rotateImpl(src, _dst, rotateMode); +} + } // namespace diff --git a/modules/core/src/mean.simd.hpp b/modules/core/src/mean.simd.hpp index 60dba7afcf..c6bbc20b89 100644 --- a/modules/core/src/mean.simd.hpp +++ b/modules/core/src/mean.simd.hpp @@ -24,7 +24,7 @@ struct SumSqr_SIMD } }; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) template <> struct SumSqr_SIMD diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp index 29a0ea3450..a31ccad83e 100644 --- a/modules/core/src/minmax.cpp +++ b/modules/core/src/minmax.cpp @@ -1546,9 +1546,9 @@ void cv::minMaxIdx(InputArray _src, double* minVal, if (!src.empty() && mask.empty()) { if( minidx == 0 ) - minidx = 1; - if( maxidx == 0 ) - maxidx = 1; + minidx = 1; + if( maxidx == 0 ) + maxidx = 1; } if( minidx == 0 ) diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 63e699f02c..395c7e5bd5 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -791,7 +791,7 @@ int getThreadNum() return 0; #endif #elif defined HAVE_HPX - return (int)(hpx::get_num_worker_threads()); + return (int)(hpx::get_num_worker_threads()); #elif defined HAVE_OPENMP return omp_get_thread_num(); #elif defined HAVE_GCD diff --git a/modules/core/src/persistence_base64_encoding.cpp b/modules/core/src/persistence_base64_encoding.cpp index 7d90fd422b..3fce79c080 100644 --- a/modules/core/src/persistence_base64_encoding.cpp +++ b/modules/core/src/persistence_base64_encoding.cpp @@ -367,4 +367,4 @@ size_t base64::RawDataToBinaryConvertor::make_to_binary_funcs(const std::string return offset_packed; } -} \ No newline at end of file +} diff --git a/modules/core/src/persistence_base64_encoding.hpp b/modules/core/src/persistence_base64_encoding.hpp index 1ee5201e14..8b66e94095 100644 --- a/modules/core/src/persistence_base64_encoding.hpp +++ b/modules/core/src/persistence_base64_encoding.hpp @@ -124,4 +124,4 @@ private: } } -#endif \ No newline at end of file +#endif diff --git a/modules/core/src/softfloat.cpp b/modules/core/src/softfloat.cpp index a876ee14e2..b5ac5d7dc5 100644 --- a/modules/core/src/softfloat.cpp +++ b/modules/core/src/softfloat.cpp @@ -306,9 +306,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); } | The values to return on conversions to 32-bit integer formats that raise an | invalid exception. *----------------------------------------------------------------------------*/ -#define ui32_fromPosOverflow 0xFFFFFFFF -#define ui32_fromNegOverflow 0 -#define ui32_fromNaN 0xFFFFFFFF #define i32_fromPosOverflow 0x7FFFFFFF #define i32_fromNegOverflow (-0x7FFFFFFF - 1) #define i32_fromNaN 0x7FFFFFFF @@ -317,9 +314,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); } | The values to return on conversions to 64-bit integer formats that raise an | invalid exception. 
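The flip/rotate changes above add HAL entry points (cv_hal_flip, cv_hal_rotate90) that fire before the IPP and generic paths, and the rewritten rotate() allocates the destination before CALL_HAL so a plugin always receives a valid buffer; note also the comment about passing the Mat copy of src to rotateImpl, since _dst.create() can invalidate _src when the call is in-place. As a hedged sketch of how a custom HAL could hook cv_hal_flip, with the parameter list inferred from the call site above (the real hal_replacement.hpp declaration may differ in detail, and my_hal_flip is an illustrative name):

    #include "opencv2/core/hal/interface.h"

    // Handle only 8-bit single-channel horizontal flips; everything else falls
    // back to OpenCV's built-in implementation via CV_HAL_ERROR_NOT_IMPLEMENTED.
    static int my_hal_flip(int src_type, const uchar* src_data, size_t src_step,
                           int src_width, int src_height,
                           uchar* dst_data, size_t dst_step, int flip_mode)
    {
        if (src_type != CV_8UC1 || flip_mode <= 0) // flip_mode > 0: around the y-axis
            return CV_HAL_ERROR_NOT_IMPLEMENTED;
        for (int y = 0; y < src_height; y++)
        {
            const uchar* s = src_data + y * src_step;
            uchar* d = dst_data + y * dst_step;
            for (int x = 0; x < src_width; x++)
                d[x] = s[src_width - 1 - x];
        }
        return CV_HAL_ERROR_OK;
    }
    // A custom HAL header would then take over the symbol:
    //   #undef  cv_hal_flip
    //   #define cv_hal_flip my_hal_flip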
*----------------------------------------------------------------------------*/ -#define ui64_fromPosOverflow UINT64_C( 0xFFFFFFFFFFFFFFFF ) -#define ui64_fromNegOverflow 0 -#define ui64_fromNaN UINT64_C( 0xFFFFFFFFFFFFFFFF ) #define i64_fromPosOverflow UINT64_C( 0x7FFFFFFFFFFFFFFF ) //fixed unsigned unary minus: -x == ~x + 1 //#define i64_fromNegOverflow (-UINT64_C( 0x7FFFFFFFFFFFFFFF ) - 1) @@ -422,34 +416,6 @@ struct uint64_extra { uint64_t v, extra; }; struct uint128_extra { struct uint128 v; uint64_t extra; }; #endif -/*---------------------------------------------------------------------------- -| These macros are used to isolate the differences in word order between big- -| endian and little-endian platforms. -*----------------------------------------------------------------------------*/ -#ifndef WORDS_BIGENDIAN -#define wordIncr 1 -#define indexWord( total, n ) (n) -#define indexWordHi( total ) ((total) - 1) -#define indexWordLo( total ) 0 -#define indexMultiword( total, m, n ) (n) -#define indexMultiwordHi( total, n ) ((total) - (n)) -#define indexMultiwordLo( total, n ) 0 -#define indexMultiwordHiBut( total, n ) (n) -#define indexMultiwordLoBut( total, n ) 0 -#define INIT_UINTM4( v3, v2, v1, v0 ) { v0, v1, v2, v3 } -#else -#define wordIncr -1 -#define indexWord( total, n ) ((total) - 1 - (n)) -#define indexWordHi( total ) 0 -#define indexWordLo( total ) ((total) - 1) -#define indexMultiword( total, m, n ) ((total) - 1 - (m)) -#define indexMultiwordHi( total, n ) 0 -#define indexMultiwordLo( total, n ) ((total) - (n)) -#define indexMultiwordHiBut( total, n ) 0 -#define indexMultiwordLoBut( total, n ) (n) -#define INIT_UINTM4( v3, v2, v1, v0 ) { v3, v2, v1, v0 } -#endif - enum { softfloat_mulAdd_subC = 1, softfloat_mulAdd_subProd = 2 diff --git a/modules/core/src/split.simd.hpp b/modules/core/src/split.simd.hpp index 88414161b8..109d759f24 100644 --- a/modules/core/src/split.simd.hpp +++ b/modules/core/src/split.simd.hpp @@ -220,4 +220,4 @@ void split64s(const int64* src, int64** dst, int len, int cn ) #endif CV_CPU_OPTIMIZATION_NAMESPACE_END -}} // namespace \ No newline at end of file +}} // namespace diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 199e19dea7..a9d71967aa 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -672,7 +672,7 @@ static void inRangeS(const Mat& src, const Scalar& lb, const Scalar& rb, Mat& ds } } // namespace -CVTEST_GUARD_SYMBOL(inRange); +CVTEST_GUARD_SYMBOL(inRange) struct InRangeSOp : public BaseArithmOp { @@ -1202,7 +1202,7 @@ struct MeanOp : public BaseArithmOp MeanOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0)) { context = 3; - }; + } void op(const vector& src, Mat& dst, const Mat& mask) { dst.create(1, 1, CV_64FC4); @@ -1225,7 +1225,7 @@ struct SumOp : public BaseArithmOp SumOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT, 1, 1, Scalar::all(0)) { context = 3; - }; + } void op(const vector& src, Mat& dst, const Mat&) { dst.create(1, 1, CV_64FC4); @@ -1285,7 +1285,7 @@ struct MeanStdDevOp : public BaseArithmOp { cn = 0; context = 7; - }; + } void op(const vector& src, Mat& dst, const Mat& mask) { dst.create(1, 2, CV_64FC4); @@ -1326,7 +1326,7 @@ struct NormOp : public BaseArithmOp { context = 1; normType = 0; - }; + } int getRandomType(RNG& rng) { int type = cvtest::randomType(rng, baseArithmTypeMask, 1, 4); @@ -1372,7 +1372,7 @@ struct MinMaxLocOp : public BaseArithmOp MinMaxLocOp() : 
BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0)) { context = ARITHM_MAX_NDIMS*2 + 2; - }; + } int getRandomType(RNG& rng) { return cvtest::randomType(rng, baseArithmTypeMask, 1, 1); @@ -1419,7 +1419,7 @@ struct reduceArgMinMaxOp : public BaseArithmOp isLast(false), isMax(false), axis(0) { context = ARITHM_MAX_NDIMS*2 + 2; - }; + } int getRandomType(RNG& rng) override { return cvtest::randomType(rng, baseArithmTypeMask, 1, 1); diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp index 4def1a0a0a..0b54b18d4f 100644 --- a/modules/core/test/test_io.cpp +++ b/modules/core/test/test_io.cpp @@ -435,6 +435,8 @@ protected: CV_Assert( ov1 == v1 ); CV_Assert( osc1 == sc1 ); CV_Assert( og1 == g1 ); + fs.release(); + remove(fname.c_str()); } catch(...) { @@ -489,6 +491,7 @@ TEST(Core_InputOutput, FileStorage) char arr[66]; snprintf(arr, sizeof(arr), "snprintf is hell %d", 666); EXPECT_NO_THROW(f << arr); + remove(file.c_str()); } TEST(Core_InputOutput, FileStorageKey) @@ -534,6 +537,7 @@ TEST(Core_InputOutput, FileStorageSpaces) ASSERT_STREQ(values[i].c_str(), valuesReadAppend[i].c_str()); } g3.release(); + EXPECT_EQ(0, remove(fileName.c_str())); } struct data_t @@ -585,12 +589,15 @@ struct data_t static void test_filestorage_basic(int write_flags, const char* suffix_name, bool testReadWrite, bool useMemory = false) { + const bool generateTestData = false; // enable to regenerate reference in opencv_extra const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info(); CV_Assert(test_info); std::string name = (std::string(test_info->test_case_name()) + "--" + test_info->name() + suffix_name); std::string name_34 = string(cvtest::TS::ptr()->get_data_path()) + "io/3_4/" + name; - if (!testReadWrite) + if (!testReadWrite || generateTestData) name = string(cvtest::TS::ptr()->get_data_path()) + "io/" + name; + else + name = cv::tempfile(name.c_str()); { const size_t rawdata_N = 40; @@ -636,10 +643,7 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo rawdata.push_back(tmp); } } -#ifdef GENERATE_TEST_DATA -#else - if (testReadWrite || useMemory) -#endif + if (testReadWrite || useMemory || generateTestData) { cv::FileStorage fs(name, write_flags + (useMemory ? 
cv::FileStorage::MEMORY : 0)); fs << "normal_2d_mat" << _2d_out; @@ -761,9 +765,13 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo ASSERT_EQ(_rd_in.dims , _rd_out.dims); ASSERT_EQ(_rd_in.depth(), _rd_out.depth()); - if (useMemory) { + if (useMemory) + { EXPECT_EQ(0, cv::norm(_rd_in, _rd_out, NORM_INF)); } + if (testReadWrite && !useMemory && !generateTestData) { + EXPECT_EQ(0, remove(name.c_str())); + } } } @@ -810,7 +818,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow) const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info(); CV_Assert(test_info); - std::string name = std::string(test_info->test_case_name()) + "--" + test_info->name(); + std::string name = cv::tempfile(); const char data[] = {0x00, 0x2f, 0x4a, 0x4a, 0x50, 0x4a, 0x4a }; std::ofstream file; @@ -822,6 +830,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow) // This just shouldn't segfault, otherwise it's fine EXPECT_ANY_THROW(FileStorage(name, FileStorage::READ)); + EXPECT_EQ(0, remove(name.c_str())); } TEST(Core_InputOutput, filestorage_base64_valid_call) @@ -832,18 +841,6 @@ TEST(Core_InputOutput, filestorage_base64_valid_call) : (std::string(test_info->test_case_name()) + "--" + test_info->name()); char const * filenames[] = { - "core_io_base64_other_test.yml", - "core_io_base64_other_test.xml", - "core_io_base64_other_test.json", - "core_io_base64_other_test.yml?base64", - "core_io_base64_other_test.xml?base64", - "core_io_base64_other_test.json?base64", - 0 - }; - char const * real_name[] = { - "core_io_base64_other_test.yml", - "core_io_base64_other_test.xml", - "core_io_base64_other_test.json", "core_io_base64_other_test.yml", "core_io_base64_other_test.xml", "core_io_base64_other_test.json", @@ -855,14 +852,16 @@ TEST(Core_InputOutput, filestorage_base64_valid_call) for (int n = 0; n < 6; n++) { - char const* suffix_name = filenames[n]; - SCOPED_TRACE(suffix_name); - std::string name = basename + '_' + suffix_name; - std::string file_name = basename + '_' + real_name[n]; + const int idx = n / 2; + const std::string mode_suffix = (n % 2 == 0) ? 
"" : "?base64"; + std::string suffix_name = basename + "_" + filenames[idx]; + std::string file_name = cv::tempfile(suffix_name.c_str()); + std::string mode_file_name = file_name + mode_suffix; + SCOPED_TRACE(mode_file_name); EXPECT_NO_THROW( { - cv::FileStorage fs(name, cv::FileStorage::WRITE_BASE64); + cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE_BASE64); fs << "manydata" << "["; fs << "[:"; @@ -890,7 +889,7 @@ TEST(Core_InputOutput, filestorage_base64_valid_call) EXPECT_NO_THROW( { - cv::FileStorage fs(name, cv::FileStorage::WRITE); + cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE); fs << "manydata" << "["; fs << str_out; @@ -934,10 +933,10 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call) 0 }; - for (char const ** ptr = filenames; *ptr; ptr++) + for (int idx = 0; idx < 3; ++idx) { - char const * suffix_name = *ptr; - std::string name = basename + '_' + suffix_name; + const string base_suffix = basename + '_' + filenames[idx]; + std::string name = cv::tempfile(base_suffix.c_str()); EXPECT_NO_THROW({ cv::FileStorage fs(name, cv::FileStorage::WRITE); @@ -958,7 +957,7 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call) TEST(Core_InputOutput, filestorage_yml_vec2i) { - const std::string file_name = "vec2i.yml"; + const std::string file_name = cv::tempfile("vec2i.yml"); cv::Vec2i vec(2, 1), ovec; /* write */ @@ -1040,7 +1039,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io) } } - String fileName = "vec_vec_io_test."; + String basename = "vec_vec_io_test."; std::vector formats; formats.push_back("xml"); @@ -1049,11 +1048,13 @@ TEST(Core_InputOutput, filestorage_vec_vec_io) for(size_t i = 0; i < formats.size(); i++) { - FileStorage writer(fileName + formats[i], FileStorage::WRITE); + const String basename_plus(basename + formats[i]); + const String fileName = tempfile(basename_plus.c_str()); + FileStorage writer(fileName, FileStorage::WRITE); writer << "vecVecMat" << outputMats; writer.release(); - FileStorage reader(fileName + formats[i], FileStorage::READ); + FileStorage reader(fileName, FileStorage::READ); std::vector > testMats; reader["vecVecMat"] >> testMats; @@ -1070,7 +1071,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io) } reader.release(); - remove((fileName + formats[i]).c_str()); + remove(fileName.c_str()); } } @@ -1661,7 +1662,7 @@ TEST(Core_InputOutput, FileStorage_json_bool) TEST(Core_InputOutput, FileStorage_free_file_after_exception) { - const std::string fileName = "FileStorage_free_file_after_exception_test.yml"; + const std::string fileName = cv::tempfile("FileStorage_free_file_after_exception_test.yml"); const std::string content = "%YAML:1.0\n cameraMatrix;:: !\n"; std::fstream testFile; @@ -1684,11 +1685,11 @@ TEST(Core_InputOutput, FileStorage_free_file_after_exception) TEST(Core_InputOutput, FileStorage_write_to_sequence) { const std::vector formatExts = { ".yml", ".json", ".xml" }; - const std::string fileName = "FileStorage_write_to_sequence"; - for (const auto& ext : formatExts) { - FileStorage fs(fileName + ext, FileStorage::WRITE); + const std::string name = tempfile(ext.c_str()); + + FileStorage fs(name, FileStorage::WRITE); std::vector in = { 23, 42 }; fs.startWriteStruct("some_sequence", cv::FileNode::SEQ); for (int i : in) @@ -1696,7 +1697,7 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence) fs.endWriteStruct(); fs.release(); - FileStorage fsIn(fileName + ext, FileStorage::READ); + FileStorage fsIn(name, FileStorage::READ); FileNode seq = fsIn["some_sequence"]; FileNodeIterator it = seq.begin(), it_end = 
seq.end(); std::vector out; @@ -1704,12 +1705,13 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence) out.push_back((int)*it); EXPECT_EQ(in, out); + EXPECT_EQ(0, remove(name.c_str())); } } TEST(Core_InputOutput, FileStorage_YAML_parse_multiple_documents) { - const std::string filename = "FileStorage_YAML_parse_multiple_documents.yml"; + const std::string filename = cv::tempfile("FileStorage_YAML_parse_multiple_documents.yml"); FileStorage fs; fs.open(filename, FileStorage::WRITE); diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 5f14767bd3..68075d15ab 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -475,12 +475,13 @@ TEST(Core_PCA, accuracy) ASSERT_LE(err, diffBackPrjEps) << "bad accuracy of cvBackProjectPCA() (CV_PCA_DATA_AS_COL)"; #endif // Test read and write - FileStorage fs( "PCA_store.yml", FileStorage::WRITE ); + const std::string filename = cv::tempfile("PCA_store.yml"); + FileStorage fs( filename, FileStorage::WRITE ); rPCA.write( fs ); fs.release(); PCA lPCA; - fs.open( "PCA_store.yml", FileStorage::READ ); + fs.open( filename, FileStorage::READ ); lPCA.read( fs.root() ); err = cvtest::norm(rPCA.eigenvectors, lPCA.eigenvectors, NORM_L2 | NORM_RELATIVE); EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)"; @@ -488,6 +489,7 @@ TEST(Core_PCA, accuracy) EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)"; err = cvtest::norm(rPCA.mean, lPCA.mean, NORM_L2 | NORM_RELATIVE); EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)"; + EXPECT_EQ(0, remove(filename.c_str())); } class Core_ArrayOpTest : public cvtest::BaseTest diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index b00d3f54a6..587eda102f 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -588,11 +588,11 @@ CV__DNN_INLINE_NS_BEGIN { public: virtual void forwardSlice(const float* src, float* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const {}; + size_t outPlaneSize, int cn0, int cn1) const {} virtual void forwardSlice(const int* src, const int* lut, int* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const {}; + size_t outPlaneSize, int cn0, int cn1) const {} virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const {}; + size_t outPlaneSize, int cn0, int cn1) const {} }; class CV_EXPORTS ReLULayer : public ActivationLayer diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 5fc3681762..46484e82c5 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -28,22 +28,28 @@ public: target = (dnn::Target)(int)get<1>(GetParam()); } - void processNet(std::string weights, std::string proto, const Mat& input, const std::string& outputLayer = "") - { - randu(input, 0.0f, 1.0f); - + void processNet(std::string weights, std::string proto, + const std::vector>& inputs, const std::string& outputLayer = ""){ weights = findDataFile(weights, false); if (!proto.empty()) proto = findDataFile(proto); net = readNet(proto, weights); - net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false)); + // Set multiple inputs + for(auto &inp: inputs){ + net.setInput(std::get<0>(inp), std::get<1>(inp)); + } + net.setPreferableBackend(backend); net.setPreferableTarget(target); - MatShape netInputShape = shape(1, 3, input.rows, input.cols); + // Calculate multiple inputs memory 
consumption + std::vector netMatShapes; + for(auto &inp: inputs){ + netMatShapes.push_back(shape(std::get<0>(inp))); + } size_t weightsMemory = 0, blobsMemory = 0; - net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory); - int64 flops = net.getFLOPS(netInputShape); + net.getMemoryConsumption(netMatShapes, weightsMemory, blobsMemory); + int64 flops = net.getFLOPS(netMatShapes); CV_Assert(flops > 0); net.forward(outputLayer); // warmup @@ -59,33 +65,48 @@ public: SANITY_CHECK_NOTHING(); } + + void processNet(std::string weights, std::string proto, + Mat &input, const std::string& outputLayer = "") + { + processNet(weights, proto, {std::make_tuple(input, "")}, outputLayer); + } + + void processNet(std::string weights, std::string proto, + Size inpSize, const std::string& outputLayer = "") + { + Mat input_data(inpSize, CV_32FC3); + randu(input_data, 0.0f, 1.0f); + Mat input = blobFromImage(input_data, 1.0, Size(), Scalar(), false); + processNet(weights, proto, input, outputLayer); + } }; PERF_TEST_P_(DNNTestNetwork, AlexNet) { - processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", Mat(cv::Size(227, 227), CV_32FC3)); + processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", cv::Size(227, 227)); } PERF_TEST_P_(DNNTestNetwork, GoogLeNet) { - processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", Mat(cv::Size(224, 224), CV_32FC3)); + processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", cv::Size(224, 224)); } PERF_TEST_P_(DNNTestNetwork, ResNet_50) { - processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", Mat(cv::Size(224, 224), CV_32FC3)); + processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", cv::Size(224, 224)); } PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1) { - processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", Mat(cv::Size(227, 227), CV_32FC3)); + processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", cv::Size(227, 227)); } PERF_TEST_P_(DNNTestNetwork, Inception_5h) { if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException(""); - processNet("dnn/tensorflow_inception_graph.pb", "", Mat(cv::Size(224, 224), CV_32FC3), "softmax2"); + processNet("dnn/tensorflow_inception_graph.pb", "", cv::Size(224, 224), "softmax2"); } PERF_TEST_P_(DNNTestNetwork, ENet) @@ -97,12 +118,12 @@ PERF_TEST_P_(DNNTestNetwork, ENet) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) throw SkipTestException(""); #endif - processNet("dnn/Enet-model-best.net", "", Mat(cv::Size(512, 256), CV_32FC3)); + processNet("dnn/Enet-model-best.net", "", cv::Size(512, 256)); } PERF_TEST_P_(DNNTestNetwork, SSD) { - processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", Mat(cv::Size(300, 300), CV_32FC3)); + processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", cv::Size(300, 300)); } PERF_TEST_P_(DNNTestNetwork, OpenFace) @@ -111,27 +132,27 @@ PERF_TEST_P_(DNNTestNetwork, OpenFace) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL)) throw SkipTestException(""); #endif - processNet("dnn/openface_nn4.small2.v1.t7", "", Mat(cv::Size(96, 96), CV_32FC3)); + processNet("dnn/openface_nn4.small2.v1.t7", "", cv::Size(96, 96)); } PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe) { - processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", 
Mat(cv::Size(300, 300), CV_32FC3)); + processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", cv::Size(300, 300)); } PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) { - processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", Mat(cv::Size(300, 300), CV_32FC3)); + processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", cv::Size(300, 300)); } PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) { - processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", Mat(cv::Size(300, 300), CV_32FC3)); + processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", cv::Size(300, 300)); } PERF_TEST_P_(DNNTestNetwork, DenseNet_121) { - processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Mat(cv::Size(224, 224), CV_32FC3)); + processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", cv::Size(224, 224)); } PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages) @@ -140,17 +161,17 @@ PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages) throw SkipTestException(""); // The same .caffemodel but modified .prototxt // See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp - processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", Mat(cv::Size(368, 368), CV_32FC3)); + processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", cv::Size(368, 368)); } PERF_TEST_P_(DNNTestNetwork, opencv_face_detector) { - processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", Mat(cv::Size(300, 300), CV_32FC3)); + processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", cv::Size(300, 300)); } PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow) { - processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", Mat(cv::Size(300, 300), CV_32FC3)); + processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", cv::Size(300, 300)); } PERF_TEST_P_(DNNTestNetwork, YOLOv3) @@ -168,9 +189,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3) #endif Mat sample = imread(findDataFile("dnn/dog416.png")); - cvtColor(sample, sample, COLOR_BGR2RGB); - Mat inp; - sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true); processNet("dnn/yolov3.weights", "dnn/yolov3.cfg", inp); } @@ -186,9 +205,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4) throw SkipTestException("Test is disabled in OpenVINO 2020.4"); #endif Mat sample = imread(findDataFile("dnn/dog416.png")); - cvtColor(sample, sample, COLOR_BGR2RGB); - Mat inp; - sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true); processNet("dnn/yolov4.weights", "dnn/yolov4.cfg", inp); } @@ -199,20 +216,39 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny) throw SkipTestException(""); #endif Mat sample = imread(findDataFile("dnn/dog416.png")); - cvtColor(sample, sample, COLOR_BGR2RGB); - Mat inp; - sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true); processNet("dnn/yolov4-tiny-2020-12.weights", "dnn/yolov4-tiny-2020-12.cfg", inp); } +PERF_TEST_P_(DNNTestNetwork, YOLOv5) { + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + Mat 
sample = imread(findDataFile("dnn/dog416.png")); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true); + processNet("", "dnn/yolov5n.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, YOLOv8) { + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + Mat sample = imread(findDataFile("dnn/dog416.png")); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true); + processNet("", "dnn/yolov8n.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, YOLOX) { + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + Mat sample = imread(findDataFile("dnn/dog416.png")); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true); + processNet("", "dnn/yolox_s.onnx", inp); +} + PERF_TEST_P_(DNNTestNetwork, EAST_text_detection) { - processNet("dnn/frozen_east_text_detection.pb", "", Mat(cv::Size(320, 320), CV_32FC3)); + processNet("dnn/frozen_east_text_detection.pb", "", cv::Size(320, 320)); } PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16) { - processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", Mat(cv::Size(320, 240), CV_32FC3)); + processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", cv::Size(320, 240)); } PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN) @@ -233,7 +269,8 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN) (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", - "dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", Mat(cv::Size(800, 600), CV_32FC3)); + "dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", + cv::Size(800, 600)); } PERF_TEST_P_(DNNTestNetwork, EfficientDet) @@ -241,12 +278,88 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet) if (target != DNN_TARGET_CPU) throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/dog416.png")); - resize(sample, sample, Size(512, 512)); - Mat inp; - sample.convertTo(inp, CV_32FC3, 1.0/255); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(512, 512), Scalar(), true); processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", inp); } +PERF_TEST_P_(DNNTestNetwork, EfficientNet) +{ + Mat sample = imread(findDataFile("dnn/dog416.png")); + Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(224, 224), Scalar(), true); + transposeND(inp, {0, 2, 3, 1}, inp); + processNet("", "dnn/efficientnet-lite4.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, YuNet) { + processNet("", "dnn/onnx/models/yunet-202303.onnx", cv::Size(640, 640)); +} + +PERF_TEST_P_(DNNTestNetwork, SFace) { + processNet("", "dnn/face_recognition_sface_2021dec.onnx", cv::Size(112, 112)); +} + +PERF_TEST_P_(DNNTestNetwork, MPPalm) { + Mat inp(cv::Size(192, 192), CV_32FC3); + randu(inp, 0.0f, 1.0f); + inp = blobFromImage(inp, 1.0, Size(), Scalar(), false); + transposeND(inp, {0, 2, 3, 1}, inp); + processNet("", "dnn/palm_detection_mediapipe_2023feb.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, MPHand) { + Mat inp(cv::Size(224, 224), CV_32FC3); + randu(inp, 0.0f, 1.0f); + inp = blobFromImage(inp, 1.0, Size(), Scalar(), false); + transposeND(inp, {0, 2, 3, 1}, inp); + processNet("", "dnn/handpose_estimation_mediapipe_2023feb.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, MPPose) { + Mat inp(cv::Size(256, 256), CV_32FC3); + randu(inp, 0.0f, 1.0f); + inp = blobFromImage(inp, 1.0, Size(), Scalar(), false); + transposeND(inp, {0, 2, 3, 1}, inp); + processNet("", "dnn/pose_estimation_mediapipe_2023mar.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, PPOCRv3) { + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + 
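In the YOLOv3/v4 and EfficientDet perf tests above, the hand-rolled preprocessing (cvtColor to RGB plus convertTo with a 1/255 scale) collapses into a single blobFromImage call, and the new YOLOv5/YOLOv8/YOLOX tests use the same form directly. Assuming the standard dnn::blobFromImage semantics (scalefactor, target size, mean subtraction, swapRB, and HWC-to-NCHW packing), the two forms should agree; a sketch of the equivalence, for illustration only:

    #include <opencv2/dnn.hpp>
    #include <opencv2/imgproc.hpp>

    using namespace cv;

    // Returns true when the consolidated call matches the old manual pipeline.
    static bool blobsMatch(const Mat& sampleBGR)
    {
        // New form: scale by 1/255, keep size, no mean subtraction, swap BGR->RGB.
        Mat blobNew = dnn::blobFromImage(sampleBGR, 1.0 / 255.0, Size(), Scalar(), true);

        // Old form: explicit color swap and float conversion, then the same packing.
        Mat rgb, f32;
        cvtColor(sampleBGR, rgb, COLOR_BGR2RGB);
        rgb.convertTo(f32, CV_32F, 1.0 / 255.0);
        Mat blobOld = dnn::blobFromImage(f32, 1.0, Size(), Scalar(), false);

        return norm(blobNew, blobOld, NORM_INF) == 0;
    }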
processNet("", "dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", cv::Size(736, 736)); +} + +PERF_TEST_P_(DNNTestNetwork, PPHumanSeg) { + processNet("", "dnn/human_segmentation_pphumanseg_2023mar.onnx", cv::Size(192, 192)); +} + +PERF_TEST_P_(DNNTestNetwork, CRNN) { + Mat inp(cv::Size(100, 32), CV_32FC1); + randu(inp, 0.0f, 1.0f); + inp = blobFromImage(inp, 1.0, Size(), Scalar(), false); + processNet("", "dnn/text_recognition_CRNN_EN_2021sep.onnx", inp); +} + +PERF_TEST_P_(DNNTestNetwork, ViTTrack) { + Mat inp1(cv::Size(128, 128), CV_32FC3); + Mat inp2(cv::Size(256, 256), CV_32FC3); + randu(inp1, 0.0f, 1.0f); + randu(inp2, 0.0f, 1.0f); + inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false); + inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false); + processNet("", "dnn/onnx/models/vitTracker.onnx", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")}); +} + + +PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8) +{ + if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV && + backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) { + throw SkipTestException(""); + } + Mat inp = imread(findDataFile("dnn/dog416.png")); + inp = blobFromImage(inp, 1.0 / 255.0, Size(320, 320), Scalar(), true); + processNet("", "dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", inp); +} + INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets()); } // namespace diff --git a/modules/dnn/src/graph_simplifier.hpp b/modules/dnn/src/graph_simplifier.hpp index 39d6262c1b..22bc50e3e5 100644 --- a/modules/dnn/src/graph_simplifier.hpp +++ b/modules/dnn/src/graph_simplifier.hpp @@ -17,7 +17,7 @@ namespace cv { namespace dnn { class ImportNodeWrapper { public: - virtual ~ImportNodeWrapper() {}; + virtual ~ImportNodeWrapper() {} virtual int getNumInputs() const = 0; @@ -33,7 +33,7 @@ public: class ImportGraphWrapper { public: - virtual ~ImportGraphWrapper() {}; + virtual ~ImportGraphWrapper() {} virtual Ptr getNode(int idx) const = 0; diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp index f9341febb5..7eeb62bcf6 100644 --- a/modules/dnn/src/ie_ngraph.cpp +++ b/modules/dnn/src/ie_ngraph.cpp @@ -590,7 +590,7 @@ void InfEngineNgraphNet::init(Target targetId) allBlobs[name] = ov::Tensor(src.get_element_type(), outShape, src.data()); } - ppp.output(i++).tensor().set_element_type(ov::element::f32); // Should be always FP32 + ppp.output(i++).tensor().set_element_type(src.get_element_type()); } ppp.build(); @@ -840,6 +840,8 @@ ov::Tensor wrapToNgraphBlob(const Mat& m) { return ov::Tensor(ov::element::f32, shape, m.data); else if (m.type() == CV_8U) return ov::Tensor(ov::element::u8, shape, m.data); + else if (m.type() == CV_8SC1) + return ov::Tensor(ov::element::i8, shape, m.data); else if (m.type() == CV_32SC1) return ov::Tensor(ov::element::i32, shape, m.data); else @@ -1234,6 +1236,32 @@ void InfEngineNgraphNet::forward(const std::vector >& outBlo #endif // OpenVINO >= 2022.1 } +ngraph::Output ngraphQuantize(ngraph::Output input, float output_sc, float output_zp) { + float outLow = -128, outHigh = 127; + float inpLow = output_sc * (outLow - output_zp); + float inpHigh = output_sc * (outHigh - output_zp); + return std::make_shared(input, + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &inpLow), + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &inpHigh), + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &outLow), + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &outHigh), + 256 // 
levels + ); +} + +ngraph::Output ngraphDequantize(ngraph::Output input, float input_sc, float input_zp) { + float inpLow = -128, inpHigh = 127; + float outLow = input_sc * (inpLow - input_zp); + float outHigh = input_sc * (inpHigh - input_zp); + return std::make_shared(input, + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &inpLow), + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &inpHigh), + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &outLow), + std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &outHigh), + 256 // levels + ); +} + #endif }} diff --git a/modules/dnn/src/ie_ngraph.hpp b/modules/dnn/src/ie_ngraph.hpp index cc8f53ca5c..8672f1a3c2 100644 --- a/modules/dnn/src/ie_ngraph.hpp +++ b/modules/dnn/src/ie_ngraph.hpp @@ -148,6 +148,9 @@ private: InferenceEngine::CNNNetwork t_net; }; +ngraph::Output ngraphQuantize(ngraph::Output input, float output_sc, float output_zp); +ngraph::Output ngraphDequantize(ngraph::Output input, float input_sc, float input_zp); + #endif // HAVE_DNN_NGRAPH }} // namespace cv::dnn diff --git a/modules/dnn/src/int8layers/batch_norm_layer.cpp b/modules/dnn/src/int8layers/batch_norm_layer.cpp index a3a9ebb261..7ef169deea 100644 --- a/modules/dnn/src/int8layers/batch_norm_layer.cpp +++ b/modules/dnn/src/int8layers/batch_norm_layer.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include @@ -110,7 +111,8 @@ public: return true; } - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool setActivation(const Ptr& layer) CV_OVERRIDE @@ -238,6 +240,27 @@ public: return Ptr(); } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector >& inputs, const std::vector >& nodes) CV_OVERRIDE + { + auto input = nodes[0].dynamicCast()->node; + + input = ngraphDequantize(input, input_sc, input_zp); + + std::vector shape(input.get_shape().size(), 1); + shape[1] = origin_weights.total(); + + ngraph::Output res; + auto ieWeights = std::make_shared(ngraph::element::f32, shape, origin_weights.data); + auto ieBias = std::make_shared(ngraph::element::f32, shape, origin_bias.data); + res = std::make_shared(input, ieWeights); + res = std::make_shared(res, ieBias); + + res = ngraphQuantize(res, output_sc, output_zp); + return new InfEngineNgraphNode(res); + } +#endif // HAVE_DNN_NGRAPH + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index 3d6f6bc824..60301a406c 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -10,6 +10,7 @@ #include "opencv2/core/hal/hal.hpp" #include "opencv2/core/hal/intrin.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include #include @@ -18,7 +19,7 @@ namespace cv namespace dnn { -#if CV_SIMD +#if CV_SIMD128 static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b, v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3) { @@ -195,7 +196,8 @@ public: } #endif // Only default backend and Conv1D/Conv2D/Conv3D are supported - return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3; + return (backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3) || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool 
getMemoryShapes(const std::vector &inputs, @@ -561,6 +563,126 @@ public: return Ptr(); } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector > &inputs, + const std::vector >& nodes) CV_OVERRIDE + { + CV_Assert(!blobs.empty()); + CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1); + CV_CheckTypeEQ(weightsMat.type(), CV_8S, ""); + auto ieInpNode = nodes[0].dynamicCast()->node; + std::vector dims = ieInpNode.get_shape(); + CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, ""); + CV_Assert(ieInpNode.get_element_type() == ngraph::element::f32); + ngraph::Output ieWeights; + if (nodes.size() > 1) + ieWeights = nodes[1].dynamicCast()->node; + const int inpCn = dims[1]; + const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1]; + const int group = inpCn / inpGroupCn; + + std::vector kernel_shape; + if (group != 1) + { + kernel_shape.push_back(group); + } + kernel_shape.push_back(numOutput / group); + kernel_shape.push_back(inpCn / group); + std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape)); + + if (nodes.size() == 1) + { + ieWeights = std::make_shared(ngraph::element::i8, kernel_shape, blobs[0].data); + } + else + { + auto shape = std::make_shared(ngraph::element::i64, + ngraph::Shape{kernel_shape.size()}, std::vector(kernel_shape.begin(), kernel_shape.end())); + ieWeights = std::make_shared(ieWeights, shape, true); + } + + ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT; + if (!padMode.empty()) + pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER; + + ieInpNode = ngraphDequantize(ieInpNode, input_sc, input_zp); + + const float low = -128, high = 127; + std::vector inpLows(numOutput, low); + std::vector inpHighs(numOutput, high); + std::vector outLows(numOutput); + std::vector outHighs(numOutput); + std::vector quantShape(kernel_shape.size(), 1); + if (group != 1) + { + quantShape[0] = group; + quantShape[1] = numOutput / group; + } + else + { + quantShape[0] = numOutput; + } + + for (int i = 0; i < numOutput; ++i) { + outLows[i] = low * outputMultiplier[i] * output_sc / input_sc; + outHighs[i] = high * outputMultiplier[i] * output_sc / input_sc; + } + ieWeights = std::make_shared(ieWeights, ngraph::element::f32); + ieWeights = std::make_shared(ieWeights, + std::make_shared(ngraph::element::f32, quantShape, inpLows.data()), + std::make_shared(ngraph::element::f32, quantShape, inpHighs.data()), + std::make_shared(ngraph::element::f32, quantShape, outLows.data()), + std::make_shared(ngraph::element::f32, quantShape, outHighs.data()), + 256 // levels + ); + + ngraph::Output conv_node; + if (group != 1) { + conv_node = std::make_shared( + ieInpNode, ieWeights, + ngraph::Strides(strides), + ngraph::CoordinateDiff(std::vector(pads_begin.begin(), pads_begin.end())), + ngraph::CoordinateDiff(std::vector(pads_end.begin(), pads_end.end())), + ngraph::Strides(dilations), + pad_type); + } else { + conv_node = std::make_shared( + ieInpNode, ieWeights, + ngraph::Strides(strides), + ngraph::CoordinateDiff(std::vector(pads_begin.begin(), pads_begin.end())), + ngraph::CoordinateDiff(std::vector(pads_end.begin(), pads_end.end())), + ngraph::Strides(dilations), + pad_type); + } + + std::vector shape(conv_node.get_shape().size(), 1); + shape[1] = conv_node.get_shape()[1]; + if (biasvec.size() || nodes.size() == 3) + { + std::shared_ptr bias; + if (nodes.size() == 3) + { + auto bias_shape = std::make_shared(ngraph::element::i64, + ngraph::Shape{shape.size()}, 
std::vector(shape.begin(), shape.end())); + bias = std::make_shared(nodes[2].dynamicCast()->node, bias_shape, true); + } + else + { + std::vector ovBias(numOutput); + for (int i = 0; i < numOutput; ++i) { + ovBias[i] = (biasvec[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier[i] * output_sc; + } + bias = std::make_shared(ngraph::element::f32, ngraph::Shape(shape), ovBias.data()); + } + conv_node = std::make_shared(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY); + } + + conv_node = ngraphQuantize(conv_node, output_sc, output_zp); + + return new InfEngineNgraphNode(conv_node); + } +#endif // HAVE_DNN_NGRAPH + class ParallelConv : public cv::ParallelLoopBody { public: @@ -893,7 +1015,7 @@ public: outptr[0] = std::min(std::max(out1, -128), 127); out_j = 1; } - #if CV_SIMD + #if CV_SIMD128 if( stride_w == 1 ) { const int out_delta = 16; diff --git a/modules/dnn/src/int8layers/elementwise_layers.cpp b/modules/dnn/src/int8layers/elementwise_layers.cpp index f1b78f48fb..2f40a4039f 100644 --- a/modules/dnn/src/int8layers/elementwise_layers.cpp +++ b/modules/dnn/src/int8layers/elementwise_layers.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include #include @@ -56,7 +57,7 @@ public: return tvActType != tvActNotSupported; } #endif - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool getMemoryShapes(const std::vector &inputs, @@ -244,6 +245,42 @@ public: return Ptr(); } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector > &inputs, + const std::vector >& nodes) CV_OVERRIDE + { + auto input = nodes[0].dynamicCast()->node; + + input = ngraphDequantize(input, input_sc, input_zp); + + ngraph::Output res; + if (type == "ReLU6Int8") { + res = std::make_shared(input, 0.0f, 6.0f); + } else if (type == "ReLUInt8") { + if (slope) { + auto param = std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &slope); + res = std::make_shared(input, param); + } else { + res = std::make_shared(input); + } + } else if (type == "ELUInt8") { + res = std::make_shared(input, 1.0f); + } else if (type == "MishInt8") { + res = std::make_shared(input); + } else if (type == "AbsValInt8") { + res = std::make_shared(input); + } else if (type == "SigmoidInt8") { + res = std::make_shared(input); + } else { + CV_Error(Error::StsNotImplemented, type + " activation with OpenVINO"); + } + + res = ngraphQuantize(res, output_sc, output_zp); + + return new InfEngineNgraphNode(res); + } +#endif // HAVE_DNN_NGRAPH + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); diff --git a/modules/dnn/src/int8layers/eltwise_layer.cpp b/modules/dnn/src/int8layers/eltwise_layer.cpp index e0a8d4787c..a3bb6ec2d6 100644 --- a/modules/dnn/src/int8layers/eltwise_layer.cpp +++ b/modules/dnn/src/int8layers/eltwise_layer.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include namespace cv @@ -138,7 +139,7 @@ public: // For TimVX Backend, only ELTWISE_CHANNNELS_SAME was supported. 
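From here on, the int8 layers (batch norm, convolution, element-wise activations, eltwise, fully connected, pooling, quantize/dequantize, scale, softmax) all gain OpenVINO support through one wrapping pattern: ngraphDequantize the int8 input to float, emit the ordinary float operation, then ngraphQuantize back to the layer's output scale and zero point, using the helpers added to ie_ngraph.cpp above. A scalar reference of the affine model those helpers implement (a sketch with illustrative names, not OpenVINO API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize: real = scale * (q - zero_point), matching the Dequantize
    // layer's convertTo(CV_32F, sc, -sc*zp).
    static float dequantize(int8_t q, float scale, float zeroPoint)
    {
        return scale * ((float)q - zeroPoint);
    }

    // Quantize: q = clamp(round(x / scale + zero_point), -128, 127), matching
    // the Quantize layer's convertTo(CV_8S, 1/sc, zp).
    static int8_t quantize(float x, float scale, float zeroPoint)
    {
        int q = (int)std::lround(x / scale + zeroPoint);
        return (int8_t)std::min(127, std::max(-128, q));
    }

ngraphQuantize encodes the same mapping as a FakeQuantize node: the input range [sc*(-128 - zp), sc*(127 - zp)] is mapped onto the output range [-128, 127] with 256 levels, and ngraphDequantize simply swaps the two ranges. The convolution and fully-connected hunks below extend this per output channel by folding outputMultiplier[i] * output_sc / input_sc into the weight ranges.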
if (backendId == DNN_BACKEND_TIMVX && haveTimVX()) return channelsModeInput == ELTWISE_CHANNNELS_SAME; - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool getMemoryShapes(const std::vector &inputs, @@ -369,6 +370,38 @@ public: return Ptr(); } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector > &inputs, + const std::vector >& nodes) CV_OVERRIDE + { + CV_Assert(nodes.size() >= 2); + std::vector> ieInpNodes(nodes.size()); + for (size_t i = 0; i < nodes.size(); i++) + { + ieInpNodes[i] = nodes[i].dynamicCast()->node; + + float input_sc = !coeffs.empty() ? coeffs[i] : 1.0f; + float input_zp = op == PROD ? zeropoints[i] : 0.0f; + ieInpNodes[i] = ngraphDequantize(ieInpNodes[i], input_sc, input_zp); + } + + auto res = ieInpNodes[0]; + for (size_t i = 1; i < ieInpNodes.size(); i++) + { + switch (op) { + case SUM: res = std::make_shared(res, ieInpNodes[i]); break; + case PROD: res = std::make_shared(res, ieInpNodes[i]); break; + case MAX: res = std::make_shared(res, ieInpNodes[i]); break; + default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); + } + } + + res = ngraphQuantize(res, 1.0f, offset); + + return new InfEngineNgraphNode(res); + } +#endif // HAVE_DNN_NGRAPH + class EltwiseInvoker : public ParallelLoopBody { EltwiseLayerInt8Impl& self; diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index 867f002dd4..b8e3bd6ee5 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include @@ -86,7 +87,8 @@ public: return false; } - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } virtual bool setActivation(const Ptr& layer) CV_OVERRIDE @@ -303,7 +305,7 @@ public: #endif { int i = 0; - #if CV_SIMD + #if CV_SIMD128 for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) { v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(), @@ -395,6 +397,77 @@ public: } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector > &inputs, + const std::vector >& nodes) CV_OVERRIDE + { + CV_CheckTypeEQ(blobs[0].type(), CV_8S, ""); // weights + CV_CheckTypeEQ(blobs[1].type(), CV_32S, ""); // bias + CV_CheckTypeEQ(outputMultiplier.type(), CV_32F, ""); + + ngraph::Output input = nodes[0].dynamicCast()->node; + ngraph::Output ieWeights, ieBias, matmul; + bool transA = false, transB = true; + size_t numOutput = blobs[0].size[0]; + + if (nodes.size() == 2) + { + CV_Error(Error::StsNotImplemented, ""); + // auto inp2 = nodes[1].dynamicCast()->node; + // matmul = std::make_shared(ieInpNode, inp2, transA, transB); + } + else + { + std::vector shape(1 + normalize_axis(axis, input.get_shape().size()), 0); + shape[shape.size() - 1] = -1; + input = std::make_shared( + input, + std::make_shared(ngraph::element::i32, ngraph::Shape{shape.size()}, shape.data()), + true + ); + + input = ngraphDequantize(input, input_sc, input_zp); + + const float low = -128, high = 127; + std::vector inpLows(numOutput, low); + std::vector inpHighs(numOutput, high); + std::vector outLows(numOutput); + std::vector outHighs(numOutput); + for (int i = 0; i < numOutput; ++i) { + outLows[i] = low * outputMultiplier.ptr()[i] * output_sc / input_sc; + outHighs[i] = high * 
outputMultiplier.ptr()[i] * output_sc / input_sc; + } + + std::vector weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]}; + ieWeights = std::make_shared(ngraph::element::i8, weight_shape, blobs[0].data); + ieWeights = std::make_shared(ieWeights, ngraph::element::f32); + ieWeights = std::make_shared(ieWeights, + std::make_shared(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpLows.data()), + std::make_shared(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpHighs.data()), + std::make_shared(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outLows.data()), + std::make_shared(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outHighs.data()), + 256 // levels + ); + matmul = std::make_shared(input, ieWeights, transA, transB); + } + + if (blobs.size() > 1) { + int32_t* bias = blobs[1].ptr(); + std::vector ovBias(blobs[1].total()); + for (int i = 0; i < ovBias.size(); ++i) { + ovBias[i] = (bias[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier.ptr()[i] * output_sc; + } + auto bias_node = std::make_shared(ngraph::element::f32, + ngraph::Shape{blobs[1].total()}, ovBias.data()); + matmul = std::make_shared(matmul, bias_node); + } + + matmul = ngraphQuantize(matmul, output_sc, output_zp); + + return new InfEngineNgraphNode(matmul); + } +#endif // HAVE_DNN_NGRAPH + Mat weightsMat, biasMat, outputMultiplier, activationLUT; Ptr activ; }; diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp b/modules/dnn/src/int8layers/pooling_layer.cpp index a2dda5eb07..bfff3d34c5 100644 --- a/modules/dnn/src/int8layers/pooling_layer.cpp +++ b/modules/dnn/src/int8layers/pooling_layer.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include "opencv2/core/hal/intrin.hpp" #include @@ -124,6 +125,10 @@ public: return type == MAX || type == AVE; return false; } + else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + { + return true; + } return false; } @@ -271,6 +276,49 @@ public: return Ptr(); } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector > &inputs, + const std::vector >& nodes) CV_OVERRIDE + { + auto input = nodes[0].dynamicCast()->node; + + input = ngraphDequantize(input, input_sc, input_zp); + + ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT; + if (!padMode.empty()) + pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER; + + auto rounding_type = ceilMode ? 
ngraph::op::RoundingType::CEIL : ngraph::op::RoundingType::FLOOR; + ngraph::Output pool; + if (type == MAX) { + pool = std::make_shared(input, ngraph::Strides(strides), + ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size), + rounding_type, pad_type); + } else if (type == AVE) { + pool = std::make_shared(input, ngraph::Strides(strides), + ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size), + !avePoolPaddedArea, rounding_type, pad_type); + } else if (type == SUM) { + ngraph::Shape inpShape = input.get_shape(); + CV_Assert(inpShape.size() == 2 + kernel_size.size()); + std::vector axes; + for (size_t i = 0; i < kernel_size.size(); i++) + { + if (inpShape[2 + i] == kernel_size[i]) + axes.push_back(2 + i); + } + auto reduction_axes = std::make_shared(ngraph::element::i64, ngraph::Shape{axes.size()}, axes); + pool = std::make_shared(input, reduction_axes, true); + } else { + CV_Error(Error::StsNotImplemented, format("INT8 Pooling type: %d", type)); + } + + pool = ngraphQuantize(pool, output_sc, output_zp); + + return new InfEngineNgraphNode(pool); + } +#endif // HAVE_DNN_NGRAPH + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); diff --git a/modules/dnn/src/int8layers/quantization_utils.cpp b/modules/dnn/src/int8layers/quantization_utils.cpp index a4a822efdd..ece2a2f355 100644 --- a/modules/dnn/src/int8layers/quantization_utils.cpp +++ b/modules/dnn/src/int8layers/quantization_utils.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" namespace cv { @@ -98,7 +99,8 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool getMemoryShapes(const std::vector &inputs, @@ -171,6 +173,16 @@ public: else inputs[0].convertTo(outputs[0], CV_8S, 1.f/scales[0], zeropoints[0]); } + +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector >& inputs, + const std::vector >& nodes) CV_OVERRIDE + { + const auto input = nodes[0].dynamicCast()->node; + auto quantized = ngraphQuantize(input, scales[0], zeropoints[0]); + return Ptr(new InfEngineNgraphNode(quantized)); + } +#endif // HAVE_DNN_NGRAPH }; // Dequantize INT8 Inputs to FP32/FP16 @@ -214,7 +226,7 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool getMemoryShapes(const std::vector &inputs, @@ -285,6 +297,16 @@ public: else inputs[0].convertTo(outputs[0], CV_32F, scales[0], -(scales[0]*zeropoints[0])); } + +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector >& inputs, + const std::vector >& nodes) CV_OVERRIDE + { + const auto input = nodes[0].dynamicCast()->node; + auto quantized = ngraphDequantize(input, scales[0], zeropoints[0]); + return new InfEngineNgraphNode(quantized); + } +#endif // HAVE_DNN_NGRAPH }; // Rescale/Requantize INT8 Inputs from (scale1, zeropoint1) to (scale2, zeropoint2) diff --git a/modules/dnn/src/int8layers/scale_layer.cpp b/modules/dnn/src/int8layers/scale_layer.cpp index d7f676d047..e50c4cea0e 100644 --- a/modules/dnn/src/int8layers/scale_layer.cpp +++ b/modules/dnn/src/int8layers/scale_layer.cpp @@ -6,6 +6,7 @@ #include "layers_common.hpp" #include #include +#include 
"../ie_ngraph.hpp" namespace cv { @@ -72,7 +73,8 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } bool setActivation(const Ptr& layer) CV_OVERRIDE @@ -186,6 +188,59 @@ public: return flops; } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector >& inputs, const std::vector >& nodes) CV_OVERRIDE + { + std::vector> ieInpNodes(nodes.size()); + for (int i = 0; i < nodes.size(); ++i) { + ieInpNodes[i] = nodes[i].dynamicCast()->node; + } + + ieInpNodes[0] = ngraphDequantize(ieInpNodes[0], inp_sc[0], inp_zp[0]); + + CV_Assert(!blobs.empty() || ieInpNodes.size() == 1 + (int)hasWeights + (int)hasBias); + + ngraph::Output weights, bias; + if (blobs.empty()) { + if (hasWeights) + weights = ieInpNodes[1]; + if (hasBias) + bias = ieInpNodes[1 + (int)hasWeights]; + } else { + std::vector shape = ieInpNodes[0].get_shape(); + int cAxis = normalize_axis(axis, shape.size()); + + size_t numWeights = blobs[0].total(); + for (int i = 0; i < cAxis; ++i) { + shape[i] = 1; + } + for (int i = cAxis; i < shape.size(); ++i) { + if (numWeights == 1) { + shape[i] = 1; + } + numWeights = std::max(numWeights / shape[i], (size_t)1); + } + + if (hasWeights) + weights = std::make_shared(ngraph::element::f32, shape, blobs[0].data); + if (hasBias) + bias = std::make_shared(ngraph::element::f32, shape, blobs[(int)hasWeights].data); + } + + ngraph::Output res = ieInpNodes[0]; + if (hasWeights) { + res = std::make_shared(res, weights); + } + if (hasBias) { + res = std::make_shared(res, bias); + } + + res = ngraphQuantize(res, output_sc, output_zp); + + return new InfEngineNgraphNode(res); + } +#endif // HAVE_DNN_NGRAPH + private: bool hasWeights; std::vector inp_sc; diff --git a/modules/dnn/src/int8layers/softmax_layer.cpp b/modules/dnn/src/int8layers/softmax_layer.cpp index 5096e541e6..28c6837cca 100644 --- a/modules/dnn/src/int8layers/softmax_layer.cpp +++ b/modules/dnn/src/int8layers/softmax_layer.cpp @@ -5,6 +5,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "../op_timvx.hpp" +#include "../ie_ngraph.hpp" #include #include @@ -90,7 +91,8 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_TIMVX && haveTimVX()); + (backendId == DNN_BACKEND_TIMVX && haveTimVX()) || + backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; } virtual bool tryFuse(Ptr& top) CV_OVERRIDE @@ -194,6 +196,26 @@ public: return Ptr(); } +#ifdef HAVE_DNN_NGRAPH + virtual Ptr initNgraph(const std::vector > &inputs, + const std::vector >& nodes) CV_OVERRIDE + { + auto input = nodes[0].dynamicCast()->node; + + input = ngraphDequantize(input, input_sc, input_zp); + + ngraph::Output res; + if (logSoftMax) { + res = std::make_shared(input, axis); + } else { + res = std::make_shared(input, axis); + } + + res = ngraphQuantize(res, output_sc, output_zp); + return new InfEngineNgraphNode(res); + } +#endif // HAVE_DNN_NGRAPH + template class SoftmaxInt8Invoker : public ParallelLoopBody { public: diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp index 34f9587825..2a1e27db56 100644 --- a/modules/dnn/src/layers/const_layer.cpp +++ b/modules/dnn/src/layers/const_layer.cpp @@ -62,10 +62,15 @@ public: { std::vector outputs; outs.getUMatVector(outputs); - if (outs.depth() == CV_16S) - convertFp16(blobs[0], outputs[0]); + if (outs.depth() == CV_16S) { + auto 
blob = blobs[0]; + if (blob.type() != CV_32F) { + blob.convertTo(blob, CV_32F); + } + convertFp16(blob, outputs[0]); + } else - blobs[0].copyTo(outputs[0]); + blobs[0].convertTo(outputs[0], outputs[0].type()); return true; } #endif @@ -80,7 +85,7 @@ public: std::vector outputs; outputs_arr.getMatVector(outputs); - blobs[0].copyTo(outputs[0]); + blobs[0].convertTo(outputs[0], outputs[0].type()); } #ifdef HAVE_CANN @@ -123,9 +128,23 @@ public: virtual Ptr initNgraph(const std::vector >& inputs, const std::vector >& nodes) CV_OVERRIDE { - auto node = std::make_shared(ngraph::element::f32, + ngraph::element::Type dType; + if (blobs[0].depth() == CV_32F) { + dType = ngraph::element::f32; + } else if (blobs[0].depth() == CV_32S) { + dType = ngraph::element::i32; + } else if (blobs[0].depth() == CV_8S) { + dType = ngraph::element::i8; + } else { + CV_Error(Error::StsNotImplemented, format("Unexpected Const data depth: %d", blobs[0].depth())); + } + std::shared_ptr node = + std::make_shared(dType, getShape(blobs[0]), blobs[0].data); + if (node->get_element_type() != ngraph::element::f32) { + node = std::make_shared(node, ngraph::element::f32); + } return Ptr(new InfEngineNgraphNode(node)); } #endif // HAVE_DNN_NGRAPH @@ -151,7 +170,11 @@ public: auto context = reinterpret_cast(context_); CV_Assert(blobs.size() == 1); - return make_cuda_node(preferableTarget, std::move(context->stream), blobs[0]); + Mat blob = blobs[0]; + if (blob.type() != CV_32F) { + blob.convertTo(blob, CV_32F); + } + return make_cuda_node(preferableTarget, std::move(context->stream), blob); } #endif diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 3bb5b12a54..281ab8a08f 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -201,8 +201,6 @@ public: }; -#define IS_POWER_LAYER(layer) \ - (!layer.empty() && !layer->type.compare("Power")) //TODO: simultaneously convolution and bias addition for cache optimization class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl { diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp index 6a8ef6b590..b9362bb4d5 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp @@ -12,16 +12,16 @@ #include #include // parallel_for_ -#define FAST_GEMM_DEFAULT_STORAGE (1<<20) // 2^20 -#define FAST_GEMM_DEFAULT_MAX_STACKBUF (1 << 14) +#define FAST_GEMM_STORAGE (1<<20) // 2^20 +#define FAST_GEMM_MAX_STACKBUF (1 << 14) -#define FAST_GEMM_DEFAULT_F32_MC 64 -#define FAST_GEMM_DEFAULT_F32_NC 240 -#define FAST_GEMM_DEFAULT_F32_MR 8 -#define FAST_GEMM_DEFAULT_F32_NR 12 -#define FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K 256 +#define FAST_GEMM_F32_MC 64 +#define FAST_GEMM_F32_NC 240 +#define FAST_GEMM_F32_MR 8 +#define FAST_GEMM_F32_NR 12 +#define FAST_GEMM_F32_PACKED_STRIDE_K 64 -#define FAST_GEMM_DEFAULT_IMPLEMENT_PACK(N, suffix, styp, dtyp) \ +#define FAST_GEMM_IMPLEMENT_PACK(N, suffix, styp, dtyp) \ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \ int lda0, int lda1, void* packA_ ) \ { \ @@ -32,47 +32,47 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \ const styp* a_ptr = A + lda0*i; \ for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \ { \ - FAST_GEMM_DEFAULT_LOAD_TO_BUF_##N(styp); \ - FAST_GEMM_DEFAULT_PACK##suffix##_##N(buf, packA); \ + 
FAST_GEMM_LOAD_TO_BUF_##N(styp); \ + FAST_GEMM_PACK##suffix##_##N(buf, packA); \ } \ } else { \ const styp* a_ptr[N]; \ for (int k = 0; k < N; k++) a_ptr[k] = A + lda0*(i+k < m ? i+k : i); \ for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \ { \ - FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_##N(styp); \ - FAST_GEMM_DEFAULT_PACK##suffix##_##N(buf, packA); \ + FAST_GEMM_LOAD_TO_BUF_BORDERS_##N(styp); \ + FAST_GEMM_PACK##suffix##_##N(buf, packA); \ } \ } \ } \ } -#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_8(styp) \ +#define FAST_GEMM_LOAD_TO_BUF_8(styp) \ styp buf[] = { \ a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \ a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7] } -#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_8(styp) \ +#define FAST_GEMM_LOAD_TO_BUF_BORDERS_8(styp) \ styp buf[] = { \ a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \ a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j] } -#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_12(styp) \ +#define FAST_GEMM_LOAD_TO_BUF_12(styp) \ styp buf[] = { \ a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \ a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7], \ a_ptr[j+lda0*8], a_ptr[j+lda0*9], a_ptr[j+lda0*10], a_ptr[j+lda0*11] } -#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_12(styp) \ +#define FAST_GEMM_LOAD_TO_BUF_BORDERS_12(styp) \ styp buf[] = { \ a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \ a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j], \ a_ptr[8][j], a_ptr[9][j], a_ptr[10][j], a_ptr[11][j] } -#define FAST_GEMM_DEFAULT_PACK_COPY(src, dst, N) \ +#define FAST_GEMM_PACK_COPY(src, dst, N) \ memcpy((dst), (src), N*sizeof(src[0])) -#define FAST_GEMM_DEFAULT_PACK_f32_8(src, dst) FAST_GEMM_DEFAULT_PACK_COPY((src), (dst), 8) -#define FAST_GEMM_DEFAULT_PACK_f32_12(src, dst) FAST_GEMM_DEFAULT_PACK_COPY((src), (dst), 12) +#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8) +#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12) namespace cv { namespace dnn { namespace cpu_baseline { @@ -88,20 +88,20 @@ void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *packed_B, float beta, char *C, int ldc, int esz); -FAST_GEMM_DEFAULT_IMPLEMENT_PACK(8, _f32, float, float) -FAST_GEMM_DEFAULT_IMPLEMENT_PACK(12, _f32, float, float) +FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) +FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) int fastGemmPackBSize(int N, int K) { - int GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, GEMM_NR = FAST_GEMM_DEFAULT_F32_NR; + int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; return static_cast((N + NC - 1) / NC) * NC * K; } void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) { - int GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, GEMM_NR = FAST_GEMM_DEFAULT_F32_NR; + int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; int NC = (((GEMM_NC < N ? 
GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K, K); + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); int n_tiles = (N + NC - 1) / NC; for (int r = 0; r < n_tiles; ++r) { @@ -116,140 +116,50 @@ void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, } } -#if CV_SIMD128 -static void fast_gemm8x12_f32(int k, const char *a_, const char *b_, - char *c_, int ldc, float alpha) { +static inline void fast_gemm_f32(int k, const char *a_, const char *b_, + char *c_, int ldc, float alpha) { const float* a = (const float*)a_; const float* b = (const float*)b_; float* c = (float*)c_; - v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; - v_float32x4 s10 = s00, s11 = s00, s12 = s00; - v_float32x4 s20 = s00, s21 = s00, s22 = s00; - v_float32x4 s30 = s00, s31 = s00, s32 = s00; - v_float32x4 s40 = s00, s41 = s00, s42 = s00; - v_float32x4 s50 = s00, s51 = s00, s52 = s00; - v_float32x4 s60 = s00, s61 = s00, s62 = s00; - v_float32x4 s70 = s00, s71 = s00, s72 = s00; - - for(int p = 0; p < k; p++, a += FAST_GEMM_DEFAULT_F32_MR, b += FAST_GEMM_DEFAULT_F32_NR) { - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); - - v_float32x4 a0 = v_setall_f32(*a); - s00 = v_fma(b0, a0, s00); - s01 = v_fma(b1, a0, s01); - s02 = v_fma(b2, a0, s02); - v_float32x4 a1 = v_setall_f32(*(a + 1)); - s10 = v_fma(b0, a1, s10); - s11 = v_fma(b1, a1, s11); - s12 = v_fma(b2, a1, s12); - - v_float32x4 a2 = v_setall_f32(*(a + 2)); - s20 = v_fma(b0, a2, s20); - s21 = v_fma(b1, a2, s21); - s22 = v_fma(b2, a2, s22); - v_float32x4 a3 = v_setall_f32(*(a + 3)); - s30 = v_fma(b0, a3, s30); - s31 = v_fma(b1, a3, s31); - s32 = v_fma(b2, a3, s32); - - a0 = v_setall_f32(*(a + 4)); - s40 = v_fma(b0, a0, s40); - s41 = v_fma(b1, a0, s41); - s42 = v_fma(b2, a0, s42); - a1 = v_setall_f32(*(a + 5)); - s50 = v_fma(b0, a1, s50); - s51 = v_fma(b1, a1, s51); - s52 = v_fma(b2, a1, s52); - - a2 = v_setall_f32(*(a + 6)); - s60 = v_fma(b0, a2, s60); - s61 = v_fma(b1, a2, s61); - s62 = v_fma(b2, a2, s62); - a3 = v_setall_f32(*(a + 7)); - s70 = v_fma(b0, a3, s70); - s71 = v_fma(b1, a3, s71); - s72 = v_fma(b2, a3, s72); - } - - v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha); -#define FAST_GEMM_FINALE(row0, row1) \ - c0 = v_load(c + row0 * ldc); \ - c1 = v_load(c + row0 * ldc + 4); \ - c2 = v_load(c + row0 * ldc + 8); \ - c3 = v_load(c + row1 * ldc); \ - c4 = v_load(c + row1 * ldc + 4); \ - c5 = v_load(c + row1 * ldc + 8); \ - c0 = v_fma(s##row0##0, v_alpha, c0); \ - c1 = v_fma(s##row0##1, v_alpha, c1); \ - c2 = v_fma(s##row0##2, v_alpha, c2); \ - c3 = v_fma(s##row1##0, v_alpha, c3); \ - c4 = v_fma(s##row1##1, v_alpha, c4); \ - c5 = v_fma(s##row1##2, v_alpha, c5); \ - v_store(c + row0 * ldc, c0); \ - v_store(c + row0 * ldc + 4, c1); \ - v_store(c + row0 * ldc + 8, c2); \ - v_store(c + row1 * ldc, c3); \ - v_store(c + row1 * ldc + 4, c4); \ - v_store(c + row1 * ldc + 8, c5); - - FAST_GEMM_FINALE(0, 1); - FAST_GEMM_FINALE(2, 3); - FAST_GEMM_FINALE(4, 5); - FAST_GEMM_FINALE(6, 7); -#undef FAST_GEMM_FINALE -} - -#else -static void fast_gemm_f32(int k, const char *a_, const char *b_, - char *c_, int ldc, float alpha) { - const float* a = (const float*)a_; - const float* b = (const float*)b_; - float* c = (float*)c_; - - float sbuf[FAST_GEMM_DEFAULT_F32_MR * FAST_GEMM_DEFAULT_F32_NR]; + float sbuf[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; memset(sbuf, 0, sizeof(sbuf)); for(int p = 0; p < k; p++) { - for( int i = 0; i < FAST_GEMM_DEFAULT_F32_MR; i++ 
) { - float ai = a[FAST_GEMM_DEFAULT_F32_MR * p + i]; - for( int j = 0; j < FAST_GEMM_DEFAULT_F32_NR; j++ ) - sbuf[i * FAST_GEMM_DEFAULT_F32_NR + j] += b[FAST_GEMM_DEFAULT_F32_NR * p + j] * ai; + for( int i = 0; i < FAST_GEMM_F32_MR; i++ ) { + float ai = a[FAST_GEMM_F32_MR * p + i]; + for( int j = 0; j < FAST_GEMM_F32_NR; j++ ) + sbuf[i * FAST_GEMM_F32_NR + j] += b[FAST_GEMM_F32_NR * p + j] * ai; } } - for (int i = 0; i < FAST_GEMM_DEFAULT_F32_MR; i++) { - for (int j = 0; j < FAST_GEMM_DEFAULT_F32_NR; j++) - c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_DEFAULT_F32_NR + j]; + for (int i = 0; i < FAST_GEMM_F32_MR; i++) { + for (int j = 0; j < FAST_GEMM_F32_NR; j++) + c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_F32_NR + j]; } } -#endif // CV_SIMD128 static void fast_gemm_macro_kernel(int m, int n, int k, const char *packed_A, const char *packed_B, float alpha, char *c, int ldc0, int esz) { int ldc0_esz = ldc0 * esz; - double tempC[FAST_GEMM_DEFAULT_F32_MR * FAST_GEMM_DEFAULT_F32_NR]; // make sure the buffer is big enough - for(int i = 0; i < m; i += FAST_GEMM_DEFAULT_F32_MR) { - for(int j = 0; j < n; j += FAST_GEMM_DEFAULT_F32_NR) { + double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough + for(int i = 0; i < m; i += FAST_GEMM_F32_MR) { + for(int j = 0; j < n; j += FAST_GEMM_F32_NR) { char* cptr0 = &c[i * ldc0_esz + j * esz]; char* cptr = cptr0; int ldc = ldc0; - int mr = m - i < FAST_GEMM_DEFAULT_F32_MR ? m - i : FAST_GEMM_DEFAULT_F32_MR; - int nr = n - j < FAST_GEMM_DEFAULT_F32_NR ? n - j : FAST_GEMM_DEFAULT_F32_NR; + int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR; + int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR; int nr_esz = nr * esz; - bool partial = (bool)((mr < FAST_GEMM_DEFAULT_F32_MR) | (nr < FAST_GEMM_DEFAULT_F32_NR)); + bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR)); if (partial) { memset(tempC, 0, sizeof(tempC)); cptr = (char *)tempC; - ldc = FAST_GEMM_DEFAULT_F32_NR; + ldc = FAST_GEMM_F32_NR; for(int p = 0; p < mr; p++) memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz); } -#if CV_SIMD128 - fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); -#else fast_gemm_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); -#endif if (partial) { for(int p = 0; p < mr; p++) @@ -263,19 +173,19 @@ void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz) { - int GEMM_MC = FAST_GEMM_DEFAULT_F32_MC, - GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, - GEMM_MR = FAST_GEMM_DEFAULT_F32_MR, - GEMM_NR = FAST_GEMM_DEFAULT_F32_NR; + int GEMM_MC = FAST_GEMM_F32_MC, + GEMM_NC = FAST_GEMM_F32_NC, + GEMM_MR = FAST_GEMM_F32_MR, + GEMM_NR = FAST_GEMM_F32_NR; int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = FAST_GEMM_DEFAULT_STORAGE / ((MC + NC) * esz); + int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz); KC = KC > 8 ? KC : 8; KC = KC < K ? 
KC : K; size_t buff_size = KC * (MC + NC) * esz; - bool use_stackbuff = buff_size <= FAST_GEMM_DEFAULT_MAX_STACKBUF; + bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; int m_tiles = (M + MC - 1) / MC; int n_tiles = (N + NC - 1) / NC; int total_tiles = m_tiles * n_tiles; @@ -328,17 +238,17 @@ void fastGemmKernel(int M, int N, int K, void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *packed_B, float beta, char *C, int ldc, int esz) { - int GEMM_MC = FAST_GEMM_DEFAULT_F32_MC, - GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, - GEMM_MR = FAST_GEMM_DEFAULT_F32_MR, - GEMM_NR = FAST_GEMM_DEFAULT_F32_NR; + int GEMM_MC = FAST_GEMM_F32_MC, + GEMM_NC = FAST_GEMM_F32_NC, + GEMM_MR = FAST_GEMM_F32_MR, + GEMM_NR = FAST_GEMM_F32_NR; int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K, K); + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); size_t buff_size = KC * MC * esz; - bool use_stackbuff = buff_size <= FAST_GEMM_DEFAULT_MAX_STACKBUF; + bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; int m_tiles = (M + MC - 1) / MC; int n_tiles = (N + NC - 1) / NC; int total_tiles = m_tiles * n_tiles; @@ -391,3 +301,29 @@ void fastGemmKernel(int M, int N, int K, } }}} // cv::dnn::cpu_baseline + +#undef FAST_GEMM_STORAGE +#undef FAST_GEMM_MAX_STACKBUF +#ifdef FAST_GEMM_F32_MC +#undef FAST_GEMM_F32_MC +#endif +#ifdef FAST_GEMM_F32_NC +#undef FAST_GEMM_F32_NC +#endif +#ifdef FAST_GEMM_F32_MR +#undef FAST_GEMM_F32_MR +#endif +#ifdef FAST_GEMM_F32_NR +#undef FAST_GEMM_F32_NR +#endif +#ifdef FAST_GEMM_F32_PACKED_STRIDE_K +#undef FAST_GEMM_F32_PACKED_STRIDE_K +#endif +#undef FAST_GEMM_IMPLEMENT_PACK +#undef FAST_GEMM_LOAD_TO_BUF_8 +#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8 +#undef FAST_GEMM_LOAD_TO_BUF_12 +#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12 +#undef FAST_GEMM_PACK_COPY +#undef FAST_GEMM_PACK_f32_8 +#undef FAST_GEMM_PACK_f32_12 diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp index 99a7d3b2d7..7d123ed9b5 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp @@ -15,37 +15,31 @@ #define FAST_GEMM_STORAGE (1<<20) // 2^20 #define FAST_GEMM_MAX_STACKBUF (1 << 14) -#if CV_NEON -#define FAST_GEMM_F32_MC 64 -#define FAST_GEMM_F32_NC 240 -#elif CV_AVX +#if CV_AVX #define FAST_GEMM_F32_MC 60 #define FAST_GEMM_F32_NC 320 #elif CV_LASX #define FAST_GEMM_F32_MC 48 #define FAST_GEMM_F32_NC 128 +#else // CV_NEON_AARCH64, SIMD128 +#define FAST_GEMM_F32_MC 64 +#define FAST_GEMM_F32_NC 240 #endif -// micro kernel size -#if CV_NEON && CV_NEON_AARCH64 -#define FAST_GEMM_F32_MR 8 -#define FAST_GEMM_F32_NR 12 -#elif CV_NEON -#define FAST_GEMM_F32_MR 4 -#define FAST_GEMM_F32_NR 12 -#elif CV_AVX +#if CV_AVX #define FAST_GEMM_F32_MR 12 #define FAST_GEMM_F32_NR 8 #elif CV_LASX #define FAST_GEMM_F32_MR 12 #define FAST_GEMM_F32_NR 16 +#else // CV_NEON_AARCH64, CV_SIMD128 +#define FAST_GEMM_F32_MR 8 +#define FAST_GEMM_F32_NR 12 #endif -#if CV_NEON -#define FAST_GEMM_F32_PACKED_STRIDE_K 64 -#elif CV_AVX +#if CV_AVX #define FAST_GEMM_F32_PACKED_STRIDE_K 128 -#elif CV_LASX +#else // CV_LASX, CV_NEON_AARCH64, CV_SIMD128 #define FAST_GEMM_F32_PACKED_STRIDE_K 64 #endif @@ -75,14 +69,6 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \ } \ } 
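// Aside: a rough, hedged sketch of the cache blocking that the FAST_GEMM_F32_{MC,NC,MR,NR}
// macros above configure. This is illustrative only (the helper name is invented and the
// real kernels pack A/B panels and run an MRxNR SIMD register-tiled micro-kernel);
// it assumes row-major operands and C already scaled by beta.
static void blocked_gemm_sketch(int M, int N, int K, const float* A,
                                const float* B, float* C /* MxN, row-major */) {
    const int MC = 64, NC = 240, KC = 64;    // panel sizes, cf. FAST_GEMM_F32_* above
    for (int i0 = 0; i0 < M; i0 += MC)       // MC rows of A kept cache-resident
    for (int j0 = 0; j0 < N; j0 += NC)       // NC columns of B kept cache-resident
    for (int k0 = 0; k0 < K; k0 += KC) {     // KC-deep panels, packed in the real code
        int mc = M - i0 < MC ? M - i0 : MC;
        int nc = N - j0 < NC ? N - j0 : NC;
        int kc = K - k0 < KC ? K - k0 : KC;
        for (int i = 0; i < mc; i++)         // the real code replaces these two loops
        for (int j = 0; j < nc; j++)         // with the MRxNR micro-kernel, accumulating
        for (int p = 0; p < kc; p++)         // across k0 panels exactly as += does here
            C[(i0 + i) * N + (j0 + j)] += A[(i0 + i) * K + (k0 + p)] * B[(k0 + p) * N + (j0 + j)];
    }
}
// Design note: KC is capped (FAST_GEMM_F32_PACKED_STRIDE_K / FAST_GEMM_STORAGE) so the
// packed A and B panels together fit in cache; MR/NR are sized to the register file.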
-#define FAST_GEMM_LOAD_TO_BUF_4(styp) \ - styp buf[] = { \ - a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3] } - -#define FAST_GEMM_LOAD_TO_BUF_BORDERS_4(styp) \ - styp buf[] = { \ - a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j] } - #define FAST_GEMM_LOAD_TO_BUF_8(styp) \ styp buf[] = { \ a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \ @@ -121,7 +107,6 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \ #define FAST_GEMM_PACK_COPY(src, dst, N) \ memcpy((dst), (src), N*sizeof(src[0])) -#define FAST_GEMM_PACK_f32_4(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 4) #define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8) #define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12) #define FAST_GEMM_PACK_f32_16(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 16) @@ -130,7 +115,6 @@ namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -// TODO: type to size_t int fastGemmPackBSize(int N, int K); void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz); @@ -143,44 +127,18 @@ void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *packed_B, float beta, char *C, int ldc, int esz); -// NEON (AARCH64: 32 x 128-bit registers, armv7: 16 x 128-bit registers) -#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -#if CV_NEON_AARCH64 -FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) -#else -FAST_GEMM_IMPLEMENT_PACK(4, _f32, float, float) -#endif -FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) +/* + Compute kernels that optimized for different platforms +*/ +#if CV_NEON && CV_NEON_AARCH64 // AARCH64: 32 x 128-bit registers -int fastGemmPackBSize(int N, int K) { - int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; +FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer +FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer - return static_cast((N + NC - 1) / NC) * NC * K; -} - -void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) { - int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); - - int n_tiles = (N + NC - 1) / NC; - for (int r = 0; r < n_tiles; ++r) { - int j0 = r * NC; - int nc = N - j0 < NC ? N - j0 : NC; - int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; - for (int k = 0; k < K; k += KC) { - int kc = K - k < KC ? 
K - k : KC; - fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); - packed_B += _nc * kc; - } - } -} - -#if CV_NEON_AARCH64 -static void fast_gemm8x12_f32(int k, const char *a_, const char *b_, - char *c_, int ldc, float alpha) { +static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_, + char *c_, int ldc, float alpha) { const float* a = (const float*)a_; const float* b = (const float*)b_; float* c = (float*)c_; @@ -258,278 +216,17 @@ static void fast_gemm8x12_f32(int k, const char *a_, const char *b_, #undef FAST_GEMM_FINALE } -#else // CV_NEON_AARCH64 -static void fast_gemm4x12_f32(int k, const char *a_, const char *b_, - char *c_, int ldc, float alpha) { - const float* a = (const float*)a_; - const float* b = (const float*)b_; - float* c = (float*)c_; +#elif CV_AVX // AVX and AVX2 (16 x 256-bit registers) - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, - s10 = s00, s11 = s00, s12 = s00, - s20 = s00, s21 = s00, s22 = s00, - s30 = s00, s31 = s00, s32 = s00; - - for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) - { - float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); - - float32x4_t a0 = vld1q_dup_f32(a); - s00 = vmlaq_f32(a0, b0, s00); - s01 = vmlaq_f32(a0, b1, s01); - s02 = vmlaq_f32(a0, b2, s02); - - a0 = vld1q_dup_f32(a + 1); - s10 = vmlaq_f32(a0, b0, s10); - s11 = vmlaq_f32(a0, b1, s11); - s12 = vmlaq_f32(a0, b2, s12); - - a0 = vld1q_dup_f32(a + 2); - s20 = vmlaq_f32(a0, b0, s20); - s21 = vmlaq_f32(a0, b1, s21); - s22 = vmlaq_f32(a0, b2, s22); - - a0 = vld1q_dup_f32(a + 3); - s30 = vmlaq_f32(a0, b0, s30); - s31 = vmlaq_f32(a0, b1, s31); - s32 = vmlaq_f32(a0, b2, s32); - } - - float32x4_t c0, c1, c2, v_alpha = vdupq_n_f32(alpha); -#define FAST_GEMM_FINALE(row0) \ - c0 = vld1q_f32(c + row0 * ldc); \ - c1 = vld1q_f32(c + row0 * ldc + 4); \ - c2 = vld1q_f32(c + row0 * ldc + 8); \ - c0 = vmlaq_f32(c0, s##row0##0, v_alpha); \ - c1 = vmlaq_f32(c1, s##row0##1, v_alpha); \ - c2 = vmlaq_f32(c2, s##row0##2, v_alpha); \ - vst1q_f32(c + row0 * ldc, c0); \ - vst1q_f32(c + row0 * ldc + 4, c1); \ - vst1q_f32(c + row0 * ldc + 8, c2); - - FAST_GEMM_FINALE(0); - FAST_GEMM_FINALE(1); - FAST_GEMM_FINALE(2); - FAST_GEMM_FINALE(3); -#undef FAST_GEMM_FINALE -} - -#endif // micro kernel CV_NEON_AARCH64 - -static void fast_gemm_macro_kernel(int m, int n, int k, - const char *packed_A, const char *packed_B, - float alpha, char *c, int ldc0, int esz) { - int ldc0_esz = ldc0 * esz; - - double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough - for(int i = 0; i < m; i += FAST_GEMM_F32_MR) { - for(int j = 0; j < n; j += FAST_GEMM_F32_NR) { - char* cptr0 = &c[i * ldc0_esz + j * esz]; - char* cptr = cptr0; - int ldc = ldc0; - int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR; - int nr = n - j < FAST_GEMM_F32_NR ? 
n - j : FAST_GEMM_F32_NR; - int nr_esz = nr * esz; - bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR)); - if (partial) { - memset(tempC, 0, sizeof(tempC)); - cptr = (char *)tempC; - ldc = FAST_GEMM_F32_NR; - for(int p = 0; p < mr; p++) - memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz); - } -#if CV_NEON_AARCH64 - fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); -#else - fast_gemm4x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); -#endif - - if (partial) { - for(int p = 0; p < mr; p++) - memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz); - } - } - } -} - -void fastGemmKernel(int M, int N, int K, - float alpha, const char *A, int lda0, int lda1, - const char *B, int ldb0, int ldb1, - float beta, char *C, int ldc, int esz) { - int GEMM_MC = FAST_GEMM_F32_MC, - GEMM_NC = FAST_GEMM_F32_NC, - GEMM_MR = FAST_GEMM_F32_MR, - GEMM_NR = FAST_GEMM_F32_NR; - - int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz); - KC = KC > 8 ? KC : 8; - KC = KC < K ? KC : K; - - size_t buff_size = KC * (MC + NC) * esz; - bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; - int m_tiles = (M + MC - 1) / MC; - int n_tiles = (N + NC - 1) / NC; - int total_tiles = m_tiles * n_tiles; - - auto fn = [&](const Range &r) { - char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); - char* packed_b = packed_a + KC * MC * esz; - int start = r.start; - int end = r.end; - - for (int tile_idx = start; tile_idx < end; tile_idx++) { - int i0 = (tile_idx / n_tiles) * MC; - int j0 = (tile_idx % n_tiles) * NC; - int mc = M - i0 < MC ? M - i0 : MC; - int nc = N - j0 < NC ? N - j0 : NC; - int ldc_block = ldc; - char* c_block = C + (i0 * ldc + j0) * esz; - - if (beta == 0.f) { - for(int i = 0; i < mc; i++) - memset(c_block + i * ldc_block * esz, 0, nc * esz); - } else if (beta != 1.f) { - for(int i = 0; i < mc; i++) { - float* c_i = (float*)c_block + i * ldc_block; - for(int j = 0; j < nc; j++) - c_i[j] *= beta; - } - } - - for(int k0 = 0; k0 < K; k0 += KC) - { - int kc = K - k0 < KC ? K - k0 : KC; -#if CV_NEON_AARCH64 - fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); -#else - fast_gemm_pack4_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); -#endif - fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); - fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); - } - } - - if (!use_stackbuff) { - free(packed_a); - } - }; - - int total = total_tiles; - int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); - double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); - parallel_for_(Range(0, total), fn, nstripes); -} - -void fastGemmKernel(int M, int N, int K, - float alpha, const char *A, int lda0, int lda1, - const char *packed_B, float beta, char *C, int ldc, int esz) { - int GEMM_MC = FAST_GEMM_F32_MC, - GEMM_NC = FAST_GEMM_F32_NC, - GEMM_MR = FAST_GEMM_F32_MR, - GEMM_NR = FAST_GEMM_F32_NR; - - int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; - int NC = (((GEMM_NC < N ? 
GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); - - size_t buff_size = KC * MC * esz; - bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; - int m_tiles = (M + MC - 1) / MC; - int n_tiles = (N + NC - 1) / NC; - int total_tiles = m_tiles * n_tiles; - - auto fn = [&](const Range &r) { - char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer - const char *packed_b_ = packed_B; - int start = r.start; - int end = r.end; - - for (int tile_idx = start; tile_idx < end; tile_idx++) { - int i0 = (tile_idx / n_tiles) * MC; - int j0 = (tile_idx % n_tiles) * NC; - int mc = M - i0 < MC ? M - i0 : MC; - int nc = N - j0 < NC ? N - j0 : NC; - int ldc_block = ldc; - char* c_block = C + (i0 * ldc + j0) * esz; - packed_b_ = packed_B + j0 * K * esz; - - if (beta == 0.f) { - for(int i = 0; i < mc; i++) - memset(c_block + i * ldc_block * esz, 0, nc * esz); - } else if (beta != 1.f) { - for(int i = 0; i < mc; i++) { - float* c_i = (float*)c_block + i * ldc_block; - for(int j = 0; j < nc; j++) - c_i[j] *= beta; - } - } - - int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; - for(int k0 = 0; k0 < K; k0 += KC) - { - int kc = K - k0 < KC ? K - k0 : KC; -#if CV_NEON_AARCH64 - fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); -#else - fast_gemm_pack4_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); -#endif - fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz); - packed_b_ += _nc * kc; - } - } - - if (!use_stackbuff) { - free(packed_a); - } - }; - - int total = total_tiles; - int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); - double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); - parallel_for_(Range(0, total), fn, nstripes); -} - -#endif // CV_NEON, CV_NEON_AARCH64 - -// AVX and AVX2 (16 x 256-bit registers) -#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX - -FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) -FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) - -int fastGemmPackBSize(int N, int K) { - int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - - return static_cast((N + NC - 1) / NC) * NC * K; -} - -void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) { - int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); - - int n_tiles = (N + NC - 1) / NC; - for (int r = 0; r < n_tiles; ++r) { - int j0 = r * NC; - int nc = N - j0 < NC ? N - j0 : NC; - int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; - for (int k = 0; k < K; k += KC) { - int kc = K - k < KC ? 
K - k : KC; - fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); - packed_B += _nc * kc; - } - } -} +FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer +FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer #if !CV_FMA3 // AVX workaround for FMA #undef _mm256_fmadd_ps #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -static void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) { +static inline void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) { const float* a = (const float*)a_; const float* b = (const float*)b_; float* c = (float*)c_; @@ -599,203 +296,12 @@ static void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, i #undef FAST_GEMM_FINALE } -static void fast_gemm_macro_kernel(int m, int n, int k, - const char *packed_A, const char *packed_B, - float alpha, char *c, int ldc0, int esz) { - int ldc0_esz = ldc0 * esz; +#elif CV_LASX // LASX (32 x 256-bit registers) - double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough - for(int i = 0; i < m; i += FAST_GEMM_F32_MR) { - for(int j = 0; j < n; j += FAST_GEMM_F32_NR) { - char* cptr0 = &c[i * ldc0_esz + j * esz]; - char* cptr = cptr0; - int ldc = ldc0; - int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR; - int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR; - int nr_esz = nr * esz; - bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR)); - if (partial) { - memset(tempC, 0, sizeof(tempC)); - cptr = (char *)tempC; - ldc = FAST_GEMM_F32_NR; - for(int p = 0; p < mr; p++) - memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz); - } - fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); +FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // a packer +FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float) // b packer - if (partial) { - for(int p = 0; p < mr; p++) - memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz); - } - } - } -} - -void fastGemmKernel(int M, int N, int K, - float alpha, const char *A, int lda0, int lda1, - const char *B, int ldb0, int ldb1, - float beta, char *C, int ldc, int esz) { - int GEMM_MC = FAST_GEMM_F32_MC, - GEMM_NC = FAST_GEMM_F32_NC, - GEMM_MR = FAST_GEMM_F32_MR, - GEMM_NR = FAST_GEMM_F32_NR; - - int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz); - KC = KC > 8 ? KC : 8; - KC = KC < K ? KC : K; - - size_t buff_size = KC * (MC + NC) * esz; - bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; - int m_tiles = (M + MC - 1) / MC; - int n_tiles = (N + NC - 1) / NC; - int total_tiles = m_tiles * n_tiles; - - auto fn = [&](const Range &r) { - char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); - char* packed_b = packed_a + KC * MC * esz; - int start = r.start; - int end = r.end; - - for (int tile_idx = start; tile_idx < end; tile_idx++) { - int i0 = (tile_idx / n_tiles) * MC; - int j0 = (tile_idx % n_tiles) * NC; - int mc = M - i0 < MC ? M - i0 : MC; - int nc = N - j0 < NC ? 
N - j0 : NC; - int ldc_block = ldc; - char* c_block = C + (i0 * ldc + j0) * esz; - - if (beta == 0.f) { - for(int i = 0; i < mc; i++) - memset(c_block + i * ldc_block * esz, 0, nc * esz); - } else if (beta != 1.f) { - for(int i = 0; i < mc; i++) { - float* c_i = (float*)c_block + i * ldc_block; - for(int j = 0; j < nc; j++) - c_i[j] *= beta; - } - } - - for(int k0 = 0; k0 < K; k0 += KC) - { - int kc = K - k0 < KC ? K - k0 : KC; - fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); - fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); - fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); - } - } - - if (!use_stackbuff) { - free(packed_a); - } - }; - - int total = total_tiles; - int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); - double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); - parallel_for_(Range(0, total), fn, nstripes); -} - -void fastGemmKernel(int M, int N, int K, - float alpha, const char *A, int lda0, int lda1, - const char *packed_B, float beta, char *C, int ldc, int esz) { - int GEMM_MC = FAST_GEMM_F32_MC, - GEMM_NC = FAST_GEMM_F32_NC, - GEMM_MR = FAST_GEMM_F32_MR, - GEMM_NR = FAST_GEMM_F32_NR; - - int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); - - size_t buff_size = KC * MC * esz; - bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF; - int m_tiles = (M + MC - 1) / MC; - int n_tiles = (N + NC - 1) / NC; - int total_tiles = m_tiles * n_tiles; - - auto fn = [&](const Range &r) { - char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer - const char *packed_b_ = packed_B; - int start = r.start; - int end = r.end; - - for (int tile_idx = start; tile_idx < end; tile_idx++) { - int i0 = (tile_idx / n_tiles) * MC; - int j0 = (tile_idx % n_tiles) * NC; - int mc = M - i0 < MC ? M - i0 : MC; - int nc = N - j0 < NC ? N - j0 : NC; - int ldc_block = ldc; - char* c_block = C + (i0 * ldc + j0) * esz; - packed_b_ = packed_B + j0 * K * esz; - - if (beta == 0.f) { - for(int i = 0; i < mc; i++) - memset(c_block + i * ldc_block * esz, 0, nc * esz); - } else if (beta != 1.f) { - for(int i = 0; i < mc; i++) { - float* c_i = (float*)c_block + i * ldc_block; - for(int j = 0; j < nc; j++) - c_i[j] *= beta; - } - } - - int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; - for(int k0 = 0; k0 < K; k0 += KC) - { - int kc = K - k0 < KC ? K - k0 : KC; - fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); - fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz); - packed_b_ += _nc * kc; - } - } - - if (!use_stackbuff) { - free(packed_a); - } - }; - - int total = total_tiles; - int cost_per_thread = static_cast((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR)); - double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0); - parallel_for_(Range(0, total), fn, nstripes); -} - -#endif // CV_AVX, CV_AVX2 - -// LASX (32 x 256-bit registers) -#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX - -FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) -FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float) - -int fastGemmPackBSize(int N, int K) { - int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; - int NC = (((GEMM_NC < N ? 
GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - - return static_cast((N + NC - 1) / NC) * NC * K; -} - -void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) { - int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; - int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; - int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); - - int n_tiles = (N + NC - 1) / NC; - for (int r = 0; r < n_tiles; ++r) { - int j0 = r * NC; - int nc = N - j0 < NC ? N - j0 : NC; - int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; - for (int k = 0; k < K; k += KC) { - int kc = K - k < KC ? K - k : KC; - fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); - packed_B += _nc * kc; - } - } -} - -static void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) { +static inline void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) { const float* a = (const float*)a_; const float* b = (const float*)b_; float* c = (float*)c_; @@ -889,9 +395,99 @@ static void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, #undef FAST_GEMM_FINALE } -static void fast_gemm_macro_kernel(int m, int n, int k, - const char *packed_A, const char *packed_B, - float alpha, char *c, int ldc0, int esz) { +#elif CV_SIMD128 // armv7: 16 x 128-bit registers + +FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer +FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer + +static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_, + char *c_, int ldc, float alpha) { + const float* a = (const float*)a_; + const float* b = (const float*)b_; + float* c = (float*)c_; + + v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; + v_float32x4 s10 = s00, s11 = s00, s12 = s00; + v_float32x4 s20 = s00, s21 = s00, s22 = s00; + v_float32x4 s30 = s00, s31 = s00, s32 = s00; + v_float32x4 s40 = s00, s41 = s00, s42 = s00; + v_float32x4 s50 = s00, s51 = s00, s52 = s00; + v_float32x4 s60 = s00, s61 = s00, s62 = s00; + v_float32x4 s70 = s00, s71 = s00, s72 = s00; + + for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) { + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + + v_float32x4 a0 = v_setall_f32(*a); + s00 = v_fma(b0, a0, s00); + s01 = v_fma(b1, a0, s01); + s02 = v_fma(b2, a0, s02); + v_float32x4 a1 = v_setall_f32(*(a + 1)); + s10 = v_fma(b0, a1, s10); + s11 = v_fma(b1, a1, s11); + s12 = v_fma(b2, a1, s12); + + v_float32x4 a2 = v_setall_f32(*(a + 2)); + s20 = v_fma(b0, a2, s20); + s21 = v_fma(b1, a2, s21); + s22 = v_fma(b2, a2, s22); + v_float32x4 a3 = v_setall_f32(*(a + 3)); + s30 = v_fma(b0, a3, s30); + s31 = v_fma(b1, a3, s31); + s32 = v_fma(b2, a3, s32); + + a0 = v_setall_f32(*(a + 4)); + s40 = v_fma(b0, a0, s40); + s41 = v_fma(b1, a0, s41); + s42 = v_fma(b2, a0, s42); + a1 = v_setall_f32(*(a + 5)); + s50 = v_fma(b0, a1, s50); + s51 = v_fma(b1, a1, s51); + s52 = v_fma(b2, a1, s52); + + a2 = v_setall_f32(*(a + 6)); + s60 = v_fma(b0, a2, s60); + s61 = v_fma(b1, a2, s61); + s62 = v_fma(b2, a2, s62); + a3 = v_setall_f32(*(a + 7)); + s70 = v_fma(b0, a3, s70); + s71 = v_fma(b1, a3, s71); + s72 = v_fma(b2, a3, s72); + } + + v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha); +#define FAST_GEMM_FINALE(row0, row1) \ + c0 = v_load(c + row0 * ldc); \ + c1 = v_load(c + row0 * ldc + 4); \ + c2 = v_load(c + row0 * ldc + 8); \ + c3 = v_load(c + row1 * ldc); \ + c4 = 
v_load(c + row1 * ldc + 4); \ + c5 = v_load(c + row1 * ldc + 8); \ + c0 = v_fma(s##row0##0, v_alpha, c0); \ + c1 = v_fma(s##row0##1, v_alpha, c1); \ + c2 = v_fma(s##row0##2, v_alpha, c2); \ + c3 = v_fma(s##row1##0, v_alpha, c3); \ + c4 = v_fma(s##row1##1, v_alpha, c4); \ + c5 = v_fma(s##row1##2, v_alpha, c5); \ + v_store(c + row0 * ldc, c0); \ + v_store(c + row0 * ldc + 4, c1); \ + v_store(c + row0 * ldc + 8, c2); \ + v_store(c + row1 * ldc, c3); \ + v_store(c + row1 * ldc + 4, c4); \ + v_store(c + row1 * ldc + 8, c5); + + FAST_GEMM_FINALE(0, 1); + FAST_GEMM_FINALE(2, 3); + FAST_GEMM_FINALE(4, 5); + FAST_GEMM_FINALE(6, 7); +#undef FAST_GEMM_FINALE +} + +#endif + +static inline void fast_gemm_macro_kernel(int m, int n, int k, + const char *packed_A, const char *packed_B, + float alpha, char *c, int ldc0, int esz) { int ldc0_esz = ldc0 * esz; double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough @@ -911,7 +507,15 @@ static void fast_gemm_macro_kernel(int m, int n, int k, for(int p = 0; p < mr; p++) memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz); } +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); +#elif CV_AVX + fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); +#elif CV_LASX fast_gemm12x16_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); +#elif CV_SIMD128 + fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha); +#endif if (partial) { for(int p = 0; p < mr; p++) @@ -921,6 +525,39 @@ static void fast_gemm_macro_kernel(int m, int n, int k, } } +int fastGemmPackBSize(int N, int K) { + int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; + int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; + + return static_cast((N + NC - 1) / NC) * NC * K; +} + +void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) { + int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR; + int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR; + int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K); + + int n_tiles = (N + NC - 1) / NC; + for (int r = 0; r < n_tiles; ++r) { + int j0 = r * NC; + int nc = N - j0 < NC ? N - j0 : NC; + int _nc = static_cast((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz; + for (int k = 0; k < K; k += KC) { + int kc = K - k < KC ? K - k : KC; +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); +#elif CV_AVX + fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); +#elif CV_LASX + fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); +#elif CV_SIMD128 + fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B); +#endif + packed_B += _nc * kc; + } + } +} + void fastGemmKernel(int M, int N, int K, float alpha, const char *A, int lda0, int lda1, const char *B, int ldb0, int ldb1, @@ -970,8 +607,29 @@ void fastGemmKernel(int M, int N, int K, for(int k0 = 0; k0 < K; k0 += KC) { int kc = K - k0 < KC ? 
K - k0 : KC; + // pack a +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_AVX fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_LASX + fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_SIMD128 + fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#endif + + // pack b +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#elif CV_AVX + fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#elif CV_LASX fast_gemm_pack16_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#elif CV_SIMD128 + fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b); +#endif + + // run kernel fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz); } } @@ -1035,7 +693,18 @@ void fastGemmKernel(int M, int N, int K, for(int k0 = 0; k0 < K; k0 += KC) { int kc = K - k0 < KC ? K - k0 : KC; + // pack a +#if CV_NEON && CV_NEON_AARCH64 + fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_AVX fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_LASX + fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#elif CV_SIMD128 + fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a); +#endif + + // run kernel fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz); packed_b_ += _nc * kc; } @@ -1052,8 +721,37 @@ void fastGemmKernel(int M, int N, int K, parallel_for_(Range(0, total), fn, nstripes); } -#endif // CV_LASX +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END }} // cv::dnn + +#undef FAST_GEMM_STORAGE +#undef FAST_GEMM_MAX_STACKBUF +#ifdef FAST_GEMM_F32_MC +#undef FAST_GEMM_F32_MC +#endif +#ifdef FAST_GEMM_F32_NC +#undef FAST_GEMM_F32_NC +#endif +#ifdef FAST_GEMM_F32_MR +#undef FAST_GEMM_F32_MR +#endif +#ifdef FAST_GEMM_F32_NR +#undef FAST_GEMM_F32_NR +#endif +#ifdef FAST_GEMM_F32_PACKED_STRIDE_K +#undef FAST_GEMM_F32_PACKED_STRIDE_K +#endif +#undef FAST_GEMM_IMPLEMENT_PACK +#undef FAST_GEMM_LOAD_TO_BUF_8 +#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8 +#undef FAST_GEMM_LOAD_TO_BUF_12 +#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12 +#undef FAST_GEMM_LOAD_TO_BUF_16 +#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_16 +#undef FAST_GEMM_PACK_COPY +#undef FAST_GEMM_PACK_f32_8 +#undef FAST_GEMM_PACK_f32_12 +#undef FAST_GEMM_PACK_f32_16 diff --git a/modules/dnn/src/layers/cumsum_layer.cpp b/modules/dnn/src/layers/cumsum_layer.cpp index 9c70f306d4..0104b2d568 100644 --- a/modules/dnn/src/layers/cumsum_layer.cpp +++ b/modules/dnn/src/layers/cumsum_layer.cpp @@ -47,73 +47,76 @@ public: inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); - // Get x tensor. - const auto &src_mat = inputs[0]; - const auto *src_ptr = src_mat.ptr(); + // Get input tensor. + const auto& src_mat = inputs[0]; + const auto* src_ptr = src_mat.ptr(); - // Get axis. - const int axis = normalize_axis(axis_raw, src_mat.dims); + // Get target axis. + int axis = inputs.size() > 1 ? parseAxis(inputs[1]) : axis_raw; + axis = normalize_axis(axis, src_mat.dims); - // Get y tensor. 
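// Aside: a minimal 1-D reference for the ONNX CumSum semantics this rewrite implements.
// Hedged sketch only; the layer applies the same recurrence along one axis of an N-D
// blob, and the helper name below is invented for illustration.
static void cumsum_1d_ref(const float* x, float* y, int n, bool exclusive, bool reverse) {
    const int start = reverse ? n - 1 : 0;
    const int stop  = reverse ? -1 : n;
    const int step  = reverse ? -1 : 1;
    float acc = 0.f;
    for (int i = start; i != stop; i += step) {
        y[i] = exclusive ? acc : acc + x[i]; // exclusive: sum of strictly preceding elements
        acc += x[i];
    }
}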
- auto &dst_mat = outputs[0]; - src_mat.copyTo(dst_mat); - auto *dst_ptr = dst_mat.ptr(); + + // Get output tensor. + auto& dst_mat = outputs[0]; + auto* dst_ptr = dst_mat.ptr(); // Get flags. const auto exclusive = exclusive_raw == 1; const auto reverse = reverse_raw == 1; - // Get parameters to iterate outer dimension. + // Data with [dim_1, .. , dim_k-1, target_dim, dim_k+1, .. , dim_n] + // dimensions is represented here as [outer_dim, target_dim, inner_dim] const size_t outer_size = src_mat.total(0, axis); - const size_t outer_step_length = src_mat.total(axis); + const size_t target_size = src_mat.size[axis]; + const size_t inner_size = src_mat.total(axis + 1); + const size_t outer_step_length = target_size * inner_size; - // Get parameters to iterate inner dimension. - const size_t inner_size = src_mat.size[axis]; + // Calculating steps in target dimensions + const int target_start = reverse ? target_size - 1 : 0; + const int target_stop = reverse ? -1 : target_size; + const int target_delta = reverse ? -1 : 1; + const int target_step = target_delta * inner_size; - if (!inner_size) - return; + // If exclusive, the j-th output element would be the sum of the first (j-1) elements. + // Otherwise, it would be the sum of the first j elements. + const int exclusive_delta = exclusive ? target_step : 0; - const size_t inner_step_length = src_mat.total(axis + 1); - const int inner_step = (reverse ? -1 : 1) * inner_step_length; - const int inner_start = reverse ? inner_size - 1 : 0; - const int inner_stop = reverse ? -1 : inner_size; - const int inner_delta = reverse ? -1 : 1; - - // Get parameters to populate channels. - const size_t num_channels = src_mat.total(axis + 1); - - for (size_t outer_dim = 0; outer_dim < outer_size; outer_dim++) + for (size_t outer_idx = 0; outer_idx < outer_size; outer_idx++) { - const size_t outer_offset = outer_dim * outer_step_length; - size_t src_offset = outer_offset + inner_start * inner_step_length; + const size_t target_offset = outer_idx * outer_step_length; - // Populate first element of inner dimension. - for (size_t channel = 0; channel < num_channels; channel++) + // Handle first element of target dimension. + size_t first_inner_offset = target_offset + target_start * inner_size; + if (exclusive) + for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++) + dst_ptr[first_inner_offset + inner_idx] = 0.0f; + else + for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++) + dst_ptr[first_inner_offset + inner_idx] = src_ptr[first_inner_offset + inner_idx]; + + // Handle remaining elements of target dimension. + for (int target_idx = target_start + target_delta; target_idx != target_stop; target_idx += target_delta) { - if (exclusive) + const size_t inner_offset = target_offset + target_idx * inner_size; + + for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++) { - dst_ptr[src_offset + channel] = 0.0f; - } - else - { - dst_ptr[src_offset + channel] = src_ptr[src_offset + channel]; - src_offset += inner_step; + dst_ptr[inner_offset + inner_idx] = dst_ptr[inner_offset - target_step + inner_idx] + + src_ptr[inner_offset - exclusive_delta + inner_idx]; } } + } + } - // Populate remaining elements of inner dimension. 
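// Aside: the flattened indexing the new loop relies on. Any N-D tensor is viewed as
// [outer, target, inner] around `axis`, so element (o, t, i) sits at one flat offset.
// The helper is hypothetical, for illustration only (requires <cstddef> for size_t).
static inline size_t flat_index(size_t o, size_t t, size_t i,
                                size_t target_size, size_t inner_size) {
    // outer stride = target_size * inner_size, target stride = inner_size
    return (o * target_size + t) * inner_size + i;
}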
-        for (int inner_dim = inner_start + inner_delta; inner_dim != inner_stop; inner_dim += inner_delta)
-        {
-            const size_t dst_offset = outer_offset + inner_dim * inner_step_length;
-
-            for (size_t channel = 0; channel < num_channels; channel++)
-            {
-                const size_t previous_dst_offset = dst_offset - inner_step;
-                dst_ptr[dst_offset + channel] = dst_ptr[previous_dst_offset + channel] +
-                                                src_ptr[src_offset + channel];
-                src_offset += inner_step;
-            }
-        }
+    int parseAxis(const Mat& axis_mat) {
+        CV_CheckEQ(axis_mat.total(), 1u, "Axis tensor should contain single value");
+        if (axis_mat.type() == CV_32SC1)
+            return axis_mat.at<int>(0);
+        else
+        {
+            Mat axis_mat_int;
+            axis_mat.convertTo(axis_mat_int, CV_32SC1);
+            return axis_mat_int.at<int>(0);
         }
     }
diff --git a/modules/dnn/src/layers/einsum_layer.cpp b/modules/dnn/src/layers/einsum_layer.cpp
index cfa06e375e..2cfb36da13 100644
--- a/modules/dnn/src/layers/einsum_layer.cpp
+++ b/modules/dnn/src/layers/einsum_layer.cpp
@@ -38,7 +38,6 @@ Mat batchwiseMatMul(
     const Mat& input2,
     const MatShape& input2ShapeOverride)
 {
-    // Sanity checks before the actual MatMul
     //input_1.DataType() == input_2.DataType(), "Data types of the inputs must match for MatMul");
@@ -391,6 +390,15 @@ public:
                  OutputArrayOfArrays outputs_arr,
                  OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16S)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
         // homogenize inputs
         preProcessInputs(inputs_arr);
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 9da308ec77..58c2cf3998 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -984,13 +984,7 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
 #ifdef HAVE_DNN_NGRAPH
     std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
-        float one = 1.0f;
-        auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
-        auto exp_node = std::make_shared<ngraph::op::Exp>(node);
-        auto sum = std::make_shared<ngraph::op::v1::Add>(constant, exp_node, ngraph::op::AutoBroadcastType::NUMPY);
-        auto log_node = std::make_shared<ngraph::op::Log>(sum);
-        auto tanh_node = std::make_shared<ngraph::op::Tanh>(log_node);
-        return std::make_shared<ngraph::op::v1::Multiply>(node, tanh_node);
+        return std::make_shared<ngraph::op::v4::Mish>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
@@ -1190,10 +1184,7 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
 #ifdef HAVE_DNN_NGRAPH
     std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
-        float coeff = -0.999999f;
-        // float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
-        auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeff);
-        return std::make_shared<ngraph::op::PRelu>(node, slope);
+        return std::make_shared<ngraph::op::Abs>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
@@ -2563,11 +2554,6 @@
 template<>
 const char* const ReciprocalFunctor::BaseDefaultFunctor<ReciprocalFunctor>::ocl_kernel_name = "ReciprocalForward";
 
-#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...)
\ -Ptr<_Layer> _Layer::create() { \ - return return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); } - - Ptr ReLULayer::create(const LayerParams& params) { float negativeSlope = params.get("negative_slope", 0.f); diff --git a/modules/dnn/src/layers/gemm_layer.cpp b/modules/dnn/src/layers/gemm_layer.cpp index 0a58abce5d..a553f97568 100644 --- a/modules/dnn/src/layers/gemm_layer.cpp +++ b/modules/dnn/src/layers/gemm_layer.cpp @@ -191,7 +191,6 @@ public: size_t dims_Y = shape_Y.size(); int M = shape_Y[dims_Y - 2], N = shape_Y[dims_Y - 1]; int K = trans_a ? ma : na; - int batches = std::accumulate(shape_A.begin(), shape_A.end() - 2, 1, std::multiplies()); // broadcast C and copy C to output if (have_bias) { @@ -201,9 +200,7 @@ public: int step = M * N; CV_CheckEQ(broadcast_C.size(), static_cast(step), "DNN/Gemm: C is not broadcast properly"); float *ptr_y = Y.ptr(); - for (int i = 0; i < batches; i++) { - std::memcpy(ptr_y + i * step, broadcast_C.data(), step * sizeof(float)); - } + std::memcpy(ptr_y, broadcast_C.data(), step * sizeof(float)); } else { // initialization float *ptr_y = Y.ptr(); size_t total = Y.total(); @@ -212,7 +209,6 @@ public: if (const_B) { CV_CheckGT(packed_B.size(), static_cast(0), "DNN/Gemm: constant B is not pre-packed"); - M *= batches; fastGemm(trans_a, M, N, K, alpha, A.ptr(), na, packed_B.data(), 1.f, Y.ptr(), N, opt); } else { fastGemmBatched(trans_a, trans_b, alpha, A, inputs[1], 1.f, Y, opt); diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index e2085e944b..41077eda47 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -359,11 +359,11 @@ public: { auto& ieInpNode = nodes[0].dynamicCast()->node; int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size()); - auto softmax = std::make_shared(ieInpNode, axis); - if (logSoftMax) - return Ptr(new InfEngineNgraphNode(std::make_shared(softmax))); - - return Ptr(new InfEngineNgraphNode(softmax)); + if (logSoftMax) { + return new InfEngineNgraphNode(std::make_shared(ieInpNode, axis)); + } else { + return new InfEngineNgraphNode(std::make_shared(ieInpNode, axis)); + } } #endif // HAVE_DNN_NGRAPH diff --git a/modules/dnn/src/legacy_backend.cpp b/modules/dnn/src/legacy_backend.cpp index 16e33595dd..6f216f75fc 100644 --- a/modules/dnn/src/legacy_backend.cpp +++ b/modules/dnn/src/legacy_backend.cpp @@ -23,7 +23,7 @@ BackendNode::BackendNode(int backendId) : backendId(backendId) {} -BackendNode::~BackendNode() {}; +BackendNode::~BackendNode() {} BackendWrapper::BackendWrapper(int backendId, int targetId) : backendId(backendId) diff --git a/modules/dnn/src/model.cpp b/modules/dnn/src/model.cpp index 8d1a788956..64b2706d38 100644 --- a/modules/dnn/src/model.cpp +++ b/modules/dnn/src/model.cpp @@ -306,9 +306,9 @@ void ClassificationModel::classify(InputArray frame, int& classId, float& conf) } KeypointsModel::KeypointsModel(const String& model, const String& config) - : Model(model, config) {}; + : Model(model, config) {} -KeypointsModel::KeypointsModel(const Net& network) : Model(network) {}; +KeypointsModel::KeypointsModel(const Net& network) : Model(network) {} std::vector KeypointsModel::estimate(InputArray frame, float thresh) { @@ -364,9 +364,9 @@ std::vector KeypointsModel::estimate(InputArray frame, float thresh) } SegmentationModel::SegmentationModel(const String& model, const String& config) - : Model(model, config) {}; + : Model(model, config) {} -SegmentationModel::SegmentationModel(const Net& network) : 
Model(network) {}; +SegmentationModel::SegmentationModel(const Net& network) : Model(network) {} void SegmentationModel::segment(InputArray frame, OutputArray mask) { diff --git a/modules/dnn/src/net_impl_backend.cpp b/modules/dnn/src/net_impl_backend.cpp index ac1424d262..cd12e6a21b 100644 --- a/modules/dnn/src/net_impl_backend.cpp +++ b/modules/dnn/src/net_impl_backend.cpp @@ -155,11 +155,19 @@ void Net::Impl::setPreferableBackend(Net& net, int backendId) if (backendId == DNN_BACKEND_INFERENCE_ENGINE) backendId = DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; // = getInferenceEngineBackendTypeParam(); - if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX) + if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX && + backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { - CV_LOG_WARNING(NULL, "DNN: Only default and TIMVX backends support quantized networks"); + CV_LOG_WARNING(NULL, "DNN: Only default, TIMVX and OpenVINO backends support quantized networks"); backendId = DNN_BACKEND_OPENCV; } +#ifdef HAVE_DNN_NGRAPH + if (netWasQuantized && backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2023_0)) + { + CV_LOG_WARNING(NULL, "DNN: OpenVINO 2023.0 and higher is required to support quantized networks"); + backendId = DNN_BACKEND_OPENCV; + } +#endif if (preferableBackend != backendId) { diff --git a/modules/dnn/src/net_openvino.cpp b/modules/dnn/src/net_openvino.cpp index c274f44a87..adcfea60f0 100644 --- a/modules/dnn/src/net_openvino.cpp +++ b/modules/dnn/src/net_openvino.cpp @@ -48,7 +48,6 @@ public: CV_Assert(basePtr_); Net::Impl& base = *basePtr_; CV_Assert(!base.netWasAllocated); - CV_Assert(!base.netWasQuantized); netInputLayer = base.netInputLayer; blobsToKeep = base.blobsToKeep; layers = base.layers; diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 1dc4d056c7..9cec94c1fc 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -383,7 +383,7 @@ void runLayer(LayerParams& params, const std::vector& inputs, { inpShapes[i] = shape(inputs[i]); if (i > 0 && ddepth != inputs[i].depth()) - CV_Error(Error::StsNotImplemented, "Mixed input data types."); + CV_Error(Error::StsNotImplemented, cv::format("Mixed input data types. Required type: %d, actual type: %d", ddepth, inputs[i].depth())); // Quantize and Dequantize layer have different output type than input.
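For context on the two net_impl_backend.cpp hunks above, here is a usage sketch, not code from this patch: a quantized cv::dnn::Net may now keep the OpenVINO (nGraph) backend instead of being silently demoted to the default one, and the new version gate falls back to DNN_BACKEND_OPENCV on builds older than OpenVINO 2023.0. The model path and input blob below are placeholders.

#include <opencv2/dnn.hpp>
#include <vector>

cv::Mat runQuantizedOnOpenVINO(const cv::Mat& blob)
{
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");      // placeholder model
    std::vector<cv::Mat> calib = { blob };                   // placeholder calibration data
    cv::dnn::Net qnet = net.quantize(calib, CV_8S, CV_8S);   // int8 inputs/outputs
    qnet.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
    qnet.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    qnet.setInput(blob);
    return qnet.forward();  // demoted to DNN_BACKEND_OPENCV when OpenVINO < 2023.0
}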
if (params.type != "Quantize" && params.type != "Dequantize") @@ -1502,7 +1502,7 @@ void ONNXImporter::lstm_extractConsts(LayerParams& layerParams, const opencv_onn blob = Mat(blobShape, CV_32FC1, 0.); } layerParams.blobs.push_back(blob); -}; +} void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::string& output_name, int* layerShape, size_t n) { @@ -1517,7 +1517,7 @@ void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::st reshape_proto.add_input(input_name); reshape_proto.add_output(output_name); addLayer(reshapeLp, reshape_proto); -}; +} std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_name, int* begin, int* end, size_t n) { @@ -1536,7 +1536,7 @@ std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_nam addLayer(sliceLP, slice_proto); return slice_proto.output(0); -}; +} std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_onnx::NodeProto& lstm_proto, int batch_size, int num_directions, int hidden_size, bool need_y, const std::string& y_name, @@ -1564,7 +1564,7 @@ std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_o addLayer(permuteLP, permute_proto); return permute_proto.output(0); -}; +} void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hidden_size, int index, const std::string& input_name, const std::string& output_name) @@ -1606,7 +1606,7 @@ void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hi int layerShape[] = {2, batch_size, hidden_size}; lstm_add_reshape(concat_proto.output(0), output_name, layerShape, sizeof(layerShape) / sizeof(layerShape[0])); } -}; +} void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) { diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index 45913d3b31..c5a2e58683 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -27,6 +27,7 @@ #define INF_ENGINE_RELEASE_2021_3 2021030000 #define INF_ENGINE_RELEASE_2021_4 2021040000 #define INF_ENGINE_RELEASE_2022_1 2022010000 +#define INF_ENGINE_RELEASE_2023_0 2023000000 #ifndef INF_ENGINE_RELEASE #warning("IE version have not been provided via command-line. Using 2021.4 by default") diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 756bdc949c..28744c586a 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -3227,7 +3227,7 @@ void TFLayerHandler::fillRegistry(const tensorflow::GraphDef& net) } } printMissing(); -}; +} bool TFLayerHandler::handleMissing(const tensorflow::NodeDef& layer) { diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 6940fee99b..2faa2c8e60 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -151,10 +151,12 @@ TEST_P(DNNTestNetwork, ENet) { applyTestTag(target == DNN_TARGET_CPU ? 
"" : CV_TEST_TAG_MEMORY_512MB); +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) @@ -482,7 +484,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) Mat img = imread(findDataFile("dnn/googlenet_1.png")); Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false); // Output image has values in range [-143.526, 148.539]. - float l1 = 2e-4, lInf = 2e-3; + float l1 = 2e-4, lInf = 2.4e-3; if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) { l1 = 0.4; @@ -875,8 +877,12 @@ TEST_P(MaxPooling, Accuracy) Target targetId = get<1>(get<5>(GetParam())); // https://github.com/openvinotoolkit/openvino/issues/18731 - if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1)) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1)) { + int ow = ceil(static_cast(inSize.width + 2 * pad.width - kernel.width) / stride.width); + int oh = ceil(static_cast(inSize.height + 2 * pad.height - kernel.height) / stride.height); + if (ow * stride.width >= inSize.width + pad.width || oh * stride.height >= inSize.height + pad.height) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + } #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD @@ -1026,10 +1032,12 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, SoftMax, testing::Combine( ////////////////////////////////////////////////////////////////////////////// TEST_P(Test_layers_backends, MaxPoolUnpool) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif LayerParams pool; pool.set("pool", "max"); diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp index 97fb456ddc..075cc4c069 100644 --- a/modules/dnn/test/test_int8_layers.cpp +++ b/modules/dnn/test/test_int8_layers.cpp @@ -14,6 +14,9 @@ testing::internal::ParamGenerator< tuple > dnnBackendsAndTarget targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)); #ifdef HAVE_TIMVX targets.push_back(make_tuple(DNN_BACKEND_TIMVX, DNN_TARGET_NPU)); +#endif +#ifdef HAVE_INF_ENGINE + targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU)); #endif return testing::ValuesIn(targets); } @@ -66,8 +69,6 @@ public: outPath = _tf("onnx/data/output_" + basename); } ASSERT_FALSE(net.empty()); - net.setPreferableBackend(backend); - net.setPreferableTarget(target); for (int i = 0; i < numInps; i++) inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? 
cv::format("_%d.npy", i) : ".npy")); @@ -78,6 +79,8 @@ public: qnet = net.quantize(inps, CV_8S, CV_8S, perChannel); qnet.getInputDetails(inputScale, inputZp); qnet.getOutputDetails(outputScale, outputZp); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); // Quantize inputs to int8 // int8_value = float_value/scale + zero-point @@ -98,7 +101,7 @@ public: if (out_i.dims == 2 && ref_i.dims == 1) { ref_i = ref_i.reshape(1, 1); } - normAssert(ref_i, out_i, "", l1, lInf); + normAssert(ref_i, out_i, basename.c_str(), l1, lInf); } } }; @@ -201,10 +204,13 @@ TEST_P(Test_Int8_layers, Padding) TEST_P(Test_Int8_layers, AvePooling) { - testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075); + // Some tests failed with OpenVINO due to wrong padded area calculation + if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075); testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041); testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048); - testLayer("average_pooling", "ONNX", 0.0014, 0.0032); + if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + testLayer("average_pooling", "ONNX", 0.0014, 0.0032); testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006); if (target != DNN_TARGET_CPU) @@ -220,8 +226,6 @@ TEST_P(Test_Int8_layers, MaxPooling) throw SkipTestException("Only CPU is supported"); testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124); - /* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true - which is not supported by int8 maxpooling testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004); testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139); testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012); @@ -231,7 +235,7 @@ TEST_P(Test_Int8_layers, MaxPooling) testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052); testLayer("maxpooling", "ONNX", 0.0034, 0.0065); testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052); - testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/ + testLayer("max_pool3d", "ONNX", 0.0028, 0.0069); } TEST_P(Test_Int8_layers, Reduce) @@ -326,7 +330,10 @@ TEST_P(Test_Int8_layers, DISABLED_Softmax_unfused_ONNX) // FIXIT Support 'Ident TEST_P(Test_Int8_layers, Concat) { testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false); - testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017); + if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { + // Crashes with segfault + testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017); + } testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089); testLayer("concat_3d", "TensorFlow", 0.005, 0.014); testLayer("concatenation", "ONNX", 0.0032, 0.009); @@ -404,10 +411,13 @@ TEST_P(Test_Int8_layers, Reshape) testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029); testLayer("reshape_conv", "TensorFlow", 0.035, 0.054); - testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + testLayer("reshape_reduce", "TensorFlow", 0.0053, 0.011); + else + testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078); testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028); testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028); - testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014); + testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 
0.016 : 0.014); testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079); testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081); testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081); @@ -495,10 +505,10 @@ TEST_P(Test_Int8_layers, Eltwise) testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false); testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047); - testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365 + testLayer("eltwise_add_vec", "TensorFlow", 0.037, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.24 : 0.21); // tflite 0.0095, 0.0365 testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017 testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063); - testLayer("split_equals", "TensorFlow", 0.02, 0.065); + testLayer("split_equals", "TensorFlow", backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.021 : 0.02, 0.065); testLayer("mul", "ONNX", 0.0039, 0.014); testLayer("split_max", "ONNX", 0.004, 0.012); } @@ -555,10 +565,10 @@ public: Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb")); Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb")); Net baseNet = readNetFromONNX(onnxmodel); - baseNet.setPreferableBackend(backend); - baseNet.setPreferableTarget(target); Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); qnet.setInput(blob); Mat out = qnet.forward(); @@ -703,9 +713,6 @@ TEST_P(Test_Int8_nets, AlexNet) #else applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); #endif - if (backend != DNN_BACKEND_OPENCV) - throw SkipTestException("Only OpenCV backend is supported"); - if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) @@ -746,8 +753,6 @@ TEST_P(Test_Int8_nets, GoogLeNet) TEST_P(Test_Int8_nets, ResNet50) { applyTestTag(target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); - if (backend != DNN_BACKEND_OPENCV) - throw SkipTestException("Only OpenCV backend is supported"); if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); @@ -778,6 +783,8 @@ TEST_P(Test_Int8_nets, DenseNet121) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false), findDataFile("dnn/DenseNet_121.caffemodel", false)); @@ -959,6 +966,8 @@ TEST_P(Test_Int8_nets, opencv_face_detector) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"), findDataFile("dnn/opencv_face_detector.caffemodel", false)); @@ -1025,7 +1034,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_resnet50) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); - + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); @@ -1052,7 +1062,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); - + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); @@ -1083,6 +1094,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_vgg16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"), findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false)); @@ -1110,6 +1123,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_zf) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"), findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false)); @@ -1142,6 +1157,9 @@ TEST_P(Test_Int8_nets, RFCN) 0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16); float confThreshold = 0.8, scoreDiff = 0.15, iouDiff = 0.11; + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { + iouDiff = 0.12; + } testFaster(net, ref, confThreshold, scoreDiff, iouDiff); } @@ -1321,6 +1339,8 @@ TEST_P(Test_Int8_nets, YOLOv4_tiny) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel()) 
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); const float confThreshold = 0.6; diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index cbc9067ffb..536bc7dcc4 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -413,10 +413,12 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc) TEST_P(Test_Caffe_layers, Reshape_Split_Slice) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif Net net = readNetFromCaffe(_tf("reshape_and_slice_routines.prototxt")); ASSERT_FALSE(net.empty()); @@ -795,8 +797,10 @@ TEST_P(Test_Caffe_layers, DataAugmentation) TEST_P(Test_Caffe_layers, Resample) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend != DNN_BACKEND_OPENCV) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif testLayerUsingCaffeModels("nearest_2inps", false, false, 0.0, 0.0, 2); testLayerUsingCaffeModels("nearest", false, false); } diff --git a/modules/dnn/test/test_main.cpp b/modules/dnn/test/test_main.cpp index a0d876b087..b7b95d2485 100644 --- a/modules/dnn/test/test_main.cpp +++ b/modules/dnn/test/test_main.cpp @@ -4,4 +4,4 @@ #include #endif -CV_TEST_MAIN("", initDNNTests()); +CV_TEST_MAIN("", initDNNTests()) diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp index e9c7a9313a..5fb9133b93 100644 --- a/modules/dnn/test/test_onnx_conformance.cpp +++ b/modules/dnn/test/test_onnx_conformance.cpp @@ -1236,4 +1236,4 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_conformance, printOnnxConfParams ); -}; +} diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp index 4c05f10305..96778ef5d4 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp @@ -46,6 +46,13 @@ "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", "test_conv_with_strides_padding", +"test_cumsum_1d", +"test_cumsum_1d_exclusive", +"test_cumsum_1d_reverse", +"test_cumsum_1d_reverse_exclusive", +"test_cumsum_2d_axis_0", +"test_cumsum_2d_axis_1", +"test_cumsum_2d_negative_axis", "test_div_bcast", "test_div_uint8", "test_dropout_default_ratio", diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp index 8156686428..6f8d7aef20 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp @@ -40,6 +40,13 @@ "test_cast_STRING_to_FLOAT", "test_castlike_FLOAT_to_STRING_expanded", "test_castlike_STRING_to_FLOAT_expanded", +"test_cumsum_1d", +"test_cumsum_1d_exclusive", +"test_cumsum_1d_reverse", +"test_cumsum_1d_reverse_exclusive", +"test_cumsum_2d_axis_0", +"test_cumsum_2d_axis_1", +"test_cumsum_2d_negative_axis", "test_concat_1d_axis_negative_1", "test_div_uint8", "test_flatten_axis0", diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp 
b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp index d9d3285b32..00dd9191c5 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp @@ -89,13 +89,6 @@ "test_convtranspose_pad", "test_convtranspose_pads", "test_convtranspose_with_kernel", -"test_cumsum_1d", -"test_cumsum_1d_exclusive", -"test_cumsum_1d_reverse", -"test_cumsum_1d_reverse_exclusive", -"test_cumsum_2d_axis_0", -"test_cumsum_2d_axis_1", -"test_cumsum_2d_negative_axis", "test_dequantizelinear", "test_dequantizelinear_axis", "test_det_2d", @@ -547,3 +540,11 @@ "test_xor_bcast4v2d", "test_xor_bcast4v3d", "test_xor_bcast4v4d", +// Cumsum related issue: https://github.com/opencv/opencv/issues/24437 +"test_cumsum_1d", +"test_cumsum_1d_exclusive", +"test_cumsum_1d_reverse", +"test_cumsum_1d_reverse_exclusive", +"test_cumsum_2d_axis_0", +"test_cumsum_2d_axis_1", +"test_cumsum_2d_negative_axis", diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 6ce46dc6e9..30bc9568df 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -681,6 +681,9 @@ TEST_P(Test_ONNX_layers, Compare_GT) testONNXModels("greater"); } +TEST_P(Test_ONNX_layers, Greater_input_dtype_int64) { + testONNXModels("greater_input_dtype_int64"); +} TEST_P(Test_ONNX_layers, Compare_LT) { @@ -1063,10 +1066,12 @@ TEST_P(Test_ONNX_layers, ResizeUnfused) TEST_P(Test_ONNX_layers, ResizeUnfusedTwoInputs) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif testONNXModels("upsample_unfused_two_inputs_opset9_torch1.4", npy, 0, 0, false, true, 2); testONNXModels("upsample_unfused_two_inputs_opset11_torch1.4", npy, 0, 0, false, true, 2); } @@ -1170,10 +1175,12 @@ TEST_P(Test_ONNX_layers, ReduceL2) TEST_P(Test_ONNX_layers, Split) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif testONNXModels("split_0"); testONNXModels("split_1"); testONNXModels("split_2"); @@ -1249,10 +1256,12 @@ TEST_P(Test_ONNX_layers, Softmax) TEST_P(Test_ONNX_layers, Split_EltwiseMax) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif testONNXModels("split_max"); } @@ -2058,12 +2067,16 @@ TEST_P(Test_ONNX_layers, Quantized_Unsqueeze) TEST_P(Test_ONNX_layers, Quantized_Resize) { testONNXModels("quantized_resize_nearest"); - testONNXModels("quantized_resize_bilinear", npy, 2e-4, 0.003); - testONNXModels("quantized_resize_bilinear_align", npy, 3e-4, 0.003); + double l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 2e-4; + testONNXModels("quantized_resize_bilinear", npy, l1, 0.003); + l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 
0.0013 : 3e-4; + testONNXModels("quantized_resize_bilinear_align", npy, l1, 0.003); } TEST_P(Test_ONNX_layers, Quantized_Concat) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); testONNXModels("quantized_concat"); testONNXModels("quantized_concat_const_blob"); } @@ -2080,6 +2093,8 @@ TEST_P(Test_ONNX_layers, OutputRegistration) TEST_P(Test_ONNX_layers, QLinearSoftmax) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); testONNXModels("qlinearsoftmax_v11", npy, 0.002, 0.002); // 2D coerced testONNXModels("qlinearsoftmax_v13", npy, 0.002, 0.002); } @@ -2669,37 +2684,37 @@ TEST_P(Test_ONNX_layers, where_node) testONNXModels("where_layer"); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_all_attributes) { +TEST_P(Test_ONNX_layers, Gemm_all_attributes) { testONNXModels("test_gemm_all_attributes", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_alpha) { +TEST_P(Test_ONNX_layers, Gemm_alpha) { testONNXModels("test_gemm_alpha", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_beta) { +TEST_P(Test_ONNX_layers, Gemm_beta) { testONNXModels("test_gemm_beta", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_default_matrix_bias) { +TEST_P(Test_ONNX_layers, Gemm_default_matrix_bias) { testONNXModels("test_gemm_default_matrix_bias", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_default_no_bias) { +TEST_P(Test_ONNX_layers, Gemm_default_no_bias) { testONNXModels("test_gemm_default_no_bias", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_default_scalar_bias) { +TEST_P(Test_ONNX_layers, Gemm_default_scalar_bias) { testONNXModels("test_gemm_default_scalar_bias", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_default_single_elem_vector_bias) { +TEST_P(Test_ONNX_layers, Gemm_default_single_elem_vector_bias) { testONNXModels("test_gemm_default_single_elem_vector_bias", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_default_vector_bias) { +TEST_P(Test_ONNX_layers, Gemm_default_vector_bias) { testONNXModels("test_gemm_default_vector_bias", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_default_zero_bias) { +TEST_P(Test_ONNX_layers, Gemm_default_zero_bias) { testONNXModels("test_gemm_default_zero_bias", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_transposeA) { +TEST_P(Test_ONNX_layers, Gemm_transposeA) { testONNXModels("test_gemm_transposeA", pb, 0, 0, false, true, 2); } -TEST_P(Test_ONNX_layers, Conformance_Gemm_transposeB) { +TEST_P(Test_ONNX_layers, Gemm_transposeB) { testONNXModels("test_gemm_transposeB", pb, 0, 0, false, true, 2); } diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 274fa8cee0..1d36cddc23 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -619,10 +619,12 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_sum_1_2_true) TEST_P(Test_TensorFlow_layers, max_pool_grad) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif runTensorFlowNet("max_pool_grad"); } @@ -1496,17 +1498,21 @@ TEST_P(Test_TensorFlow_layers, split) if (backend == 
DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif runTensorFlowNet("split"); } TEST_P(Test_TensorFlow_layers, split_equals) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif runTensorFlowNet("split_equals"); } @@ -1581,7 +1587,7 @@ TEST_P(Test_TensorFlow_layers, relu6) TEST_P(Test_TensorFlow_layers, subpixel) { -#if defined(INF_ENGINE_RELEASE) +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) @@ -1621,8 +1627,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_align_corners) // TF case: align_corners=False, half_pixel_centers=True TEST_P(Test_TensorFlow_layers, resize_bilinear_half_pixel) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif runTensorFlowNet("resize_bilinear", false, 0.0, 0.0, false, "_half_pixel"); } @@ -1636,8 +1644,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_factor) // TF case: align_corners=False, half_pixel_centers=True TEST_P(Test_TensorFlow_layers, resize_bilinear_factor_half_pixel) { +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif runTensorFlowNet("resize_bilinear_factor", false, 0.0, 0.0, false, "_half_pixel"); } diff --git a/modules/dnn/test/test_tflite_importer.cpp b/modules/dnn/test/test_tflite_importer.cpp index 4f3a8b4a96..29f8bae25e 100644 --- a/modules/dnn/test/test_tflite_importer.cpp +++ b/modules/dnn/test/test_tflite_importer.cpp @@ -204,6 +204,10 @@ TEST_P(Test_TFLite, max_unpooling) } TEST_P(Test_TFLite, EfficientDet_int8) { + if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV && + backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) { + throw SkipTestException("Only OpenCV, TimVX and OpenVINO targets support INT8 on CPU"); + } Net net = readNet(findDataFile("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", false)); net.setPreferableBackend(backend); net.setPreferableTarget(target); diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index ae39d5d22e..254422f072 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -449,7 +449,7 @@ TEST_P(Test_Torch_nets, ENet_accuracy) throw SkipTestException(""); } #endif -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); #endif diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index fae41af5d6..fab90c6b3d 100644 --- 
a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -872,11 +872,15 @@ public: @param nOctaveLayers Default number of sublevels per scale level @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or DIFF_CHARBONNIER + @param max_points Maximum number of returned points. If the image contains + more features, only the features with the highest response are returned. + A negative value means no limit. */ CV_WRAP static Ptr create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB, int descriptor_size = 0, int descriptor_channels = 3, float threshold = 0.001f, int nOctaves = 4, - int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2); + int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2, + int max_points = -1); CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0; CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0; @@ -899,6 +903,9 @@ public: CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0; CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0; CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; + + CV_WRAP virtual void setMaxPoints(int max_points) = 0; + CV_WRAP virtual int getMaxPoints() const = 0; }; //! @} features2d_main diff --git a/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java b/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java index fd98cddee1..69b12d00b1 100644 --- a/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java +++ b/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java @@ -58,7 +58,7 @@ public class AKAZEDescriptorExtractorTest extends OpenCVTestCase { extractor.write(filename); - String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\n"; + String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\nmax_points: -1\n"; String actual = readFile(filename); actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation assertEquals(truth, actual); diff --git a/modules/features2d/src/akaze.cpp b/modules/features2d/src/akaze.cpp index 7aa97dae36..a41ee55200 100644 --- a/modules/features2d/src/akaze.cpp +++ b/modules/features2d/src/akaze.cpp @@ -61,7 +61,7 @@ namespace cv { public: AKAZE_Impl(DescriptorType _descriptor_type, int _descriptor_size, int _descriptor_channels, - float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity) + float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity, int _max_points) : descriptor(_descriptor_type) , descriptor_channels(_descriptor_channels) , descriptor_size(_descriptor_size) @@ -69,6 +69,7 @@ namespace cv , octaves(_octaves) , sublevels(_sublevels) , diffusivity(_diffusivity) + , max_points(_max_points) { } @@ -98,6 +99,9 @@ namespace cv void setDiffusivity(KAZE::DiffusivityType diff_) CV_OVERRIDE{ diffusivity = diff_; } KAZE::DiffusivityType getDiffusivity() const CV_OVERRIDE{ return diffusivity; } + void setMaxPoints(int max_points_) CV_OVERRIDE { max_points = max_points_; } + int getMaxPoints() const CV_OVERRIDE { return max_points; } + // returns the
descriptor size in bytes int descriptorSize() const CV_OVERRIDE { @@ -195,6 +199,12 @@ namespace cv KeyPointsFilter::runByPixelsMask(keypoints, mask.getMat()); } + if (max_points > 0 && (int)keypoints.size() > max_points) { + std::partial_sort(keypoints.begin(), keypoints.begin() + max_points, keypoints.end(), + [](const cv::KeyPoint& k1, const cv::KeyPoint& k2) {return k1.response > k2.response;}); + keypoints.erase(keypoints.begin() + max_points, keypoints.end()); + } + if(descriptors.needed()) { impl.Compute_Descriptors(keypoints, descriptors); @@ -215,6 +225,7 @@ namespace cv fs << "octaves" << octaves; fs << "sublevels" << sublevels; fs << "diffusivity" << diffusivity; + fs << "max_points" << max_points; } void read(const FileNode& fn) CV_OVERRIDE @@ -234,6 +245,8 @@ namespace cv sublevels = (int)fn["sublevels"]; if (!fn["diffusivity"].empty()) diffusivity = static_cast((int)fn["diffusivity"]); + if (!fn["max_points"].empty()) + max_points = (int)fn["max_points"]; } DescriptorType descriptor; @@ -243,15 +256,16 @@ namespace cv int octaves; int sublevels; KAZE::DiffusivityType diffusivity; + int max_points; }; Ptr AKAZE::create(DescriptorType descriptor_type, int descriptor_size, int descriptor_channels, float threshold, int octaves, - int sublevels, KAZE::DiffusivityType diffusivity) + int sublevels, KAZE::DiffusivityType diffusivity, int max_points) { return makePtr(descriptor_type, descriptor_size, descriptor_channels, - threshold, octaves, sublevels, diffusivity); + threshold, octaves, sublevels, diffusivity, max_points); } String AKAZE::getDefaultName() const diff --git a/modules/features2d/src/hal_replacement.hpp b/modules/features2d/src/hal_replacement.hpp index f9fbf96daa..6476d21651 100644 --- a/modules/features2d/src/hal_replacement.hpp +++ b/modules/features2d/src/hal_replacement.hpp @@ -64,9 +64,12 @@ //! @{ /** @brief Detects corners using the FAST algorithm, returns mask. - @param src_data,src_step Source image - @param dst_data,dst_step Destination mask - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination mask data + @param dst_step Destination mask step + @param width Source image width + @param height Source image height @param type FAST type */ inline int hal_ni_FAST_dense(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, cv::FastFeatureDetector::DetectorType type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -89,8 +92,10 @@ inline int hal_ni_FAST_NMS(const uchar* src_data, size_t src_step, uchar* dst_da /** @brief Detects corners using the FAST algorithm. 
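The max_points cap added to AKAZE_Impl::detectAndCompute above is a plain top-N selection by response: std::partial_sort brings the strongest max_points keypoints to the front in descending response order, then the tail is erased. A self-contained sketch of the same idea; the helper name is ours, not OpenCV's.

#include <algorithm>
#include <vector>
#include <opencv2/core.hpp>

// Keep only the n strongest keypoints; a no-op when n is negative or the
// vector already fits, mirroring the hunk above.
static void retainStrongest(std::vector<cv::KeyPoint>& kpts, int n)
{
    if (n < 0 || (int)kpts.size() <= n)
        return;
    std::partial_sort(kpts.begin(), kpts.begin() + n, kpts.end(),
                      [](const cv::KeyPoint& a, const cv::KeyPoint& b)
                      { return a.response > b.response; });
    kpts.erase(kpts.begin() + n, kpts.end());
}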
- @param src_data,src_step Source image - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param width Source image width + @param height Source image height @param keypoints_data Pointer to keypoints @param keypoints_count Count of keypoints @param threshold Threshold for keypoint diff --git a/modules/features2d/src/kaze/nldiffusion_functions.cpp b/modules/features2d/src/kaze/nldiffusion_functions.cpp index 59939a2bbf..942b8d7875 100644 --- a/modules/features2d/src/kaze/nldiffusion_functions.cpp +++ b/modules/features2d/src/kaze/nldiffusion_functions.cpp @@ -86,9 +86,9 @@ void image_derivatives_scharr(const cv::Mat& src, cv::Mat& dst, int xorder, int /** * @brief This function computes the Perona and Malik conductivity coefficient g1 * g1 = exp(-|dL|^2/k^2) - * @param Lx First order image derivative in X-direction (horizontal) - * @param Ly First order image derivative in Y-direction (vertical) - * @param dst Output image + * @param _Lx First order image derivative in X-direction (horizontal) + * @param _Ly First order image derivative in Y-direction (vertical) + * @param _dst Output image * @param k Contrast factor parameter */ void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) { @@ -117,9 +117,9 @@ void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) { /** * @brief This function computes the Perona and Malik conductivity coefficient g2 * g2 = 1 / (1 + dL^2 / k^2) - * @param Lx First order image derivative in X-direction (horizontal) - * @param Ly First order image derivative in Y-direction (vertical) - * @param dst Output image + * @param _Lx First order image derivative in X-direction (horizontal) + * @param _Ly First order image derivative in Y-direction (vertical) + * @param _dst Output image * @param k Contrast factor parameter */ void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) { @@ -146,9 +146,9 @@ void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) { /* ************************************************************************* */ /** * @brief This function computes Weickert conductivity coefficient gw - * @param Lx First order image derivative in X-direction (horizontal) - * @param Ly First order image derivative in Y-direction (vertical) - * @param dst Output image + * @param _Lx First order image derivative in X-direction (horizontal) + * @param _Ly First order image derivative in Y-direction (vertical) + * @param _dst Output image * @param k Contrast factor parameter * @note For more information check the following paper: J. Weickert * Applications of nonlinear diffusion in image processing and computer vision, @@ -183,9 +183,9 @@ void weickert_diffusivity(InputArray _Lx, InputArray _Ly, OutputArray _dst, floa /** * @brief This function computes Charbonnier conductivity coefficient gc * gc = 1 / sqrt(1 + dL^2 / k^2) -* @param Lx First order image derivative in X-direction (horizontal) -* @param Ly First order image derivative in Y-direction (vertical) -* @param dst Output image +* @param _Lx First order image derivative in X-direction (horizontal) +* @param _Ly First order image derivative in Y-direction (vertical) +* @param _dst Output image * @param k Contrast factor parameter * @note For more information check the following paper: J. 
Weickert * Applications of nonlinear diffusion in image processing and computer vision, @@ -323,7 +323,7 @@ void compute_scharr_derivatives(const cv::Mat& src, cv::Mat& dst, int xorder, in * @param _ky Vertical kernel values * @param dx Derivative order in X-direction (horizontal) * @param dy Derivative order in Y-direction (vertical) - * @param scale_ Scale factor or derivative size + * @param scale Scale factor or derivative size */ void compute_derivative_kernels(cv::OutputArray _kx, cv::OutputArray _ky, int dx, int dy, int scale) { CV_INSTRUMENT_REGION(); @@ -415,7 +415,7 @@ private: /* ************************************************************************* */ /** * @brief This function performs a scalar non-linear diffusion step -* @param Ld2 Output image in the evolution +* @param Ld Output image in the evolution * @param c Conductivity image * @param Lstep Previous image in the evolution * @param stepsize The step size in time units @@ -490,7 +490,7 @@ void nld_step_scalar(cv::Mat& Ld, const cv::Mat& c, cv::Mat& Lstep, float stepsi /* ************************************************************************* */ /** * @brief This function downsamples the input image using OpenCV resize -* @param img Input image to be downsampled +* @param src Input image to be downsampled * @param dst Output image with half of the resolution of the input image */ void halfsample_image(const cv::Mat& src, cv::Mat& dst) { diff --git a/modules/features2d/src/kaze/utils.h b/modules/features2d/src/kaze/utils.h index 44e5b76935..6319943062 100644 --- a/modules/features2d/src/kaze/utils.h +++ b/modules/features2d/src/kaze/utils.h @@ -6,7 +6,7 @@ * @brief This function computes the value of a 2D Gaussian function * @param x X Position * @param y Y Position - * @param sig Standard Deviation + * @param sigma Standard Deviation */ inline float gaussian(float x, float y, float sigma) { return expf(-(x*x + y*y) / (2.0f*sigma*sigma)); diff --git a/modules/flann/include/opencv2/flann/composite_index.h b/modules/flann/include/opencv2/flann/composite_index.h index f1af41ac26..37a6223f88 100644 --- a/modules/flann/include/opencv2/flann/composite_index.h +++ b/modules/flann/include/opencv2/flann/composite_index.h @@ -80,7 +80,6 @@ public: * @param inputData dataset containing the points to index * @param params Index parameters * @param d Distance functor - * @return */ CompositeIndex(const Matrix& inputData, const IndexParams& params = CompositeIndexParams(), Distance d = Distance()) : index_params_(params) diff --git a/modules/flann/include/opencv2/flann/dynamic_bitset.h b/modules/flann/include/opencv2/flann/dynamic_bitset.h index a00ce1bb7e..676cb0b71e 100644 --- a/modules/flann/include/opencv2/flann/dynamic_bitset.h +++ b/modules/flann/include/opencv2/flann/dynamic_bitset.h @@ -97,7 +97,6 @@ public: } /** @brief set one bit to 0 - * @param index */ void reset(size_t index) { @@ -108,7 +107,6 @@ public: * This function is useful when resetting a given set of bits so that the * whole bitset ends up being 0: if that's the case, we don't care about setting * other bits to 0 - * @param index */ void reset_block(size_t index) { @@ -116,7 +114,6 @@ public: } /** resize the bitset so that it contains at least sz bits - * @param sz */ void resize(size_t sz) { diff --git a/modules/flann/include/opencv2/flann/logger.h b/modules/flann/include/opencv2/flann/logger.h index 8911812a77..31f9bbd77f 100644 --- a/modules/flann/include/opencv2/flann/logger.h +++ b/modules/flann/include/opencv2/flann/logger.h @@ -101,7 +101,6 @@ 
public: * Print log message * @param level Log level * @param fmt Message format - * @return */ static int log(int level, const char* fmt, ...) { diff --git a/modules/flann/include/opencv2/flann/lsh_table.h b/modules/flann/include/opencv2/flann/lsh_table.h index a189562d3a..3b8ffd4075 100644 --- a/modules/flann/include/opencv2/flann/lsh_table.h +++ b/modules/flann/include/opencv2/flann/lsh_table.h @@ -214,8 +214,6 @@ public: } /** Get a bucket given the key - * @param key - * @return */ inline const Bucket* getBucketFromKey(BucketKey key) const { @@ -253,7 +251,6 @@ public: } /** Get statistics about the table - * @return */ LshStats getStats() const; diff --git a/modules/flann/include/opencv2/flann/random.h b/modules/flann/include/opencv2/flann/random.h index 2c1809c3a9..5a12ef3046 100644 --- a/modules/flann/include/opencv2/flann/random.h +++ b/modules/flann/include/opencv2/flann/random.h @@ -106,7 +106,6 @@ public: /** * Constructor. * @param n Size of the interval from which to generate - * @return */ UniqueRandom(int n) { diff --git a/modules/flann/include/opencv2/flann/result_set.h b/modules/flann/include/opencv2/flann/result_set.h index c5d31e8ade..aa679df71c 100644 --- a/modules/flann/include/opencv2/flann/result_set.h +++ b/modules/flann/include/opencv2/flann/result_set.h @@ -360,7 +360,6 @@ public: } /** The number of neighbors in the set - * @return */ size_t size() const { @@ -369,7 +368,6 @@ public: /** The distance of the furthest neighbor * If we don't have enough neighbors, it returns the max possible value - * @return */ inline DistanceType worstDist() const CV_OVERRIDE { @@ -490,7 +488,6 @@ public: /** The distance of the furthest neighbor * If we don't have enough neighbors, it returns the max possible value - * @return */ inline DistanceType worstDist() const CV_OVERRIDE { diff --git a/modules/flann/src/flann.cpp b/modules/flann/src/flann.cpp index 388418f889..b7930c548a 100644 --- a/modules/flann/src/flann.cpp +++ b/modules/flann/src/flann.cpp @@ -35,7 +35,7 @@ namespace cvflann * \deprecated Provided for backward compatibility */ flann_distance_t flann_distance_type_ = FLANN_DIST_L2; - flann_distance_t flann_distance_type() { return flann_distance_type_; } + CV_DEPRECATED flann_distance_t flann_distance_type() { return flann_distance_type_; } /** * Set distance type to used diff --git a/modules/gapi/cmake/DownloadADE.cmake b/modules/gapi/cmake/DownloadADE.cmake index 26407f4fef..e7a145b14a 100644 --- a/modules/gapi/cmake/DownloadADE.cmake +++ b/modules/gapi/cmake/DownloadADE.cmake @@ -1,7 +1,7 @@ set(ade_src_dir "${OpenCV_BINARY_DIR}/3rdparty/ade") -set(ade_filename "v0.1.2b.zip") -set(ade_subdir "ade-0.1.2b") -set(ade_md5 "4f93a0844dfc463c617d83b09011819a") +set(ade_filename "v0.1.2c.zip") +set(ade_subdir "ade-0.1.2c") +set(ade_md5 "4e996f545f9dddf2348cf593cbb2726f") ocv_download(FILENAME ${ade_filename} HASH ${ade_md5} URL diff --git a/modules/gapi/include/opencv2/gapi/garg.hpp b/modules/gapi/include/opencv2/gapi/garg.hpp index bfe147f8f0..2a8315f9d8 100644 --- a/modules/gapi/include/opencv2/gapi/garg.hpp +++ b/modules/gapi/include/opencv2/gapi/garg.hpp @@ -241,6 +241,7 @@ namespace gapi * * @brief G-API functions and classes for serialization and deserialization. */ + /** @brief Wraps deserialized output GRunArgs to GRunArgsP which can be used by GCompiled. 
* * Since it's impossible to get modifiable output arguments from deserialization @@ -254,6 +255,7 @@ namespace gapi * @see deserialize */ GAPI_EXPORTS cv::GRunArgsP bind(cv::GRunArgs &out_args); + /** @brief Wraps output GRunArgsP available during graph execution to GRunArgs which can be serialized. * * GRunArgsP is pointer-to-value, so to be serialized they need to be binded to real values diff --git a/modules/gapi/include/opencv2/gapi/garray.hpp b/modules/gapi/include/opencv2/gapi/garray.hpp index b6aa715518..a2951993f2 100644 --- a/modules/gapi/include/opencv2/gapi/garray.hpp +++ b/modules/gapi/include/opencv2/gapi/garray.hpp @@ -102,17 +102,17 @@ namespace detail GAPI_Assert(m_hint != nullptr); using U = typename std::decay::type; return dynamic_cast*>(m_hint.get()) != nullptr; - }; + } template void GArrayU::specifyType(){ m_hint.reset(new TypeHint::type>); - }; + } template void GArrayU::storeKind(){ setKind(cv::detail::GOpaqueTraits::kind); - }; + } // This class represents a typed STL vector reference. // Depending on origins, this reference may be either "just a" reference to diff --git a/modules/gapi/include/opencv2/gapi/gcomputation.hpp b/modules/gapi/include/opencv2/gapi/gcomputation.hpp index 13944c7852..196eb37c6b 100644 --- a/modules/gapi/include/opencv2/gapi/gcomputation.hpp +++ b/modules/gapi/include/opencv2/gapi/gcomputation.hpp @@ -50,6 +50,7 @@ namespace s11n { * * @brief G-API classes for constructed and compiled graphs. */ + /** * @brief GComputation class represents a captured computation * graph. GComputation objects form boundaries for expression code diff --git a/modules/gapi/include/opencv2/gapi/gkernel.hpp b/modules/gapi/include/opencv2/gapi/gkernel.hpp index 1b910adc82..6ec6bf573d 100644 --- a/modules/gapi/include/opencv2/gapi/gkernel.hpp +++ b/modules/gapi/include/opencv2/gapi/gkernel.hpp @@ -430,7 +430,7 @@ namespace gapi { virtual ~GFunctor() = default; protected: - GFunctor(const char* id) : m_id(id) { }; + GFunctor(const char* id) : m_id(id) { } private: const char* m_id; }; @@ -692,7 +692,7 @@ namespace gapi { int unused[] = { 0, (pkg.include(), 0)... }; cv::util::suppress_unused_warning(unused); return pkg; - }; + } template GKernelPackage kernels(FF&... functors) @@ -701,7 +701,7 @@ namespace gapi { int unused[] = { 0, (pkg.include(functors), 0)... }; cv::util::suppress_unused_warning(unused); return pkg; - }; + } /** @} */ diff --git a/modules/gapi/include/opencv2/gapi/gmat.hpp b/modules/gapi/include/opencv2/gapi/gmat.hpp index 7bea97bbc5..198cb728d0 100644 --- a/modules/gapi/include/opencv2/gapi/gmat.hpp +++ b/modules/gapi/include/opencv2/gapi/gmat.hpp @@ -48,6 +48,7 @@ struct GOrigin; * `cv::GOpaque` | T * cv::GFrame | cv::MediaFrame */ + /** * @brief GMat class represents image or tensor data in the * graph. diff --git a/modules/gapi/include/opencv2/gapi/gopaque.hpp b/modules/gapi/include/opencv2/gapi/gopaque.hpp index 1d12f127da..a3f98a9867 100644 --- a/modules/gapi/include/opencv2/gapi/gopaque.hpp +++ b/modules/gapi/include/opencv2/gapi/gopaque.hpp @@ -98,18 +98,18 @@ namespace detail GAPI_Assert(m_hint != nullptr); using U = util::decay_t; return dynamic_cast*>(m_hint.get()) != nullptr; - }; + } template void GOpaqueU::specifyType(){ m_hint.reset(new TypeHint>); - }; + } template void GOpaqueU::storeKind(){ // FIXME: Add assert here on cv::Mat and cv::Scalar? setKind(cv::detail::GOpaqueTraits::kind); - }; + } // This class represents a typed object reference. 
// Depending on origins, this reference may be either "just a" reference to diff --git a/modules/gapi/include/opencv2/gapi/gstreaming.hpp b/modules/gapi/include/opencv2/gapi/gstreaming.hpp index 5677768a96..e8f534ee5f 100644 --- a/modules/gapi/include/opencv2/gapi/gstreaming.hpp +++ b/modules/gapi/include/opencv2/gapi/gstreaming.hpp @@ -409,7 +409,7 @@ namespace streaming { struct GAPI_EXPORTS_W_SIMPLE queue_capacity { GAPI_WRAP - explicit queue_capacity(size_t cap = 1) : capacity(cap) { }; + explicit queue_capacity(size_t cap = 1) : capacity(cap) { } GAPI_PROP_RW size_t capacity; }; diff --git a/modules/gapi/include/opencv2/gapi/gtransform.hpp b/modules/gapi/include/opencv2/gapi/gtransform.hpp index 109bc87b7f..ce88c894d7 100644 --- a/modules/gapi/include/opencv2/gapi/gtransform.hpp +++ b/modules/gapi/include/opencv2/gapi/gtransform.hpp @@ -91,7 +91,7 @@ public: { \ struct G_DESCR_HELPER_CLASS(Class) \ { \ - static constexpr const char *descr() { return Descr; }; \ + static constexpr const char *descr() { return Descr; } \ }; \ } diff --git a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp index a1703a52cb..c42d64a761 100644 --- a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp +++ b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp @@ -231,10 +231,10 @@ template struct GObtainCtor { static HostCtor get() { return HostCtor{}; } }; template struct GObtainCtor > { - static HostCtor get() { return HostCtor{ConstructVec{&GArray::VCtor}}; }; + static HostCtor get() { return HostCtor{ConstructVec{&GArray::VCtor}}; } }; template struct GObtainCtor > { - static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque::Ctor}}; }; + static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque::Ctor}}; } }; } // namespace detail } // namespace cv diff --git a/modules/gapi/include/opencv2/gapi/gtyped.hpp b/modules/gapi/include/opencv2/gapi/gtyped.hpp index c1c16d1767..2acc2f7ffb 100644 --- a/modules/gapi/include/opencv2/gapi/gtyped.hpp +++ b/modules/gapi/include/opencv2/gapi/gtyped.hpp @@ -40,7 +40,7 @@ namespace detail //workaround for MSVC 19.0 bug template auto make_default()->decltype(T{}) {return {};} -}; // detail +} // detail /** * @brief This class is a typed wrapper over a regular GComputation. diff --git a/modules/gapi/include/opencv2/gapi/infer/ie.hpp b/modules/gapi/include/opencv2/gapi/infer/ie.hpp index b403479ca2..9f9518d0b8 100644 --- a/modules/gapi/include/opencv2/gapi/infer/ie.hpp +++ b/modules/gapi/include/opencv2/gapi/infer/ie.hpp @@ -173,7 +173,7 @@ public: , {} , {} , {} } { - }; + } /** @overload Use this constructor to work with pre-compiled network. @@ -202,7 +202,7 @@ public: , {} , {} , {} } { - }; + } /** @brief Specifies sequence of network input layers names for inference. @@ -547,7 +547,7 @@ public: detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u, {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} }, m_tag(tag) { - }; + } /** @overload @@ -565,7 +565,7 @@ public: detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u, {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} }, m_tag(tag) { - }; + } /** @see ie::Params::pluginConfig. 
*/ Params& pluginConfig(const IEConfig& cfg) { diff --git a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp index ff5febcf90..4efb750439 100644 --- a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp +++ b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp @@ -293,7 +293,7 @@ public: desc.num_out = std::tuple_size::value; desc.is_generic = false; desc.disable_mem_pattern = false; - }; + } /** @brief Specifies sequence of network input layers names for inference. diff --git a/modules/gapi/include/opencv2/gapi/media.hpp b/modules/gapi/include/opencv2/gapi/media.hpp index 5da8eeab48..1470f00d04 100644 --- a/modules/gapi/include/opencv2/gapi/media.hpp +++ b/modules/gapi/include/opencv2/gapi/media.hpp @@ -33,6 +33,7 @@ namespace cv { * @brief Extra G-API data structures used to pass input/output data * to the graph for processing. */ + /** * @brief cv::MediaFrame class represents an image/media frame * obtained from an external source. diff --git a/modules/gapi/include/opencv2/gapi/own/convert.hpp b/modules/gapi/include/opencv2/gapi/own/convert.hpp index dd59c4b5aa..7bebec9cf0 100644 --- a/modules/gapi/include/opencv2/gapi/own/convert.hpp +++ b/modules/gapi/include/opencv2/gapi/own/convert.hpp @@ -31,7 +31,7 @@ namespace cv return (m.dims <= 2) ? cv::gapi::own::Mat{m.rows, m.cols, m.type(), m.data, m.step} : cv::gapi::own::Mat{to_own(m.size), m.type(), m.data}; - }; + } namespace gapi { diff --git a/modules/gapi/include/opencv2/gapi/own/scalar.hpp b/modules/gapi/include/opencv2/gapi/own/scalar.hpp index bda91c83b5..3b107befcc 100644 --- a/modules/gapi/include/opencv2/gapi/own/scalar.hpp +++ b/modules/gapi/include/opencv2/gapi/own/scalar.hpp @@ -21,7 +21,7 @@ class GAPI_EXPORTS Scalar { public: Scalar() = default; - explicit Scalar(double v0) { val[0] = v0; }; + explicit Scalar(double v0) { val[0] = v0; } Scalar(double v0, double v1, double v2 = 0, double v3 = 0) : val{v0, v1, v2, v3} { diff --git a/modules/gapi/include/opencv2/gapi/s11n.hpp b/modules/gapi/include/opencv2/gapi/s11n.hpp index 0bf368a856..a94f55c249 100644 --- a/modules/gapi/include/opencv2/gapi/s11n.hpp +++ b/modules/gapi/include/opencv2/gapi/s11n.hpp @@ -337,7 +337,7 @@ namespace detail { template IOStream& put_v(IOStream&, const V&, std::size_t) { GAPI_Error("variant>>: requested index is invalid"); -}; +} template IOStream& put_v(IOStream& os, const V& v, std::size_t x) { diff --git a/modules/gapi/misc/python/pyopencv_gapi.hpp b/modules/gapi/misc/python/pyopencv_gapi.hpp index 3269a7d470..a13b8e545d 100644 --- a/modules/gapi/misc/python/pyopencv_gapi.hpp +++ b/modules/gapi/misc/python/pyopencv_gapi.hpp @@ -321,7 +321,7 @@ PyObject* pyopencv_from(const cv::detail::OpaqueRef& o) PyErr_SetString(PyExc_TypeError, "Unsupported GOpaque type"); return NULL; -}; +} template <> PyObject* pyopencv_from(const cv::detail::VectorRef& v) diff --git a/modules/gapi/misc/python/python_bridge.hpp b/modules/gapi/misc/python/python_bridge.hpp index 53edf38b30..8926b21a79 100644 --- a/modules/gapi/misc/python/python_bridge.hpp +++ b/modules/gapi/misc/python/python_bridge.hpp @@ -137,7 +137,7 @@ public: using Storage = cv::detail::MakeVariantType; template - GOpaqueT(cv::GOpaque arg) : m_type(cv::detail::ArgTypeTraits::type), m_arg(arg) { }; + GOpaqueT(cv::GOpaque arg) : m_type(cv::detail::ArgTypeTraits::type), m_arg(arg) { } GAPI_WRAP GOpaqueT(gapi::ArgType type) : m_type(type) { @@ -175,7 +175,7 @@ public: using Storage = cv::detail::MakeVariantType; template - GArrayT(cv::GArray arg) : 
        m_type(cv::detail::ArgTypeTraits<T>::type), m_arg(arg) { };
+    GArrayT(cv::GArray<T> arg) : m_type(cv::detail::ArgTypeTraits<T>::type), m_arg(arg) { }
     GAPI_WRAP
     GArrayT(gapi::ArgType type) : m_type(type)
     {
diff --git a/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp
index 66e8c37319..04d814eac7 100644
--- a/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp
@@ -16,7 +16,7 @@ void create_rand_mats(const cv::Size &size, MatType type, cv::Mat &ref_mat, cv::Mat &gapi_mat)
     ref_mat.create(size, type);
     cv::randu(ref_mat, cv::Scalar::all(0), cv::Scalar::all(255));
     ref_mat.copyTo(gapi_mat);
-};
+}
 } // namespace
diff --git a/modules/gapi/src/api/render_ocv.cpp b/modules/gapi/src/api/render_ocv.cpp
index f1e9be4b48..e15f56bfeb 100644
--- a/modules/gapi/src/api/render_ocv.cpp
+++ b/modules/gapi/src/api/render_ocv.cpp
@@ -67,7 +67,7 @@ inline void mosaic(cv::Mat& mat, const cv::Rect &rect, int cellSz)
             cell_roi = cv::mean(cell_roi);
         }
     }
-};
+}
 inline void blendImage(const cv::Mat& img,
                        const cv::Mat& alpha,
@@ -120,7 +120,7 @@ inline void poly(cv::Mat& mat,
 {
     std::vector<std::vector<cv::Point>> points{pp.points};
     cv::fillPoly(mat, points, pp.color, pp.lt, pp.shift);
-};
+}
 struct BGR2YUVConverter
 {
@@ -133,13 +133,13 @@
         return {y, u, v};
     }
-    void cvtImg(const cv::Mat& in, cv::Mat& out) { cv::cvtColor(in, out, cv::COLOR_BGR2YUV); };
+    void cvtImg(const cv::Mat& in, cv::Mat& out) { cv::cvtColor(in, out, cv::COLOR_BGR2YUV); }
 };
 struct EmptyConverter
 {
-    cv::Scalar cvtColor(const cv::Scalar& bgr) const { return bgr; };
-    void cvtImg(const cv::Mat& in, cv::Mat& out) const { out = in; };
+    cv::Scalar cvtColor(const cv::Scalar& bgr) const { return bgr; }
+    void cvtImg(const cv::Mat& in, cv::Mat& out) const { out = in; }
 };
 // FIXME util::visitor ?
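The hunks above are one mechanical cleanup: a stray ';' after a function body is an
empty declaration, which clang diagnoses under -Wextra-semi. A minimal reproduction
(illustration only, not taken from the patch):

    // clang++ -Wextra-semi -c semi_demo.cpp
    struct Demo
    {
        void ok()  { }   // fine: no semicolon after the body
        void bad() { };  // warning: extra ';' after member function definition
    };

The fluid-backend hunks that follow are a different mechanical port: from the
fixed-width universal-intrinsics API to the size-agnostic one, so the kernels also
build when CV_SIMD_SCALABLE is set (e.g. RISC-V RVV, where the vector width is not a
compile-time constant). Two substitutions repeat throughout: T::nlanes (a constexpr
lane count) becomes the run-time query VTraits<T>::vlanes(), and overloaded operators
become named intrinsics (v_add, v_mul, v_eq, v_shl<n>, ...). A sketch of the resulting
kernel shape, using a made-up name (add_prefix) rather than any function in the patch:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    // Adds the SIMD-sized prefix of two float rows; returns how many elements were done.
    int add_prefix(const float a[], const float b[], float out[], int length)
    {
        const int nlanes = VTraits<v_float32>::vlanes(); // run-time lane count, scalable-safe
        if (length < nlanes)
            return 0;
        int x = 0;
        for (; x <= length - nlanes; x += nlanes)
        {
            v_float32 va = vx_load(&a[x]);
            v_float32 vb = vx_load(&b[x]);
            vx_store(&out[x], v_add(va, vb)); // named form: operator+ is not defined for scalable types
        }
        return x; // the caller's scalar loop finishes the tail, as in the kernels below
    }
    #endif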
diff --git a/modules/gapi/src/backends/common/serialization.cpp b/modules/gapi/src/backends/common/serialization.cpp
index 2a71a782b0..6fe924e61b 100644
--- a/modules/gapi/src/backends/common/serialization.cpp
+++ b/modules/gapi/src/backends/common/serialization.cpp
@@ -8,9 +8,8 @@
 #include <map>                    // map
 #include <ade/util/zip_range.hpp> // indexed
-#define NOMINMAX
-
 #ifdef _WIN32
+#define NOMINMAX
 #include <winsock.h>              // htonl, ntohl
 #else
 #include <netinet/in.h>           // htonl, ntohl
diff --git a/modules/gapi/src/backends/common/serialization.hpp b/modules/gapi/src/backends/common/serialization.hpp
index a64805e25c..384004c725 100644
--- a/modules/gapi/src/backends/common/serialization.hpp
+++ b/modules/gapi/src/backends/common/serialization.hpp
@@ -195,7 +195,7 @@ class GAPI_EXPORTS ByteMemoryInStream final: public IIStream {
     size_t m_idx = 0u;
     void check(std::size_t n) { (void) n; GAPI_DbgAssert(m_idx+n-1 < m_storage.size()); }
-    uint32_t getU32() { uint32_t v{}; *this >> v; return v; };
+    uint32_t getU32() { uint32_t v{}; *this >> v; return v; }
     //virtual IIStream& operator>> (uint32_t &) final;
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index c2686c7bd3..50615b2652 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -13,7 +13,7 @@
 #include
 #include
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #include "gfluidcore_func.hpp"
 #endif
@@ -113,7 +113,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 // Fluid kernels: addWeighted
 //
 //---------------------------
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
 }
@@ -150,8 +150,8 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
                   ((std::is_same<SRC, ushort>::value) && (std::is_same<DST, ushort>::value)),
                   "This templated overload is only for short and ushort type combinations.");
-    constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
-                                                                static_cast<int>(v_int16::nlanes);
+    const int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(VTraits<v_uint16>::vlanes()) :
+                                                            static_cast<int>(VTraits<v_int16>::vlanes());
     if (length < nlanes)
         return 0;
@@ -189,7 +189,7 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
                                const float _alpha, const float _beta,
                                const float _gamma, int length)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
     if (length < nlanes)
         return 0;
@@ -298,7 +298,7 @@ GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)
 enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE };
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 CV_ALWAYS_INLINE void absdiff_store(short out[], const v_int16& a, const v_int16& b, int x)
 {
     vx_store(&out[x], v_absdiffs(a, b));
@@ -322,7 +322,7 @@ CV_ALWAYS_INLINE void absdiff_store(float out[], const v_float32& a, const v_flo
 template<typename T, typename VT>
 CV_ALWAYS_INLINE int absdiff_impl(const T in1[], const T in2[], T out[], int length)
 {
-    constexpr int nlanes = static_cast<int>(VT::nlanes);
+    const int nlanes = static_cast<int>(VTraits<VT>::vlanes());
     if (length < nlanes)
         return 0;
@@ -403,7 +403,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
     {
     case ARITHM_ADD:
     {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         x = add_simd(in1, in2, out, length);
 #endif
         for (; x < length; ++x)
@@ -412,7 +412,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
     }
     case ARITHM_SUBTRACT:
     {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         x = sub_simd(in1, in2, out, length);
 #endif
         for (; x < length; ++x)
@@ -421,7 +421,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
     }
     case ARITHM_MULTIPLY:
     {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         x = mul_simd(in1, in2, out, length, scale);
 #endif
         for (; x < length; ++x)
@@ -430,7 +430,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
     }
     case ARITHM_DIVIDE:
     {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         x = div_simd(in1, in2, out, length, scale);
 #endif
         for (; x < length; ++x)
@@ -569,7 +569,7 @@ static void run_absdiff(Buffer &dst, const View &src1, const View &src2)
     int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     x = absdiff_simd(in1, in2, out, length);
 #endif
     for (; x < length; ++x)
@@ -660,7 +660,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     case ARITHM_ADD:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = addc_simd(in, scalar, out, length, chan);
 #endif
         for (; w < length; ++w)
@@ -671,7 +671,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     case ARITHM_SUBTRACT:
    {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = subc_simd(in, scalar, out, length, chan);
 #endif
         for (; w < length; ++w)
@@ -681,7 +681,7 @@
     case ARITHM_MULTIPLY:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = mulc_simd(in, scalar, out, length, chan, scale);
 #endif
         for (; w < width; ++w)
@@ -709,7 +709,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
     case ARITHM_SUBTRACT:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = subrc_simd(scalar, in, out, length, chan);
 #endif
         for (; w < length; ++w)
@@ -721,7 +721,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
     case ARITHM_DIVIDE:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = divrc_simd(scalar, in, out, length, chan, scale);
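// Note: these *_simd() helpers return the number of elements they processed
// (0 when length < nlanes), so the scalar loop after the #endif always finishes the tail.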
#endif for (; w < length; ++w) @@ -744,7 +744,7 @@ CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen) CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector. constexpr int maxNlanes = 16; @@ -783,7 +783,7 @@ CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float sca const int length = width * chan; int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = absdiffc_simd(in, scalar, out, length, chan); #endif @@ -1076,7 +1076,7 @@ CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch, const int length = width * chan; int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int scratch_length = scratch.length(); int indicator_offset = scratch_length - 1; const int set_mask_indicator = static_cast(*(scratch.OutLine() + (indicator_offset))); @@ -1143,7 +1143,7 @@ GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true) static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // 512 bits / 32 bits = 16 elements of float32 a AVX512 SIMD vector can contain. constexpr int maxNlanes = 16; @@ -1565,7 +1565,7 @@ template CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = convertto_simd(in, out, length); #endif // tail of SIMD cycle @@ -1580,7 +1580,7 @@ CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha, const int length) { int x = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) x = convertto_scaled_simd(in, out, alpha, beta, length); #endif @@ -2096,9 +2096,7 @@ static void run_inrange3(uchar out[], const uchar in[], int width, v_load_deinterleave(&in[3*w], i0, i1, i2); v_uint8x16 o; - o = (i0 >= v_setall_u8(lower[0])) & (i0 <= v_setall_u8(upper[0])) & - (i1 >= v_setall_u8(lower[1])) & (i1 <= v_setall_u8(upper[1])) & - (i2 >= v_setall_u8(lower[2])) & (i2 <= v_setall_u8(upper[2])); + o = v_and(v_and(v_and(v_and(v_and(v_ge(i0, v_setall_u8(lower[0])), v_le(i0, v_setall_u8(upper[0]))), v_ge(i1, v_setall_u8(lower[1]))), v_le(i1, v_setall_u8(upper[1]))), v_ge(i2, v_setall_u8(lower[2]))), v_le(i2, v_setall_u8(upper[2]))); v_store(&out[w], o); } @@ -2226,7 +2224,7 @@ static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uc v_load_deinterleave(&in2[3*w], a2, b2, c2); mask = v_load(&in3[w]); - mask = mask != v_setzero_u8(); + mask = v_ne(mask, v_setzero_u8()); a = v_select(mask, a1, a2); b = v_select(mask, b1, b2); @@ -2332,7 +2330,7 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false) int width = src.length(); int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = split3_simd(in, out1, out2, out3, width); #endif @@ -2364,7 +2362,7 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false) int width = src.length(); int w = 0; - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) w = split4_simd(in, out1, out2, out3, out4, width); #endif @@ -2389,7 +2387,7 @@ CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2 int width = dst.length(); int w = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) w = merge3_simd(in1, in2, in3, out, width); #endif @@ -2442,7 +2440,7 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false) int w = 0; // cycle counter - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) w = 
merge4_simd(in1, in2, in3, in4, out, width); #endif diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index 05d3417024..a0ef4b1479 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -7,7 +7,7 @@ #if !defined(GAPI_STANDALONE) #include -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) #include "gfluidcore_func.hpp" #include "gfluidcore_func.simd.hpp" diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 0511f4e095..0186ea020e 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -6,7 +6,7 @@ #pragma once -#if !defined(GAPI_STANDALONE) && CV_SIMD +#if !defined(GAPI_STANDALONE) && (CV_SIMD || CV_SIMD_SCALABLE) #include diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index aed0ee97d8..6191e9ab05 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -402,22 +402,22 @@ CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in) CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale) { - return (scale*a * b); + return (v_mul(v_mul(scale, a), b)); } CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&) { - return a * b; + return v_mul(a, b); } CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale) { - return (a*scale/div); + return (v_div(v_mul(a, scale), div)); } CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&) { - return a / div; + return v_div(a, div); } CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2) @@ -433,13 +433,13 @@ CV_ALWAYS_INLINE void v_store_i16(ushort* dst, const v_int32& res1, const v_int3 CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero, const v_int32& res1, const v_int32& res2) { - vx_store(dst, v_select(div == v_zero, v_zero, v_pack(res1, res2))); + vx_store(dst, v_select(v_eq(div, v_zero), v_zero, v_pack(res1, res2))); } CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero, const v_int32& res1, const v_int32& res2) { - vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero), + vx_store(dst, v_select(v_reinterpret_as_u16(v_eq(div, v_zero)), v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2))); } @@ -451,7 +451,7 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, const v_float32& a3, const v_float32& a4, const uchar* in2x, uchar* outx, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x)); v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2])); @@ -466,8 +466,8 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)), sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale)); - v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); - v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); + 
v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2)); + v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4)); vx_store(outx, v_pack_u(res1, res2)); } @@ -480,7 +480,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, const v_float32& a3, const v_float32& a4, const SRC* in2x, uchar* outx, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x)); v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2])); @@ -495,8 +495,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)), sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale)); - v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); - v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); + v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2)); + v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4)); vx_store(outx, v_pack_u(res1, res2)); } @@ -507,7 +507,7 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a4, const float* in2x, uchar* outx, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 div1 = vg_load_f32(in2x); v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]); @@ -519,10 +519,10 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, v_float32 r3 = div_op(s_tag, a3, div3, v_scale); v_float32 r4 = div_op(s_tag, a4, div4, v_scale); - v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1); - v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2); - v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3); - v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4); + v_float32 sel1 = v_select((v_eq(div1, v_zero)), v_zero, r1); + v_float32 sel2 = v_select((v_eq(div2, v_zero)), v_zero, r2); + v_float32 sel3 = v_select((v_eq(div3, v_zero)), v_zero, r3); + v_float32 sel4 = v_select((v_eq(div4, v_zero)), v_zero, r4); v_int32 res1 = v_round(sel1); v_int32 res2 = v_round(sel2); @@ -536,7 +536,7 @@ template CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx, const v_float32& v_scale, const Vtype& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 a1 = vg_load_f32(in1x); v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]); @@ -595,7 +595,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, const float* in2x, DST* outx, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 fdiv1 = vg_load_f32(in2x); v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]); @@ -603,8 +603,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2, v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale); v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale); - v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1)); - v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2)); + v_int32 res1 = v_round(v_select((v_eq(fdiv1, v_zero)), v_zero, r1)); + v_int32 res2 = v_round(v_select((v_eq(fdiv2, v_zero)), v_zero, r2)); v_store_i16(outx, res1, res2); } @@ -616,7 +616,7 @@ typename std::enable_if::value || div_hal(scale_tag_t s_tag, const SRC* in1x, 
const SRC* in2x, DST* outx, const v_float32& v_scale, const Vtype& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 a1 = vg_load_f32(in1x); v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]); @@ -648,12 +648,12 @@ template CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[], DST out[], const int length, float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; - const zero_vec_type_of_t v_zero = vx_setall::lane_type>(0); + const zero_vec_type_of_t v_zero = vx_setall >::lane_type>(0); v_float32 v_scale = vx_setall_f32(scale); int x = 0; @@ -724,7 +724,7 @@ typename std::enable_if<(std::is_same::value && std::is_same::value && std::is_same::value), int>::type mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -769,7 +769,7 @@ typename std::enable_if::value || std::is_same::value, int>::type mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -824,7 +824,7 @@ template CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[], const int length, double _scale) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -869,7 +869,7 @@ typename std::enable_if::value || std::is_same::value, int>::type mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -914,7 +914,7 @@ typename std::enable_if::value || std::is_same::value, int>::type mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -954,7 +954,7 @@ template CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[], const int length, double _scale) { - constexpr int nlanes = v_float32::nlanes; + const int nlanes = VTraits::vlanes(); if (length < nlanes) return 0; @@ -1049,7 +1049,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_in const v_int32& c4, const v_int32& c5, const v_int32& c6) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); vx_store(outx, v_pack(c1, c2)); vx_store(&outx[nlanes], v_pack(c3, c4)); vx_store(&outx[2*nlanes], v_pack(c5, c6)); @@ -1060,7 +1060,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_in const v_int32& c4, const v_int32& c5, const v_int32& c6) { - constexpr int nlanes = v_uint16::nlanes; + const int nlanes = VTraits::vlanes(); vx_store(outx, v_pack_u(c1, c2)); vx_store(&outx[nlanes], v_pack_u(c3, c4)); vx_store(&outx[2*nlanes], v_pack_u(c5, c6)); @@ -1068,37 +1068,37 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_in CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc) { - return a + sc; + return v_add(a, sc); } CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const 
v_float32& sc) { - return a - sc; + return v_sub(a, sc); } CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc) { - return sc - a; + return v_sub(sc, a); } CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc) { - return a * sc; + return v_mul(a, sc); } CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) { - return v_scale * a * v_scalar; + return v_mul(v_mul(v_scale, a), v_scalar); } CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc) { - return a / sc; + return v_div(a, sc); } CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale) { - return a*v_scale / v_scalar; + return v_div(v_mul(a, v_scale), v_scalar); } CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc) @@ -1223,8 +1223,8 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[], const int length) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 0; @@ -1263,7 +1263,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[], const float scalar[], DST out[], const int length) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -1489,8 +1489,8 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_c3(oper_tag op, const SRC in[], const int length, const float scale) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 0; @@ -1576,7 +1576,7 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_common(oper_tag op, const SRC in[ const float scalar[], DST out[], const int length, const float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -1675,10 +1675,10 @@ divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32& v_scalar, const v_float32& v_scale, const int length) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask = (v_scalar == v_zero); + v_float32 v_mask = (v_eq(v_scalar, v_zero)); int x = 0; for (;;) @@ -1709,10 +1709,10 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], uchar out[], const v_float32& v_scalar, const v_float32& v_scale, const int length) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask = (v_scalar == v_zero); + v_float32 v_mask = (v_eq(v_scalar, v_zero)); int x = 0; for (;;) @@ -1747,7 +1747,7 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], float out[], const v_float32& v_scalar, const v_float32& v_scale, const int length) { - constexpr int nlanes = v_float32::nlanes; + const int nlanes = VTraits::vlanes(); int x = 0; for (;;) { @@ -1774,7 +1774,7 @@ CV_ALWAYS_INLINE int divc_mask_simd_common(scale_tag_t tag, const SRC in[], const float scalar[], DST out[], const int length, const float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = 
VTraits>::vlanes(); if (length < nlanes) return 0; @@ -1796,9 +1796,9 @@ divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1, const int nlanes, const int lanes) { v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask1 = (s1 == v_zero); - v_float32 v_mask2 = (s2 == v_zero); - v_float32 v_mask3 = (s3 == v_zero); + v_float32 v_mask1 = (v_eq(s1, v_zero)); + v_float32 v_mask2 = (v_eq(s2, v_zero)); + v_float32 v_mask3 = (v_eq(s3, v_zero)); int x = 0; for (;;) @@ -1839,9 +1839,9 @@ CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, uchar* const int length, const int nlanes, const int lanes) { v_float32 v_zero = vx_setzero_f32(); - v_float32 v_mask1 = (s1 == v_zero); - v_float32 v_mask2 = (s2 == v_zero); - v_float32 v_mask3 = (s3 == v_zero); + v_float32 v_mask1 = (v_eq(s1, v_zero)); + v_float32 v_mask2 = (v_eq(s2, v_zero)); + v_float32 v_mask3 = (v_eq(s3, v_zero)); int x = 0; for (;;) @@ -1917,8 +1917,8 @@ CV_ALWAYS_INLINE int divc_mask_simd_c3(scale_tag_t s_tag, const SRC in[], const int length, const float scale) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 0; @@ -2084,7 +2084,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[], const float scalar[], DST out[], const int length, const float scale) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -2092,7 +2092,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[], v_float32 v_scalar = vx_load(scalar); v_float32 v_scale = vx_setall_f32(scale); zero_vec_type_of_t v_zero = - vx_setall::lane_type>(0); + vx_setall>::lane_type>(0); int x = 0; for (;;) @@ -2121,7 +2121,7 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uc const v_uint8& v_zero) { v_uint8 div = vx_load(inx); - v_uint8 v_mask = (div == v_zero); + v_uint8 v_mask = (v_eq(div, v_zero)); v_uint16 div1 = v_expand_low(div); v_uint16 div2 = v_expand_high(div); @@ -2147,13 +2147,13 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx, const v_float32& s3, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load(inx)); v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2])); - v_int16 v_mask1 = (div1 == v_zero); - v_int16 v_mask2 = (div2 == v_zero); + v_int16 v_mask1 = (v_eq(div1, v_zero)); + v_int16 v_mask2 = (v_eq(div2, v_zero)); v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1)); v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1)); @@ -2175,17 +2175,17 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uc const v_float32& s3, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 fdiv1 = vg_load_f32(inx); v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]); v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]); v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]); - v_float32 v_mask1 = (fdiv1 == v_zero); - v_float32 v_mask2 = (fdiv2 == v_zero); - v_float32 v_mask3 = (fdiv3 == v_zero); - v_float32 v_mask4 = (fdiv4 == v_zero); + v_float32 v_mask1 = (v_eq(fdiv1, v_zero)); + v_float32 v_mask2 = (v_eq(fdiv2, v_zero)); + v_float32 v_mask3 = (v_eq(fdiv3, v_zero)); + v_float32 
v_mask4 = (v_eq(fdiv4, v_zero)); vx_store(outx, v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))), @@ -2202,7 +2202,7 @@ CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar const int length, const int nlanes, const int lanes) { univ_zero_vec_type_of_t v_zero = - vx_setall::lane_type>(0); + vx_setall>::lane_type>(0); int x = 0; for (;;) @@ -2235,7 +2235,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx, const v_float32& s3, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_uint8 div = vx_load(inx); v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div)); @@ -2268,7 +2268,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx, const v_float32& s3, const v_float32& v_scale, const v_int16& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_int16 div1 = v_reinterpret_as_s16(vx_load(inx)); v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes])); @@ -2298,7 +2298,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx, const v_float32& s3, const v_float32& v_scale, const v_float32& v_zero) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 fdiv1 = vg_load_f32(inx); v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]); @@ -2307,12 +2307,12 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx, v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]); v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]); - v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))), - v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale)))); - v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))), - v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale)))); - v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))), - v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale)))); + v_store_i16(outx, v_round(v_select(v_eq(fdiv1, v_zero), v_zero, div_op(s_tag, s1, fdiv1, v_scale))), + v_round(v_select(v_eq(fdiv2, v_zero), v_zero, div_op(s_tag, s2, fdiv2, v_scale)))); + v_store_i16(&outx[nlanes], v_round(v_select(v_eq(fdiv3, v_zero), v_zero, div_op(s_tag, s3, fdiv3, v_scale))), + v_round(v_select(v_eq(fdiv4, v_zero), v_zero, div_op(s_tag, s1, fdiv4, v_scale)))); + v_store_i16(&outx[2*nlanes], v_round(v_select(v_eq(fdiv5, v_zero), v_zero, div_op(s_tag, s2, fdiv5, v_scale))), + v_round(v_select(v_eq(fdiv6, v_zero), v_zero, div_op(s_tag, s3, fdiv6, v_scale)))); } template @@ -2325,7 +2325,7 @@ divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32 const int, const int lanes) { zero_vec_type_of_t v_zero = - vx_setall::lane_type>(0); + vx_setall>::lane_type>(0); int x = 0; for (;;) @@ -2385,8 +2385,8 @@ CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[], const int length, const float scale) { constexpr int chan = 3; - constexpr int nlanes = vector_type_of_t::nlanes; - constexpr int lanes = chan * nlanes; + const int nlanes = VTraits>::vlanes(); + const int lanes = chan * nlanes; if (length < lanes) return 0; @@ -2473,7 +2473,7 @@ DIVRC_SIMD(float, float) int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], const int width) { - constexpr int nlanes = 
v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (width < nlanes) return 0; @@ -2507,7 +2507,7 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], int split4_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], uchar out4[], const int width) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (width < nlanes) return 0; @@ -2543,7 +2543,7 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[], int merge3_simd(const T in1[], const T in2[], const T in3[], \ T out[], const int width) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ + const int nlanes = VTraits>::vlanes(); \ if (width < nlanes) \ return 0; \ \ @@ -2584,7 +2584,7 @@ MERGE3_SIMD(float) int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], const uchar in4[], uchar out[], const int width) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); if (width < nlanes) return 0; @@ -2618,13 +2618,13 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], template CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b) { - return a + b; + return v_add(a, b); } template CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b) { - return a - b; + return v_sub(a, b); } CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2) @@ -2653,7 +2653,7 @@ typename std::enable_if::value || std::is_same::value, void>::type arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); vector_type_of_t a1 = vx_load(in1x); vector_type_of_t a2 = vx_load(&in1x[nlanes / 2]); @@ -2667,7 +2667,7 @@ template CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 a1 = vx_load(in1x); v_float32 a2 = vx_load(&in1x[nlanes / 4]); @@ -2709,7 +2709,7 @@ typename std::enable_if::value || std::is_same::value, void>::type arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 a1 = vx_load(in1x); v_float32 a2 = vx_load(&in1x[nlanes/2]); v_float32 b1 = vx_load(in2x); @@ -2761,7 +2761,7 @@ template CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[], DST out[], const int length) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); if (length < nlanes) return 0; @@ -2869,7 +2869,7 @@ CV_ALWAYS_INLINE void store_i16(short* outx, const v_int16& res) CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int32 a1 = v_round(vx_load(inx)); v_int32 a2 = v_round(vx_load(&inx[nlanes/4])); @@ -2887,7 +2887,7 @@ CV_ALWAYS_INLINE typename std::enable_if::type convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); vector_type_of_t a1 = vx_load(inx); vector_type_of_t a2 = vx_load(&inx[nlanes/2]); @@ -2902,7 +2902,7 @@ CV_ALWAYS_INLINE typename std::enable_if::type convertto_simd_nocoeff_impl(const float* inx, DST* outx) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_int32 a1 = v_round(vx_load(inx)); 
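// v_round() converts float lanes to int32 with rounding to nearest rather than truncation,
// before the two halves are packed down to the 16-bit destination type.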
v_int32 a2 = v_round(vx_load(&inx[nlanes/2])); @@ -2942,7 +2942,7 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx) #define CONVERTTO_NOCOEF_SIMD(SRC, DST) \ int convertto_simd(const SRC in[], DST out[], const int length) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ + const int nlanes = VTraits>::vlanes(); \ if (length < nlanes) \ return 0; \ \ @@ -2982,7 +2982,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const float* inx, uchar* outx, const v_float32& v_alpha, const v_float32& v_beta) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_float32 a1 = vx_load(inx); v_float32 a2 = vx_load(&inx[nlanes / 4]); @@ -3003,7 +3003,7 @@ typename std::enable_if::type convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha, const v_float32& v_beta) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); v_int16 a = v_reinterpret_as_s16(vx_load(inx)); v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2])); @@ -3050,7 +3050,7 @@ convertto_scaled_simd_impl(const float* inx, DST* outx, const v_float32& v_alpha, const v_float32& v_beta) { - constexpr int nlanes = vector_type_of_t::nlanes; + const int nlanes = VTraits>::vlanes(); v_float32 a1 = vx_load(inx); v_float32 a2 = vx_load(&inx[nlanes / 2]); @@ -3111,7 +3111,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const SRC* inx, float* outx, int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \ const float beta, const int length) \ { \ - constexpr int nlanes = vector_type_of_t::nlanes; \ + const int nlanes = VTraits>::vlanes(); \ if (length < nlanes) \ return 0; \ \ diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index 9766cf7cc6..6c517b1f57 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -175,7 +175,7 @@ RUN_MEDBLUR3X3_IMPL( float) #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template static inline v_float32 vx_load_f32(const SRC* ptr) { @@ -228,8 +228,8 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width, GAPI_Assert(rc + gc + bc <= unity); GAPI_Assert(rc + gc + bc >= USHRT_MAX); -#if CV_SIMD - constexpr int nlanes = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int nlanes = VTraits::vlanes(); if (width >= nlanes) { for (int w=0; w < width; ) @@ -247,14 +247,8 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width, v_uint16 y0, y1; static const ushort half = 1 << 7; // Q0.8.8 - y0 = (v_mul_hi(r0 << 8, vx_setall_u16(rc)) + - v_mul_hi(g0 << 8, vx_setall_u16(gc)) + - v_mul_hi(b0 << 8, vx_setall_u16(bc)) + - vx_setall_u16(half)) >> 8; - y1 = (v_mul_hi(r1 << 8, vx_setall_u16(rc)) + - v_mul_hi(g1 << 8, vx_setall_u16(gc)) + - v_mul_hi(b1 << 8, vx_setall_u16(bc)) + - vx_setall_u16(half)) >> 8; + y0 = v_shr<8>(v_add(v_add(v_add(v_mul_hi(v_shl<8>(r0), vx_setall_u16(rc)), v_mul_hi(v_shl<8>(g0), vx_setall_u16(gc))), v_mul_hi(v_shl<8>(b0), vx_setall_u16(bc))), vx_setall_u16(half))); + y1 = v_shr<8>(v_add(v_add(v_add(v_mul_hi(v_shl<8>(r1), vx_setall_u16(rc)), v_mul_hi(v_shl<8>(g1), vx_setall_u16(gc))), v_mul_hi(v_shl<8>(b1), vx_setall_u16(bc))), vx_setall_u16(half))); v_uint8 y; y = v_pack(y0, y1); @@ -316,10 +310,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], v_uint8x16 v_min_rgb = v_min(v_min(r, 
g), b); v_uint8x16 v_max_rgb = v_max(v_max(r, g), b); - v_uint8x16 v_diff = v_max_rgb - v_min_rgb; + v_uint8x16 v_diff = v_sub(v_max_rgb, v_min_rgb); - v_uint8x16 v_r_eq_max = (r == v_max_rgb); - v_uint8x16 v_g_eq_max = (g == v_max_rgb); + v_uint8x16 v_r_eq_max = (v_eq(r, v_max_rgb)); + v_uint8x16 v_g_eq_max = (v_eq(g, v_max_rgb)); v_uint8x16 v; // get V-ch @@ -327,10 +321,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // divide v into 4x4 vectors because later int32 required v_uint32x4 v_idx[4]; - v_idx[0] = v_reinterpret_as_u32(v & mask1); - v_idx[1] = v_reinterpret_as_u32(v & mask2) >> 8; - v_idx[2] = v_reinterpret_as_u32(v & mask3) >> 16; - v_idx[3] = v_reinterpret_as_u32(v & mask4) >> 24; + v_idx[0] = v_reinterpret_as_u32(v_and(v, mask1)); + v_idx[1] = v_shr<8>(v_reinterpret_as_u32(v_and(v, mask2))); + v_idx[2] = v_shr<16>(v_reinterpret_as_u32(v_and(v, mask3))); + v_idx[3] = v_shr<24>(v_reinterpret_as_u32(v_and(v, mask4))); v_uint32x4 sv_elems_32[4]; sv_elems_32[0] = v_reinterpret_as_u32(v_lut(sdiv_table, v_reinterpret_as_s32(v_idx[0]))); @@ -341,19 +335,19 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // divide and calculate s according to above feature v_uint32x4 ss[4]; - v_uint32x4 v_add = v_setall_u32(1) << (hsv_shift - 1); + v_uint32x4 vadd = v_setall_u32(1) << (hsv_shift - 1); v_uint32x4 v_diff_exp[4]; - v_diff_exp[0] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask1); - v_diff_exp[1] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask2) >> 8; - v_diff_exp[2] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask3) >> 16; - v_diff_exp[3] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask4) >> 24; + v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1)); + v_diff_exp[1] = v_shr<8>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask2))); + v_diff_exp[2] = v_shr<16>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask3))); + v_diff_exp[3] = v_shr<24>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask4))); // s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; - ss[0] = (v_diff_exp[0] * sv_elems_32[0] + v_add) >> hsv_shift; - ss[1] = (v_diff_exp[1] * sv_elems_32[1] + v_add) >> hsv_shift; - ss[2] = (v_diff_exp[2] * sv_elems_32[2] + v_add) >> hsv_shift; - ss[3] = (v_diff_exp[3] * sv_elems_32[3] + v_add) >> hsv_shift; + ss[0] = v_shr(v_add(v_mul(v_diff_exp[0], sv_elems_32[0]), vadd)); + ss[1] = v_shr(v_add(v_mul(v_diff_exp[1], sv_elems_32[1]), vadd)); + ss[2] = v_shr(v_add(v_mul(v_diff_exp[2], sv_elems_32[2]), vadd)); + ss[3] = v_shr(v_add(v_mul(v_diff_exp[3], sv_elems_32[3]), vadd)); // reconstruct order of S-ch v_uint32x4 zip[8]; @@ -413,17 +407,17 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], //h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff)))); v_int32x4 hh[4]; hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]), - v_select(p[0], v_reinterpret_as_s32(bb[0] - rr[0] + v_setall_u32(2) * vdd[0]), - v_reinterpret_as_s32(rr[0] - gg[0] + v_setall_u32(4) * vdd[0])))); + v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))), + v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0])))))); hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]), - v_select(p[1], v_reinterpret_as_s32(bb[1] - rr[1] + v_setall_u32(2) * vdd[1]), - 
v_reinterpret_as_s32(rr[1] - gg[1] + v_setall_u32(4) * vdd[1])))); + v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))), + v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1])))))); hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]), - v_select(p[2], v_reinterpret_as_s32(bb[2] - rr[2] + v_setall_u32(2) * vdd[2]), - v_reinterpret_as_s32(rr[2] - gg[2] + v_setall_u32(4) * vdd[2])))); + v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))), + v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2])))))); hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]), - v_select(p[3], v_reinterpret_as_s32(bb[3] - rr[3] + v_setall_u32(2) * vdd[3]), - v_reinterpret_as_s32(rr[3] - gg[3] + v_setall_u32(4) * vdd[3])))); + v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))), + v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3])))))); //h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; v_uint32x4 h_elems_32[4]; @@ -432,10 +426,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], h_elems_32[2] = v_reinterpret_as_u32(v_lut(hdiv_table, v_reinterpret_as_s32(vdd[2]))); h_elems_32[3] = v_reinterpret_as_u32(v_lut(hdiv_table, v_reinterpret_as_s32(vdd[3]))); - hh[0] = (hh[0] * v_reinterpret_as_s32(h_elems_32[0]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; - hh[1] = (hh[1] * v_reinterpret_as_s32(h_elems_32[1]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; - hh[2] = (hh[2] * v_reinterpret_as_s32(h_elems_32[2]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; - hh[3] = (hh[3] * v_reinterpret_as_s32(h_elems_32[3]) + v_reinterpret_as_s32(v_add)) >> hsv_shift; + hh[0] = v_shr(v_add(v_mul(hh[0], v_reinterpret_as_s32(h_elems_32[0])), v_reinterpret_as_s32(vadd)), hsv_shift); + hh[1] = v_shr(v_add(v_mul(hh[1], v_reinterpret_as_s32(h_elems_32[1])), v_reinterpret_as_s32(vadd)), hsv_shift); + hh[2] = v_shr(v_add(v_mul(hh[2], v_reinterpret_as_s32(h_elems_32[2])), v_reinterpret_as_s32(vadd)), hsv_shift); + hh[3] = v_shr(v_add(v_mul(hh[3], v_reinterpret_as_s32(h_elems_32[3])), v_reinterpret_as_s32(vadd)), hsv_shift); // check for negative H v_int32x4 v_h_less_0[4]; @@ -534,7 +528,7 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width) // calculate b-channel v_expand(b2, l_1, r_1); v_expand(b2_offset, l_2, r_2); - v_uint8x16 b2_sum = v_rshr_pack<1>(l_1 + l_2, r_1 + r_2); + v_uint8x16 b2_sum = v_rshr_pack<1>(v_add(l_1, l_2), v_add(r_1, r_2)); v_uint8x16 b_low, b_high; v_zip(b2_sum, b2_offset, b_low, b_high); @@ -547,9 +541,9 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width) v_expand(r3_offset, l_4, r_4); v_uint8x16 r13offset_sum, r13_sum; - r13offset_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); - r13_sum = v_rshr_pack<1>(l_1 + l_3, r_1 + r_3); + r13offset_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); + r13_sum = v_rshr_pack<1>(v_add(l_1, l_3), v_add(r_1, r_3)); v_uint8x16 r_low, r_high; v_zip(r13_sum, r13offset_sum, r_low, r_high); @@ -561,8 +555,8 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width) v_expand(g2, l_3, r_3); v_expand(g2_offset, l_4, r_4); - v_uint8x16 g_out_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); + v_uint8x16 g_out_sum = 
v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); v_uint8x16 g_low, g_high; v_zip(g2, g_out_sum, g_low, g_high); @@ -646,7 +640,7 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width) // calculate r-channel v_expand(r2, l_1, r_1); v_expand(r2_offset, l_2, r_2); - v_uint8x16 r2_sum = v_rshr_pack<1>(l_1 + l_2, r_1 + r_2); + v_uint8x16 r2_sum = v_rshr_pack<1>(v_add(l_1, l_2), v_add(r_1, r_2)); v_uint8x16 r_low, r_high; v_zip(r2, r2_sum, r_low, r_high); @@ -659,9 +653,9 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width) v_expand(b3_offset, l_4, r_4); v_uint8x16 b13offset_sum, b13_sum; - b13offset_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); - b13_sum = v_rshr_pack<1>(l_2 + l_4, r_2 + r_4); + b13offset_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); + b13_sum = v_rshr_pack<1>(v_add(l_2, l_4), v_add(r_2, r_4)); v_uint8x16 b_low, b_high; v_zip(b13offset_sum, b13_sum, b_low, b_high); @@ -673,8 +667,8 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width) v_expand(g2, l_3, r_3); v_expand(g2_offset, l_4, r_4); - v_uint8x16 g_out_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4, - r_1 + r_2 + r_3 + r_4); + v_uint8x16 g_out_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4), + v_add(v_add(v_add(r_1, r_2), r_3), r_4)); v_uint8x16 g_low, g_high; v_zip(g_out_sum, g2_offset, g_low, g_high); @@ -749,8 +743,8 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef int w = 0; -#if CV_SIMD - static const int nlanes = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + static const int nlanes = VTraits::vlanes(); for ( ; w <= width - nlanes; w += nlanes) { v_uint8 r, g, b; @@ -761,20 +755,16 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef v_expand(g, _g0, _g1); v_expand(b, _b0, _b1); - _r0 = _r0 << 7; // Q0.9.7 un-signed - _r1 = _r1 << 7; - _g0 = _g0 << 7; - _g1 = _g1 << 7; - _b0 = _b0 << 7; - _b1 = _b1 << 7; + _r0 = v_shl<7>(_r0); // Q0.9.7 un-signed + _r1 = v_shl<7>(_r1); + _g0 = v_shl<7>(_g0); + _g1 = v_shl<7>(_g1); + _b0 = v_shl<7>(_b0); + _b1 = v_shl<7>(_b1); v_uint16 _y0, _y1; - _y0 = v_mul_hi(vx_setall_u16(c0), _r0) // Q0.9.7 - + v_mul_hi(vx_setall_u16(c1), _g0) - + v_mul_hi(vx_setall_u16(c2), _b0); - _y1 = v_mul_hi(vx_setall_u16(c0), _r1) - + v_mul_hi(vx_setall_u16(c1), _g1) - + v_mul_hi(vx_setall_u16(c2), _b1); + _y0 = v_add(v_add(v_mul_hi(vx_setall_u16(c0), _r0), v_mul_hi(vx_setall_u16(c1), _g0)), v_mul_hi(vx_setall_u16(c2), _b0)); + _y1 = v_add(v_add(v_mul_hi(vx_setall_u16(c0), _r1), v_mul_hi(vx_setall_u16(c1), _g1)), v_mul_hi(vx_setall_u16(c2), _b1)); v_int16 r0, r1, b0, b1, y0, y1; r0 = v_reinterpret_as_s16(_r0); // Q1.8.7 signed @@ -785,18 +775,18 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef y1 = v_reinterpret_as_s16(_y1); v_int16 u0, u1, v0, v1; - u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0); // Q1.12.3 - u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1); - v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0); - v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1); + u0 = v_mul_hi(vx_setall_s16(c3), v_sub(b0, y0)); // Q1.12.3 + u1 = v_mul_hi(vx_setall_s16(c3), v_sub(b1, y1)); + v0 = v_mul_hi(vx_setall_s16(c4), v_sub(r0, y0)); + v1 = v_mul_hi(vx_setall_s16(c4), v_sub(r1, y1)); v_uint8 y, u, v; - y = v_pack((_y0 + vx_setall_u16(1 << 6)) >> 7, - (_y1 + vx_setall_u16(1 << 6)) >> 7); - u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3, 
// 257 << 2 = 128.5 * (1 << 3) - (u1 + vx_setall_s16(257 << 2)) >> 3); - v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3, - (v1 + vx_setall_s16(257 << 2)) >> 3); + y = v_pack(v_shr<7>(v_add(_y0, vx_setall_u16(1 << 6))), + v_shr<7>(v_add(_y1, vx_setall_u16(1 << 6)))); + u = v_pack_u(v_shr<3>(v_add(u0, vx_setall_s16(257 << 2))), // 257 << 2 = 128.5 * (1 << 3) + v_shr<3>(v_add(u1, vx_setall_s16(257 << 2)))); + v = v_pack_u(v_shr<3>(v_add(v0, vx_setall_s16(257 << 2))), + v_shr<3>(v_add(v1, vx_setall_s16(257 << 2)))); v_store_interleave(&out[3*w], y, u, v); } @@ -825,8 +815,8 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef int w = 0; -#if CV_SIMD - static const int nlanes = v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + static const int nlanes = VTraits::vlanes(); for ( ; w <= width - nlanes; w += nlanes) { v_uint8 y, u, v; @@ -845,30 +835,28 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef v0 = v_reinterpret_as_s16(_v0); v1 = v_reinterpret_as_s16(_v1); - y0 = y0 << 3; // Q1.12.3 - y1 = y1 << 3; - u0 = (u0 - vx_setall_s16(128)) << 7; // Q1.8.7 - u1 = (u1 - vx_setall_s16(128)) << 7; - v0 = (v0 - vx_setall_s16(128)) << 7; - v1 = (v1 - vx_setall_s16(128)) << 7; + y0 = v_shl<3>(y0); // Q1.12.3 + y1 = v_shl<3>(y1); + u0 = v_shl<7>(v_sub(u0, vx_setall_s16(128))); // Q1.8.7 + u1 = v_shl<7>(v_sub(u1, vx_setall_s16(128))); + v0 = v_shl<7>(v_sub(v0, vx_setall_s16(128))); + v1 = v_shl<7>(v_sub(v1, vx_setall_s16(128))); v_int16 r0, r1, g0, g1, b0, b1; - r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0); // Q1.12.3 - r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1); - g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0) - + v_mul_hi(vx_setall_s16(c2), v0); - g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1) - + v_mul_hi(vx_setall_s16(c2), v1); - b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0); - b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1); + r0 = v_add(y0, v_mul_hi(vx_setall_s16(c0), v0)); // Q1.12.3 + r1 = v_add(y1, v_mul_hi(vx_setall_s16(c0), v1)); + g0 = v_add(v_add(y0, v_mul_hi(vx_setall_s16(c1), u0)), v_mul_hi(vx_setall_s16(c2), v0)); + g1 = v_add(v_add(y1, v_mul_hi(vx_setall_s16(c1), u1)), v_mul_hi(vx_setall_s16(c2), v1)); + b0 = v_add(y0, v_mul_hi(vx_setall_s16(c3), u0)); + b1 = v_add(y1, v_mul_hi(vx_setall_s16(c3), u1)); v_uint8 r, g, b; - r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3, - (r1 + vx_setall_s16(1 << 2)) >> 3); - g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3, - (g1 + vx_setall_s16(1 << 2)) >> 3); - b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3, - (b1 + vx_setall_s16(1 << 2)) >> 3); + r = v_pack_u(v_shr<3>(v_add(r0, vx_setall_s16(1 << 2))), + v_shr<3>(v_add(r1, vx_setall_s16(1 << 2)))); + g = v_pack_u(v_shr<3>(v_add(g0, vx_setall_s16(1 << 2))), + v_shr<3>(v_add(g1, vx_setall_s16(1 << 2)))); + b = v_pack_u(v_shr<3>(v_add(b0, vx_setall_s16(1 << 2))), + v_shr<3>(v_add(b1, vx_setall_s16(1 << 2)))); v_store_interleave(&out[3*w], r, g, b); } @@ -920,41 +908,37 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width) v_expand(g, gg1, gg2); v_expand(b, bb1, bb2); - rr1 = rr1 << 7; - rr2 = rr2 << 7; - gg1 = gg1 << 7; - gg2 = gg2 << 7; - bb1 = bb1 << 7; - bb2 = bb2 << 7; + rr1 = v_shl<7>(rr1); + rr2 = v_shl<7>(rr2); + gg1 = v_shl<7>(gg1); + gg2 = v_shl<7>(gg2); + bb1 = v_shl<7>(bb1); + bb2 = v_shl<7>(bb2); v_uint16x8 yy1, yy2; - yy1 = v_mul_hi(v_setall_u16(c0), rr1) + - v_mul_hi(v_setall_u16(c1), gg1) + - v_mul_hi(v_setall_u16(c2), bb1); + yy1 = v_add(v_add(v_mul_hi(v_setall_u16(c0), rr1), v_mul_hi(v_setall_u16(c1), gg1)), 
v_mul_hi(v_setall_u16(c2), bb1)); - yy2 = v_mul_hi(v_setall_u16(c0), rr2) + - v_mul_hi(v_setall_u16(c1), gg2) + - v_mul_hi(v_setall_u16(c2), bb2); + yy2 = v_add(v_add(v_mul_hi(v_setall_u16(c0), rr2), v_mul_hi(v_setall_u16(c1), gg2)), v_mul_hi(v_setall_u16(c2), bb2)); v_int16x8 u1, u2, v1, v2; - u1 = v_mul_hi(v_setall_s16(c3), v_reinterpret_as_s16(bb1) - v_reinterpret_as_s16(yy1)); - u2 = v_mul_hi(v_setall_s16(c3), v_reinterpret_as_s16(bb2) - v_reinterpret_as_s16(yy2)); - v1 = v_mul_hi(v_setall_s16(c4), v_reinterpret_as_s16(rr1) - v_reinterpret_as_s16(yy1)); - v2 = v_mul_hi(v_setall_s16(c4), v_reinterpret_as_s16(rr2) - v_reinterpret_as_s16(yy2)); + u1 = v_mul_hi(v_setall_s16(c3), v_sub(v_reinterpret_as_s16(bb1), v_reinterpret_as_s16(yy1))); + u2 = v_mul_hi(v_setall_s16(c3), v_sub(v_reinterpret_as_s16(bb2), v_reinterpret_as_s16(yy2))); + v1 = v_mul_hi(v_setall_s16(c4), v_sub(v_reinterpret_as_s16(rr1), v_reinterpret_as_s16(yy1))); + v2 = v_mul_hi(v_setall_s16(c4), v_sub(v_reinterpret_as_s16(rr2), v_reinterpret_as_s16(yy2))); - y = v_pack((yy1 + v_setall_u16(1 << 6)) >> 7, - (yy2 + v_setall_u16(1 << 6)) >> 7); - u = v_pack_u((u1 + v_setall_s16(257 << 2)) >> 3, - (u2 + v_setall_s16(257 << 2)) >> 3); - v = v_pack_u((v1 + v_setall_s16(257 << 2)) >> 3, - (v2 + v_setall_s16(257 << 2)) >> 3); + y = v_pack(v_shr<7>(v_add(yy1, v_setall_u16(1 << 6))), + v_shr<7>(v_add(yy2, v_setall_u16(1 << 6)))); + u = v_pack_u(v_shr<3>(v_add(u1, v_setall_s16(257 << 2))), + v_shr<3>(v_add(u2, v_setall_s16(257 << 2)))); + v = v_pack_u(v_shr<3>(v_add(v1, v_setall_s16(257 << 2))), + v_shr<3>(v_add(v2, v_setall_s16(257 << 2)))); uint8_t ff = 0xff; v_uint8x16 mask(ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0); - v_uint8x16 uu = u & mask; - v_uint8x16 vv = v & mask; + v_uint8x16 uu = v_and(u, mask); + v_uint8x16 vv = v_and(v, mask); // extract even u and v v_uint8x16 u_low = v_pack(v_reinterpret_as_u16(uu), v_reinterpret_as_u16(uu)); v_uint8x16 v_low = v_pack(v_reinterpret_as_u16(vv), v_reinterpret_as_u16(vv)); @@ -1001,7 +985,7 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width) // //----------------------------- -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // this variant not using buf[] appears 15% faster than reference any-2-float code below template static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan, @@ -1016,7 +1000,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, for (int l=0; l < length; ) { - static const int nlanes = v_float32::nlanes; + static const int nlanes = VTraits::vlanes(); // main part for ( ; l <= length - nlanes; l += nlanes) @@ -1026,7 +1010,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, v_float32 t0 = vx_load_f32(&i[l - shift]); v_float32 t1 = vx_load_f32(&i[l ]); v_float32 t2 = vx_load_f32(&i[l + shift]); - v_float32 t = t0 * vx_setall_f32(kx0); + v_float32 t = v_mul(t0, vx_setall_f32(kx0)); t = v_fma(t1, vx_setall_f32(kx1), t); t = v_fma(t2, vx_setall_f32(kx2), t); return t; @@ -1035,7 +1019,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, v_float32 s0 = xsum(in[0]); v_float32 s1 = xsum(in[1]); v_float32 s2 = xsum(in[2]); - v_float32 s = s0 * vx_setall_f32(ky0); + v_float32 s = v_mul(s0, vx_setall_f32(ky0)); s = v_fma(s1, vx_setall_f32(ky1), s); s = v_fma(s2, vx_setall_f32(ky2), s); @@ -1097,16 +1081,16 @@ static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, in for (int l=0; l < length;) { - 
constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); // main part of row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + v_float32 sum0 = v_mul(vx_load(&buf[r0][l]), vx_setall_f32(ky0)); sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0); + v_float32 sum1 = v_mul(vx_load(&buf[r0][l + nlanes / 2]), vx_setall_f32(ky0)); sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1); sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1); @@ -1181,24 +1165,24 @@ static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, i for (int l=0; l < length;) { - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); // main part of row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + v_float32 sum0 = v_mul(vx_load(&buf[r0][l]), vx_setall_f32(ky0)); sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0); + v_float32 sum1 = v_mul(vx_load(&buf[r0][l + nlanes / 4]), vx_setall_f32(ky0)); sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1); sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1); - v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0); + v_float32 sum2 = v_mul(vx_load(&buf[r0][l + 2 * nlanes / 4]), vx_setall_f32(ky0)); sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2); sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2); - v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0); + v_float32 sum3 = v_mul(vx_load(&buf[r0][l + 3 * nlanes / 4]), vx_setall_f32(ky0)); sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3); sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3); @@ -1284,7 +1268,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt { for (int l=0; l < length;) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -1292,9 +1276,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt v_uint16 t0 = vx_load_expand(&in[k][l - shift]); // previous v_uint16 t1 = vx_load_expand(&in[k][l ]); // current v_uint16 t2 = vx_load_expand(&in[k][l + shift]); // next pixel - v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) + - v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) + - v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2); + v_int16 t = v_add(v_add(v_mul(v_reinterpret_as_s16(t0), vx_setall_s16(ikx0)), v_mul(v_reinterpret_as_s16(t1), vx_setall_s16(ikx1))), v_mul(v_reinterpret_as_s16(t2), vx_setall_s16(ikx2))); v_store(&ibuf[r[k]][l], t); } @@ -1311,7 +1293,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt for (int l=0; l < length;) { - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -1319,13 +1301,11 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt v_int16 s0 = vx_load(&ibuf[r[0]][l]); // previous v_int16 s1 = 
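The `constexpr int nlanes = v_int16::nlanes` to `const int nlanes = VTraits<v_int16>::vlanes()` change recurs through the rest of the patch: on scalable targets the lane count is a run-time property of the hardware, so it can no longer be a compile-time constant. A self-contained sketch of the resulting loop shape:

```cpp
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void scale_rows(float* buf, int length, float k)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int nlanes = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
    int l = 0;
    for (; l <= length - nlanes; l += nlanes)          // main vector part
        v_store(buf + l, v_mul(vx_load(buf + l), vx_setall_f32(k)));
    for (; l < length; ++l)                            // scalar tail
        buf[l] *= k;
#else
    for (int l = 0; l < length; ++l)
        buf[l] *= k;
#endif
}
```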
vx_load(&ibuf[r[1]][l]); // current v_int16 s2 = vx_load(&ibuf[r[2]][l]); // next row - v_int16 s = s0 * vx_setall_s16(iky0) + - s1 * vx_setall_s16(iky1) + - s2 * vx_setall_s16(iky2); + v_int16 s = v_add(v_add(v_mul(s0, vx_setall_s16(iky0)), v_mul(s1, vx_setall_s16(iky1))), v_mul(s2, vx_setall_s16(iky2))); if (!noscale) { - s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta); + s = v_add(v_mul_hi(v_shl<1>(s), vx_setall_s16(iscale)), vx_setall_s16(idelta)); } v_store(&out[l], s); @@ -1399,7 +1379,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha float scale, float delta, float *buf[], int y, int y0) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int length = width * chan; // length variable may be unused if types do not match at 'if' statements below @@ -1407,7 +1387,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha #if USE_SEPFILTER3X3_CHAR2SHORT if (std::is_same::value && std::is_same::value && - length >= v_int16::nlanes) + length >= VTraits::vlanes()) { // only slightly faster than more generic any-to-short (see below) run_sepfilter3x3_char2short(reinterpret_cast(out), @@ -1419,7 +1399,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha #endif if (std::is_same::value && std::is_same::value && - length >= v_float32::nlanes) + length >= VTraits::vlanes()) { // appears 15% faster than reference any-to-float code (called below) run_sepfilter3x3_any2float(reinterpret_cast(out), in, @@ -1427,7 +1407,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha return; } - if (std::is_same::value && length >= v_int16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { // appears 10-40x faster than reference due to much faster rounding run_sepfilter3x3_any2short(reinterpret_cast(out), in, @@ -1436,7 +1416,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha return; } - if (std::is_same::value && length >= v_uint16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { // appears 10-40x faster than reference due to much faster rounding run_sepfilter3x3_any2short(reinterpret_cast(out), in, @@ -1445,7 +1425,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha return; } - if (std::is_same::value && length >= v_uint8::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { // appears 10-40x faster than reference due to much faster rounding run_sepfilter3x3_any2char(reinterpret_cast(out), in, @@ -1499,7 +1479,7 @@ RUN_SEPFILTER3X3_IMPL(float, float) // //----------------------------- -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // this code with manually vectored rounding to uchar template @@ -1549,17 +1529,17 @@ static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, i // vertical pass - constexpr int nlanes = v_uint8::nlanes; + const int nlanes = VTraits::vlanes(); for (int l = 0; l < length;) { // main part of row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]); - v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 4]) * vx_setall_f32(ky[0]); - v_float32 sum2 = vx_load(&buf[r[0]][l + 2 * nlanes / 4]) * vx_setall_f32(ky[0]); - v_float32 sum3 = vx_load(&buf[r[0]][l + 3 * nlanes / 4]) * vx_setall_f32(ky[0]); + v_float32 sum0 = v_mul(vx_load(&buf[r[0]][l]), vx_setall_f32(ky[0])); + v_float32 sum1 = v_mul(vx_load(&buf[r[0]][l + nlanes / 4]), vx_setall_f32(ky[0])); + 
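The `if (!noscale)` branch keeps the same Q0.15 trick in both the 3x3 and 5x5 kernels: with `iscale` close to `scale * (1 << 15)`, `v_mul_hi(s << 1, iscale)` computes `((2*s) * iscale) >> 16`, which approximates `s * scale` entirely in 16-bit lanes. A scalar check of that identity (illustration only; the `iscale`/`idelta` derivations are assumed from the surrounding code):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const float scale = 0.25f, delta = 3.0f;
    const short iscale = (short)std::lround(scale * (1 << 15));     // Q0.15 coefficient
    const short idelta = (short)std::lround(delta);
    for (short s = -1000; s <= 1000; s = (short)(s + 125))
    {
        short hi = (short)(((int)(short)(s << 1) * iscale) >> 16);  // v_mul_hi(s << 1, iscale)
        printf("s=%5d  fixed=%6d  float=%8.2f\n", s, hi + idelta, s * scale + delta);
    }
    return 0;
}
```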
v_float32 sum2 = v_mul(vx_load(&buf[r[0]][l + 2 * nlanes / 4]), vx_setall_f32(ky[0])); + v_float32 sum3 = v_mul(vx_load(&buf[r[0]][l + 3 * nlanes / 4]), vx_setall_f32(ky[0])); for (int n = 1; n < kyLen; ++n) { @@ -1647,15 +1627,15 @@ static void run_sepfilter5x5_any2short(DST out[], const SRC *in[], int width, in // vertical pass - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); for (int l = 0; l < length;) { //GAPI_Assert(length >= nlanes); // main part of row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]); - v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 2]) * vx_setall_f32(ky[0]); + v_float32 sum0 = v_mul(vx_load(&buf[r[0]][l]), vx_setall_f32(ky[0])); + v_float32 sum1 = v_mul(vx_load(&buf[r[0]][l + nlanes / 2]), vx_setall_f32(ky[0])); for (int j = 1; j < kyLen; ++j) { @@ -1702,14 +1682,10 @@ static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width, const float kx[], const float ky[], int border, float scale, float delta) { - constexpr int kxLen = 5; - constexpr int kyLen = kxLen; - constexpr int buffSize = 5; - const int length = width * chan; const int shift = chan; - static const int nlanes = v_float32::nlanes; + static const int nlanes = VTraits::vlanes(); for (int l = 0; l < length; ) { //GAPI_Assert(length >= nlanes); @@ -1717,33 +1693,33 @@ static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width, for (; l <= length - nlanes; l += nlanes) { auto xsum = [l, border, shift, kx](const SRC inp[]) - { - v_float32 t[5]; - for (int i = 0; i < 5; ++i) - { - t[i] = vx_load_f32(&inp[l + (i - border)*shift]); - } + { //buffSize = 5 + v_float32 t0 = vx_load_f32(&inp[l + (0 - border)*shift]); + v_float32 t1 = vx_load_f32(&inp[l + (1 - border)*shift]); + v_float32 t2 = vx_load_f32(&inp[l + (2 - border)*shift]); + v_float32 t3 = vx_load_f32(&inp[l + (3 - border)*shift]); + v_float32 t4 = vx_load_f32(&inp[l + (4 - border)*shift]); - v_float32 sum = t[0] * vx_setall_f32(kx[0]); - for (int j = 1; j < 5; ++j) - { - sum = v_fma(t[j], vx_setall_f32(kx[j]), sum); - } + v_float32 sum = v_mul(t0, vx_setall_f32(kx[0])); + sum = v_fma(t1, vx_setall_f32(kx[1]), sum); + sum = v_fma(t2, vx_setall_f32(kx[2]), sum); + sum = v_fma(t3, vx_setall_f32(kx[3]), sum); + sum = v_fma(t4, vx_setall_f32(kx[4]), sum); return sum; }; - v_float32 s[buffSize]; - for (int m = 0; m < buffSize; ++m) - { - s[m] = xsum(in[m]); - } + v_float32 s0 = xsum(in[0]); + v_float32 s1 = xsum(in[1]); + v_float32 s2 = xsum(in[2]); + v_float32 s3 = xsum(in[3]); + v_float32 s4 = xsum(in[4]); - v_float32 sum = s[0] * vx_setall_f32(ky[0]); - for (int n = 1; n < kyLen; ++n) - { - sum = v_fma(s[n], vx_setall_f32(ky[n]), sum); - } + v_float32 sum = v_mul(s0, vx_setall_f32(ky[0])); + sum = v_fma(s1, vx_setall_f32(ky[1]), sum); + sum = v_fma(s2, vx_setall_f32(ky[2]), sum); + sum = v_fma(s3, vx_setall_f32(ky[3]), sum); + sum = v_fma(s4, vx_setall_f32(ky[4]), sum); if (!noscale) { @@ -1819,7 +1795,7 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt // this kernel (Fluid does rows consequently: y=y0, y0+1, ...) int k0 = (y == y0) ? 
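The unrolling of `v_float32 t[5]` and `s[buffSize]` into named variables is not a style choice: sizeless scalable vector types cannot be array elements, so every small vector array in these kernels has to become individual locals. The horizontal 5-tap sum then reads as a straight FMA chain, sketched here under that assumption:

```cpp
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Hypothetical free-standing form of the xsum lambda (border == 2 for a 5-tap row).
cv::v_float32 xsum5(const float* in, int shift, const float kx[5])
{
    using namespace cv;
    v_float32 sum = v_mul(vx_load(in - 2 * shift), vx_setall_f32(kx[0]));
    sum = v_fma(vx_load(in -     shift), vx_setall_f32(kx[1]), sum);
    sum = v_fma(vx_load(in            ), vx_setall_f32(kx[2]), sum);
    sum = v_fma(vx_load(in +     shift), vx_setall_f32(kx[3]), sum);
    sum = v_fma(vx_load(in + 2 * shift), vx_setall_f32(kx[4]), sum);
    return sum;
}
#endif
```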
0 : 4; - constexpr int nlanes = v_int16::nlanes; + const int nlanes = VTraits::vlanes(); for (int k = k0; k < kyLen; ++k) { @@ -1830,16 +1806,18 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt // main part of output row for (; l <= length - nlanes; l += nlanes) { - v_uint16 t[kxLen]; v_int16 sum = vx_setzero_s16(); - for (int i = 0; i < kxLen; ++i) - { - // previous, current, next pixels - t[i] = vx_load_expand(&in[k][l + (i - border)*shift]); + auto process = [&](int i) { + v_uint16 t = vx_load_expand(&in[k][l + (i - border)*shift]); + return v_add(sum, v_mul(v_reinterpret_as_s16(t), vx_setall_s16(ikx[i]))); + }; - sum += v_reinterpret_as_s16(t[i]) * vx_setall_s16(ikx[i]); - } + sum = process(0); + sum = process(1); + sum = process(2); + sum = process(3); + sum = process(4); v_store(&ibuf[r[k]][l], sum); } @@ -1861,20 +1839,21 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt // main part of output row for (; l <= length - nlanes; l += nlanes) { - v_int16 s[buffSize]; v_int16 sum = vx_setzero_s16(); - for (int i = 0; i < kyLen; ++i) - { - // previous, current, next rows - s[i] = vx_load(&ibuf[r[i]][l]); - - sum += s[i] * vx_setall_s16(iky[i]); - } + auto process = [&](int i) { + v_int16 s = vx_load(&ibuf[r[i]][l]); + return v_add(sum, v_mul(s, vx_setall_s16(iky[i]))); + }; + sum = process(0); + sum = process(1); + sum = process(2); + sum = process(3); + sum = process(4); if (!noscale) { - sum = v_mul_hi(sum << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta); + sum = v_add(v_mul_hi(v_shl<1>(sum), vx_setall_s16(iscale)), vx_setall_s16(idelta)); } v_store(&out[l], sum); @@ -1965,14 +1944,14 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha const float kx[], const float ky[], int border, float scale, float delta, float *buf[], int y, int y0) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int length = width * chan; // length variable may be unused if types do not match at 'if' statements below (void)length; if (std::is_same::value && std::is_same::value && - length >= v_int16::nlanes) + length >= VTraits::vlanes()) { run_sepfilter5x5_char2short(reinterpret_cast(out), reinterpret_cast(in), @@ -1982,14 +1961,14 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha } if (std::is_same::value && std::is_same::value && - length >= v_float32::nlanes) + length >= VTraits::vlanes()) { run_sepfilter5x5_any2float(reinterpret_cast(out), in, width, chan, kx, ky, border, scale, delta); return; } - if (std::is_same::value && length >= v_int16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_sepfilter5x5_any2short(reinterpret_cast(out), in, width, chan, kx, ky, border, scale, delta, @@ -1997,7 +1976,7 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha return; } - if (std::is_same::value && length >= v_uint16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_sepfilter5x5_any2short(reinterpret_cast(out), in, width, chan, kx, ky, border, scale, delta, @@ -2005,7 +1984,7 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha return; } - if (std::is_same::value && length >= v_uint8::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_sepfilter5x5_any2char(reinterpret_cast(out), in, width, chan, kx, ky, border, scale, delta, @@ -2086,7 +2065,7 @@ static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, in } } -#if CV_SIMD +#if 
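The `process` lambdas above replace the `sum += t[i] * k[i]` loops for the same sizeless-array reason; since `sum` is captured by reference and reassigned after each call, the five sequential calls accumulate exactly as the loop did. A scalar analogue of that control flow:

```cpp
#include <cstdio>

int main()
{
    const int v[5] = {1, 2, 3, 4, 5}, k[5] = {1, 4, 6, 4, 1};
    int sum = 0;
    auto process = [&](int i) { return sum + v[i] * k[i]; };
    sum = process(0);
    sum = process(1);
    sum = process(2);
    sum = process(3);
    sum = process(4);
    printf("%d\n", sum);   // 1 + 8 + 18 + 16 + 5 = 48
    return 0;
}
```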
(CV_SIMD || CV_SIMD_SCALABLE) // assume DST is short or ushort template static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan, @@ -2106,14 +2085,14 @@ static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, in for (int l=0; l < length;) { - static constexpr int nlanes = v_int16::nlanes; + static const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) { auto sumx = [in, shift, &k](int i, int j) { - v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]); + v_float32 s = v_mul(vx_load_f32(&in[i][j - shift]), vx_setall_f32(k[i][0])); s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s); s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s); return s; @@ -2121,8 +2100,8 @@ static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, in int l0 = l; int l1 = l + nlanes/2; - v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0); - v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1); + v_float32 sum0 = v_add(sumx(0, l0), sumx(1, l0), sumx(2, l0)); + v_float32 sum1 = v_add(sumx(0, l1), sumx(1, l1), sumx(2, l1)); if (!noscale) { @@ -2172,14 +2151,14 @@ static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, i for (int l=0; l < length;) { - static constexpr int nlanes = v_uint8::nlanes; + static const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) { auto sumx = [in, shift, &k](int i, int j) { - v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]); + v_float32 s = v_mul(vx_load_f32(&in[i][j - shift]), vx_setall_f32(k[i][0])); s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s); s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s); return s; @@ -2189,10 +2168,10 @@ static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, i int l1 = l + nlanes/4; int l2 = l + 2*nlanes/4; int l3 = l + 3*nlanes/4; - v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0); - v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1); - v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2); - v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3); + v_float32 sum0 = v_add(sumx(0, l0), sumx(1, l0), sumx(2, l0)); + v_float32 sum1 = v_add(sumx(0, l1), sumx(1, l1), sumx(2, l1)); + v_float32 sum2 = v_add(sumx(0, l2), sumx(1, l2), sumx(2, l2)); + v_float32 sum3 = v_add(sumx(0, l3), sumx(1, l3), sumx(2, l3)); if (!noscale) { @@ -2228,20 +2207,20 @@ template static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan, const float kernel[], float scale, float delta) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int length = width * chan; // length variable may be unused if types do not match at 'if' statements below (void) length; - if (std::is_same::value && length >= v_int16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_filter2d_3x3_any2short(reinterpret_cast(out), in, width, chan, kernel, scale, delta); return; } - if (std::is_same::value && length >= v_uint16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_filter2d_3x3_any2short(reinterpret_cast(out), in, width, chan, kernel, scale, delta); @@ -2249,7 +2228,7 @@ static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int cha } - if (std::is_same::value && length >= v_uint8::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { 
run_filter2d_3x3_any2char(reinterpret_cast(out), in, width, chan, kernel, scale, delta); @@ -2446,7 +2425,7 @@ static void run_morphology3x3_reference(T out[], const T *in[], int width, int c CV_Error(cv::Error::StsBadArg, "unsupported morphology"); } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, const uchar k[], MorphShape k_type, @@ -2467,7 +2446,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, { for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2503,7 +2482,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, { for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2537,7 +2516,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2575,7 +2554,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, { for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2611,7 +2590,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, { for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2645,7 +2624,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan, for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2686,13 +2665,13 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan, const uchar k[], MorphShape k_type, Morphology morphology) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int length = width * chan; // length variable may be unused if types do not match at 'if' statements below (void) length; - if (std::is_same::value && length >= v_float32::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_morphology3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2701,7 +2680,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan, return; } - if (std::is_same::value && length >= v_int16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_morphology3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2710,7 +2689,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan, return; } - if (std::is_same::value && length >= v_uint16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_morphology3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2719,7 +2698,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan, return; } - if (std::is_same::value && length >= v_uint8::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_morphology3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2796,7 +2775,7 @@ static void 
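All of the `run_*_code` dispatchers follow one idiom, repeatedly touched by this patch: compile-time `std::is_same` checks select a specialised kernel, and the run-time guard that used to compare the row length against `v_*::nlanes` now queries `VTraits<...>::vlanes()`. A condensed sketch, with the kernel call left as a placeholder:

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <type_traits>
using namespace cv;

template<typename DST, typename SRC>
bool try_simd_path(DST* out, const SRC* in, int length)
{
#if (CV_SIMD || CV_SIMD_SCALABLE)
    if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
        length >= VTraits<v_int16>::vlanes())        // row long enough for one vector
    {
        // run_specialised_kernel(out, in, length);  // hypothetical kernel call
        return true;
    }
#endif
    (void)out; (void)in; (void)length;
    return false;                                    // caller falls back to reference code
}
```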
run_medblur3x3_reference(T out[], const T *in[], int width, int chan } } -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan) { @@ -2808,7 +2787,7 @@ static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan) for (int l=0; l < length;) { - constexpr int nlanes = VT::nlanes; + const int nlanes = VTraits::vlanes(); // main part of output row for (; l <= length - nlanes; l += nlanes) @@ -2866,13 +2845,13 @@ static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan) template static void run_medblur3x3_code(T out[], const T *in[], int width, int chan) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) int length = width * chan; // length variable may be unused if types do not match at 'if' statements below (void) length; - if (std::is_same::value && length >= v_float32::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_medblur3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2880,7 +2859,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan) return; } - if (std::is_same::value && length >= v_int16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_medblur3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2888,7 +2867,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan) return; } - if (std::is_same::value && length >= v_uint16::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_medblur3x3_simd(reinterpret_cast(out), reinterpret_cast(in), @@ -2896,7 +2875,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan) return; } - if (std::is_same::value && length >= v_uint8::nlanes) + if (std::is_same::value && length >= VTraits::vlanes()) { run_medblur3x3_simd(reinterpret_cast(out), reinterpret_cast(in), diff --git a/modules/gapi/src/backends/streaming/gstreamingbackend.cpp b/modules/gapi/src/backends/streaming/gstreamingbackend.cpp index ae7125f2e5..0f966648d8 100644 --- a/modules/gapi/src/backends/streaming/gstreamingbackend.cpp +++ b/modules/gapi/src/backends/streaming/gstreamingbackend.cpp @@ -159,7 +159,7 @@ struct Copy: public cv::detail::KernelTag return cv::gapi::streaming::IActor::Ptr(new Actor(args)); } - static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }; + static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; } }; void Copy::Actor::run(cv::gimpl::GIslandExecutable::IInput &in, @@ -249,7 +249,7 @@ struct GOCVBGR: public cv::detail::KernelTag { return cv::gapi::streaming::IActor::Ptr(new Actor(args)); } - static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }; + static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; } }; void GOCVBGR::Actor::extractRMat(const cv::MediaFrame& frame, cv::RMat& rmat) @@ -323,7 +323,7 @@ struct GOCVY: public cv::detail::KernelTag { return cv::gapi::streaming::IActor::Ptr(new Actor(args)); } - static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }; + static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; } }; void GOCVY::Actor::extractRMat(const cv::MediaFrame& frame, cv::RMat& rmat) @@ -389,7 +389,7 @@ struct GOCVUV: public cv::detail::KernelTag { return cv::gapi::streaming::IActor::Ptr(new Actor(args)); } - static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }; + static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; } }; void 
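From here the patch turns to warning hygiene: the recurring `};` to `}` edits remove stray semicolons after function bodies. For member functions clang flags these under `-Wextra-semi`; at namespace scope they are merely redundant empty declarations. A two-line illustration:

```cpp
struct Copyish
{
    static int kernel() { return 0; };   // clang -Wextra-semi: extra ';' after member function body
    static int fixed()  { return 0; }    // cleaned-up form
};
```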
GOCVUV::Actor::extractRMat(const cv::MediaFrame& frame, cv::RMat& rmat) diff --git a/modules/gapi/src/compiler/gcompiled_priv.hpp b/modules/gapi/src/compiler/gcompiled_priv.hpp index 3f873aba23..20b76781a5 100644 --- a/modules/gapi/src/compiler/gcompiled_priv.hpp +++ b/modules/gapi/src/compiler/gcompiled_priv.hpp @@ -27,7 +27,7 @@ namespace cv { namespace gimpl { struct GRuntimeArgs; -}; +} // FIXME: GAPI_EXPORTS is here only due to tests and Windows linker issues class GAPI_EXPORTS GCompiled::Priv diff --git a/modules/gapi/src/compiler/gstreaming_priv.hpp b/modules/gapi/src/compiler/gstreaming_priv.hpp index 0fd5fc7b7f..fc5ba73be0 100644 --- a/modules/gapi/src/compiler/gstreaming_priv.hpp +++ b/modules/gapi/src/compiler/gstreaming_priv.hpp @@ -16,7 +16,7 @@ namespace cv { namespace gimpl { struct GRuntimeArgs; -}; +} // FIXME: GAPI_EXPORTS is here only due to tests and Windows linker issues // FIXME: It seems it clearly duplicates the GStreamingCompiled and diff --git a/modules/gapi/src/compiler/passes/pattern_matching.cpp b/modules/gapi/src/compiler/passes/pattern_matching.cpp index d52b48a631..71ed859413 100644 --- a/modules/gapi/src/compiler/passes/pattern_matching.cpp +++ b/modules/gapi/src/compiler/passes/pattern_matching.cpp @@ -73,7 +73,7 @@ bool compareDataNodes(const ade::NodeHandle& first, const std::vector().port; } -}; +} inline bool IS_STARTPOINT(const ade::NodeHandle& nh){ return nh->inEdges().empty(); diff --git a/modules/gapi/test/common/gapi_core_tests_inl.hpp b/modules/gapi/test/common/gapi_core_tests_inl.hpp index 11b6e066a6..ae81ca2055 100644 --- a/modules/gapi/test/common/gapi_core_tests_inl.hpp +++ b/modules/gapi/test/common/gapi_core_tests_inl.hpp @@ -1699,7 +1699,7 @@ namespace { return cv::MediaFrame::View(std::move(pp), std::move(ss)); } }; -}; +} namespace { class TestMediaGray final : public cv::MediaFrame::IAdapter { @@ -1718,7 +1718,7 @@ namespace { return cv::MediaFrame::View(std::move(pp), std::move(ss)); } }; -}; +} TEST_P(SizeMFTest, ParseTest) { diff --git a/modules/gapi/test/common/gapi_render_tests.cpp b/modules/gapi/test/common/gapi_render_tests.cpp index e29406d783..abfef99121 100644 --- a/modules/gapi/test/common/gapi_render_tests.cpp +++ b/modules/gapi/test/common/gapi_render_tests.cpp @@ -92,6 +92,6 @@ void blendImageRef(cv::Mat& mat, const cv::Point& org, const cv::Mat& img, const roi32f += img32f; roi32f.convertTo(roi, CV_8U, 255.0); -}; +} } // namespace opencv_test diff --git a/modules/gapi/test/common/gapi_render_tests.hpp b/modules/gapi/test/common/gapi_render_tests.hpp index 30caca9e6d..73924d96ac 100644 --- a/modules/gapi/test/common/gapi_render_tests.hpp +++ b/modules/gapi/test/common/gapi_render_tests.hpp @@ -115,7 +115,7 @@ struct Fixture : public RenderNV12TestBase API { \ __WRAP_VAARGS(DEFINE_SPECIFIC_PARAMS_##Number(__VA_ARGS__)) \ Fixture() { \ Init(sz_); \ - }; \ + } \ }; #define GAPI_RENDER_TEST_FIXTURE_BGR(Fixture, API, Number, ...) \ @@ -123,7 +123,7 @@ struct Fixture : public RenderBGRTestBase API { \ __WRAP_VAARGS(DEFINE_SPECIFIC_PARAMS_##Number(__VA_ARGS__)) \ Fixture() { \ Init(sz_); \ - }; \ + } \ }; #define GET_VA_ARGS(...) 
__VA_ARGS__ diff --git a/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp b/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp index b462e701f2..850b0e2e6c 100644 --- a/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp +++ b/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp @@ -165,7 +165,7 @@ namespace out = true; } }; -}; +} TEST(StatefulKernel, StateInitOnceInRegularMode) { @@ -190,7 +190,7 @@ TEST(StatefulKernel, StateInitOnceInRegularMode) EXPECT_TRUE(params.pSetupsCount != nullptr); EXPECT_EQ(1, *params.pSetupsCount); } -}; +} struct StateInitOnce : public ::testing::TestWithParam{}; TEST_P(StateInitOnce, StreamingCompiledWithMeta) diff --git a/modules/gapi/test/gapi_async_test.cpp b/modules/gapi/test/gapi_async_test.cpp index 5a7194a17f..7086f47c5c 100644 --- a/modules/gapi/test/gapi_async_test.cpp +++ b/modules/gapi/test/gapi_async_test.cpp @@ -207,7 +207,7 @@ struct CallBack: crtp_cast { mtx.unlock(); cv.notify_one(); }; - }; + } template void start_async(Args&&... args){ diff --git a/modules/gapi/test/gapi_fluid_test.cpp b/modules/gapi/test/gapi_fluid_test.cpp index 03c98e3ef3..22884934ed 100644 --- a/modules/gapi/test/gapi_fluid_test.cpp +++ b/modules/gapi/test/gapi_fluid_test.cpp @@ -28,12 +28,12 @@ namespace void WriteFunction(uint8_t* row, int nr, int w) { for (int i = 0; i < w; i++) row[i] = static_cast(nr+i); - }; + } void ReadFunction1x1(const uint8_t* row, int w) { for (int i = 0; i < w; i++) std::cout << std::setw(4) << static_cast(row[i]) << " "; std::cout << "\n"; - }; + } void ReadFunction3x3(const uint8_t* rows[3], int w) { for (int i = 0; i < 3; i++) { for (int j = -1; j < w+1; j++) { @@ -42,7 +42,7 @@ namespace std::cout << "\n"; } std::cout << "\n"; - }; + } } TEST(FluidBuffer, InputTest) diff --git a/modules/gapi/test/gapi_kernel_tests.cpp b/modules/gapi/test/gapi_kernel_tests.cpp index dbb0a7f269..5adb668752 100644 --- a/modules/gapi/test/gapi_kernel_tests.cpp +++ b/modules/gapi/test/gapi_kernel_tests.cpp @@ -215,7 +215,7 @@ TEST(KernelPackage, RemoveBackend) EXPECT_FALSE(pkg.includes()); EXPECT_FALSE(pkg.includes()); EXPECT_TRUE(pkg.includes()); -}; +} TEST(KernelPackage, RemoveAPI) { @@ -228,7 +228,7 @@ TEST(KernelPackage, RemoveAPI) pkg.remove(); EXPECT_TRUE(pkg.includes()); EXPECT_FALSE(pkg.includes()); -}; +} TEST(KernelPackage, CreateHetero) { diff --git a/modules/gapi/test/internal/gapi_int_executor_tests.cpp b/modules/gapi/test/internal/gapi_int_executor_tests.cpp index 79117aebf3..9bed7b5058 100644 --- a/modules/gapi/test/internal/gapi_int_executor_tests.cpp +++ b/modules/gapi/test/internal/gapi_int_executor_tests.cpp @@ -55,7 +55,7 @@ public: GMockExecutable(bool can_reshape = true) : m_priv(new Priv{can_reshape, 0, 0}) { - }; + } void setReshape(bool can_reshape) { m_priv->m_can_reshape = can_reshape; } @@ -92,7 +92,7 @@ class GMockBackendImpl final: public cv::gapi::GBackend::Priv } public: - GMockBackendImpl(const GMockExecutable& exec) : m_exec(exec) { }; + GMockBackendImpl(const GMockExecutable& exec) : m_exec(exec) { } int getCompileCounter() const { return m_compile_counter; } }; @@ -124,8 +124,8 @@ GMockFunctor mock_kernel(const cv::gapi::GBackend& backend, Callable c) }; } -void dummyFooImpl(const cv::Mat&, cv::Mat&) { }; -void dummyBarImpl(const cv::Mat&, const cv::Mat&, cv::Mat&) { }; +void dummyFooImpl(const cv::Mat&, cv::Mat&) { } +void dummyBarImpl(const cv::Mat&, const cv::Mat&, cv::Mat&) { } struct GExecutorReshapeTest: public ::testing::Test { @@ -155,7 +155,7 @@ struct GExecutorReshapeTest: public 
::testing::Test std::shared_ptr backend_impl2; cv::gapi::GBackend backend2; cv::GKernelPackage pkg; - cv::Mat in_mat1, in_mat2, out_mat;; + cv::Mat in_mat1, in_mat2, out_mat; }; } // anonymous namespace diff --git a/modules/gapi/test/internal/gapi_int_island_tests.cpp b/modules/gapi/test/internal/gapi_int_island_tests.cpp index 7da1670ecc..dbc6ad12f4 100644 --- a/modules/gapi/test/internal/gapi_int_island_tests.cpp +++ b/modules/gapi/test/internal/gapi_int_island_tests.cpp @@ -627,7 +627,7 @@ namespace void assignIsland(const std::string &s) { cv::gapi::island(s, cv::GIn(tmp[0]), cv::GOut(tmp[2])); - }; + } }; TEST_P(CheckName, Test) { diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp index b32cab2e1c..4c42a8d685 100644 --- a/modules/highgui/include/opencv2/highgui.hpp +++ b/modules/highgui/include/opencv2/highgui.hpp @@ -298,9 +298,7 @@ You can call cv::destroyWindow or cv::destroyAllWindows to close the window and memory usage. For a simple program, you do not really have to call these functions because all the resources and windows of the application are closed automatically by the operating system upon exit. -@note - -Qt backend supports additional flags: +@note Qt backend supports additional flags: - **WINDOW_NORMAL or WINDOW_AUTOSIZE:** WINDOW_NORMAL enables you to resize the window, whereas WINDOW_AUTOSIZE adjusts automatically the window size to fit the displayed image (see imshow ), and you cannot change the window size manually. @@ -333,9 +331,7 @@ CV_EXPORTS_W int startWindowThread(); /** @brief Similar to #waitKey, but returns full key code. -@note - -Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc +@note Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc */ CV_EXPORTS_W int waitKeyEx(int delay = 0); @@ -402,11 +398,7 @@ For example, **waitKey(0)** will display the window infinitely until any keypres for image display). **waitKey(25)** will display a frame and wait approximately 25 ms for a key press (suitable for displaying a video frame-by-frame). To remove the window, use cv::destroyWindow. -@note - -[__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard. - -[__Windows Backend Only__] Pressing Ctrl+S will show a dialog to save the image. +@note [__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard. Pressing Ctrl+S will show a dialog to save the image. @param winname Name of the window. @param mat Image to be shown. @@ -415,10 +407,8 @@ CV_EXPORTS_W void imshow(const String& winname, InputArray mat); /** @brief Resizes the window to the specified size -@note - -- The specified window size is for the image area. Toolbars are not counted. -- Only windows created without cv::WINDOW_AUTOSIZE flag can be resized. +@note The specified window size is for the image area. Toolbars are not counted. +Only windows created without cv::WINDOW_AUTOSIZE flag can be resized. @param winname Window name. @param width The new window width. @@ -500,9 +490,7 @@ For cv::EVENT_MOUSEWHEEL positive and negative values mean forward and backward respectively. For cv::EVENT_MOUSEHWHEEL, where available, positive and negative values mean right and left scrolling, respectively. -@note - -Mouse-wheel events are currently supported only on Windows and Cocoa +@note Mouse-wheel events are currently supported only on Windows and Cocoa. @param flags The mouse callback flags parameter. 
*/ @@ -557,9 +545,7 @@ and range, assigns a variable value to be a position synchronized with the track the callback function onChange to be called on the trackbar position change. The created trackbar is displayed in the specified window winname. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar should be attached to the +@note [__Qt Backend Only__] winname can be empty if the trackbar should be attached to the control panel. Clicking the label of each trackbar enables editing the trackbar values manually. @@ -585,9 +571,7 @@ CV_EXPORTS int createTrackbar(const String& trackbarname, const String& winname, The function returns the current position of the specified trackbar. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. @@ -599,9 +583,7 @@ CV_EXPORTS_W int getTrackbarPos(const String& trackbarname, const String& winnam The function sets the position of the specified trackbar in the specified window. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. @@ -614,9 +596,7 @@ CV_EXPORTS_W void setTrackbarPos(const String& trackbarname, const String& winna The function sets the maximum position of the specified trackbar in the specified window. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. @@ -629,9 +609,7 @@ CV_EXPORTS_W void setTrackbarMax(const String& trackbarname, const String& winna The function sets the minimum position of the specified trackbar in the specified window. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. 
diff --git a/modules/highgui/src/registry.impl.hpp b/modules/highgui/src/registry.impl.hpp index 66693f1b07..23f4e9f4e1 100644 --- a/modules/highgui/src/registry.impl.hpp +++ b/modules/highgui/src/registry.impl.hpp @@ -61,7 +61,7 @@ std::vector& getBuiltinBackendsInfo() #endif }; return g_backends; -}; +} static bool sortByPriority(const BackendInfo &lhs, const BackendInfo &rhs) diff --git a/modules/highgui/src/roiSelector.cpp b/modules/highgui/src/roiSelector.cpp index 1bbd246c05..56881a97f4 100644 --- a/modules/highgui/src/roiSelector.cpp +++ b/modules/highgui/src/roiSelector.cpp @@ -118,7 +118,7 @@ class ROISelector bool drawFromCenter; // initializer list - handlerT() : isDrawing(false), drawFromCenter(true){}; + handlerT() : isDrawing(false), drawFromCenter(true){} } selectorParams; private: diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp index 8d1e71a10f..1f0f432981 100644 --- a/modules/highgui/src/window_QT.cpp +++ b/modules/highgui/src/window_QT.cpp @@ -1612,6 +1612,7 @@ CvWinProperties::~CvWinProperties() CvWindow::CvWindow(QString name, int arg2) { + Q_INIT_RESOURCE(window_QT); type = type_CvWindow; param_flags = arg2 & 0x0000000F; diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp index 5e201b52fb..2d3bc4e6f2 100644 --- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp +++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp @@ -319,8 +319,8 @@ See cv::imread for the list of supported formats and flags description. CV_EXPORTS_W Mat imdecode( InputArray buf, int flags ); /** @overload -@param buf -@param flags +@param buf Input array or vector of bytes. +@param flags The same flags as in cv::imread, see cv::ImreadModes. @param dst The optional output placeholder for the decoded matrix. It can save the image reallocations when the function is called repeatedly for images of the same size. */ diff --git a/modules/imgcodecs/src/exif.cpp b/modules/imgcodecs/src/exif.cpp index 28d52047d8..5ca3cc315d 100644 --- a/modules/imgcodecs/src/exif.cpp +++ b/modules/imgcodecs/src/exif.cpp @@ -133,7 +133,7 @@ bool ExifReader::parseExif(unsigned char* data, const size_t size) * @brief Filling m_exif member with exif directory elements * This is internal function and is not exposed to client * - * @return The function doesn't return any value. In case of unsuccessful parsing + * The function doesn't return any value. 
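For the `imdecode` overload whose `@param` entries the imgcodecs hunk above fills in, the `dst` pointer form exists to reuse one allocation across repeated decodes of same-sized images. A usage sketch (the file name is hypothetical):

```cpp
#include <opencv2/imgcodecs.hpp>
#include <fstream>
#include <iterator>
#include <vector>

int main()
{
    std::ifstream f("frame.jpg", std::ios::binary);        // hypothetical input file
    std::vector<unsigned char> buf((std::istreambuf_iterator<char>(f)),
                                    std::istreambuf_iterator<char>());
    cv::Mat dst;
    cv::imdecode(buf, cv::IMREAD_COLOR, &dst);  // decodes into dst, reusing its buffer if possible
    return dst.empty() ? 1 : 0;
}
```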
In case of unsuccessful parsing * the m_exif member is not filled up */ void ExifReader::parseExif() diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp index 4febee36db..7fbdfce1fd 100644 --- a/modules/imgcodecs/src/grfmt_tiff.cpp +++ b/modules/imgcodecs/src/grfmt_tiff.cpp @@ -72,11 +72,6 @@ static void extend_cvtColor( InputArray _src, OutputArray _dst, int code ); CV_Error(Error::StsError, "OpenCV TIFF: failed " #call); \ } -#define CV_TIFF_CHECK_CALL_INFO(call) \ - if (0 == (call)) { \ - CV_LOG_INFO(NULL, "OpenCV TIFF(line " << __LINE__ << "): failed optional call: " #call ", ignoring"); \ - } - #define CV_TIFF_CHECK_CALL_DEBUG(call) \ if (0 == (call)) { \ CV_LOG_DEBUG(NULL, "OpenCV TIFF(line " << __LINE__ << "): failed optional call: " #call ", ignoring"); \ diff --git a/modules/imgcodecs/test/test_tiff.cpp b/modules/imgcodecs/test/test_tiff.cpp index 3aea5b85d5..82da0cdf42 100644 --- a/modules/imgcodecs/test/test_tiff.cpp +++ b/modules/imgcodecs/test/test_tiff.cpp @@ -1045,6 +1045,7 @@ TEST(Imgcodecs_Tiff_Modes, write_multipage) { EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), read_pages[i], pages[i]); } + EXPECT_EQ(0, remove(tmp_filename.c_str())); } //================================================================================================== diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index ca62eb26fb..d0b63ad370 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -845,7 +845,41 @@ enum ColorConversionCodes { COLOR_BayerRG2RGBA = COLOR_BayerBG2BGRA, //!< equivalent to BGGR Bayer pattern COLOR_BayerGR2RGBA = COLOR_BayerGB2BGRA, //!< equivalent to GBRG Bayer pattern - COLOR_COLORCVT_MAX = 143 + //! RGB to YUV 4:2:2 family + + COLOR_RGB2YUV_UYVY = 143, + COLOR_BGR2YUV_UYVY = 144, + COLOR_RGB2YUV_Y422 = COLOR_RGB2YUV_UYVY, + COLOR_BGR2YUV_Y422 = COLOR_BGR2YUV_UYVY, + COLOR_RGB2YUV_UYNV = COLOR_RGB2YUV_UYVY, + COLOR_BGR2YUV_UYNV = COLOR_BGR2YUV_UYVY, + + COLOR_RGBA2YUV_UYVY = 145, + COLOR_BGRA2YUV_UYVY = 146, + COLOR_RGBA2YUV_Y422 = COLOR_RGBA2YUV_UYVY, + COLOR_BGRA2YUV_Y422 = COLOR_BGRA2YUV_UYVY, + COLOR_RGBA2YUV_UYNV = COLOR_RGBA2YUV_UYVY, + COLOR_BGRA2YUV_UYNV = COLOR_BGRA2YUV_UYVY, + + COLOR_RGB2YUV_YUY2 = 147, + COLOR_BGR2YUV_YUY2 = 148, + COLOR_RGB2YUV_YVYU = 149, + COLOR_BGR2YUV_YVYU = 150, + COLOR_RGB2YUV_YUYV = COLOR_RGB2YUV_YUY2, + COLOR_BGR2YUV_YUYV = COLOR_BGR2YUV_YUY2, + COLOR_RGB2YUV_YUNV = COLOR_RGB2YUV_YUY2, + COLOR_BGR2YUV_YUNV = COLOR_BGR2YUV_YUY2, + + COLOR_RGBA2YUV_YUY2 = 151, + COLOR_BGRA2YUV_YUY2 = 152, + COLOR_RGBA2YUV_YVYU = 153, + COLOR_BGRA2YUV_YVYU = 154, + COLOR_RGBA2YUV_YUYV = COLOR_RGBA2YUV_YUY2, + COLOR_BGRA2YUV_YUYV = COLOR_BGRA2YUV_YUY2, + COLOR_RGBA2YUV_YUNV = COLOR_RGBA2YUV_YUY2, + COLOR_BGRA2YUV_YUNV = COLOR_BGRA2YUV_YUY2, + + COLOR_COLORCVT_MAX = 155 }; //! @addtogroup imgproc_shape @@ -3719,10 +3753,10 @@ stored in two planes. This function only supports YUV420 to RGB conversion as of now. -@param src1: 8-bit image (#CV_8U) of the Y plane. -@param src2: image containing interleaved U/V plane. -@param dst: output image. -@param code: Specifies the type of conversion. It can take any of the following values: +@param src1 8-bit image (#CV_8U) of the Y plane. +@param src2 image containing interleaved U/V plane. +@param dst output image. +@param code Specifies the type of conversion. 
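The new `COLOR_*2YUV_*` codes in the enum hunk above close a long-standing asymmetry: OpenCV could decode one-plane 4:2:2 formats (UYVY/YUY2/YVYU) but not encode them. The converters consume 3- or 4-channel RGB/BGR and emit a 2-channel interleaved image, matching the `ChPair(3,2)`/`ChPair(4,2)` perf entries added further down. A minimal usage sketch against this patch:

```cpp
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat rgb(480, 640, CV_8UC3, cv::Scalar(30, 60, 90));  // width must be even for 4:2:2
    cv::Mat yuy2;
    cv::cvtColor(rgb, yuy2, cv::COLOR_RGB2YUV_YUY2);          // 3 channels in, 2 out (Y0 U Y1 V ...)
    CV_Assert(yuy2.type() == CV_8UC2 && yuy2.size() == rgb.size());
    return 0;
}
```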
It can take any of the following values: - #COLOR_YUV2BGR_NV12 - #COLOR_YUV2RGB_NV12 - #COLOR_YUV2BGRA_NV12 diff --git a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp index f129012ba6..48851ece07 100644 --- a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp +++ b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp @@ -224,6 +224,11 @@ CV_EXPORTS void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn); +CV_EXPORTS void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx, int ycn); + CV_EXPORTS void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height); diff --git a/modules/imgproc/misc/java/test/ImgprocTest.java b/modules/imgproc/misc/java/test/ImgprocTest.java index 4b1ea9157d..3d2ddb1a7d 100644 --- a/modules/imgproc/misc/java/test/ImgprocTest.java +++ b/modules/imgproc/misc/java/test/ImgprocTest.java @@ -639,7 +639,7 @@ public class ImgprocTest extends OpenCVTestCase { Imgproc.distanceTransformWithLabels(gray128, dst, labels, Imgproc.DIST_L2, 3); assertMatEqual(dstLables, labels); - assertMatEqual(getMat(CvType.CV_32FC1, 8192), dst, EPS); + assertMatEqual(getMat(CvType.CV_32FC1, 65533.805), dst, EPS); } public void testDrawContoursMatListOfMatIntScalar() { diff --git a/modules/imgproc/perf/opencl/perf_color.cpp b/modules/imgproc/perf/opencl/perf_color.cpp index 9525e1f275..dce588879b 100644 --- a/modules/imgproc/perf/opencl/perf_color.cpp +++ b/modules/imgproc/perf/opencl/perf_color.cpp @@ -58,7 +58,7 @@ CV_ENUM(ConversionTypes, COLOR_RGB2GRAY, COLOR_RGB2BGR, COLOR_RGB2YUV, COLOR_YUV COLOR_YCrCb2RGB, COLOR_RGB2XYZ, COLOR_XYZ2RGB, COLOR_RGB2HSV, COLOR_HSV2RGB, COLOR_RGB2HLS, COLOR_HLS2RGB, COLOR_BGR5652BGR, COLOR_BGR2BGR565, COLOR_RGBA2mRGBA, COLOR_mRGBA2RGBA, COLOR_RGB2Lab, COLOR_Lab2BGR, COLOR_RGB2Luv, COLOR_Luv2LBGR, COLOR_YUV2RGB_NV12, COLOR_YUV2RGB_IYUV, - COLOR_YUV2GRAY_420, COLOR_RGB2YUV_IYUV, COLOR_YUV2RGB_YUY2, COLOR_YUV2GRAY_YUY2) + COLOR_YUV2GRAY_420, COLOR_RGB2YUV_IYUV, COLOR_YUV2RGB_YUY2, COLOR_RGB2YUV_YUY2, COLOR_YUV2GRAY_YUY2) typedef tuple > CvtColorParams; typedef TestBaseWithParam CvtColorFixture; @@ -91,6 +91,7 @@ OCL_PERF_TEST_P(CvtColorFixture, CvtColor, testing::Combine( make_tuple(ConversionTypes(COLOR_YUV2GRAY_420), 1, 1), make_tuple(ConversionTypes(COLOR_RGB2YUV_IYUV), 3, 1), make_tuple(ConversionTypes(COLOR_YUV2RGB_YUY2), 2, 3), + make_tuple(ConversionTypes(COLOR_RGB2YUV_YUY2), 3, 2), make_tuple(ConversionTypes(COLOR_YUV2GRAY_YUY2), 2, 1) ))) { diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index ab169ecfca..5915b507ce 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -178,7 +178,9 @@ CV_ENUM(CvtModeBayer, CV_ENUM(CvtMode2, COLOR_YUV2BGR_NV12, COLOR_YUV2BGRA_NV12, COLOR_YUV2RGB_NV12, COLOR_YUV2RGBA_NV12, COLOR_YUV2BGR_NV21, COLOR_YUV2BGRA_NV21, COLOR_YUV2RGB_NV21, COLOR_YUV2RGBA_NV21, COLOR_YUV2BGR_YV12, COLOR_YUV2BGRA_YV12, COLOR_YUV2RGB_YV12, COLOR_YUV2RGBA_YV12, COLOR_YUV2BGR_IYUV, COLOR_YUV2BGRA_IYUV, COLOR_YUV2RGB_IYUV, COLOR_YUV2RGBA_IYUV, COLOR_YUV2GRAY_420, COLOR_YUV2RGB_UYVY, COLOR_YUV2BGR_UYVY, COLOR_YUV2RGBA_UYVY, COLOR_YUV2BGRA_UYVY, COLOR_YUV2RGB_YUY2, COLOR_YUV2BGR_YUY2, COLOR_YUV2RGB_YVYU, - COLOR_YUV2BGR_YVYU, COLOR_YUV2RGBA_YUY2, 
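For the `cvtColorTwoPlane` signature whose `@param` fixes appear just above, the two inputs are the full-resolution Y plane and the half-resolution interleaved UV plane. A usage sketch for the NV12 case:

```cpp
#include <opencv2/imgproc.hpp>

int main()
{
    // NV12: full-res Y plane plus half-res interleaved UV plane.
    cv::Mat y(480, 640, CV_8UC1, cv::Scalar(128));
    cv::Mat uv(240, 320, CV_8UC2, cv::Scalar(128, 128));
    cv::Mat bgr;
    cv::cvtColorTwoPlane(y, uv, bgr, cv::COLOR_YUV2BGR_NV12);
    CV_Assert(bgr.type() == CV_8UC3 && bgr.size() == y.size());
    return 0;
}
```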
COLOR_YUV2BGRA_YUY2, COLOR_YUV2RGBA_YVYU, COLOR_YUV2BGRA_YVYU) + COLOR_YUV2BGR_YVYU, COLOR_YUV2RGBA_YUY2, COLOR_YUV2BGRA_YUY2, COLOR_YUV2RGBA_YVYU, COLOR_YUV2BGRA_YVYU, + COLOR_RGB2YUV_UYVY, COLOR_BGR2YUV_UYVY, COLOR_RGBA2YUV_UYVY, COLOR_BGRA2YUV_UYVY, COLOR_RGB2YUV_YUY2, COLOR_BGR2YUV_YUY2, COLOR_RGB2YUV_YVYU, + COLOR_BGR2YUV_YVYU, COLOR_RGBA2YUV_YUY2, COLOR_BGRA2YUV_YUY2, COLOR_RGBA2YUV_YVYU, COLOR_BGRA2YUV_YVYU) CV_ENUM(CvtMode3, COLOR_RGB2YUV_IYUV, COLOR_BGR2YUV_IYUV, COLOR_RGBA2YUV_IYUV, COLOR_BGRA2YUV_IYUV, COLOR_RGB2YUV_YV12, COLOR_BGR2YUV_YV12, COLOR_RGBA2YUV_YV12, COLOR_BGRA2YUV_YV12) @@ -225,12 +227,20 @@ static ChPair getConversionInfo(int cvtMode) case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: return ChPair(2,3); + case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: + case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: + case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU: + return ChPair(3,2); case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: return ChPair(2,4); + case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: + case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: + case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU: + return ChPair(4,2); case COLOR_BGR2GRAY: case COLOR_RGB2GRAY: case COLOR_RGB2YUV_IYUV: case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_IYUV: case COLOR_BGR2YUV_YV12: diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp index 2b1ab381e7..0a4fc49329 100644 --- a/modules/imgproc/perf/perf_integral.cpp +++ b/modules/imgproc/perf/perf_integral.cpp @@ -13,7 +13,7 @@ enum PerfSqMatDepth{ DEPTH_32F_64F, DEPTH_64F_64F}; -CV_ENUM(IntegralOutputDepths, DEPTH_32S_32S, DEPTH_32S_32F, DEPTH_32S_64F, DEPTH_32F_32F, DEPTH_32F_64F, DEPTH_64F_64F); +CV_ENUM(IntegralOutputDepths, DEPTH_32S_32S, DEPTH_32S_32F, DEPTH_32S_64F, DEPTH_32F_32F, DEPTH_32F_64F, DEPTH_64F_64F) static int extraOutputDepths[6][2] = {{CV_32S, CV_32S}, {CV_32S, CV_32F}, {CV_32S, CV_64F}, {CV_32F, CV_32F}, {CV_32F, CV_64F}, {CV_64F, CV_64F}}; diff --git a/modules/imgproc/src/accum.dispatch.cpp b/modules/imgproc/src/accum.dispatch.cpp index 8bbf37cc4a..4d2e044933 100644 --- a/modules/imgproc/src/accum.dispatch.cpp +++ b/modules/imgproc/src/accum.dispatch.cpp @@ -17,4 +17,4 @@ DEF_ACC_FLT_FUNCS(32f, float, float) DEF_ACC_FLT_FUNCS(32f64f, float, double) DEF_ACC_FLT_FUNCS(64f, double, double) -} //cv::hal \ No newline at end of file +} //cv::hal diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 7fe7aabeaf..1336302613 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -475,9 +475,9 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -493,8 +493,8 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) #else for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_store(dst + x, vx_load(dst + x) + vx_load(src + x)); - v_store(dst + x + step, vx_load(dst + x + step) + 
vx_load(src + x + step)); + v_store(dst + x, v_add(vx_load(dst + x), vx_load(src + x))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), vx_load(src + x + step))); } #endif // CV_AVX && !CV_AVX2 } @@ -508,11 +508,11 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) v_uint16 v_masku16 = vx_load_expand(mask + x); v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0)))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0)))); - v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(vx_load(src + x), v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(vx_load(src + x + step), v_mask1))); } } else if (cn == 3) @@ -522,25 +522,25 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) v_uint16 v_masku16 = vx_load_expand(mask + x); v_uint32 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0)))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0)))); v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } @@ -862,9 +862,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float32::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -889,8 +889,8 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn v_float64 v_src0 = 
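The masked paths in `accum.simd.hpp` all rest on one bit trick, unchanged by the rewrite: `v_not(v_eq(m, 0))` yields an all-ones lane wherever the mask is set, and `v_and(src, mask)` zeroes the unselected lanes so a plain add accumulates only the masked pixels. A scalar model of one lane:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    const float   src[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    const uint8_t m[4]   = {255, 0, 7, 0};
    float dst[4] = {10, 10, 10, 10};
    for (int i = 0; i < 4; ++i)
    {
        uint32_t lane = (m[i] == 0) ? 0u : ~0u;   // v_not(v_eq(m, 0)): all-ones when selected
        uint32_t bits;
        std::memcpy(&bits, &src[i], 4);
        bits &= lane;                             // v_and(src, mask): zero unselected lanes
        float masked;
        std::memcpy(&masked, &bits, 4);
        dst[i] += masked;                         // v_add(dst, ...)
    }
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);   // 11.5 10 13.5 10
    return 0;
}
```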
v_cvt_f64(v_src); v_float64 v_src1 = v_cvt_f64_high(v_src); - v_store(dst + x, vx_load(dst + x) + v_src0); - v_store(dst + x + step, vx_load(dst + x + step) + v_src1); + v_store(dst + x, v_add(vx_load(dst + x), v_src0)); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1)); } #endif // CV_AVX && !CV_AVX2 } @@ -904,15 +904,15 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float32 v_src = vx_load(src + x); - v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0; - v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1; + v_float64 v_src0 = v_and(v_cvt_f64(v_src), v_mask0); + v_float64 v_src1 = v_and(v_cvt_f64_high(v_src), v_mask1); - v_store(dst + x, vx_load(dst + x) + v_src0); - v_store(dst + x + step, vx_load(dst + x + step) + v_src1); + v_store(dst + x, v_add(vx_load(dst + x), v_src0)); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1)); } } else if (cn == 3) @@ -922,24 +922,24 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0; - v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; - v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0; - v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; - v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0; - v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; + v_float64 v_src00 = v_and(v_cvt_f64(v_src0), v_mask0); + v_float64 v_src01 = v_and(v_cvt_f64_high(v_src0), v_mask1); + v_float64 v_src10 = v_and(v_cvt_f64(v_src1), v_mask0); + v_float64 v_src11 = v_and(v_cvt_f64_high(v_src1), v_mask1); + v_float64 v_src20 = v_and(v_cvt_f64(v_src2), v_mask0); + v_float64 v_src21 = v_and(v_cvt_f64_high(v_src2), v_mask1); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } @@ -950,9 +950,9 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; +#if 
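The float-to-double accumulators keep a two-store pattern: one `v_float32` register widens into two `v_float64` halves via `v_cvt_f64`/`v_cvt_f64_high`, written at `x` and `x + step` with `step = VTraits<v_float64>::vlanes()`. A condensed sketch of the unmasked loop:

```cpp
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void acc_f32_to_f64(const float* src, double* dst, int len)
{
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
    const int cVectorWidth = VTraits<v_float32>::vlanes();
    const int step = VTraits<v_float64>::vlanes();
    int x = 0;
    for (; x <= len - cVectorWidth; x += cVectorWidth)
    {
        v_float32 s = vx_load(src + x);
        v_store(dst + x,        v_add(vx_load(dst + x),        v_cvt_f64(s)));       // low half
        v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f64_high(s)));  // high half
    }
    for (; x < len; ++x) dst[x] += src[x];      // scalar tail
#else
    for (int x = 0; x < len; ++x) dst[x] += src[x];
#endif
}
```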
(CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -971,8 +971,8 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_float64 v_src0 = vx_load(src + x); v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, vx_load(dst + x) + v_src0); - v_store(dst + x + step, vx_load(dst + x + step) + v_src1); + v_store(dst + x, v_add(vx_load(dst + x), v_src0)); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1)); } #endif // CV_AVX && !CV_AVX2 } @@ -986,14 +986,14 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src0 = vx_load(src + x); v_float64 v_src1 = vx_load(src + x + step); - v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(v_src0, v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_src1, v_mask1))); } } else if (cn == 3) @@ -1003,25 +1003,25 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c v_uint32 v_masku32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } @@ -1256,9 +1256,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || 
CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1293,12 +1293,12 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0))); v_float32 v_src0 = vx_load(src + x); v_float32 v_src1 = vx_load(src + x + step); - v_src0 = v_src0 & v_mask0; - v_src1 = v_src1 & v_mask1; + v_src0 = v_and(v_src0, v_mask0); + v_src1 = v_and(v_src1, v_mask1); v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); @@ -1311,18 +1311,18 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int v_uint16 v_mask16 = vx_load_expand(mask + x); v_uint32 v_mask_0, v_mask_1; v_expand(v_mask16, v_mask_0, v_mask_1); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0))); v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); @@ -1625,9 +1625,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float32::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -1667,9 +1667,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x);; - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_src = vx_load(src + x); - v_src = v_src & v_reinterpret_as_f32(v_mask); + v_src = v_and(v_src, v_reinterpret_as_f32(v_mask)); v_float64 v_src0 = v_cvt_f64(v_src); v_float64 v_src1 = v_cvt_f64_high(v_src); @@ -1682,13 +1682,13 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_reinterpret_as_f32(v_mask); - v_src1 = v_src1 & 
v_reinterpret_as_f32(v_mask); - v_src2 = v_src2 & v_reinterpret_as_f32(v_mask); + v_src0 = v_and(v_src0, v_reinterpret_as_f32(v_mask)); + v_src1 = v_and(v_src1, v_reinterpret_as_f32(v_mask)); + v_src2 = v_and(v_src2, v_reinterpret_as_f32(v_mask)); v_float64 v_src00 = v_cvt_f64(v_src0); v_float64 v_src01 = v_cvt_f64_high(v_src0); @@ -1720,9 +1720,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -1756,12 +1756,12 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src0 = vx_load(src + x); v_float64 v_src1 = vx_load(src + x + step); - v_src0 = v_src0 & v_mask0; - v_src1 = v_src1 & v_mask1; + v_src0 = v_and(v_src0, v_mask0); + v_src1 = v_and(v_src1, v_mask1); v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x))); v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step))); } @@ -1773,18 +1773,18 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21); - v_src00 = v_src00 & v_mask0; - v_src01 = v_src01 & v_mask1; - v_src10 = v_src10 & v_mask0; - v_src11 = v_src11 & v_mask1; - v_src20 = v_src20 & v_mask0; - v_src21 = v_src21 & v_mask1; + v_src00 = v_and(v_src00, v_mask0); + v_src01 = v_and(v_src01, v_mask1); + v_src10 = v_and(v_src10, v_mask0); + v_src11 = v_and(v_src11, v_mask1); + v_src20 = v_and(v_src20, v_mask0); + v_src21 = v_and(v_src21, v_mask1); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); @@ -2035,9 +2035,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD - const int cVectorWidth = v_uint16::nlanes; - const int step = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2069,11 +2069,11 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar { v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); - v_float32 
v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0))); - v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(vx_load(src1 + x), vx_load(src2 + x)), v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(vx_load(src1 + x + step), vx_load(src2 + x + step)), v_mask1))); } } else if (cn == 3) @@ -2082,8 +2082,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar { v_uint32 v_mask32_0 = vx_load_expand_q(mask + x); v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step); - v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); - v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); + v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0))); + v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0))); v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; @@ -2096,8 +2096,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); - v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_and(v_mul(v_1src00, v_2src00), v_mask0)), v_add(v_dst10, v_and(v_mul(v_1src10, v_2src10), v_mask0)), v_add(v_dst20, v_and(v_mul(v_1src20, v_2src20), v_mask0))); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_and(v_mul(v_1src01, v_2src01), v_mask1)), v_add(v_dst11, v_and(v_mul(v_1src11, v_2src11), v_mask1)), v_add(v_dst21, v_and(v_mul(v_1src21, v_2src21), v_mask1))); } } } @@ -2398,9 +2398,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float32::nlanes; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes(); + const int step = VTraits::vlanes(); if (!mask) { @@ -2447,11 +2447,11 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_1src = vx_load(src1 + x); v_float32 v_2src = vx_load(src2 + x); - v_1src = v_1src & v_reinterpret_as_f32(v_mask); - v_2src = v_2src & v_reinterpret_as_f32(v_mask); + v_1src = v_and(v_1src, v_reinterpret_as_f32(v_mask)); + v_2src = v_and(v_2src, v_reinterpret_as_f32(v_mask)); v_float64 v_1src0 = v_cvt_f64(v_1src); v_float64 v_1src1 = v_cvt_f64_high(v_1src); @@ -2467,16 +2467,16 @@ void accProd_simd_(const 
float* src1, const float* src2, double* dst, const ucha for (; x <= len - cVectorWidth; x += cVectorWidth) { v_uint32 v_mask = vx_load_expand_q(mask + x); - v_mask = ~(v_mask == v_0); + v_mask = v_not(v_eq(v_mask, v_0)); v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); - v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask); - v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); - v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); - v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); - v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); + v_1src0 = v_and(v_1src0, v_reinterpret_as_f32(v_mask)); + v_1src1 = v_and(v_1src1, v_reinterpret_as_f32(v_mask)); + v_1src2 = v_and(v_1src2, v_reinterpret_as_f32(v_mask)); + v_2src0 = v_and(v_2src0, v_reinterpret_as_f32(v_mask)); + v_2src1 = v_and(v_2src1, v_reinterpret_as_f32(v_mask)); + v_2src2 = v_and(v_2src2, v_reinterpret_as_f32(v_mask)); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); @@ -2501,9 +2501,9 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) { int x = 0; -#if CV_SIMD_64F - const int cVectorWidth = v_float64::nlanes * 2; - const int step = v_float64::nlanes; +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) + const int cVectorWidth = VTraits::vlanes() * 2; + const int step = VTraits::vlanes(); if (!mask) { @@ -2542,16 +2542,16 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_src00 = vx_load(src1 + x); v_float64 v_src01 = vx_load(src1 + x + step); v_float64 v_src10 = vx_load(src2 + x); v_float64 v_src11 = vx_load(src2 + x + step); - v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); - v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1)); + v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(v_src00, v_src10), v_mask0))); + v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(v_src01, v_src11), v_mask1))); } } else if (cn == 3) @@ -2561,8 +2561,8 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc v_uint32 v_mask32 = vx_load_expand_q(mask + x); v_uint64 v_masku640, v_masku641; v_expand(v_mask32, v_masku640, v_masku641); - v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); - v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0))); + v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0))); v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; @@ -2570,19 +2570,19 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21); 
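The recurring change in these accumulate hunks swaps the overloaded operators (+, *, &, ==, ~) for their named counterparts (v_add, v_mul, v_and, v_eq, v_not) and the compile-time v_TYPE::nlanes for VTraits::vlanes(): scalable SIMD backends such as RISC-V RVV expose sizeless vector types, for which C++ operator overloads and compile-time lane counts are not available. A minimal standalone sketch of the masked-accumulate pattern used throughout (the helper name accMasked and the scalar tail are illustrative, not part of the patch):

#include "opencv2/core/hal/intrin.hpp"

using namespace cv;

// dst[i] += src[i] wherever mask[i] != 0 -- the same masking trick as above:
// compare-to-zero yields all-ones/all-zeros lanes, reused as a bitwise AND
// mask on the float payload.
static void accMasked(const float* src, float* dst, const uchar* mask, int len)
{
    int x = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<v_float32>::vlanes();   // replaces v_float32::nlanes
    const v_uint32 v_zero = vx_setzero_u32();
    for (; x <= len - step; x += step)
    {
        v_uint32 m = vx_load_expand_q(mask + x);     // 8u -> 32u, one lane per pixel
        v_float32 vmask = v_reinterpret_as_f32(v_not(v_eq(m, v_zero)));
        v_store(dst + x, v_add(vx_load(dst + x),
                               v_and(vx_load(src + x), vmask)));
    }
#endif
    for (; x < len; x++)                             // scalar tail
        if (mask[x]) dst[x] += src[x];
}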
v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21); - v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00; - v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01; - v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10; - v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11; - v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20; - v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21; + v_float64 v_src00 = v_mul(v_and(v_1src00, v_mask0), v_2src00); + v_float64 v_src01 = v_mul(v_and(v_1src01, v_mask1), v_2src01); + v_float64 v_src10 = v_mul(v_and(v_1src10, v_mask0), v_2src10); + v_float64 v_src11 = v_mul(v_and(v_1src11, v_mask1), v_2src11); + v_float64 v_src20 = v_mul(v_and(v_1src20, v_mask0), v_2src20); + v_float64 v_src21 = v_mul(v_and(v_1src21, v_mask1), v_2src21); v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); - v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); - v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20)); + v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21)); } } } diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index d12253a184..88bfb1a575 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -90,6 +90,20 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) return oclCvtColorOnePlaneYUV2BGR(_src, _dst, dcn, bidx, uidx, yidx); } + case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: + case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU: + case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU: + { + int yidx = (code==COLOR_RGB2YUV_UYVY || code==COLOR_RGBA2YUV_UYVY || + code==COLOR_BGR2YUV_UYVY || code==COLOR_BGRA2YUV_UYVY) ? 1 : 0; + int uidx = (code==COLOR_RGB2YUV_YVYU || code==COLOR_RGBA2YUV_YVYU || + code==COLOR_BGR2YUV_YVYU || code==COLOR_BGRA2YUV_YVYU) ? 2 : 0; + uidx = 1 - yidx + uidx; + + bool res = oclCvtColorOnePlaneBGR2YUV(_src, _dst, dcn, bidx, uidx, yidx); + + return res; + } case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb: return oclCvtColorBGR2YCrCb(_src, _dst, bidx); @@ -339,6 +353,19 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) break; } + case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: + case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU: + case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU: + //http://www.fourcc.org/yuv.php#UYVY + //http://www.fourcc.org/yuv.php#YUY2 + //http://www.fourcc.org/yuv.php#YVYU + { + int ycn = (code==COLOR_RGB2YUV_UYVY || code==COLOR_BGR2YUV_UYVY || + code==COLOR_RGBA2YUV_UYVY || code==COLOR_BGRA2YUV_UYVY) ? 1 : 0; + cvtColorOnePlaneBGR2YUV(_src, _dst, swapBlue(code), uIndex(code), ycn); + break; + } + case COLOR_YUV2GRAY_UYVY: case COLOR_YUV2GRAY_YUY2: cvtColorYUV2Gray_ch(_src, _dst, code == COLOR_YUV2GRAY_UYVY ? 
1 : 0); diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.hpp index abbd65ec06..7751d823b1 100644 --- a/modules/imgproc/src/color.hpp +++ b/modules/imgproc/src/color.hpp @@ -71,6 +71,8 @@ inline bool swapBlue(int code) case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU: case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12: + case COLOR_BGR2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: case COLOR_BGR2YUV_YUY2: + case COLOR_BGRA2YUV_YUY2: case COLOR_BGR2YUV_YVYU: case COLOR_BGRA2YUV_YVYU: return false; default: return true; @@ -124,6 +126,13 @@ inline int dstChannels(int code) return 3; + case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU: + case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: + case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU: + case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: + + return 2; + default: return 0; } @@ -159,6 +168,7 @@ inline int uIndex(int code) return 2; case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: + case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU: case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: @@ -169,6 +179,8 @@ inline int uIndex(int code) case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: + case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: + case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: return 0; @@ -529,6 +541,7 @@ bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ); bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ); +bool oclCvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ); bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ); @@ -547,6 +560,7 @@ void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb); void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb); void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn); +void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn); void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ); void 
cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); diff --git a/modules/imgproc/src/color.simd_helpers.hpp b/modules/imgproc/src/color.simd_helpers.hpp index 6642ff69c4..06b9ba3d06 100644 --- a/modules/imgproc/src/color.simd_helpers.hpp +++ b/modules/imgproc/src/color.simd_helpers.hpp @@ -76,7 +76,7 @@ struct Set enum SizePolicy { - TO_YUV, FROM_YUV, FROM_UYVY, NONE + TO_YUV, FROM_YUV, FROM_UYVY, TO_UYVY, NONE }; template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > @@ -109,6 +109,7 @@ struct CvtHelper dstSz = Size(sz.width, sz.height * 2 / 3); break; case FROM_UYVY: + case TO_UYVY: CV_Assert( sz.width % 2 == 0); dstSz = sz; break; diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp index bea1decc3a..bef9497760 100644 --- a/modules/imgproc/src/color_hsv.simd.hpp +++ b/modules/imgproc/src/color_hsv.simd.hpp @@ -98,7 +98,7 @@ struct RGB2HSV_b int i = 0; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); for ( ; i <= n - vsize; i += vsize, src += scn*vsize, dst += 3*vsize) @@ -274,7 +274,7 @@ struct RGB2HSV_f : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) { } - #if CV_SIMD || CV_SIMD_SCALABLE + #if (CV_SIMD || CV_SIMD_SCALABLE) inline void process(const v_float32& v_r, const v_float32& v_g, const v_float32& v_b, v_float32& v_h, v_float32& v_s, v_float32& v_v, float hscale) const @@ -308,7 +308,7 @@ struct RGB2HSV_f float hscale = hrange*(1.f/360.f); n *= 3; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); for ( ; i <= n - 3*vsize; i += 3*vsize, src += scn * vsize) { @@ -368,7 +368,7 @@ struct RGB2HSV_f }; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) inline void HSV2RGB_simd(const v_float32& h, const v_float32& s, const v_float32& v, v_float32& b, v_float32& g, v_float32& r, float hscale) { @@ -473,7 +473,7 @@ struct HSV2RGB_f float hs = hscale; n *= 3; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); v_float32 valpha = vx_setall_f32(alpha); for (; i <= n - vsize*3; i += vsize*3, dst += dcn * vsize) @@ -530,7 +530,7 @@ struct HSV2RGB_b int j = 0, dcn = dstcn; uchar alpha = ColorChannel::max(); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); for (j = 0; j <= (n - vsize*4) * 3; j += 3 * 4 * vsize, dst += dcn * 4 * vsize) @@ -679,7 +679,7 @@ struct RGB2HLS_f { } -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) inline void process(const v_float32& r, const v_float32& g, const v_float32& b, const v_float32& vhscale, v_float32& h, v_float32& l, v_float32& s) const @@ -718,7 +718,7 @@ struct RGB2HLS_f int i = 0, bidx = blueIdx, scn = srccn; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) const int vsize = VTraits::vlanes(); v_float32 vhscale = vx_setall_f32(hscale); @@ -802,13 +802,13 @@ struct RGB2HLS_b int scn = srccn; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE]; #else float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE]; #endif -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static const int fsize = VTraits::vlanes(); //TODO: fix that when v_interleave is available float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3]; @@ -823,7 +823,7 @@ struct RGB2HLS_b { int dn = 
std::min(n - i, (int)BLOCK_SIZE); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v255inv = vx_setall_f32(1.f/255.f); if (scn == 3) { @@ -902,7 +902,7 @@ struct RGB2HLS_b cvt(buf, buf, dn); int j = 0; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j <= dn*3 - fsize*3*4; j += fsize*3*4) { v_float32 f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11; @@ -973,7 +973,7 @@ struct HLS2RGB_f : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { } -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) inline void process(const v_float32& h, const v_float32& l, const v_float32& s, v_float32& b, v_float32& g, v_float32& r) const { @@ -1016,7 +1016,7 @@ struct HLS2RGB_f int i = 0, bidx = blueIdx, dcn = dstcn; float alpha = ColorChannel::max(); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static const int vsize = VTraits::vlanes(); for (; i <= n - vsize; i += vsize, src += 3*vsize, dst += dcn*vsize) { @@ -1099,13 +1099,13 @@ struct HLS2RGB_b int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE]; #else float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE]; #endif -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) static const int fsize = VTraits::vlanes(); //TODO: fix that when v_interleave is available float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3]; @@ -1122,7 +1122,7 @@ struct HLS2RGB_b int dn = std::min(n - i, (int)BLOCK_SIZE); j = 0; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j <= dn*3 - 3*4*fsize; j += 3*4*fsize) { // 3x uchar -> 3*4 float @@ -1179,7 +1179,7 @@ struct HLS2RGB_b } cvt(buf, buf, dn); -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v255 = vx_setall_f32(255.f); if(dcn == 3) { diff --git a/modules/imgproc/src/color_yuv.dispatch.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp index 559005e07f..8720908100 100644 --- a/modules/imgproc/src/color_yuv.dispatch.cpp +++ b/modules/imgproc/src/color_yuv.dispatch.cpp @@ -206,6 +206,19 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, CV_CPU_DISPATCH_MODES_ALL); } +void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx, int ycn) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(cvtOnePlaneBGRtoYUV, cv_hal_cvtOnePlaneBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn); + + CV_CPU_DISPATCH(cvtOnePlaneBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn), + CV_CPU_DISPATCH_MODES_ALL); +} + } // namespace hal // @@ -281,6 +294,20 @@ bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int return h.run(); } +bool oclCvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ) +{ + OclHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, dcn); + + if(!h.createKernel("RGB2YUV_422", ocl::imgproc::color_yuv_oclsrc, + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d", dcn, bidx, uidx, yidx + ))) + { + return false; + } + + return h.run(); +} + bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) { OclHelper< Set<1>, Set<1>, Set, FROM_YUV> h(_src, _dst, 1); @@ -360,6 +387,14 @@ void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool s dcn, swapb, uidx, 
ycn);
 }

+void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn)
+{
+    CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U>, TO_UYVY > h(_src, _dst, 2);
+
+    hal::cvtOnePlaneBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
+                             h.scn, swapb, uidx, ycn);
+}
+
 void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
 {
     CV_Assert( _src.channels() == 2 && _src.depth() == CV_8U );
diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp
index 580329f660..03f1b653e5 100644
--- a/modules/imgproc/src/color_yuv.simd.hpp
+++ b/modules/imgproc/src/color_yuv.simd.hpp
@@ -37,6 +37,10 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int width, int height,
                          int dcn, bool swapBlue, int uIdx, int ycn);
+void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int scn, bool swapBlue, int uIdx, int ycn);

 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

@@ -1852,6 +1856,114 @@ inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_
     converter(Range(0, height));
 }

+
+///////////////////////////////////// RGB -> YUV422 /////////////////////////////////////
+
+static const int RGB2YUV422_SHIFT = 14;
+
+// Coefficients based on ITU-R BT.601, ISBN 1-878707-09-4 (https://fourcc.org/fccyvrgb.php)
+// The conversion coefficients for RGB to YUV422 are based on the ones for RGB to YUV.
+// The Y coefficients are applied as given in the link to each of the two input RGB pixels
+// separately. For U and V, they are reduced by half to account for two RGB pixels contributing
+// to the same U and V values; in other words, the U and V contributions from the two RGB pixels
+// are averaged. The integer versions are obtained by multiplying the float versions by 16384
+// and rounding to the nearest integer.
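As a worked check of that rounding (the studio-swing BT.601 equations on the linked page are Y = 16 + 0.257*R + 0.504*G + 0.098*B, U = 128 - 0.148*R - 0.291*G + 0.439*B, V = 128 + 0.439*R - 0.368*G - 0.071*B): the Y entries below are round(0.257*16384) = 4211, round(0.504*16384) = 8258 and round(0.098*16384) = 1606; the halved chroma entries are round(-0.148/2*16384) = -1212, round(-0.291/2*16384) = -2384, round(0.439/2*16384) = 3596 (shared by U's blue term and V's red term), round(-0.368/2*16384) = -3015 and round(-0.071/2*16384) = -582. The offsets 16 and 128 are stored pre-divided by 256 (16*16384/256 = 1024, 128*16384/256 = 8192) and scaled back by 256 at the point of use.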
+
+int c_RGB2YUV422Coeffs_i[10] = {1024, 8192, 4211, 8258, 1606,
+                                -1212, -2384, 3596, -3015, -582};
+
+static inline void RGB2Y(const uchar r, const uchar g, const uchar b, uchar& y)
+{
+    int y_ = r * c_RGB2YUV422Coeffs_i[2] + g * c_RGB2YUV422Coeffs_i[3] +
+             b * c_RGB2YUV422Coeffs_i[4] + c_RGB2YUV422Coeffs_i[0]*256;
+    y = saturate_cast<uchar>(((1 << (RGB2YUV422_SHIFT-1)) + y_) >> RGB2YUV422_SHIFT);
+}
+
+static inline void RGB2UV(const uchar r1, const uchar g1, const uchar b1,
+                          const uchar r2, const uchar g2, const uchar b2,
+                          uchar& u, uchar& v)
+{
+    int sr = r1 + r2, sg = g1 + g2, sb = b1 + b2;
+
+    int u_ = sr * c_RGB2YUV422Coeffs_i[5] + sg * c_RGB2YUV422Coeffs_i[6] +
+             sb * c_RGB2YUV422Coeffs_i[7] + c_RGB2YUV422Coeffs_i[1]*256;
+    u = saturate_cast<uchar>(((1 << (RGB2YUV422_SHIFT-1)) + u_) >> RGB2YUV422_SHIFT);
+
+    int v_ = sr * c_RGB2YUV422Coeffs_i[7] + sg * c_RGB2YUV422Coeffs_i[8] +
+             sb * c_RGB2YUV422Coeffs_i[9] + c_RGB2YUV422Coeffs_i[1]*256;
+    v = saturate_cast<uchar>(((1 << (RGB2YUV422_SHIFT-1)) + v_) >> RGB2YUV422_SHIFT);
+}
+
+template<int uidx, int vidx, int yidx>
+static inline void cvtRGB82Yuv422(const uchar r1, const uchar g1, const uchar b1,
+                                  const uchar r2, const uchar g2, const uchar b2,
+                                  uchar* row)
+{
+    uchar &u = row[uidx], &v = row[vidx], &y1 = row[yidx], &y2 = row[yidx+2];
+
+    RGB2Y(r1, g1, b1, y1);
+    RGB2Y(r2, g2, b2, y2);
+
+    RGB2UV(r1, g1, b1, r2, g2, b2, u, v);
+}
+
+// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; scn is 3 or 4
+template<int bIdx, int uIdx, int yIdx, int scn>
+struct RGB8toYUV422Invoker : ParallelLoopBody
+{
+    uchar * dst_data;
+    size_t dst_step;
+    const uchar * src_data;
+    size_t src_step;
+    int width;
+
+    RGB8toYUV422Invoker(uchar * _dst_data, size_t _dst_step,
+                        const uchar * _src_data, size_t _src_step,
+                        int _width)
+        : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {}
+
+    void operator()(const Range& range) const CV_OVERRIDE
+    {
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;
+
+        // [yIdx, uIdx] | [uidx, vidx]:
+        //     0, 0     |    1, 3
+        //     0, 1     |    3, 1
+        //     1, 0     |    0, 2
+        const int uidx = 1 - yIdx + uIdx * 2;
+        const int vidx = (2 + uidx) % 4;
+        const int ridx = (2-bIdx);
+        const uchar* rgb_src = src_data + rangeBegin * (src_step);
+        const uchar* rgb_src2 = rgb_src+scn;
+
+        for (int j = rangeBegin; j < rangeEnd; j++, rgb_src += src_step, rgb_src2 = rgb_src+scn)
+        {
+            uchar* row = dst_data + (dst_step) * j;
+            int i = 0;
+            for (; i < scn * width; i += (scn << 1), row += 4)
+            {
+                const uchar r1 = rgb_src[i+ridx], g1 = rgb_src[i+1], b1 = rgb_src[i+bIdx];
+                const uchar r2 = rgb_src2[i+ridx], g2 = rgb_src2[i+1], b2 = rgb_src2[i+bIdx];
+
+                cvtRGB82Yuv422<uidx, vidx, yIdx>(r1, g1, b1, r2, g2, b2, row);
+            }
+        }
+    }
+};
+
+template<int bIdx, int uIdx, int yIdx, int scn>
+inline void cvtRGBtoYUV422(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step,
+                           int width, int height)
+{
+    RGB8toYUV422Invoker<bIdx, uIdx, yIdx, scn> converter(dst_data, dst_step, src_data, src_step, width);
+    if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
+        parallel_for_(Range(0, height), converter);
+    else
+        converter(Range(0, height));
+}
+
+
 } // namespace anon

@@ -2033,6 +2145,35 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
     cvtPtr(dst_data, dst_step, src_data, src_step, width, height);
 }

+void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int scn, bool swapBlue, int uIdx, int ycn)
+{
+    CV_INSTRUMENT_REGION();
+
+    cvt_1plane_yuv_ptr_t cvtPtr;
+    int blueIdx = swapBlue ?
2 : 0; + switch(scn*1000 + blueIdx*100 + uIdx*10 + ycn) + { + case 3000: cvtPtr = cvtRGBtoYUV422<0,0,0,3>; break; + case 3001: cvtPtr = cvtRGBtoYUV422<0,0,1,3>; break; + case 3010: cvtPtr = cvtRGBtoYUV422<0,1,0,3>; break; + case 3200: cvtPtr = cvtRGBtoYUV422<2,0,0,3>; break; + case 3201: cvtPtr = cvtRGBtoYUV422<2,0,1,3>; break; + case 3210: cvtPtr = cvtRGBtoYUV422<2,1,0,3>; break; + case 4000: cvtPtr = cvtRGBtoYUV422<0,0,0,4>; break; + case 4001: cvtPtr = cvtRGBtoYUV422<0,0,1,4>; break; + case 4010: cvtPtr = cvtRGBtoYUV422<0,1,0,4>; break; + case 4200: cvtPtr = cvtRGBtoYUV422<2,0,0,4>; break; + case 4201: cvtPtr = cvtRGBtoYUV422<2,0,1,4>; break; + case 4210: cvtPtr = cvtRGBtoYUV422<2,1,0,4>; break; + default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; + }; + + cvtPtr(dst_data, dst_step, src_data, src_step, width, height); +} + #endif CV_CPU_OPTIMIZATION_NAMESPACE_END }} // namespace diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 3c5501659a..cae583f618 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -510,9 +510,6 @@ It supports both hierarchical and plane variants of Suzuki algorithm. typedef struct _CvContourScanner* CvContourScanner; -#define _CV_FIND_CONTOURS_FLAGS_EXTERNAL_ONLY 1 -#define _CV_FIND_CONTOURS_FLAGS_HIERARCHIC 2 - /* Initializes scanner structure. Prepare image for scanning ( clear borders and convert all pixels to 0-1. diff --git a/modules/imgproc/src/distransform.cpp b/modules/imgproc/src/distransform.cpp index adb0359c07..b11da8ebc2 100755 --- a/modules/imgproc/src/distransform.cpp +++ b/modules/imgproc/src/distransform.cpp @@ -45,24 +45,23 @@ namespace cv { static const int DIST_SHIFT = 16; -static const int INIT_DIST0 = INT_MAX; -static const int DIST_MAX = (INT_MAX >> 2); #define CV_FLT_TO_FIX(x,n) cvRound((x)*(1<<(n))) static void -initTopBottom( Mat& temp, int border ) +initTopBottom( Mat& temp, int border, unsigned int value ) { Size size = temp.size(); + unsigned int* ttop = (unsigned int*)temp.ptr(0); + unsigned int* tbottom = (unsigned int*)temp.ptr(size.height - 1); for( int i = 0; i < border; i++ ) { - int* ttop = temp.ptr(i); - int* tbottom = temp.ptr(size.height - i - 1); - for( int j = 0; j < size.width; j++ ) { - ttop[j] = INIT_DIST0; - tbottom[j] = INIT_DIST0; + ttop[j] = value; + tbottom[j] = value; } + ttop += size.width; + tbottom -= size.width; } } @@ -74,6 +73,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met int i, j; const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT ); const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT ); + const unsigned int DIST_MAX = UINT_MAX - DIAG_DIST; const float scale = 1.f/(1 << DIST_SHIFT); const uchar* src = _src.ptr(); @@ -84,7 +84,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met int dststep = (int)(_dist.step/sizeof(dist[0])); Size size = _src.size(); - initTopBottom( _temp, BORDER ); + initTopBottom( _temp, BORDER, DIST_MAX ); // forward pass unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER; @@ -92,7 +92,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met for( i = 0; i < size.height; i++ ) { for( j = 0; j < BORDER; j++ ) - tmp[-j-1] = tmp[size.width + j] = INIT_DIST0; + tmp[-j-1] = tmp[size.width + j] = DIST_MAX; for( j = 0; j < size.width; j++ ) { @@ -107,7 +107,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met 
if( t0 > t ) t0 = t; t = tmp[j-1] + HV_DIST; if( t0 > t ) t0 = t; - tmp[j] = t0; + tmp[j] = (t0 > DIST_MAX) ? DIST_MAX : t0; } } tmp += step; @@ -135,7 +135,6 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met if( t0 > t ) t0 = t; tmp[j] = t0; } - t0 = (t0 > DIST_MAX) ? DIST_MAX : t0; d[j] = (float)(t0 * scale); } d -= dststep; @@ -151,6 +150,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT ); const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT ); const unsigned int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT ); + const unsigned int DIST_MAX = UINT_MAX - LONG_DIST; const float scale = 1.f/(1 << DIST_SHIFT); const uchar* src = _src.ptr(); @@ -161,7 +161,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met int dststep = (int)(_dist.step/sizeof(dist[0])); Size size = _src.size(); - initTopBottom( _temp, BORDER ); + initTopBottom( _temp, BORDER, DIST_MAX ); // forward pass unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER; @@ -169,7 +169,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met for( i = 0; i < size.height; i++ ) { for( j = 0; j < BORDER; j++ ) - tmp[-j-1] = tmp[size.width + j] = INIT_DIST0; + tmp[-j-1] = tmp[size.width + j] = DIST_MAX; for( j = 0; j < size.width; j++ ) { @@ -192,7 +192,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met if( t0 > t ) t0 = t; t = tmp[j-1] + HV_DIST; if( t0 > t ) t0 = t; - tmp[j] = t0; + tmp[j] = (t0 > DIST_MAX) ? DIST_MAX : t0; } } tmp += step; @@ -228,7 +228,6 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met if( t0 > t ) t0 = t; tmp[j] = t0; } - t0 = (t0 > DIST_MAX) ? DIST_MAX : t0; d[j] = (float)(t0 * scale); } d -= dststep; @@ -245,6 +244,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels, const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT ); const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT ); const unsigned int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT ); + const unsigned int DIST_MAX = UINT_MAX - LONG_DIST; const float scale = 1.f/(1 << DIST_SHIFT); const uchar* src = _src.ptr(); @@ -257,7 +257,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels, int lstep = (int)(_labels.step/sizeof(labels[0])); Size size = _src.size(); - initTopBottom( _temp, BORDER ); + initTopBottom( _temp, BORDER, DIST_MAX ); // forward pass const uchar* s = src; @@ -266,7 +266,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels, for( i = 0; i < size.height; i++ ) { for( j = 0; j < BORDER; j++ ) - tmp[-j-1] = tmp[size.width + j] = INIT_DIST0; + tmp[-j-1] = tmp[size.width + j] = DIST_MAX; for( j = 0; j < size.width; j++ ) { @@ -277,7 +277,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels, } else { - unsigned int t0 = INIT_DIST0, t; + unsigned int t0 = DIST_MAX, t; int l0 = 0; t = tmp[j-step*2-1] + LONG_DIST; @@ -402,7 +402,6 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels, tmp[j] = t0; lls[j] = l0; } - t0 = (t0 > DIST_MAX) ? 
DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
         d -= dststep;
@@ -455,7 +454,7 @@ static void getDistanceTransformMask( int maskType, float *metrics )

 struct DTColumnInvoker : ParallelLoopBody
 {
-    DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const int* _sqr_tab)
+    DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const unsigned int* _sqr_tab)
     {
         src = _src;
         dst = _dst;
@@ -496,12 +495,14 @@ struct DTColumnInvoker : ParallelLoopBody
     const Mat* src;
     Mat* dst;
     const int* sat_tab;
-    const int* sqr_tab;
+    const unsigned int* sqr_tab;
 };

+static const int PRECISE_DIST_MAX = 1 << 16;
+
 struct DTRowInvoker : ParallelLoopBody
 {
-    DTRowInvoker( Mat* _dst, const int* _sqr_tab, const float* _inv_tab )
+    DTRowInvoker( Mat* _dst, const unsigned int* _sqr_tab, const float* _inv_tab )
     {
         dst = _dst;
         sqr_tab = _sqr_tab;
@@ -528,7 +529,7 @@ struct DTRowInvoker : ParallelLoopBody
         z[1] = inf;
         f[0] = d[0];

-        for( q = 1, k = 0; q < n; q++ )
+        for( q = 1, k = 0; q < std::min(PRECISE_DIST_MAX, n); q++ )
         {
             float fq = d[q];
             f[q] = fq;
@@ -547,6 +548,25 @@ struct DTRowInvoker : ParallelLoopBody
                 }
             }
         }
+        for(; q < n; q++ )
+        {
+            float fq = d[q];
+            f[q] = fq;
+
+            for(;;k--)
+            {
+                p = v[k];
+                float s = (fq - d[p] + static_cast<float>(q + p) * (q - p))*inv_tab[q - p];
+                if( s > z[k] )
+                {
+                    k++;
+                    v[k] = q;
+                    z[k] = s;
+                    z[k+1] = inf;
+                    break;
+                }
+            }
+        }

         for( q = 0, k = 0; q < n; q++ )
         {
@@ -559,14 +579,14 @@ struct DTRowInvoker : ParallelLoopBody
     }

     Mat* dst;
-    const int* sqr_tab;
+    const unsigned int* sqr_tab;
     const float* inv_tab;
 };

 static void
 trueDistTrans( const Mat& src, Mat& dst )
 {
-    const int inf = INT_MAX;
+    const unsigned int inf = UINT_MAX;

     CV_Assert( src.size() == dst.size() );

@@ -575,12 +595,12 @@ trueDistTrans( const Mat& src, Mat& dst )
     cv::AutoBuffer<uchar> _buf(std::max(m*2*sizeof(int) + (m*3+1)*sizeof(int), n*2*sizeof(float)));

     // stage 1: compute 1d distance transform of each column
-    int* sqr_tab = (int*)_buf.data();
+    unsigned int* sqr_tab = (unsigned int*)_buf.data();
     int* sat_tab = cv::alignPtr((int*)(sqr_tab + m*2), sizeof(int));
     int shift = m*2;

     for( i = 0; i < m; i++ )
-        sqr_tab[i] = i*i;
+        sqr_tab[i] = i >= PRECISE_DIST_MAX ? inf : static_cast<unsigned int>(i) * i;
     for( i = m; i < m*2; i++ )
         sqr_tab[i] = inf;
     for( i = 0; i < shift; i++ )
@@ -598,7 +618,7 @@ trueDistTrans( const Mat& src, Mat& dst )
     for( i = 1; i < n; i++ )
     {
         inv_tab[i] = (float)(0.5/i);
-        sqr_tab[i] = i*i;
+        sqr_tab[i] = i >= PRECISE_DIST_MAX ? inf : static_cast<unsigned int>(i) * i;
     }

     cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
index dae4c71bf2..b53098bcd2 100644
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -2265,9 +2265,6 @@ void cv::drawContours( InputOutputArray _image, InputArrayOfArrays _contours,
 static const int CodeDeltas[8][2] =
 { {1, 0}, {1, -1}, {0, -1}, {-1, -1}, {-1, 0}, {-1, 1}, {0, 1}, {1, 1} };

-#define CV_ADJUST_EDGE_COUNT( count, seq ) \
-    ((count) -= ((count) == (seq)->total && !CV_IS_SEQ_CLOSED(seq)))
-
 void
 cvDrawContours( void* _img, CvSeq* contour,
                 CvScalar _externalColor, CvScalar _holeColor,
diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp
index 43693dbf80..15bf9682fc 100644
--- a/modules/imgproc/src/filter.dispatch.cpp
+++ b/modules/imgproc/src/filter.dispatch.cpp
@@ -163,8 +163,6 @@ void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
     wholeSize = Size(-1,-1);
 }

-#define VEC_ALIGN CV_MALLOC_ALIGN
-
 int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs)
 {
     CV_INSTRUMENT_REGION();
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index 6eb956bfee..c066f3d6f3 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -300,9 +300,12 @@ inline int hal_ni_warpPerspective(int src_type, const uchar *src_data, size_t sr
 /**
    @brief hal_cvtBGRtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U, CV_32F)
    @param scn source image channels (3 or 4)
    @param dcn destination image channels (3 or 4)
@@ -313,9 +316,12 @@ inline int hal_ni_cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * d
 /**
    @brief hal_cvtBGRtoBGR5x5
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
    @param greenBits number of bits for green channel (5 or 6)
@@ -326,9 +332,12 @@ inline int hal_ni_cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar
 /**
    @brief hal_cvtBGR5x5toBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
    @param greenBits number of bits for green channel (5 or 6)
@@ -339,9 +348,12 @@ inline int hal_ni_cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar
 /**
    @brief hal_cvtBGRtoGray
-   @param src_data,src_step source 
image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U, CV_16U or CV_32F) @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @@ -351,9 +363,12 @@ inline int hal_ni_cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * /** @brief hal_cvtGraytoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U, CV_16U or CV_32F) @param dcn destination image channels (3 or 4) Convert from 1-channel gray to BGR, RGB, RGBA or BGRA. @@ -362,9 +377,12 @@ inline int hal_ni_cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * /** @brief hal_cvtBGR5x5toGray - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param greenBits number of bits for green channel (5 or 6) Convert from packed BGR (16 bits per pixel, 555 or 565) to 1-channel gray. Support only CV_8U images. @@ -373,9 +391,12 @@ inline int hal_ni_cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar /** @brief hal_cvtGraytoBGR5x5 - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param greenBits number of bits for green channel (5 or 6) Convert from 1-channel gray to packed BGR (16 bits per pixel, 555 or 565). Support only CV_8U images. 
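These hal_ni_* stubs are the override points for a custom HAL: a vendor header defines a function with the same signature and re-points the corresponding cv_hal_* macro at it, returning CV_HAL_ERROR_NOT_IMPLEMENTED to fall back to OpenCV's built-in path. A minimal sketch against the hal_cvtBGRtoGray hook documented above (the name myhal_cvtBGRtoGray and the fixed-point weights are illustrative, not part of this header):

inline int myhal_cvtBGRtoGray(const uchar* src_data, size_t src_step,
                              uchar* dst_data, size_t dst_step,
                              int width, int height,
                              int depth, int scn, bool swapBlue)
{
    if (depth != CV_8U || scn != 3 || swapBlue)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;  // let the built-in implementation run
    for (int y = 0; y < height; y++)
    {
        const uchar* s = src_data + y * src_step;
        uchar* d = dst_data + y * dst_step;
        // fixed-point BT.601 luma: 0.114*B + 0.587*G + 0.299*R, scaled by 2^14
        for (int x = 0; x < width; x++, s += 3)
            d[x] = (uchar)((s[0] * 1868 + s[1] * 9617 + s[2] * 4899 + (1 << 13)) >> 14);
    }
    return CV_HAL_ERROR_OK;
}
#undef cv_hal_cvtBGRtoGray
#define cv_hal_cvtBGRtoGray myhal_cvtBGRtoGray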
@@ -384,9 +405,12 @@ inline int hal_ni_cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar /** @brief hal_cvtBGRtoYUV - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U, CV_16U or CV_32F) @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @@ -397,9 +421,12 @@ inline int hal_ni_cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtYUVtoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U, CV_16U or CV_32F) @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @@ -410,9 +437,12 @@ inline int hal_ni_cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtBGRtoXYZ - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U, CV_16U or CV_32F) @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @@ -422,9 +452,12 @@ inline int hal_ni_cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtXYZtoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U, CV_16U or CV_32F) @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @@ -434,9 +467,12 @@ inline int hal_ni_cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtBGRtoHSV - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U or CV_32F) @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @@ -448,9 +484,12 @@ inline int hal_ni_cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtHSVtoBGR - @param src_data,src_step source image data and step - @param 
dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U or CV_32F) @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @@ -462,9 +501,12 @@ inline int hal_ni_cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtBGRtoLab - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U or CV_32F) @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @@ -476,9 +518,12 @@ inline int hal_ni_cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtLabtoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param depth image depth (one of CV_8U or CV_32F) @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @@ -490,9 +535,12 @@ inline int hal_ni_cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * d /** @brief hal_cvtTwoPlaneYUVtoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param dst_width,dst_height destination image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param dst_width destination image width + @param dst_height destination image height @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @param uIdx U-channel index in the interleaved U/V plane (0 or 1) @@ -503,10 +551,14 @@ inline int hal_ni_cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, u /** @brief Extended version of hal_cvtTwoPlaneYUVtoBGR. 
- @param y_data,y_step source image data and step (Y-plane) - @param uv_data,uv_step source image data and step (UV-plane) - @param dst_data,dst_step destination image data and step - @param dst_width,dst_height destination image size + @param y_data source image data (Y-plane) + @param y_step source image step (Y-plane) + @param uv_data source image data (UV-plane) + @param uv_step source image step (UV-plane) + @param dst_data destination image data + @param dst_step destination image step + @param dst_width destination image width + @param dst_height destination image height @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @param uIdx U-channel index in the interleaved U/V plane (0 or 1) @@ -519,10 +571,14 @@ inline int hal_ni_cvtTwoPlaneYUVtoBGREx(const uchar * y_data, size_t y_step, con /** @brief hal_cvtBGRtoTwoPlaneYUV - @param src_data,src_step source image data and step - @param y_data,y_step destination image data and step (Y-plane) - @param uv_data,uv_step destination image data and step (UV-plane) - @param width,height image size + @param src_data source image data + @param src_step source image step + @param y_data destination image data (Y-plane) + @param y_step destination image step (Y-plane) + @param uv_data destination image data (UV-plane) + @param uv_step destination image step (UV-plane) + @param width image width + @param height image height @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @param uIdx U-channel plane index (0 or 1) @@ -536,9 +592,12 @@ inline int hal_ni_cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, /** @brief hal_cvtThreePlaneYUVtoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param dst_width,dst_height destination image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param dst_width destination image width + @param dst_height destination image height @param dcn destination image channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @param uIdx U-channel plane index (0 or 1) @@ -549,9 +608,12 @@ inline int hal_ni_cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, /** @brief hal_cvtBGRtoThreePlaneYUV - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param scn source image channels (3 or 4) @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) @param uIdx U-channel plane index (0 or 1) @@ -562,9 +624,12 @@ inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, /** @brief hal_cvtOnePlaneYUVtoBGR - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height @param dcn destination image 
channels (3 or 4) @param swapBlue if set to true B and R destination channels will be swapped (write RGB) @param uIdx U-channel index (0 or 1) @@ -574,12 +639,28 @@ inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, */ inline int hal_ni_cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } - /** - @brief hal_cvtRGBAtoMultipliedRGBA + @brief hal_cvtOnePlaneBGRtoYUV @param src_data,src_step source image data and step @param dst_data,dst_step destination image data and step @param width,height image size + @param scn source image channels (3 or 4) + @param swapBlue if set to true B and R source channels will be swapped (treat as RGB) + @param uIdx U-channel index (0 or 1) + @param ycn Y-channel index (0 or 1) + Convert from BGR, RGB, BGRA or RGBA to UYVY, YUY2 or YVYU. + Only for CV_8U. + */ +inline int hal_ni_cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } + +/** + @brief hal_cvtRGBAtoMultipliedRGBA + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height Convert from BGRA or RGBA to format with multiplied alpha channel. Only for CV_8U. */ @@ -587,9 +668,12 @@ inline int hal_ni_cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_ste /** @brief hal_cvtMultipliedRGBAtoRGBA - @param src_data,src_step source image data and step - @param dst_data,dst_step destination image data and step - @param width,height image size + @param src_data source image data + @param src_step source image step + @param dst_data destination image data + @param dst_step destination image step + @param width image width + @param height image height Convert from format with multiplied alpha channel to BGRA or RGBA. Only for CV_8U. */ @@ -617,18 +701,26 @@ inline int hal_ni_cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_ste #define cv_hal_cvtThreePlaneYUVtoBGR hal_ni_cvtThreePlaneYUVtoBGR #define cv_hal_cvtBGRtoThreePlaneYUV hal_ni_cvtBGRtoThreePlaneYUV #define cv_hal_cvtOnePlaneYUVtoBGR hal_ni_cvtOnePlaneYUVtoBGR +#define cv_hal_cvtOnePlaneBGRtoYUV hal_ni_cvtOnePlaneBGRtoYUV #define cv_hal_cvtRGBAtoMultipliedRGBA hal_ni_cvtRGBAtoMultipliedRGBA #define cv_hal_cvtMultipliedRGBAtoRGBA hal_ni_cvtMultipliedRGBAtoRGBA //! 
@endcond /** @brief Calculate integral image - @param depth,sdepth,sqdepth Depths of source image, sum image and square sum image - @param src_data,src_step Source image - @param sum_data,sum_step Sum image - @param sqsum_data,sqsum_step Square sum image - @param tilted_data,tilted_step Tilted sum image - @param width,height Source image dimensions + @param depth Depth of source image + @param sdepth Depth of sum image + @param sqdepth Depth of square sum image + @param src_data Source image data + @param src_step Source image step + @param sum_data Sum image data + @param sum_step Sum image step + @param sqsum_data Square sum image data + @param sqsum_step Square sum image step + @param tilted_data Tilted sum image data + @param tilted_step Tilted sum image step + @param width Source image width + @param height Source image height @param cn Number of channels @note Following combinations of image depths are used: Source | Sum | Square sum @@ -655,9 +747,12 @@ inline int hal_ni_integral(int depth, int sdepth, int sqdepth, const uchar * src /** @brief Calculate medianBlur filter - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param depth Depths of source and destination image @param cn Number of channels @param ksize Size of kernel @@ -670,9 +765,12 @@ inline int hal_ni_medianBlur(const uchar* src_data, size_t src_step, uchar* dst_ /** @brief Calculates adaptive threshold - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param maxValue Value assigned to the pixels for which the condition is satisfied @param adaptiveMethod Adaptive thresholding algorithm @param thresholdType Thresholding type @@ -687,9 +785,12 @@ inline int hal_ni_adaptiveThreshold(const uchar* src_data, size_t src_step, ucha /** @brief Calculates fixed-level threshold to each array element - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param depth Depths of source and destination image @param cn Number of channels @param thresh Threshold value @@ -704,14 +805,23 @@ inline int hal_ni_threshold(const uchar* src_data, size_t src_step, uchar* dst_d /** @brief Calculate box filter - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions - @param src_depth,dst_depth Depths of source and destination image + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height + @param src_depth Depth of source image + @param dst_depth Depth of destination image @param cn Number of channels - @param 
margin_left,margin_top,margin_right,margin_bottom Margins for source image - @param ksize_width,ksize_height Size of kernel - @param anchor_x,anchor_y Anchor point + @param margin_left Left margin for source image + @param margin_top Top margin for source image + @param margin_right Right margin for source image + @param margin_bottom Bottom margin for source image + @param ksize_width Width of kernel + @param ksize_height Height of kernel + @param anchor_x Anchor point x coordinate + @param anchor_y Anchor point y coordinate @param normalize If true then result is normalized @param border_type Border type */ @@ -723,14 +833,22 @@ inline int hal_ni_boxFilter(const uchar* src_data, size_t src_step, uchar* dst_d /** @brief Blurs an image using a Gaussian filter. - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param depth Depth of source and destination image @param cn Number of channels - @param margin_left,margin_top,margin_right,margin_bottom Margins for source image - @param ksize_width,ksize_height Size of kernel - @param sigmaX,sigmaY Gaussian kernel standard deviation. + @param margin_left Left margin for source image + @param margin_top Top margin for source image + @param margin_right Right margin for source image + @param margin_bottom Bottom margin for source image + @param ksize_width Width of kernel + @param ksize_height Height of kernel + @param sigmaX Gaussian kernel standard deviation in X direction. + @param sigmaY Gaussian kernel standard deviation in Y direction. 
@param border_type Border type */ inline int hal_ni_gaussianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize_width, size_t ksize_height, double sigmaX, double sigmaY, int border_type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -741,13 +859,21 @@ inline int hal_ni_gaussianBlur(const uchar* src_data, size_t src_step, uchar* ds /** @brief Computes Sobel derivatives - @param src_depth,dst_depth Depths of source and destination image - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_depth Depth of source image + @param dst_depth Depth of destination image + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param cn Number of channels - @param margin_left,margin_top,margin_right,margin_bottom Margins for source image - @param dx,dy orders of the derivative x and y respectively + @param margin_left Left margin for source image + @param margin_top Top margin for source image + @param margin_right Right margin for source image + @param margin_bottom Bottom margin for source image + @param dx order of the derivative x + @param dy order of the derivative y @param ksize Size of kernel @param scale Scale factor for the computed derivative values @param delta Delta value that is added to the results prior to storing them in dst @@ -761,13 +887,21 @@ inline int hal_ni_sobel(const uchar* src_data, size_t src_step, uchar* dst_data, /** @brief Computes Scharr filter - @param src_depth,dst_depth Depths of source and destination image - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_depth Depth of source image + @param dst_depth Depth of destination image + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param cn Number of channels - @param margin_left,margin_top,margin_right,margin_bottom Margins for source image - @param dx,dy orders of the derivative x and y respectively + @param margin_left Left margin for source image + @param margin_top Top margin for source image + @param margin_right Right margin for source image + @param margin_bottom Bottom margin for source image + @param dx order of the derivative x + @param dy order of the derivative y @param scale Scale factor for the computed derivative values @param delta Delta value that is added to the results prior to storing them in dst @param border_type Border type @@ -781,10 +915,14 @@ inline int hal_ni_scharr(const uchar* src_data, size_t src_step, uchar* dst_data /** @brief Perform Gaussian Blur and downsampling for input tile. 
@param depth Depths of source and destination image - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param src_width,src_height Source image dimensions - @param dst_width,dst_height Destination image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param src_width Source image width + @param src_height Source image height + @param dst_width Destination image width + @param dst_height Destination image height @param cn Number of channels @param border_type Border type */ @@ -796,11 +934,15 @@ inline int hal_ni_pyrdown(const uchar* src_data, size_t src_step, int src_width, /** @brief Canny edge detector - @param src_data,src_step Source image - @param dst_data,dst_step Destination image - @param width,height Source image dimensions + @param src_data Source image data + @param src_step Source image step + @param dst_data Destination image data + @param dst_step Destination image step + @param width Source image width + @param height Source image height @param cn Number of channels - @param lowThreshold, highThreshold Thresholds value + @param lowThreshold low threshold value + @param highThreshold high threshold value @param ksize Kernel size for Sobel operator. @param L2gradient Flag, indicating use L2 or L1 norma. */ diff --git a/modules/imgproc/src/intelligent_scissors.cpp b/modules/imgproc/src/intelligent_scissors.cpp index 6e2dfc3288..2007575891 100644 --- a/modules/imgproc/src/intelligent_scissors.cpp +++ b/modules/imgproc/src/intelligent_scissors.cpp @@ -90,9 +90,9 @@ struct IntelligentScissorsMB::Impl int laplacianKernelSize = 3; // 1 or 3 // image features - Mat_<Point2f> gradient_direction; //< I: normalized laplacian x/y components - Mat_<float> gradient_magnitude; //< Fg: gradient cost function - Mat_<uchar> non_edge_feature; //< Fz: zero-crossing function + Mat_<Point2f> gradient_direction; ///< I: normalized laplacian x/y components + Mat_<float> gradient_magnitude; ///< Fg: gradient cost function + Mat_<uchar> non_edge_feature; ///< Fz: zero-crossing function float weight_non_edge_compute = 0.0f; diff --git a/modules/imgproc/src/lsd.cpp b/modules/imgproc/src/lsd.cpp index fa01c60127..ed6b00f986 100644 --- a/modules/imgproc/src/lsd.cpp +++ b/modules/imgproc/src/lsd.cpp @@ -214,7 +214,7 @@ public: /** * Draw lines on the given canvas. * - * @param image The image, where lines will be drawn. + * @param _image The image, where lines will be drawn. * Should have the size of the image, where the lines were found * @param lines The lines that need to be drawn */ @@ -226,7 +226,7 @@ public: * @param size The size of the image, where lines1 and lines2 were found. * @param lines1 The first lines that need to be drawn. Color - Blue. * @param lines2 The second lines that need to be drawn. Color - Red. - * @param image An optional image, where lines will be drawn. + * @param _image An optional image, where lines will be drawn. * Should have the size of the image, where the lines were found * @return The number of mismatching pixels between lines1 and lines2. */ @@ -308,8 +308,6 @@ private: * * @param threshold The minimum value of the angle that is considered defined, otherwise NOTDEF * @param n_bins The number of bins with which gradients are ordered by, using bucket sort. - * @param ordered_points Return: Vector of coordinate points that are pseudo ordered by magnitude. - * Pixels would be ordered by norm value, up to a precision given by max_grad/n_bins. 
*/ void ll_angle(const double& threshold, const unsigned int& n_bins); diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp index 90f0b20330..7d8423d322 100644 --- a/modules/imgproc/src/median_blur.simd.hpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -548,7 +548,7 @@ struct MinMax32f } }; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) struct MinMaxVec8u { @@ -688,7 +688,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) if( limit == size.width ) break; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) int nlanes = VTraits::vlanes(); #else int nlanes = 1; @@ -793,7 +793,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) if( limit == size.width ) break; -#if CV_SIMD || CV_SIMD_SCALABLE +#if (CV_SIMD || CV_SIMD_SCALABLE) int nlanes = VTraits::vlanes(); #else int nlanes = 1; diff --git a/modules/imgproc/src/opencl/color_yuv.cl b/modules/imgproc/src/opencl/color_yuv.cl index bf75a1c5b8..c536f87a0b 100644 --- a/modules/imgproc/src/opencl/color_yuv.cl +++ b/modules/imgproc/src/opencl/color_yuv.cl @@ -568,6 +568,76 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of } } +// Coefficients based on ITU.BT-601, ISBN 1-878707-09-4 (https://fourcc.org/fccyvrgb.php) +// The conversion coefficients for RGB to YUV422 are based on the ones for RGB to YUV. +// For both Y components, the coefficients are applied as given in the link to each input RGB pixel +// separately. For U and V, they are reduced by half to account for two RGB pixels contributing +// to the same U and V values. In other words, the U and V contributions from the two RGB pixels +// are averaged. The integer versions are obtained by multiplying the float versions by 16384 +// and rounding to the nearest integer. 
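A quick editorial sanity check on the scaling described in the comment above (an illustration, not part of the patch; 16384 = 2^14). This standalone C++ snippet regenerates the integer coefficients from the float table:

#include <cmath>
#include <cstdio>

// Illustration only: reproduce c_RGB2YUV422Coeffs_i[2..9] from the float
// coefficients by scaling with 16384 and rounding to the nearest integer.
// The offset entries at indices 0 and 1 carry an extra HALF_MAX_NUM factor
// in the kernel and are skipped here.
int main() {
    const float f[10] = {0.0625f, 0.5f, 0.257f, 0.504f, 0.098f,
                         -0.074f, -0.1455f, 0.2195f, -0.184f, -0.0355f};
    for (int i = 2; i < 10; ++i)
        std::printf("%ld ", std::lround(f[i] * 16384.0));
    // Prints: 4211 8258 1606 -1212 -2384 3596 -3015 -582
    return 0;
}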
+ +__constant float c_RGB2YUV422Coeffs_f[10] = {0.0625, 0.5, 0.257, 0.504, 0.098, -0.074 , -0.1455, 0.2195, -0.184 , -0.0355}; +__constant int c_RGB2YUV422Coeffs_i[10] = {1024 * HALF_MAX_NUM * 2, 8192 * HALF_MAX_NUM * 2, 4211, 8258, 1606, -1212, -2384, 3596, -3015, -582}; + +__kernel void RGB2YUV_422(__global const uchar* srcptr, int src_step, int src_offset, + __global uchar* dstptr, int dst_step, int dst_offset, + int rows, int cols) +{ + int x = get_global_id(0); + int y = get_global_id(1) * PIX_PER_WI_Y; + + if (x < cols/2) + { + int src_index = mad24(y, src_step, mad24(x << 1, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x << 1, dcnbytes, dst_offset)); + + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index); + DATA_TYPE_3 src_pix1 = vload3(0, src); + DATA_TYPE b1 = src_pix1.B_COMP, g1 = src_pix1.G_COMP, r1 = src_pix1.R_COMP; + DATA_TYPE_3 src_pix2 = vload3(0, src+scn); + DATA_TYPE b2 = src_pix2.B_COMP, g2 = src_pix2.G_COMP, r2 = src_pix2.R_COMP; + + +#ifdef DEPTH_5 + __constant float * coeffs = c_RGB2YUV422Coeffs_f; + #define MAC_fn fma + #define res_dtype DATA_TYPE + #define mul_fn(x,y) (x*y) + #define output_scale_fn(x) x +#else + __constant int * coeffs = c_RGB2YUV422Coeffs_i; + #define MAC_fn mad24 + #define res_dtype int + #define mul_fn mul24 + #define output_scale_fn(x) SAT_CAST(CV_DESCALE(x, yuv_shift)) +#endif + + const res_dtype Y1 = MAC_fn(coeffs[2], r1, coeffs[0] + MAC_fn(coeffs[3], g1, mul_fn(coeffs[4], b1))); + const res_dtype Y2 = MAC_fn(coeffs[2], r2, coeffs[0] + MAC_fn(coeffs[3], g2, mul_fn(coeffs[4], b2))); + + const res_dtype sr = r1+r2, sg = g1+g2, sb = b1+b2; + const res_dtype U = MAC_fn(coeffs[5], sr, coeffs[1] + MAC_fn(coeffs[6], sg, mul_fn(coeffs[7], sb))); + const res_dtype V = MAC_fn(coeffs[7], sr, coeffs[1] + MAC_fn(coeffs[8], sg, mul_fn(coeffs[9], sb))); + + dst[uidx] = output_scale_fn(U); + dst[(2 + uidx) % 4] = output_scale_fn(V); + dst[yidx] = output_scale_fn(Y1); + dst[yidx+2] = output_scale_fn(Y2); + + ++y; + dst_index += dst_step; + src_index += src_step; + } + } + } +} + ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// __constant float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF}; diff --git a/modules/imgproc/src/sumpixels.avx512_skx.hpp b/modules/imgproc/src/sumpixels.avx512_skx.hpp index 81d9d1d846..09b777b268 100644 --- a/modules/imgproc/src/sumpixels.avx512_skx.hpp +++ b/modules/imgproc/src/sumpixels.avx512_skx.hpp @@ -26,7 +26,7 @@ template class IntegralCalculator; template class IntegralCalculator { public: - IntegralCalculator() {}; + IntegralCalculator() {} void calculate_integral_avx512(const uchar *src, size_t _srcstep, diff --git a/modules/imgproc/test/ocl/test_color.cpp b/modules/imgproc/test/ocl/test_color.cpp index bdbc6a90d0..55401ba73c 100644 --- a/modules/imgproc/test/ocl/test_color.cpp +++ b/modules/imgproc/test/ocl/test_color.cpp @@ -458,6 +458,43 @@ OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGR_YVYU) { performTest(2, 3, CVTCODE(YUV2B OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGBA_YVYU) { performTest(2, 4, CVTCODE(YUV2RGBA_YVYU)); } OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGRA_YVYU) { performTest(2, 4, CVTCODE(YUV2BGRA_YVYU)); } +// RGBA -> YUV422 + +struct CvtColor_RGB2YUV_422 : + public CvtColor +{ + void generateTestData(int channelsIn, int channelsOut) + { + const int srcType = 
CV_MAKE_TYPE(depth, channelsIn); + const int dstType = CV_MAKE_TYPE(depth, channelsOut); + + Size roiSize = randomSize(1, MAX_VALUE); + roiSize.width *= 2; + roiSize.height *= 2; + + Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(src, src_roi, roiSize, srcBorder, srcType, 2, 100); + + Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 6, 16); + + UMAT_UPLOAD_INPUT_PARAMETER(src); + UMAT_UPLOAD_OUTPUT_PARAMETER(dst); + } +}; + +OCL_TEST_P(CvtColor_RGB2YUV_422, RGB2YUV_UYVY) { performTest(3, 2, CVTCODE(RGB2YUV_UYVY)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, BGR2YUV_UYVY) { performTest(3, 2, CVTCODE(BGR2YUV_UYVY)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, RGBA2YUV_UYVY) { performTest(4, 2, CVTCODE(RGBA2YUV_UYVY)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, BGRA2YUV_UYVY) { performTest(4, 2, CVTCODE(BGRA2YUV_UYVY)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, RGB2YUV_YUY2) { performTest(3, 2, CVTCODE(RGB2YUV_YUY2)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, BGR2YUV_YUY2) { performTest(3, 2, CVTCODE(BGR2YUV_YUY2)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, RGBA2YUV_YUY2) { performTest(4, 2, CVTCODE(RGBA2YUV_YUY2)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, BGRA2YUV_YUY2) { performTest(4, 2, CVTCODE(BGRA2YUV_YUY2)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, RGB2YUV_YVYU) { performTest(3, 2, CVTCODE(RGB2YUV_YVYU)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, BGR2YUV_YVYU) { performTest(3, 2, CVTCODE(BGR2YUV_YVYU)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, RGBA2YUV_YVYU) { performTest(4, 2, CVTCODE(RGBA2YUV_YVYU)); } +OCL_TEST_P(CvtColor_RGB2YUV_422, BGRA2YUV_YVYU) { performTest(4, 2, CVTCODE(BGRA2YUV_YVYU)); } OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor8u, testing::Combine(testing::Values(MatDepth(CV_8U)), Bool())); @@ -485,6 +522,11 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_YUV2RGB_422, testing::Values(MatDepth(CV_8U)), Bool())); +OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_RGB2YUV_422, + testing::Combine( + testing::Values(MatDepth(CV_8U)), + Bool())); + } } // namespace opencv_test::ocl #endif diff --git a/modules/imgproc/test/ocl/test_houghlines.cpp b/modules/imgproc/test/ocl/test_houghlines.cpp index 4e7b8917ac..64d5b248a3 100644 --- a/modules/imgproc/test/ocl/test_houghlines.cpp +++ b/modules/imgproc/test/ocl/test_houghlines.cpp @@ -181,4 +181,4 @@ OCL_INSTANTIATE_TEST_CASE_P(Imgproc, HoughLinesP, Combine(Values(100, 150), } } // namespace opencv_test::ocl -#endif // HAVE_OPENCL \ No newline at end of file +#endif // HAVE_OPENCL diff --git a/modules/imgproc/test/test_cvtyuv.cpp b/modules/imgproc/test/test_cvtyuv.cpp index cb49baab0a..7114ef035d 100644 --- a/modules/imgproc/test/test_cvtyuv.cpp +++ b/modules/imgproc/test/test_cvtyuv.cpp @@ -159,6 +159,42 @@ class I420Writer: public YUV420pWriter } }; +class YUV422Writer: public YUVwriter +{ + int channels() { return 2; } + Size size(Size imgSize) { return Size(imgSize.width, imgSize.height); } +}; + +class UYVYWriter: public YUV422Writer +{ + virtual void write(Mat& yuv, int row, int col, const YUV& val) + { + yuv.ptr(row)[col][1] = val[0]; + yuv.ptr(row)[(col/2)*2][0] = val[1]; + yuv.ptr(row)[(col/2)*2 + 1][0] = val[2]; + } +}; + +class YUY2Writer: public YUV422Writer +{ + virtual void write(Mat& yuv, int row, int col, const YUV& val) + { + yuv.ptr(row)[col][0] = val[0]; + yuv.ptr(row)[(col/2)*2][1] = val[1]; + yuv.ptr(row)[(col/2)*2 + 1][1] = val[2]; + } +}; + +class YVYUWriter: public YUV422Writer +{ + virtual void write(Mat& yuv, int row, int col, const YUV& val) + { + yuv.ptr(row)[col][0] = 
val[0]; + yuv.ptr(row)[(col/2)*2 + 1][1] = val[1]; + yuv.ptr(row)[(col/2)*2][1] = val[2]; + } +}; + class YUV420Reader: public YUVreader { int channels() { return 1; } @@ -357,6 +393,36 @@ public: } }; +class RGB2YUV422_Converter +{ +public: + YUV convert(RGB rgb1, RGB rgb2, int idx) + { + int r1 = rgb1[0]; + int g1 = rgb1[1]; + int b1 = rgb1[2]; + + int r2 = rgb2[0]; + int g2 = rgb2[1]; + int b2 = rgb2[2]; + + // Coefficients below based on ITU.BT-601, ISBN 1-878707-09-4 (https://fourcc.org/fccyvrgb.php) + // The conversion coefficients for RGB to YUV422 are based on the ones for RGB to YUV. + // For both Y components, the coefficients are applied as given in the link to each input RGB pixel + // separately. For U and V, they are reduced by half to account for two RGB pixels contributing + // to the same U and V values. In other words, the U and V contributions from the two RGB pixels + // are averaged. The integer versions are obtained by multiplying the float versions by 16384 + // and rounding to the nearest integer. + + uchar y1 = saturate_cast<uchar>((int)( 0.257f*r1 + 0.504f*g1 + 0.098f*b1 + 16)); + uchar y2 = saturate_cast<uchar>((int)( 0.257f*r2 + 0.504f*g2 + 0.098f*b2 + 16)); + uchar u = saturate_cast<uchar>((int)(-0.074f*(r1+r2) - 0.1455f*(g1+g2) + 0.2195f*(b1+b2) + 128)); + uchar v = saturate_cast<uchar>((int)( 0.2195f*(r1+r2) - 0.184f*(g1+g2) - 0.0355f*(b1+b2) + 128)); + + return YUV((idx==0)?y1:y2, u, v); + } +}; + YUVreader* YUVreader::getReader(int code) { switch(code) @@ -421,15 +487,27 @@ RGBreader* RGBreader::getReader(int code) { case COLOR_RGB2YUV_YV12: case COLOR_RGB2YUV_I420: + case COLOR_RGB2YUV_UYVY: + case COLOR_RGB2YUV_YUY2: + case COLOR_RGB2YUV_YVYU: return new RGB888Reader(); case COLOR_BGR2YUV_YV12: case COLOR_BGR2YUV_I420: + case COLOR_BGR2YUV_UYVY: + case COLOR_BGR2YUV_YUY2: + case COLOR_BGR2YUV_YVYU: return new BGR888Reader(); case COLOR_RGBA2YUV_I420: case COLOR_RGBA2YUV_YV12: + case COLOR_RGBA2YUV_UYVY: + case COLOR_RGBA2YUV_YUY2: + case COLOR_RGBA2YUV_YVYU: return new RGBA8888Reader(); case COLOR_BGRA2YUV_YV12: case COLOR_BGRA2YUV_I420: + case COLOR_BGRA2YUV_UYVY: + case COLOR_BGRA2YUV_YUY2: + case COLOR_BGRA2YUV_YVYU: return new BGRA8888Reader(); default: return 0; @@ -505,6 +583,21 @@ YUVwriter* YUVwriter::getWriter(int code) case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12: return new YV12Writer(); + case COLOR_RGB2YUV_UYVY: + case COLOR_BGR2YUV_UYVY: + case COLOR_RGBA2YUV_UYVY: + case COLOR_BGRA2YUV_UYVY: + return new UYVYWriter(); + case COLOR_RGB2YUV_YUY2: + case COLOR_BGR2YUV_YUY2: + case COLOR_RGBA2YUV_YUY2: + case COLOR_BGRA2YUV_YUY2: + return new YUY2Writer(); + case COLOR_RGB2YUV_YVYU: + case COLOR_BGR2YUV_YVYU: + case COLOR_RGBA2YUV_YVYU: + case COLOR_BGRA2YUV_YVYU: + return new YVYUWriter(); case COLOR_RGB2YUV_I420: case COLOR_BGR2YUV_I420: case COLOR_RGBA2YUV_I420: @@ -545,6 +638,21 @@ void referenceRGB2YUV(const Mat& rgb, Mat& yuv, RGBreader* rgbReader, YUVwriter* yuvWriter->write(yuv, row, col, cvt.convert(rgbReader->read(rgb, row, col))); } +template <class convertor> +void referenceRGB2YUV422(const Mat& rgb, Mat& yuv, RGBreader* rgbReader, YUVwriter* yuvWriter) +{ + convertor cvt; + + for(int row = 0; row < rgb.rows; ++row) + { + for(int col = 0; col < rgb.cols; col+=2) + { + yuvWriter->write(yuv, row, col, cvt.convert(rgbReader->read(rgb, row, col), rgbReader->read(rgb, row, col+1), 0)); + yuvWriter->write(yuv, row, col+1, cvt.convert(rgbReader->read(rgb, row, col), rgbReader->read(rgb, row, col+1), 1)); + } + } +} + struct ConversionYUV { explicit ConversionYUV( const int code 
) @@ -611,6 +719,28 @@ struct ConversionYUV GRAYwriter* grayWriter_; }; +bool is_rgb2yuv422(int code) +{ + switch (code) + { + case COLOR_RGB2YUV_UYVY: + case COLOR_BGR2YUV_UYVY: + case COLOR_RGBA2YUV_UYVY: + case COLOR_BGRA2YUV_UYVY: + case COLOR_RGB2YUV_YUY2: + case COLOR_BGR2YUV_YUY2: + case COLOR_RGBA2YUV_YUY2: + case COLOR_BGRA2YUV_YUY2: + case COLOR_RGB2YUV_YVYU: + case COLOR_BGR2YUV_YVYU: + case COLOR_RGBA2YUV_YVYU: + case COLOR_BGRA2YUV_YVYU: + return true; + default: + return false; + } +} + CV_ENUM(YUVCVTS, COLOR_YUV2RGB_NV12, COLOR_YUV2BGR_NV12, COLOR_YUV2RGB_NV21, COLOR_YUV2BGR_NV21, COLOR_YUV2RGBA_NV12, COLOR_YUV2BGRA_NV12, COLOR_YUV2RGBA_NV21, COLOR_YUV2BGRA_NV21, COLOR_YUV2RGB_YV12, COLOR_YUV2BGR_YV12, COLOR_YUV2RGB_IYUV, COLOR_YUV2BGR_IYUV, @@ -620,13 +750,18 @@ CV_ENUM(YUVCVTS, COLOR_YUV2RGB_NV12, COLOR_YUV2BGR_NV12, COLOR_YUV2RGB_NV21, COL COLOR_YUV2RGBA_YUY2, COLOR_YUV2BGRA_YUY2, COLOR_YUV2RGBA_YVYU, COLOR_YUV2BGRA_YVYU, COLOR_YUV2GRAY_420, COLOR_YUV2GRAY_UYVY, COLOR_YUV2GRAY_YUY2, COLOR_YUV2BGR, COLOR_YUV2RGB, COLOR_RGB2YUV_YV12, COLOR_BGR2YUV_YV12, COLOR_RGBA2YUV_YV12, - COLOR_BGRA2YUV_YV12, COLOR_RGB2YUV_I420, COLOR_BGR2YUV_I420, COLOR_RGBA2YUV_I420, COLOR_BGRA2YUV_I420) + COLOR_BGRA2YUV_YV12, COLOR_RGB2YUV_I420, COLOR_BGR2YUV_I420, COLOR_RGBA2YUV_I420, COLOR_BGRA2YUV_I420, + COLOR_RGB2YUV_UYVY, COLOR_BGR2YUV_UYVY, COLOR_RGBA2YUV_UYVY, COLOR_BGRA2YUV_UYVY, + COLOR_RGB2YUV_YUY2, COLOR_BGR2YUV_YUY2, COLOR_RGB2YUV_YVYU, COLOR_BGR2YUV_YVYU, + COLOR_RGBA2YUV_YUY2, COLOR_BGRA2YUV_YUY2, COLOR_RGBA2YUV_YVYU, COLOR_BGRA2YUV_YVYU) typedef ::testing::TestWithParam<int> Imgproc_ColorYUV; TEST_P(Imgproc_ColorYUV, accuracy) { int code = GetParam(); + bool yuv422 = is_rgb2yuv422(code); + RNG& random = theRNG(); ConversionYUV cvt(code); @@ -654,7 +789,12 @@ TEST_P(Imgproc_ColorYUV, accuracy) else if(cvt.grayWriter_) referenceYUV2GRAY(src, gold, cvt.yuvReader_, cvt.grayWriter_); else if(cvt.yuvWriter_) - referenceRGB2YUV<RGB2YUV_Converter>(src, gold, cvt.rgbReader_, cvt.yuvWriter_); + { + if(!yuv422) + referenceRGB2YUV<RGB2YUV_Converter>(src, gold, cvt.rgbReader_, cvt.yuvWriter_); + else + referenceRGB2YUV422<RGB2YUV422_Converter>(src, gold, cvt.rgbReader_, cvt.yuvWriter_); + } cv::cvtColor(src, dst, code, -1); @@ -665,6 +805,8 @@ TEST_P(Imgproc_ColorYUV, accuracy) TEST_P(Imgproc_ColorYUV, roi_accuracy) { int code = GetParam(); + bool yuv422 = is_rgb2yuv422(code); + RNG& random = theRNG(); ConversionYUV cvt(code); @@ -701,7 +843,12 @@ TEST_P(Imgproc_ColorYUV, roi_accuracy) else if(cvt.grayWriter_) referenceYUV2GRAY(src, gold, cvt.yuvReader_, cvt.grayWriter_); else if(cvt.yuvWriter_) - referenceRGB2YUV<RGB2YUV_Converter>(src, gold, cvt.rgbReader_, cvt.yuvWriter_); + { + if(!yuv422) + referenceRGB2YUV<RGB2YUV_Converter>(src, gold, cvt.rgbReader_, cvt.yuvWriter_); + else + referenceRGB2YUV422<RGB2YUV422_Converter>(src, gold, cvt.rgbReader_, cvt.yuvWriter_); + } cv::cvtColor(src, dst, code, -1); @@ -722,7 +869,11 @@ INSTANTIATE_TEST_CASE_P(cvt422, Imgproc_ColorYUV, ::testing::Values((int)COLOR_YUV2RGB_UYVY, (int)COLOR_YUV2BGR_UYVY, (int)COLOR_YUV2RGBA_UYVY, (int)COLOR_YUV2BGRA_UYVY, (int)COLOR_YUV2RGB_YUY2, (int)COLOR_YUV2BGR_YUY2, (int)COLOR_YUV2RGB_YVYU, (int)COLOR_YUV2BGR_YVYU, (int)COLOR_YUV2RGBA_YUY2, (int)COLOR_YUV2BGRA_YUY2, (int)COLOR_YUV2RGBA_YVYU, (int)COLOR_YUV2BGRA_YVYU, - (int)COLOR_YUV2GRAY_UYVY, (int)COLOR_YUV2GRAY_YUY2)); + (int)COLOR_YUV2GRAY_UYVY, (int)COLOR_YUV2GRAY_YUY2, + (int)COLOR_RGB2YUV_UYVY, (int)COLOR_BGR2YUV_UYVY, (int)COLOR_RGBA2YUV_UYVY, (int)COLOR_BGRA2YUV_UYVY, + (int)COLOR_RGB2YUV_YUY2, (int)COLOR_BGR2YUV_YUY2, (int)COLOR_RGB2YUV_YVYU, (int)COLOR_BGR2YUV_YVYU, 
(int)COLOR_RGBA2YUV_YUY2, (int)COLOR_BGRA2YUV_YUY2, (int)COLOR_RGBA2YUV_YVYU, (int)COLOR_BGRA2YUV_YVYU, + (int)COLOR_RGB2YUV_YUY2)); } diff --git a/modules/imgproc/test/test_distancetransform.cpp b/modules/imgproc/test/test_distancetransform.cpp index b9d480e524..c6d6b827b4 100644 --- a/modules/imgproc/test/test_distancetransform.cpp +++ b/modules/imgproc/test/test_distancetransform.cpp @@ -40,6 +40,7 @@ //M*/ #include "test_precomp.hpp" +#include <numeric> namespace opencv_test { namespace { @@ -128,4 +129,51 @@ BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_5x5_labels) EXPECT_EQ(nz, 0); } +TEST(Imgproc_DistanceTransform, max_distance_3x3) +{ + Mat src = Mat::ones(1, 70000, CV_8U), dist; + src.at<uchar>(0, 0) = 0; + distanceTransform(src, dist, DIST_L2, DIST_MASK_3); + + double minVal, maxVal; + minMaxLoc(dist, &minVal, &maxVal); + EXPECT_GE(maxVal, 65533); +} + +TEST(Imgproc_DistanceTransform, max_distance_5x5) +{ + Mat src = Mat::ones(1, 70000, CV_8U), dist; + src.at<uchar>(0, 0) = 0; + distanceTransform(src, dist, DIST_L2, DIST_MASK_5); + + double minVal, maxVal; + minMaxLoc(dist, &minVal, &maxVal); + EXPECT_GE(maxVal, 65533); +} + +TEST(Imgproc_DistanceTransform, max_distance_5x5_labels) +{ + Mat src = Mat::ones(1, 70000, CV_8U), dist, labels; + src.at<uchar>(0, 0) = 0; + distanceTransform(src, dist, labels, DIST_L2, DIST_MASK_5); + + double minVal, maxVal; + minMaxLoc(dist, &minVal, &maxVal); + EXPECT_GE(maxVal, 65533); +} + +TEST(Imgproc_DistanceTransform, precise_long_dist) +{ + static const int maxDist = 1 << 16; + Mat src = Mat::ones(1, 70000, CV_8U), dist; + src.at<uchar>(0, 0) = 0; + distanceTransform(src, dist, DIST_L2, DIST_MASK_PRECISE, CV_32F); + + Mat expected(src.size(), CV_32F); + std::iota(expected.begin<float>(), expected.end<float>(), 0.f); + expected.colRange(maxDist, expected.cols).setTo(maxDist); + + EXPECT_EQ(cv::norm(expected, dist, NORM_INF), 0); +} + }} // namespace diff --git a/modules/imgproc/test/test_houghlines.cpp b/modules/imgproc/test/test_houghlines.cpp index 003420ae65..2d784d7a7a 100644 --- a/modules/imgproc/test/test_houghlines.cpp +++ b/modules/imgproc/test/test_houghlines.cpp @@ -53,7 +53,7 @@ struct SimilarWith T value; float theta_eps; float rho_eps; - SimilarWith(T val, float e, float r_e): value(val), theta_eps(e), rho_eps(r_e) { }; + SimilarWith(T val, float e, float r_e): value(val), theta_eps(e), rho_eps(r_e) { } bool operator()(const T& other); }; diff --git a/modules/imgproc/test/test_pc.cpp b/modules/imgproc/test/test_pc.cpp index 173866ac58..9e2c7f5b82 100644 --- a/modules/imgproc/test/test_pc.cpp +++ b/modules/imgproc/test/test_pc.cpp @@ -301,7 +301,7 @@ static std::pair<double, double> divide_complex_numbers( const double nu_re, con const double result_re = nu_re * de_re + nu_im * de_im; const double result_im = nu_re * (-de_im) + nu_im * de_re; return std::pair<double, double>(result_re / result_de, result_im / result_de); -}; +} /// Helper function to divide a DFT in src1 by a DFT in src2 with depths depth_t. The DFTs are complex matrices. 
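An editorial aside on the helper shown above (not from the patch): divide_complex_numbers applies the standard conjugate identity (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2), with result_de playing the role of the denominator c^2+d^2. A minimal self-contained sketch, with a hypothetical divide() helper:

#include <cassert>
#include <utility>

// (a+bi)/(c+di): multiply numerator and denominator by the conjugate (c-di).
static std::pair<double, double> divide(double a, double b, double c, double d) {
    const double den = c * c + d * d;
    return std::pair<double, double>((a * c + b * d) / den, (b * c - a * d) / den);
}

int main() {
    // (4+2i)/(1+i) = (6-2i)/2 = 3 - i
    const std::pair<double, double> q = divide(4.0, 2.0, 1.0, 1.0);
    assert(q.first == 3.0 && q.second == -1.0);
    return 0;
}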
diff --git a/modules/imgproc/test/test_subdivision2d.cpp b/modules/imgproc/test/test_subdivision2d.cpp index 0a366206b3..17549b6b15 100644 --- a/modules/imgproc/test/test_subdivision2d.cpp +++ b/modules/imgproc/test/test_subdivision2d.cpp @@ -50,4 +50,4 @@ TEST(Imgproc_Subdiv2D_getTriangleList, regression_5788) EXPECT_EQ(trig_cnt, 105); } -}}; +}} diff --git a/modules/java/test/pure_test/build.xml b/modules/java/test/pure_test/build.xml index e596c82e9d..fa1e7c3ab9 100644 --- a/modules/java/test/pure_test/build.xml +++ b/modules/java/test/pure_test/build.xml @@ -42,7 +42,7 @@ - + @@ -65,6 +65,7 @@ + diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp index 6c3db22b72..40c18c03ea 100644 --- a/modules/ml/src/svm.cpp +++ b/modules/ml/src/svm.cpp @@ -638,9 +638,6 @@ public: #undef is_lower_bound #define is_lower_bound(i) (alpha_status[i] < 0) - #undef is_free - #define is_free(i) (alpha_status[i] == 0) - #undef get_C #define get_C(i) (C[y[i]>0]) @@ -648,9 +645,6 @@ public: #define update_alpha_status(i) \ alpha_status[i] = (schar)(alpha[i] >= get_C(i) ? 1 : alpha[i] <= 0 ? -1 : 0) - #undef reconstruct_gradient - #define reconstruct_gradient() /* empty for now */ - bool solve_generic( SolutionInfo& si ) { const schar* y = &y_vec[0]; diff --git a/modules/ml/test/test_mltests.cpp b/modules/ml/test/test_mltests.cpp index c7353057d3..a67f6b0bf2 100644 --- a/modules/ml/test/test_mltests.cpp +++ b/modules/ml/test/test_mltests.cpp @@ -55,7 +55,7 @@ static DatasetDesc & getDataset(const string & name) // interfaces and templates -template string modelName() { return "Unknown"; }; +template string modelName() { return "Unknown"; } template Ptr tuneModel(const DatasetDesc &, Ptr m) { return m; } struct IModelFactory diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp index 6e1e22953a..29e789f871 100644 --- a/modules/objdetect/include/opencv2/objdetect.hpp +++ b/modules/objdetect/include/opencv2/objdetect.hpp @@ -721,19 +721,21 @@ public: ECI_UTF8 = 26 }; - /** @brief QR code encoder parameters. - @param version The optional version of QR code (by default - maximum possible depending on - the length of the string). - @param correction_level The optional level of error correction (by default - the lowest). - @param mode The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append. - @param structure_number The optional number of QR codes to generate in Structured Append mode. - */ + /** @brief QR code encoder parameters. */ struct CV_EXPORTS_W_SIMPLE Params { CV_WRAP Params(); + + //! The optional version of QR code (by default - maximum possible depending on the length of the string). CV_PROP_RW int version; + + //! The optional level of error correction (by default - the lowest). CV_PROP_RW CorrectionLevel correction_level; + + //! The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append. CV_PROP_RW EncodeMode mode; + + //! The optional number of QR codes to generate in Structured Append mode. 
CV_PROP_RW int structure_number; }; diff --git a/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp b/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp index f885a2af87..c51abb0c93 100644 --- a/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp +++ b/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp @@ -36,6 +36,7 @@ struct CV_EXPORTS_W_SIMPLE DetectorParameters { minMarkerDistanceRate = 0.05; cornerRefinementMethod = (int)CORNER_REFINE_NONE; cornerRefinementWinSize = 5; + relativeCornerRefinmentWinSize = 0.3f; cornerRefinementMaxIterations = 30; cornerRefinementMinAccuracy = 0.1; markerBorderBits = 1; @@ -56,7 +57,7 @@ struct CV_EXPORTS_W_SIMPLE DetectorParameters { useAruco3Detection = false; minSideLengthCanonicalImg = 32; minMarkerLengthRatioOriginalImg = 0.0; - }; + } /** @brief Read a new set of DetectorParameters from FileNode (use FileStorage.root()). */ @@ -108,9 +109,27 @@ struct CV_EXPORTS_W_SIMPLE DetectorParameters { /** @brief default value CORNER_REFINE_NONE */ CV_PROP_RW int cornerRefinementMethod; - /// window size for the corner refinement process (in pixels) (default 5). + /** @brief maximum window size for the corner refinement process (in pixels) (default 5). + * + * The window size may decrease if the ArUco marker is too small, check relativeCornerRefinmentWinSize. + * The final window size is calculated as: + * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize), + * where averageArucoModuleSize is average module size of ArUco marker in pixels. + * (ArUco marker is composed of black and white modules) + */ CV_PROP_RW int cornerRefinementWinSize; + /** @brief Dynamic window size for corner refinement relative to Aruco module size (default 0.3). + * + * The final window size is calculated as: + * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize), + * where averageArucoModuleSize is average module size of ArUco marker in pixels. + * (ArUco marker is composed of black and white modules) + * In the case of markers located far from each other, it may be useful to increase the value of the parameter to 0.4-0.5. + * In the case of markers located close to each other, it may be useful to decrease the parameter value to 0.1-0.2. + */ + CV_PROP_RW float relativeCornerRefinmentWinSize; + /// maximum number of iterations for stop criteria of the corner refinement process (default 30). 
CV_PROP_RW int cornerRefinementMaxIterations; diff --git a/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp b/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp index fb96c668a5..8050278b42 100644 --- a/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp +++ b/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp @@ -192,7 +192,7 @@ class CV_EXPORTS DetectionBasedTracker { lastPositions.push_back(rect); id=getNextId(); - }; + } static int getNextId() { diff --git a/modules/objdetect/include/opencv2/objdetect/face.hpp b/modules/objdetect/include/opencv2/objdetect/face.hpp index a8e98c4012..d8e96b5dfd 100644 --- a/modules/objdetect/include/opencv2/objdetect/face.hpp +++ b/modules/objdetect/include/opencv2/objdetect/face.hpp @@ -20,7 +20,7 @@ model download link: https://github.com/opencv/opencv_zoo/tree/master/models/fac class CV_EXPORTS_W FaceDetectorYN { public: - virtual ~FaceDetectorYN() {}; + virtual ~FaceDetectorYN() {} /** @brief Set the size for the network input, which overwrites the input size of creating model. Call this method when the size of input image does not match the input size when creating model * @@ -99,7 +99,7 @@ model download link: https://github.com/opencv/opencv_zoo/tree/master/models/fac class CV_EXPORTS_W FaceRecognizerSF { public: - virtual ~FaceRecognizerSF() {}; + virtual ~FaceRecognizerSF() {} /** @brief Definition of distance used for calculating the distance between two face features */ diff --git a/modules/objdetect/src/aruco/aruco_board.cpp b/modules/objdetect/src/aruco/aruco_board.cpp index cf45a96450..3d4217e02a 100644 --- a/modules/objdetect/src/aruco/aruco_board.cpp +++ b/modules/objdetect/src/aruco/aruco_board.cpp @@ -3,6 +3,7 @@ // of this distribution and at http://opencv.org/license.html #include "../precomp.hpp" +#include #include "opencv2/objdetect/aruco_board.hpp" #include @@ -250,7 +251,11 @@ GridBoard::GridBoard() {} GridBoard::GridBoard(const Size& size, float markerLength, float markerSeparation, const Dictionary &dictionary, InputArray ids): Board(new GridBoardImpl(dictionary, size, markerLength, markerSeparation)) { - + float onePin = markerLength / ((float)(dictionary.markerSize+2)); + if (markerSeparation < onePin*.7f) { + CV_LOG_WARNING(NULL, "Marker border " << markerSeparation << " is less than 70% of ArUco pin size " + << onePin << ". Please increase markerSeparation or decrease markerLength for stable board detection"); + } size_t totalMarkers = (size_t) size.width*size.height; CV_Assert(ids.empty() || totalMarkers == ids.total()); vector > objPoints; @@ -541,7 +546,12 @@ CharucoBoard::CharucoBoard(const Size& size, float squareLength, float markerLen Board(new CharucoBoardImpl(dictionary, size, squareLength, markerLength)) { CV_Assert(size.width > 1 && size.height > 1 && markerLength > 0 && squareLength > markerLength); - + float onePin = markerLength / ((float)(dictionary.markerSize+2)); + float markerSeparation = (squareLength - markerLength)/2.f; + if (markerSeparation < onePin*.7f) { + CV_LOG_WARNING(NULL, "Marker border " << markerSeparation << " is less than 70% of ArUco pin size " + << onePin <<". 
Please increase markerSeparation or decrease markerLength for stable board detection"); + } ids.copyTo(impl->ids); static_pointer_cast<CharucoBoardImpl>(impl)->createCharucoBoard(); diff --git a/modules/objdetect/src/aruco/aruco_detector.cpp b/modules/objdetect/src/aruco/aruco_detector.cpp index 73643177ee..8faa92a3da 100644 --- a/modules/objdetect/src/aruco/aruco_detector.cpp +++ b/modules/objdetect/src/aruco/aruco_detector.cpp @@ -35,6 +35,8 @@ static inline bool readWrite(DetectorParameters &params, const FileNode* readNod check |= readWriteParameter("minMarkerDistanceRate", params.minMarkerDistanceRate, readNode, writeStorage); check |= readWriteParameter("cornerRefinementMethod", params.cornerRefinementMethod, readNode, writeStorage); check |= readWriteParameter("cornerRefinementWinSize", params.cornerRefinementWinSize, readNode, writeStorage); + check |= readWriteParameter("relativeCornerRefinmentWinSize", params.relativeCornerRefinmentWinSize, readNode, + writeStorage); check |= readWriteParameter("cornerRefinementMaxIterations", params.cornerRefinementMaxIterations, readNode, writeStorage); check |= readWriteParameter("cornerRefinementMinAccuracy", params.cornerRefinementMinAccuracy, @@ -692,7 +694,7 @@ static void _identifyCandidates(InputArray grey, /** * Line fitting A * B = C :: Called from function refineCandidateLines - * @param nContours, contour-container + * @param nContours contour-container */ static Point3f _interpolate2Dline(const vector<Point2f>& nContours){ CV_Assert(nContours.size() >= 2); @@ -748,10 +750,8 @@ static Point2f _getCrossPoint(Point3f nLine1, Point3f nLine2){ /** * Refine Corners using the contour vector :: Called from function detectMarkers - * @param nContours, contour-container - * @param nCorners, candidate Corners - * @param camMatrix, cameraMatrix input 3x3 floating-point camera matrix - * @param distCoeff, distCoeffs vector of distortion coefficient + * @param nContours contour-container + * @param nCorners candidate Corners */ static void _refineCandidateLines(vector<Point>& nContours, vector<Point2f>& nCorners){ vector<Point2f> contour2f(nContours.begin(), nContours.end()); @@ -847,6 +847,16 @@ struct ArucoDetector::ArucoDetectorImpl { const RefineParameters& _refineParams): dictionary(_dictionary), detectorParams(_detectorParams), refineParams(_refineParams) {} + float getAverageArucoPinSize(vector<Point2f> markerCorners) { + float averageArucoModuleSize = 0.f; + int numPins = dictionary.markerSize + detectorParams.markerBorderBits * 2; + for (size_t i = 0ull; i < markerCorners.size(); i++) { + averageArucoModuleSize += sqrt(normL2Sqr<float>(Point2f(markerCorners[i] - markerCorners[(i+1ull)%markerCorners.size()]))); + } + averageArucoModuleSize /= ((float)markerCorners.size()*numPins); + return averageArucoModuleSize; +} + }; ArucoDetector::ArucoDetector(const Dictionary &_dictionary, @@ -951,13 +961,15 @@ void ArucoDetector::detectMarkers(InputArray _image, OutputArrayOfArrays _corner const float scale_init = (float) grey_pyramid[closest_pyr_image_idx].cols / grey.cols; findCornerInPyrImage(scale_init, closest_pyr_image_idx, grey_pyramid, Mat(candidates[i]), detectorParams); } - else - cornerSubPix(grey, Mat(candidates[i]), - Size(detectorParams.cornerRefinementWinSize, detectorParams.cornerRefinementWinSize), - Size(-1, -1), - TermCriteria(TermCriteria::MAX_ITER | TermCriteria::EPS, - detectorParams.cornerRefinementMaxIterations, - detectorParams.cornerRefinementMinAccuracy)); + else { + int cornerRefinementWinSize = std::max(1, cvRound(detectorParams.relativeCornerRefinmentWinSize* 
arucoDetectorImpl->getAverageArucoPinSize(candidates[i]))); + cornerRefinementWinSize = min(cornerRefinementWinSize, detectorParams.cornerRefinementWinSize); + cornerSubPix(grey, Mat(candidates[i]), Size(cornerRefinementWinSize, cornerRefinementWinSize), Size(-1, -1), + TermCriteria(TermCriteria::MAX_ITER | TermCriteria::EPS, + detectorParams.cornerRefinementMaxIterations, + detectorParams.cornerRefinementMinAccuracy)); + } } }); } @@ -1223,8 +1235,13 @@ void ArucoDetector::refineDetectedMarkers(InputArray _image, const Board& _board CV_Assert(detectorParams.cornerRefinementWinSize > 0 && detectorParams.cornerRefinementMaxIterations > 0 && detectorParams.cornerRefinementMinAccuracy > 0); + + std::vector<Point2f> marker(closestRotatedMarker.begin(), closestRotatedMarker.end()); + int cornerRefinementWinSize = std::max(1, cvRound(detectorParams.relativeCornerRefinmentWinSize* + arucoDetectorImpl->getAverageArucoPinSize(marker))); + cornerRefinementWinSize = min(cornerRefinementWinSize, detectorParams.cornerRefinementWinSize); cornerSubPix(grey, closestRotatedMarker, - Size(detectorParams.cornerRefinementWinSize, detectorParams.cornerRefinementWinSize), + Size(cornerRefinementWinSize, cornerRefinementWinSize), Size(-1, -1), TermCriteria(TermCriteria::MAX_ITER | TermCriteria::EPS, detectorParams.cornerRefinementMaxIterations, detectorParams.cornerRefinementMinAccuracy)); diff --git a/modules/objdetect/src/aruco/charuco_detector.cpp b/modules/objdetect/src/aruco/charuco_detector.cpp index a7b17c4798..d5b4087a2c 100644 --- a/modules/objdetect/src/aruco/charuco_detector.cpp +++ b/modules/objdetect/src/aruco/charuco_detector.cpp @@ -27,13 +27,12 @@ struct CharucoDetector::CharucoDetectorImpl { bool checkBoard(InputArrayOfArrays markerCorners, InputArray markerIds, InputArray charucoCorners, InputArray charucoIds) { vector<Mat> mCorners; markerCorners.getMatVector(mCorners); - const Mat& mIds = markerIds.getMat(); - - const Mat& chCorners = charucoCorners.getMat(); - const Mat& chIds = charucoIds.getMat(); + const Mat mIds = markerIds.getMat(); + const Mat chCorners = charucoCorners.getMat(); + const Mat chIds = charucoIds.getMat(); const vector<int>& boardIds = board.getIds(); - const vector<vector<int> >& nearestMarkerIdx = board.getNearestMarkerIdx(); + const vector<vector<int> > nearestMarkerIdx = board.getNearestMarkerIdx(); vector<Point2f> distance(board.getNearestMarkerIdx().size(), Point2f(0.f, std::numeric_limits<float>::max())); // distance[i].x: max distance from the i-th charuco corner to charuco corner-forming markers. 
// The two charuco corner-forming markers of i-th charuco corner are defined in getNearestMarkerIdx()[i] diff --git a/modules/objdetect/src/face_recognize.cpp b/modules/objdetect/src/face_recognize.cpp index 497303e42b..8183573ce9 100644 --- a/modules/objdetect/src/face_recognize.cpp +++ b/modules/objdetect/src/face_recognize.cpp @@ -25,7 +25,7 @@ public: net.setPreferableBackend(backend_id); net.setPreferableTarget(target_id); - }; + } void alignCrop(InputArray _src_img, InputArray _face_mat, OutputArray _aligned_img) const override { Mat face_mat = _face_mat.getMat(); @@ -39,13 +39,13 @@ public: } Mat warp_mat = getSimilarityTransformMatrix(src_point); warpAffine(_src_img, _aligned_img, warp_mat, Size(112, 112), INTER_LINEAR); - }; + } void feature(InputArray _aligned_img, OutputArray _face_feature) override { Mat inputBolb = dnn::blobFromImage(_aligned_img, 1, Size(112, 112), Scalar(0, 0, 0), true, false); net.setInput(inputBolb); net.forward(_face_feature); - }; + } double match(InputArray _face_feature1, InputArray _face_feature2, int dis_type) const override { Mat face_feature1 = _face_feature1.getMat(), face_feature2 = _face_feature2.getMat(); @@ -60,7 +60,7 @@ public: throw std::invalid_argument("invalid parameter " + std::to_string(dis_type)); } - }; + } private: Mat getSimilarityTransformMatrix(float src[5][2]) const { diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index adba841d97..17e97f7cac 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -2728,6 +2728,58 @@ bool QRDecode::samplingForVersion() return true; } + +static bool checkASCIIcompatible(const uint8_t* str, const size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t byte = str[i]; + if (byte >= 0x80) + return false; + } + return true; +} + +static bool checkUTF8(const uint8_t* str, const size_t size) { + for (size_t i = 0; i < size; ++i) { + uint8_t byte = str[i]; + if (byte >= 0x80) { + // Check that symbol is encoded correctly. 
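+            // (Editorial illustration, not in the original patch: a UTF-8 leading byte
+            //  encodes the sequence length in its top bits: 110xxxxx -> 2 bytes,
+            //  1110xxxx -> 3 bytes, 11110xxx -> 4 bytes, and each continuation byte
+            //  matches 10xxxxxx. For example, U+00FC (u-umlaut) is 0xC3 0xBC:
+            //  0xC3 & 0xE0 == 0xC0 selects the 2-byte branch below, and
+            //  0xBC & 0xC0 == 0x80 passes the continuation check.)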
+ + // Count number of bytes per symbol as a number of leading non-zero bits + uint8_t numBytesPerSymbol; + if ((byte & 0xe0) == 0xc0) + numBytesPerSymbol = 2; + else if ((byte & 0xf0) == 0xe0) + numBytesPerSymbol = 3; + else if ((byte & 0xf8) == 0xf0) + numBytesPerSymbol = 4; + else + return false; + + for (size_t j = 1; j < numBytesPerSymbol; ++j) { + if (i + j >= size || (str[i + j] & 0xc0) != 0x80) { + return false; + } + } + i += numBytesPerSymbol - 1; + } + } + return true; +} + +static std::string encodeUTF8_bytesarray(const uint8_t* str, const size_t size) { + std::ostringstream res; + for (size_t i = 0; i < size; ++i) { + uint8_t byte = str[i]; + if (byte >= 0x80) { + res << (char)(0xc0 | (byte >> 6)); + res << (char)(0x80 | (byte & 0x3f)); + } else { + res << (char)byte; + } + } + return res.str(); +} + bool QRDecode::decodingProcess() { #ifdef HAVE_QUIRC @@ -2757,11 +2809,58 @@ bool QRDecode::decodingProcess() if (errorCode != 0) { return false; } - for (int i = 0; i < qr_code_data.payload_len; i++) + CV_LOG_INFO(NULL, "QR: decoded with .version=" << qr_code_data.version << " .data_type=" << qr_code_data.data_type << " .eci=" << qr_code_data.eci << " .payload_len=" << qr_code_data.payload_len) + + switch (qr_code_data.data_type) { - result_info += qr_code_data.payload[i]; + case QUIRC_DATA_TYPE_NUMERIC: + if (!checkASCIIcompatible(qr_code_data.payload, qr_code_data.payload_len)) { + CV_LOG_INFO(NULL, "QR: DATA_TYPE_NUMERIC payload must be ASCII compatible string"); + return false; + } + result_info.assign((const char*)qr_code_data.payload, qr_code_data.payload_len); + return true; + case QUIRC_DATA_TYPE_ALPHA: + if (!checkASCIIcompatible(qr_code_data.payload, qr_code_data.payload_len)) { + CV_LOG_INFO(NULL, "QR: DATA_TYPE_ALPHA payload must be ASCII compatible string"); + return false; + } + result_info.assign((const char*)qr_code_data.payload, qr_code_data.payload_len); + return true; + case QUIRC_DATA_TYPE_BYTE: + // https://en.wikipedia.org/wiki/Extended_Channel_Interpretation + if (qr_code_data.eci == QUIRC_ECI_UTF_8) { + CV_LOG_INFO(NULL, "QR: payload ECI is UTF-8"); + if (!checkUTF8(qr_code_data.payload, qr_code_data.payload_len)) { + CV_LOG_INFO(NULL, "QUIRC_DATA_TYPE_BYTE with UTF-8 ECI must be UTF-8 compatible string"); + return false; + } + result_info.assign((const char*)qr_code_data.payload, qr_code_data.payload_len); + } else if (qr_code_data.eci == 25/*ECI_UTF_16BE*/) { + CV_LOG_INFO(NULL, "QR: UTF-16BE ECI is not supported"); + return false; + } else if (checkASCIIcompatible(qr_code_data.payload, qr_code_data.payload_len)) { + CV_LOG_INFO(NULL, "QR: payload is ASCII compatible (special handling for symbols encoding is not needed)"); + result_info.assign((const char*)qr_code_data.payload, qr_code_data.payload_len); + } else { + if (checkUTF8(qr_code_data.payload, qr_code_data.payload_len)) { + CV_LOG_INFO(NULL, "QR: payload QUIRC_DATA_TYPE_BYTE is UTF-8 compatible, return as-is"); + result_info.assign((const char*)qr_code_data.payload, qr_code_data.payload_len); + } else { + CV_LOG_INFO(NULL, "QR: assume 1-byte per symbol encoding"); + result_info = encodeUTF8_bytesarray(qr_code_data.payload, qr_code_data.payload_len); + } + } + return true; + case QUIRC_DATA_TYPE_KANJI: + // FIXIT BUG: we must return UTF-8 compatible string + CV_LOG_WARNING(NULL, "QR: Kanji is not supported properly"); + result_info.assign((const char*)qr_code_data.payload, qr_code_data.payload_len); + return true; } - return true; + + CV_LOG_WARNING(NULL, "QR: unsupported QR data type"); + return 
false; #else return false; #endif diff --git a/modules/objdetect/src/qrcode_encoder.cpp b/modules/objdetect/src/qrcode_encoder.cpp index 24a9548899..4ab1e1ac40 100644 --- a/modules/objdetect/src/qrcode_encoder.cpp +++ b/modules/objdetect/src/qrcode_encoder.cpp @@ -703,7 +703,7 @@ bool QRCodeEncoderImpl::stringToBits(const std::string& input_info) default: return encodeAuto(input_info, payload); } -}; +} void QRCodeEncoderImpl::eccGenerate(vector > &data_blocks, vector > &ecc_blocks) { diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp index 9b7d8ceda4..89954c993a 100644 --- a/modules/objdetect/test/test_qrcode.cpp +++ b/modules/objdetect/test/test_qrcode.cpp @@ -637,4 +637,25 @@ TEST_P(Objdetect_QRCode_detectAndDecodeMulti, decode_9_qrcodes_version7) #endif // UPDATE_QRCODE_TEST_DATA +TEST(Objdetect_QRCode_detectAndDecode, utf8_output) +{ +#ifndef HAVE_QUIRC + throw SkipTestException("Quirc is required for decoding"); +#else + const std::string name_current_image = "umlaut.png"; + const std::string root = "qrcode/"; + + std::string image_path = findDataFile(root + name_current_image); + Mat src = imread(image_path); + ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; + + QRCodeDetector qrcode; + std::vector corners; + Mat straight; + std::string decoded_info = qrcode.detectAndDecode(src, corners, straight); + EXPECT_FALSE(decoded_info.empty()); + EXPECT_NE(decoded_info.find("M\xc3\xbcllheimstrasse"), std::string::npos); +#endif // HAVE_QUIRC +} + }} // namespace diff --git a/modules/photo/include/opencv2/photo/cuda.hpp b/modules/photo/include/opencv2/photo/cuda.hpp index b6ab40a764..709ad2d26f 100644 --- a/modules/photo/include/opencv2/photo/cuda.hpp +++ b/modules/photo/include/opencv2/photo/cuda.hpp @@ -78,7 +78,7 @@ CV_WRAP inline void nonLocalMeans(const GpuMat& src, CV_OUT GpuMat& dst, Stream& stream = Stream::Null()) { nonLocalMeans(InputArray(src), OutputArray(dst), h, search_window, block_size, borderMode, stream); -}; +} /** @brief Perform image denoising using Non-local Means Denoising algorithm with several computational diff --git a/modules/photo/src/denoise_tvl1.cpp b/modules/photo/src/denoise_tvl1.cpp index df756c4c85..5f49de1421 100644 --- a/modules/photo/src/denoise_tvl1.cpp +++ b/modules/photo/src/denoise_tvl1.cpp @@ -42,8 +42,6 @@ #include #include -#define ABSCLIP(val,threshold) MIN(MAX((val),-(threshold)),(threshold)) - namespace cv{ class AddFloatToCharScaled{ diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp index d36c85a840..9da5c0cf8c 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp @@ -174,7 +174,7 @@ public: static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) { return calcDist(a_down, b_down) - calcDist(a_up, b_up); - }; + } template static inline WT calcWeight(double dist, const float *h, @@ -296,7 +296,7 @@ public: static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) { return calcUpDownDist_::f(a_up, a_down, b_up, b_down); - }; + } template static inline WT calcWeight(double dist, const float *h, diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 3235966b23..fa3a3fb9f1 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -432,7 +432,7 @@ class ClassInfo(object): if self.constructor is not None: constructor_name = self.constructor.get_wrapper_name() 
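(Aside on the byte-mode fallback added in qrcode.cpp above: when the payload is neither valid UTF-8 nor plain ASCII, every byte >= 0x80 is widened into a two-byte UTF-8 sequence, i.e. the payload is treated as a 1-byte-per-symbol, Latin-1-style string. A minimal standalone sketch of that widening follows; `latin1ToUtf8` is a hypothetical name for illustration, not part of the patch.)

```cpp
// Hypothetical illustration (not part of the patch): widen Latin-1-style bytes
// into two-byte UTF-8 sequences, mirroring what encodeUTF8_bytesarray() does.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>

static std::string latin1ToUtf8(const uint8_t* str, std::size_t size)
{
    std::string res;
    for (std::size_t i = 0; i < size; ++i)
    {
        uint8_t byte = str[i];
        if (byte >= 0x80)
        {
            res += (char)(0xc0 | (byte >> 6));    // leading byte: 110xxxxx
            res += (char)(0x80 | (byte & 0x3f));  // continuation byte: 10xxxxxx
        }
        else
        {
            res += (char)byte;                    // ASCII passes through unchanged
        }
    }
    return res;
}

int main()
{
    const uint8_t latin1[] = { 'M', 0xfc, 'l', 'l' };  // "Müll" in Latin-1
    std::string utf8 = latin1ToUtf8(latin1, sizeof(latin1));
    std::printf("%zu UTF-8 bytes\n", utf8.size());     // prints: 5 UTF-8 bytes
    return 0;
}
```

The umlaut test above exercises exactly this mapping: the Latin-1 byte 0xfc (U+00FC) becomes the UTF-8 pair 0xc3 0xbc, which is why the expected string is "M\xc3\xbcllheimstrasse".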
-        return 'CVPY_TYPE({}, {}, {}, {}, {}, {}, "{}");\n'.format(
+        return 'CVPY_TYPE({}, {}, {}, {}, {}, {}, "{}")\n'.format(
             self.export_name,
             self.class_id,
             self.cname if self.issimple else "Ptr<{}>".format(self.cname),
@@ -1294,7 +1294,7 @@ class PythonWrapperGenerator(object):
         code = ""
         if re.sub(r"^cv\.", "", enum_name) != wname:
             code += "typedef {0} {1};\n".format(cname, wname)
-        code += "CV_PY_FROM_ENUM({0});\nCV_PY_TO_ENUM({0});\n\n".format(wname)
+        code += "CV_PY_FROM_ENUM({0})\nCV_PY_TO_ENUM({0})\n\n".format(wname)
         self.code_enums.write(code)
 
     def save(self, path, name, buf):
diff --git a/modules/python/src2/typing_stubs_generation/generation.py b/modules/python/src2/typing_stubs_generation/generation.py
index c5578bb3a8..563c09cc87 100644
--- a/modules/python/src2/typing_stubs_generation/generation.py
+++ b/modules/python/src2/typing_stubs_generation/generation.py
@@ -444,7 +444,7 @@ def _generate_function_stub(function_node: FunctionNode,
     elif function_node.is_static:
         decorators.append(" " * indent + "@staticmethod")
     if len(function_node.overloads) > 1:
-        decorators.append(" " * indent + "@typing.overload")
+        decorators.append(" " * indent + "@_typing.overload")
     function_module = get_enclosing_namespace(function_node)
     function_module_name = function_module.full_export_name
 
@@ -578,7 +578,7 @@ def _collect_required_imports(root: NamespaceNode) -> Collection[str]:
     for cls in for_each_class(root):
         if not has_overload and check_overload_presence(cls):
             has_overload = True
-            required_imports.add("import typing")
+            required_imports.add("import typing as _typing")
         # Add required imports for class properties
         for prop in cls.properties:
             _add_required_usage_imports(prop.type_node, required_imports)
@@ -593,7 +593,7 @@ def _collect_required_imports(root: NamespaceNode) -> Collection[str]:
             has_protocol = True
 
     if has_overload:
-        required_imports.add("import typing")
+        required_imports.add("import typing as _typing")
     # Importing modules required to resolve functions arguments
     for overload in for_each_function_overload(root):
         for arg in filter(lambda a: a.type_node is not None,
@@ -634,6 +634,8 @@ def _populate_reexported_symbols(root: NamespaceNode) -> None:
     _reexport_submodule(root)
 
+    root.reexported_submodules.append("typing")
+
     # Special cases, symbols defined in possible pure Python submodules
     # should be
     root.reexported_submodules_symbols["mat_wrapper"].append("Mat")
@@ -735,10 +737,10 @@ def _generate_typing_module(root: NamespaceNode, output_path: Path) -> None:
         )
         return ConditionalAliasTypeNode(
             enum_export_name,
-            "typing.TYPE_CHECKING",
+            "_typing.TYPE_CHECKING",
             positive_branch_type=enum_node_alias,
             negative_branch_type=PrimitiveTypeNode.int_(enum_export_name),
-            condition_required_imports=("import typing", )
+            condition_required_imports=("import typing as _typing", )
         )
 
     def register_alias(alias_node: AliasTypeNode) -> None:
diff --git a/modules/python/src2/typing_stubs_generation/nodes/type_node.py b/modules/python/src2/typing_stubs_generation/nodes/type_node.py
index 089ff2ee9d..3f242e730e 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/type_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/type_node.py
@@ -163,11 +163,11 @@ class AnyTypeNode(TypeNode):
     """
     @property
     def typename(self) -> str:
-        return "typing.Any"
+        return "_typing.Any"
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
 
 
 class PrimitiveTypeNode(TypeNode):
@@ -474,11 +474,11 @@ class ConditionalAliasTypeNode(TypeNode):
         """Type subscription is not possible in python 3.8 and older numpy versions."""
         return cls(
             ctype_name,
-            "typing.TYPE_CHECKING",
+            "_typing.TYPE_CHECKING",
             NDArrayTypeNode(ctype_name, shape, dtype),
             NDArrayTypeNode(ctype_name, shape, dtype, use_numpy_generics=False),
-            condition_required_imports=("import typing",)
+            condition_required_imports=("import typing as _typing",)
         )
 
 
@@ -499,14 +499,14 @@ class NDArrayTypeNode(TypeNode):
         if self._use_numpy_generics:
             # NOTE: Shape is not fully supported yet
             dtype = self.dtype if self.dtype is not None else "numpy.generic"
-            return f"numpy.ndarray[typing.Any, numpy.dtype[{dtype}]]"
+            return f"numpy.ndarray[_typing.Any, numpy.dtype[{dtype}]]"
         return "numpy.ndarray"
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
         yield "import numpy"
         # if self.shape is None:
-        yield "import typing"
+        yield "import typing as _typing"
 
 
 class ASTNodeTypeNode(TypeNode):
@@ -668,13 +668,13 @@ class ContainerTypeNode(AggregatedTypeNode):
 
     @property
     def required_definition_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
         yield from super().required_definition_imports
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
         if TypeNode.compatible_to_runtime_usage:
-            yield "import typing"
+            yield "import typing as _typing"
         yield from super().required_usage_imports
 
     @abc.abstractproperty
@@ -695,7 +695,7 @@ class SequenceTypeNode(ContainerTypeNode):
 
     @property
     def type_format(self) -> str:
-        return "typing.Sequence[{}]"
+        return "_typing.Sequence[{}]"
 
     @property
     def types_separator(self) -> str:
@@ -709,7 +709,7 @@ class TupleTypeNode(ContainerTypeNode):
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Tuple[{}]"
+            return "_typing.Tuple[{}]"
         return "tuple[{}]"
 
     @property
@@ -723,7 +723,7 @@ class UnionTypeNode(ContainerTypeNode):
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Union[{}]"
+            return "_typing.Union[{}]"
         return "{}"
 
     @property
@@ -743,7 +743,7 @@ class OptionalTypeNode(ContainerTypeNode):
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Optional[{}]"
+            return "_typing.Optional[{}]"
         return "{} | None"
 
     @property
@@ -769,7 +769,7 @@ class DictTypeNode(ContainerTypeNode):
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Dict[{}]"
+            return "_typing.Dict[{}]"
         return "dict[{}]"
 
     @property
@@ -810,32 +810,32 @@ class CallableTypeNode(AggregatedTypeNode):
 
     @property
     def typename(self) -> str:
-        return 'typing.Callable[[{}], {}]'.format(
+        return '_typing.Callable[[{}], {}]'.format(
             ', '.join(arg.typename for arg in self.arg_types),
             self.ret_type.typename
         )
 
     @property
     def full_typename(self) -> str:
-        return 'typing.Callable[[{}], {}]'.format(
+        return '_typing.Callable[[{}], {}]'.format(
             ', '.join(arg.full_typename for arg in self.arg_types),
             self.ret_type.full_typename
         )
 
     def relative_typename(self, module: str) -> str:
-        return 'typing.Callable[[{}], {}]'.format(
+        return '_typing.Callable[[{}], {}]'.format(
             ', '.join(arg.relative_typename(module) for arg in self.arg_types),
             self.ret_type.relative_typename(module)
         )
 
     @property
     def required_definition_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
         yield from super().required_definition_imports
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
         yield from super().required_usage_imports
 
@@ -847,7 +847,7 @@ class ClassTypeNode(ContainerTypeNode):
 
     @property
     def type_format(self) -> str:
-        return "typing.Type[{}]"
+        return "_typing.Type[{}]"
 
     @property
     def types_separator(self) -> str:
diff --git a/modules/stereo/perf/perf_stereosgbm.cpp b/modules/stereo/perf/perf_stereosgbm.cpp
index 8ae477748a..0ae3735565 100644
--- a/modules/stereo/perf/perf_stereosgbm.cpp
+++ b/modules/stereo/perf/perf_stereosgbm.cpp
@@ -43,7 +43,7 @@ using namespace testing;
 
 static void MakeArtificialExample(Mat& dst_left_view, Mat& dst_view);
 
-CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY, StereoSGBM::MODE_HH4);
+CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY, StereoSGBM::MODE_HH4)
 typedef tuple<Size, int, SGBMModes> SGBMParams;
 typedef TestBaseWithParam<SGBMParams> TestStereoCorrespSGBM;
diff --git a/modules/stereo/src/precomp.hpp b/modules/stereo/src/precomp.hpp
index a786466ae6..24f8f675b8 100644
--- a/modules/stereo/src/precomp.hpp
+++ b/modules/stereo/src/precomp.hpp
@@ -65,7 +65,7 @@ namespace cv
  * @param ep outlier ratio
  * @param modelPoints number of model points required for estimation
  * @param maxIters maximum number of iterations
- * @return
+ * @return The number of iterations according to the formula
 * \f[
 * \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)}
 * \f]
diff --git a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
index 23db56dd41..9554d7117f 100644
--- a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
@@ -86,10 +86,10 @@ public:
     @param mask Image mask
     */
     CV_WRAP virtual void apply(int index, Point corner, InputOutputArray image, InputArray mask) = 0;
-    CV_WRAP virtual void getMatGains(CV_OUT std::vector<Mat>& ) {CV_Error(Error::StsInternal, "");};
-    CV_WRAP virtual void setMatGains(std::vector<Mat>& ) { CV_Error(Error::StsInternal, ""); };
-    CV_WRAP void setUpdateGain(bool b) { updateGain = b; };
-    CV_WRAP bool getUpdateGain() { return updateGain; };
+    CV_WRAP virtual void getMatGains(CV_OUT std::vector<Mat>& ) {CV_Error(Error::StsInternal, "");}
+    CV_WRAP virtual void setMatGains(std::vector<Mat>& ) { CV_Error(Error::StsInternal, ""); }
+    CV_WRAP void setUpdateGain(bool b) { updateGain = b; }
+    CV_WRAP bool getUpdateGain() { return updateGain; }
 protected :
     bool updateGain;
 };
@@ -102,8 +102,8 @@ public:
     void feed(const std::vector<Point> &/*corners*/, const std::vector<UMat> &/*images*/,
               const std::vector<std::pair<UMat,uchar> > &/*masks*/) CV_OVERRIDE { }
     CV_WRAP void apply(int /*index*/, Point /*corner*/, InputOutputArray /*image*/, InputArray /*mask*/) CV_OVERRIDE { }
-    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; };
-    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; };
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; }
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; }
 };
 
 /** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image
diff --git a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
index dce0b1454d..8c2f15a570 100644
--- a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
@@ -61,7 +61,7 @@ struct CV_EXPORTS_W_SIMPLE ImageFeatures
     CV_PROP_RW Size img_size;
     CV_PROP_RW std::vector<KeyPoint> keypoints;
     CV_PROP_RW UMat descriptors;
-    CV_WRAP std::vector<KeyPoint> getKeypoints() { return keypoints; };
+    CV_WRAP std::vector<KeyPoint> getKeypoints() { return keypoints; }
 };
 
 /** @brief
@@ -109,8 +109,8 @@ struct CV_EXPORTS_W_SIMPLE MatchesInfo
     CV_PROP_RW int num_inliers; //!< Number of geometrically consistent matches
     CV_PROP_RW Mat H; //!< Estimated transformation
     CV_PROP_RW double confidence; //!< Confidence two images are from the same panorama
-    CV_WRAP std::vector<DMatch> getMatches() { return matches; };
-    CV_WRAP std::vector<uchar> getInliers() { return inliers_mask; };
+    CV_WRAP std::vector<DMatch> getMatches() { return matches; }
+    CV_WRAP std::vector<uchar> getInliers() { return inliers_mask; }
 };
 
 /** @brief Feature matchers base class. */
@@ -138,7 +138,7 @@ public:
     @sa detail::MatchesInfo
     */
     CV_WRAP_AS(apply2) void operator ()(const std::vector<ImageFeatures> &features, CV_OUT std::vector<MatchesInfo> &pairwise_matches,
-                                        const cv::UMat &mask = cv::UMat()) { match(features, pairwise_matches, mask); };
+                                        const cv::UMat &mask = cv::UMat()) { match(features, pairwise_matches, mask); }
 
     /** @return True, if it's possible to use the same matcher instance in parallel, false otherwise
     */
diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp
index aa1ce5a6a7..0a5bf63de2 100644
--- a/modules/stitching/include/opencv2/stitching/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/warpers.hpp
@@ -53,7 +53,7 @@ namespace cv {
 public:
     CV_WRAP PyRotationWarper(String type, float scale);
-    CV_WRAP PyRotationWarper() {};
+    CV_WRAP PyRotationWarper() {}
     ~PyRotationWarper() {}
 
     /** @brief Projects the image point.
diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp
index e7e01fb3ed..eebf4c594b 100644
--- a/modules/ts/include/opencv2/ts/ts_ext.hpp
+++ b/modules/ts/include/opencv2/ts/ts_ext.hpp
@@ -13,7 +13,7 @@ void checkIppStatus();
 extern bool skipUnstableTests;
 extern bool runBigDataTests;
 extern int testThreads;
-extern int debugLevel; //< 0 - no debug, 1 - basic test debug information, >1 - extra debug information
+extern int debugLevel; ///< 0 - no debug, 1 - basic test debug information, >1 - extra debug information
 
 void testSetUp();
 void testTearDown();
diff --git a/modules/video/include/opencv2/video/detail/tracking.detail.hpp b/modules/video/include/opencv2/video/detail/tracking.detail.hpp
index 1e6107900d..3c7823b7dc 100644
--- a/modules/video/include/opencv2/video/detail/tracking.detail.hpp
+++ b/modules/video/include/opencv2/video/detail/tracking.detail.hpp
@@ -171,7 +171,7 @@ width, height, orientation, etc.
 class CV_EXPORTS TrackerTargetState
 {
 public:
-    virtual ~TrackerTargetState() {};
+    virtual ~TrackerTargetState() {}
     /** @brief Get the position
      * @return The position
      */
diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp
index 7f93a79a72..8dbcfbf216 100644
--- a/modules/video/include/opencv2/video/tracking.hpp
+++ b/modules/video/include/opencv2/video/tracking.hpp
@@ -166,7 +166,7 @@ performance boost.
 
 The function implements a sparse iterative version of the Lucas-Kanade optical flow in pyramids. See
 @cite Bouguet00 . The function is parallelized with the TBB library.
-@note
+@note Some examples:
 
 -   An example using the Lucas-Kanade optical flow algorithm can be found at
     opencv_source_code/samples/cpp/lkdemo.cpp
@@ -213,7 +213,7 @@ The function finds an optical flow for each prev pixel using the @cite Farneback
 
 \f[\texttt{prev} (y,x) \sim \texttt{next} ( y + \texttt{flow} (y,x)[1], x + \texttt{flow} (y,x)[0])\f]
 
-@note
+@note Some examples:
 
 -   An example using the optical flow algorithm described by Gunnar Farneback can be found at
     opencv_source_code/samples/cpp/fback.cpp
diff --git a/modules/video/src/tracking/detail/tracker_mil_model.hpp b/modules/video/src/tracking/detail/tracker_mil_model.hpp
index dddfae5536..027ddd0679 100644
--- a/modules/video/src/tracking/detail/tracker_mil_model.hpp
+++ b/modules/video/src/tracking/detail/tracker_mil_model.hpp
@@ -36,7 +36,7 @@ public:
     /**
      * \brief Destructor
      */
-    ~TrackerMILModel() {};
+    ~TrackerMILModel() {}
 
     /**
      * \brief Set the mode
diff --git a/modules/video/src/tracking/detail/tracker_mil_state.hpp b/modules/video/src/tracking/detail/tracker_mil_state.hpp
index 12af1c33df..f4eeee9796 100644
--- a/modules/video/src/tracking/detail/tracker_mil_state.hpp
+++ b/modules/video/src/tracking/detail/tracker_mil_state.hpp
@@ -34,7 +34,7 @@ public:
     */
     TrackerMILTargetState(const Point2f& position, int width, int height, bool foreground,
                           const Mat& features);
-    ~TrackerMILTargetState() {};
+    ~TrackerMILTargetState() {}
 
     /** @brief Set label: true for target foreground, false for background
     @param foreground Label for background/foreground
diff --git a/modules/video/src/tracking/detail/tracking_online_mil.cpp b/modules/video/src/tracking/detail/tracking_online_mil.cpp
index c9472aa947..b1d74916f7 100644
--- a/modules/video/src/tracking/detail/tracking_online_mil.cpp
+++ b/modules/video/src/tracking/detail/tracking_online_mil.cpp
@@ -29,7 +29,7 @@ public:
     bool operator<(SortableElementRev& b)
     {
         return (_val < b._val);
-    };
+    }
 };
 
 static bool CompareSortableElementRev(const SortableElementRev& i, const SortableElementRev& j)
@@ -57,7 +57,7 @@ void sort_order_des(std::vector<float>& v, std::vector<int>& order)
         order[i] = v2[i]._ind;
         v[i] = v2[i]._val;
     }
-};
+}
 
 //implementations for strong classifier
diff --git a/modules/video/test/ocl/test_dis.cpp b/modules/video/test/ocl/test_dis.cpp
index 4df7f9a197..0a49452f13 100644
--- a/modules/video/test/ocl/test_dis.cpp
+++ b/modules/video/test/ocl/test_dis.cpp
@@ -46,7 +46,7 @@
 namespace opencv_test { namespace {
 
-CV_ENUM(DIS_TestPresets, DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM);
+CV_ENUM(DIS_TestPresets, DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM)
 
 typedef ocl::TSTestWithParam<DIS_TestPresets> OCL_DenseOpticalFlow_DIS;
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index 7bc3934891..7e9c5b304a 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -189,7 +189,7 @@ enum VideoCaptureProperties {
       CAP_PROP_HW_ACCELERATION_USE_OPENCL=52, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between HW accelerated decoder and cv::UMat.
       CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg and GStreamer back-ends only)
       CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg and GStreamer back-ends only)
-      CAP_PROP_STREAM_OPEN_TIME_USEC =55, //< (read-only) time in microseconds since Jan 1 1970 when stream was opened. Applicable for FFmpeg backend only. Useful for RTSP and other live streams
+      CAP_PROP_STREAM_OPEN_TIME_USEC =55, //!< (read-only) time in microseconds since Jan 1 1970 when stream was opened. Applicable for FFmpeg backend only. Useful for RTSP and other live streams
diff --git a/modules/videoio/test/test_audio.cpp b/modules/videoio/test/test_audio.cpp
--- a/modules/videoio/test/test_audio.cpp
+++ b/modules/videoio/test/test_audio.cpp
@@ ... @@
 class AudioBaseTest
 {
 protected:
-    AudioBaseTest(){};
+    AudioBaseTest(){}
     void getValidAudioData()
     {
         const double step = 3.14/22050;
@@ -157,7 +157,7 @@ public:
         params = { CAP_PROP_AUDIO_STREAM, 0,
                    CAP_PROP_VIDEO_STREAM, 0,
                    CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
-    };
+    }
 
     void doTest()
     {
diff --git a/modules/videoio/test/test_camera.cpp b/modules/videoio/test/test_camera.cpp
index 4466919f50..f11fa3f251 100644
--- a/modules/videoio/test/test_camera.cpp
+++ b/modules/videoio/test/test_camera.cpp
@@ -223,6 +223,35 @@ TEST(DISABLED_videoio_camera, v4l_read_framesize)
     capture.release();
 }
 
+TEST(DISABLED_videoio_camera, v4l_rgb_convert)
+{
+    VideoCapture capture(CAP_V4L2);
+    ASSERT_TRUE(capture.isOpened());
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << " Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "       height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Pixel format: " << capture.get(cv::CAP_PROP_FORMAT) << std::endl;
+    if (capture.get(CAP_PROP_FOURCC) != VideoWriter::fourcc('Y', 'U', 'Y', 'V'))
+    {
+        throw SkipTestException("Camera does not support YUYV format");
+    }
+    capture.set(cv::CAP_PROP_CONVERT_RGB, 0);
+    std::cout << "New pixel format: " << capture.get(cv::CAP_PROP_FORMAT) << std::endl;
+
+    cv::Mat frame;
+    for (int i = 0; i < 10; i++)
+    {
+        int pixel_type = (int)capture.get(cv::CAP_PROP_FORMAT);
+        int channels = CV_MAT_CN(pixel_type);
+        int pixel_bytes = CV_ELEM_SIZE(pixel_type);
+
+        // YUYV is expected for most popular USB cameras (COLOR_YUV2BGR_YUYV conversion)
+        EXPECT_EQ(2, channels);
+        EXPECT_EQ(2, pixel_bytes);
+
+        capture >> frame;
+    }
+}
+
 
 static utils::Paths getTestCameras()
diff --git a/modules/videoio/test/test_precomp.hpp b/modules/videoio/test/test_precomp.hpp
index 815264c494..7106faeb1e 100644
--- a/modules/videoio/test/test_precomp.hpp
+++ b/modules/videoio/test/test_precomp.hpp
@@ -98,11 +98,11 @@ inline void generateFrame(int i, int frame_count, cv::Mat & frame)
 class BunnyParameters
 {
 public:
-    inline static int    getWidth()  { return 672; };
-    inline static int    getHeight() { return 384; };
-    inline static int    getFps()    { return 24; };
-    inline static double getTime()   { return 5.21; };
-    inline static int    getCount()  { return cvRound(getFps() * getTime()); };
+    inline static int    getWidth()  { return 672; }
+    inline static int    getHeight() { return 384; }
+    inline static int    getFps()    { return 24; }
+    inline static double getTime()   { return 5.21; }
+    inline static int    getCount()  { return cvRound(getFps() * getTime()); }
     inline static std::string getFilename(const std::string &ext)
     {
         return cvtest::TS::ptr()->get_data_path() + "video/big_buck_bunny" + ext;
diff --git a/samples/cpp/train_HOG.cpp b/samples/cpp/train_HOG.cpp
index 4a160fe4eb..c8355ee591 100644
--- a/samples/cpp/train_HOG.cpp
+++ b/samples/cpp/train_HOG.cpp
@@ -47,7 +47,7 @@ void convert_to_ml( const vector< Mat > & train_samples, Mat& trainData )
     //--Convert data
     const int rows = (int)train_samples.size();
     const int cols = (int)std::max( train_samples[0].cols, train_samples[0].rows );
-    Mat tmp( 1, cols, CV_32FC1 ); //< used for transposition if needed
+    Mat tmp( 1, cols, CV_32FC1 ); ///< used for transposition if needed
     trainData = Mat( rows, cols, CV_32FC1 );
 
     for( size_t i = 0 ; i < train_samples.size(); ++i )
diff --git a/samples/dnn/README.md b/samples/dnn/README.md
index ac407b79c3..c99b735a1f 100644
--- a/samples/dnn/README.md
+++ b/samples/dnn/README.md
@@ -78,7 +78,7 @@ AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ] | 0.528 | 0.528 |
 ```
 
 ## References
-* [Models downloading script](https://github.com/opencv/opencv/samples/dnn/download_models.py)
+* [Models downloading script](https://github.com/opencv/opencv/blob/5.x/samples/dnn/download_models.py)
* [Configuration files adopted for OpenCV](https://github.com/opencv/opencv_extra/tree/5.x/testdata/dnn)
* [How to import models from TensorFlow Object Detection API](https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API)
* [Names of classes from different datasets](https://github.com/opencv/opencv/tree/5.x/samples/data/dnn)
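
(Usage note on the new v4l_rgb_convert test above: with CAP_PROP_CONVERT_RGB set to 0, the V4L2 backend delivers frames in the camera's native pixel format — CV_8UC2 for YUYV — and the application performs the YUV-to-BGR conversion itself. Below is a minimal sketch of that flow, assuming a V4L2 camera that reports YUYV; it is an illustration, not a verbatim excerpt from the patch.)

```cpp
// Sketch: capture raw YUYV frames and convert them to BGR manually,
// assuming a V4L2 camera that supports the YUYV pixel format.
#include <opencv2/videoio.hpp>
#include <opencv2/imgproc.hpp>
#include <iostream>

int main()
{
    cv::VideoCapture cap(0, cv::CAP_V4L2);
    if (!cap.isOpened())
        return 1;

    cap.set(cv::CAP_PROP_CONVERT_RGB, 0);  // ask the backend for raw frames instead of BGR

    cv::Mat raw, bgr;
    cap >> raw;  // with YUYV input this is a 2-channel, 2-bytes-per-pixel Mat (CV_8UC2)
    if (!raw.empty() && raw.type() == CV_8UC2)
    {
        // The same conversion the backend would otherwise apply internally
        cv::cvtColor(raw, bgr, cv::COLOR_YUV2BGR_YUYV);
        std::cout << "Converted " << bgr.cols << "x" << bgr.rows << " frame" << std::endl;
    }
    return 0;
}
```

Skipping the built-in conversion is useful when a pipeline only needs the luma channel, or when color conversion is offloaded to another processing stage.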