Merge branch 4.x

Alexander Smorkalov 2023-10-16 21:25:56 +03:00
commit 97620c053f
221 changed files with 3584 additions and 2215 deletions

View File

@ -16,6 +16,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS
-Wunused-but-set-variable # clang15
-Wmissing-prototypes # clang, function opj_t1_ht_decode_cblk
-Wmissing-declarations # gcc, function opj_t1_ht_decode_cblk
-Wdocumentation # clang
)
#-----------------------------------------------------------------------------

View File

@ -27,6 +27,8 @@ else()
-Wimplicit-fallthrough
-Warray-bounds # GCC 9+
-Wstringop-overflow -Wstringop-overread # GCC 11-12
-Wextra-semi # clang
-Wcomma # clang
)
endif()
if(CV_ICC)

View File

@ -209,7 +209,7 @@ if(NOT ${found})
message(STATUS " PYTHON3_NUMPY_INCLUDE_DIRS")
else()
# Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
execute_process(COMMAND "${_executable}" -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(os.pathsep.join(numpy.distutils.misc_util.get_numpy_include_dirs()))"
execute_process(COMMAND "${_executable}" -c "import numpy; print(numpy.get_include())"
RESULT_VARIABLE _numpy_process
OUTPUT_VARIABLE _numpy_include_dirs
OUTPUT_STRIP_TRAILING_WHITESPACE)

View File

@ -186,6 +186,8 @@ class PatternMaker:
yspacing = (self.height - self.rows * self.square_size) / 2.0
ch_ar_border = (self.square_size - self.aruco_marker_size)/2
if ch_ar_border < side*0.7:
print("Marker border {} is less than 70% of ArUco pin size {}. Please increase --square_size or decrease --marker_size for stable board detection".format(ch_ar_border, int(side)))
marker_id = 0
for y in range(0, self.rows):
for x in range(0, self.cols):
@ -283,6 +285,9 @@ def main():
else:
raise ValueError("The marker {},{} is outside the checkerboard".format(x, y))
if p_type == "charuco_board" and aruco_marker_size >= square_size:
raise ValueError("ArUco markers size must be smaller than square size")
pm = PatternMaker(columns, rows, output, units, square_size, radius_rate, page_width, page_height, markers, aruco_marker_size, dict_file)
# dict for easy lookup of pattern type
mp = {"circles": pm.make_circles_pattern, "acircles": pm.make_acircles_pattern,

View File

@ -112,7 +112,7 @@ public:
* 2 columns 1 channel
* @param _m2 destination points containing (x,y), depth is CV_32F with 1 column 2 channels or
* 2 columns 1 channel
* @param _model, CV_64FC1, 3x3, normalized, i.e., the last element is 1
* @param _model CV_64FC1, 3x3, normalized, i.e., the last element is 1
*/
int runKernel( InputArray _m1, InputArray _m2, OutputArray _model ) const CV_OVERRIDE
{
@ -187,7 +187,7 @@ public:
* @param _m1 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column
* @param _m2 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column
* @param _model CV_64FC1, 3x3
* @param _err, output, CV_32FC1, square of the L2 norm
* @param _err output, CV_32FC1, square of the L2 norm
*/
void computeError( InputArray _m1, InputArray _m2, InputArray _model, OutputArray _err ) const CV_OVERRIDE
{

View File

@ -111,7 +111,7 @@ private:
/**
* @brief Computes the translation solution for a given rotation solution
* @param objectPoints Array of corresponding object points, 1xN/Nx1 3-channel where N is the number of points
* @param normalizedImagePoints Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
* @param normalizedImgPoints Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
* @param R Rotation solution (3x1 rotation vector)
* @param t Translation solution (3x1 rotation vector)
*/
@ -220,10 +220,10 @@ private:
/**
* @brief Computes the average depth of an object given its pose in camera coordinates
* @param objectPoints: Object points defined in 3D object space
* @param rvec: Rotation component of pose
* @param tvec: Translation component of pose
* @return: average depth of the object
* @param objectPoints Object points defined in 3D object space
* @param rvec Rotation component of pose
* @param tvec Translation component of pose
* @return average depth of the object
*/
double meanSceneDepth(InputArray objectPoints, InputArray rvec, InputArray tvec);

View File

@ -220,8 +220,8 @@ int p3p::solve(double R[4][3][3], double t[4][3],
/// Only the solution to the main branch.
/// Reference : X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
/// IEEE Trans. on PAMI, vol. 25, No. 8, August 2003
/// \param lengths3D Lengths of line segments up to four solutions.
/// \param dist3D Distance between 3D points in pairs |BC|, |AC|, |AB|.
/// \param lengths Lengths of line segments up to four solutions.
/// \param distances Distance between 3D points in pairs |BC|, |AC|, |AB|.
/// \param cosines Cosine of the angles /_BPC, /_APC, /_APB.
/// \returns Number of solutions.
/// WARNING: NOT ALL THE DEGENERATE CASES ARE IMPLEMENTED

View File

@ -89,7 +89,7 @@ namespace cv {
* @param ep outlier ratio
* @param modelPoints number of model points required for estimation
* @param maxIters maximum number of iterations
* @return
* @return The number of iterations according to the formula
* \f[
* \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)}
* \f]
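For reference, the expression documented above is the standard RANSAC bound on the number of iterations needed to reach confidence p given outlier ratio ep and a model estimated from modelPoints samples. The stand-alone sketch below shows how it is typically evaluated; it is illustrative only (not code from this diff), and the guards against degenerate logarithms plus the clamp to maxIters are assumptions that mirror common practice:

#include <algorithm>
#include <cfloat>
#include <cmath>

// Hypothetical helper evaluating niters = ln(1 - p) / ln(1 - (1 - ep)^modelPoints),
// clamped to maxIters.
static int updateNumIters(double p, double ep, int modelPoints, int maxIters)
{
    p  = std::min(std::max(p,  0.0), 1.0);   // desired confidence
    ep = std::min(std::max(ep, 0.0), 1.0);   // outlier ratio
    double num   = std::max(1.0 - p, DBL_MIN);                    // keep the log argument positive
    double denom = 1.0 - std::pow(1.0 - ep, (double)modelPoints);
    if (denom < DBL_MIN)
        return 0;                  // every sample is an inlier, nothing to iterate
    num   = std::log(num);         // <= 0
    denom = std::log(denom);       // <= 0
    // clamp before dividing to avoid overflow when denom is (almost) zero
    return (denom >= 0 || -num >= maxIters * (-denom))
               ? maxIters
               : (int)std::lround(num / denom);
}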

View File

@ -486,7 +486,7 @@ void rhoSeed(Ptr<RHO_HEST> p, uint64_t seed){
* Estimates the homography using the given context, matches and parameters to
* PROSAC.
*
* @param [in/out] p The context to use for homography estimation. Must
* @param [in,out] p The context to use for homography estimation. Must
* be already initialized. Cannot be NULL.
* @param [in] src The pointer to the source points of the matches.
* Must be aligned to 4 bytes. Cannot be NULL.

View File

@ -206,7 +206,7 @@ void rhoSeed(Ptr<RHO_HEST> p, uint64_t seed);
* homography with at least the minimum required support, and 0 if it was not.
*
*
* @param [in/out] p The context to use for homography estimation. Must
* @param [in,out] p The context to use for homography estimation. Must
* be already initialized. Cannot be NULL.
* @param [in] src The pointer to the source points of the matches.
* Must be aligned to 4 bytes. Cannot be NULL.

View File

@ -89,8 +89,8 @@ public:
s2(_s2),
s3(_s3),
s4(_s4) {
#if CV_SIMD_64F
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
{
s_x[i] = ir[0] * i;
s_y[i] = ir[3] * i;
@ -123,26 +123,26 @@ public:
else
CV_Assert(m1 != NULL);
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const v_float64 v_one = vx_setall_f64(1.0);
for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
{
v_float64 m_0, m_1, m_2, m_3;
m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);
v_float64 xd_0 = x_0 * x_0;
v_float64 yd_0 = y_0 * y_0;
v_float64 xd_1 = x_1 * x_1;
v_float64 yd_1 = y_1 * y_1;
v_float64 xd_0 = v_mul(x_0, x_0);
v_float64 yd_0 = v_mul(y_0, y_0);
v_float64 xd_1 = v_mul(x_1, x_1);
v_float64 yd_1 = v_mul(y_1, y_1);
v_float64 r2_0 = xd_0 + yd_0;
v_float64 r2_1 = xd_1 + yd_1;
v_float64 r2_0 = v_add(xd_0, yd_0);
v_float64 r2_1 = v_add(xd_1, yd_1);
m_1 = vx_setall_f64(k3);
m_2 = vx_setall_f64(k2);
@ -151,18 +151,18 @@ public:
m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
m_3 = vx_setall_f64(k6);
m_2 = vx_setall_f64(k5);
m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));
m_3 = vx_setall_f64(2.0);
xd_0 = v_muladd(m_3, xd_0, r2_0);
yd_0 = v_muladd(m_3, yd_0, r2_0);
xd_1 = v_muladd(m_3, xd_1, r2_1);
yd_1 = v_muladd(m_3, yd_1, r2_1);
m_2 = x_0 * y_0 * m_3;
m_3 = x_1 * y_1 * m_3;
m_2 = v_mul(v_mul(x_0, y_0), m_3);
m_3 = v_mul(v_mul(x_1, y_1), m_3);
x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);
m_0 = vx_setall_f64(p1);
m_1 = vx_setall_f64(p2);
@ -176,8 +176,8 @@ public:
xd_1 = v_muladd(m_0, m_3, xd_1);
yd_1 = v_muladd(m_1, m_3, yd_1);
m_0 = r2_0 * r2_0;
m_1 = r2_1 * r2_1;
m_0 = v_mul(r2_0, r2_0);
m_1 = v_mul(r2_1, r2_1);
m_2 = vx_setall_f64(s2);
m_3 = vx_setall_f64(s1);
xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
@ -203,17 +203,17 @@ public:
r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
m_0 = vx_setzero_f64();
r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));
m_0 = vx_setall_f64(fx);
m_1 = vx_setall_f64(u0);
m_2 = vx_setall_f64(fy);
m_3 = vx_setall_f64(v0);
x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);
if (m1type == CV_32FC1)
{
@ -225,20 +225,20 @@ public:
v_float32 mf0, mf1;
v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
v_store(&m1f[j * 2], mf0);
v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
}
else // m1type == CV_16SC2
{
m_0 = vx_setall_f64(INTER_TAB_SIZE);
x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);
v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
v_int32 iu = v_round(x_0, x_1);
v_int32 iv = v_round(y_0, y_1);
v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
v_int32 out0, out1;
v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
v_store(&m1[j * 2], v_pack(out0, out1));
}
}
@ -302,10 +302,10 @@ private:
double s2;
double s3;
double s4;
#if CV_SIMD_64F
double s_x[2*v_float64::nlanes];
double s_y[2*v_float64::nlanes];
double s_w[2*v_float64::nlanes];
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
double s_x[2*VTraits<v_float64>::max_nlanes];
double s_y[2*VTraits<v_float64>::max_nlanes];
double s_w[2*VTraits<v_float64>::max_nlanes];
#endif
};
}
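The changes in this file follow the pattern applied throughout this merge: compile-time lane counts (v_float64::nlanes) become VTraits<v_float64>::vlanes() or max_nlanes, and the overloaded operators (+, *, /, ==) become the function-style intrinsics (v_add, v_mul, v_div, v_eq), so the same code also builds against scalable SIMD backends such as RVV where vector types are sizeless. A minimal stand-alone sketch of the pattern (illustrative, not code from this diff; the function name is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

void scale_add(const float* a, const float* b, float* dst, int n, float alpha)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_float32>::vlanes();   // runtime lane count on scalable targets
    const v_float32 valpha = vx_setall_f32(alpha);
    for (; i <= n - vlanes; i += vlanes)
    {
        v_float32 va = vx_load(a + i);
        v_float32 vb = vx_load(b + i);
        // v_add/v_mul replace the overloaded +,* operators, which are not
        // available for sizeless (scalable) vector types
        v_store(dst + i, v_add(v_mul(va, valpha), vb));
    }
    vx_cleanup();
#endif
    for (; i < n; ++i)             // scalar tail
        dst[i] = a[i] * alpha + b[i];
}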

View File

@ -203,12 +203,12 @@ class Chessboard: public cv::Feature2D
* d12/d34 = d13/d24
*
* point order on the line:
* pt1 --> pt2 --> pt3 --> pt4
* p0 --> p1 --> p2 --> p3
*
* \param[in] pt1 First point coordinate
* \param[in] pt2 Second point coordinate
* \param[in] pt3 Third point coordinate
* \param[out] pt4 Forth point coordinate
* \param[in] p0 First point coordinate
* \param[in] p1 Second point coordinate
* \param[in] p2 Third point coordinate
* \param[out] p3 Forth point coordinate
*
*/
static bool estimatePoint(const cv::Point2f &p0,const cv::Point2f &p1,const cv::Point2f &p2,cv::Point2f &p3);
@ -309,7 +309,7 @@ class Chessboard: public cv::Feature2D
* \brief Draws the corners into the given image
*
* \param[in] m The image
* \param[out] m The resulting image
* \param[out] out The resulting image
* \param[in] H optional homography to calculate search area
*
*/
@ -668,7 +668,7 @@ class Chessboard: public cv::Feature2D
* \brief Calculates the average edge sharpness for the chessboard
*
* \param[in] image The image where the chessboard was detected
* \param[in] rise_distante Rise distance 0.8 means 10% ... 90%
* \param[in] rise_distance Rise distance 0.8 means 10% ... 90%
* \param[in] vertical by default only edge response for horiontal lines are calculated
*
* \returns Scalar(sharpness, average min_val, average max_val)

View File

@ -66,7 +66,7 @@ namespace cv {
* @param ep outlier ratio
* @param modelPoints number of model points required for estimation
* @param maxIters maximum number of iterations
* @return
* @return The number of iterations according to the formula
* \f[
* \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)}
* \f]

View File

@ -36,15 +36,15 @@
namespace cv {
template <typename T>
DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){};
DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}
template <typename T>
DualQuat<T>::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z):
w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){};
w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}
template <typename T>
DualQuat<T>::DualQuat(const Vec<T, 8> &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]),
w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){};
w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}
template <typename T>
DualQuat<T> DualQuat<T>::createFromQuat(const Quat<T> &realPart, const Quat<T> &dualPart)

View File

@ -987,6 +987,15 @@ namespace CV__SIMD_NAMESPACE {
{ \
return a op b; \
}
#define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
{ \
return a == b; \
} \
inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
{ \
return a != b; \
}
#define OPENCV_HAL_WRAP_CMP(_Tpvec) \
OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
@ -999,11 +1008,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8)
OPENCV_HAL_WRAP_CMP(v_uint16)
OPENCV_HAL_WRAP_CMP(v_uint32)
// OPENCV_HAL_WRAP_CMP(v_uint64)
OPENCV_HAL_WRAP_EQ_OP(v_uint64)
OPENCV_HAL_WRAP_CMP(v_int8)
OPENCV_HAL_WRAP_CMP(v_int16)
OPENCV_HAL_WRAP_CMP(v_int32)
// OPENCV_HAL_WRAP_CMP(v_int64)
OPENCV_HAL_WRAP_EQ_OP(v_int64)
OPENCV_HAL_WRAP_CMP(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
@ -1012,9 +1021,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
@ -1024,9 +1035,11 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
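OPENCV_HAL_WRAP_EQ_OP supplies only v_eq/v_ne for the 64-bit integer vector types, for which the full comparison set wrapped by OPENCV_HAL_WRAP_CMP is not provided. A small usage sketch under those assumptions (illustrative, not code from this diff; the function name is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

void mark_equal_u64(const cv::uint64* a, const cv::uint64* b, cv::uint64* out, int n)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int vlanes = VTraits<v_uint64>::vlanes();
    const v_uint64 vone = vx_setall_u64(1);
    for (; i <= n - vlanes; i += vlanes)
    {
        // v_eq comes from OPENCV_HAL_WRAP_EQ_OP above; ordered comparisons
        // (less/greater) are deliberately not wrapped for 64-bit lanes.
        v_uint64 mask = v_eq(vx_load(a + i), vx_load(b + i)); // all-ones lanes where equal
        v_store(out + i, v_and(mask, vone));                  // 1 where equal, 0 elsewhere
    }
    vx_cleanup();
#endif
    for (; i < n; ++i)
        out[i] = (a[i] == b[i]) ? 1 : 0;
}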

View File

@ -188,4 +188,4 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
} // cv::

View File

@ -0,0 +1,33 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// 0.11 -> 0.12 compatibility
#ifndef _RVV_IMPLICIT_VXRM
#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
#endif
// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg)
// masked
#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
// unmasked
#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)

View File

@ -21,6 +21,10 @@
#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
#endif
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
#include "intrin_rvv_011_compat.hpp"
#endif
#if defined(__GNUC__) && !defined(__clang__)
// FIXIT: eliminate massive warnigs from templates
// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)

View File

@ -225,7 +225,7 @@ public:
void copyTo(const _OutputArray& dst) const;
void convertTo(const _OutputArray& dst, int type, double scale=1., double shift=0.) const;
_Tp val[m*n]; //< matrix elements
_Tp val[m*n]; ///< matrix elements
};
typedef Matx<float, 1, 2> Matx12f;

View File

@ -774,7 +774,7 @@ public:
void start();
void stop();
uint64 durationNS() const; //< duration in nanoseconds
uint64 durationNS() const; ///< duration in nanoseconds
protected:
struct Impl;

View File

@ -89,7 +89,7 @@ public:
//! conjugation
Complex conj() const;
_Tp re, im; //< the real and the imaginary parts
_Tp re, im; ///< the real and the imaginary parts
};
typedef Complex<float> Complexf;
@ -2028,8 +2028,8 @@ double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) {
/** @brief Finds out if there is any intersection between two rectangles
*
* mainly useful for language bindings
* @param rect1 First rectangle
* @param rect2 Second rectangle
* @param a First rectangle
* @param b Second rectangle
* @return the area of the intersection
*/
CV_EXPORTS_W inline double rectangleIntersectionArea(const Rect2d& a, const Rect2d& b) { return (a & b).area(); }
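Since rectangleIntersectionArea is aimed at language bindings, a quick C++ usage sketch of the documented behaviour (illustrative only):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Rect2d a(0, 0, 10, 10), b(5, 5, 10, 10);
    // (a & b) is the 5x5 overlap, so the reported intersection area is 25.
    std::cout << cv::rectangleIntersectionArea(a, b) << std::endl;
    return 0;
}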

View File

@ -47,11 +47,11 @@ public:
explicit FileLock(const char* fname);
~FileLock();
void lock(); //< acquire exclusive (writer) lock
void unlock(); //< release exclusive (writer) lock
void lock(); ///< acquire exclusive (writer) lock
void unlock(); ///< release exclusive (writer) lock
void lock_shared(); //< acquire shareable (reader) lock
void unlock_shared(); //< release shareable (reader) lock
void lock_shared(); ///< acquire shareable (reader) lock
void unlock_shared(); ///< release shareable (reader) lock
struct Impl;
protected:

View File

@ -70,11 +70,11 @@ public:
struct LocationExtraData;
struct LocationStaticStorage
{
LocationExtraData** ppExtra; //< implementation specific data
const char* name; //< region name (function name or other custom name)
const char* filename; //< source code filename
int line; //< source code line
int flags; //< flags (implementation code path: Plain, IPP, OpenCL)
LocationExtraData** ppExtra; ///< implementation specific data
const char* name; ///< region name (function name or other custom name)
const char* filename; ///< source code filename
int line; ///< source code line
int flags; ///< flags (implementation code path: Plain, IPP, OpenCL)
};
Region(const LocationStaticStorage& location);
@ -100,18 +100,18 @@ private:
//! Specify region flags
enum RegionLocationFlag {
REGION_FLAG_FUNCTION = (1 << 0), //< region is function (=1) / nested named region (=0)
REGION_FLAG_APP_CODE = (1 << 1), //< region is Application code (=1) / OpenCV library code (=0)
REGION_FLAG_SKIP_NESTED = (1 << 2), //< avoid processing of nested regions
REGION_FLAG_FUNCTION = (1 << 0), ///< region is function (=1) / nested named region (=0)
REGION_FLAG_APP_CODE = (1 << 1), ///< region is Application code (=1) / OpenCV library code (=0)
REGION_FLAG_SKIP_NESTED = (1 << 2), ///< avoid processing of nested regions
REGION_FLAG_IMPL_IPP = (1 << 16), //< region is part of IPP code path
REGION_FLAG_IMPL_OPENCL = (2 << 16), //< region is part of OpenCL code path
REGION_FLAG_IMPL_OPENVX = (3 << 16), //< region is part of OpenVX code path
REGION_FLAG_IMPL_IPP = (1 << 16), ///< region is part of IPP code path
REGION_FLAG_IMPL_OPENCL = (2 << 16), ///< region is part of OpenCL code path
REGION_FLAG_IMPL_OPENVX = (3 << 16), ///< region is part of OpenVX code path
REGION_FLAG_IMPL_MASK = (15 << 16),
REGION_FLAG_REGION_FORCE = (1 << 30),
REGION_FLAG_REGION_NEXT = (1 << 31), //< close previous region (see #CV_TRACE_REGION_NEXT macro)
REGION_FLAG_REGION_NEXT = (1 << 31), ///< close previous region (see #CV_TRACE_REGION_NEXT macro)
ENUM_REGION_FLAG_FORCE_INT = INT_MAX
};

View File

@ -962,9 +962,9 @@ public class CoreTest extends OpenCVTestCase {
assertEquals(0.0, d);
d = Core.Mahalanobis(line1, line2, covar);
assertTrue(d > 0.0);
// Bug: https://github.com/opencv/opencv/issues/24348
// d = Core.Mahalanobis(line1, line2, covar);
// assertTrue(d > 0.0);
}
public void testMax() {

View File

@ -2,7 +2,7 @@
#include "opencv2/core/async.hpp"
CV_PY_TO_CLASS(AsyncArray);
CV_PY_FROM_CLASS(AsyncArray);
CV_PY_TO_CLASS(AsyncArray)
CV_PY_FROM_CLASS(AsyncArray)
#endif

View File

@ -20,18 +20,18 @@ template<> struct pyopencvVecConverter<cuda::GpuMat>
}
};
CV_PY_TO_CLASS(cuda::GpuMat);
CV_PY_TO_CLASS(cuda::Stream);
CV_PY_TO_CLASS(cuda::Event);
CV_PY_TO_CLASS(cuda::HostMem);
CV_PY_TO_CLASS(cuda::GpuMat)
CV_PY_TO_CLASS(cuda::Stream)
CV_PY_TO_CLASS(cuda::Event)
CV_PY_TO_CLASS(cuda::HostMem)
CV_PY_TO_CLASS_PTR(cuda::GpuMat);
CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator);
CV_PY_TO_CLASS_PTR(cuda::GpuMat)
CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator)
CV_PY_FROM_CLASS(cuda::GpuMat);
CV_PY_FROM_CLASS(cuda::Stream);
CV_PY_FROM_CLASS(cuda::HostMem);
CV_PY_FROM_CLASS(cuda::GpuMat)
CV_PY_FROM_CLASS(cuda::Stream)
CV_PY_FROM_CLASS(cuda::HostMem)
CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator);
CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator)
#endif

View File

@ -4,8 +4,8 @@
typedef std::vector<Range> vector_Range;
CV_PY_TO_CLASS(UMat);
CV_PY_FROM_CLASS(UMat);
CV_PY_TO_CLASS(UMat)
CV_PY_FROM_CLASS(UMat)
static bool cv_mappable_to(const Ptr<Mat>& src, Ptr<UMat>& dst)
{

View File

@ -45,4 +45,4 @@ PERF_TEST_P(MatDepth_tb, DISABLED_Allocation_Aligned,
SANITY_CHECK_NOTHING();
}
};
}

View File

@ -53,7 +53,6 @@
#undef CV__ALLOCATOR_STATS_LOG
//#define OPENCV_ALLOC_ENABLE_STATISTICS
#define OPENCV_ALLOC_STATISTICS_LIMIT 4096 // don't track buffers less than N bytes
#ifdef HAVE_POSIX_MEMALIGN
@ -63,6 +62,7 @@
#endif
#ifdef OPENCV_ALLOC_ENABLE_STATISTICS
#define OPENCV_ALLOC_STATISTICS_LIMIT 4096 // don't track buffers less than N bytes
#include <map>
#endif

View File

@ -8,4 +8,4 @@
#include "arithm.simd_declarations.hpp"
#define ARITHM_DISPATCHING_ONLY
#include "arithm.simd.hpp"
#include "arithm.simd.hpp"

View File

@ -69,7 +69,7 @@
#define DEFINE_SIMD_F32(fun, ...) \
DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#define DEFINE_SIMD_F64(fun, ...) \
DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
#else
@ -262,7 +262,7 @@ struct op_absdiff
template<>
struct op_absdiff<schar, v_int8>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
#endif
@ -272,7 +272,7 @@ struct op_absdiff<schar, v_int8>
template<>
struct op_absdiff<short, v_int16>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
#endif
@ -282,7 +282,7 @@ struct op_absdiff<short, v_int16>
template<>
struct op_absdiff<int, v_int32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
#endif
@ -327,7 +327,7 @@ struct op_not
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct bin_loader
@ -392,7 +392,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef bin_loader<OP, T1, Tvec> ldr;
const int wide_step = VTraits<Tvec>::vlanes();
#if !CV_NEON && CV_SIMD_WIDTH == 16
@ -410,7 +410,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
@ -460,7 +460,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
vx_cleanup();
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
@ -492,7 +492,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t
#define BIN_LOOP64F bin_loop_nosimd
#else
#define BIN_LOOP64F bin_loop
#endif //!CV_SIMD_64F
#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -617,7 +617,7 @@ struct op_cmpne
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct cmp_loader_n
@ -697,7 +697,7 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
#endif // CV_SIMD
@ -709,7 +709,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, dst + x);
@ -764,7 +764,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
}
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template< template<typename T1, typename Tvec> class OP, typename T1>
static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
@ -818,7 +818,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2
break;
}
}
#endif // !CV_SIMD_64F
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -876,7 +876,7 @@ DEFINE_SIMD_ALL(cmp)
//////////////////////////// Loaders ///////////////////////////////
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
struct scalar_loader_n
@ -1095,16 +1095,16 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
};
#endif // CV_SIMD
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<template<typename T1, typename T2, typename Tvec> class OP>
struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
{
typedef OP<int, float, v_int32> op;
typedef OP<double, double, v_float64> op64;
enum {step = v_int32::nlanes};
static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);
v_int32 v_src1s = vx_load(src1 + step);
@ -1121,6 +1121,7 @@ struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
}
static inline void l(const int* src1, const double* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src1s = vx_load(src1 + step);
@ -1165,10 +1166,10 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
{
typedef OP<float, float, v_float32> op;
typedef OP<double, double, v_float64> op64;
enum {step = v_float32::nlanes};
static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src2 = vx_load(src2);
v_float32 v_src1s = vx_load(src1 + step);
@ -1182,6 +1183,7 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
}
static inline void l(const float* src1, const double* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src1s = vx_load(src1 + step);
@ -1222,10 +1224,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP>
struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
{
typedef OP<double, double, v_float64> op;
enum {step = v_float64::nlanes};
static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
{
const int step = VTraits<v_float64>::vlanes();
v_float64 v_src1 = vx_load(src1);
v_float64 v_src2 = vx_load(src2);
v_float64 v_src1s = vx_load(src1 + step);
@ -1239,6 +1241,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
}
static inline void l(const double* src1, const double* scalar, double* dst)
{
const int step = VTraits<v_float64>::vlanes();
v_float64 v_src1 = vx_load(src1);
v_float64 v_src1s = vx_load(src1 + step);
@ -1249,7 +1252,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
v_store(dst + step, r1);
}
};
#endif // CV_SIMD_64F
#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
//////////////////////////// Loops /////////////////////////////////
@ -1259,7 +1262,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1273,7 +1276,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, scalar, dst + x);
@ -1305,7 +1308,7 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
@ -1318,7 +1321,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
{
int x = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, scalar, dst + x);
@ -1345,7 +1348,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
vx_cleanup();
}
#if !CV_SIMD_64F
#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
// dual source
template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
@ -1409,7 +1412,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste
#define SCALAR_LOOP64F scalar_loop_nosimd
#else
#define SCALAR_LOOP64F scalar_loop
#endif // !CV_SIMD_64F
#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
#endif // ARITHM_DEFINITIONS_ONLY
@ -1433,7 +1436,7 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1449,7 +1452,7 @@ struct op_mul_scale
template<>
struct op_mul_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1574,7 +1577,7 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1596,7 +1599,7 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1610,7 +1613,7 @@ struct op_div_scale<float, float, v_float32>
template<>
struct op_div_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
@ -1682,7 +1685,7 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1698,7 +1701,7 @@ struct op_add_scale
template<>
struct op_add_scale<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_alpha = vx_setall_f64(*scalar);
@ -1715,7 +1718,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1733,7 +1736,7 @@ struct op_add_weighted
template<>
struct op_add_weighted<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
{
const v_float64 v_alpha = vx_setall_f64(scalars[0]);
@ -1832,7 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1854,7 +1857,7 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
@ -1868,7 +1871,7 @@ struct op_recip<float, float, v_float32>
template<>
struct op_recip<double, double, v_float64>
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline v_float64 r(const v_float64& a, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);

View File

@ -414,4 +414,4 @@ inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2,
#if !ARITHM_USE_IPP
#define ARITHM_CALL_IPP(...)
#endif
#endif

View File

@ -64,8 +64,6 @@ namespace cv
Discrete Fourier Transform
\****************************************************************************************/
#define CV_MAX_LOCAL_DFT_SIZE (1 << 15)
static unsigned char bitrevTab[] =
{
0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,

View File

@ -69,10 +69,14 @@
/**
Add: _dst[i] = src1[i] + src2[i]_ @n
Sub: _dst[i] = src1[i] - src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_addsub Element-wise add and subtract
//! @{
@ -96,10 +100,14 @@ inline int hal_ni_sub64f(const double *src1_data, size_t src1_step, const double
/**
Minimum: _dst[i] = min(src1[i], src2[i])_ @n
Maximum: _dst[i] = max(src1[i], src2[i])_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_minmax Element-wise minimum or maximum
//! @{
@ -122,11 +130,14 @@ inline int hal_ni_min64f(const double *src1_data, size_t src1_step, const double
/**
Absolute difference: _dst[i] = | src1[i] - src2[i] |_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param scale additional multiplier
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_absdiff Element-wise absolute difference
//! @{
@ -144,10 +155,14 @@ Bitwise AND: _dst[i] = src1[i] & src2[i]_ @n
Bitwise OR: _dst[i] = src1[i] | src2[i]_ @n
Bitwise XOR: _dst[i] = src1[i] ^ src2[i]_ @n
Bitwise NOT: _dst[i] = !src[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
*/
//! @addtogroup core_hal_interface_logical Bitwise logical operations
//! @{
@ -201,10 +216,14 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
/**
Compare: _dst[i] = src1[i] op src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param operation one of (CV_HAL_CMP_EQ, CV_HAL_CMP_GT, ...)
*/
//! @addtogroup core_hal_interface_compare Element-wise compare
@ -230,10 +249,14 @@ inline int hal_ni_cmp64f(const double *src1_data, size_t src1_step, const double
/**
Multiply: _dst[i] = scale * src1[i] * src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scale additional multiplier
*/
//! @addtogroup core_hal_interface_multiply Element-wise multiply
@ -249,10 +272,14 @@ inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double
/**
Divide: _dst[i] = scale * src1[i] / src2[i]_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scale additional multiplier
*/
//! @addtogroup core_hal_interface_divide Element-wise divide
@ -268,9 +295,12 @@ inline int hal_ni_div64f(const double *src1_data, size_t src1_step, const double
/**
Computes reciprocial: _dst[i] = scale / src[i]_
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scale additional multiplier
*/
//! @addtogroup core_hal_interface_reciprocial Element-wise reciprocial
@ -310,10 +340,14 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
/**
Computes weighted sum of two arrays using formula: _dst[i] = a * src1[i] + b * src2[i] + c_
@param src1_data,src1_step first source image data and step
@param src2_data,src2_step second source image data and step
@param dst_data,dst_step destination image data and step
@param width,height dimensions of the images
@param src1_data first source image data
@param src1_step first source image step
@param src2_data second source image data
@param src2_step second source image step
@param dst_data destination image data
@param dst_step destination image step
@param width width of the images
@param height height of the images
@param scalars numbers _a_, _b_, and _c_
*/
//! @addtogroup core_hal_interface_addWeighted Element-wise weighted sum
@ -381,7 +415,8 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
/**
@param y,x source Y and X arrays
@param y source Y array
@param x source X array

@param dst destination array
@param len length of arrays
@param angleInDegrees if set to true return angles in degrees, otherwise in radians
@ -399,7 +434,8 @@ inline int hal_ni_fastAtan64f(const double* y, const double* x, double* dst, int
/**
@param x,y source X and Y arrays
@param x source X array
@param y source Y array
@param dst destination array
@param len length of arrays
*/
@ -530,7 +566,8 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
/**
@param context double pointer to context storing all necessary data
@param width,height image dimensions
@param width image width
@param height image height
@param depth image type (CV_32F or CV_64F)
@param src_channels number of channels in input image
@param dst_channels number of channels in output image
@ -540,8 +577,10 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
inline int hal_ni_dftInit2D(cvhalDFT **context, int width, int height, int depth, int src_channels, int dst_channels, int flags, int nonzero_rows) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@param context pointer to context storing all necessary data
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
*/
inline int hal_ni_dft2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@ -557,15 +596,18 @@ inline int hal_ni_dftFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
/**
@param context double pointer to context storing all necessary data
@param width,height image dimensions
@param width image width
@param height image height
@param depth image type (CV_32F or CV_64F)
@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
*/
inline int hal_ni_dctInit2D(cvhalDFT **context, int width, int height, int depth, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@param context pointer to context storing all necessary data
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
*/
inline int hal_ni_dct2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@ -717,11 +759,15 @@ inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* s
/**
@brief Finds the global minimum and maximum in an array.
@param src_data,src_step Source image
@param width,height Source image dimensions
@param src_data Source image data
@param src_step Source image step
@param width Source image width
@param height Source image height
@param depth Depth of source image
@param minVal,maxVal Pointer to the returned global minimum and maximum in an array.
@param minIdx,maxIdx Pointer to the returned minimum and maximum location.
@param minVal Pointer to the returned global minimum.
@param maxVal Pointer to the returned global maximum.
@param minIdx Pointer to the returned minimum location.
@param maxIdx Pointer to the returned maximum location.
@param mask Specified array region.
*/
inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal,
@ -731,6 +777,47 @@ inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, i
#define cv_hal_minMaxIdx hal_ni_minMaxIdx
//! @endcond
/**
@brief Flips a 2D image around vertical, horizontal, or both axes
@param src_type source and destination image type
@param src_data source image data
@param src_step source image step
@param src_width source and destination image width
@param src_height source and destination image height
@param dst_data destination image data
@param dst_step destination image step
@param flip_mode 0 flips around x-axis, positive around y-axis, negative both
*/
inline int hal_ni_flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int flip_mode) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @cond IGNORED
#define cv_hal_flip hal_ni_flip
//! @endcond
/**
@brief Rotates a 2D image clockwise by 90, 180 or 270 degrees
@param src_type source and destination image type
@param src_data source image data
@param src_step source image step
@param src_width source image width
If angle has value [180] it is also destination image width
If angle has values [90, 270] it is also destination image height
@param src_height source image height
If angle has value [180] it is also destination image height
If angle has values [90, 270] it is also destination image width
@param dst_data destination image data
@param dst_step destination image step
@param angle clockwise angle for rotation in degrees from set [90, 180, 270]
*/
inline int hal_ni_rotate90(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int angle) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @cond IGNORED
#define cv_hal_rotate90 hal_ni_rotate90
//! @endcond
//! @}
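The two hooks above are new HAL extension points; cv::flip invokes the flip hook through CALL_HAL further down in this merge, and the reworked cv::rotate is prepared to do the same for rotate90. A sketch of what a vendor-side override of cv_hal_flip could look like (assumed example, not part of this diff; only the flip_mode == 0 case is handled and everything else is deferred back to OpenCV):

#include <cstring>
#include <opencv2/core/base.hpp>   // CV_HAL_ERROR_* codes, CV_ELEM_SIZE

static int my_hal_flip(int src_type, const unsigned char* src_data, size_t src_step,
                       int src_width, int src_height,
                       unsigned char* dst_data, size_t dst_step, int flip_mode)
{
    if (flip_mode != 0)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;      // let the default implementation run
    const size_t row_bytes = (size_t)src_width * CV_ELEM_SIZE(src_type);
    for (int y = 0; y < src_height; ++y)          // flip around the x-axis: copy rows in reverse order
        std::memcpy(dst_data + (size_t)(src_height - 1 - y) * dst_step,
                    src_data + (size_t)y * src_step, row_bytes);
    return CV_HAL_ERROR_OK;
}
// In a custom HAL replacement header the hook would then be redirected, e.g.:
// #undef  cv_hal_flip
// #define cv_hal_flip my_hal_flip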

View File

@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len )
{
bool res = false;
const uchar* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_uint8 v_type;
const v_type v_zero = vx_setzero_u8();
constexpr const int unrollCount = 2;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const uchar* srcSimdEnd = src+len0;
@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
res = v_check_any(((v0 | v1) != v_zero));
src += VTraits<v_type>::vlanes();
res = v_check_any((v_ne(v_or(v0, v1), v_zero)));
}
v_cleanup();
@ -114,11 +114,11 @@ static bool hasNonZero16u( const ushort* src, size_t len )
{
bool res = false;
const ushort* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_uint16 v_type;
const v_type v_zero = vx_setzero_u16();
constexpr const int unrollCount = 4;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const ushort* srcSimdEnd = src+len0;
@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
res = v_check_any(((v0 | v2) != v_zero));
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
res = v_check_any((v_ne(v_or(v0, v2), v_zero)));
}
v_cleanup();
@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len )
{
bool res = false;
const int* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_int32 v_type;
const v_type v_zero = vx_setzero_s32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const int* srcSimdEnd = src+len0;
@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v0 |= v2;
v4 |= v6;
res = v_check_any(((v0 | v4) != v_zero));
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
res = v_check_any((v_ne(v_or(v0, v4), v_zero)));
}
v_cleanup();
@ -193,11 +193,11 @@ static bool hasNonZero32f( const float* src, size_t len )
{
bool res = false;
const float* srcEnd = src+len;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
typedef v_float32 v_type;
const v_type v_zero = vx_setzero_f32();
constexpr const int unrollCount = 8;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const float* srcSimdEnd = src+len0;
@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v0 |= v2;
v4 |= v6;
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
//res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v4) == v_zero));
res = !v_check_all((v_eq(v_or(v0, v4), v_zero)));
}
v_cleanup();
@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len )
{
bool res = false;
const double* srcEnd = src+len;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
typedef v_float64 v_type;
const v_type v_zero = vx_setzero_f64();
constexpr const int unrollCount = 16;
int step = v_type::nlanes * unrollCount;
int step = VTraits<v_type>::vlanes() * unrollCount;
int len0 = len & -step;
const double* srcSimdEnd = src+len0;
@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len )
while(!res && countSIMD--)
{
v_type v0 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v1 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v2 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v3 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v4 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v5 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v6 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v7 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v8 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v9 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v10 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v11 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v12 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v13 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v14 = vx_load(src);
src += v_type::nlanes;
src += VTraits<v_type>::vlanes();
v_type v15 = vx_load(src);
src += v_type::nlanes;
v0 |= v1;
v2 |= v3;
v4 |= v5;
v6 |= v7;
v8 |= v9;
v10 |= v11;
v12 |= v13;
v14 |= v15;
src += VTraits<v_type>::vlanes();
v0 = v_or(v0, v1);
v2 = v_or(v2, v3);
v4 = v_or(v4, v5);
v6 = v_or(v6, v7);
v8 = v_or(v8, v9);
v10 = v_or(v10, v11);
v12 = v_or(v12, v13);
v14 = v_or(v14, v15);
v0 |= v2;
v4 |= v6;
v8 |= v10;
v12 |= v14;
v0 = v_or(v0, v2);
v4 = v_or(v4, v6);
v8 = v_or(v8, v10);
v12 = v_or(v12, v14);
v0 |= v4;
v8 |= v12;
v0 = v_or(v0, v4);
v8 = v_or(v8, v12);
//res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
res = !v_check_all(((v0 | v8) == v_zero));
res = !v_check_all((v_eq(v_or(v0, v8), v_zero)));
}
v_cleanup();

View File

@ -276,7 +276,7 @@ template<typename T> struct VBLAS
int givens(T*, T*, int, T, T) const { return 0; }
};
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
{
if( n < 2*VTraits<v_float32>::vlanes() )

View File

@ -2549,6 +2549,7 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double dotProd_32s(const int* src1, const int* src2, int len)
{
#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
// Test failed on RVV(QEMU): Too big difference (=1.20209e-08 > 1.11022e-12)
double r = .0;
int i = 0;
const int step = VTraits<v_int32>::vlanes();

View File

@ -4,6 +4,7 @@
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/detail/dispatch_helper.impl.hpp"
#include <algorithm> // std::swap_ranges
@ -802,6 +803,9 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
_dst.create( size, type );
Mat dst = _dst.getMat();
CALL_HAL(flip, cv_hal_flip, type, src.ptr(), src.step, src.cols, src.rows,
dst.ptr(), dst.step, flip_mode);
CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode));
size_t esz = CV_ELEM_SIZE(type);
@ -1075,10 +1079,8 @@ void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
}
}
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
static void rotateImpl(InputArray _src, OutputArray _dst, int rotateMode)
{
CV_Assert(_src.dims() <= 2);
switch (rotateMode)
{
case ROTATE_90_CLOCKWISE:
@ -1097,4 +1099,51 @@ void rotate(InputArray _src, OutputArray _dst, int rotateMode)
}
}
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
{
CV_Assert(_src.dims() <= 2);
int angle;
if (_dst.isUMat())
{
rotateImpl(_src, _dst, rotateMode);
return;
}
Mat src = _src.getMat();
int type = src.type();
if( src.empty() )
{
_dst.release();
return;
}
switch (rotateMode)
{
case ROTATE_90_CLOCKWISE:
_dst.create(src.cols, src.rows, type);
angle = 90;
break;
case ROTATE_180:
_dst.create(src.rows, src.cols, type);
angle = 180;
break;
case ROTATE_90_COUNTERCLOCKWISE:
_dst.create(src.cols, src.rows, type);
angle = 270;
break;
default:
_dst.create(src.rows, src.cols, type);
angle = 0;
break;
}
Mat dst = _dst.getMat();
CALL_HAL(rotate90, cv_hal_rotate90, type, src.ptr(), src.step, src.cols, src.rows,
dst.ptr(), dst.step, angle);
// use src (Mat) since _src (InputArray) is updated by _dst.create() when in-place
rotateImpl(src, _dst, rotateMode);
}
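// Usage sketch (editor's addition, not part of this commit): the public behaviour
// of cv::rotate is unchanged. The CALL_HAL(rotate90, ...) hook lets a HAL back end
// service the request before the generic rotateImpl path runs, and the explicit
// _dst.create(...) calls give the HAL a destination with swapped dimensions for
// the 90/270-degree cases.
static void rotate_usage_sketch()   // hypothetical, for illustration only
{
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(0)), dst;
    cv::rotate(src, dst, cv::ROTATE_90_CLOCKWISE);
    CV_Assert(dst.rows == src.cols && dst.cols == src.rows);   // 640 rows x 480 cols
}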
} // namespace

View File

@ -24,7 +24,7 @@ struct SumSqr_SIMD
}
};
#if CV_SIMD || CV_SIMD_SCALABLE
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct SumSqr_SIMD<uchar, int, int>

View File

@ -1546,9 +1546,9 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
if (!src.empty() && mask.empty())
{
if( minidx == 0 )
minidx = 1;
if( maxidx == 0 )
maxidx = 1;
minidx = 1;
if( maxidx == 0 )
maxidx = 1;
}
if( minidx == 0 )

View File

@ -791,7 +791,7 @@ int getThreadNum()
return 0;
#endif
#elif defined HAVE_HPX
return (int)(hpx::get_num_worker_threads());
return (int)(hpx::get_num_worker_threads());
#elif defined HAVE_OPENMP
return omp_get_thread_num();
#elif defined HAVE_GCD

View File

@ -367,4 +367,4 @@ size_t base64::RawDataToBinaryConvertor::make_to_binary_funcs(const std::string
return offset_packed;
}
}
}

View File

@ -124,4 +124,4 @@ private:
}
}
#endif
#endif

View File

@ -306,9 +306,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); }
| The values to return on conversions to 32-bit integer formats that raise an
| invalid exception.
*----------------------------------------------------------------------------*/
#define ui32_fromPosOverflow 0xFFFFFFFF
#define ui32_fromNegOverflow 0
#define ui32_fromNaN 0xFFFFFFFF
#define i32_fromPosOverflow 0x7FFFFFFF
#define i32_fromNegOverflow (-0x7FFFFFFF - 1)
#define i32_fromNaN 0x7FFFFFFF
@ -317,9 +314,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); }
| The values to return on conversions to 64-bit integer formats that raise an
| invalid exception.
*----------------------------------------------------------------------------*/
#define ui64_fromPosOverflow UINT64_C( 0xFFFFFFFFFFFFFFFF )
#define ui64_fromNegOverflow 0
#define ui64_fromNaN UINT64_C( 0xFFFFFFFFFFFFFFFF )
#define i64_fromPosOverflow UINT64_C( 0x7FFFFFFFFFFFFFFF )
//fixed unsigned unary minus: -x == ~x + 1
//#define i64_fromNegOverflow (-UINT64_C( 0x7FFFFFFFFFFFFFFF ) - 1)
@ -422,34 +416,6 @@ struct uint64_extra { uint64_t v, extra; };
struct uint128_extra { struct uint128 v; uint64_t extra; };
#endif
/*----------------------------------------------------------------------------
| These macros are used to isolate the differences in word order between big-
| endian and little-endian platforms.
*----------------------------------------------------------------------------*/
#ifndef WORDS_BIGENDIAN
#define wordIncr 1
#define indexWord( total, n ) (n)
#define indexWordHi( total ) ((total) - 1)
#define indexWordLo( total ) 0
#define indexMultiword( total, m, n ) (n)
#define indexMultiwordHi( total, n ) ((total) - (n))
#define indexMultiwordLo( total, n ) 0
#define indexMultiwordHiBut( total, n ) (n)
#define indexMultiwordLoBut( total, n ) 0
#define INIT_UINTM4( v3, v2, v1, v0 ) { v0, v1, v2, v3 }
#else
#define wordIncr -1
#define indexWord( total, n ) ((total) - 1 - (n))
#define indexWordHi( total ) 0
#define indexWordLo( total ) ((total) - 1)
#define indexMultiword( total, m, n ) ((total) - 1 - (m))
#define indexMultiwordHi( total, n ) 0
#define indexMultiwordLo( total, n ) ((total) - (n))
#define indexMultiwordHiBut( total, n ) 0
#define indexMultiwordLoBut( total, n ) (n)
#define INIT_UINTM4( v3, v2, v1, v0 ) { v3, v2, v1, v0 }
#endif
enum {
softfloat_mulAdd_subC = 1,
softfloat_mulAdd_subProd = 2

View File

@ -220,4 +220,4 @@ void split64s(const int64* src, int64** dst, int len, int cn )
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace
}} // namespace

View File

@ -672,7 +672,7 @@ static void inRangeS(const Mat& src, const Scalar& lb, const Scalar& rb, Mat& ds
}
} // namespace
CVTEST_GUARD_SYMBOL(inRange);
CVTEST_GUARD_SYMBOL(inRange)
struct InRangeSOp : public BaseArithmOp
{
@ -1202,7 +1202,7 @@ struct MeanOp : public BaseArithmOp
MeanOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
{
context = 3;
};
}
void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
{
dst.create(1, 1, CV_64FC4);
@ -1225,7 +1225,7 @@ struct SumOp : public BaseArithmOp
SumOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
{
context = 3;
};
}
void op(const vector<Mat>& src, Mat& dst, const Mat&)
{
dst.create(1, 1, CV_64FC4);
@ -1285,7 +1285,7 @@ struct MeanStdDevOp : public BaseArithmOp
{
cn = 0;
context = 7;
};
}
void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
{
dst.create(1, 2, CV_64FC4);
@ -1326,7 +1326,7 @@ struct NormOp : public BaseArithmOp
{
context = 1;
normType = 0;
};
}
int getRandomType(RNG& rng)
{
int type = cvtest::randomType(rng, baseArithmTypeMask, 1, 4);
@ -1372,7 +1372,7 @@ struct MinMaxLocOp : public BaseArithmOp
MinMaxLocOp() : BaseArithmOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
{
context = ARITHM_MAX_NDIMS*2 + 2;
};
}
int getRandomType(RNG& rng)
{
return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);
@ -1419,7 +1419,7 @@ struct reduceArgMinMaxOp : public BaseArithmOp
isLast(false), isMax(false), axis(0)
{
context = ARITHM_MAX_NDIMS*2 + 2;
};
}
int getRandomType(RNG& rng) override
{
return cvtest::randomType(rng, baseArithmTypeMask, 1, 1);

View File

@ -435,6 +435,8 @@ protected:
CV_Assert( ov1 == v1 );
CV_Assert( osc1 == sc1 );
CV_Assert( og1 == g1 );
fs.release();
remove(fname.c_str());
}
catch(...)
{
@ -489,6 +491,7 @@ TEST(Core_InputOutput, FileStorage)
char arr[66];
snprintf(arr, sizeof(arr), "snprintf is hell %d", 666);
EXPECT_NO_THROW(f << arr);
remove(file.c_str());
}
TEST(Core_InputOutput, FileStorageKey)
@ -534,6 +537,7 @@ TEST(Core_InputOutput, FileStorageSpaces)
ASSERT_STREQ(values[i].c_str(), valuesReadAppend[i].c_str());
}
g3.release();
EXPECT_EQ(0, remove(fileName.c_str()));
}
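// Pattern sketch (editor's addition, not part of this commit): the io tests are
// being moved from fixed file names in the working directory to cv::tempfile(),
// and each test now removes its file on success.
//
//     std::string name = cv::tempfile("example.yml");   // unique path in temp dir
//     {
//         cv::FileStorage fs(name, cv::FileStorage::WRITE);
//         fs << "value" << 42;
//     }
//     EXPECT_EQ(0, remove(name.c_str()));               // clean up after the test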
struct data_t
@ -585,12 +589,15 @@ struct data_t
static void test_filestorage_basic(int write_flags, const char* suffix_name, bool testReadWrite, bool useMemory = false)
{
const bool generateTestData = false; // enable to regenerate reference in opencv_extra
const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
CV_Assert(test_info);
std::string name = (std::string(test_info->test_case_name()) + "--" + test_info->name() + suffix_name);
std::string name_34 = string(cvtest::TS::ptr()->get_data_path()) + "io/3_4/" + name;
if (!testReadWrite)
if (!testReadWrite || generateTestData)
name = string(cvtest::TS::ptr()->get_data_path()) + "io/" + name;
else
name = cv::tempfile(name.c_str());
{
const size_t rawdata_N = 40;
@ -636,10 +643,7 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, bool testReadWrite, bool useMemory = false)
rawdata.push_back(tmp);
}
}
#ifdef GENERATE_TEST_DATA
#else
if (testReadWrite || useMemory)
#endif
if (testReadWrite || useMemory || generateTestData)
{
cv::FileStorage fs(name, write_flags + (useMemory ? cv::FileStorage::MEMORY : 0));
fs << "normal_2d_mat" << _2d_out;
@ -761,9 +765,13 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
ASSERT_EQ(_rd_in.dims , _rd_out.dims);
ASSERT_EQ(_rd_in.depth(), _rd_out.depth());
if (useMemory) {
if (useMemory)
{
EXPECT_EQ(0, cv::norm(_rd_in, _rd_out, NORM_INF));
}
if (testReadWrite && !useMemory && !generateTestData) {
EXPECT_EQ(0, remove(name.c_str()));
}
}
}
@ -810,7 +818,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow)
const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
CV_Assert(test_info);
std::string name = std::string(test_info->test_case_name()) + "--" + test_info->name();
std::string name = cv::tempfile();
const char data[] = {0x00, 0x2f, 0x4a, 0x4a, 0x50, 0x4a, 0x4a };
std::ofstream file;
@ -822,6 +830,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow)
// This just shouldn't segfault, otherwise it's fine
EXPECT_ANY_THROW(FileStorage(name, FileStorage::READ));
EXPECT_EQ(0, remove(name.c_str()));
}
TEST(Core_InputOutput, filestorage_base64_valid_call)
@ -832,18 +841,6 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
: (std::string(test_info->test_case_name()) + "--" + test_info->name());
char const * filenames[] = {
"core_io_base64_other_test.yml",
"core_io_base64_other_test.xml",
"core_io_base64_other_test.json",
"core_io_base64_other_test.yml?base64",
"core_io_base64_other_test.xml?base64",
"core_io_base64_other_test.json?base64",
0
};
char const * real_name[] = {
"core_io_base64_other_test.yml",
"core_io_base64_other_test.xml",
"core_io_base64_other_test.json",
"core_io_base64_other_test.yml",
"core_io_base64_other_test.xml",
"core_io_base64_other_test.json",
@ -855,14 +852,16 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
for (int n = 0; n < 6; n++)
{
char const* suffix_name = filenames[n];
SCOPED_TRACE(suffix_name);
std::string name = basename + '_' + suffix_name;
std::string file_name = basename + '_' + real_name[n];
const int idx = n / 2;
const std::string mode_suffix = (n % 2 == 0) ? "" : "?base64";
std::string suffix_name = basename + "_" + filenames[idx];
std::string file_name = cv::tempfile(suffix_name.c_str());
std::string mode_file_name = file_name + mode_suffix;
SCOPED_TRACE(mode_file_name);
EXPECT_NO_THROW(
{
cv::FileStorage fs(name, cv::FileStorage::WRITE_BASE64);
cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE_BASE64);
fs << "manydata" << "[";
fs << "[:";
@ -890,7 +889,7 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
EXPECT_NO_THROW(
{
cv::FileStorage fs(name, cv::FileStorage::WRITE);
cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE);
fs << "manydata" << "[";
fs << str_out;
@ -934,10 +933,10 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call)
0
};
for (char const ** ptr = filenames; *ptr; ptr++)
for (int idx = 0; idx < 3; ++idx)
{
char const * suffix_name = *ptr;
std::string name = basename + '_' + suffix_name;
const string base_suffix = basename + '_' + filenames[idx];
std::string name = cv::tempfile(base_suffix.c_str());
EXPECT_NO_THROW({
cv::FileStorage fs(name, cv::FileStorage::WRITE);
@ -958,7 +957,7 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call)
TEST(Core_InputOutput, filestorage_yml_vec2i)
{
const std::string file_name = "vec2i.yml";
const std::string file_name = cv::tempfile("vec2i.yml");
cv::Vec2i vec(2, 1), ovec;
/* write */
@ -1040,7 +1039,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
}
}
String fileName = "vec_vec_io_test.";
String basename = "vec_vec_io_test.";
std::vector<String> formats;
formats.push_back("xml");
@ -1049,11 +1048,13 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
for(size_t i = 0; i < formats.size(); i++)
{
FileStorage writer(fileName + formats[i], FileStorage::WRITE);
const String basename_plus(basename + formats[i]);
const String fileName = tempfile(basename_plus.c_str());
FileStorage writer(fileName, FileStorage::WRITE);
writer << "vecVecMat" << outputMats;
writer.release();
FileStorage reader(fileName + formats[i], FileStorage::READ);
FileStorage reader(fileName, FileStorage::READ);
std::vector<std::vector<Mat> > testMats;
reader["vecVecMat"] >> testMats;
@ -1070,7 +1071,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
}
reader.release();
remove((fileName + formats[i]).c_str());
remove(fileName.c_str());
}
}
@ -1661,7 +1662,7 @@ TEST(Core_InputOutput, FileStorage_json_bool)
TEST(Core_InputOutput, FileStorage_free_file_after_exception)
{
const std::string fileName = "FileStorage_free_file_after_exception_test.yml";
const std::string fileName = cv::tempfile("FileStorage_free_file_after_exception_test.yml");
const std::string content = "%YAML:1.0\n cameraMatrix;:: !<tag:yaml.org,2002:opencv-matrix>\n";
std::fstream testFile;
@ -1684,11 +1685,11 @@ TEST(Core_InputOutput, FileStorage_free_file_after_exception)
TEST(Core_InputOutput, FileStorage_write_to_sequence)
{
const std::vector<std::string> formatExts = { ".yml", ".json", ".xml" };
const std::string fileName = "FileStorage_write_to_sequence";
for (const auto& ext : formatExts)
{
FileStorage fs(fileName + ext, FileStorage::WRITE);
const std::string name = tempfile(ext.c_str());
FileStorage fs(name, FileStorage::WRITE);
std::vector<int> in = { 23, 42 };
fs.startWriteStruct("some_sequence", cv::FileNode::SEQ);
for (int i : in)
@ -1696,7 +1697,7 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence)
fs.endWriteStruct();
fs.release();
FileStorage fsIn(fileName + ext, FileStorage::READ);
FileStorage fsIn(name, FileStorage::READ);
FileNode seq = fsIn["some_sequence"];
FileNodeIterator it = seq.begin(), it_end = seq.end();
std::vector<int> out;
@ -1704,12 +1705,13 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence)
out.push_back((int)*it);
EXPECT_EQ(in, out);
EXPECT_EQ(0, remove(name.c_str()));
}
}
TEST(Core_InputOutput, FileStorage_YAML_parse_multiple_documents)
{
const std::string filename = "FileStorage_YAML_parse_multiple_documents.yml";
const std::string filename = cv::tempfile("FileStorage_YAML_parse_multiple_documents.yml");
FileStorage fs;
fs.open(filename, FileStorage::WRITE);

View File

@ -475,12 +475,13 @@ TEST(Core_PCA, accuracy)
ASSERT_LE(err, diffBackPrjEps) << "bad accuracy of cvBackProjectPCA() (CV_PCA_DATA_AS_COL)";
#endif
// Test read and write
FileStorage fs( "PCA_store.yml", FileStorage::WRITE );
const std::string filename = cv::tempfile("PCA_store.yml");
FileStorage fs( filename, FileStorage::WRITE );
rPCA.write( fs );
fs.release();
PCA lPCA;
fs.open( "PCA_store.yml", FileStorage::READ );
fs.open( filename, FileStorage::READ );
lPCA.read( fs.root() );
err = cvtest::norm(rPCA.eigenvectors, lPCA.eigenvectors, NORM_L2 | NORM_RELATIVE);
EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
@ -488,6 +489,7 @@ TEST(Core_PCA, accuracy)
EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
err = cvtest::norm(rPCA.mean, lPCA.mean, NORM_L2 | NORM_RELATIVE);
EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
EXPECT_EQ(0, remove(filename.c_str()));
}
class Core_ArrayOpTest : public cvtest::BaseTest

View File

@ -588,11 +588,11 @@ CV__DNN_INLINE_NS_BEGIN
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
size_t outPlaneSize, int cn0, int cn1) const {}
virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
size_t outPlaneSize, int cn0, int cn1) const {}
virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
size_t outPlaneSize, int cn0, int cn1) const {}
};
class CV_EXPORTS ReLULayer : public ActivationLayer

View File

@ -28,22 +28,28 @@ public:
target = (dnn::Target)(int)get<1>(GetParam());
}
void processNet(std::string weights, std::string proto, const Mat& input, const std::string& outputLayer = "")
{
randu(input, 0.0f, 1.0f);
void processNet(std::string weights, std::string proto,
const std::vector<std::tuple<Mat, std::string>>& inputs, const std::string& outputLayer = ""){
weights = findDataFile(weights, false);
if (!proto.empty())
proto = findDataFile(proto);
net = readNet(proto, weights);
net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
// Set multiple inputs
for(auto &inp: inputs){
net.setInput(std::get<0>(inp), std::get<1>(inp));
}
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
MatShape netInputShape = shape(1, 3, input.rows, input.cols);
// Calculate memory consumption for all inputs
std::vector<MatShape> netMatShapes;
for(auto &inp: inputs){
netMatShapes.push_back(shape(std::get<0>(inp)));
}
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
net.getMemoryConsumption(netMatShapes, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netMatShapes);
CV_Assert(flops > 0);
net.forward(outputLayer); // warmup
@ -59,33 +65,48 @@ public:
SANITY_CHECK_NOTHING();
}
void processNet(std::string weights, std::string proto,
Mat &input, const std::string& outputLayer = "")
{
processNet(weights, proto, {std::make_tuple(input, "")}, outputLayer);
}
void processNet(std::string weights, std::string proto,
Size inpSize, const std::string& outputLayer = "")
{
Mat input_data(inpSize, CV_32FC3);
randu(input_data, 0.0f, 1.0f);
Mat input = blobFromImage(input_data, 1.0, Size(), Scalar(), false);
processNet(weights, proto, input, outputLayer);
}
};
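// Call sketch (editor's addition, not part of this commit): the new overload takes
// a list of (blob, input name) pairs so multi-input models can be benchmarked; the
// model path and input names below are hypothetical.
//
//     Mat a = blobFromImage(imgA), b = blobFromImage(imgB);
//     processNet("", "dnn/some_two_input_model.onnx",
//                {std::make_tuple(a, "input_a"), std::make_tuple(b, "input_b")});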
PERF_TEST_P_(DNNTestNetwork, AlexNet)
{
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", Mat(cv::Size(227, 227), CV_32FC3));
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", cv::Size(227, 227));
}
PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
{
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", Mat(cv::Size(224, 224), CV_32FC3));
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", cv::Size(224, 224));
}
PERF_TEST_P_(DNNTestNetwork, ResNet_50)
{
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", Mat(cv::Size(224, 224), CV_32FC3));
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", cv::Size(224, 224));
}
PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
{
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", Mat(cv::Size(227, 227), CV_32FC3));
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", cv::Size(227, 227));
}
PERF_TEST_P_(DNNTestNetwork, Inception_5h)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException("");
processNet("dnn/tensorflow_inception_graph.pb", "", Mat(cv::Size(224, 224), CV_32FC3), "softmax2");
processNet("dnn/tensorflow_inception_graph.pb", "", cv::Size(224, 224), "softmax2");
}
PERF_TEST_P_(DNNTestNetwork, ENet)
@ -97,12 +118,12 @@ PERF_TEST_P_(DNNTestNetwork, ENet)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
throw SkipTestException("");
#endif
processNet("dnn/Enet-model-best.net", "", Mat(cv::Size(512, 256), CV_32FC3));
processNet("dnn/Enet-model-best.net", "", cv::Size(512, 256));
}
PERF_TEST_P_(DNNTestNetwork, SSD)
{
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, OpenFace)
@ -111,27 +132,27 @@ PERF_TEST_P_(DNNTestNetwork, OpenFace)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL))
throw SkipTestException("");
#endif
processNet("dnn/openface_nn4.small2.v1.t7", "", Mat(cv::Size(96, 96), CV_32FC3));
processNet("dnn/openface_nn4.small2.v1.t7", "", cv::Size(96, 96));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
{
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
{
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Mat(cv::Size(224, 224), CV_32FC3));
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", cv::Size(224, 224));
}
PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
@ -140,17 +161,17 @@ PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
throw SkipTestException("");
// The same .caffemodel but modified .prototxt
// See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", Mat(cv::Size(368, 368), CV_32FC3));
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", cv::Size(368, 368));
}
PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
{
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", Mat(cv::Size(300, 300), CV_32FC3));
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", cv::Size(300, 300));
}
PERF_TEST_P_(DNNTestNetwork, YOLOv3)
@ -168,9 +189,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
processNet("dnn/yolov3.weights", "dnn/yolov3.cfg", inp);
}
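// Equivalence sketch (editor's addition, not part of this commit): blobFromImage
// with swapRB=true and scalefactor=1/255 folds the previous cvtColor + convertTo
// pair into one call, producing an NCHW blob directly:
//
//     Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), /*swapRB=*/true);
//     // was: cvtColor(sample, sample, COLOR_BGR2RGB);
//     //      sample.convertTo(inp, CV_32FC3, 1.0f / 255);  // then blobbed inside processNet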
@ -186,9 +205,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
processNet("dnn/yolov4.weights", "dnn/yolov4.cfg", inp);
}
@ -199,20 +216,39 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
throw SkipTestException("");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
processNet("dnn/yolov4-tiny-2020-12.weights", "dnn/yolov4-tiny-2020-12.cfg", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv5) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
processNet("", "dnn/yolov5n.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv8) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
processNet("", "dnn/yolov8n.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOX) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
processNet("", "dnn/yolox_s.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
{
processNet("dnn/frozen_east_text_detection.pb", "", Mat(cv::Size(320, 320), CV_32FC3));
processNet("dnn/frozen_east_text_detection.pb", "", cv::Size(320, 320));
}
PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", Mat(cv::Size(320, 240), CV_32FC3));
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", cv::Size(320, 240));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
@ -233,7 +269,8 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
"dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", Mat(cv::Size(800, 600), CV_32FC3));
"dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt",
cv::Size(800, 600));
}
PERF_TEST_P_(DNNTestNetwork, EfficientDet)
@ -241,12 +278,88 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet)
if (target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/dog416.png"));
resize(sample, sample, Size(512, 512));
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0/255);
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(512, 512), Scalar(), true);
processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", inp);
}
PERF_TEST_P_(DNNTestNetwork, EfficientNet)
{
Mat sample = imread(findDataFile("dnn/dog416.png"));
Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(224, 224), Scalar(), true);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/efficientnet-lite4.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, YuNet) {
processNet("", "dnn/onnx/models/yunet-202303.onnx", cv::Size(640, 640));
}
PERF_TEST_P_(DNNTestNetwork, SFace) {
processNet("", "dnn/face_recognition_sface_2021dec.onnx", cv::Size(112, 112));
}
PERF_TEST_P_(DNNTestNetwork, MPPalm) {
Mat inp(cv::Size(192, 192), CV_32FC3);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/palm_detection_mediapipe_2023feb.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, MPHand) {
Mat inp(cv::Size(224, 224), CV_32FC3);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/handpose_estimation_mediapipe_2023feb.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, MPPose) {
Mat inp(cv::Size(256, 256), CV_32FC3);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
transposeND(inp, {0, 2, 3, 1}, inp);
processNet("", "dnn/pose_estimation_mediapipe_2023mar.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, PPOCRv3) {
applyTestTag(CV_TEST_TAG_MEMORY_512MB);
processNet("", "dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", cv::Size(736, 736));
}
PERF_TEST_P_(DNNTestNetwork, PPHumanSeg) {
processNet("", "dnn/human_segmentation_pphumanseg_2023mar.onnx", cv::Size(192, 192));
}
PERF_TEST_P_(DNNTestNetwork, CRNN) {
Mat inp(cv::Size(100, 32), CV_32FC1);
randu(inp, 0.0f, 1.0f);
inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
processNet("", "dnn/text_recognition_CRNN_EN_2021sep.onnx", inp);
}
PERF_TEST_P_(DNNTestNetwork, ViTTrack) {
Mat inp1(cv::Size(128, 128), CV_32FC3);
Mat inp2(cv::Size(256, 256), CV_32FC3);
randu(inp1, 0.0f, 1.0f);
randu(inp2, 0.0f, 1.0f);
inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false);
inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false);
processNet("", "dnn/onnx/models/vitTracker.onnx", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
}
PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
{
if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) {
throw SkipTestException("");
}
Mat inp = imread(findDataFile("dnn/dog416.png"));
inp = blobFromImage(inp, 1.0 / 255.0, Size(320, 320), Scalar(), true);
processNet("", "dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", inp);
}
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
} // namespace

View File

@ -17,7 +17,7 @@ namespace cv { namespace dnn {
class ImportNodeWrapper
{
public:
virtual ~ImportNodeWrapper() {};
virtual ~ImportNodeWrapper() {}
virtual int getNumInputs() const = 0;
@ -33,7 +33,7 @@ public:
class ImportGraphWrapper
{
public:
virtual ~ImportGraphWrapper() {};
virtual ~ImportGraphWrapper() {}
virtual Ptr<ImportNodeWrapper> getNode(int idx) const = 0;

View File

@ -590,7 +590,7 @@ void InfEngineNgraphNet::init(Target targetId)
allBlobs[name] = ov::Tensor(src.get_element_type(), outShape, src.data());
}
ppp.output(i++).tensor().set_element_type(ov::element::f32); // Should be always FP32
ppp.output(i++).tensor().set_element_type(src.get_element_type());
}
ppp.build();
@ -840,6 +840,8 @@ ov::Tensor wrapToNgraphBlob(const Mat& m) {
return ov::Tensor(ov::element::f32, shape, m.data);
else if (m.type() == CV_8U)
return ov::Tensor(ov::element::u8, shape, m.data);
else if (m.type() == CV_8SC1)
return ov::Tensor(ov::element::i8, shape, m.data);
else if (m.type() == CV_32SC1)
return ov::Tensor(ov::element::i32, shape, m.data);
else
@ -1234,6 +1236,32 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
#endif // OpenVINO >= 2022.1
}
ngraph::Output<ngraph::Node> ngraphQuantize(ngraph::Output<ngraph::Node> input, float output_sc, float output_zp) {
float outLow = -128, outHigh = 127;
float inpLow = output_sc * (outLow - output_zp);
float inpHigh = output_sc * (outHigh - output_zp);
return std::make_shared<ngraph::op::FakeQuantize>(input,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpHigh),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outHigh),
256 // levels
);
}
ngraph::Output<ngraph::Node> ngraphDequantize(ngraph::Output<ngraph::Node> input, float input_sc, float input_zp) {
float inpLow = -128, inpHigh = 127;
float outLow = input_sc * (inpLow - input_zp);
float outHigh = input_sc * (inpHigh - input_zp);
return std::make_shared<ngraph::op::FakeQuantize>(input,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &inpHigh),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outLow),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &outHigh),
256 // levels
);
}
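// Scalar sketch (editor's addition, not part of this commit) of the affine mapping
// these FakeQuantize ranges encode. Quantization uses
//     q = saturate<int8>( round(x / sc) + zp )       with q in [-128, 127]
// and dequantization inverts it; the (inpLow, inpHigh) -> (outLow, outHigh) limits
// with 256 levels express the same per-element mapping.
static inline float dequantize_int8_sketch(int8_t q, float sc, float zp)
{
    return sc * ((float)q - zp);
}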
#endif
}}

View File

@ -148,6 +148,9 @@ private:
InferenceEngine::CNNNetwork t_net;
};
ngraph::Output<ngraph::Node> ngraphQuantize(ngraph::Output<ngraph::Node> input, float output_sc, float output_zp);
ngraph::Output<ngraph::Node> ngraphDequantize(ngraph::Output<ngraph::Node> input, float input_sc, float input_zp);
#endif // HAVE_DNN_NGRAPH
}} // namespace cv::dnn

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@ -110,7 +111,8 @@ public:
return true;
}
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@ -238,6 +240,27 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
std::vector<size_t> shape(input.get_shape().size(), 1);
shape[1] = origin_weights.total();
ngraph::Output<ngraph::Node> res;
auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, origin_weights.data);
auto ieBias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, origin_bias.data);
res = std::make_shared<ngraph::op::v1::Multiply>(input, ieWeights);
res = std::make_shared<ngraph::op::v1::Add>(res, ieBias);
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();

View File

@ -10,6 +10,7 @@
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <iostream>
#include <numeric>
@ -18,7 +19,7 @@ namespace cv
namespace dnn
{
#if CV_SIMD
#if CV_SIMD128
static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
{
@ -195,7 +196,8 @@ public:
}
#endif
// Only default backend and Conv1D/Conv2D/Conv3D are supported
return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3;
return (backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -561,6 +563,126 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(!blobs.empty());
CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
CV_CheckTypeEQ(weightsMat.type(), CV_8S, "");
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> dims = ieInpNode.get_shape();
CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
CV_Assert(ieInpNode.get_element_type() == ngraph::element::f32);
ngraph::Output<ngraph::Node> ieWeights;
if (nodes.size() > 1)
ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
const int inpCn = dims[1];
const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
const int group = inpCn / inpGroupCn;
std::vector<size_t> kernel_shape;
if (group != 1)
{
kernel_shape.push_back(group);
}
kernel_shape.push_back(numOutput / group);
kernel_shape.push_back(inpCn / group);
std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape));
if (nodes.size() == 1)
{
ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::i8, kernel_shape, blobs[0].data);
}
else
{
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{kernel_shape.size()}, std::vector<int64_t>(kernel_shape.begin(), kernel_shape.end()));
ieWeights = std::make_shared<ngraph::op::v1::Reshape>(ieWeights, shape, true);
}
ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
if (!padMode.empty())
pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
ieInpNode = ngraphDequantize(ieInpNode, input_sc, input_zp);
const float low = -128, high = 127;
std::vector<float> inpLows(numOutput, low);
std::vector<float> inpHighs(numOutput, high);
std::vector<float> outLows(numOutput);
std::vector<float> outHighs(numOutput);
std::vector<size_t> quantShape(kernel_shape.size(), 1);
if (group != 1)
{
quantShape[0] = group;
quantShape[1] = numOutput / group;
}
else
{
quantShape[0] = numOutput;
}
for (int i = 0; i < numOutput; ++i) {
outLows[i] = low * outputMultiplier[i] * output_sc / input_sc;
outHighs[i] = high * outputMultiplier[i] * output_sc / input_sc;
}
ieWeights = std::make_shared<ngraph::op::Convert>(ieWeights, ngraph::element::f32);
ieWeights = std::make_shared<ngraph::op::FakeQuantize>(ieWeights,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, inpLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, inpHighs.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, outLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, quantShape, outHighs.data()),
256 // levels
);
ngraph::Output<ngraph::Node> conv_node;
if (group != 1) {
conv_node = std::make_shared<ngraph::op::v1::GroupConvolution>(
ieInpNode, ieWeights,
ngraph::Strides(strides),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
ngraph::Strides(dilations),
pad_type);
} else {
conv_node = std::make_shared<ngraph::op::v1::Convolution>(
ieInpNode, ieWeights,
ngraph::Strides(strides),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
ngraph::Strides(dilations),
pad_type);
}
std::vector<size_t> shape(conv_node.get_shape().size(), 1);
shape[1] = conv_node.get_shape()[1];
if (biasvec.size() || nodes.size() == 3)
{
std::shared_ptr<ngraph::Node> bias;
if (nodes.size() == 3)
{
auto bias_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{shape.size()}, std::vector<int64_t>(shape.begin(), shape.end()));
bias = std::make_shared<ngraph::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
}
else
{
std::vector<float> ovBias(numOutput);
for (int i = 0; i < numOutput; ++i) {
ovBias[i] = (biasvec[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier[i] * output_sc;
}
bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), ovBias.data());
}
conv_node = std::make_shared<ngraph::op::v1::Add>(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
}
conv_node = ngraphQuantize(conv_node, output_sc, output_zp);
return new InfEngineNgraphNode(conv_node);
}
#endif // HAVE_DNN_NGRAPH
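// Editor's note (sketch, not part of this commit): assuming the usual per-channel
// multiplier convention outputMultiplier[i] = input_sc * weight_sc[i] / output_sc,
// the effective float scale of channel i's int8 weights is
//     weight_sc[i] = outputMultiplier[i] * output_sc / input_sc
// which is exactly the factor applied to the [-128, 127] limits in outLows/outHighs.
// The ovBias computation likewise appears to re-add the input_zp * sum(weight row)
// term that the int8 kernels fold into their accumulators, rescaled to float.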
class ParallelConv : public cv::ParallelLoopBody
{
public:
@ -893,7 +1015,7 @@ public:
outptr[0] = std::min(std::max(out1, -128), 127);
out_j = 1;
}
#if CV_SIMD
#if CV_SIMD128
if( stride_w == 1 )
{
const int out_delta = 16;

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>
@ -56,7 +57,7 @@ public:
return tvActType != tvActNotSupported;
}
#endif
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -244,6 +245,42 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
ngraph::Output<ngraph::Node> res;
if (type == "ReLU6Int8") {
res = std::make_shared<ngraph::op::Clamp>(input, 0.0f, 6.0f);
} else if (type == "ReLUInt8") {
if (slope) {
auto param = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &slope);
res = std::make_shared<ngraph::op::PRelu>(input, param);
} else {
res = std::make_shared<ngraph::op::Relu>(input);
}
} else if (type == "ELUInt8") {
res = std::make_shared<ngraph::op::Elu>(input, 1.0f);
} else if (type == "MishInt8") {
res = std::make_shared<ngraph::op::v4::Mish>(input);
} else if (type == "AbsValInt8") {
res = std::make_shared<ngraph::op::Abs>(input);
} else if (type == "SigmoidInt8") {
res = std::make_shared<ngraph::op::Sigmoid>(input);
} else {
CV_Error(Error::StsNotImplemented, type + " activation with OpenVINO");
}
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
@ -138,7 +139,7 @@ public:
// For TimVX Backend, only ELTWISE_CHANNNELS_SAME was supported.
if (backendId == DNN_BACKEND_TIMVX && haveTimVX())
return channelsModeInput == ELTWISE_CHANNNELS_SAME;
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -369,6 +370,38 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(nodes.size() >= 2);
std::vector<ngraph::Output<ngraph::Node>> ieInpNodes(nodes.size());
for (size_t i = 0; i < nodes.size(); i++)
{
ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
float input_sc = !coeffs.empty() ? coeffs[i] : 1.0f;
float input_zp = op == PROD ? zeropoints[i] : 0.0f;
ieInpNodes[i] = ngraphDequantize(ieInpNodes[i], input_sc, input_zp);
}
auto res = ieInpNodes[0];
for (size_t i = 1; i < ieInpNodes.size(); i++)
{
switch (op) {
case SUM: res = std::make_shared<ngraph::op::v1::Add>(res, ieInpNodes[i]); break;
case PROD: res = std::make_shared<ngraph::op::v1::Multiply>(res, ieInpNodes[i]); break;
case MAX: res = std::make_shared<ngraph::op::v1::Maximum>(res, ieInpNodes[i]); break;
default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
}
}
res = ngraphQuantize(res, 1.0f, offset);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
class EltwiseInvoker : public ParallelLoopBody
{
EltwiseLayerInt8Impl& self;

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@ -86,7 +87,8 @@ public:
return false;
}
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@ -303,7 +305,7 @@ public:
#endif
{
int i = 0;
#if CV_SIMD
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
@ -395,6 +397,77 @@ public:
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_CheckTypeEQ(blobs[0].type(), CV_8S, ""); // weights
CV_CheckTypeEQ(blobs[1].type(), CV_32S, ""); // bias
CV_CheckTypeEQ(outputMultiplier.type(), CV_32F, "");
ngraph::Output<ngraph::Node> input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
ngraph::Output<ngraph::Node> ieWeights, ieBias, matmul;
bool transA = false, transB = true;
size_t numOutput = blobs[0].size[0];
if (nodes.size() == 2)
{
CV_Error(Error::StsNotImplemented, "");
// auto inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
// matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, transA, transB);
}
else
{
std::vector<int> shape(1 + normalize_axis(axis, input.get_shape().size()), 0);
shape[shape.size() - 1] = -1;
input = std::make_shared<ngraph::op::v1::Reshape>(
input,
std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{shape.size()}, shape.data()),
true
);
input = ngraphDequantize(input, input_sc, input_zp);
const float low = -128, high = 127;
std::vector<float> inpLows(numOutput, low);
std::vector<float> inpHighs(numOutput, high);
std::vector<float> outLows(numOutput);
std::vector<float> outHighs(numOutput);
for (int i = 0; i < numOutput; ++i) {
outLows[i] = low * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
outHighs[i] = high * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
}
std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::i8, weight_shape, blobs[0].data);
ieWeights = std::make_shared<ngraph::op::Convert>(ieWeights, ngraph::element::f32);
ieWeights = std::make_shared<ngraph::op::FakeQuantize>(ieWeights,
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, inpHighs.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outLows.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numOutput, 1}, outHighs.data()),
256 // levels
);
matmul = std::make_shared<ngraph::op::MatMul>(input, ieWeights, transA, transB);
}
if (blobs.size() > 1) {
int32_t* bias = blobs[1].ptr<int32_t>();
std::vector<float> ovBias(blobs[1].total());
for (int i = 0; i < ovBias.size(); ++i) {
ovBias[i] = (bias[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier.ptr<float>()[i] * output_sc;
}
auto bias_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape{blobs[1].total()}, ovBias.data());
matmul = std::make_shared<ngraph::op::v1::Add>(matmul, bias_node);
}
matmul = ngraphQuantize(matmul, output_sc, output_zp);
return new InfEngineNgraphNode(matmul);
}
#endif // HAVE_DNN_NGRAPH
Mat weightsMat, biasMat, outputMultiplier, activationLUT;
Ptr<ActivationLayerInt8> activ;
};

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <float.h>
@ -124,6 +125,10 @@ public:
return type == MAX || type == AVE;
return false;
}
else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
return true;
}
return false;
}
@ -271,6 +276,49 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
if (!padMode.empty())
pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
auto rounding_type = ceilMode ? ngraph::op::RoundingType::CEIL : ngraph::op::RoundingType::FLOOR;
ngraph::Output<ngraph::Node> pool;
if (type == MAX) {
pool = std::make_shared<ngraph::op::v1::MaxPool>(input, ngraph::Strides(strides),
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
rounding_type, pad_type);
} else if (type == AVE) {
pool = std::make_shared<ngraph::op::v1::AvgPool>(input, ngraph::Strides(strides),
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
!avePoolPaddedArea, rounding_type, pad_type);
} else if (type == SUM) {
ngraph::Shape inpShape = input.get_shape();
CV_Assert(inpShape.size() == 2 + kernel_size.size());
std::vector<int64_t> axes;
for (size_t i = 0; i < kernel_size.size(); i++)
{
if (inpShape[2 + i] == kernel_size[i])
axes.push_back(2 + i);
}
auto reduction_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes);
pool = std::make_shared<ngraph::op::v1::ReduceSum>(input, reduction_axes, true);
} else {
CV_Error(Error::StsNotImplemented, format("INT8 Pooling type: %d", type));
}
pool = ngraphQuantize(pool, output_sc, output_zp);
return new InfEngineNgraphNode(pool);
}
#endif // HAVE_DNN_NGRAPH
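// Editor's note (sketch, not part of this commit): the SUM branch reduces only the
// spatial axes whose kernel dimension spans the full extent, so for the common
// global case an NCHW input of shape {1, C, H, W} pooled with kernel_size == {H, W}
// is lowered to ReduceSum over axes {2, 3} with keep_dims=true, giving {1, C, 1, 1}.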
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
namespace cv
{
@ -98,7 +99,8 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -171,6 +173,16 @@ public:
else
inputs[0].convertTo(outputs[0], CV_8S, 1.f/scales[0], zeropoints[0]);
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
const auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto quantized = ngraphQuantize(input, scales[0], zeropoints[0]);
return Ptr<BackendNode>(new InfEngineNgraphNode(quantized));
}
#endif // HAVE_DNN_NGRAPH
};
// Dequantize INT8 Inputs to FP32/FP16
@ -214,7 +226,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -285,6 +297,16 @@ public:
else
inputs[0].convertTo(outputs[0], CV_32F, scales[0], -(scales[0]*zeropoints[0]));
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
const auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto quantized = ngraphDequantize(input, scales[0], zeropoints[0]);
return new InfEngineNgraphNode(quantized);
}
#endif // HAVE_DNN_NGRAPH
};
// Rescale/Requantize INT8 Inputs from (scale1, zeropoint1) to (scale2, zeropoint2)

View File

@ -6,6 +6,7 @@
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include "../ie_ngraph.hpp"
namespace cv
{
@ -72,7 +73,8 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@ -186,6 +188,59 @@ public:
return flops;
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
std::vector<ngraph::Output<ngraph::Node>> ieInpNodes(nodes.size());
for (int i = 0; i < nodes.size(); ++i) {
ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
}
ieInpNodes[0] = ngraphDequantize(ieInpNodes[0], inp_sc[0], inp_zp[0]);
CV_Assert(!blobs.empty() || ieInpNodes.size() == 1 + (int)hasWeights + (int)hasBias);
ngraph::Output<ngraph::Node> weights, bias;
if (blobs.empty()) {
if (hasWeights)
weights = ieInpNodes[1];
if (hasBias)
bias = ieInpNodes[1 + (int)hasWeights];
} else {
std::vector<size_t> shape = ieInpNodes[0].get_shape();
int cAxis = normalize_axis(axis, shape.size());
size_t numWeights = blobs[0].total();
for (int i = 0; i < cAxis; ++i) {
shape[i] = 1;
}
for (int i = cAxis; i < shape.size(); ++i) {
if (numWeights == 1) {
shape[i] = 1;
}
numWeights = std::max(numWeights / shape[i], (size_t)1);
}
if (hasWeights)
weights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, blobs[0].data);
if (hasBias)
bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, blobs[(int)hasWeights].data);
}
ngraph::Output<ngraph::Node> res = ieInpNodes[0];
if (hasWeights) {
res = std::make_shared<ngraph::op::v1::Multiply>(res, weights);
}
if (hasBias) {
res = std::make_shared<ngraph::op::v1::Add>(res, bias);
}
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
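// Worked example (editor's addition, not part of this commit) for the broadcast
// shape computed above: with an input of shape {1, 64, 56, 56}, axis == 1 and 64
// weights, the loops produce {1, 64, 1, 1}: dimensions before the channel axis
// become 1, the channel axis keeps 64 (consuming all weights), and the remaining
// dimensions collapse to 1 once numWeights reaches 1, so the constants broadcast
// per channel in the Multiply/Add nodes.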
private:
bool hasWeights;
std::vector<float> inp_sc;

View File

@ -5,6 +5,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_timvx.hpp"
#include "../ie_ngraph.hpp"
#include <algorithm>
#include <stdlib.h>
@ -90,7 +91,8 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
(backendId == DNN_BACKEND_TIMVX && haveTimVX());
(backendId == DNN_BACKEND_TIMVX && haveTimVX()) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
@ -194,6 +196,26 @@ public:
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
input = ngraphDequantize(input, input_sc, input_zp);
ngraph::Output<ngraph::Node> res;
if (logSoftMax) {
res = std::make_shared<ngraph::op::v5::LogSoftmax>(input, axis);
} else {
res = std::make_shared<ngraph::op::v1::Softmax>(input, axis);
}
res = ngraphQuantize(res, output_sc, output_zp);
return new InfEngineNgraphNode(res);
}
#endif // HAVE_DNN_NGRAPH
template <bool with_log>
class SoftmaxInt8Invoker : public ParallelLoopBody {
public:

View File

@ -62,10 +62,15 @@ public:
{
std::vector<UMat> outputs;
outs.getUMatVector(outputs);
if (outs.depth() == CV_16S)
convertFp16(blobs[0], outputs[0]);
if (outs.depth() == CV_16S) {
auto blob = blobs[0];
if (blob.type() != CV_32F) {
blob.convertTo(blob, CV_32F);
}
convertFp16(blob, outputs[0]);
}
else
blobs[0].copyTo(outputs[0]);
blobs[0].convertTo(outputs[0], outputs[0].type());
return true;
}
#endif
@ -80,7 +85,7 @@ public:
std::vector<Mat> outputs;
outputs_arr.getMatVector(outputs);
blobs[0].copyTo(outputs[0]);
blobs[0].convertTo(outputs[0], outputs[0].type());
}
#ifdef HAVE_CANN
@ -123,9 +128,23 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::element::Type dType;
if (blobs[0].depth() == CV_32F) {
dType = ngraph::element::f32;
} else if (blobs[0].depth() == CV_32S) {
dType = ngraph::element::i32;
} else if (blobs[0].depth() == CV_8S) {
dType = ngraph::element::i8;
} else {
CV_Error(Error::StsNotImplemented, format("Unexpected Const data depth: %d", blobs[0].depth()));
}
std::shared_ptr<ngraph::Node> node =
std::make_shared<ngraph::op::Constant>(dType,
getShape<size_t>(blobs[0]),
blobs[0].data);
if (node->get_element_type() != ngraph::element::f32) {
node = std::make_shared<ngraph::op::Convert>(node, ngraph::element::f32);
}
return Ptr<BackendNode>(new InfEngineNgraphNode(node));
}
#endif // HAVE_DNN_NGRAPH
@ -151,7 +170,11 @@ public:
auto context = reinterpret_cast<csl::CSLContext*>(context_);
CV_Assert(blobs.size() == 1);
return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
Mat blob = blobs[0];
if (blob.type() != CV_32F) {
blob.convertTo(blob, CV_32F);
}
return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blob);
}
#endif
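Note: both non-ngraph paths above normalize the constant blob to CV_32F before handing it to the backend, while the ngraph path keeps the original element type and appends a Convert node to f32. The fallback conversion is plain cv::Mat::convertTo; a tiny sketch (hypothetical int8 blob):

#include <opencv2/core.hpp>

cv::Mat blob = cv::Mat::ones(2, 3, CV_8S);  // e.g. an int8 constant blob
cv::Mat blobF32;
blob.convertTo(blobF32, CV_32F);            // what the OpenCL/CUDA branches do before dispatch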

View File

@ -201,8 +201,6 @@ public:
};
#define IS_POWER_LAYER(layer) \
(!layer.empty() && !layer->type.compare("Power"))
//TODO: perform convolution and bias addition simultaneously for cache optimization
class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
{

View File

@ -12,16 +12,16 @@
#include <opencv2/core/hal/intrin.hpp>
#include <opencv2/core/utility.hpp> // parallel_for_
#define FAST_GEMM_DEFAULT_STORAGE (1<<20) // 2^20
#define FAST_GEMM_DEFAULT_MAX_STACKBUF (1 << 14)
#define FAST_GEMM_STORAGE (1<<20) // 2^20
#define FAST_GEMM_MAX_STACKBUF (1 << 14)
#define FAST_GEMM_DEFAULT_F32_MC 64
#define FAST_GEMM_DEFAULT_F32_NC 240
#define FAST_GEMM_DEFAULT_F32_MR 8
#define FAST_GEMM_DEFAULT_F32_NR 12
#define FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K 256
#define FAST_GEMM_F32_MC 64
#define FAST_GEMM_F32_NC 240
#define FAST_GEMM_F32_MR 8
#define FAST_GEMM_F32_NR 12
#define FAST_GEMM_F32_PACKED_STRIDE_K 64
#define FAST_GEMM_DEFAULT_IMPLEMENT_PACK(N, suffix, styp, dtyp) \
#define FAST_GEMM_IMPLEMENT_PACK(N, suffix, styp, dtyp) \
static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
int lda0, int lda1, void* packA_ ) \
{ \
@ -32,47 +32,47 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
const styp* a_ptr = A + lda0*i; \
for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
{ \
FAST_GEMM_DEFAULT_LOAD_TO_BUF_##N(styp); \
FAST_GEMM_DEFAULT_PACK##suffix##_##N(buf, packA); \
FAST_GEMM_LOAD_TO_BUF_##N(styp); \
FAST_GEMM_PACK##suffix##_##N(buf, packA); \
} \
} else { \
const styp* a_ptr[N]; \
for (int k = 0; k < N; k++) a_ptr[k] = A + lda0*(i+k < m ? i+k : i); \
for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
{ \
FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_##N(styp); \
FAST_GEMM_DEFAULT_PACK##suffix##_##N(buf, packA); \
FAST_GEMM_LOAD_TO_BUF_BORDERS_##N(styp); \
FAST_GEMM_PACK##suffix##_##N(buf, packA); \
} \
} \
} \
}
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_8(styp) \
#define FAST_GEMM_LOAD_TO_BUF_8(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7] }
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_8(styp) \
#define FAST_GEMM_LOAD_TO_BUF_BORDERS_8(styp) \
styp buf[] = { \
a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j] }
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_12(styp) \
#define FAST_GEMM_LOAD_TO_BUF_12(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7], \
a_ptr[j+lda0*8], a_ptr[j+lda0*9], a_ptr[j+lda0*10], a_ptr[j+lda0*11] }
#define FAST_GEMM_DEFAULT_LOAD_TO_BUF_BORDERS_12(styp) \
#define FAST_GEMM_LOAD_TO_BUF_BORDERS_12(styp) \
styp buf[] = { \
a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j], \
a_ptr[8][j], a_ptr[9][j], a_ptr[10][j], a_ptr[11][j] }
#define FAST_GEMM_DEFAULT_PACK_COPY(src, dst, N) \
#define FAST_GEMM_PACK_COPY(src, dst, N) \
memcpy((dst), (src), N*sizeof(src[0]))
#define FAST_GEMM_DEFAULT_PACK_f32_8(src, dst) FAST_GEMM_DEFAULT_PACK_COPY((src), (dst), 8)
#define FAST_GEMM_DEFAULT_PACK_f32_12(src, dst) FAST_GEMM_DEFAULT_PACK_COPY((src), (dst), 12)
#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8)
#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12)
namespace cv { namespace dnn { namespace cpu_baseline {
@ -88,20 +88,20 @@ void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz);
FAST_GEMM_DEFAULT_IMPLEMENT_PACK(8, _f32, float, float)
FAST_GEMM_DEFAULT_IMPLEMENT_PACK(12, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
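For a concrete check of the computation above (values chosen arbitrarily, not taken from the source):

#include <cassert>

int main() {
    // mirrors fastGemmPackBSize with the baseline tile sizes (NC block <= 240, NR = 12)
    int N = 100, K = 64;
    int GEMM_NC = 240, GEMM_NR = 12;
    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;  // 108
    int size = ((N + NC - 1) / NC) * NC * K;                                     // 1 * 108 * 64
    assert(NC == 108 && size == 6912);  // packed B is padded up to a whole number of NC-wide panels
    return 0;
}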
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_DEFAULT_F32_NC, GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K, K);
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
@ -116,140 +116,50 @@ void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0,
}
}
#if CV_SIMD128
static void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
static inline void fast_gemm_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
v_float32x4 s10 = s00, s11 = s00, s12 = s00;
v_float32x4 s20 = s00, s21 = s00, s22 = s00;
v_float32x4 s30 = s00, s31 = s00, s32 = s00;
v_float32x4 s40 = s00, s41 = s00, s42 = s00;
v_float32x4 s50 = s00, s51 = s00, s52 = s00;
v_float32x4 s60 = s00, s61 = s00, s62 = s00;
v_float32x4 s70 = s00, s71 = s00, s72 = s00;
for(int p = 0; p < k; p++, a += FAST_GEMM_DEFAULT_F32_MR, b += FAST_GEMM_DEFAULT_F32_NR) {
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 a0 = v_setall_f32(*a);
s00 = v_fma(b0, a0, s00);
s01 = v_fma(b1, a0, s01);
s02 = v_fma(b2, a0, s02);
v_float32x4 a1 = v_setall_f32(*(a + 1));
s10 = v_fma(b0, a1, s10);
s11 = v_fma(b1, a1, s11);
s12 = v_fma(b2, a1, s12);
v_float32x4 a2 = v_setall_f32(*(a + 2));
s20 = v_fma(b0, a2, s20);
s21 = v_fma(b1, a2, s21);
s22 = v_fma(b2, a2, s22);
v_float32x4 a3 = v_setall_f32(*(a + 3));
s30 = v_fma(b0, a3, s30);
s31 = v_fma(b1, a3, s31);
s32 = v_fma(b2, a3, s32);
a0 = v_setall_f32(*(a + 4));
s40 = v_fma(b0, a0, s40);
s41 = v_fma(b1, a0, s41);
s42 = v_fma(b2, a0, s42);
a1 = v_setall_f32(*(a + 5));
s50 = v_fma(b0, a1, s50);
s51 = v_fma(b1, a1, s51);
s52 = v_fma(b2, a1, s52);
a2 = v_setall_f32(*(a + 6));
s60 = v_fma(b0, a2, s60);
s61 = v_fma(b1, a2, s61);
s62 = v_fma(b2, a2, s62);
a3 = v_setall_f32(*(a + 7));
s70 = v_fma(b0, a3, s70);
s71 = v_fma(b1, a3, s71);
s72 = v_fma(b2, a3, s72);
}
v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha);
#define FAST_GEMM_FINALE(row0, row1) \
c0 = v_load(c + row0 * ldc); \
c1 = v_load(c + row0 * ldc + 4); \
c2 = v_load(c + row0 * ldc + 8); \
c3 = v_load(c + row1 * ldc); \
c4 = v_load(c + row1 * ldc + 4); \
c5 = v_load(c + row1 * ldc + 8); \
c0 = v_fma(s##row0##0, v_alpha, c0); \
c1 = v_fma(s##row0##1, v_alpha, c1); \
c2 = v_fma(s##row0##2, v_alpha, c2); \
c3 = v_fma(s##row1##0, v_alpha, c3); \
c4 = v_fma(s##row1##1, v_alpha, c4); \
c5 = v_fma(s##row1##2, v_alpha, c5); \
v_store(c + row0 * ldc, c0); \
v_store(c + row0 * ldc + 4, c1); \
v_store(c + row0 * ldc + 8, c2); \
v_store(c + row1 * ldc, c3); \
v_store(c + row1 * ldc + 4, c4); \
v_store(c + row1 * ldc + 8, c5);
FAST_GEMM_FINALE(0, 1);
FAST_GEMM_FINALE(2, 3);
FAST_GEMM_FINALE(4, 5);
FAST_GEMM_FINALE(6, 7);
#undef FAST_GEMM_FINALE
}
#else
static void fast_gemm_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
float sbuf[FAST_GEMM_DEFAULT_F32_MR * FAST_GEMM_DEFAULT_F32_NR];
float sbuf[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR];
memset(sbuf, 0, sizeof(sbuf));
for(int p = 0; p < k; p++) {
for( int i = 0; i < FAST_GEMM_DEFAULT_F32_MR; i++ ) {
float ai = a[FAST_GEMM_DEFAULT_F32_MR * p + i];
for( int j = 0; j < FAST_GEMM_DEFAULT_F32_NR; j++ )
sbuf[i * FAST_GEMM_DEFAULT_F32_NR + j] += b[FAST_GEMM_DEFAULT_F32_NR * p + j] * ai;
for( int i = 0; i < FAST_GEMM_F32_MR; i++ ) {
float ai = a[FAST_GEMM_F32_MR * p + i];
for( int j = 0; j < FAST_GEMM_F32_NR; j++ )
sbuf[i * FAST_GEMM_F32_NR + j] += b[FAST_GEMM_F32_NR * p + j] * ai;
}
}
for (int i = 0; i < FAST_GEMM_DEFAULT_F32_MR; i++) {
for (int j = 0; j < FAST_GEMM_DEFAULT_F32_NR; j++)
c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_DEFAULT_F32_NR + j];
for (int i = 0; i < FAST_GEMM_F32_MR; i++) {
for (int j = 0; j < FAST_GEMM_F32_NR; j++)
c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_F32_NR + j];
}
}
#endif // CV_SIMD128
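Both the SIMD body and the scalar fallback above compute the same register-blocked update on packed panels: for 0 <= i < MR and 0 <= j < NR,

    c[i*ldc + j] += alpha * sum_{p=0..k-1} packA[p*MR + i] * packB[p*NR + j]

with MR = 8 and NR = 12 in this baseline variant; only the instruction selection differs.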
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
double tempC[FAST_GEMM_DEFAULT_F32_MR * FAST_GEMM_DEFAULT_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_DEFAULT_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_DEFAULT_F32_NR) {
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
char* cptr0 = &c[i * ldc0_esz + j * esz];
char* cptr = cptr0;
int ldc = ldc0;
int mr = m - i < FAST_GEMM_DEFAULT_F32_MR ? m - i : FAST_GEMM_DEFAULT_F32_MR;
int nr = n - j < FAST_GEMM_DEFAULT_F32_NR ? n - j : FAST_GEMM_DEFAULT_F32_NR;
int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
int nr_esz = nr * esz;
bool partial = (bool)((mr < FAST_GEMM_DEFAULT_F32_MR) | (nr < FAST_GEMM_DEFAULT_F32_NR));
bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
if (partial) {
memset(tempC, 0, sizeof(tempC));
cptr = (char *)tempC;
ldc = FAST_GEMM_DEFAULT_F32_NR;
ldc = FAST_GEMM_F32_NR;
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
#if CV_SIMD128
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#else
fast_gemm_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#endif
if (partial) {
for(int p = 0; p < mr; p++)
@ -263,19 +173,19 @@ void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_DEFAULT_F32_MC,
GEMM_NC = FAST_GEMM_DEFAULT_F32_NC,
GEMM_MR = FAST_GEMM_DEFAULT_F32_MR,
GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = FAST_GEMM_DEFAULT_STORAGE / ((MC + NC) * esz);
int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
KC = KC > 8 ? KC : 8;
KC = KC < K ? KC : K;
size_t buff_size = KC * (MC + NC) * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_DEFAULT_MAX_STACKBUF;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
@ -328,17 +238,17 @@ void fastGemmKernel(int M, int N, int K,
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_DEFAULT_F32_MC,
GEMM_NC = FAST_GEMM_DEFAULT_F32_NC,
GEMM_MR = FAST_GEMM_DEFAULT_F32_MR,
GEMM_NR = FAST_GEMM_DEFAULT_F32_NR;
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_DEFAULT_F32_PACKED_STRIDE_K, K);
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
size_t buff_size = KC * MC * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_DEFAULT_MAX_STACKBUF;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
@ -391,3 +301,29 @@ void fastGemmKernel(int M, int N, int K,
}
}}} // cv::dnn::cpu_baseline
#undef FAST_GEMM_STORAGE
#undef FAST_GEMM_MAX_STACKBUF
#ifdef FAST_GEMM_F32_MC
#undef FAST_GEMM_F32_MC
#endif
#ifdef FAST_GEMM_F32_NC
#undef FAST_GEMM_F32_NC
#endif
#ifdef FAST_GEMM_F32_MR
#undef FAST_GEMM_F32_MR
#endif
#ifdef FAST_GEMM_F32_NR
#undef FAST_GEMM_F32_NR
#endif
#ifdef FAST_GEMM_F32_PACKED_STRIDE_K
#undef FAST_GEMM_F32_PACKED_STRIDE_K
#endif
#undef FAST_GEMM_IMPLEMENT_PACK
#undef FAST_GEMM_LOAD_TO_BUF_8
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8
#undef FAST_GEMM_LOAD_TO_BUF_12
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12
#undef FAST_GEMM_PACK_COPY
#undef FAST_GEMM_PACK_f32_8
#undef FAST_GEMM_PACK_f32_12
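For orientation, the cache-blocking numbers chosen by the unpacked fastGemmKernel above work out roughly as follows (a hedged arithmetic sketch; MC, NC and esz follow the baseline defines, K is arbitrary and assumed large):

#include <cstdio>

int main() {
    int MC = 64, NC = 240, esz = 4;              // float32 tiles, assuming M >= MC and N >= NC
    int K = 4096;
    int KC = (1 << 20) / ((MC + NC) * esz);      // FAST_GEMM_STORAGE / ((MC + NC) * esz) = 862
    KC = KC > 8 ? KC : 8;
    KC = KC < K ? KC : K;
    size_t buff = (size_t)KC * (MC + NC) * esz;  // ~1 MiB of packing space per worker
    // well above the 16 KiB FAST_GEMM_MAX_STACKBUF limit, so packed_a/packed_b come from malloc
    printf("KC = %d, buffer = %zu bytes\n", KC, buff);
    return 0;
}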

View File

@ -15,37 +15,31 @@
#define FAST_GEMM_STORAGE (1<<20) // 2^20
#define FAST_GEMM_MAX_STACKBUF (1 << 14)
#if CV_NEON
#define FAST_GEMM_F32_MC 64
#define FAST_GEMM_F32_NC 240
#elif CV_AVX
#if CV_AVX
#define FAST_GEMM_F32_MC 60
#define FAST_GEMM_F32_NC 320
#elif CV_LASX
#define FAST_GEMM_F32_MC 48
#define FAST_GEMM_F32_NC 128
#else // CV_NEON_AARCH64, SIMD128
#define FAST_GEMM_F32_MC 64
#define FAST_GEMM_F32_NC 240
#endif
// micro kernel size
#if CV_NEON && CV_NEON_AARCH64
#define FAST_GEMM_F32_MR 8
#define FAST_GEMM_F32_NR 12
#elif CV_NEON
#define FAST_GEMM_F32_MR 4
#define FAST_GEMM_F32_NR 12
#elif CV_AVX
#if CV_AVX
#define FAST_GEMM_F32_MR 12
#define FAST_GEMM_F32_NR 8
#elif CV_LASX
#define FAST_GEMM_F32_MR 12
#define FAST_GEMM_F32_NR 16
#else // CV_NEON_AARCH64, CV_SIMD128
#define FAST_GEMM_F32_MR 8
#define FAST_GEMM_F32_NR 12
#endif
#if CV_NEON
#define FAST_GEMM_F32_PACKED_STRIDE_K 64
#elif CV_AVX
#if CV_AVX
#define FAST_GEMM_F32_PACKED_STRIDE_K 128
#elif CV_LASX
#else // CV_LASX, CV_NEON_AARCH64, CV_SIMD128
#define FAST_GEMM_F32_PACKED_STRIDE_K 64
#endif
@ -75,14 +69,6 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
} \
}
#define FAST_GEMM_LOAD_TO_BUF_4(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3] }
#define FAST_GEMM_LOAD_TO_BUF_BORDERS_4(styp) \
styp buf[] = { \
a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j] }
#define FAST_GEMM_LOAD_TO_BUF_8(styp) \
styp buf[] = { \
a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
@ -121,7 +107,6 @@ static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
#define FAST_GEMM_PACK_COPY(src, dst, N) \
memcpy((dst), (src), N*sizeof(src[0]))
#define FAST_GEMM_PACK_f32_4(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 4)
#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8)
#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12)
#define FAST_GEMM_PACK_f32_16(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 16)
@ -130,7 +115,6 @@ namespace cv { namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// TODO: type to size_t
int fastGemmPackBSize(int N, int K);
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz);
@ -143,44 +127,18 @@ void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz);
// NEON (AARCH64: 32 x 128-bit registers, armv7: 16 x 128-bit registers)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_NEON_AARCH64
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
#else
FAST_GEMM_IMPLEMENT_PACK(4, _f32, float, float)
#endif
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
/*
Compute kernels optimized for different platforms
*/
#if CV_NEON && CV_NEON_AARCH64 // AARCH64: 32 x 128-bit registers
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
packed_B += _nc * kc;
}
}
}
#if CV_NEON_AARCH64
static void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
@ -258,278 +216,17 @@ static void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
#undef FAST_GEMM_FINALE
}
#else // CV_NEON_AARCH64
static void fast_gemm4x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
#elif CV_AVX // AVX and AVX2 (16 x 256-bit registers)
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00,
s10 = s00, s11 = s00, s12 = s00,
s20 = s00, s21 = s00, s22 = s00,
s30 = s00, s31 = s00, s32 = s00;
for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR)
{
float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
float32x4_t a0 = vld1q_dup_f32(a);
s00 = vmlaq_f32(a0, b0, s00);
s01 = vmlaq_f32(a0, b1, s01);
s02 = vmlaq_f32(a0, b2, s02);
a0 = vld1q_dup_f32(a + 1);
s10 = vmlaq_f32(a0, b0, s10);
s11 = vmlaq_f32(a0, b1, s11);
s12 = vmlaq_f32(a0, b2, s12);
a0 = vld1q_dup_f32(a + 2);
s20 = vmlaq_f32(a0, b0, s20);
s21 = vmlaq_f32(a0, b1, s21);
s22 = vmlaq_f32(a0, b2, s22);
a0 = vld1q_dup_f32(a + 3);
s30 = vmlaq_f32(a0, b0, s30);
s31 = vmlaq_f32(a0, b1, s31);
s32 = vmlaq_f32(a0, b2, s32);
}
float32x4_t c0, c1, c2, v_alpha = vdupq_n_f32(alpha);
#define FAST_GEMM_FINALE(row0) \
c0 = vld1q_f32(c + row0 * ldc); \
c1 = vld1q_f32(c + row0 * ldc + 4); \
c2 = vld1q_f32(c + row0 * ldc + 8); \
c0 = vmlaq_f32(c0, s##row0##0, v_alpha); \
c1 = vmlaq_f32(c1, s##row0##1, v_alpha); \
c2 = vmlaq_f32(c2, s##row0##2, v_alpha); \
vst1q_f32(c + row0 * ldc, c0); \
vst1q_f32(c + row0 * ldc + 4, c1); \
vst1q_f32(c + row0 * ldc + 8, c2);
FAST_GEMM_FINALE(0);
FAST_GEMM_FINALE(1);
FAST_GEMM_FINALE(2);
FAST_GEMM_FINALE(3);
#undef FAST_GEMM_FINALE
}
#endif // micro kernel CV_NEON_AARCH64
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
char* cptr0 = &c[i * ldc0_esz + j * esz];
char* cptr = cptr0;
int ldc = ldc0;
int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
int nr_esz = nr * esz;
bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
if (partial) {
memset(tempC, 0, sizeof(tempC));
cptr = (char *)tempC;
ldc = FAST_GEMM_F32_NR;
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
#if CV_NEON_AARCH64
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#else
fast_gemm4x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#endif
if (partial) {
for(int p = 0; p < mr; p++)
memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz);
}
}
}
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
KC = KC > 8 ? KC : 8;
KC = KC < K ? KC : K;
size_t buff_size = KC * (MC + NC) * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
char* packed_b = packed_a + KC * MC * esz;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
#if CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#else
fast_gemm_pack4_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
size_t buff_size = KC * MC * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer
const char *packed_b_ = packed_B;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
packed_b_ = packed_B + j0 * K * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
#if CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#else
fast_gemm_pack4_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
packed_b_ += _nc * kc;
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
#endif // CV_NEON, CV_NEON_AARCH64
// AVX and AVX2 (16 x 256-bit registers)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
packed_B += _nc * kc;
}
}
}
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
#if !CV_FMA3 // AVX workaround for FMA
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
static void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
static inline void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
@ -599,203 +296,12 @@ static void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, i
#undef FAST_GEMM_FINALE
}
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
#elif CV_LASX // LASX (32 x 256-bit registers)
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
char* cptr0 = &c[i * ldc0_esz + j * esz];
char* cptr = cptr0;
int ldc = ldc0;
int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
int nr_esz = nr * esz;
bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
if (partial) {
memset(tempC, 0, sizeof(tempC));
cptr = (char *)tempC;
ldc = FAST_GEMM_F32_NR;
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float) // b packer
if (partial) {
for(int p = 0; p < mr; p++)
memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz);
}
}
}
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
KC = KC > 8 ? KC : 8;
KC = KC < K ? KC : K;
size_t buff_size = KC * (MC + NC) * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
char* packed_b = packed_a + KC * MC * esz;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *packed_B, float beta, char *C, int ldc, int esz) {
int GEMM_MC = FAST_GEMM_F32_MC,
GEMM_NC = FAST_GEMM_F32_NC,
GEMM_MR = FAST_GEMM_F32_MR,
GEMM_NR = FAST_GEMM_F32_NR;
int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
size_t buff_size = KC * MC * esz;
bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
int m_tiles = (M + MC - 1) / MC;
int n_tiles = (N + NC - 1) / NC;
int total_tiles = m_tiles * n_tiles;
auto fn = [&](const Range &r) {
char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer
const char *packed_b_ = packed_B;
int start = r.start;
int end = r.end;
for (int tile_idx = start; tile_idx < end; tile_idx++) {
int i0 = (tile_idx / n_tiles) * MC;
int j0 = (tile_idx % n_tiles) * NC;
int mc = M - i0 < MC ? M - i0 : MC;
int nc = N - j0 < NC ? N - j0 : NC;
int ldc_block = ldc;
char* c_block = C + (i0 * ldc + j0) * esz;
packed_b_ = packed_B + j0 * K * esz;
if (beta == 0.f) {
for(int i = 0; i < mc; i++)
memset(c_block + i * ldc_block * esz, 0, nc * esz);
} else if (beta != 1.f) {
for(int i = 0; i < mc; i++) {
float* c_i = (float*)c_block + i * ldc_block;
for(int j = 0; j < nc; j++)
c_i[j] *= beta;
}
}
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
packed_b_ += _nc * kc;
}
}
if (!use_stackbuff) {
free(packed_a);
}
};
int total = total_tiles;
int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
#endif // CV_AVX, CV_AVX2
// LASX (32 x 256-bit registers)
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float)
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
packed_B += _nc * kc;
}
}
}
static void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
static inline void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
@ -889,9 +395,99 @@ static void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_,
#undef FAST_GEMM_FINALE
}
static void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
#elif CV_SIMD128 // armv7: 16 x 128-bit registers
FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
char *c_, int ldc, float alpha) {
const float* a = (const float*)a_;
const float* b = (const float*)b_;
float* c = (float*)c_;
v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
v_float32x4 s10 = s00, s11 = s00, s12 = s00;
v_float32x4 s20 = s00, s21 = s00, s22 = s00;
v_float32x4 s30 = s00, s31 = s00, s32 = s00;
v_float32x4 s40 = s00, s41 = s00, s42 = s00;
v_float32x4 s50 = s00, s51 = s00, s52 = s00;
v_float32x4 s60 = s00, s61 = s00, s62 = s00;
v_float32x4 s70 = s00, s71 = s00, s72 = s00;
for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) {
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
v_float32x4 a0 = v_setall_f32(*a);
s00 = v_fma(b0, a0, s00);
s01 = v_fma(b1, a0, s01);
s02 = v_fma(b2, a0, s02);
v_float32x4 a1 = v_setall_f32(*(a + 1));
s10 = v_fma(b0, a1, s10);
s11 = v_fma(b1, a1, s11);
s12 = v_fma(b2, a1, s12);
v_float32x4 a2 = v_setall_f32(*(a + 2));
s20 = v_fma(b0, a2, s20);
s21 = v_fma(b1, a2, s21);
s22 = v_fma(b2, a2, s22);
v_float32x4 a3 = v_setall_f32(*(a + 3));
s30 = v_fma(b0, a3, s30);
s31 = v_fma(b1, a3, s31);
s32 = v_fma(b2, a3, s32);
a0 = v_setall_f32(*(a + 4));
s40 = v_fma(b0, a0, s40);
s41 = v_fma(b1, a0, s41);
s42 = v_fma(b2, a0, s42);
a1 = v_setall_f32(*(a + 5));
s50 = v_fma(b0, a1, s50);
s51 = v_fma(b1, a1, s51);
s52 = v_fma(b2, a1, s52);
a2 = v_setall_f32(*(a + 6));
s60 = v_fma(b0, a2, s60);
s61 = v_fma(b1, a2, s61);
s62 = v_fma(b2, a2, s62);
a3 = v_setall_f32(*(a + 7));
s70 = v_fma(b0, a3, s70);
s71 = v_fma(b1, a3, s71);
s72 = v_fma(b2, a3, s72);
}
v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha);
#define FAST_GEMM_FINALE(row0, row1) \
c0 = v_load(c + row0 * ldc); \
c1 = v_load(c + row0 * ldc + 4); \
c2 = v_load(c + row0 * ldc + 8); \
c3 = v_load(c + row1 * ldc); \
c4 = v_load(c + row1 * ldc + 4); \
c5 = v_load(c + row1 * ldc + 8); \
c0 = v_fma(s##row0##0, v_alpha, c0); \
c1 = v_fma(s##row0##1, v_alpha, c1); \
c2 = v_fma(s##row0##2, v_alpha, c2); \
c3 = v_fma(s##row1##0, v_alpha, c3); \
c4 = v_fma(s##row1##1, v_alpha, c4); \
c5 = v_fma(s##row1##2, v_alpha, c5); \
v_store(c + row0 * ldc, c0); \
v_store(c + row0 * ldc + 4, c1); \
v_store(c + row0 * ldc + 8, c2); \
v_store(c + row1 * ldc, c3); \
v_store(c + row1 * ldc + 4, c4); \
v_store(c + row1 * ldc + 8, c5);
FAST_GEMM_FINALE(0, 1);
FAST_GEMM_FINALE(2, 3);
FAST_GEMM_FINALE(4, 5);
FAST_GEMM_FINALE(6, 7);
#undef FAST_GEMM_FINALE
}
#endif
static inline void fast_gemm_macro_kernel(int m, int n, int k,
const char *packed_A, const char *packed_B,
float alpha, char *c, int ldc0, int esz) {
int ldc0_esz = ldc0 * esz;
double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
@ -911,7 +507,15 @@ static void fast_gemm_macro_kernel(int m, int n, int k,
for(int p = 0; p < mr; p++)
memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
}
#if CV_NEON && CV_NEON_AARCH64
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#elif CV_AVX
fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#elif CV_LASX
fast_gemm12x16_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#elif CV_SIMD128
fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
#endif
if (partial) {
for(int p = 0; p < mr; p++)
@ -921,6 +525,39 @@ static void fast_gemm_macro_kernel(int m, int n, int k,
}
}
int fastGemmPackBSize(int N, int K) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
return static_cast<int>((N + NC - 1) / NC) * NC * K;
}
void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
int n_tiles = (N + NC - 1) / NC;
for (int r = 0; r < n_tiles; ++r) {
int j0 = r * NC;
int nc = N - j0 < NC ? N - j0 : NC;
int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
for (int k = 0; k < K; k += KC) {
int kc = K - k < KC ? K - k : KC;
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#elif CV_AVX
fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#elif CV_LASX
fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#elif CV_SIMD128
fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
#endif
packed_B += _nc * kc;
}
}
}
void fastGemmKernel(int M, int N, int K,
float alpha, const char *A, int lda0, int lda1,
const char *B, int ldb0, int ldb1,
@ -970,8 +607,29 @@ void fastGemmKernel(int M, int N, int K,
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
// pack a
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_AVX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_LASX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_SIMD128
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
// pack b
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#elif CV_AVX
fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#elif CV_LASX
fast_gemm_pack16_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#elif CV_SIMD128
fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
#endif
// run kernel
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
}
}
@ -1035,7 +693,18 @@ void fastGemmKernel(int M, int N, int K,
for(int k0 = 0; k0 < K; k0 += KC)
{
int kc = K - k0 < KC ? K - k0 : KC;
// pack a
#if CV_NEON && CV_NEON_AARCH64
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_AVX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_LASX
fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#elif CV_SIMD128
fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
#endif
// run kernel
fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
packed_b_ += _nc * kc;
}
@ -1052,8 +721,37 @@ void fastGemmKernel(int M, int N, int K,
parallel_for_(Range(0, total), fn, nstripes);
}
#endif // CV_LASX
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // cv::dnn
#undef FAST_GEMM_STORAGE
#undef FAST_GEMM_MAX_STACKBUF
#ifdef FAST_GEMM_F32_MC
#undef FAST_GEMM_F32_MC
#endif
#ifdef FAST_GEMM_F32_NC
#undef FAST_GEMM_F32_NC
#endif
#ifdef FAST_GEMM_F32_MR
#undef FAST_GEMM_F32_MR
#endif
#ifdef FAST_GEMM_F32_NR
#undef FAST_GEMM_F32_NR
#endif
#ifdef FAST_GEMM_F32_PACKED_STRIDE_K
#undef FAST_GEMM_F32_PACKED_STRIDE_K
#endif
#undef FAST_GEMM_IMPLEMENT_PACK
#undef FAST_GEMM_LOAD_TO_BUF_8
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8
#undef FAST_GEMM_LOAD_TO_BUF_12
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12
#undef FAST_GEMM_LOAD_TO_BUF_16
#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_16
#undef FAST_GEMM_PACK_COPY
#undef FAST_GEMM_PACK_f32_8
#undef FAST_GEMM_PACK_f32_12
#undef FAST_GEMM_PACK_f32_16

View File

@ -47,73 +47,76 @@ public:
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
// Get x tensor.
const auto &src_mat = inputs[0];
const auto *src_ptr = src_mat.ptr<float>();
// Get input tensor.
const auto& src_mat = inputs[0];
const auto* src_ptr = src_mat.ptr<float>();
// Get axis.
const int axis = normalize_axis(axis_raw, src_mat.dims);
// Get target axis.
int axis = inputs.size() > 1 ? parseAxis(inputs[1]) : axis_raw;
axis = normalize_axis(axis, src_mat.dims);
// Get y tensor.
auto &dst_mat = outputs[0];
src_mat.copyTo(dst_mat);
auto *dst_ptr = dst_mat.ptr<float>();
// Get output tensor.
auto& dst_mat = outputs[0];
auto* dst_ptr = dst_mat.ptr<float>();
// Get flags.
const auto exclusive = exclusive_raw == 1;
const auto reverse = reverse_raw == 1;
// Get parameters to iterate outer dimension.
// Data with [dim_1, .. , dim_k-1, target_dim, dim_k+1, .. , dim_n]
// dimensions is represented here as [outer_dim, target_dim, inner_dim]
const size_t outer_size = src_mat.total(0, axis);
const size_t outer_step_length = src_mat.total(axis);
const size_t target_size = src_mat.size[axis];
const size_t inner_size = src_mat.total(axis + 1);
const size_t outer_step_length = target_size * inner_size;
// Get parameters to iterate inner dimension.
const size_t inner_size = src_mat.size[axis];
// Calculating steps in target dimensions
const int target_start = reverse ? target_size - 1 : 0;
const int target_stop = reverse ? -1 : target_size;
const int target_delta = reverse ? -1 : 1;
const int target_step = target_delta * inner_size;
if (!inner_size)
return;
// If exclusive, the j-th output element would be the sum of the first (j-1) elements.
// Otherwise, it would be the sum of the first j elements.
const int exclusive_delta = exclusive ? target_step : 0;
const size_t inner_step_length = src_mat.total(axis + 1);
const int inner_step = (reverse ? -1 : 1) * inner_step_length;
const int inner_start = reverse ? inner_size - 1 : 0;
const int inner_stop = reverse ? -1 : inner_size;
const int inner_delta = reverse ? -1 : 1;
// Get parameters to populate channels.
const size_t num_channels = src_mat.total(axis + 1);
for (size_t outer_dim = 0; outer_dim < outer_size; outer_dim++)
for (size_t outer_idx = 0; outer_idx < outer_size; outer_idx++)
{
const size_t outer_offset = outer_dim * outer_step_length;
size_t src_offset = outer_offset + inner_start * inner_step_length;
const size_t target_offset = outer_idx * outer_step_length;
// Populate first element of inner dimension.
for (size_t channel = 0; channel < num_channels; channel++)
// Handle first element of target dimension.
size_t first_inner_offset = target_offset + target_start * inner_size;
if (exclusive)
for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
dst_ptr[first_inner_offset + inner_idx] = 0.0f;
else
for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
dst_ptr[first_inner_offset + inner_idx] = src_ptr[first_inner_offset + inner_idx];
// Handle remaining elements of target dimension.
for (int target_idx = target_start + target_delta; target_idx != target_stop; target_idx += target_delta)
{
if (exclusive)
const size_t inner_offset = target_offset + target_idx * inner_size;
for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
{
dst_ptr[src_offset + channel] = 0.0f;
}
else
{
dst_ptr[src_offset + channel] = src_ptr[src_offset + channel];
src_offset += inner_step;
dst_ptr[inner_offset + inner_idx] = dst_ptr[inner_offset - target_step + inner_idx] +
src_ptr[inner_offset - exclusive_delta + inner_idx];
}
}
}
}
// Populate remaining elements of inner dimension.
for (int inner_dim = inner_start + inner_delta; inner_dim != inner_stop; inner_dim += inner_delta)
{
const size_t dst_offset = outer_offset + inner_dim * inner_step_length;
for (size_t channel = 0; channel < num_channels; channel++)
{
const size_t previous_dst_offset = dst_offset - inner_step;
dst_ptr[dst_offset + channel] = dst_ptr[previous_dst_offset + channel] +
src_ptr[src_offset + channel];
src_offset += inner_step;
}
}
int parseAxis(const Mat& axis_mat) {
CV_CheckEQ(axis_mat.total(), 1u, "Axis tensor should contain single value");
if (axis_mat.type() == CV_32SC1)
return axis_mat.at<int32_t>(0);
else
{
Mat axis_mat_int;
axis_mat.convertTo(axis_mat_int, CV_32SC1);
return axis_mat_int.at<int32_t>(0);
}
}
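As a quick sanity check of the indexing above: for a 1-D input [1, 2, 3, 4] with axis = 0 (outer_size = 1, target_size = 4, inner_size = 1), the inclusive forward pass yields [1, 3, 6, 10]; with exclusive = 1 the first output is zeroed and each later element reads src one step behind, giving [0, 1, 3, 6]; with reverse = 1 the same recurrence runs from the far end, giving [10, 9, 7, 4] in the inclusive case.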

View File

@ -38,7 +38,6 @@ Mat batchwiseMatMul(
const Mat& input2,
const MatShape& input2ShapeOverride)
{
// Sanity checks before the actual MatMul
//input_1.DataType() == input_2.DataType(), "Data types of the inputs must match for MatMul");
@ -391,6 +390,15 @@ public:
OutputArrayOfArrays outputs_arr,
OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
// homogenize inputs
preProcessInputs(inputs_arr);

View File

@ -984,13 +984,7 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
float one = 1.0f;
auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
auto exp_node = std::make_shared<ngraph::op::v0::Exp>(node);
auto sum = std::make_shared<ngraph::op::v1::Add>(constant, exp_node, ngraph::op::AutoBroadcastType::NUMPY);
auto log_node = std::make_shared<ngraph::op::v0::Log>(sum);
auto tanh_node = std::make_shared<ngraph::op::Tanh>(log_node);
return std::make_shared<ngraph::op::v1::Multiply>(node, tanh_node);
return std::make_shared<ngraph::op::v4::Mish>(node);
}
#endif // HAVE_DNN_NGRAPH
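For reference, the subgraph removed here built mish(x) = x * tanh(ln(1 + exp(x))) out of Exp/Add/Log/Tanh/Multiply nodes; ngraph::op::v4::Mish computes the same function as a single fused op.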
@ -1190,10 +1184,7 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
float coeff = -0.999999f;
// float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeff);
return std::make_shared<ngraph::op::PRelu>(node, slope);
return std::make_shared<ngraph::op::Abs>(node);
}
#endif // HAVE_DNN_NGRAPH
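Similarly, the removed PRelu trick relied on prelu(x) = x for x >= 0 and slope * x for x < 0, so a slope of -0.999999 only approximates |x|; the replacement uses the exact Abs op.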
@ -2563,11 +2554,6 @@ template<>
const char* const ReciprocalFunctor::BaseDefaultFunctor<ReciprocalFunctor>::ocl_kernel_name = "ReciprocalForward";
#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
Ptr<_Layer> _Layer::create() { \
return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); }
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);

View File

@ -191,7 +191,6 @@ public:
size_t dims_Y = shape_Y.size();
int M = shape_Y[dims_Y - 2], N = shape_Y[dims_Y - 1];
int K = trans_a ? ma : na;
int batches = std::accumulate(shape_A.begin(), shape_A.end() - 2, 1, std::multiplies<int>());
// broadcast C and copy C to output
if (have_bias) {
@ -201,9 +200,7 @@ public:
int step = M * N;
CV_CheckEQ(broadcast_C.size(), static_cast<size_t>(step), "DNN/Gemm: C is not broadcast properly");
float *ptr_y = Y.ptr<float>();
for (int i = 0; i < batches; i++) {
std::memcpy(ptr_y + i * step, broadcast_C.data(), step * sizeof(float));
}
std::memcpy(ptr_y, broadcast_C.data(), step * sizeof(float));
} else { // initialization
float *ptr_y = Y.ptr<float>();
size_t total = Y.total();
@ -212,7 +209,6 @@ public:
if (const_B) {
CV_CheckGT(packed_B.size(), static_cast<size_t>(0), "DNN/Gemm: constant B is not pre-packed");
M *= batches;
fastGemm(trans_a, M, N, K, alpha, A.ptr<const float>(), na, packed_B.data(), 1.f, Y.ptr<float>(), N, opt);
} else {
fastGemmBatched(trans_a, trans_b, alpha, A, inputs[1], 1.f, Y, opt);

View File

@ -359,11 +359,11 @@ public:
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size());
auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
if (logSoftMax)
return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));
return Ptr<BackendNode>(new InfEngineNgraphNode(softmax));
if (logSoftMax) {
return new InfEngineNgraphNode(std::make_shared<ngraph::op::v5::LogSoftmax>(ieInpNode, axis));
} else {
return new InfEngineNgraphNode(std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis));
}
}
#endif // HAVE_DNN_NGRAPH

View File

@ -23,7 +23,7 @@ BackendNode::BackendNode(int backendId)
: backendId(backendId)
{}
BackendNode::~BackendNode() {};
BackendNode::~BackendNode() {}
BackendWrapper::BackendWrapper(int backendId, int targetId)
: backendId(backendId)

View File

@ -306,9 +306,9 @@ void ClassificationModel::classify(InputArray frame, int& classId, float& conf)
}
KeypointsModel::KeypointsModel(const String& model, const String& config)
: Model(model, config) {};
: Model(model, config) {}
KeypointsModel::KeypointsModel(const Net& network) : Model(network) {};
KeypointsModel::KeypointsModel(const Net& network) : Model(network) {}
std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
{
@ -364,9 +364,9 @@ std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
}
SegmentationModel::SegmentationModel(const String& model, const String& config)
: Model(model, config) {};
: Model(model, config) {}
SegmentationModel::SegmentationModel(const Net& network) : Model(network) {};
SegmentationModel::SegmentationModel(const Net& network) : Model(network) {}
void SegmentationModel::segment(InputArray frame, OutputArray mask)
{

View File

@ -155,11 +155,19 @@ void Net::Impl::setPreferableBackend(Net& net, int backendId)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
backendId = DNN_BACKEND_INFERENCE_ENGINE_NGRAPH; // = getInferenceEngineBackendTypeParam();
if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX)
if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX &&
backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
CV_LOG_WARNING(NULL, "DNN: Only default and TIMVX backends support quantized networks");
CV_LOG_WARNING(NULL, "DNN: Only default, TIMVX and OpenVINO backends support quantized networks");
backendId = DNN_BACKEND_OPENCV;
}
#ifdef HAVE_DNN_NGRAPH
if (netWasQuantized && backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2023_0))
{
CV_LOG_WARNING(NULL, "DNN: OpenVINO 2023.0 and higher is required to supports quantized networks");
backendId = DNN_BACKEND_OPENCV;
}
#endif
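With this relaxation a quantized network can be dispatched to OpenVINO directly. A minimal usage sketch (model path hypothetical; the calls mirror what the int8 tests below do with net.quantize and setPreferableBackend):

#include <opencv2/dnn.hpp>
#include <vector>

// calibData must hold representative float inputs for calibration; "model.onnx" is a placeholder.
static cv::dnn::Net quantizeForOpenVINO(const std::vector<cv::Mat>& calibData) {
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");
    cv::dnn::Net qnet = net.quantize(calibData, CV_8S, CV_8S);  // per-channel quantization by default
    qnet.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
    qnet.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    return qnet;
}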
if (preferableBackend != backendId)
{

View File

@ -48,7 +48,6 @@ public:
CV_Assert(basePtr_);
Net::Impl& base = *basePtr_;
CV_Assert(!base.netWasAllocated);
CV_Assert(!base.netWasQuantized);
netInputLayer = base.netInputLayer;
blobsToKeep = base.blobsToKeep;
layers = base.layers;

View File

@ -383,7 +383,7 @@ void runLayer(LayerParams& params, const std::vector<Mat>& inputs,
{
inpShapes[i] = shape(inputs[i]);
if (i > 0 && ddepth != inputs[i].depth())
CV_Error(Error::StsNotImplemented, "Mixed input data types.");
CV_Error(Error::StsNotImplemented, cv::format("Mixed input data types. Required type: %d, actual type: %d", ddepth, inputs[i].depth()));
// Quantize and Dequantize layer have different output type than input.
if (params.type != "Quantize" && params.type != "Dequantize")
@ -1502,7 +1502,7 @@ void ONNXImporter::lstm_extractConsts(LayerParams& layerParams, const opencv_onn
blob = Mat(blobShape, CV_32FC1, 0.);
}
layerParams.blobs.push_back(blob);
};
}
void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::string& output_name, int* layerShape, size_t n)
{
@ -1517,7 +1517,7 @@ void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::st
reshape_proto.add_input(input_name);
reshape_proto.add_output(output_name);
addLayer(reshapeLp, reshape_proto);
};
}
std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_name, int* begin, int* end, size_t n)
{
@ -1536,7 +1536,7 @@ std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_nam
addLayer(sliceLP, slice_proto);
return slice_proto.output(0);
};
}
std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_onnx::NodeProto& lstm_proto,
int batch_size, int num_directions, int hidden_size, bool need_y, const std::string& y_name,
@ -1564,7 +1564,7 @@ std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_o
addLayer(permuteLP, permute_proto);
return permute_proto.output(0);
};
}
void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hidden_size,
int index, const std::string& input_name, const std::string& output_name)
@ -1606,7 +1606,7 @@ void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hi
int layerShape[] = {2, batch_size, hidden_size};
lstm_add_reshape(concat_proto.output(0), output_name, layerShape, sizeof(layerShape) / sizeof(layerShape[0]));
}
};
}
void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
{

View File

@ -27,6 +27,7 @@
#define INF_ENGINE_RELEASE_2021_3 2021030000
#define INF_ENGINE_RELEASE_2021_4 2021040000
#define INF_ENGINE_RELEASE_2022_1 2022010000
#define INF_ENGINE_RELEASE_2023_0 2023000000
#ifndef INF_ENGINE_RELEASE
#warning("IE version have not been provided via command-line. Using 2021.4 by default")

View File

@ -3227,7 +3227,7 @@ void TFLayerHandler::fillRegistry(const tensorflow::GraphDef& net)
}
}
printMissing();
};
}
bool TFLayerHandler::handleMissing(const tensorflow::NodeDef& layer)
{

View File

@ -151,10 +151,12 @@ TEST_P(DNNTestNetwork, ENet)
{
applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
@ -482,7 +484,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
Mat img = imread(findDataFile("dnn/googlenet_1.png"));
Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false);
// Output image has values in range [-143.526, 148.539].
float l1 = 2e-4, lInf = 2e-3;
float l1 = 2e-4, lInf = 2.4e-3;
if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
{
l1 = 0.4;
@ -875,8 +877,12 @@ TEST_P(MaxPooling, Accuracy)
Target targetId = get<1>(get<5>(GetParam()));
// https://github.com/openvinotoolkit/openvino/issues/18731
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1))
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1)) {
int ow = ceil(static_cast<float>(inSize.width + 2 * pad.width - kernel.width) / stride.width);
int oh = ceil(static_cast<float>(inSize.height + 2 * pad.height - kernel.height) / stride.height);
if (ow * stride.width >= inSize.width + pad.width || oh * stride.height >= inSize.height + pad.height)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
}
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
@ -1026,10 +1032,12 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, SoftMax, testing::Combine(
//////////////////////////////////////////////////////////////////////////////
TEST_P(Test_layers_backends, MaxPoolUnpool)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
LayerParams pool;
pool.set("pool", "max");

View File

@ -14,6 +14,9 @@ testing::internal::ParamGenerator< tuple<Backend, Target> > dnnBackendsAndTarget
targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
#ifdef HAVE_TIMVX
targets.push_back(make_tuple(DNN_BACKEND_TIMVX, DNN_TARGET_NPU));
#endif
#ifdef HAVE_INF_ENGINE
targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU));
#endif
return testing::ValuesIn(targets);
}
@ -66,8 +69,6 @@ public:
outPath = _tf("onnx/data/output_" + basename);
}
ASSERT_FALSE(net.empty());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
for (int i = 0; i < numInps; i++)
inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? cv::format("_%d.npy", i) : ".npy"));
@ -78,6 +79,8 @@ public:
qnet = net.quantize(inps, CV_8S, CV_8S, perChannel);
qnet.getInputDetails(inputScale, inputZp);
qnet.getOutputDetails(outputScale, outputZp);
qnet.setPreferableBackend(backend);
qnet.setPreferableTarget(target);
// Quantize inputs to int8
// int8_value = float_value/scale + zero-point
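As a side note, the mapping quoted in the comment above (int8_value = float_value/scale + zero-point) is the affine quantization this test applies to its inputs, with scale and zero point taken from getInputDetails(). A minimal sketch of that mapping, assuming a CV_32F blob and per-tensor parameters (the helper name is ours, not part of this diff):

#include <opencv2/core.hpp>

// dst = saturate_cast<schar>(src / scale + zeroPoint), rounded to nearest,
// which is what convertTo does with alpha = 1/scale and beta = zeroPoint.
static cv::Mat quantizeToInt8(const cv::Mat& blob, float inputScale, int inputZp)
{
    cv::Mat q;
    blob.convertTo(q, CV_8S, 1.0 / inputScale, inputZp);
    return q;
}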
@ -98,7 +101,7 @@ public:
if (out_i.dims == 2 && ref_i.dims == 1) {
ref_i = ref_i.reshape(1, 1);
}
normAssert(ref_i, out_i, "", l1, lInf);
normAssert(ref_i, out_i, basename.c_str(), l1, lInf);
}
}
};
@ -201,10 +204,13 @@ TEST_P(Test_Int8_layers, Padding)
TEST_P(Test_Int8_layers, AvePooling)
{
testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
// Some tests fail with OpenVINO due to an incorrect padded-area calculation
if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041);
testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048);
testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006);
if (target != DNN_TARGET_CPU)
@ -220,8 +226,6 @@ TEST_P(Test_Int8_layers, MaxPooling)
throw SkipTestException("Only CPU is supported");
testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124);
/* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true
which is not supported by int8 maxpooling
testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004);
testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139);
testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012);
@ -231,7 +235,7 @@ TEST_P(Test_Int8_layers, MaxPooling)
testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052);
testLayer("maxpooling", "ONNX", 0.0034, 0.0065);
testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052);
testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/
testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);
}
TEST_P(Test_Int8_layers, Reduce)
@ -326,7 +330,10 @@ TEST_P(Test_Int8_layers, DISABLED_Softmax_unfused_ONNX) // FIXIT Support 'Ident
TEST_P(Test_Int8_layers, Concat)
{
testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false);
testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
// Crashes with segfault
testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
}
testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089);
testLayer("concat_3d", "TensorFlow", 0.005, 0.014);
testLayer("concatenation", "ONNX", 0.0032, 0.009);
@ -404,10 +411,13 @@ TEST_P(Test_Int8_layers, Reshape)
testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029);
testLayer("reshape_conv", "TensorFlow", 0.035, 0.054);
testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
testLayer("reshape_reduce", "TensorFlow", 0.0053, 0.011);
else
testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028);
testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028);
testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014);
testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.016 : 0.014);
testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079);
testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081);
testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081);
@ -495,10 +505,10 @@ TEST_P(Test_Int8_layers, Eltwise)
testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false);
testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047);
testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365
testLayer("eltwise_add_vec", "TensorFlow", 0.037, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.24 : 0.21); // tflite 0.0095, 0.0365
testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017
testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063);
testLayer("split_equals", "TensorFlow", 0.02, 0.065);
testLayer("split_equals", "TensorFlow", backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.021 : 0.02, 0.065);
testLayer("mul", "ONNX", 0.0039, 0.014);
testLayer("split_max", "ONNX", 0.004, 0.012);
}
@ -555,10 +565,10 @@ public:
Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb"));
Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb"));
Net baseNet = readNetFromONNX(onnxmodel);
baseNet.setPreferableBackend(backend);
baseNet.setPreferableTarget(target);
Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel);
qnet.setPreferableBackend(backend);
qnet.setPreferableTarget(target);
qnet.setInput(blob);
Mat out = qnet.forward();
@ -703,9 +713,6 @@ TEST_P(Test_Int8_nets, AlexNet)
#else
applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
#endif
if (backend != DNN_BACKEND_OPENCV)
throw SkipTestException("Only OpenCV backend is supported");
if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
@ -746,8 +753,6 @@ TEST_P(Test_Int8_nets, GoogLeNet)
TEST_P(Test_Int8_nets, ResNet50)
{
applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
if (backend != DNN_BACKEND_OPENCV)
throw SkipTestException("Only OpenCV backend is supported");
if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@ -778,6 +783,8 @@ TEST_P(Test_Int8_nets, DenseNet121)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false),
findDataFile("dnn/DenseNet_121.caffemodel", false));
@ -959,6 +966,8 @@ TEST_P(Test_Int8_nets, opencv_face_detector)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"),
findDataFile("dnn/opencv_face_detector.caffemodel", false));
@ -1025,7 +1034,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_resnet50)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@ -1052,7 +1062,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@ -1083,6 +1094,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_vgg16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"),
findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false));
@ -1110,6 +1123,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_zf)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"),
findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false));
@ -1142,6 +1157,9 @@ TEST_P(Test_Int8_nets, RFCN)
0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
float confThreshold = 0.8, scoreDiff = 0.15, iouDiff = 0.11;
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
iouDiff = 0.12;
}
testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
}
@ -1321,6 +1339,8 @@ TEST_P(Test_Int8_nets, YOLOv4_tiny)
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
const float confThreshold = 0.6;

View File

@ -413,10 +413,12 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc)
TEST_P(Test_Caffe_layers, Reshape_Split_Slice)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
Net net = readNetFromCaffe(_tf("reshape_and_slice_routines.prototxt"));
ASSERT_FALSE(net.empty());
@ -795,8 +797,10 @@ TEST_P(Test_Caffe_layers, DataAugmentation)
TEST_P(Test_Caffe_layers, Resample)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend != DNN_BACKEND_OPENCV)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testLayerUsingCaffeModels("nearest_2inps", false, false, 0.0, 0.0, 2);
testLayerUsingCaffeModels("nearest", false, false);
}

View File

@ -4,4 +4,4 @@
#include <hpx/hpx_main.hpp>
#endif
CV_TEST_MAIN("", initDNNTests());
CV_TEST_MAIN("", initDNNTests())

View File

@ -1236,4 +1236,4 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_conformance,
printOnnxConfParams
);
};
}

View File

@ -46,6 +46,13 @@
"test_conv_with_strides_and_asymmetric_padding",
"test_conv_with_strides_no_padding",
"test_conv_with_strides_padding",
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_div_bcast",
"test_div_uint8",
"test_dropout_default_ratio",

View File

@ -40,6 +40,13 @@
"test_cast_STRING_to_FLOAT",
"test_castlike_FLOAT_to_STRING_expanded",
"test_castlike_STRING_to_FLOAT_expanded",
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_concat_1d_axis_negative_1",
"test_div_uint8",
"test_flatten_axis0",

View File

@ -89,13 +89,6 @@
"test_convtranspose_pad",
"test_convtranspose_pads",
"test_convtranspose_with_kernel",
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_dequantizelinear",
"test_dequantizelinear_axis",
"test_det_2d",
@ -547,3 +540,11 @@
"test_xor_bcast4v2d",
"test_xor_bcast4v3d",
"test_xor_bcast4v4d",
// Cumsum related issue: https://github.com/opencv/opencv/issues/24437
"test_cumsum_1d",
"test_cumsum_1d_exclusive",
"test_cumsum_1d_reverse",
"test_cumsum_1d_reverse_exclusive",
"test_cumsum_2d_axis_0",
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",

View File

@ -681,6 +681,9 @@ TEST_P(Test_ONNX_layers, Compare_GT)
testONNXModels("greater");
}
TEST_P(Test_ONNX_layers, Greater_input_dtype_int64) {
testONNXModels("greater_input_dtype_int64");
}
TEST_P(Test_ONNX_layers, Compare_LT)
{
@ -1063,10 +1066,12 @@ TEST_P(Test_ONNX_layers, ResizeUnfused)
TEST_P(Test_ONNX_layers, ResizeUnfusedTwoInputs)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testONNXModels("upsample_unfused_two_inputs_opset9_torch1.4", npy, 0, 0, false, true, 2);
testONNXModels("upsample_unfused_two_inputs_opset11_torch1.4", npy, 0, 0, false, true, 2);
}
@ -1170,10 +1175,12 @@ TEST_P(Test_ONNX_layers, ReduceL2)
TEST_P(Test_ONNX_layers, Split)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testONNXModels("split_0");
testONNXModels("split_1");
testONNXModels("split_2");
@ -1249,10 +1256,12 @@ TEST_P(Test_ONNX_layers, Softmax)
TEST_P(Test_ONNX_layers, Split_EltwiseMax)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
testONNXModels("split_max");
}
@ -2058,12 +2067,16 @@ TEST_P(Test_ONNX_layers, Quantized_Unsqueeze)
TEST_P(Test_ONNX_layers, Quantized_Resize)
{
testONNXModels("quantized_resize_nearest");
testONNXModels("quantized_resize_bilinear", npy, 2e-4, 0.003);
testONNXModels("quantized_resize_bilinear_align", npy, 3e-4, 0.003);
double l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 2e-4;
testONNXModels("quantized_resize_bilinear", npy, l1, 0.003);
l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 3e-4;
testONNXModels("quantized_resize_bilinear_align", npy, l1, 0.003);
}
TEST_P(Test_ONNX_layers, Quantized_Concat)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
testONNXModels("quantized_concat");
testONNXModels("quantized_concat_const_blob");
}
@ -2080,6 +2093,8 @@ TEST_P(Test_ONNX_layers, OutputRegistration)
TEST_P(Test_ONNX_layers, QLinearSoftmax)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
testONNXModels("qlinearsoftmax_v11", npy, 0.002, 0.002); // 2D coerced
testONNXModels("qlinearsoftmax_v13", npy, 0.002, 0.002);
}
@ -2669,37 +2684,37 @@ TEST_P(Test_ONNX_layers, where_node)
testONNXModels("where_layer");
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_all_attributes) {
TEST_P(Test_ONNX_layers, Gemm_all_attributes) {
testONNXModels("test_gemm_all_attributes", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_alpha) {
TEST_P(Test_ONNX_layers, Gemm_alpha) {
testONNXModels("test_gemm_alpha", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_beta) {
TEST_P(Test_ONNX_layers, Gemm_beta) {
testONNXModels("test_gemm_beta", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_matrix_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_matrix_bias) {
testONNXModels("test_gemm_default_matrix_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_no_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_no_bias) {
testONNXModels("test_gemm_default_no_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_scalar_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_scalar_bias) {
testONNXModels("test_gemm_default_scalar_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_single_elem_vector_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_single_elem_vector_bias) {
testONNXModels("test_gemm_default_single_elem_vector_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_vector_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_vector_bias) {
testONNXModels("test_gemm_default_vector_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_default_zero_bias) {
TEST_P(Test_ONNX_layers, Gemm_default_zero_bias) {
testONNXModels("test_gemm_default_zero_bias", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_transposeA) {
TEST_P(Test_ONNX_layers, Gemm_transposeA) {
testONNXModels("test_gemm_transposeA", pb, 0, 0, false, true, 2);
}
TEST_P(Test_ONNX_layers, Conformance_Gemm_transposeB) {
TEST_P(Test_ONNX_layers, Gemm_transposeB) {
testONNXModels("test_gemm_transposeB", pb, 0, 0, false, true, 2);
}

View File

@ -619,10 +619,12 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_sum_1_2_true)
TEST_P(Test_TensorFlow_layers, max_pool_grad)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("max_pool_grad");
}
@ -1496,17 +1498,21 @@ TEST_P(Test_TensorFlow_layers, split)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("split");
}
TEST_P(Test_TensorFlow_layers, split_equals)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("split_equals");
}
@ -1581,7 +1587,7 @@ TEST_P(Test_TensorFlow_layers, relu6)
TEST_P(Test_TensorFlow_layers, subpixel)
{
#if defined(INF_ENGINE_RELEASE)
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
@ -1621,8 +1627,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_align_corners)
// TF case: align_corners=False, half_pixel_centers=True
TEST_P(Test_TensorFlow_layers, resize_bilinear_half_pixel)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("resize_bilinear", false, 0.0, 0.0, false, "_half_pixel");
}
@ -1636,8 +1644,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_factor)
// TF case: align_corners=False, half_pixel_centers=True
TEST_P(Test_TensorFlow_layers, resize_bilinear_factor_half_pixel)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif
runTensorFlowNet("resize_bilinear_factor", false, 0.0, 0.0, false, "_half_pixel");
}

View File

@ -204,6 +204,10 @@ TEST_P(Test_TFLite, max_unpooling)
}
TEST_P(Test_TFLite, EfficientDet_int8) {
if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) {
throw SkipTestException("Only the OpenCV, TimVX and OpenVINO backends support INT8 on the CPU target");
}
Net net = readNet(findDataFile("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", false));
net.setPreferableBackend(backend);
net.setPreferableTarget(target);

View File

@ -449,7 +449,7 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
throw SkipTestException("");
}
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
#endif

View File

@ -872,11 +872,15 @@ public:
@param nOctaveLayers Default number of sublevels per scale level
@param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
DIFF_CHARBONNIER
@param max_points Maximum number of returned points. If the image contains
more features, only those with the highest response are returned.
A negative value means no limit.
*/
CV_WRAP static Ptr<AKAZE> create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB,
int descriptor_size = 0, int descriptor_channels = 3,
float threshold = 0.001f, int nOctaves = 4,
int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2);
int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2,
int max_points = -1);
CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0;
CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0;
@ -899,6 +903,9 @@ public:
CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0;
CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
CV_WRAP virtual void setMaxPoints(int max_points) = 0;
CV_WRAP virtual int getMaxPoints() const = 0;
};
//! @} features2d_main
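For reference, the max_points parameter declared above caps the number of keypoints AKAZE returns, keeping those with the strongest response (a negative value disables the limit). A minimal usage sketch against the extended create() signature from this diff; the image path and the cap of 500 are illustrative:

#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("scene.png", cv::IMREAD_GRAYSCALE); // illustrative input
    // Same defaults as before, plus a cap of 500 keypoints with the highest response.
    cv::Ptr<cv::AKAZE> akaze = cv::AKAZE::create(cv::AKAZE::DESCRIPTOR_MLDB, 0, 3,
                                                 0.001f, 4, 4, cv::KAZE::DIFF_PM_G2,
                                                 /*max_points=*/500);
    std::vector<cv::KeyPoint> keypoints;
    cv::Mat descriptors;
    akaze->detectAndCompute(img, cv::noArray(), keypoints, descriptors);
    return 0;
}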

View File

@ -58,7 +58,7 @@ public class AKAZEDescriptorExtractorTest extends OpenCVTestCase {
extractor.write(filename);
String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\n";
String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\nmax_points: -1\n";
String actual = readFile(filename);
actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
assertEquals(truth, actual);

View File

@ -61,7 +61,7 @@ namespace cv
{
public:
AKAZE_Impl(DescriptorType _descriptor_type, int _descriptor_size, int _descriptor_channels,
float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity)
float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity, int _max_points)
: descriptor(_descriptor_type)
, descriptor_channels(_descriptor_channels)
, descriptor_size(_descriptor_size)
@ -69,6 +69,7 @@ namespace cv
, octaves(_octaves)
, sublevels(_sublevels)
, diffusivity(_diffusivity)
, max_points(_max_points)
{
}
@ -98,6 +99,9 @@ namespace cv
void setDiffusivity(KAZE::DiffusivityType diff_) CV_OVERRIDE{ diffusivity = diff_; }
KAZE::DiffusivityType getDiffusivity() const CV_OVERRIDE{ return diffusivity; }
void setMaxPoints(int max_points_) CV_OVERRIDE { max_points = max_points_; }
int getMaxPoints() const CV_OVERRIDE { return max_points; }
// returns the descriptor size in bytes
int descriptorSize() const CV_OVERRIDE
{
@ -195,6 +199,12 @@ namespace cv
KeyPointsFilter::runByPixelsMask(keypoints, mask.getMat());
}
if (max_points > 0 && (int)keypoints.size() > max_points) {
std::partial_sort(keypoints.begin(), keypoints.begin() + max_points, keypoints.end(),
[](const cv::KeyPoint& k1, const cv::KeyPoint& k2) {return k1.response > k2.response;});
keypoints.erase(keypoints.begin() + max_points, keypoints.end());
}
if(descriptors.needed())
{
impl.Compute_Descriptors(keypoints, descriptors);
@ -215,6 +225,7 @@ namespace cv
fs << "octaves" << octaves;
fs << "sublevels" << sublevels;
fs << "diffusivity" << diffusivity;
fs << "max_points" << max_points;
}
void read(const FileNode& fn) CV_OVERRIDE
@ -234,6 +245,8 @@ namespace cv
sublevels = (int)fn["sublevels"];
if (!fn["diffusivity"].empty())
diffusivity = static_cast<KAZE::DiffusivityType>((int)fn["diffusivity"]);
if (!fn["max_points"].empty())
max_points = (int)fn["max_points"];
}
DescriptorType descriptor;
@ -243,15 +256,16 @@ namespace cv
int octaves;
int sublevels;
KAZE::DiffusivityType diffusivity;
int max_points;
};
Ptr<AKAZE> AKAZE::create(DescriptorType descriptor_type,
int descriptor_size, int descriptor_channels,
float threshold, int octaves,
int sublevels, KAZE::DiffusivityType diffusivity)
int sublevels, KAZE::DiffusivityType diffusivity, int max_points)
{
return makePtr<AKAZE_Impl>(descriptor_type, descriptor_size, descriptor_channels,
threshold, octaves, sublevels, diffusivity);
threshold, octaves, sublevels, diffusivity, max_points);
}
String AKAZE::getDefaultName() const

View File

@ -64,9 +64,12 @@
//! @{
/**
@brief Detects corners using the FAST algorithm, returns mask.
@param src_data,src_step Source image
@param dst_data,dst_step Destination mask
@param width,height Source image dimensions
@param src_data Source image data
@param src_step Source image step
@param dst_data Destination mask data
@param dst_step Destination mask step
@param width Source image width
@param height Source image height
@param type FAST type
*/
inline int hal_ni_FAST_dense(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, cv::FastFeatureDetector::DetectorType type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
@ -89,8 +92,10 @@ inline int hal_ni_FAST_NMS(const uchar* src_data, size_t src_step, uchar* dst_da
/**
@brief Detects corners using the FAST algorithm.
@param src_data,src_step Source image
@param width,height Source image dimensions
@param src_data Source image data
@param src_step Source image step
@param width Source image width
@param height Source image height
@param keypoints_data Pointer to keypoints
@param keypoints_count Count of keypoints
@param threshold Threshold for keypoint

View File

@ -86,9 +86,9 @@ void image_derivatives_scharr(const cv::Mat& src, cv::Mat& dst, int xorder, int
/**
* @brief This function computes the Perona and Malik conductivity coefficient g1
* g1 = exp(-|dL|^2/k^2)
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
*/
void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
@ -117,9 +117,9 @@ void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
/**
* @brief This function computes the Perona and Malik conductivity coefficient g2
* g2 = 1 / (1 + dL^2 / k^2)
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
*/
void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
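As a worked illustration of the g2 formula documented above, the per-pixel computation reduces to g2 = 1 / (1 + (Lx^2 + Ly^2) / k^2). The sketch below is a plain reference loop assuming CV_32F single-channel matrices; the actual implementation in this file works on InputArray/OutputArray and is optimized:

#include <opencv2/core.hpp>

// Reference (non-optimized) Perona-Malik g2 conductivity coefficient.
static void pm_g2_reference(const cv::Mat& Lx, const cv::Mat& Ly, cv::Mat& dst, float k)
{
    CV_Assert(Lx.type() == CV_32F && Lx.size() == Ly.size());
    dst.create(Lx.size(), CV_32F);
    const float inv_k2 = 1.0f / (k * k);
    for (int y = 0; y < Lx.rows; y++)
    {
        for (int x = 0; x < Lx.cols; x++)
        {
            const float dx = Lx.at<float>(y, x);
            const float dy = Ly.at<float>(y, x);
            dst.at<float>(y, x) = 1.0f / (1.0f + (dx * dx + dy * dy) * inv_k2);
        }
    }
}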
@ -146,9 +146,9 @@ void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
/* ************************************************************************* */
/**
* @brief This function computes Weickert conductivity coefficient gw
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
* @note For more information check the following paper: J. Weickert
* Applications of nonlinear diffusion in image processing and computer vision,
@ -183,9 +183,9 @@ void weickert_diffusivity(InputArray _Lx, InputArray _Ly, OutputArray _dst, floa
/**
* @brief This function computes Charbonnier conductivity coefficient gc
* gc = 1 / sqrt(1 + dL^2 / k^2)
* @param Lx First order image derivative in X-direction (horizontal)
* @param Ly First order image derivative in Y-direction (vertical)
* @param dst Output image
* @param _Lx First order image derivative in X-direction (horizontal)
* @param _Ly First order image derivative in Y-direction (vertical)
* @param _dst Output image
* @param k Contrast factor parameter
* @note For more information check the following paper: J. Weickert
* Applications of nonlinear diffusion in image processing and computer vision,
@ -323,7 +323,7 @@ void compute_scharr_derivatives(const cv::Mat& src, cv::Mat& dst, int xorder, in
* @param _ky Vertical kernel values
* @param dx Derivative order in X-direction (horizontal)
* @param dy Derivative order in Y-direction (vertical)
* @param scale_ Scale factor or derivative size
* @param scale Scale factor or derivative size
*/
void compute_derivative_kernels(cv::OutputArray _kx, cv::OutputArray _ky, int dx, int dy, int scale) {
CV_INSTRUMENT_REGION();
@ -415,7 +415,7 @@ private:
/* ************************************************************************* */
/**
* @brief This function performs a scalar non-linear diffusion step
* @param Ld2 Output image in the evolution
* @param Ld Output image in the evolution
* @param c Conductivity image
* @param Lstep Previous image in the evolution
* @param stepsize The step size in time units
@ -490,7 +490,7 @@ void nld_step_scalar(cv::Mat& Ld, const cv::Mat& c, cv::Mat& Lstep, float stepsi
/* ************************************************************************* */
/**
* @brief This function downsamples the input image using OpenCV resize
* @param img Input image to be downsampled
* @param src Input image to be downsampled
* @param dst Output image with half of the resolution of the input image
*/
void halfsample_image(const cv::Mat& src, cv::Mat& dst) {

View File

@ -6,7 +6,7 @@
* @brief This function computes the value of a 2D Gaussian function
* @param x X Position
* @param y Y Position
* @param sig Standard Deviation
* @param sigma Standard Deviation
*/
inline float gaussian(float x, float y, float sigma) {
return expf(-(x*x + y*y) / (2.0f*sigma*sigma));

View File

@ -80,7 +80,6 @@ public:
* @param inputData dataset containing the points to index
* @param params Index parameters
* @param d Distance functor
* @return
*/
CompositeIndex(const Matrix<ElementType>& inputData, const IndexParams& params = CompositeIndexParams(),
Distance d = Distance()) : index_params_(params)

View File

@ -97,7 +97,6 @@ public:
}
/** @brief set one bit to 0
* @param index
*/
void reset(size_t index)
{
@ -108,7 +107,6 @@ public:
* This function is useful when resetting a given set of bits so that the
* whole bitset ends up being 0: if that's the case, we don't care about setting
* other bits to 0
* @param index
*/
void reset_block(size_t index)
{
@ -116,7 +114,6 @@ public:
}
/** resize the bitset so that it contains at least sz bits
* @param sz
*/
void resize(size_t sz)
{

View File

@ -101,7 +101,6 @@ public:
* Print log message
* @param level Log level
* @param fmt Message format
* @return
*/
static int log(int level, const char* fmt, ...)
{

View File

@ -214,8 +214,6 @@ public:
}
/** Get a bucket given the key
* @param key
* @return
*/
inline const Bucket* getBucketFromKey(BucketKey key) const
{
@ -253,7 +251,6 @@ public:
}
/** Get statistics about the table
* @return
*/
LshStats getStats() const;

Some files were not shown because too many files have changed in this diff.